Identify EGFP-positive cells via scRNA-Seq

1 Introduction

In this doc, we try to identify what cell types those EGFP-positive cells may be via the following steps:

Construct the mm39 reference via mobivision mkindex for mobivision quantify.
Quantify paired-end reads via mobivision quantify against the mm39 refernece.

Note: in this study, each scRNA-Seq sample contains two FASTQ files (i.e., R1 and R2), where R1 mainly contains cell barcodes and UMIs, and R2 mainly contains insert fragments. mobivision quantify only uses R1 to extract cell barcodes and UMIs, and uses R2 to quantify gene expressions.

Extract unmapped reads from aligned BAM file using samtools view.
For unmapped reads, write those 3’ poly-A-removed reads satisfying the minimum length threshold to a FASTQ file.
Perform de novo transcriptome assembly using Trinity to construct the EGFP mRNA sequence.
Add the constructed EGFP mRNA sequence to both annotation GTF file and genome FASTA file of mm39.
Construct the mm39 reference with the EGFP mRNA sequence via mobivision mkindex again.
Quantify paired-end reads via mobivision quantify against the mm39 refernece with the EGFP mRNA sequence again.
Downstream analyses using Seurat, etc.

2 Construct the mm39 reference

# make mobivision available in the current Bash session
source /data/softwares/mobivision-v3.2/source.sh

# make index
nohup mobivision mkindex -n mobivision_GRCm39_pri_gencode_vM37_pri_basic -f /data/biodatabase/species/mm10/genome/genome/GRCm39.primary_assembly.genome.fa -g /data/biodatabase/species/mm10/genome/anno/gencode.vM37.primary_assembly.basic.annotation.gtf &> run_mobivision_mkindex.log &

echo $! > mobivision_mkindex.pid
if kill -0 $(cat mobivision_mkindex.pid) 2> /dev/null; then echo "The process is still running"; else echo "The process has finished"; fi
if ps -p $(cat mobivision_mkindex.pid) > /dev/null; then echo "The process is still running"; else echo "The process has finished"; fi

3 Quantify scRNA-Seq reads

# make mobivision available in the current Bash session
source /data/softwares/mobivision-v3.2/source.sh

# quantify reads
nohup mobivision quantify -i /data/biodatabase/species/mm10/indices/mobivision_GRCm39_pri_gencode_vM37_pri_basic -t 90 -f /data/users/dell/hth_neurons_innervate_breast/JZ25092505-250522-HY-250522-HY_combined -o /data/users/dell/hth_neurons_innervate_breast/analysis_results &> run_mobivision_quantify.log &

echo $! > mobivision_quantify.pid
if kill -0 $(cat mobivision_quantify.pid) 2> /dev/null; then echo "The process is still running"; else echo "The process has finished"; fi
if ps -p $(cat mobivision_quantify.pid) > /dev/null; then echo "The process is still running"; else echo "The process has finished"; fi

4 Extract unmapped reads

# extract unmapped reads from aligned BAM file
samtools view -f 4 -@ 80 -b HTH20250618_Aligned.sort.bam -o HTH20250618_unmapped.bam

5 Dealing with unmaped reads

# extract reads from the unmapped BAM file and
# write those 3' poly-A-removed reads
# satisfying the minimum length threshold to a FASTQ file
using DataFrames, FASTX, CodecZlib, XAM, YRUtils

unmapped_bam_file = "/data/users/dell/hth_neurons_innervate_breast/analysis_results/HTH20250618_only_mm39/HTH20250618_outs/HTH20250618_unmapped.bam"
# FASTQ quality scheme (64/33)
quality_scheme = 33
# 3' poly-A sequence longer than the length specified below will be trimmed
# its trailing sequence will also be trimmed
min_polya_len = 15
# those trimmed reads longer than the length specified below will be kept
min_read_len = 20
# due to the fact that there are two or more potential poly-A sequences in a single read
# and those may not be from the gene body itself
# this may confuse the de novo transcriptome assembly
# so we can trim them all via more rounds
trim_round = 3

sambamba_path = YRUtils.ShellUtils.find_cmd("sambamba"; return_nothing=false)
YRUtils.ShellUtils.cmd_valid(Cmd(string.([sambamba_path, "--version"])); return_false=false)
if !isfile(string(unmapped_bam_file, ".bai"))
    idx_cmd = Cmd(string.(["sambamba", "index", "-t", "80", unmapped_bam_file]))
    @info string("running ", idx_cmd, " ...")
    run(idx_cmd; wait=true)
    @info string("running ", idx_cmd, " done!")
end

# remove poly-A sequences with lengths >= 15bp for up to n rounds
# in each round, the longest poly-A sequence (with rightmost precedence) and its trailing sequence are removed
function get_offset(df::DataFrame, n::Int)
    offset = 0
    df = sort(df, [order(:match_length; rev=true), order(:match_offset; rev=true)])
    for _ in 1:n
        offset = df[1, :match_offset]
        df = filter(:match_offset => x -> x < offset, df)

        if isempty(df)
            return offset
        end
    end

    return offset
end

# remove leading and trailing Ns
function trim_n(seq_str::String, qual_str::String)
    lead_m = match(r"^N+", seq_str)
    if !isnothing(lead_m)
        left_pos = length(lead_m.match) + 1
        seq_str = seq_str[left_pos:end]
        qual_str = qual_str[left_pos:end]
    end

    trail_m = match(r"N+$", seq_str)
    if !isnothing(trail_m)
        right_pos = trail_m.offset - 1
        seq_str = seq_str[1:right_pos]
        qual_str = qual_str[1:right_pos]
    end

    return (seq_str, qual_str)
end

FASTQWriter(GzipCompressorStream(open(replace(unmapped_bam_file, r"bam$" => "fastq.gz"), "w"))) do writer
    reader = open(BAM.Reader, unmapped_bam_file; index=string(unmapped_bam_file, ".bai"))
    record = BAM.Record()
    while !eof(reader)
        empty!(record)
        read!(reader, record)

        seq_str = string(BAM.sequence(record))
        qual_str = join(Char.(BAM.quality(record) .+ quality_scheme))

        ms = collect(eachmatch(Regex(string("A{", min_polya_len, ",}")), seq_str; overlap=false))
        if !isempty(ms)
            df = DataFrame([(match_length=length(m.match), match_offset=m.offset) for m in ms])
            offset = get_offset(df, trim_round)
            seq_str, qual_str = seq_str[1:(offset-3)], qual_str[1:(offset-3)]
        end
        seq_str, qual_str = trim_n(seq_str, qual_str)

        if length(seq_str) >= min_read_len
            write(writer, FASTQRecord(
                BAM.tempname(record),
                seq_str,
                qual_str,
            ))
        end
    end
    close(reader)
end

6 De novo transcriptome assembly

Before running Trinity, please make sure what is your RNA-Seq library type (e.g., stranded vs. non-stranded).

Run Trinity

# run Trinity to perform de novo transcriptome assembly
singularity exec -e /data/softwares/trinity_v2.15.2/trinityrnaseq.v2.15.2.simg Trinity \
--seqType fq --CPU 100 --SS_lib_type F --max_memory 500G \
--single /data/users/dell/hth_neurons_innervate_breast/analysis_results/HTH20250618_only_mm39/HTH20250618_outs/HTH20250618_unmapped.fastq.gz \
--output /data/users/dell/hth_neurons_innervate_breast/analysis_results/HTH20250618_only_mm39/HTH20250618_outs/trinity_out_dir

Extract candidate isoforms based on partial EGFP fragments

# extract candidate isoforms based on PRV531 fragments
using FASTX, BioAlignments, BioSequences, YRUtils

plasmid_fa_file = "/data/users/dell/hth_neurons_innervate_breast/prv531_plasmid_info/fragments.fasta"
trinity_fa_file = "/data/users/dell/hth_neurons_innervate_breast/analysis_results/HTH20250618_only_mm39/HTH20250618_outs/trinity_out_dir.Trinity.fasta"
# the minimum length of consecutive matches
min_len = 20

plasmid_vec = FASTAReader(open(plasmid_fa_file)) do reader
    vec = Tuple{String,String}[]
    record = FASTARecord()
    while !eof(reader)
        empty!(record)
        read!(reader, record)
        push!(vec, (FASTX.identifier(record), FASTX.sequence(record)))
    end
    return vec
end

score_model = AffineGapScoreModel(EDNAFULL; gap_open=-5, gap_extend=-2)
aln_dict = FASTAReader(open(trinity_fa_file)) do reader
    dict = Dict{String,Vector{String}}()
    record = FASTARecord()
    while !eof(reader)
        empty!(record)
        read!(reader, record)
        for (plasmid_id, plasmid_seq) in plasmid_vec
            res = pairalign(LocalAlignment(), plasmid_seq, FASTX.sequence(record), score_model)
            aln = alignment(res)

            match_flag = false
            for m in eachmatch(r"(?<num>\d+)(?<op>[^\d])", cigar(alignment(aln)))
                if m["op"] == "=" && parse(Int, m["num"], base=10) >= min_len
                    dict[string(plasmid_id, " vs. ", FASTX.description(record))] = [join([x for (x, _) in aln]), join([y for (_, y) in aln])]
                    match_flag = true
                    break
                end
            end

            if match_flag
                break
            end
        end
    end
    return dict
end
YRUtils.BioUtils.show_align(aln_dict, string(trinity_fa_file, ".html"))

Format FASTA sequence

# format a sequence into fixed-width sub-sequences
id = "prv531_plasmid_mRNA"
s = "GGTCGCTGCGCGCTGCCTTCGCCCCGTGCCCCGCTCCGCCGCCGCCTCGCGCCGCCCGCCCCGGCTCTGACTGACCGCGTTACTCCCACAGCGCACCATGGAGAACCCTGGACCTATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGTAATAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATCAACCTCTGGATTACAAAATTTGTGAAAGATTGACTGGTATTCTTAACTATGTTGCTCCTTTTACGCTATGTGGATACGCTGCTTTAATGCCTTTGTATCATGCTATTGCTTCCCGTATGGCTTTCATTTTCTCCTCCTTGTATAAATCCTGGTTGCTGTCTCTTTATGAGGAGTTGTGGCCCGTTGTCAGGCAACGTGGCGTGGTGTGCACTGTGTTTGCTGACGCAACCCCCACTGGTTGGGGCATTGCCACCACCTGTCAGCTCCTTTCCGGGACTTTCGCTTTCCCCCTCCCTATTGCCACGGCGGAACTCATCGCCGCCTGCCTTGCCCGCTGCTGGACAGGGGCTCGGCTGTTGGGCACTGACAATTCCGTGGTGTTGTCGGGGAAATCATCGTCCTTTCCTTGGCTGCTCGCCTGTGTTGCCACCTGGATTCTGCGCGGGACGTCCTTCTGCTACGTCCCTTCGGCCCTCAATCCAGCGGACCTTCCTTCCCGCGGCCTGCTGCCGGCTCTGCGGCCTCTTCCGCGTCTTCGCCTTCGCCCTCAGACGAGTCGGATCTCCCTTTGGGCCGCCTCCCCGCAGGGCCCGTTTAAACCCGCTGATCAGCCTCGACTGTGCCTTCTAGTTGCCAGCCATCTGTTGTTTGCCCCTCCCCCGTGCCTTCCTTGACCCTGGAAGGTGCCACTCCCACTGTCCTTTCCTAATAAAATGAGGAAATTGCATCGCATTGTCTGAGTAGGTGTCATTCTATTCTGGGGGGTGGGGTGGGGCAGGACAGCAAGGGGGAGGATTGGGAAGACAATAGCAGGCATGCTGGGGATGCGGTGGGCTCTATGG"
line_width = 60
output_file = "/data/users/dell/hth_neurons_innervate_breast/temp/prv531_plasmid_mRNA.fasta"

s_vec = collect(Base.Iterators.partition(s, line_width))
pushfirst!(s_vec, string(">", id))
open(output_file, "w") do io
    print(io, join(s_vec, "\n"))
end

7 Add the EGFP mRNA sequence to the mm39 GTF and FASTA files

# add prv531 anntation info and genome sequence to mm39 anntation GTF file and genome FASTA file
echo -e 'prv531_plasmid_mRNA\tTRINITY\texon\t1\t838\t.\t+\t.\tgene_id "prv531"; transcript_id "prv531.1"; gene_name "prv531"; gene_type "protein_coding"; exon_number 1; exon_id "prv531.1";' > prv531_plasmid_mRNA.gtf
echo -e 'prv531_plasmid_mRNA\tTRINITY\texon\t889\t1734\t.\t+\t.\tgene_id "prv531"; transcript_id "prv531.1"; gene_name "prv531"; gene_type "protein_coding"; exon_number 2; exon_id "prv531.2";' >> prv531_plasmid_mRNA.gtf

mv gencode.vM37.primary_assembly.basic.annotation.gtf gencode.vM37.primary_assembly.basic.annotation.prv531.gtf
mv GRCm39.primary_assembly.genome.fa GRCm39.primary_assembly.genome.prv531.fa

cat prv531_plasmid_mRNA.gtf >> gencode.vM37.primary_assembly.basic.annotation.prv531.gtf
cat prv531_plasmid_mRNA.fasta >> GRCm39.primary_assembly.genome.prv531.fa

8 Construct the modified mm39 reference again

# make mobivision available in the current Bash session
source /data/softwares/mobivision-v3.2/source.sh

# make index
nohup mobivision mkindex -n mobivision_GRCm39_pri_gencode_vM37_pri_basic_prv531 -f /data/biodatabase/species/mm10/genome/genome/GRCm39.primary_assembly.genome.prv531.fa.gz -g /data/biodatabase/species/mm10/genome/anno/gencode.vM37.primary_assembly.basic.annotation.prv531.gtf.gz &> run_mobivision_mkindex.log &

echo $! > mobivision_mkindex.pid
if kill -0 $(cat mobivision_mkindex.pid) 2> /dev/null; then echo "The process is still running"; else echo "The process has finished"; fi
if ps -p $(cat mobivision_mkindex.pid) > /dev/null; then echo "The process is still running"; else echo "The process has finished"; fi

9 Quantify scRNA-Seq reads again

# make mobivision available in the current Bash session
source /data/softwares/mobivision-v3.2/source.sh

# quantify reads
nohup mobivision quantify -i /data/biodatabase/species/mm10/indices/mobivision_GRCm39_pri_gencode_vM37_pri_basic_prv531 -t 90 -f /data/users/dell/hth_neurons_innervate_breast/JZ25092505-250522-HY-250522-HY_combined -o /data/users/dell/hth_neurons_innervate_breast/analysis_results &> run_mobivision_quantify.log &

echo $! > mobivision_quantify.pid
if kill -0 $(cat mobivision_quantify.pid) 2> /dev/null; then echo "The process is still running"; else echo "The process has finished"; fi
if ps -p $(cat mobivision_quantify.pid) > /dev/null; then echo "The process is still running"; else echo "The process has finished"; fi

10 Standard scRNA-Seq analysis via `Seurat`

10.1 Method 1

In the following code, we also try to rescue some EGFP-positive cells from unfiltered cell by gene matrix.

library(Seurat)
library(tidyverse)
library(patchwork)
library(vroom)
library(YRUtils)

gtf_file <- "/data/biodatabase/species/mm10/genome/anno/gencode.vM37.primary_assembly.basic.annotation.prv531.gtf.gz"
cellxgene_dir <- "/data/users/dell/hth_neurons_innervate_breast/analysis_results/HTH20250618_with_prv531/HTH20250618_outs/filtered_cell_gene_matrix"
raw_cellxgene_dir <- "/data/users/dell/hth_neurons_innervate_breast/analysis_results/HTH20250618_with_prv531/HTH20250618_outs/raw_cell_gene_matrix"
output_dir <- "/data/users/dell/hth_neurons_innervate_breast/analysis_results/HTH20250618_with_prv531/seurat_res/with_rescue"

gtf_df <- extract_gene_id_from_gff(gtf_file, "gene")
gtf_df <- bind_rows(
    gtf_df,
    tibble(
        seqnames = "prv531_plasmid_mRNA", gene_type = "gene",
        gene_id_without_version = "prv531", gene_version = "1",
        gene_id = "prv531.1", gene_name = "prv531",
        gene_biotype = "protein_coding", gene_level = "1"
    )
)

counts <- Read10X(cellxgene_dir)
seu_obj <- CreateSeuratObject(counts = counts, project = "HTH20250618", min.cells = 3, min.features = 200)
seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^mt-")

p <- VlnPlot(seu_obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
p[[1]] <- p[[1]] + geom_hline(yintercept = c(5500, 6000, 6500, 7000), color = "red", linetype = "dashed")
p[[2]] <- p[[2]] + geom_hline(yintercept = c(25000, 30000, 35000), color = "red", linetype = "dashed")
p[[3]] <- p[[3]] + geom_hline(yintercept = c(7.5, 12.5, 15), color = "red", linetype = "dashed")
ppreview(p, file = file.path(output_dir, "qc_violin.raw.pdf"))

p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ppreview(p1 | p2, file = file.path(output_dir, "qc_scatter.raw.pdf"))

seu_obj <- subset(seu_obj, subset = nFeature_RNA >= 200 & nFeature_RNA <= 6000 & nCount_RNA >= 200 & nCount_RNA <= 30000 & percent.mt <= 15)

p <- VlnPlot(seu_obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ppreview(p, file = file.path(output_dir, "qc_violin.filtered.pdf"))

p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ppreview(p1 | p2, file = file.path(output_dir, "qc_scatter.filtered.pdf"))

prv531_expr_mat <- LayerData(seu_obj, assay = "RNA", layer = "counts", features = "prv531")
prv531_pos_cells <- colnames(prv531_expr_mat)[prv531_expr_mat["prv531", ] > 0]
seu_obj[["cell_bc"]] <- row.names(seu_obj[[]])
seu_obj[["prv531_pos"]] <- as.character(seu_obj$cell_bc %in% prv531_pos_cells)

# rescue prv531-positive cells >>>
raw_counts <- Read10X(raw_cellxgene_dir)
raw_counts <- raw_counts[, colnames(raw_counts)[raw_counts["prv531", ] > 0]]
raw_counts <- raw_counts[, !(colnames(raw_counts) %in% prv531_pos_cells)]
raw_counts <- raw_counts[rowSums(raw_counts) > 0, ]
raw_seu_obj <- CreateSeuratObject(counts = raw_counts, project = "HTH20250618_raw", min.cells = 1, min.features = 1)
raw_seu_obj[["prv531_pos"]] <- "TRUE"
raw_seu_obj[["percent.mt"]] <- PercentageFeatureSet(raw_seu_obj, pattern = "^mt-")

p <- VlnPlot(raw_seu_obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ppreview(p, file = file.path(output_dir, "qc_violin.prv531.raw.pdf"))

p1 <- FeatureScatter(raw_seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(raw_seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ppreview(p1 | p2, file = file.path(output_dir, "qc_scatter.prv531.raw.pdf"))

raw_seu_obj <- subset(raw_seu_obj, subset = nCount_RNA >= 50 & nFeature_RNA >= 50 & percent.mt <= 40)

p <- VlnPlot(raw_seu_obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ppreview(p, file = file.path(output_dir, "qc_violin.prv531.filtered.pdf"))

p1 <- FeatureScatter(raw_seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(raw_seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ppreview(p1 | p2, file = file.path(output_dir, "qc_scatter.prv531.filtered.pdf"))

merged_obj <- merge(x = seu_obj, y = raw_seu_obj)
merged_obj <- JoinLayers(merged_obj)
merged_obj[["project.prv531_pos"]] <- paste0(merged_obj$orig.ident, ".", merged_obj$prv531_pos)
seu_obj <- merged_obj
seu_obj[["cell_bc"]] <- row.names(seu_obj[[]])
# rescue prv531-positive cells <<<

seu_obj <- NormalizeData(seu_obj, normalization.method = "LogNormalize", scale.factor = 10000)
seu_obj <- FindVariableFeatures(seu_obj, selection.method = "vst", nfeatures = 2000)

p1 <- VariableFeaturePlot(seu_obj)
p2 <- LabelPoints(plot = p1, points = head(VariableFeatures(seu_obj), 10), repel = TRUE, xnudge = 0, ynudge = 0)
ppreview(p1 | p2, file = file.path(output_dir, "hvf_scatter.pdf"))

seu_obj <- ScaleData(seu_obj, features = row.names(seu_obj))
seu_obj <- RunPCA(seu_obj, features = VariableFeatures(object = seu_obj))

p <- DimPlot(seu_obj, reduction = "pca")
ppreview(p, file = file.path(output_dir, "pca_scatter.pdf"))

p <- ElbowPlot(seu_obj, ndims = 50, reduction = "pca")
ppreview(p, file = file.path(output_dir, "elbow_line.pdf"))

p <- DimHeatmap(seu_obj, dims = 1:20, cells = 500, balanced = TRUE)
p

seu_obj <- FindNeighbors(seu_obj, dims = 1:50, reduction = "pca")
seu_obj <- FindClusters(seu_obj, resolution = 0.5)
seu_obj <- FindClusters(seu_obj, resolution = 1)
seu_obj <- RunUMAP(seu_obj, dims = 1:50, reduction = "pca")

seu_obj$seurat_clusters <- seu_obj$RNA_snn_res.0.5
resolution <- 0.5

p1 <- DimPlot(seu_obj, reduction = "umap", label = TRUE, group.by = "seurat_clusters")
# p2 <- DimPlot(seu_obj,
#     reduction = "umap", label = TRUE, group.by = "seurat_clusters",
#     cells.highlight = list(prv531.pos = seu_obj$cell_bc[seu_obj$prv531_pos == "TRUE"]),
#     cols.highlight = "#EE4000", sizes.highlight = 2, alpha = 0.6
# )
p2 <- DimPlot(seu_obj,
    reduction = "umap", label = TRUE, group.by = "seurat_clusters",
    cells.highlight = list(
        HTH20250618.TRUE = seu_obj$cell_bc[seu_obj$project.prv531_pos == "HTH20250618.TRUE"],
        HTH20250618_raw.TRUE = seu_obj$cell_bc[seu_obj$project.prv531_pos == "HTH20250618_raw.TRUE"]
    ),
    cols.highlight = c("#EE4000", "#008B8B"), sizes.highlight = 2, alpha = 0.6
)
ppreview(p1 | p2, file = file.path(output_dir, paste0("umap_scatter.res_", resolution, ".pdf")))

p1_data <- p1$data %>%
    mutate(cell_bc = row.names(.)) %>%
    inner_join(
        seu_obj[[]] %>%
            select(cell_bc, nCount_RNA, nFeature_RNA, percent.mt),
        by = "cell_bc"
    )
colors <- c("darkblue", "lightskyblue", "palegreen", "yellow", "orange", "red", "darkred")

p <- ggplot(p1_data, aes(umap_1, umap_2, color = nCount_RNA)) +
    geom_point() +
    scale_color_gradientn(colors = colors, trans = "log10") +
    theme_classic()
ppreview(p, file = file.path(output_dir, paste0("umap_scatter.nCount_RNA.pdf")))

p <- ggplot(p1_data, aes(umap_1, umap_2, color = nFeature_RNA)) +
    geom_point() +
    scale_color_gradientn(colors = colors, trans = "log10") +
    theme_classic()
ppreview(p, file = file.path(output_dir, paste0("umap_scatter.nFeature_RNA.pdf")))

p <- ggplot(p1_data, aes(umap_1, umap_2, color = percent.mt)) +
    geom_point() +
    scale_color_gradientn(colors = colors) +
    theme_classic()
ppreview(p, file = file.path(output_dir, paste0("umap_scatter.percent_mt.pdf")))

# prv531_pos_stat_df <- inner_join(
#     seu_obj[[]] %>%
#         group_by(seurat_clusters) %>%
#         reframe(cluster_n = n()),
#     filter(seu_obj[[]], prv531_pos == "TRUE") %>%
#         group_by(seurat_clusters) %>%
#         reframe(prv531_n = n()),
#     by = "seurat_clusters"
# ) %>%
#     mutate(percent = prv531_n / cluster_n * 100) %>%
#     arrange(desc(prv531_n), desc(percent))
prv531_pos_stat_df <- inner_join(
    seu_obj[[]] %>%
        group_by(seurat_clusters) %>%
        reframe(cluster_n = n()),
    filter(seu_obj[[]], project.prv531_pos %in% c("HTH20250618.TRUE", "HTH20250618_raw.TRUE")) %>%
        mutate(project.prv531_pos = factor(project.prv531_pos, levels = c("HTH20250618.TRUE", "HTH20250618_raw.TRUE"))) %>%
        group_by(seurat_clusters, project.prv531_pos) %>%
        reframe(prv531_n = n()),
    by = "seurat_clusters"
) %>%
    mutate(percent = prv531_n / cluster_n * 100) %>%
    arrange(seurat_clusters, project.prv531_pos)
vroom_write(prv531_pos_stat_df, file = file.path(output_dir, paste0("prv531_pos_stat.res_", resolution, ".tsv")))

saveRDS(seu_obj, file = file.path(output_dir, "seurat_obj.to_umap.rds"))