# extract reads from the unmapped BAM file and
# write those 3' poly-A-removed reads
# satisfying the minimum length threshold to a FASTQ file
using DataFrames, FASTX, CodecZlib, XAM, YRUtils
unmapped_bam_file = "/data/users/dell/hth_neurons_innervate_breast/analysis_results/HTH20250618_only_mm39/HTH20250618_outs/HTH20250618_unmapped.bam"
# FASTQ quality scheme (64/33)
quality_scheme = 33
# 3' poly-A sequence longer than the length specified below will be trimmed
# its trailing sequence will also be trimmed
min_polya_len = 15
# those trimmed reads longer than the length specified below will be kept
min_read_len = 20
# due to the fact that there are two or more potential poly-A sequences in a single read
# and those may not be from the gene body itself
# this may confuse the de novo transcriptome assembly
# so we can trim them all via more rounds
trim_round = 3
sambamba_path = YRUtils.ShellUtils.find_cmd("sambamba"; return_nothing=false)
YRUtils.ShellUtils.cmd_valid(Cmd(string.([sambamba_path, "--version"])); return_false=false)
if !isfile(string(unmapped_bam_file, ".bai"))
idx_cmd = Cmd(string.(["sambamba", "index", "-t", "80", unmapped_bam_file]))
@info string("running ", idx_cmd, " ...")
run(idx_cmd; wait=true)
@info string("running ", idx_cmd, " done!")
end
# remove poly-A sequences with lengths >= 15bp for up to n rounds
# in each round, the longest poly-A sequence (with rightmost precedence) and its trailing sequence are removed
function get_offset(df::DataFrame, n::Int)
offset = 0
df = sort(df, [order(:match_length; rev=true), order(:match_offset; rev=true)])
for _ in 1:n
offset = df[1, :match_offset]
df = filter(:match_offset => x -> x < offset, df)
if isempty(df)
return offset
end
end
return offset
end
# remove leading and trailing Ns
function trim_n(seq_str::String, qual_str::String)
lead_m = match(r"^N+", seq_str)
if !isnothing(lead_m)
left_pos = length(lead_m.match) + 1
seq_str = seq_str[left_pos:end]
qual_str = qual_str[left_pos:end]
end
trail_m = match(r"N+$", seq_str)
if !isnothing(trail_m)
right_pos = trail_m.offset - 1
seq_str = seq_str[1:right_pos]
qual_str = qual_str[1:right_pos]
end
return (seq_str, qual_str)
end
FASTQWriter(GzipCompressorStream(open(replace(unmapped_bam_file, r"bam$" => "fastq.gz"), "w"))) do writer
reader = open(BAM.Reader, unmapped_bam_file; index=string(unmapped_bam_file, ".bai"))
record = BAM.Record()
while !eof(reader)
empty!(record)
read!(reader, record)
seq_str = string(BAM.sequence(record))
qual_str = join(Char.(BAM.quality(record) .+ quality_scheme))
ms = collect(eachmatch(Regex(string("A{", min_polya_len, ",}")), seq_str; overlap=false))
if !isempty(ms)
df = DataFrame([(match_length=length(m.match), match_offset=m.offset) for m in ms])
offset = get_offset(df, trim_round)
seq_str, qual_str = seq_str[1:(offset-3)], qual_str[1:(offset-3)]
end
seq_str, qual_str = trim_n(seq_str, qual_str)
if length(seq_str) >= min_read_len
write(writer, FASTQRecord(
BAM.tempname(record),
seq_str,
qual_str,
))
end
end
close(reader)
end1 Introduction
In this doc, we try to identify what cell types those EGFP-positive cells may be via the following steps:
Construct the mm39 reference via
mobivision mkindexformobivision quantify.Quantify paired-end reads via
mobivision quantifyagainst the mm39 refernece.
Note: in this study, each scRNA-Seq sample contains two FASTQ files (i.e., R1 and R2), where R1 mainly contains cell barcodes and UMIs, and R2 mainly contains insert fragments. mobivision quantify only uses R1 to extract cell barcodes and UMIs, and uses R2 to quantify gene expressions.
Extract unmapped reads from aligned BAM file using
samtools view.For unmapped reads, write those 3’ poly-A-removed reads satisfying the minimum length threshold to a FASTQ file.
Perform de novo transcriptome assembly using
Trinityto construct the EGFP mRNA sequence.Add the constructed EGFP mRNA sequence to both annotation GTF file and genome FASTA file of mm39.
Construct the mm39 reference with the EGFP mRNA sequence via
mobivision mkindexagain.Quantify paired-end reads via
mobivision quantifyagainst the mm39 refernece with the EGFP mRNA sequence again.Downstream analyses using
Seurat, etc.
2 Construct the mm39 reference
# make mobivision available in the current Bash session
source /data/softwares/mobivision-v3.2/source.sh
# make index
nohup mobivision mkindex -n mobivision_GRCm39_pri_gencode_vM37_pri_basic -f /data/biodatabase/species/mm10/genome/genome/GRCm39.primary_assembly.genome.fa -g /data/biodatabase/species/mm10/genome/anno/gencode.vM37.primary_assembly.basic.annotation.gtf &> run_mobivision_mkindex.log &
echo $! > mobivision_mkindex.pid
if kill -0 $(cat mobivision_mkindex.pid) 2> /dev/null; then echo "The process is still running"; else echo "The process has finished"; fi
if ps -p $(cat mobivision_mkindex.pid) > /dev/null; then echo "The process is still running"; else echo "The process has finished"; fi
3 Quantify scRNA-Seq reads
# make mobivision available in the current Bash session
source /data/softwares/mobivision-v3.2/source.sh
# quantify reads
nohup mobivision quantify -i /data/biodatabase/species/mm10/indices/mobivision_GRCm39_pri_gencode_vM37_pri_basic -t 90 -f /data/users/dell/hth_neurons_innervate_breast/JZ25092505-250522-HY-250522-HY_combined -o /data/users/dell/hth_neurons_innervate_breast/analysis_results &> run_mobivision_quantify.log &
echo $! > mobivision_quantify.pid
if kill -0 $(cat mobivision_quantify.pid) 2> /dev/null; then echo "The process is still running"; else echo "The process has finished"; fi
if ps -p $(cat mobivision_quantify.pid) > /dev/null; then echo "The process is still running"; else echo "The process has finished"; fi
4 Extract unmapped reads
# extract unmapped reads from aligned BAM file
samtools view -f 4 -@ 80 -b HTH20250618_Aligned.sort.bam -o HTH20250618_unmapped.bam
5 Dealing with unmaped reads
6 De novo transcriptome assembly
Before running Trinity, please make sure what is your RNA-Seq library type (e.g., stranded vs. non-stranded).
- Run
Trinity
# run Trinity to perform de novo transcriptome assembly
singularity exec -e /data/softwares/trinity_v2.15.2/trinityrnaseq.v2.15.2.simg Trinity \
--seqType fq --CPU 100 --SS_lib_type F --max_memory 500G \
--single /data/users/dell/hth_neurons_innervate_breast/analysis_results/HTH20250618_only_mm39/HTH20250618_outs/HTH20250618_unmapped.fastq.gz \
--output /data/users/dell/hth_neurons_innervate_breast/analysis_results/HTH20250618_only_mm39/HTH20250618_outs/trinity_out_dir
- Extract candidate isoforms based on partial EGFP fragments
# extract candidate isoforms based on PRV531 fragments
using FASTX, BioAlignments, BioSequences, YRUtils
plasmid_fa_file = "/data/users/dell/hth_neurons_innervate_breast/prv531_plasmid_info/fragments.fasta"
trinity_fa_file = "/data/users/dell/hth_neurons_innervate_breast/analysis_results/HTH20250618_only_mm39/HTH20250618_outs/trinity_out_dir.Trinity.fasta"
# the minimum length of consecutive matches
min_len = 20
plasmid_vec = FASTAReader(open(plasmid_fa_file)) do reader
vec = Tuple{String,String}[]
record = FASTARecord()
while !eof(reader)
empty!(record)
read!(reader, record)
push!(vec, (FASTX.identifier(record), FASTX.sequence(record)))
end
return vec
end
score_model = AffineGapScoreModel(EDNAFULL; gap_open=-5, gap_extend=-2)
aln_dict = FASTAReader(open(trinity_fa_file)) do reader
dict = Dict{String,Vector{String}}()
record = FASTARecord()
while !eof(reader)
empty!(record)
read!(reader, record)
for (plasmid_id, plasmid_seq) in plasmid_vec
res = pairalign(LocalAlignment(), plasmid_seq, FASTX.sequence(record), score_model)
aln = alignment(res)
match_flag = false
for m in eachmatch(r"(?<num>\d+)(?<op>[^\d])", cigar(alignment(aln)))
if m["op"] == "=" && parse(Int, m["num"], base=10) >= min_len
dict[string(plasmid_id, " vs. ", FASTX.description(record))] = [join([x for (x, _) in aln]), join([y for (_, y) in aln])]
match_flag = true
break
end
end
if match_flag
break
end
end
end
return dict
end
YRUtils.BioUtils.show_align(aln_dict, string(trinity_fa_file, ".html"))- Format FASTA sequence
# format a sequence into fixed-width sub-sequences
id = "prv531_plasmid_mRNA"
s = "GGTCGCTGCGCGCTGCCTTCGCCCCGTGCCCCGCTCCGCCGCCGCCTCGCGCCGCCCGCCCCGGCTCTGACTGACCGCGTTACTCCCACAGCGCACCATGGAGAACCCTGGACCTATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGTAATAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATCAACCTCTGGATTACAAAATTTGTGAAAGATTGACTGGTATTCTTAACTATGTTGCTCCTTTTACGCTATGTGGATACGCTGCTTTAATGCCTTTGTATCATGCTATTGCTTCCCGTATGGCTTTCATTTTCTCCTCCTTGTATAAATCCTGGTTGCTGTCTCTTTATGAGGAGTTGTGGCCCGTTGTCAGGCAACGTGGCGTGGTGTGCACTGTGTTTGCTGACGCAACCCCCACTGGTTGGGGCATTGCCACCACCTGTCAGCTCCTTTCCGGGACTTTCGCTTTCCCCCTCCCTATTGCCACGGCGGAACTCATCGCCGCCTGCCTTGCCCGCTGCTGGACAGGGGCTCGGCTGTTGGGCACTGACAATTCCGTGGTGTTGTCGGGGAAATCATCGTCCTTTCCTTGGCTGCTCGCCTGTGTTGCCACCTGGATTCTGCGCGGGACGTCCTTCTGCTACGTCCCTTCGGCCCTCAATCCAGCGGACCTTCCTTCCCGCGGCCTGCTGCCGGCTCTGCGGCCTCTTCCGCGTCTTCGCCTTCGCCCTCAGACGAGTCGGATCTCCCTTTGGGCCGCCTCCCCGCAGGGCCCGTTTAAACCCGCTGATCAGCCTCGACTGTGCCTTCTAGTTGCCAGCCATCTGTTGTTTGCCCCTCCCCCGTGCCTTCCTTGACCCTGGAAGGTGCCACTCCCACTGTCCTTTCCTAATAAAATGAGGAAATTGCATCGCATTGTCTGAGTAGGTGTCATTCTATTCTGGGGGGTGGGGTGGGGCAGGACAGCAAGGGGGAGGATTGGGAAGACAATAGCAGGCATGCTGGGGATGCGGTGGGCTCTATGG"
line_width = 60
output_file = "/data/users/dell/hth_neurons_innervate_breast/temp/prv531_plasmid_mRNA.fasta"
s_vec = collect(Base.Iterators.partition(s, line_width))
pushfirst!(s_vec, string(">", id))
open(output_file, "w") do io
print(io, join(s_vec, "\n"))
end7 Add the EGFP mRNA sequence to the mm39 GTF and FASTA files
# add prv531 anntation info and genome sequence to mm39 anntation GTF file and genome FASTA file
echo -e 'prv531_plasmid_mRNA\tTRINITY\texon\t1\t838\t.\t+\t.\tgene_id "prv531"; transcript_id "prv531.1"; gene_name "prv531"; gene_type "protein_coding"; exon_number 1; exon_id "prv531.1";' > prv531_plasmid_mRNA.gtf
echo -e 'prv531_plasmid_mRNA\tTRINITY\texon\t889\t1734\t.\t+\t.\tgene_id "prv531"; transcript_id "prv531.1"; gene_name "prv531"; gene_type "protein_coding"; exon_number 2; exon_id "prv531.2";' >> prv531_plasmid_mRNA.gtf
mv gencode.vM37.primary_assembly.basic.annotation.gtf gencode.vM37.primary_assembly.basic.annotation.prv531.gtf
mv GRCm39.primary_assembly.genome.fa GRCm39.primary_assembly.genome.prv531.fa
cat prv531_plasmid_mRNA.gtf >> gencode.vM37.primary_assembly.basic.annotation.prv531.gtf
cat prv531_plasmid_mRNA.fasta >> GRCm39.primary_assembly.genome.prv531.fa
8 Construct the modified mm39 reference again
# make mobivision available in the current Bash session
source /data/softwares/mobivision-v3.2/source.sh
# make index
nohup mobivision mkindex -n mobivision_GRCm39_pri_gencode_vM37_pri_basic_prv531 -f /data/biodatabase/species/mm10/genome/genome/GRCm39.primary_assembly.genome.prv531.fa.gz -g /data/biodatabase/species/mm10/genome/anno/gencode.vM37.primary_assembly.basic.annotation.prv531.gtf.gz &> run_mobivision_mkindex.log &
echo $! > mobivision_mkindex.pid
if kill -0 $(cat mobivision_mkindex.pid) 2> /dev/null; then echo "The process is still running"; else echo "The process has finished"; fi
if ps -p $(cat mobivision_mkindex.pid) > /dev/null; then echo "The process is still running"; else echo "The process has finished"; fi
9 Quantify scRNA-Seq reads again
# make mobivision available in the current Bash session
source /data/softwares/mobivision-v3.2/source.sh
# quantify reads
nohup mobivision quantify -i /data/biodatabase/species/mm10/indices/mobivision_GRCm39_pri_gencode_vM37_pri_basic_prv531 -t 90 -f /data/users/dell/hth_neurons_innervate_breast/JZ25092505-250522-HY-250522-HY_combined -o /data/users/dell/hth_neurons_innervate_breast/analysis_results &> run_mobivision_quantify.log &
echo $! > mobivision_quantify.pid
if kill -0 $(cat mobivision_quantify.pid) 2> /dev/null; then echo "The process is still running"; else echo "The process has finished"; fi
if ps -p $(cat mobivision_quantify.pid) > /dev/null; then echo "The process is still running"; else echo "The process has finished"; fi
10 Standard scRNA-Seq analysis via Seurat
10.1 Method 1
In the following code, we also try to rescue some EGFP-positive cells from unfiltered cell by gene matrix.
library(Seurat)
library(tidyverse)
library(patchwork)
library(vroom)
library(YRUtils)
gtf_file <- "/data/biodatabase/species/mm10/genome/anno/gencode.vM37.primary_assembly.basic.annotation.prv531.gtf.gz"
cellxgene_dir <- "/data/users/dell/hth_neurons_innervate_breast/analysis_results/HTH20250618_with_prv531/HTH20250618_outs/filtered_cell_gene_matrix"
raw_cellxgene_dir <- "/data/users/dell/hth_neurons_innervate_breast/analysis_results/HTH20250618_with_prv531/HTH20250618_outs/raw_cell_gene_matrix"
output_dir <- "/data/users/dell/hth_neurons_innervate_breast/analysis_results/HTH20250618_with_prv531/seurat_res/with_rescue"
gtf_df <- extract_gene_id_from_gff(gtf_file, "gene")
gtf_df <- bind_rows(
gtf_df,
tibble(
seqnames = "prv531_plasmid_mRNA", gene_type = "gene",
gene_id_without_version = "prv531", gene_version = "1",
gene_id = "prv531.1", gene_name = "prv531",
gene_biotype = "protein_coding", gene_level = "1"
)
)
counts <- Read10X(cellxgene_dir)
seu_obj <- CreateSeuratObject(counts = counts, project = "HTH20250618", min.cells = 3, min.features = 200)
seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^mt-")
p <- VlnPlot(seu_obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
p[[1]] <- p[[1]] + geom_hline(yintercept = c(5500, 6000, 6500, 7000), color = "red", linetype = "dashed")
p[[2]] <- p[[2]] + geom_hline(yintercept = c(25000, 30000, 35000), color = "red", linetype = "dashed")
p[[3]] <- p[[3]] + geom_hline(yintercept = c(7.5, 12.5, 15), color = "red", linetype = "dashed")
ppreview(p, file = file.path(output_dir, "qc_violin.raw.pdf"))
p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ppreview(p1 | p2, file = file.path(output_dir, "qc_scatter.raw.pdf"))
seu_obj <- subset(seu_obj, subset = nFeature_RNA >= 200 & nFeature_RNA <= 6000 & nCount_RNA >= 200 & nCount_RNA <= 30000 & percent.mt <= 15)
p <- VlnPlot(seu_obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ppreview(p, file = file.path(output_dir, "qc_violin.filtered.pdf"))
p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ppreview(p1 | p2, file = file.path(output_dir, "qc_scatter.filtered.pdf"))
prv531_expr_mat <- LayerData(seu_obj, assay = "RNA", layer = "counts", features = "prv531")
prv531_pos_cells <- colnames(prv531_expr_mat)[prv531_expr_mat["prv531", ] > 0]
seu_obj[["cell_bc"]] <- row.names(seu_obj[[]])
seu_obj[["prv531_pos"]] <- as.character(seu_obj$cell_bc %in% prv531_pos_cells)
# rescue prv531-positive cells >>>
raw_counts <- Read10X(raw_cellxgene_dir)
raw_counts <- raw_counts[, colnames(raw_counts)[raw_counts["prv531", ] > 0]]
raw_counts <- raw_counts[, !(colnames(raw_counts) %in% prv531_pos_cells)]
raw_counts <- raw_counts[rowSums(raw_counts) > 0, ]
raw_seu_obj <- CreateSeuratObject(counts = raw_counts, project = "HTH20250618_raw", min.cells = 1, min.features = 1)
raw_seu_obj[["prv531_pos"]] <- "TRUE"
raw_seu_obj[["percent.mt"]] <- PercentageFeatureSet(raw_seu_obj, pattern = "^mt-")
p <- VlnPlot(raw_seu_obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ppreview(p, file = file.path(output_dir, "qc_violin.prv531.raw.pdf"))
p1 <- FeatureScatter(raw_seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(raw_seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ppreview(p1 | p2, file = file.path(output_dir, "qc_scatter.prv531.raw.pdf"))
raw_seu_obj <- subset(raw_seu_obj, subset = nCount_RNA >= 50 & nFeature_RNA >= 50 & percent.mt <= 40)
p <- VlnPlot(raw_seu_obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ppreview(p, file = file.path(output_dir, "qc_violin.prv531.filtered.pdf"))
p1 <- FeatureScatter(raw_seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(raw_seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ppreview(p1 | p2, file = file.path(output_dir, "qc_scatter.prv531.filtered.pdf"))
merged_obj <- merge(x = seu_obj, y = raw_seu_obj)
merged_obj <- JoinLayers(merged_obj)
merged_obj[["project.prv531_pos"]] <- paste0(merged_obj$orig.ident, ".", merged_obj$prv531_pos)
seu_obj <- merged_obj
seu_obj[["cell_bc"]] <- row.names(seu_obj[[]])
# rescue prv531-positive cells <<<
seu_obj <- NormalizeData(seu_obj, normalization.method = "LogNormalize", scale.factor = 10000)
seu_obj <- FindVariableFeatures(seu_obj, selection.method = "vst", nfeatures = 2000)
p1 <- VariableFeaturePlot(seu_obj)
p2 <- LabelPoints(plot = p1, points = head(VariableFeatures(seu_obj), 10), repel = TRUE, xnudge = 0, ynudge = 0)
ppreview(p1 | p2, file = file.path(output_dir, "hvf_scatter.pdf"))
seu_obj <- ScaleData(seu_obj, features = row.names(seu_obj))
seu_obj <- RunPCA(seu_obj, features = VariableFeatures(object = seu_obj))
p <- DimPlot(seu_obj, reduction = "pca")
ppreview(p, file = file.path(output_dir, "pca_scatter.pdf"))
p <- ElbowPlot(seu_obj, ndims = 50, reduction = "pca")
ppreview(p, file = file.path(output_dir, "elbow_line.pdf"))
p <- DimHeatmap(seu_obj, dims = 1:20, cells = 500, balanced = TRUE)
p
seu_obj <- FindNeighbors(seu_obj, dims = 1:50, reduction = "pca")
seu_obj <- FindClusters(seu_obj, resolution = 0.5)
seu_obj <- FindClusters(seu_obj, resolution = 1)
seu_obj <- RunUMAP(seu_obj, dims = 1:50, reduction = "pca")
seu_obj$seurat_clusters <- seu_obj$RNA_snn_res.0.5
resolution <- 0.5
p1 <- DimPlot(seu_obj, reduction = "umap", label = TRUE, group.by = "seurat_clusters")
# p2 <- DimPlot(seu_obj,
# reduction = "umap", label = TRUE, group.by = "seurat_clusters",
# cells.highlight = list(prv531.pos = seu_obj$cell_bc[seu_obj$prv531_pos == "TRUE"]),
# cols.highlight = "#EE4000", sizes.highlight = 2, alpha = 0.6
# )
p2 <- DimPlot(seu_obj,
reduction = "umap", label = TRUE, group.by = "seurat_clusters",
cells.highlight = list(
HTH20250618.TRUE = seu_obj$cell_bc[seu_obj$project.prv531_pos == "HTH20250618.TRUE"],
HTH20250618_raw.TRUE = seu_obj$cell_bc[seu_obj$project.prv531_pos == "HTH20250618_raw.TRUE"]
),
cols.highlight = c("#EE4000", "#008B8B"), sizes.highlight = 2, alpha = 0.6
)
ppreview(p1 | p2, file = file.path(output_dir, paste0("umap_scatter.res_", resolution, ".pdf")))
p1_data <- p1$data %>%
mutate(cell_bc = row.names(.)) %>%
inner_join(
seu_obj[[]] %>%
select(cell_bc, nCount_RNA, nFeature_RNA, percent.mt),
by = "cell_bc"
)
colors <- c("darkblue", "lightskyblue", "palegreen", "yellow", "orange", "red", "darkred")
p <- ggplot(p1_data, aes(umap_1, umap_2, color = nCount_RNA)) +
geom_point() +
scale_color_gradientn(colors = colors, trans = "log10") +
theme_classic()
ppreview(p, file = file.path(output_dir, paste0("umap_scatter.nCount_RNA.pdf")))
p <- ggplot(p1_data, aes(umap_1, umap_2, color = nFeature_RNA)) +
geom_point() +
scale_color_gradientn(colors = colors, trans = "log10") +
theme_classic()
ppreview(p, file = file.path(output_dir, paste0("umap_scatter.nFeature_RNA.pdf")))
p <- ggplot(p1_data, aes(umap_1, umap_2, color = percent.mt)) +
geom_point() +
scale_color_gradientn(colors = colors) +
theme_classic()
ppreview(p, file = file.path(output_dir, paste0("umap_scatter.percent_mt.pdf")))
# prv531_pos_stat_df <- inner_join(
# seu_obj[[]] %>%
# group_by(seurat_clusters) %>%
# reframe(cluster_n = n()),
# filter(seu_obj[[]], prv531_pos == "TRUE") %>%
# group_by(seurat_clusters) %>%
# reframe(prv531_n = n()),
# by = "seurat_clusters"
# ) %>%
# mutate(percent = prv531_n / cluster_n * 100) %>%
# arrange(desc(prv531_n), desc(percent))
prv531_pos_stat_df <- inner_join(
seu_obj[[]] %>%
group_by(seurat_clusters) %>%
reframe(cluster_n = n()),
filter(seu_obj[[]], project.prv531_pos %in% c("HTH20250618.TRUE", "HTH20250618_raw.TRUE")) %>%
mutate(project.prv531_pos = factor(project.prv531_pos, levels = c("HTH20250618.TRUE", "HTH20250618_raw.TRUE"))) %>%
group_by(seurat_clusters, project.prv531_pos) %>%
reframe(prv531_n = n()),
by = "seurat_clusters"
) %>%
mutate(percent = prv531_n / cluster_n * 100) %>%
arrange(seurat_clusters, project.prv531_pos)
vroom_write(prv531_pos_stat_df, file = file.path(output_dir, paste0("prv531_pos_stat.res_", resolution, ".tsv")))
saveRDS(seu_obj, file = file.path(output_dir, "seurat_obj.to_umap.rds"))