library(Seurat)
library(patchwork)
library(YRUtils)
library(unigd)
library(tidyverse)
library(clustree)
library(DoubletFinder)
library(vroom)
set.seed(123456)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
sample_dir <- c(
"Astrocyte" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176043-250818-Astro_20250927_221710/JZ25176043-250818-Astro_outs/filtered_cell_gene_matrix"
)
n <- names(sample_dir)
work_dir <- file.path(work_dir, paste0(n, "_DoubletFinder"))
dir.create(work_dir)
setwd(work_dir)
counts <- Read10X(sample_dir)
seu_obj <- CreateSeuratObject(
counts = counts, project = n,
min.cells = 10
)
seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^mt-")
p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".qc_violin.raw.pdf"),
width = 800,
height = 400
)
p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
{
print(p1 + p2)
},
file = paste0(n, ".qc_scatter.raw.pdf"),
width = 900,
height = 400
)
seu_obj <- subset(seu_obj, subset = nCount_RNA > 500 & nFeature_RNA > 250 & percent.mt < 10)
p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".qc_violin.lo.filtered.pdf"),
width = 800,
height = 400
)
p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
{
print(p1 + p2)
},
file = paste0(n, ".qc_scatter.lo.filtered.pdf"),
width = 900,
height = 400
)
seu_obj <- SCTransform(seu_obj, vars.to.regress = "percent.mt", variable.features.n = 3000)
seu_obj <- RunPCA(seu_obj, npcs = 100, features = VariableFeatures(object = seu_obj))
VizDimLoadings(seu_obj, dims = 1:2, reduction = "pca")
DimPlot(seu_obj, reduction = "pca") + NoLegend()
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
pdf(
file = paste0(n, ".lo.filtered.pc_dim_heatmap.", range[1], "-", range[2], ".pdf"),
width = 9, height = 12
)
DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], cells = 500, balanced = TRUE)
dev.off()
}
p <- ElbowPlot(seu_obj, ndims = 100) + scale_x_continuous(breaks = scales::breaks_extended(20))
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".lo.filtered.elbow.pdf"),
width = 1000,
height = 400
)
pcs <- 1:50
seu_obj <- FindNeighbors(seu_obj, dims = pcs)
# Stable clusters:
# 从 parent clusters 的角度看,其不应该是 ≥ 2 个 parent clusters 的混合。
# 从 child clusters 的角度看,其 child clusters 应该只来源于其自身。
# 即随着 resolution 的增加,一个 stable cluster 应该要么保持不变,
# 要么干净的分裂为自己的 child clusters(child clusters 只来源自身)。
resolutions <- seq(0.5, 2, 0.2)
for (resolution in resolutions) {
seu_obj <- FindClusters(seu_obj, resolution = resolution)
}
p <- clustree(seu_obj[[]], prefix = "SCT_snn_res.", node_label_size = 10, show_axis = TRUE)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".lo.filtered.resolutions.cluster_tree.pdf"),
width = 1500,
height = 700
)
seu_obj <- RunUMAP(seu_obj, dims = pcs)
for (resolution_column in paste0("SCT_snn_res.", resolutions)) {
p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = resolution_column, label = TRUE) + NoLegend()
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".lo.filtered.umap.", resolution_column, ".pdf"),
width = 800,
height = 800
)
}
resolution <- 0.7
## Use DoubletFinder to identify heterotypic doublets (not homotypic doublets)
# the way DoubletFinder identifies heterotypic doublets can be broken up into 4 steps:
# 1. generate artifical doublets from existing scRNA-seq data;
# 2. pre-process merged real-artificial data;
# 3. perform PCA and use the PC distance matrix to find each cell's proportion of artifical k nearest neighbors (pANN);
# 4. rank order and threshold pANN values based on the expected number of doublets.
# DoubletFinder needs the following arguments:
# 1. seu: a fully-processed Seurat object (i.e., after NormalizeData, FindVariableFeatures, ScaleData, RunPCA, FindNeighbors, FindClusters, and RunUMAP);
# 2. PCs: the number of PCs;
# 3. pN: the number of generated artifical doublets, expressed as a proportion of the merged real-artificial data;
# 4. pK: the PC neighborhood size used to compute pANN, expressed as a proportion of the merged real-artificial data;
# 5. nExp: this defines the pANN threshold used to make final doublet/singlet predictions.
# 关于 DoubletFinder 的输入:
# 不应该用于来源于不同库的混合样本,因为来源于不同库的细胞之间就没有形成 doublets 的可能性(物理隔绝)。
# 分别应用于来源于同一库的样本的子(分割)样本是可以的。
# 确保过滤掉低质量的细胞,如低 nCount_RNA/nFeature_RNA,高 percent.mt。
# pK Identification
# DoubletFinder performance is largely invariant of pN value selection
sweep_res_list <- paramSweep(seu_obj, PCs = pcs, sct = TRUE, num.cores = 80)
sweep_stats <- summarizeSweep(sweep_res_list, GT = FALSE)
bcmvn <- find.pK(sweep_stats)
# maximize AUC for data with ground-truth doublet classifications
# maximize mean-variance normalized bimodality coefficient (BCmvn) for data without ground-truth info
optimal_pK <- as.numeric(as.character(bcmvn$pK[which.max(bcmvn$BCmetric)]))
p <- ggplot(bcmvn, aes(pK, BCmetric)) +
geom_line(group = 1) +
geom_point() +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".DoubletFinder.bcmvn.line.pdf"),
width = 700,
height = 400
)
vroom_write(bcmvn, file = paste0(n, ".DoubletFinder.bcmvn.tsv"), col_names = TRUE, append = FALSE)
# homotypic doublet proportion estimate
# DoubletFinder is insensitive to homotypic doublets
# it will be subtracted from the expected doublet rate
# linear interpolation
# please contact your single-cell service provider to obtain
# the relationship between the number of recovered cells and the estimated doublet rate
estimated_doublet_rate <- (0.03 - 0.025) / (6000 - 5000) * (nrow(seu_obj[[]]) - 5000) + 0.025
# cell type (state) diversity is used to estimate the homotypic doublet proportion
# under the hypothesis that any two cells may form a doublet with equal chance
homotypic_prop <- modelHomotypic(seu_obj[[]][[paste0("SCT_snn_res.", resolution)]])
nExp_poi <- round(estimated_doublet_rate * nrow(seu_obj[[]]))
nExp_poi_adj <- round(nExp_poi * (1 - homotypic_prop))
# run DoubletFinder with varying classification stringencies
seu_obj <- doubletFinder(seu_obj, PCs = pcs, pN = 0.25, pK = optimal_pK, nExp = nExp_poi, reuse.pANN = NULL, sct = TRUE)
seu_obj <- doubletFinder(seu_obj, PCs = pcs, pN = 0.25, pK = optimal_pK, nExp = nExp_poi_adj, reuse.pANN = paste0("pANN_", 0.25, "_", optimal_pK, "_", nExp_poi), sct = TRUE)
p1 <- DimPlot(seu_obj,
label.size = 6, repel = TRUE, reduction = "umap", label = TRUE,
group.by = paste0("SCT_snn_res.", resolution),
cells.highlight = list("Doublet" = row.names(seu_obj[[]])[seu_obj[[]][[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)]] == "Doublet"]),
cols.highlight = "#EE4000", sizes.highlight = 2, alpha = 0.6
) + ggtitle(paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi))
p2 <- DimPlot(seu_obj,
label.size = 6, repel = TRUE, reduction = "umap", label = TRUE,
group.by = paste0("SCT_snn_res.", resolution),
cells.highlight = list("Doublet_adj" = row.names(seu_obj[[]])[seu_obj[[]][[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)]] == "Doublet"]),
cols.highlight = "#EE4000", sizes.highlight = 2, alpha = 0.6
) + ggtitle(paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj))
ugd_save_inline(
{
print(p1 | p2)
},
file = paste0(n, ".doublet_labeled.umap.SCT_snn_res.", resolution, ".pdf"),
width = 1400,
height = 700
)
nExp_poi_dbl_stats_df <- inner_join(
seu_obj[[]] %>%
select(all_of(paste0("SCT_snn_res.", resolution))) %>%
group_by(.data[[paste0("SCT_snn_res.", resolution)]]) %>%
reframe(cluster_n = n()),
seu_obj[[]] %>%
select(all_of(c(paste0("SCT_snn_res.", resolution), paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)))) %>%
group_by(.data[[paste0("SCT_snn_res.", resolution)]], .data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)]]) %>%
reframe(n = n()),
by = paste0("SCT_snn_res.", resolution)
) %>%
mutate(percent = n / cluster_n * 100) %>%
arrange(.data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)]], desc(percent))
vroom_write(nExp_poi_dbl_stats_df, file = paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi, ".nExp.doublet_proportion.tsv"), col_names = TRUE, append = FALSE)
nExp_poi_adj_dbl_stats_df <- inner_join(
seu_obj[[]] %>%
select(all_of(paste0("SCT_snn_res.", resolution))) %>%
group_by(.data[[paste0("SCT_snn_res.", resolution)]]) %>%
reframe(cluster_n = n()),
seu_obj[[]] %>%
select(all_of(c(paste0("SCT_snn_res.", resolution), paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)))) %>%
group_by(.data[[paste0("SCT_snn_res.", resolution)]], .data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)]]) %>%
reframe(n = n()),
by = paste0("SCT_snn_res.", resolution)
) %>%
mutate(percent = n / cluster_n * 100) %>%
arrange(.data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)]], desc(percent))
vroom_write(nExp_poi_adj_dbl_stats_df, file = paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj, ".nExp_adj.doublet_proportion.tsv"), col_names = TRUE, append = FALSE)
vroom_write(seu_obj[[]] %>% mutate(cell_barcode = row.names(.)), file = "DoubletFinder.seurat_metadata.tsv", col_names = TRUE, append = FALSE)
saveRDS(seu_obj, file = "DoubletFinder.seu_obj_to_umap.rds")1 Some concepts
1.1 Outlier detection methods for 1D metrics
1.1.1 \(\sigma\)-based
数据需服从正态分布。\(\sigma\) 对异常值很敏感,因此适合于极端严格的场景。
基于 \(3\sigma\) 原则,即 \(\mu \pm \sigma\) 应覆盖 \(68\%\) 的数据,\(\mu \pm 2\sigma\) 应覆盖 \(95\%\) 的数据,而 \(\mu \pm 3\sigma\) 应覆盖 \(99.7\%\) 的数据。
1.1.2 MAD-based
中位数绝对偏差(Median Absolute Deviation, MAD)是一个无分布假设,对异常值不敏感(稳健)的异常值检测方法。
先计算 \(MAD = median(|x_i - median(x)|)\),再计算 3 倍 MAD,即 \(median(x) \pm 3 \times MAD\)。
也有一个与 3 倍 \(\sigma\) 对齐的改进方案:
其假设当 \(n \rightarrow \infty\) 时有 \(median \rightarrow \mu\) 且 \(E(MAD) = \sigma / 1.4826\),其中 \(1.4826 \approx b = \frac{1}{\Phi^{-1}(0.75)}\) 为一个渐进因子,而 \(\Phi^{-1}\) 为标准正态分布的累积分布函数的反函数。
因此引入 \(\sigma_{MAD} = 1.4826 \times MAD\)。
对于小样本(e.g., \(n \lt 100\)),\(\sigma_{MAD}\) 会低估 \(\sigma\),因此有一个简单地修正公式 \(\sigma_{MAD,corrected} = 1.4826 \times \frac{n}{n-1} \times MAD\),其满足当 \(n \rightarrow \infty\) 时有 \(\sigma_{MAD,corrected} \rightarrow \sigma_{MAD}\)。
注:
时间序列数据应使用观测值和模型预测值之间的差值,即残差来计算 MAD。另外,对于残差,通常也不应用全局 MAD,应用滑动窗口 MAD,因为数据的波动性可能会随时间变化。
多维数据有其对应的处理指标。
注:不论是 \(\sigma\)-based,还是 MAD-based,推荐先过滤完 empty/dying cells 再计算用于过滤的上限,例如使用 nCount_RNA > 500 & nFeature_RNA > 250。应用他们的前提是大部分细胞都是高质量的。
2 Main analysis workflow
2.1 From QC to cell type annotations
2.1.1 Astrocyte-treated sample
Predict doublets using DoubletFinder:
Predict doublets using scDblFinder:
library(Seurat)
library(patchwork)
library(YRUtils)
library(unigd)
library(tidyverse)
library(clustree)
library(scDblFinder)
library(vroom)
set.seed(123456)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
sample_dir <- c(
"Astrocyte" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176043-250818-Astro_20250927_221710/JZ25176043-250818-Astro_outs/filtered_cell_gene_matrix"
)
n <- names(sample_dir)
work_dir <- file.path(work_dir, paste0(n, "_scDblFinder"))
dir.create(work_dir)
setwd(work_dir)
counts <- Read10X(sample_dir)
seu_obj <- CreateSeuratObject(
counts = counts, project = n,
min.cells = 10
)
seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^mt-")
p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".qc_violin.raw.pdf"),
width = 800,
height = 400
)
p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
{
print(p1 + p2)
},
file = paste0(n, ".qc_scatter.raw.pdf"),
width = 900,
height = 400
)
seu_obj <- subset(seu_obj, subset = nCount_RNA > 500 & nFeature_RNA > 250 & percent.mt < 10)
p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".qc_violin.lo.filtered.pdf"),
width = 800,
height = 400
)
p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
{
print(p1 + p2)
},
file = paste0(n, ".qc_scatter.lo.filtered.pdf"),
width = 900,
height = 400
)
seu_obj <- SCTransform(seu_obj, vars.to.regress = "percent.mt", variable.features.n = 3000)
seu_obj <- RunPCA(seu_obj, npcs = 100, features = VariableFeatures(object = seu_obj))
VizDimLoadings(seu_obj, dims = 1:2, reduction = "pca")
DimPlot(seu_obj, reduction = "pca") + NoLegend()
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
pdf(
file = paste0(n, ".lo.filtered.pc_dim_heatmap.", range[1], "-", range[2], ".pdf"),
width = 9, height = 12
)
DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], cells = 500, balanced = TRUE)
dev.off()
}
p <- ElbowPlot(seu_obj, ndims = 100) + scale_x_continuous(breaks = scales::breaks_extended(20))
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".lo.filtered.elbow.pdf"),
width = 1000,
height = 400
)
pcs <- 1:50
seu_obj <- FindNeighbors(seu_obj, dims = pcs)
# Stable clusters:
# 从 parent clusters 的角度看,其不应该是 ≥ 2 个 parent clusters 的混合。
# 从 child clusters 的角度看,其 child clusters 应该只来源于其自身。
# 即随着 resolution 的增加,一个 stable cluster 应该要么保持不变,
# 要么干净的分裂为自己的 child clusters(child clusters 只来源自身)。
resolutions <- seq(0.5, 2, 0.2)
for (resolution in resolutions) {
seu_obj <- FindClusters(seu_obj, resolution = resolution)
}
p <- clustree(seu_obj[[]], prefix = "SCT_snn_res.", node_label_size = 10, show_axis = TRUE)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".lo.filtered.resolutions.cluster_tree.pdf"),
width = 1500,
height = 700
)
seu_obj <- RunUMAP(seu_obj, dims = pcs)
for (resolution_column in paste0("SCT_snn_res.", resolutions)) {
p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = resolution_column, label = TRUE) + NoLegend()
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".lo.filtered.umap.", resolution_column, ".pdf"),
width = 800,
height = 800
)
}
resolution <- 0.7
## Use scDblFinder to identify heterotypic doublets (not homotypic doublets)
# linear interpolation
# please contact your single-cell service provider to obtain
# the relationship between the number of recovered cells and the estimated doublet rate
estimated_doublet_rate <- (0.03 - 0.025) / (6000 - 5000) * (nrow(seu_obj[[]]) - 5000) + 0.025
Idents(seu_obj) <- paste0("SCT_snn_res.", resolution)
sce <- scDblFinder(
GetAssayData(seu_obj, assay = "RNA", slot = "counts"),
clusters = Idents(seu_obj),
dbr = estimated_doublet_rate
)
seu_obj <- AddMetaData(seu_obj, metadata = as.data.frame(sce@colData))
p <- DimPlot(seu_obj,
label.size = 6, repel = TRUE, reduction = "umap", label = TRUE,
group.by = paste0("SCT_snn_res.", resolution),
cells.highlight = list("Doublet" = row.names(seu_obj[[]])[seu_obj[[]][["scDblFinder.class"]] == "doublet"]),
cols.highlight = "#EE4000", sizes.highlight = 2, alpha = 0.6
) + ggtitle("scDblFinder.class")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".doublet_labeled.umap.SCT_snn_res.", resolution, ".pdf"),
width = 700,
height = 700
)
dbl_stats_df <- inner_join(
seu_obj[[]] %>%
select(all_of(paste0("SCT_snn_res.", resolution))) %>%
group_by(.data[[paste0("SCT_snn_res.", resolution)]]) %>%
reframe(cluster_n = n()),
seu_obj[[]] %>%
select(all_of(c(paste0("SCT_snn_res.", resolution), "scDblFinder.class"))) %>%
group_by(.data[[paste0("SCT_snn_res.", resolution)]], .data[["scDblFinder.class"]]) %>%
reframe(n = n()),
by = paste0("SCT_snn_res.", resolution)
) %>%
mutate(percent = n / cluster_n * 100) %>%
arrange(.data[["scDblFinder.class"]], desc(percent))
vroom_write(dbl_stats_df, file = "scDblFinder.doublet_proportion.tsv", col_names = TRUE, append = FALSE)
vroom_write(seu_obj[[]] %>% mutate(cell_barcode = row.names(.)), file = "scDblFinder.seurat_metadata.tsv", col_names = TRUE, append = FALSE)
saveRDS(seu_obj, file = "scDblFinder.seu_obj_to_umap.rds")Standard Seurat processing workflow: remove low-quality cells/doublets.
library(Seurat)
library(patchwork)
library(YRUtils)
library(unigd)
library(tidyverse)
library(clustree)
library(vroom)
library(magrittr)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
sample_dir <- c(
"Astrocyte" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176043-250818-Astro_20250927_221710/JZ25176043-250818-Astro_outs/filtered_cell_gene_matrix"
)
doubletfinder_metadata_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Astrocyte_DoubletFinder/DoubletFinder.seurat_metadata.tsv"
scdblfinder_metadata_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Astrocyte_scDblFinder/scDblFinder.seurat_metadata.tsv"
doubletfinder_column <- "DF.classifications_0.25_0.14_163"
n <- names(sample_dir)
work_dir <- file.path(work_dir, n)
dir.create(work_dir)
setwd(work_dir)
counts <- Read10X(sample_dir)
doubletfinder_metadata_df <- vroom(doubletfinder_metadata_file) %>%
select(all_of(c("cell_barcode", doubletfinder_column)))
scdblfinder_metadata_df <- vroom(scdblfinder_metadata_file) %>%
select(cell_barcode, scDblFinder.class)
doublet_metadata_df <- full_join(doubletfinder_metadata_df, scdblfinder_metadata_df, by = "cell_barcode")
if (any(is.na(doublet_metadata_df$cell_barcode))) {
stop("some cell barcodes are NAs")
}
doublet_metadata_df <- tibble(cell_barcode = colnames(counts)) %>%
left_join(doublet_metadata_df, by = "cell_barcode") %>%
mutate(
across(all_of(c(doubletfinder_column, "scDblFinder.class")), ~ replace(.x, is.na(.x), "singlet")),
across(all_of(c(doubletfinder_column, "scDblFinder.class")), str_to_lower),
merged_doublet_class = if_else(.data[[doubletfinder_column]] == "doublet" & scDblFinder.class == "doublet", "HC_Doublet",
if_else(.data[[doubletfinder_column]] == "doublet", "DF_Doublet",
if_else(scDblFinder.class == "doublet", "SDF_Doublet", "Singlet")
)
)
) %>%
as.data.frame() %>%
set_rownames(.[["cell_barcode"]]) %>%
select(-cell_barcode)
table(doublet_metadata_df$merged_doublet_class)
table(doublet_metadata_df[[doubletfinder_column]], doublet_metadata_df$scDblFinder.class)
seu_obj <- CreateSeuratObject(
counts = counts, project = n,
min.cells = 10, meta.data = doublet_metadata_df
)
seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^mt-")
p <- VlnPlot(seu_obj, group.by = "merged_doublet_class", features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 1, assay = "RNA", layer = "counts")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".qc_violin.raw.pdf"),
width = 500,
height = 900
)
p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
{
print(p1 + p2)
},
file = paste0(n, ".qc_scatter.raw.pdf"),
width = 900,
height = 400
)
seu_obj <- subset(seu_obj, subset = nCount_RNA > 500 & nFeature_RNA > 250 & percent.mt < 2.5 & merged_doublet_class == "Singlet")
p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".qc_violin.filtered.pdf"),
width = 800,
height = 400
)
p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
{
print(p1 + p2)
},
file = paste0(n, ".qc_scatter.filtered.pdf"),
width = 900,
height = 400
)
# quick assessment of cell cycle effects
# seu_obj <- NormalizeData(seu_obj, normalization.method = "LogNormalize", scale.factor = 10000)
# s.genes <- str_to_title(cc.genes$s.genes)
# g2m.genes <- str_to_title(cc.genes$g2m.genes)
# length(intersect(row.names(seu_obj), s.genes)) / length(s.genes)
# length(intersect(row.names(seu_obj), g2m.genes)) / length(g2m.genes)
# 针对单个细胞计算基因集富集分数
# 对于每一个细胞,比较目标基因集的平均表达量与一组随机挑选的、但表达水平相似的对照基因集的平均表达量
# 基于每个基因在所有细胞中的平均表达量对其进行分箱(binning)
# 为目标基因集中的每一个基因匹配 N (e.g., 100) 个对照基因,由此得到对照基因集
# 针对每一个细胞,分别计算目标基因集和对照基因集的平均表达量
# 计算最终得分:score = target_mean - control_mean
# seu_obj <- CellCycleScoring(seu_obj, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)
# seu_obj$CC.Difference <- seu_obj$S.Score - seu_obj$G2M.Score
# seu_obj <- ScaleData(seu_obj, assay = "RNA", features = c(s.genes, g2m.genes))
# seu_obj <- RunPCA(seu_obj, assay = "RNA", features = c(s.genes, g2m.genes), reduction.name = "cell_cycle_genes.pca")
# DimPlot(seu_obj, reduction = "cell_cycle_genes.pca")
seu_obj <- SCTransform(seu_obj, vars.to.regress = "percent.mt", variable.features.n = 3000)
seu_obj <- RunPCA(seu_obj, npcs = 100, features = VariableFeatures(object = seu_obj))
VizDimLoadings(seu_obj, dims = 1:2, reduction = "pca")
DimPlot(seu_obj, reduction = "pca") + NoLegend()
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
pdf(
file = paste0(n, ".pc_dim_heatmap.", range[1], "-", range[2], ".pdf"),
width = 9, height = 12
)
DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], cells = 500, balanced = TRUE)
dev.off()
}
p <- ElbowPlot(seu_obj, ndims = 100) + scale_x_continuous(breaks = scales::breaks_extended(20))
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".elbow.pdf"),
width = 1000,
height = 400
)
pcs <- 1:30
seu_obj <- FindNeighbors(seu_obj, dims = pcs)
# Stable clusters:
# 从 parent clusters 的角度看,其不应该是 ≥ 2 个 parent clusters 的混合。
# 从 child clusters 的角度看,其 child clusters 应该只来源于其自身。
# 即随着 resolution 的增加,一个 stable cluster 应该要么保持不变,
# 要么干净的分裂为自己的 child clusters(child clusters 只来源自身)。
resolutions <- seq(0.5, 2, 0.2)
for (resolution in resolutions) {
seu_obj <- FindClusters(seu_obj, resolution = resolution)
}
p <- clustree(seu_obj[[]], prefix = "SCT_snn_res.", node_label_size = 10, show_axis = TRUE)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".resolutions.cluster_tree.pdf"),
width = 1500,
height = 700
)
seu_obj <- RunUMAP(seu_obj, dims = pcs)
for (resolution_column in paste0("SCT_snn_res.", resolutions)) {
p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = resolution_column, label = TRUE) + NoLegend()
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".umap.", resolution_column, ".pdf"),
width = 800,
height = 800
)
}
resolution <- 0.5
Idents(seu_obj) <- paste0("SCT_snn_res.", resolution)
saveRDS(seu_obj, file = "seu_obj_to_umap.rds")Annotate cell types based on canonical markers/DEGs:
library(Seurat)
library(vroom)
library(tidyverse)
library(YRUtils)
library(unigd)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
seu_obj_rds_file <- c(
"Astrocyte" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Astrocyte/seu_obj_to_umap.rds"
)
canonical_markers_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/data/mm_hippocampus_gene_markers.txt"
resolution_column <- "SCT_snn_res.0.5"
n <- names(seu_obj_rds_file)
work_dir <- file.path(work_dir, paste0(n, "_annotate_cell_types"))
dir.create(work_dir)
setwd(work_dir)
seu_obj <- readRDS(seu_obj_rds_file)
Idents(seu_obj) <- resolution_column
DefaultAssay(seu_obj) <- "RNA"
seu_obj <- NormalizeData(seu_obj)
seu_obj <- ScaleData(seu_obj, features = row.names(seu_obj))
seu_obj_markers <- FindAllMarkers(seu_obj, only.pos = TRUE, logfc.threshold = 0.5, min.pct = 0.25)
seu_obj_markers <- seu_obj_markers %>%
mutate(pct_diff = pct.1 - pct.2) %>%
arrange(cluster, p_val_adj, desc(avg_log2FC), desc(pct_diff))
vroom_write(seu_obj_markers, file = "seu_obj.pos_markers.tsv", col_names = TRUE, append = FALSE)
top_n <- 10
top_seu_obj_markers <- seu_obj_markers %>%
group_by(cluster) %>%
filter(avg_log2FC > 1 & p_val_adj < 0.05) %>%
slice_head(n = top_n) %>%
ungroup()
p <- DoHeatmap(seu_obj, features = top_seu_obj_markers$gene, label = TRUE, group.bar.height = 0.005, raster = FALSE)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".top", top_n, "_DEGs.heatmap.pdf"),
width = 1200,
height = 3200
)
canonical_markers_df <- vroom(canonical_markers_file) %>%
separate_longer_delim(cols = "marker", delim = "/") %>%
mutate(marker = str_to_title(trimws(marker))) %>%
distinct() %>%
filter(marker %in% row.names(seu_obj)) %>%
mutate(
marker = factor(marker, levels = marker),
rank = 1:n()
)
table(duplicated(canonical_markers_df$marker))
cell_type_anno_vec <- sapply(
split(canonical_markers_df, canonical_markers_df$cell_type),
function(df) sum(df$rank) / nrow(df)
)
cell_type_anno_df <- tibble(cell_type = names(cell_type_anno_vec), x = cell_type_anno_vec)
p1 <- DotPlot(
seu_obj,
features = canonical_markers_df$marker,
cluster.idents = TRUE,
dot.min = 0.1
) +
theme(
axis.text.x.bottom = element_text(angle = 90, hjust = 1, vjust = 0.5),
axis.title = element_blank(),
plot.margin = margin(0, 0, 0, 0)
)
Idents(seu_obj) <- factor(Idents(seu_obj), levels = levels(p1$data$id))
p2 <- ggplot() +
geom_tile(
mapping = aes(x = marker, y = 1, fill = cell_type),
data = canonical_markers_df, color = NA,
height = 2
) +
geom_text(
mapping = aes(x, y = -0.5, label = cell_type), data = cell_type_anno_df,
angle = 90, vjust = 0.5, hjust = 1
) +
scale_y_continuous(
expand = expansion(0),
limits = c(-50, 2)
) +
theme_void() +
theme(
legend.position = "none",
plot.margin = margin(0, 0, 0, 0)
)
p <- aplot::insert_bottom(p1, p2, height = 1)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".canonical_markers.dotplot.pdf"),
width = 1500,
height = 1000
)
p <- VlnPlot(
seu_obj,
features = canonical_markers_df$marker,
split.by = "ident",
stack = TRUE,
raster = TRUE
) + NoLegend()
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".canonical_markers.stacked_violin.pdf"),
width = 1400,
height = 600
)
p1 <- VlnPlot(seu_obj, features = "percent.mt", group.by = resolution_column) +
geom_hline(yintercept = 2.5) +
NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p2 <- VlnPlot(seu_obj, features = "nCount_RNA", group.by = resolution_column) +
NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p3 <- VlnPlot(seu_obj, features = "nFeature_RNA", group.by = resolution_column) +
NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p <- p1 / p2 / p3
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".nCount_nFeature_MT.qc_violin.cluster-wise.pdf"),
width = 1000,
height = 1000
)
cell_types <- c(
"12" = "OPC", "1" = "Oligodendrocytes", "6" = "Microglia",
"7" = "Astrocytes", "13" = "VLMCs", "9" = "Sst IN",
"10" = "IN", "0" = "DG EN", "11" = "EN1", "5" = "EN2",
"8" = "EN3", "2" = "EN4", "4" = "EN5", "3" = "PLN",
"14" = "Cajal Retzius EN", "15" = "Unknown"
)
seu_obj <- RenameIdents(seu_obj, cell_types)
seu_obj$cell_types <- Idents(seu_obj)
p <- DimPlot(seu_obj, label.size = 4, repel = TRUE, reduction = "umap", group.by = "cell_types", label = TRUE)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".umap.cell_types.pdf"),
width = 600,
height = 400
)
saveRDS(seu_obj, file = "seu_obj_to_annotation.rds")2.1.2 Glioma-treated sample
Predict doublets using DoubletFinder:
library(Seurat)
library(patchwork)
library(YRUtils)
library(unigd)
library(tidyverse)
library(clustree)
library(DoubletFinder)
library(vroom)
set.seed(123456)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
sample_dir <- c(
"Glioma" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176044-250818-Glioma_20250928_004852/JZ25176044-250818-Glioma_outs/filtered_cell_gene_matrix"
)
n <- names(sample_dir)
work_dir <- file.path(work_dir, paste0(n, "_DoubletFinder"))
dir.create(work_dir)
setwd(work_dir)
counts <- Read10X(sample_dir)
seu_obj <- CreateSeuratObject(
counts = counts, project = n,
min.cells = 10
)
seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^mt-")
p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".qc_violin.raw.pdf"),
width = 800,
height = 400
)
p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
{
print(p1 + p2)
},
file = paste0(n, ".qc_scatter.raw.pdf"),
width = 900,
height = 400
)
seu_obj <- subset(seu_obj, subset = nCount_RNA > 500 & nFeature_RNA > 250 & percent.mt < 10)
p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".qc_violin.lo.filtered.pdf"),
width = 800,
height = 400
)
p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
{
print(p1 + p2)
},
file = paste0(n, ".qc_scatter.lo.filtered.pdf"),
width = 900,
height = 400
)
seu_obj <- SCTransform(seu_obj, vars.to.regress = "percent.mt", variable.features.n = 3000)
seu_obj <- RunPCA(seu_obj, npcs = 100, features = VariableFeatures(object = seu_obj))
VizDimLoadings(seu_obj, dims = 1:2, reduction = "pca")
DimPlot(seu_obj, reduction = "pca") + NoLegend()
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
pdf(
file = paste0(n, ".lo.filtered.pc_dim_heatmap.", range[1], "-", range[2], ".pdf"),
width = 9, height = 12
)
DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], cells = 500, balanced = TRUE)
dev.off()
}
p <- ElbowPlot(seu_obj, ndims = 100) + scale_x_continuous(breaks = scales::breaks_extended(20))
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".lo.filtered.elbow.pdf"),
width = 1000,
height = 400
)
pcs <- 1:50
seu_obj <- FindNeighbors(seu_obj, dims = pcs)
# Stable clusters:
# 从 parent clusters 的角度看,其不应该是 ≥ 2 个 parent clusters 的混合。
# 从 child clusters 的角度看,其 child clusters 应该只来源于其自身。
# 即随着 resolution 的增加,一个 stable cluster 应该要么保持不变,
# 要么干净的分裂为自己的 child clusters(child clusters 只来源自身)。
resolutions <- seq(0.5, 2, 0.2)
for (resolution in resolutions) {
seu_obj <- FindClusters(seu_obj, resolution = resolution)
}
p <- clustree(seu_obj[[]], prefix = "SCT_snn_res.", node_label_size = 10, show_axis = TRUE)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".lo.filtered.resolutions.cluster_tree.pdf"),
width = 1500,
height = 700
)
seu_obj <- RunUMAP(seu_obj, dims = pcs)
for (resolution_column in paste0("SCT_snn_res.", resolutions)) {
p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = resolution_column, label = TRUE) + NoLegend()
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".lo.filtered.umap.", resolution_column, ".pdf"),
width = 800,
height = 800
)
}
resolution <- 0.7
## Use DoubletFinder to identify heterotypic doublets (not homotypic doublets)
# the way DoubletFinder identifies heterotypic doublets can be broken up into 4 steps:
# 1. generate artifical doublets from existing scRNA-seq data;
# 2. pre-process merged real-artificial data;
# 3. perform PCA and use the PC distance matrix to find each cell's proportion of artifical k nearest neighbors (pANN);
# 4. rank order and threshold pANN values based on the expected number of doublets.
# DoubletFinder needs the following arguments:
# 1. seu: a fully-processed Seurat object (i.e., after NormalizeData, FindVariableFeatures, ScaleData, RunPCA, FindNeighbors, FindClusters, and RunUMAP);
# 2. PCs: the number of PCs;
# 3. pN: the number of generated artifical doublets, expressed as a proportion of the merged real-artificial data;
# 4. pK: the PC neighborhood size used to compute pANN, expressed as a proportion of the merged real-artificial data;
# 5. nExp: this defines the pANN threshold used to make final doublet/singlet predictions.
# 关于 DoubletFinder 的输入:
# 不应该用于来源于不同库的混合样本,因为来源于不同库的细胞之间就没有形成 doublets 的可能性(物理隔绝)。
# 分别应用于来源于同一库的样本的子(分割)样本是可以的。
# 确保过滤掉低质量的细胞,如低 nCount_RNA/nFeature_RNA,高 percent.mt。
# pK Identification
# DoubletFinder performance is largely invariant of pN value selection
sweep_res_list <- paramSweep(seu_obj, PCs = pcs, sct = TRUE, num.cores = 80)
sweep_stats <- summarizeSweep(sweep_res_list, GT = FALSE)
bcmvn <- find.pK(sweep_stats)
# maximize AUC for data with ground-truth doublet classifications
# maximize mean-variance normalized bimodality coefficient (BCmvn) for data without ground-truth info
optimal_pK <- as.numeric(as.character(bcmvn$pK[which.max(bcmvn$BCmetric)]))
p <- ggplot(bcmvn, aes(pK, BCmetric)) +
geom_line(group = 1) +
geom_point() +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".DoubletFinder.bcmvn.line.pdf"),
width = 700,
height = 400
)
vroom_write(bcmvn, file = paste0(n, ".DoubletFinder.bcmvn.tsv"), col_names = TRUE, append = FALSE)
# homotypic doublet proportion estimate
# DoubletFinder is insensitive to homotypic doublets
# it will be subtracted from the expected doublet rate
# linear interpolation
# please contact your single-cell service provider to obtain
# the relationship between the number of recovered cells and the estimated doublet rate
estimated_doublet_rate <- (0.03 - 0.025) / (6000 - 5000) * (nrow(seu_obj[[]]) - 5000) + 0.025
# cell type (state) diversity is used to estimate the homotypic doublet proportion
# under the hypothesis that any two cells may form a doublet with equal chance
homotypic_prop <- modelHomotypic(seu_obj[[]][[paste0("SCT_snn_res.", resolution)]])
nExp_poi <- round(estimated_doublet_rate * nrow(seu_obj[[]]))
nExp_poi_adj <- round(nExp_poi * (1 - homotypic_prop))
# run DoubletFinder with varying classification stringencies
seu_obj <- doubletFinder(seu_obj, PCs = pcs, pN = 0.25, pK = optimal_pK, nExp = nExp_poi, reuse.pANN = NULL, sct = TRUE)
seu_obj <- doubletFinder(seu_obj, PCs = pcs, pN = 0.25, pK = optimal_pK, nExp = nExp_poi_adj, reuse.pANN = paste0("pANN_", 0.25, "_", optimal_pK, "_", nExp_poi), sct = TRUE)
p1 <- DimPlot(seu_obj,
label.size = 6, repel = TRUE, reduction = "umap", label = TRUE,
group.by = paste0("SCT_snn_res.", resolution),
cells.highlight = list("Doublet" = row.names(seu_obj[[]])[seu_obj[[]][[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)]] == "Doublet"]),
cols.highlight = "#EE4000", sizes.highlight = 2, alpha = 0.6
) + ggtitle(paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi))
p2 <- DimPlot(seu_obj,
label.size = 6, repel = TRUE, reduction = "umap", label = TRUE,
group.by = paste0("SCT_snn_res.", resolution),
cells.highlight = list("Doublet_adj" = row.names(seu_obj[[]])[seu_obj[[]][[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)]] == "Doublet"]),
cols.highlight = "#EE4000", sizes.highlight = 2, alpha = 0.6
) + ggtitle(paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj))
ugd_save_inline(
{
print(p1 | p2)
},
file = paste0(n, ".doublet_labeled.umap.SCT_snn_res.", resolution, ".pdf"),
width = 1400,
height = 700
)
nExp_poi_dbl_stats_df <- inner_join(
seu_obj[[]] %>%
select(all_of(paste0("SCT_snn_res.", resolution))) %>%
group_by(.data[[paste0("SCT_snn_res.", resolution)]]) %>%
reframe(cluster_n = n()),
seu_obj[[]] %>%
select(all_of(c(paste0("SCT_snn_res.", resolution), paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)))) %>%
group_by(.data[[paste0("SCT_snn_res.", resolution)]], .data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)]]) %>%
reframe(n = n()),
by = paste0("SCT_snn_res.", resolution)
) %>%
mutate(percent = n / cluster_n * 100) %>%
arrange(.data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)]], desc(percent))
vroom_write(nExp_poi_dbl_stats_df, file = paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi, ".nExp.doublet_proportion.tsv"), col_names = TRUE, append = FALSE)
nExp_poi_adj_dbl_stats_df <- inner_join(
seu_obj[[]] %>%
select(all_of(paste0("SCT_snn_res.", resolution))) %>%
group_by(.data[[paste0("SCT_snn_res.", resolution)]]) %>%
reframe(cluster_n = n()),
seu_obj[[]] %>%
select(all_of(c(paste0("SCT_snn_res.", resolution), paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)))) %>%
group_by(.data[[paste0("SCT_snn_res.", resolution)]], .data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)]]) %>%
reframe(n = n()),
by = paste0("SCT_snn_res.", resolution)
) %>%
mutate(percent = n / cluster_n * 100) %>%
arrange(.data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)]], desc(percent))
vroom_write(nExp_poi_adj_dbl_stats_df, file = paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj, ".nExp_adj.doublet_proportion.tsv"), col_names = TRUE, append = FALSE)
vroom_write(seu_obj[[]] %>% mutate(cell_barcode = row.names(.)), file = "DoubletFinder.seurat_metadata.tsv", col_names = TRUE, append = FALSE)
saveRDS(seu_obj, file = "DoubletFinder.seu_obj_to_umap.rds")Predict doublets using scDblFinder:
library(Seurat)
library(patchwork)
library(YRUtils)
library(unigd)
library(tidyverse)
library(clustree)
library(scDblFinder)
library(vroom)
set.seed(123456)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
sample_dir <- c(
"Glioma" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176044-250818-Glioma_20250928_004852/JZ25176044-250818-Glioma_outs/filtered_cell_gene_matrix"
)
n <- names(sample_dir)
work_dir <- file.path(work_dir, paste0(n, "_scDblFinder"))
dir.create(work_dir)
setwd(work_dir)
counts <- Read10X(sample_dir)
seu_obj <- CreateSeuratObject(
counts = counts, project = n,
min.cells = 10
)
seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^mt-")
p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".qc_violin.raw.pdf"),
width = 800,
height = 400
)
p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
{
print(p1 + p2)
},
file = paste0(n, ".qc_scatter.raw.pdf"),
width = 900,
height = 400
)
seu_obj <- subset(seu_obj, subset = nCount_RNA > 500 & nFeature_RNA > 250 & percent.mt < 10)
p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".qc_violin.lo.filtered.pdf"),
width = 800,
height = 400
)
p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
{
print(p1 + p2)
},
file = paste0(n, ".qc_scatter.lo.filtered.pdf"),
width = 900,
height = 400
)
seu_obj <- SCTransform(seu_obj, vars.to.regress = "percent.mt", variable.features.n = 3000)
seu_obj <- RunPCA(seu_obj, npcs = 100, features = VariableFeatures(object = seu_obj))
VizDimLoadings(seu_obj, dims = 1:2, reduction = "pca")
DimPlot(seu_obj, reduction = "pca") + NoLegend()
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
pdf(
file = paste0(n, ".lo.filtered.pc_dim_heatmap.", range[1], "-", range[2], ".pdf"),
width = 9, height = 12
)
DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], cells = 500, balanced = TRUE)
dev.off()
}
p <- ElbowPlot(seu_obj, ndims = 100) + scale_x_continuous(breaks = scales::breaks_extended(20))
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".lo.filtered.elbow.pdf"),
width = 1000,
height = 400
)
pcs <- 1:50
seu_obj <- FindNeighbors(seu_obj, dims = pcs)
# Stable clusters:
# 从 parent clusters 的角度看,其不应该是 ≥ 2 个 parent clusters 的混合。
# 从 child clusters 的角度看,其 child clusters 应该只来源于其自身。
# 即随着 resolution 的增加,一个 stable cluster 应该要么保持不变,
# 要么干净的分裂为自己的 child clusters(child clusters 只来源自身)。
resolutions <- seq(0.5, 2, 0.2)
for (resolution in resolutions) {
seu_obj <- FindClusters(seu_obj, resolution = resolution)
}
p <- clustree(seu_obj[[]], prefix = "SCT_snn_res.", node_label_size = 10, show_axis = TRUE)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".lo.filtered.resolutions.cluster_tree.pdf"),
width = 1500,
height = 700
)
seu_obj <- RunUMAP(seu_obj, dims = pcs)
for (resolution_column in paste0("SCT_snn_res.", resolutions)) {
p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = resolution_column, label = TRUE) + NoLegend()
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".lo.filtered.umap.", resolution_column, ".pdf"),
width = 800,
height = 800
)
}
resolution <- 0.7
## Use scDblFinder to identify heterotypic doublets (not homotypic doublets)
# linear interpolation
# please contact your single-cell service provider to obtain
# the relationship between the number of recovered cells and the estimated doublet rate
estimated_doublet_rate <- (0.03 - 0.025) / (6000 - 5000) * (nrow(seu_obj[[]]) - 5000) + 0.025
Idents(seu_obj) <- paste0("SCT_snn_res.", resolution)
sce <- scDblFinder(
GetAssayData(seu_obj, assay = "RNA", slot = "counts"),
clusters = Idents(seu_obj),
dbr = estimated_doublet_rate
)
seu_obj <- AddMetaData(seu_obj, metadata = as.data.frame(sce@colData))
p <- DimPlot(seu_obj,
label.size = 6, repel = TRUE, reduction = "umap", label = TRUE,
group.by = paste0("SCT_snn_res.", resolution),
cells.highlight = list("Doublet" = row.names(seu_obj[[]])[seu_obj[[]][["scDblFinder.class"]] == "doublet"]),
cols.highlight = "#EE4000", sizes.highlight = 2, alpha = 0.6
) + ggtitle("scDblFinder.class")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".doublet_labeled.umap.SCT_snn_res.", resolution, ".pdf"),
width = 700,
height = 700
)
dbl_stats_df <- inner_join(
seu_obj[[]] %>%
select(all_of(paste0("SCT_snn_res.", resolution))) %>%
group_by(.data[[paste0("SCT_snn_res.", resolution)]]) %>%
reframe(cluster_n = n()),
seu_obj[[]] %>%
select(all_of(c(paste0("SCT_snn_res.", resolution), "scDblFinder.class"))) %>%
group_by(.data[[paste0("SCT_snn_res.", resolution)]], .data[["scDblFinder.class"]]) %>%
reframe(n = n()),
by = paste0("SCT_snn_res.", resolution)
) %>%
mutate(percent = n / cluster_n * 100) %>%
arrange(.data[["scDblFinder.class"]], desc(percent))
vroom_write(dbl_stats_df, file = "scDblFinder.doublet_proportion.tsv", col_names = TRUE, append = FALSE)
vroom_write(seu_obj[[]] %>% mutate(cell_barcode = row.names(.)), file = "scDblFinder.seurat_metadata.tsv", col_names = TRUE, append = FALSE)
saveRDS(seu_obj, file = "scDblFinder.seu_obj_to_umap.rds")Standard Seurat processing workflow: remove low-quality cells/doublets.
library(Seurat)
library(patchwork)
library(YRUtils)
library(unigd)
library(tidyverse)
library(clustree)
library(vroom)
library(magrittr)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
sample_dir <- c(
"Glioma" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176044-250818-Glioma_20250928_004852/JZ25176044-250818-Glioma_outs/filtered_cell_gene_matrix"
)
doubletfinder_metadata_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Glioma_DoubletFinder/DoubletFinder.seurat_metadata.tsv"
scdblfinder_metadata_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Glioma_scDblFinder/scDblFinder.seurat_metadata.tsv"
doubletfinder_column <- "DF.classifications_0.25_0.25_126"
n <- names(sample_dir)
work_dir <- file.path(work_dir, n)
dir.create(work_dir)
setwd(work_dir)
counts <- Read10X(sample_dir)
doubletfinder_metadata_df <- vroom(doubletfinder_metadata_file) %>%
select(all_of(c("cell_barcode", doubletfinder_column)))
scdblfinder_metadata_df <- vroom(scdblfinder_metadata_file) %>%
select(cell_barcode, scDblFinder.class)
doublet_metadata_df <- full_join(doubletfinder_metadata_df, scdblfinder_metadata_df, by = "cell_barcode")
if (any(is.na(doublet_metadata_df$cell_barcode))) {
stop("some cell barcodes are NAs")
}
doublet_metadata_df <- tibble(cell_barcode = colnames(counts)) %>%
left_join(doublet_metadata_df, by = "cell_barcode") %>%
mutate(
across(all_of(c(doubletfinder_column, "scDblFinder.class")), ~ replace(.x, is.na(.x), "singlet")),
across(all_of(c(doubletfinder_column, "scDblFinder.class")), str_to_lower),
merged_doublet_class = if_else(.data[[doubletfinder_column]] == "doublet" & scDblFinder.class == "doublet", "HC_Doublet",
if_else(.data[[doubletfinder_column]] == "doublet", "DF_Doublet",
if_else(scDblFinder.class == "doublet", "SDF_Doublet", "Singlet")
)
)
) %>%
as.data.frame() %>%
set_rownames(.[["cell_barcode"]]) %>%
select(-cell_barcode)
table(doublet_metadata_df$merged_doublet_class)
table(doublet_metadata_df[[doubletfinder_column]], doublet_metadata_df$scDblFinder.class)
seu_obj <- CreateSeuratObject(
counts = counts, project = n,
min.cells = 10, meta.data = doublet_metadata_df
)
seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^mt-")
p <- VlnPlot(seu_obj, group.by = "merged_doublet_class", features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 1, assay = "RNA", layer = "counts")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".qc_violin.raw.pdf"),
width = 500,
height = 900
)
p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
{
print(p1 + p2)
},
file = paste0(n, ".qc_scatter.raw.pdf"),
width = 900,
height = 400
)
seu_obj <- subset(seu_obj, subset = nCount_RNA > 500 & nFeature_RNA > 250 & percent.mt < 2.5 & merged_doublet_class == "Singlet")
p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".qc_violin.filtered.pdf"),
width = 800,
height = 400
)
p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
{
print(p1 + p2)
},
file = paste0(n, ".qc_scatter.filtered.pdf"),
width = 900,
height = 400
)
seu_obj <- SCTransform(seu_obj, vars.to.regress = "percent.mt", variable.features.n = 3000)
seu_obj <- RunPCA(seu_obj, npcs = 100, features = VariableFeatures(object = seu_obj))
VizDimLoadings(seu_obj, dims = 1:2, reduction = "pca")
DimPlot(seu_obj, reduction = "pca") + NoLegend()
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
pdf(
file = paste0(n, ".pc_dim_heatmap.", range[1], "-", range[2], ".pdf"),
width = 9, height = 12
)
DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], cells = 500, balanced = TRUE)
dev.off()
}
p <- ElbowPlot(seu_obj, ndims = 100) + scale_x_continuous(breaks = scales::breaks_extended(20))
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".elbow.pdf"),
width = 1000,
height = 400
)
pcs <- 1:25
seu_obj <- FindNeighbors(seu_obj, dims = pcs)
# Stable clusters:
# 从 parent clusters 的角度看,其不应该是 ≥ 2 个 parent clusters 的混合。
# 从 child clusters 的角度看,其 child clusters 应该只来源于其自身。
# 即随着 resolution 的增加,一个 stable cluster 应该要么保持不变,
# 要么干净的分裂为自己的 child clusters(child clusters 只来源自身)。
resolutions <- seq(0.5, 2, 0.2)
for (resolution in resolutions) {
seu_obj <- FindClusters(seu_obj, resolution = resolution)
}
p <- clustree(seu_obj[[]], prefix = "SCT_snn_res.", node_label_size = 10, show_axis = TRUE)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".resolutions.cluster_tree.pdf"),
width = 1500,
height = 700
)
seu_obj <- RunUMAP(seu_obj, dims = pcs)
for (resolution_column in paste0("SCT_snn_res.", resolutions)) {
p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = resolution_column, label = TRUE) + NoLegend()
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".umap.", resolution_column, ".pdf"),
width = 800,
height = 800
)
}
resolution <- 0.5
Idents(seu_obj) <- paste0("SCT_snn_res.", resolution)
saveRDS(seu_obj, file = "seu_obj_to_umap.rds")Annotate cell types based on canonical markers/DEGs:
library(Seurat)
library(vroom)
library(tidyverse)
library(YRUtils)
library(unigd)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
seu_obj_rds_file <- c(
"Glioma" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Glioma/seu_obj_to_umap.rds"
)
canonical_markers_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/data/mm_hippocampus_gene_markers.txt"
resolution_column <- "SCT_snn_res.0.5"
n <- names(seu_obj_rds_file)
work_dir <- file.path(work_dir, paste0(n, "_annotate_cell_types"))
dir.create(work_dir)
setwd(work_dir)
seu_obj <- readRDS(seu_obj_rds_file)
Idents(seu_obj) <- resolution_column
DefaultAssay(seu_obj) <- "RNA"
seu_obj <- NormalizeData(seu_obj)
seu_obj <- ScaleData(seu_obj, features = row.names(seu_obj))
seu_obj_markers <- FindAllMarkers(seu_obj, only.pos = TRUE, logfc.threshold = 0.5, min.pct = 0.25)
seu_obj_markers <- seu_obj_markers %>%
mutate(pct_diff = pct.1 - pct.2) %>%
arrange(cluster, p_val_adj, desc(avg_log2FC), desc(pct_diff))
vroom_write(seu_obj_markers, file = "seu_obj.pos_markers.tsv", col_names = TRUE, append = FALSE)
top_n <- 10
top_seu_obj_markers <- seu_obj_markers %>%
group_by(cluster) %>%
filter(avg_log2FC > 1 & p_val_adj < 0.05) %>%
slice_head(n = top_n) %>%
ungroup()
p <- DoHeatmap(seu_obj, features = top_seu_obj_markers$gene, label = TRUE, group.bar.height = 0.005, raster = FALSE)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".top", top_n, "_DEGs.heatmap.pdf"),
width = 1200,
height = 3200
)
canonical_markers_df <- vroom(canonical_markers_file) %>%
separate_longer_delim(cols = "marker", delim = "/") %>%
mutate(marker = str_to_title(trimws(marker))) %>%
distinct() %>%
filter(marker %in% row.names(seu_obj)) %>%
mutate(
marker = factor(marker, levels = marker),
rank = 1:n()
)
table(duplicated(canonical_markers_df$marker))
cell_type_anno_vec <- sapply(
split(canonical_markers_df, canonical_markers_df$cell_type),
function(df) sum(df$rank) / nrow(df)
)
cell_type_anno_df <- tibble(cell_type = names(cell_type_anno_vec), x = cell_type_anno_vec)
p1 <- DotPlot(
seu_obj,
features = canonical_markers_df$marker,
cluster.idents = TRUE,
dot.min = 0.1
) +
theme(
axis.text.x.bottom = element_text(angle = 90, hjust = 1, vjust = 0.5),
axis.title = element_blank(),
plot.margin = margin(0, 0, 0, 0)
)
Idents(seu_obj) <- factor(Idents(seu_obj), levels = levels(p1$data$id))
p2 <- ggplot() +
geom_tile(
mapping = aes(x = marker, y = 1, fill = cell_type),
data = canonical_markers_df, color = NA,
height = 2
) +
geom_text(
mapping = aes(x, y = -0.5, label = cell_type), data = cell_type_anno_df,
angle = 90, vjust = 0.5, hjust = 1
) +
scale_y_continuous(
expand = expansion(0),
limits = c(-50, 2)
) +
theme_void() +
theme(
legend.position = "none",
plot.margin = margin(0, 0, 0, 0)
)
p <- aplot::insert_bottom(p1, p2, height = 1)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".canonical_markers.dotplot.pdf"),
width = 1500,
height = 1000
)
p <- VlnPlot(
seu_obj,
features = canonical_markers_df$marker,
split.by = "ident",
stack = TRUE,
raster = TRUE
) + NoLegend()
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".canonical_markers.stacked_violin.pdf"),
width = 1400,
height = 600
)
p1 <- VlnPlot(seu_obj, features = "percent.mt", group.by = resolution_column) +
geom_hline(yintercept = 2.5) +
NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p2 <- VlnPlot(seu_obj, features = "nCount_RNA", group.by = resolution_column) +
NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p3 <- VlnPlot(seu_obj, features = "nFeature_RNA", group.by = resolution_column) +
NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p <- p1 / p2 / p3
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".nCount_nFeature_MT.qc_violin.cluster-wise.pdf"),
width = 1000,
height = 1000
)
cell_types <- c(
"12" = "OPC", "1" = "Oligodendrocytes", "7" = "Microglia",
"3" = "Astrocytes", "13" = "VLMCs", "9" = "Sst IN",
"10" = "Cck IN", "0" = "DG EN", "6" = "PLN",
"11" = "EN1", "8" = "EN2", "5" = "EN3", "2" = "EN4",
"4" = "EN5", "14" = "EN6"
)
seu_obj <- RenameIdents(seu_obj, cell_types)
seu_obj$cell_types <- Idents(seu_obj)
p <- DimPlot(seu_obj, label.size = 4, repel = TRUE, reduction = "umap", group.by = "cell_types", label = TRUE)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".umap.cell_types.pdf"),
width = 600,
height = 400
)
saveRDS(seu_obj, file = "seu_obj_to_annotation.rds")2.2 Integrative analysis
2.2.1 Integration between astrocyte-treated sample and glioma-treated sample
Integration with CCA:
library(Seurat)
library(patchwork)
library(unigd)
library(tidyverse)
library(clustree)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
seu_obj_rds_files <- c(
"Astrocyte" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Astrocyte_annotate_cell_types/seu_obj_to_annotation.rds",
"Glioma" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Glioma_annotate_cell_types/seu_obj_to_annotation.rds"
)
work_dir <- file.path(work_dir, paste0(paste0(names(seu_obj_rds_files), collapse = "_"), ".integration"))
dir.create(work_dir)
setwd(work_dir)
seu_obj_ls <- list()
for (n in names(seu_obj_rds_files)) {
seu_obj_ls[[n]] <- readRDS(seu_obj_rds_files[n])
}
seu_obj <- merge(x = seu_obj_ls[[1]], y = seu_obj_ls[[2]])
n_hvgs <- 3000
seu_obj <- SCTransform(seu_obj, vars.to.regress = "percent.mt", variable.features.n = n_hvgs)
seu_obj <- RunPCA(seu_obj, npcs = 100, features = VariableFeatures(object = seu_obj))
VizDimLoadings(seu_obj, dims = 1:2, reduction = "pca")
DimPlot(seu_obj, reduction = "pca") + NoLegend()
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
pdf(
file = paste0(paste0(names(seu_obj_rds_files), collapse = "_"), ".pc_dim_heatmap.", range[1], "-", range[2], ".pdf"),
width = 9, height = 12
)
DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], cells = 500, balanced = TRUE)
dev.off()
}
p <- ElbowPlot(seu_obj, ndims = 100) + scale_x_continuous(breaks = scales::breaks_extended(20))
ugd_save_inline(
{
print(p)
},
file = paste0(paste0(names(seu_obj_rds_files), collapse = "_"), ".elbow.pdf"),
width = 1000,
height = 400
)
# the number of PCs specified here differs from the number of PCs (controlled by the `dims` parameter) used for dataset integration
# just for checking batch effects
pcs <- 1:30
seu_obj <- RunUMAP(seu_obj, dims = pcs)
p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = c("orig.ident", "cell_types"), label = TRUE)
ugd_save_inline(
{
print(p)
},
file = paste0(paste0(names(seu_obj_rds_files), collapse = "_"), ".umap.group.before_integration.pdf"),
width = 1600,
height = 800
)
p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", split.by = "orig.ident", label = TRUE)
ugd_save_inline(
{
print(p)
},
file = paste0(paste0(names(seu_obj_rds_files), collapse = "_"), ".umap.split.before_integration.pdf"),
width = 1600,
height = 800
)
if (inherits(seu_obj[[DefaultAssay(seu_obj)]], what = "SCTAssay")) {
features <- SelectSCTIntegrationFeatures(seu_obj, assay = DefaultAssay(seu_obj), nfeatures = n_hvgs)
} else {
stop("please run SCTransform first and set SCT as the default assay")
}
# CCA
# extremely slow
# here, we leave the parameter `dims` unset
seu_obj <- IntegrateLayers(
seu_obj,
features = features, assay = DefaultAssay(seu_obj),
orig.reduction = "pca", new.reduction = "integrated.cca",
method = CCAIntegration, normalization.method = "SCT"
)
seu_obj[["RNA"]] <- JoinLayers(seu_obj[["RNA"]])
ncol(seu_obj@reductions$integrated.cca)
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
pdf(
file = paste0(paste0(names(seu_obj_rds_files), collapse = "_"), ".cca_dim_heatmap.", range[1], "-", range[2], ".pdf"),
width = 9, height = 12
)
DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], reduction = "integrated.cca", cells = 500, balanced = TRUE)
dev.off()
}
pcs <- 1:30
seu_obj <- FindNeighbors(seu_obj, dims = pcs, reduction = "integrated.cca")
# Stable clusters:
# 从 parent clusters 的角度看,其不应该是 ≥ 2 个 parent clusters 的混合。
# 从 child clusters 的角度看,其 child clusters 应该只来源于其自身。
# 即随着 resolution 的增加,一个 stable cluster 应该要么保持不变,
# 要么干净的分裂为自己的 child clusters(child clusters 只来源自身)。
resolutions <- seq(0.2, 2, 0.1)
for (resolution in resolutions) {
seu_obj <- FindClusters(seu_obj, resolution = resolution)
}
p <- clustree(seu_obj[[]], prefix = "SCT_snn_res.", node_label_size = 10, show_axis = TRUE)
ugd_save_inline(
{
print(p)
},
file = paste0(paste0(names(seu_obj_rds_files), collapse = "_"), ".resolutions.cluster_tree.pdf"),
width = 2000,
height = 1400
)
seu_obj <- RunUMAP(seu_obj, dims = pcs, reduction = "integrated.cca")
for (resolution_column in paste0("SCT_snn_res.", resolutions)) {
p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = resolution_column, label = TRUE) + NoLegend()
ugd_save_inline(
{
print(p)
},
file = paste0(paste0(names(seu_obj_rds_files), collapse = "_"), ".umap.", resolution_column, ".pdf"),
width = 800,
height = 800
)
}
resolution <- 0.5
Idents(seu_obj) <- paste0("SCT_snn_res.", resolution)
saveRDS(seu_obj, file = "seu_obj_to_umap.rds")Annotate cell types based on canonical markers/DEGs:
library(Seurat)
library(vroom)
library(tidyverse)
library(YRUtils)
library(unigd)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
seu_obj_rds_file <- c(
"Astrocyte_Glioma" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Astrocyte_Glioma.integration/seu_obj_to_umap.rds"
)
canonical_markers_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/data/mm_hippocampus_gene_markers.txt"
resolution_column <- "SCT_snn_res.0.4"
n <- names(seu_obj_rds_file)
work_dir <- file.path(work_dir, paste0(n, ".annotate_cell_types"))
dir.create(work_dir)
setwd(work_dir)
seu_obj <- readRDS(seu_obj_rds_file)
Idents(seu_obj) <- resolution_column
p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = c("orig.ident", "cell_types"), label = TRUE)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".umap.group.before_annotation.pdf"),
width = 1600,
height = 800
)
p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", split.by = "orig.ident", label = TRUE)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".umap.split.before_annotation.pdf"),
width = 1600,
height = 800
)
DefaultAssay(seu_obj) <- "RNA"
seu_obj <- NormalizeData(seu_obj)
seu_obj <- ScaleData(seu_obj, features = row.names(seu_obj))
seu_obj_markers <- FindAllMarkers(seu_obj, only.pos = TRUE, logfc.threshold = 0.5, min.pct = 0.25)
seu_obj_markers <- seu_obj_markers %>%
mutate(pct_diff = pct.1 - pct.2) %>%
arrange(cluster, p_val_adj, desc(avg_log2FC), desc(pct_diff))
vroom_write(seu_obj_markers, file = "seu_obj.pos_markers.tsv", col_names = TRUE, append = FALSE)
top_n <- 10
top_seu_obj_markers <- seu_obj_markers %>%
group_by(cluster) %>%
filter(avg_log2FC > 1 & p_val_adj < 0.05) %>%
slice_head(n = top_n) %>%
ungroup()
p <- DoHeatmap(seu_obj, features = top_seu_obj_markers$gene, label = TRUE, group.bar.height = 0.005, raster = FALSE)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".top", top_n, "_DEGs.heatmap.pdf"),
width = 1200,
height = 3200
)
canonical_markers_df <- vroom(canonical_markers_file) %>%
separate_longer_delim(cols = "marker", delim = "/") %>%
mutate(marker = str_to_title(trimws(marker))) %>%
distinct() %>%
filter(marker %in% row.names(seu_obj)) %>%
mutate(
marker = factor(marker, levels = marker),
rank = 1:n()
)
table(duplicated(canonical_markers_df$marker))
cell_type_anno_vec <- sapply(
split(canonical_markers_df, canonical_markers_df$cell_type),
function(df) sum(df$rank) / nrow(df)
)
cell_type_anno_df <- tibble(cell_type = names(cell_type_anno_vec), x = cell_type_anno_vec)
p1 <- DotPlot(
seu_obj,
features = canonical_markers_df$marker,
cluster.idents = TRUE,
dot.min = 0.1
) +
theme(
axis.text.x.bottom = element_text(angle = 90, hjust = 1, vjust = 0.5),
axis.title = element_blank(),
plot.margin = margin(0, 0, 0, 0)
)
Idents(seu_obj) <- factor(Idents(seu_obj), levels = levels(p1$data$id))
p2 <- ggplot() +
geom_tile(
mapping = aes(x = marker, y = 1, fill = cell_type),
data = canonical_markers_df, color = NA,
height = 2
) +
geom_text(
mapping = aes(x, y = -0.5, label = cell_type), data = cell_type_anno_df,
angle = 90, vjust = 0.5, hjust = 1
) +
scale_y_continuous(
expand = expansion(0),
limits = c(-50, 2)
) +
theme_void() +
theme(
legend.position = "none",
plot.margin = margin(0, 0, 0, 0)
)
p <- aplot::insert_bottom(p1, p2, height = 1)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".canonical_markers.dotplot.pdf"),
width = 1500,
height = 1000
)
p <- VlnPlot(
seu_obj,
features = canonical_markers_df$marker,
split.by = "ident",
stack = TRUE,
raster = TRUE
) + NoLegend()
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".canonical_markers.stacked_violin.pdf"),
width = 1400,
height = 600
)
p1 <- VlnPlot(seu_obj, features = "percent.mt", group.by = resolution_column) +
geom_hline(yintercept = 2.5) +
NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p2 <- VlnPlot(seu_obj, features = "nCount_RNA", group.by = resolution_column) +
NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p3 <- VlnPlot(seu_obj, features = "nFeature_RNA", group.by = resolution_column) +
NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p <- p1 / p2 / p3
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".nCount_nFeature_MT.qc_violin.cluster-wise.pdf"),
width = 1000,
height = 1000
)
cell_types <- c(
"12" = "OPC", "1" = "Oligodendrocytes", "8" = "Microglia",
"4" = "Astrocytes", "14" = "VLMCs", "9" = "Sst IN",
"10" = "Cck IN", "16" = "Cajal-Retzius EN",
"0" = "DG EN", "13" = "DG EN", "2" = "CA1 EN1",
"6" = "CA1 EN2", "7" = "CA1 EN3", "11" = "CA1 EN4",
"3" = "EN1", "5" = "PLN", "15" = "CA3 EN"
)
seu_obj <- RenameIdents(seu_obj, cell_types)
seu_obj$integrated_cell_types <- Idents(seu_obj)
p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = c("orig.ident", "integrated_cell_types"), label = TRUE)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".umap.group.after_annotation.pdf"),
width = 1400,
height = 600
)
p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", split.by = "orig.ident", label = TRUE)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".umap.split.after_annotation.pdf"),
width = 1400,
height = 600
)
saveRDS(seu_obj, file = "seu_obj_to_annotation.rds")DE analysis of each cell type between conditions: there is only one biological replicate here.
library(Seurat)
library(vroom)
library(tidyverse)
library(YRUtils)
library(unigd)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
gene_id_mpt_file <- "/data/database/annotation/release/mus_musculus/Mus_musculus.GRCm39.vM38.gencode.basic.chr.annotation.gene_id_name_mapping_table.tsv"
seu_obj_rds_file <- c(
"Astrocyte_Glioma" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Astrocyte_Glioma.annotate_cell_types/seu_obj_to_annotation.rds"
)
sample_levels <- c("Glioma", "Astrocyte")
n_th <- 50
n <- names(seu_obj_rds_file)
work_dir <- file.path(work_dir, paste0(n, ".de_analysis"))
dir.create(work_dir)
setwd(work_dir)
seu_obj <- readRDS(seu_obj_rds_file)
prop_df <- seu_obj[[]] %>%
select(orig.ident, integrated_cell_types) %>%
group_by(integrated_cell_types, orig.ident) %>%
reframe(n_condition = n()) %>%
group_by(integrated_cell_types) %>%
reframe(orig.ident, n_condition, n_total = sum(n_condition), pct_condition = n_condition / n_total) %>%
arrange(desc(n_total)) %>%
mutate(
integrated_cell_types = factor(integrated_cell_types, levels = unique(integrated_cell_types)),
orig.ident = factor(orig.ident, levels = sample_levels),
combined_ident = paste0(integrated_cell_types, "@", orig.ident),
gt_th = if_else(n_condition >= n_th, TRUE, FALSE)
)
vroom_write(
prop_df,
file = paste0(n, ".condition_prop_for_each_cell_type.tsv"),
col_names = TRUE, append = FALSE
)
p <- ggplot() +
geom_bar(
mapping = aes(integrated_cell_types, n_condition, fill = orig.ident),
data = prop_df, stat = "identity", color = NA
) +
geom_hline(yintercept = 100) +
labs(x = "Cell types", y = "Number of cells", fill = "Condition") +
scale_y_continuous(expand = expansion(0)) +
theme_classic(base_family = "Arial", base_size = 20) +
theme(
axis.text.x.bottom = element_text(angle = 45, vjust = 1, hjust = 1),
axis.text = element_text(colour = "black")
)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".condition_prop_for_each_cell_type.pdf"),
width = 700,
height = 400
)
seu_obj$combined_ident <- paste0(seu_obj$integrated_cell_types, "@", seu_obj$orig.ident)
Idents(seu_obj) <- "combined_ident"
gene_id_mpt <- vroom(gene_id_mpt_file)
for (cell_type in unique(prop_df$integrated_cell_types)) {
tmp_prop_df <- filter(prop_df, integrated_cell_types == cell_type)
if (nrow(tmp_prop_df) != 2 || !all(tmp_prop_df$gt_th)) {
message("skipping '", cell_type, "' ...")
next
}
downsample_n <- min(tmp_prop_df$n_condition)
ident_1 <- tmp_prop_df$combined_ident[tmp_prop_df$orig.ident == sample_levels[1]]
ident_2 <- tmp_prop_df$combined_ident[tmp_prop_df$orig.ident == sample_levels[2]]
de_res <- FindMarkers(
seu_obj,
ident.1 = ident_1,
ident.2 = ident_2,
logfc.threshold = 0,
test.use = "wilcox",
min.pct = 0.1,
min.diff.pct = -Inf,
only.pos = FALSE,
max.cells.per.ident = downsample_n,
random.seed = 123456
)
de_res <- de_res %>%
mutate(gene_name = row.names(.)) %>%
left_join(gene_id_mpt, by = "gene_name") %>%
arrange(desc(avg_log2FC), p_val_adj)
de_res_filtered <- filter(de_res, gene_biotype == "protein_coding") %>%
group_by(gene_name) %>%
slice_max(order_by = abs(avg_log2FC), n = 1) %>%
slice_sample(n = 1) %>%
arrange(desc(avg_log2FC), p_val_adj)
vroom_write(
de_res,
file = paste0(n, ".", gsub("@", "-", gsub(" ", "_", paste0(ident_1, "_vs_", ident_2))), ".raw.tsv"),
col_names = TRUE, append = FALSE
)
vroom_write(
de_res_filtered,
file = paste0(n, ".", gsub("@", "-", gsub(" ", "_", paste0(ident_1, "_vs_", ident_2))), ".unique.tsv"),
col_names = TRUE, append = FALSE
)
}GO enrichment analysis:
library(YRUtils)
library(tidyverse)
library(vroom)
library(clusterProfiler)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
orgdb_file <- "/data/database/annotation/release/mus_musculus/org.Mm.eg.db.sqlite"
degs_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Astrocyte_Glioma.de_analysis"
keytype <- "SYMBOL"
sample_levels <- c("Glioma", "Astrocyte")
logfc_th <- 0.5
pval_th <- 0.05
n <- "Astrocyte_Glioma"
work_dir <- file.path(work_dir, paste0(n, ".go_analysis"))
dir.create(work_dir)
setwd(work_dir)
orgdb <- AnnotationDbi::loadDb(orgdb_file)
# go_terms <- AnnotationDbi::Ontology(GO.db::GOTERM)
# go2gene <- AnnotationDbi::mapIds(orgdb, keys = names(go_terms), column = keytype, keytype = "GOALL", multiVals = "list")
# go_anno <- stack(go2gene)
# colnames(go_anno) <- c(keytype, "GOALL")
# go_anno <- unique(go_anno[!is.na(go_anno[, 1]), ])
# go_anno$ONTOLOGYALL <- go_terms[go_anno$GOALL]
# go_genes <- unique(na.omit(go_anno$SYMBOL))
degs_files <- list.files(degs_dir, pattern = "\\.unique\\.tsv$", full.names = TRUE, recursive = FALSE)
for (degs_file in degs_files) {
degs <- vroom(degs_file)
sig_degs <- degs %>%
dplyr::select(gene_name, avg_log2FC, p_val) %>%
na.omit() %>%
distinct() %>%
filter(abs(avg_log2FC) > logfc_th & p_val < pval_th) %>%
mutate(diff_flag = if_else(avg_log2FC > 0, sample_levels[1], sample_levels[2]))
if (nrow(sig_degs) > 0) {
gene_ls <- lapply(split(sig_degs, sig_degs$diff_flag), function(df) df[["gene_name"]])
} else {
message("no DEGs found in ", degs_file)
next
}
ego <- compareCluster(
gene_ls,
OrgDb = orgdb,
keyType = keytype,
fun = "enrichGO",
ont = "ALL",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
pAdjustMethod = "BH",
minGSSize = 10,
maxGSSize = 500,
readable = FALSE,
pool = FALSE
)
if (!is.null(ego)) {
saveRDS(ego, file = gsub("\\.unique\\.tsv$", ".GO_ALL.rds", basename(degs_file)))
vroom_write(
ego@compareClusterResult,
file = gsub("\\.unique\\.tsv$", ".GO_ALL.tsv", basename(degs_file)),
col_names = TRUE, append = FALSE
)
}
}3 Appendices
3.1 Remove ambient RNAs using decontX
3.1.1 Astrocyte-treated sample
Remove ambient RNAs using decontX:
library(Seurat)
library(decontX)
library(SingleCellExperiment)
filtered_mat_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176043-250818-Astro_20250927_221710/JZ25176043-250818-Astro_outs/filtered_cell_gene_matrix"
raw_mat_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176043-250818-Astro_20250927_221710/JZ25176043-250818-Astro_outs/raw_cell_gene_matrix"
output_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX/data"
n <- "Astrocyte"
filtered_counts <- Read10X(filtered_mat_dir)
raw_counts <- Read10X(raw_mat_dir)
filtered_sce <- SingleCellExperiment(list(counts = filtered_counts))
raw_sce <- SingleCellExperiment(list(counts = raw_counts))
filtered_sce <- decontX(filtered_sce, background = raw_sce)
filtered_seu_obj <- CreateSeuratObject(round(decontXcounts(filtered_sce)))
saveRDS(filtered_sce, file = file.path(output_dir, paste0(n, ".decontX_filtered.sce_obj.rds")))
saveRDS(filtered_seu_obj, file = file.path(output_dir, paste0(n, ".decontX_filtered.seu_obj.rds")))Predict doublets using DoubletFinder:
library(Seurat)
library(patchwork)
library(YRUtils)
library(unigd)
library(tidyverse)
library(clustree)
library(DoubletFinder)
library(vroom)
set.seed(123456)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX"
ori_seu_obj_rds_file <- c(
"Astrocyte" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX/data/Astrocyte.decontX_filtered.seu_obj.rds"
)
n <- names(ori_seu_obj_rds_file)
work_dir <- file.path(work_dir, paste0(n, "_DoubletFinder"))
dir.create(work_dir)
setwd(work_dir)
ori_seu_obj <- readRDS(ori_seu_obj_rds_file)
seu_obj <- CreateSeuratObject(
counts = GetAssayData(ori_seu_obj, assay = "RNA", layer = "counts"),
project = n,
min.cells = 10
)
seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^mt-")
p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".qc_violin.raw.pdf"),
width = 800,
height = 400
)
p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
{
print(p1 + p2)
},
file = paste0(n, ".qc_scatter.raw.pdf"),
width = 900,
height = 400
)
seu_obj <- subset(seu_obj, subset = nCount_RNA > 500 & nFeature_RNA > 250 & percent.mt < 10)
p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".qc_violin.lo.filtered.pdf"),
width = 800,
height = 400
)
p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
{
print(p1 + p2)
},
file = paste0(n, ".qc_scatter.lo.filtered.pdf"),
width = 900,
height = 400
)
seu_obj <- SCTransform(seu_obj, vars.to.regress = "percent.mt", variable.features.n = 3000)
seu_obj <- RunPCA(seu_obj, npcs = 100, features = VariableFeatures(object = seu_obj))
VizDimLoadings(seu_obj, dims = 1:2, reduction = "pca")
DimPlot(seu_obj, reduction = "pca") + NoLegend()
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
pdf(
file = paste0(n, ".lo.filtered.pc_dim_heatmap.", range[1], "-", range[2], ".pdf"),
width = 9, height = 12
)
DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], cells = 500, balanced = TRUE)
dev.off()
}
p <- ElbowPlot(seu_obj, ndims = 100) + scale_x_continuous(breaks = scales::breaks_extended(20))
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".lo.filtered.elbow.pdf"),
width = 1000,
height = 400
)
pcs <- 1:50
seu_obj <- FindNeighbors(seu_obj, dims = pcs)
# Stable clusters:
# 从 parent clusters 的角度看,其不应该是 ≥ 2 个 parent clusters 的混合。
# 从 child clusters 的角度看,其 child clusters 应该只来源于其自身。
# 即随着 resolution 的增加,一个 stable cluster 应该要么保持不变,
# 要么干净的分裂为自己的 child clusters(child clusters 只来源自身)。
resolutions <- seq(0.5, 2, 0.2)
for (resolution in resolutions) {
seu_obj <- FindClusters(seu_obj, resolution = resolution)
}
p <- clustree(seu_obj[[]], prefix = "SCT_snn_res.", node_label_size = 10, show_axis = TRUE)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".lo.filtered.resolutions.cluster_tree.pdf"),
width = 1500,
height = 700
)
seu_obj <- RunUMAP(seu_obj, dims = pcs)
for (resolution_column in paste0("SCT_snn_res.", resolutions)) {
p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = resolution_column, label = TRUE) + NoLegend()
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".lo.filtered.umap.", resolution_column, ".pdf"),
width = 800,
height = 800
)
}
resolution <- 0.7
## Use DoubletFinder to identify heterotypic doublets (not homotypic doublets)
# the way DoubletFinder identifies heterotypic doublets can be broken up into 4 steps:
# 1. generate artifical doublets from existing scRNA-seq data;
# 2. pre-process merged real-artificial data;
# 3. perform PCA and use the PC distance matrix to find each cell's proportion of artifical k nearest neighbors (pANN);
# 4. rank order and threshold pANN values based on the expected number of doublets.
# DoubletFinder needs the following arguments:
# 1. seu: a fully-processed Seurat object (i.e., after NormalizeData, FindVariableFeatures, ScaleData, RunPCA, FindNeighbors, FindClusters, and RunUMAP);
# 2. PCs: the number of PCs;
# 3. pN: the number of generated artifical doublets, expressed as a proportion of the merged real-artificial data;
# 4. pK: the PC neighborhood size used to compute pANN, expressed as a proportion of the merged real-artificial data;
# 5. nExp: this defines the pANN threshold used to make final doublet/singlet predictions.
# 关于 DoubletFinder 的输入:
# 不应该用于来源于不同库的混合样本,因为来源于不同库的细胞之间就没有形成 doublets 的可能性(物理隔绝)。
# 分别应用于来源于同一库的样本的子(分割)样本是可以的。
# 确保过滤掉低质量的细胞,如低 nCount_RNA/nFeature_RNA,高 percent.mt。
# pK Identification
# DoubletFinder performance is largely invariant of pN value selection
sweep_res_list <- paramSweep(seu_obj, PCs = pcs, sct = TRUE, num.cores = 80)
sweep_stats <- summarizeSweep(sweep_res_list, GT = FALSE)
bcmvn <- find.pK(sweep_stats)
# maximize AUC for data with ground-truth doublet classifications
# maximize mean-variance normalized bimodality coefficient (BCmvn) for data without ground-truth info
optimal_pK <- as.numeric(as.character(bcmvn$pK[which.max(bcmvn$BCmetric)]))
p <- ggplot(bcmvn, aes(pK, BCmetric)) +
geom_line(group = 1) +
geom_point() +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".DoubletFinder.bcmvn.line.pdf"),
width = 700,
height = 400
)
vroom_write(bcmvn, file = paste0(n, ".DoubletFinder.bcmvn.tsv"), col_names = TRUE, append = FALSE)
# homotypic doublet proportion estimate
# DoubletFinder is insensitive to homotypic doublets
# it will be subtracted from the expected doublet rate
# linear interpolation
# please contact your single-cell service provider to obtain
# the relationship between the number of recovered cells and the estimated doublet rate
estimated_doublet_rate <- (0.03 - 0.025) / (6000 - 5000) * (nrow(seu_obj[[]]) - 5000) + 0.025
# cell type (state) diversity is used to estimate the homotypic doublet proportion
# under the hypothesis that any two cells may form a doublet with equal chance
homotypic_prop <- modelHomotypic(seu_obj[[]][[paste0("SCT_snn_res.", resolution)]])
nExp_poi <- round(estimated_doublet_rate * nrow(seu_obj[[]]))
nExp_poi_adj <- round(nExp_poi * (1 - homotypic_prop))
# run DoubletFinder with varying classification stringencies
seu_obj <- doubletFinder(seu_obj, PCs = pcs, pN = 0.25, pK = optimal_pK, nExp = nExp_poi, reuse.pANN = NULL, sct = TRUE)
seu_obj <- doubletFinder(seu_obj, PCs = pcs, pN = 0.25, pK = optimal_pK, nExp = nExp_poi_adj, reuse.pANN = paste0("pANN_", 0.25, "_", optimal_pK, "_", nExp_poi), sct = TRUE)
p1 <- DimPlot(seu_obj,
label.size = 6, repel = TRUE, reduction = "umap", label = TRUE,
group.by = paste0("SCT_snn_res.", resolution),
cells.highlight = list("Doublet" = row.names(seu_obj[[]])[seu_obj[[]][[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)]] == "Doublet"]),
cols.highlight = "#EE4000", sizes.highlight = 2, alpha = 0.6
) + ggtitle(paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi))
p2 <- DimPlot(seu_obj,
label.size = 6, repel = TRUE, reduction = "umap", label = TRUE,
group.by = paste0("SCT_snn_res.", resolution),
cells.highlight = list("Doublet_adj" = row.names(seu_obj[[]])[seu_obj[[]][[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)]] == "Doublet"]),
cols.highlight = "#EE4000", sizes.highlight = 2, alpha = 0.6
) + ggtitle(paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj))
ugd_save_inline(
{
print(p1 | p2)
},
file = paste0(n, ".doublet_labeled.umap.SCT_snn_res.", resolution, ".pdf"),
width = 1400,
height = 700
)
nExp_poi_dbl_stats_df <- inner_join(
seu_obj[[]] %>%
select(all_of(paste0("SCT_snn_res.", resolution))) %>%
group_by(.data[[paste0("SCT_snn_res.", resolution)]]) %>%
reframe(cluster_n = n()),
seu_obj[[]] %>%
select(all_of(c(paste0("SCT_snn_res.", resolution), paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)))) %>%
group_by(.data[[paste0("SCT_snn_res.", resolution)]], .data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)]]) %>%
reframe(n = n()),
by = paste0("SCT_snn_res.", resolution)
) %>%
mutate(percent = n / cluster_n * 100) %>%
arrange(.data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)]], desc(percent))
vroom_write(nExp_poi_dbl_stats_df, file = paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi, ".nExp.doublet_proportion.tsv"), col_names = TRUE, append = FALSE)
nExp_poi_adj_dbl_stats_df <- inner_join(
seu_obj[[]] %>%
select(all_of(paste0("SCT_snn_res.", resolution))) %>%
group_by(.data[[paste0("SCT_snn_res.", resolution)]]) %>%
reframe(cluster_n = n()),
seu_obj[[]] %>%
select(all_of(c(paste0("SCT_snn_res.", resolution), paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)))) %>%
group_by(.data[[paste0("SCT_snn_res.", resolution)]], .data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)]]) %>%
reframe(n = n()),
by = paste0("SCT_snn_res.", resolution)
) %>%
mutate(percent = n / cluster_n * 100) %>%
arrange(.data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)]], desc(percent))
vroom_write(nExp_poi_adj_dbl_stats_df, file = paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj, ".nExp_adj.doublet_proportion.tsv"), col_names = TRUE, append = FALSE)
vroom_write(seu_obj[[]] %>% mutate(cell_barcode = row.names(.)), file = "DoubletFinder.seurat_metadata.tsv", col_names = TRUE, append = FALSE)
saveRDS(seu_obj, file = "DoubletFinder.seu_obj_to_umap.rds")Predict doublets using scDblFinder:
library(Seurat)
library(patchwork)
library(YRUtils)
library(unigd)
library(tidyverse)
library(clustree)
library(scDblFinder)
library(vroom)
set.seed(123456)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX"
ori_seu_obj_rds_file <- c(
"Astrocyte" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX/data/Astrocyte.decontX_filtered.seu_obj.rds"
)
n <- names(ori_seu_obj_rds_file)
work_dir <- file.path(work_dir, paste0(n, "_scDblFinder"))
dir.create(work_dir)
setwd(work_dir)
ori_seu_obj <- readRDS(ori_seu_obj_rds_file)
seu_obj <- CreateSeuratObject(
counts = GetAssayData(ori_seu_obj, assay = "RNA", layer = "counts"),
project = n,
min.cells = 10
)
seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^mt-")
p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".qc_violin.raw.pdf"),
width = 800,
height = 400
)
p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
{
print(p1 + p2)
},
file = paste0(n, ".qc_scatter.raw.pdf"),
width = 900,
height = 400
)
seu_obj <- subset(seu_obj, subset = nCount_RNA > 500 & nFeature_RNA > 250 & percent.mt < 10)
p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".qc_violin.lo.filtered.pdf"),
width = 800,
height = 400
)
p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
{
print(p1 + p2)
},
file = paste0(n, ".qc_scatter.lo.filtered.pdf"),
width = 900,
height = 400
)
seu_obj <- SCTransform(seu_obj, vars.to.regress = "percent.mt", variable.features.n = 3000)
seu_obj <- RunPCA(seu_obj, npcs = 100, features = VariableFeatures(object = seu_obj))
VizDimLoadings(seu_obj, dims = 1:2, reduction = "pca")
DimPlot(seu_obj, reduction = "pca") + NoLegend()
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
pdf(
file = paste0(n, ".lo.filtered.pc_dim_heatmap.", range[1], "-", range[2], ".pdf"),
width = 9, height = 12
)
DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], cells = 500, balanced = TRUE)
dev.off()
}
p <- ElbowPlot(seu_obj, ndims = 100) + scale_x_continuous(breaks = scales::breaks_extended(20))
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".lo.filtered.elbow.pdf"),
width = 1000,
height = 400
)
pcs <- 1:50
seu_obj <- FindNeighbors(seu_obj, dims = pcs)
# Stable clusters:
# 从 parent clusters 的角度看,其不应该是 ≥ 2 个 parent clusters 的混合。
# 从 child clusters 的角度看,其 child clusters 应该只来源于其自身。
# 即随着 resolution 的增加,一个 stable cluster 应该要么保持不变,
# 要么干净的分裂为自己的 child clusters(child clusters 只来源自身)。
resolutions <- seq(0.5, 2, 0.2)
for (resolution in resolutions) {
seu_obj <- FindClusters(seu_obj, resolution = resolution)
}
p <- clustree(seu_obj[[]], prefix = "SCT_snn_res.", node_label_size = 10, show_axis = TRUE)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".lo.filtered.resolutions.cluster_tree.pdf"),
width = 1500,
height = 700
)
seu_obj <- RunUMAP(seu_obj, dims = pcs)
for (resolution_column in paste0("SCT_snn_res.", resolutions)) {
p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = resolution_column, label = TRUE) + NoLegend()
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".lo.filtered.umap.", resolution_column, ".pdf"),
width = 800,
height = 800
)
}
resolution <- 0.7
## Use scDblFinder to identify heterotypic doublets (not homotypic doublets)
# linear interpolation
# please contact your single-cell service provider to obtain
# the relationship between the number of recovered cells and the estimated doublet rate
estimated_doublet_rate <- (0.03 - 0.025) / (6000 - 5000) * (nrow(seu_obj[[]]) - 5000) + 0.025
Idents(seu_obj) <- paste0("SCT_snn_res.", resolution)
sce <- scDblFinder(
GetAssayData(seu_obj, assay = "RNA", slot = "counts"),
clusters = Idents(seu_obj),
dbr = estimated_doublet_rate
)
seu_obj <- AddMetaData(seu_obj, metadata = as.data.frame(sce@colData))
p <- DimPlot(seu_obj,
label.size = 6, repel = TRUE, reduction = "umap", label = TRUE,
group.by = paste0("SCT_snn_res.", resolution),
cells.highlight = list("Doublet" = row.names(seu_obj[[]])[seu_obj[[]][["scDblFinder.class"]] == "doublet"]),
cols.highlight = "#EE4000", sizes.highlight = 2, alpha = 0.6
) + ggtitle("scDblFinder.class")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".doublet_labeled.umap.SCT_snn_res.", resolution, ".pdf"),
width = 700,
height = 700
)
dbl_stats_df <- inner_join(
seu_obj[[]] %>%
select(all_of(paste0("SCT_snn_res.", resolution))) %>%
group_by(.data[[paste0("SCT_snn_res.", resolution)]]) %>%
reframe(cluster_n = n()),
seu_obj[[]] %>%
select(all_of(c(paste0("SCT_snn_res.", resolution), "scDblFinder.class"))) %>%
group_by(.data[[paste0("SCT_snn_res.", resolution)]], .data[["scDblFinder.class"]]) %>%
reframe(n = n()),
by = paste0("SCT_snn_res.", resolution)
) %>%
mutate(percent = n / cluster_n * 100) %>%
arrange(.data[["scDblFinder.class"]], desc(percent))
vroom_write(dbl_stats_df, file = "scDblFinder.doublet_proportion.tsv", col_names = TRUE, append = FALSE)
vroom_write(seu_obj[[]] %>% mutate(cell_barcode = row.names(.)), file = "scDblFinder.seurat_metadata.tsv", col_names = TRUE, append = FALSE)
saveRDS(seu_obj, file = "scDblFinder.seu_obj_to_umap.rds")Standard Seurat processing workflow: remove low-quality cells/doublets.
library(Seurat)
library(patchwork)
library(YRUtils)
library(unigd)
library(tidyverse)
library(clustree)
library(vroom)
library(magrittr)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX"
ori_seu_obj_rds_file <- c(
"Astrocyte" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX/data/Astrocyte.decontX_filtered.seu_obj.rds"
)
doubletfinder_metadata_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX/Astrocyte_DoubletFinder/DoubletFinder.seurat_metadata.tsv"
scdblfinder_metadata_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX/Astrocyte_scDblFinder/scDblFinder.seurat_metadata.tsv"
doubletfinder_column <- "DF.classifications_0.25_0.14_127"
n <- names(ori_seu_obj_rds_file)
work_dir <- file.path(work_dir, n)
dir.create(work_dir)
setwd(work_dir)
ori_seu_obj <- readRDS(ori_seu_obj_rds_file)
doubletfinder_metadata_df <- vroom(doubletfinder_metadata_file) %>%
select(all_of(c("cell_barcode", doubletfinder_column)))
scdblfinder_metadata_df <- vroom(scdblfinder_metadata_file) %>%
select(cell_barcode, scDblFinder.class)
doublet_metadata_df <- full_join(doubletfinder_metadata_df, scdblfinder_metadata_df, by = "cell_barcode")
if (any(is.na(doublet_metadata_df$cell_barcode))) {
stop("some cell barcodes are NAs")
}
doublet_metadata_df <- tibble(cell_barcode = colnames(ori_seu_obj)) %>%
left_join(doublet_metadata_df, by = "cell_barcode") %>%
mutate(
across(all_of(c(doubletfinder_column, "scDblFinder.class")), ~ replace(.x, is.na(.x), "singlet")),
across(all_of(c(doubletfinder_column, "scDblFinder.class")), str_to_lower),
merged_doublet_class = if_else(.data[[doubletfinder_column]] == "doublet" & scDblFinder.class == "doublet", "HC_Doublet",
if_else(.data[[doubletfinder_column]] == "doublet", "DF_Doublet",
if_else(scDblFinder.class == "doublet", "SDF_Doublet", "Singlet")
)
)
) %>%
as.data.frame() %>%
set_rownames(.[["cell_barcode"]]) %>%
select(-cell_barcode)
table(doublet_metadata_df$merged_doublet_class)
table(doublet_metadata_df[[doubletfinder_column]], doublet_metadata_df$scDblFinder.class)
seu_obj <- CreateSeuratObject(
counts = GetAssayData(ori_seu_obj, assay = "RNA", layer = "counts"),
project = n, min.cells = 10, meta.data = doublet_metadata_df
)
seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^mt-")
p <- VlnPlot(seu_obj, group.by = "merged_doublet_class", features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 1, assay = "RNA", layer = "counts")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".qc_violin.raw.pdf"),
width = 500,
height = 900
)
p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
{
print(p1 + p2)
},
file = paste0(n, ".qc_scatter.raw.pdf"),
width = 900,
height = 400
)
seu_obj <- subset(seu_obj, subset = nCount_RNA > 500 & nFeature_RNA > 250 & percent.mt < 1.5 & merged_doublet_class == "Singlet")
p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".qc_violin.filtered.pdf"),
width = 800,
height = 400
)
p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
{
print(p1 + p2)
},
file = paste0(n, ".qc_scatter.filtered.pdf"),
width = 900,
height = 400
)
seu_obj <- SCTransform(seu_obj, vars.to.regress = "percent.mt", variable.features.n = 3000)
seu_obj <- RunPCA(seu_obj, npcs = 100, features = VariableFeatures(object = seu_obj))
VizDimLoadings(seu_obj, dims = 1:2, reduction = "pca")
DimPlot(seu_obj, reduction = "pca") + NoLegend()
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
pdf(
file = paste0(n, ".pc_dim_heatmap.", range[1], "-", range[2], ".pdf"),
width = 9, height = 12
)
DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], cells = 500, balanced = TRUE)
dev.off()
}
p <- ElbowPlot(seu_obj, ndims = 100) + scale_x_continuous(breaks = scales::breaks_extended(20))
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".elbow.pdf"),
width = 1000,
height = 400
)
pcs <- 1:33
seu_obj <- FindNeighbors(seu_obj, dims = pcs)
# Stable clusters:
# 从 parent clusters 的角度看,其不应该是 ≥ 2 个 parent clusters 的混合。
# 从 child clusters 的角度看,其 child clusters 应该只来源于其自身。
# 即随着 resolution 的增加,一个 stable cluster 应该要么保持不变,
# 要么干净的分裂为自己的 child clusters(child clusters 只来源自身)。
resolutions <- seq(0.5, 2, 0.2)
for (resolution in resolutions) {
seu_obj <- FindClusters(seu_obj, resolution = resolution)
}
p <- clustree(seu_obj[[]], prefix = "SCT_snn_res.", node_label_size = 10, show_axis = TRUE)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".resolutions.cluster_tree.pdf"),
width = 1500,
height = 700
)
seu_obj <- RunUMAP(seu_obj, dims = pcs)
for (resolution_column in paste0("SCT_snn_res.", resolutions)) {
p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = resolution_column, label = TRUE) + NoLegend()
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".umap.", resolution_column, ".pdf"),
width = 800,
height = 800
)
}
resolution <- 0.9
Idents(seu_obj) <- paste0("SCT_snn_res.", resolution)
saveRDS(seu_obj, file = "seu_obj_to_umap.rds")Annotate cell types based on canonical markers/DEGs:
library(Seurat)
library(vroom)
library(tidyverse)
library(YRUtils)
library(unigd)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX"
seu_obj_rds_file <- c(
"Astrocyte" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX/Astrocyte/seu_obj_to_umap.rds"
)
canonical_markers_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/data/mm_hippocampus_gene_markers.txt"
resolution_column <- "SCT_snn_res.1.3"
n <- names(seu_obj_rds_file)
work_dir <- file.path(work_dir, paste0(n, "_annotate_cell_types"))
dir.create(work_dir)
setwd(work_dir)
seu_obj <- readRDS(seu_obj_rds_file)
Idents(seu_obj) <- resolution_column
DefaultAssay(seu_obj) <- "RNA"
seu_obj <- NormalizeData(seu_obj)
seu_obj <- ScaleData(seu_obj, features = row.names(seu_obj))
seu_obj_markers <- FindAllMarkers(seu_obj, only.pos = TRUE, logfc.threshold = 0.5, min.pct = 0.25)
seu_obj_markers <- seu_obj_markers %>%
mutate(pct_diff = pct.1 - pct.2) %>%
arrange(cluster, p_val_adj, desc(avg_log2FC), desc(pct_diff))
vroom_write(seu_obj_markers, file = "seu_obj.pos_markers.tsv", col_names = TRUE, append = FALSE)
top_n <- 10
top_seu_obj_markers <- seu_obj_markers %>%
group_by(cluster) %>%
filter(avg_log2FC > 1 & p_val_adj < 0.05) %>%
slice_head(n = top_n) %>%
ungroup()
p <- DoHeatmap(seu_obj, features = top_seu_obj_markers$gene, label = TRUE, group.bar.height = 0.005, raster = FALSE)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".top", top_n, "_DEGs.heatmap.pdf"),
width = 1200,
height = 3200
)
canonical_markers_df <- vroom(canonical_markers_file) %>%
separate_longer_delim(cols = "marker", delim = "/") %>%
mutate(marker = str_to_title(trimws(marker))) %>%
distinct() %>%
filter(marker %in% row.names(seu_obj)) %>%
mutate(
marker = factor(marker, levels = marker),
rank = 1:n()
)
table(duplicated(canonical_markers_df$marker))
cell_type_anno_vec <- sapply(
split(canonical_markers_df, canonical_markers_df$cell_type),
function(df) sum(df$rank) / nrow(df)
)
cell_type_anno_df <- tibble(cell_type = names(cell_type_anno_vec), x = cell_type_anno_vec)
p1 <- DotPlot(
seu_obj,
features = canonical_markers_df$marker,
cluster.idents = TRUE,
dot.min = 0.1
) +
theme(
axis.text.x.bottom = element_text(angle = 90, hjust = 1, vjust = 0.5),
axis.title = element_blank(),
plot.margin = margin(0, 0, 0, 0)
)
Idents(seu_obj) <- factor(Idents(seu_obj), levels = levels(p1$data$id))
p2 <- ggplot() +
geom_tile(
mapping = aes(x = marker, y = 1, fill = cell_type),
data = canonical_markers_df, color = NA,
height = 2
) +
geom_text(
mapping = aes(x, y = -0.5, label = cell_type), data = cell_type_anno_df,
angle = 90, vjust = 0.5, hjust = 1
) +
scale_y_continuous(
expand = expansion(0),
limits = c(-50, 2)
) +
theme_void() +
theme(
legend.position = "none",
plot.margin = margin(0, 0, 0, 0)
)
p <- aplot::insert_bottom(p1, p2, height = 1)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".canonical_markers.dotplot.pdf"),
width = 1500,
height = 1000
)
p <- VlnPlot(
seu_obj,
features = canonical_markers_df$marker,
split.by = "ident",
stack = TRUE,
raster = TRUE
) + NoLegend()
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".canonical_markers.stacked_violin.pdf"),
width = 1400,
height = 600
)
p1 <- VlnPlot(seu_obj, features = "percent.mt", group.by = resolution_column) +
geom_hline(yintercept = 1.5) +
NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p2 <- VlnPlot(seu_obj, features = "nCount_RNA", group.by = resolution_column) +
NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p3 <- VlnPlot(seu_obj, features = "nFeature_RNA", group.by = resolution_column) +
NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p <- p1 / p2 / p3
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".nCount_nFeature_MT.qc_violin.cluster-wise.pdf"),
width = 1000,
height = 1000
)3.1.2 Glioma-treated sample
Remove ambient RNAs using decontX:
library(Seurat)
library(decontX)
library(SingleCellExperiment)
filtered_mat_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176043-250818-Astro_20250927_221710/JZ25176043-250818-Astro_outs/filtered_cell_gene_matrix"
raw_mat_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176043-250818-Astro_20250927_221710/JZ25176043-250818-Astro_outs/raw_cell_gene_matrix"
output_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX/data"
n <- "Astrocyte"
filtered_counts <- Read10X(filtered_mat_dir)
raw_counts <- Read10X(raw_mat_dir)
filtered_sce <- SingleCellExperiment(list(counts = filtered_counts))
raw_sce <- SingleCellExperiment(list(counts = raw_counts))
filtered_sce <- decontX(filtered_sce, background = raw_sce)
filtered_seu_obj <- CreateSeuratObject(round(decontXcounts(filtered_sce)))
saveRDS(filtered_sce, file = file.path(output_dir, paste0(n, ".decontX_filtered.sce_obj.rds")))
saveRDS(filtered_seu_obj, file = file.path(output_dir, paste0(n, ".decontX_filtered.seu_obj.rds")))3.2 Create 10X H5 file
library(scCustomize)
raw_data_path <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176044-250818-Glioma_20250928_004852/JZ25176044-250818-Glioma_outs/raw_cell_gene_matrix"
Create_10X_H5(raw_data_path,
source_type = "10X",
save_file_path = dirname(raw_data_path),
save_name = "raw_cell_gene_matrix"
)3.3 Remove ambient RNAs using CellBender
#!/usr/bin/bash -e
MAMBA_EXE=/home/yangrui/softwares/micromamba/bin/micromamba
raw_feature_bc_matrix_h5=/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176043-250818-Astro_20250927_221710/JZ25176043-250818-Astro_outs/raw_cell_gene_matrix_35380e54375b12.h5
work_dir=/data/tmp/yangrui_tmp
cd "${work_dir}"
output_h5=raw_feature_bc_matrix_cellbender.h5
"${MAMBA_EXE}" run -n cellbender_v1.3.0_py3.11 cellbender remove-background \
--cuda --input "${raw_feature_bc_matrix_h5}" \
--output "${output_h5}"3.4 Visualize feature expressions between conditions
library(Seurat)
seu_obj_rds_file <- c(
"Astrocyte_Glioma" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Astrocyte_Glioma.annotate_cell_types/seu_obj_to_annotation.rds"
)
seu_obj <- readRDS(seu_obj_rds_file)
features <- c("Lgi1")
FeaturePlot(
seu_obj,
features = features,
reduction = "umap",
split.by = "orig.ident"
)3.5 Filter cells using adaptive thresholds
3SD or 3MAD:
library(Seurat)
library(patchwork)
library(YRUtils)
library(unigd)
library(tidyverse)
library(clustree)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
sample_dir <- c(
"Astrocyte" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176043-250818-Astro_20250927_221710/JZ25176043-250818-Astro_outs/filtered_cell_gene_matrix"
)
n <- names(sample_dir)
work_dir <- file.path(work_dir, n)
dir.create(work_dir)
setwd(work_dir)
counts <- Read10X(sample_dir)
seu_obj <- CreateSeuratObject(
counts = counts, project = n,
min.cells = 10
)
seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^mt-")
p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".qc_violin.raw.pdf"),
width = 800,
height = 400
)
p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
{
print(p1 + p2)
},
file = paste0(n, ".qc_scatter.raw.pdf"),
width = 900,
height = 400
)
seu_obj <- subset(seu_obj, subset = nCount_RNA > 500 & nFeature_RNA > 250 & percent.mt < 10)
# more conservative
# nCount_RNA_th <- median(seu_obj$nCount_RNA) + 3 * mad(seu_obj$nCount_RNA)
# more relax
nCount_RNA_th <- mean(seu_obj$nCount_RNA) + 3 * sd(seu_obj$nCount_RNA)
nFeature_RNA_th <- median(seu_obj$nFeature_RNA) + 3 * mad(seu_obj$nFeature_RNA)
seu_obj <- subset(seu_obj, subset = nCount_RNA < nCount_RNA_th & nFeature_RNA < nFeature_RNA_th)
p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".qc_violin.filtered.pdf"),
width = 800,
height = 400
)
p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
{
print(p1 + p2)
},
file = paste0(n, ".qc_scatter.filtered.pdf"),
width = 900,
height = 400
)
seu_obj <- SCTransform(seu_obj, vars.to.regress = "percent.mt", variable.features.n = 3000)
seu_obj <- RunPCA(seu_obj, npcs = 100, features = VariableFeatures(object = seu_obj))
VizDimLoadings(seu_obj, dims = 1:2, reduction = "pca")
DimPlot(seu_obj, reduction = "pca") + NoLegend()
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
pdf(
file = paste0(n, ".pc_dim_heatmap.", range[1], "-", range[2], ".pdf"),
width = 9, height = 12
)
DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], cells = 500, balanced = TRUE)
dev.off()
}
p <- ElbowPlot(seu_obj, ndims = 100) + scale_x_continuous(breaks = scales::breaks_extended(20))
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".elbow.pdf"),
width = 1000,
height = 400
)
pcs <- 1:50
seu_obj <- FindNeighbors(seu_obj, dims = pcs)
# Stable clusters:
# 从 parent clusters 的角度看,其不应该是 ≥ 2 个 parent clusters 的混合。
# 从 child clusters 的角度看,其 child clusters 应该只来源于其自身。
# 即随着 resolution 的增加,一个 stable cluster 应该要么保持不变,
# 要么干净的分裂为自己的 child clusters(child clusters 只来源自身)。
resolutions <- seq(0.5, 2, 0.2)
for (resolution in resolutions) {
seu_obj <- FindClusters(seu_obj, resolution = resolution)
}
p <- clustree(seu_obj[[]], prefix = "SCT_snn_res.", node_label_size = 10, show_axis = TRUE)
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".resolutions.cluster_tree.pdf"),
width = 1500,
height = 700
)
seu_obj <- RunUMAP(seu_obj, dims = pcs)
for (resolution_column in paste0("SCT_snn_res.", resolutions)) {
p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = resolution_column, label = TRUE) + NoLegend()
ugd_save_inline(
{
print(p)
},
file = paste0(n, ".umap.", resolution_column, ".pdf"),
width = 800,
height = 800
)
}
saveRDS(seu_obj, file = "seu_obj_to_umap.rds")