Single-cell analysis workflow for adult mouse hippocampus datasets dealed with either human astrocyte or human glioma cells

1 Some concepts

1.1 Outlier detection methods for 1D metrics

1.1.1 \(\sigma\)-based

数据需服从正态分布。\(\sigma\) 对异常值很敏感，因此适合于极端严格的场景。

基于 \(3\sigma\) 原则，即 \(\mu \pm \sigma\) 应覆盖 \(68\%\) 的数据，\(\mu \pm 2\sigma\) 应覆盖 \(95\%\) 的数据，而 \(\mu \pm 3\sigma\) 应覆盖 \(99.7\%\) 的数据。

1.1.2 MAD-based

中位数绝对偏差（Median Absolute Deviation, MAD）是一个无分布假设，对异常值不敏感（稳健）的异常值检测方法。

先计算 \(MAD = median(|x_i - median(x)|)\)，再计算 3 倍 MAD，即 \(median(x) \pm 3 \times MAD\)。

也有一个与 3 倍 \(\sigma\) 对齐的改进方案：

其假设当 \(n \rightarrow \infty\) 时有 \(median \rightarrow \mu\) 且 \(E(MAD) = \sigma / 1.4826\)，其中 \(1.4826 \approx b = \frac{1}{\Phi^{-1}(0.75)}\) 为一个渐进因子，而 \(\Phi^{-1}\) 为标准正态分布的累积分布函数的反函数。

因此引入 \(\sigma_{MAD} = 1.4826 \times MAD\)。

对于小样本（e.g., \(n \lt 100\)），\(\sigma_{MAD}\) 会低估 \(\sigma\)，因此有一个简单地修正公式 \(\sigma_{MAD,corrected} = 1.4826 \times \frac{n}{n-1} \times MAD\)，其满足当 \(n \rightarrow \infty\) 时有 \(\sigma_{MAD,corrected} \rightarrow \sigma_{MAD}\)。

注：

时间序列数据应使用观测值和模型预测值之间的差值，即残差来计算 MAD。另外，对于残差，通常也不应用全局 MAD，应用滑动窗口 MAD，因为数据的波动性可能会随时间变化。
多维数据有其对应的处理指标。

注：不论是 \(\sigma\)-based，还是 MAD-based，推荐先过滤完 empty/dying cells 再计算用于过滤的上限，例如使用 nCount_RNA > 500 & nFeature_RNA > 250。应用他们的前提是大部分细胞都是高质量的。

2 Main analysis workflow

2.1 From QC to cell type annotations

2.1.1 Astrocyte-treated sample

Predict doublets using DoubletFinder:

library(Seurat)
library(patchwork)
library(YRUtils)
library(unigd)
library(tidyverse)
library(clustree)
library(DoubletFinder)
library(vroom)

set.seed(123456)

work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
sample_dir <- c(
    "Astrocyte" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176043-250818-Astro_20250927_221710/JZ25176043-250818-Astro_outs/filtered_cell_gene_matrix"
)

n <- names(sample_dir)

work_dir <- file.path(work_dir, paste0(n, "_DoubletFinder"))
dir.create(work_dir)
setwd(work_dir)

counts <- Read10X(sample_dir)
seu_obj <- CreateSeuratObject(
    counts = counts, project = n,
    min.cells = 10
)

seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^mt-")

p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".qc_violin.raw.pdf"),
    width = 800,
    height = 400
)

p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
    {
        print(p1 + p2)
    },
    file = paste0(n, ".qc_scatter.raw.pdf"),
    width = 900,
    height = 400
)

seu_obj <- subset(seu_obj, subset = nCount_RNA > 500 & nFeature_RNA > 250 & percent.mt < 10)

p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".qc_violin.lo.filtered.pdf"),
    width = 800,
    height = 400
)

p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
    {
        print(p1 + p2)
    },
    file = paste0(n, ".qc_scatter.lo.filtered.pdf"),
    width = 900,
    height = 400
)

seu_obj <- SCTransform(seu_obj, vars.to.regress = "percent.mt", variable.features.n = 3000)
seu_obj <- RunPCA(seu_obj, npcs = 100, features = VariableFeatures(object = seu_obj))

VizDimLoadings(seu_obj, dims = 1:2, reduction = "pca")
DimPlot(seu_obj, reduction = "pca") + NoLegend()
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
    pdf(
        file = paste0(n, ".lo.filtered.pc_dim_heatmap.", range[1], "-", range[2], ".pdf"),
        width = 9, height = 12
    )
    DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], cells = 500, balanced = TRUE)
    dev.off()
}

p <- ElbowPlot(seu_obj, ndims = 100) + scale_x_continuous(breaks = scales::breaks_extended(20))
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".lo.filtered.elbow.pdf"),
    width = 1000,
    height = 400
)

pcs <- 1:50

seu_obj <- FindNeighbors(seu_obj, dims = pcs)

# Stable clusters:
# 从 parent clusters 的角度看，其不应该是 ≥ 2 个 parent clusters 的混合。
# 从 child clusters 的角度看，其 child clusters 应该只来源于其自身。
# 即随着 resolution 的增加，一个 stable cluster 应该要么保持不变，
# 要么干净的分裂为自己的 child clusters（child clusters 只来源自身）。
resolutions <- seq(0.5, 2, 0.2)
for (resolution in resolutions) {
    seu_obj <- FindClusters(seu_obj, resolution = resolution)
}
p <- clustree(seu_obj[[]], prefix = "SCT_snn_res.", node_label_size = 10, show_axis = TRUE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".lo.filtered.resolutions.cluster_tree.pdf"),
    width = 1500,
    height = 700
)

seu_obj <- RunUMAP(seu_obj, dims = pcs)
for (resolution_column in paste0("SCT_snn_res.", resolutions)) {
    p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = resolution_column, label = TRUE) + NoLegend()
    ugd_save_inline(
        {
            print(p)
        },
        file = paste0(n, ".lo.filtered.umap.", resolution_column, ".pdf"),
        width = 800,
        height = 800
    )
}

resolution <- 0.7

## Use DoubletFinder to identify heterotypic doublets (not homotypic doublets)
# the way DoubletFinder identifies heterotypic doublets can be broken up into 4 steps:
# 1. generate artifical doublets from existing scRNA-seq data;
# 2. pre-process merged real-artificial data;
# 3. perform PCA and use the PC distance matrix to find each cell's proportion of artifical k nearest neighbors (pANN);
# 4. rank order and threshold pANN values based on the expected number of doublets.

# DoubletFinder needs the following arguments:
# 1. seu: a fully-processed Seurat object (i.e., after NormalizeData, FindVariableFeatures, ScaleData, RunPCA, FindNeighbors, FindClusters, and RunUMAP);
# 2. PCs: the number of PCs;
# 3. pN: the number of generated artifical doublets, expressed as a proportion of the merged real-artificial data;
# 4. pK: the PC neighborhood size used to compute pANN, expressed as a proportion of the merged real-artificial data;
# 5. nExp: this defines the pANN threshold used to make final doublet/singlet predictions.

# 关于 DoubletFinder 的输入：
# 不应该用于来源于不同库的混合样本，因为来源于不同库的细胞之间就没有形成 doublets 的可能性（物理隔绝）。
# 分别应用于来源于同一库的样本的子（分割）样本是可以的。
# 确保过滤掉低质量的细胞，如低 nCount_RNA/nFeature_RNA，高 percent.mt。

# pK Identification
# DoubletFinder performance is largely invariant of pN value selection
sweep_res_list <- paramSweep(seu_obj, PCs = pcs, sct = TRUE, num.cores = 80)
sweep_stats <- summarizeSweep(sweep_res_list, GT = FALSE)
bcmvn <- find.pK(sweep_stats)
# maximize AUC for data with ground-truth doublet classifications
# maximize mean-variance normalized bimodality coefficient (BCmvn) for data without ground-truth info
optimal_pK <- as.numeric(as.character(bcmvn$pK[which.max(bcmvn$BCmetric)]))

p <- ggplot(bcmvn, aes(pK, BCmetric)) +
    geom_line(group = 1) +
    geom_point() +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".DoubletFinder.bcmvn.line.pdf"),
    width = 700,
    height = 400
)
vroom_write(bcmvn, file = paste0(n, ".DoubletFinder.bcmvn.tsv"), col_names = TRUE, append = FALSE)

# homotypic doublet proportion estimate
# DoubletFinder is insensitive to homotypic doublets
# it will be subtracted from the expected doublet rate
# linear interpolation
# please contact your single-cell service provider to obtain
# the relationship between the number of recovered cells and the estimated doublet rate
estimated_doublet_rate <- (0.03 - 0.025) / (6000 - 5000) * (nrow(seu_obj[[]]) - 5000) + 0.025

# cell type (state) diversity is used to estimate the homotypic doublet proportion
# under the hypothesis that any two cells may form a doublet with equal chance
homotypic_prop <- modelHomotypic(seu_obj[[]][[paste0("SCT_snn_res.", resolution)]])
nExp_poi <- round(estimated_doublet_rate * nrow(seu_obj[[]]))
nExp_poi_adj <- round(nExp_poi * (1 - homotypic_prop))

# run DoubletFinder with varying classification stringencies
seu_obj <- doubletFinder(seu_obj, PCs = pcs, pN = 0.25, pK = optimal_pK, nExp = nExp_poi, reuse.pANN = NULL, sct = TRUE)
seu_obj <- doubletFinder(seu_obj, PCs = pcs, pN = 0.25, pK = optimal_pK, nExp = nExp_poi_adj, reuse.pANN = paste0("pANN_", 0.25, "_", optimal_pK, "_", nExp_poi), sct = TRUE)

p1 <- DimPlot(seu_obj,
    label.size = 6, repel = TRUE, reduction = "umap", label = TRUE,
    group.by = paste0("SCT_snn_res.", resolution),
    cells.highlight = list("Doublet" = row.names(seu_obj[[]])[seu_obj[[]][[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)]] == "Doublet"]),
    cols.highlight = "#EE4000", sizes.highlight = 2, alpha = 0.6
) + ggtitle(paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi))
p2 <- DimPlot(seu_obj,
    label.size = 6, repel = TRUE, reduction = "umap", label = TRUE,
    group.by = paste0("SCT_snn_res.", resolution),
    cells.highlight = list("Doublet_adj" = row.names(seu_obj[[]])[seu_obj[[]][[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)]] == "Doublet"]),
    cols.highlight = "#EE4000", sizes.highlight = 2, alpha = 0.6
) + ggtitle(paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj))
ugd_save_inline(
    {
        print(p1 | p2)
    },
    file = paste0(n, ".doublet_labeled.umap.SCT_snn_res.", resolution, ".pdf"),
    width = 1400,
    height = 700
)

nExp_poi_dbl_stats_df <- inner_join(
    seu_obj[[]] %>%
        select(all_of(paste0("SCT_snn_res.", resolution))) %>%
        group_by(.data[[paste0("SCT_snn_res.", resolution)]]) %>%
        reframe(cluster_n = n()),
    seu_obj[[]] %>%
        select(all_of(c(paste0("SCT_snn_res.", resolution), paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)))) %>%
        group_by(.data[[paste0("SCT_snn_res.", resolution)]], .data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)]]) %>%
        reframe(n = n()),
    by = paste0("SCT_snn_res.", resolution)
) %>%
    mutate(percent = n / cluster_n * 100) %>%
    arrange(.data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)]], desc(percent))
vroom_write(nExp_poi_dbl_stats_df, file = paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi, ".nExp.doublet_proportion.tsv"), col_names = TRUE, append = FALSE)

nExp_poi_adj_dbl_stats_df <- inner_join(
    seu_obj[[]] %>%
        select(all_of(paste0("SCT_snn_res.", resolution))) %>%
        group_by(.data[[paste0("SCT_snn_res.", resolution)]]) %>%
        reframe(cluster_n = n()),
    seu_obj[[]] %>%
        select(all_of(c(paste0("SCT_snn_res.", resolution), paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)))) %>%
        group_by(.data[[paste0("SCT_snn_res.", resolution)]], .data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)]]) %>%
        reframe(n = n()),
    by = paste0("SCT_snn_res.", resolution)
) %>%
    mutate(percent = n / cluster_n * 100) %>%
    arrange(.data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)]], desc(percent))
vroom_write(nExp_poi_adj_dbl_stats_df, file = paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj, ".nExp_adj.doublet_proportion.tsv"), col_names = TRUE, append = FALSE)

vroom_write(seu_obj[[]] %>% mutate(cell_barcode = row.names(.)), file = "DoubletFinder.seurat_metadata.tsv", col_names = TRUE, append = FALSE)
saveRDS(seu_obj, file = "DoubletFinder.seu_obj_to_umap.rds")

Predict doublets using scDblFinder:

library(Seurat)
library(patchwork)
library(YRUtils)
library(unigd)
library(tidyverse)
library(clustree)
library(scDblFinder)
library(vroom)

set.seed(123456)

work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
sample_dir <- c(
    "Astrocyte" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176043-250818-Astro_20250927_221710/JZ25176043-250818-Astro_outs/filtered_cell_gene_matrix"
)

n <- names(sample_dir)

work_dir <- file.path(work_dir, paste0(n, "_scDblFinder"))
dir.create(work_dir)
setwd(work_dir)

counts <- Read10X(sample_dir)
seu_obj <- CreateSeuratObject(
    counts = counts, project = n,
    min.cells = 10
)

seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^mt-")

p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".qc_violin.raw.pdf"),
    width = 800,
    height = 400
)

p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
    {
        print(p1 + p2)
    },
    file = paste0(n, ".qc_scatter.raw.pdf"),
    width = 900,
    height = 400
)

seu_obj <- subset(seu_obj, subset = nCount_RNA > 500 & nFeature_RNA > 250 & percent.mt < 10)

p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".qc_violin.lo.filtered.pdf"),
    width = 800,
    height = 400
)

p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
    {
        print(p1 + p2)
    },
    file = paste0(n, ".qc_scatter.lo.filtered.pdf"),
    width = 900,
    height = 400
)

seu_obj <- SCTransform(seu_obj, vars.to.regress = "percent.mt", variable.features.n = 3000)
seu_obj <- RunPCA(seu_obj, npcs = 100, features = VariableFeatures(object = seu_obj))

VizDimLoadings(seu_obj, dims = 1:2, reduction = "pca")
DimPlot(seu_obj, reduction = "pca") + NoLegend()
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
    pdf(
        file = paste0(n, ".lo.filtered.pc_dim_heatmap.", range[1], "-", range[2], ".pdf"),
        width = 9, height = 12
    )
    DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], cells = 500, balanced = TRUE)
    dev.off()
}

p <- ElbowPlot(seu_obj, ndims = 100) + scale_x_continuous(breaks = scales::breaks_extended(20))
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".lo.filtered.elbow.pdf"),
    width = 1000,
    height = 400
)

pcs <- 1:50

seu_obj <- FindNeighbors(seu_obj, dims = pcs)

# Stable clusters:
# 从 parent clusters 的角度看，其不应该是 ≥ 2 个 parent clusters 的混合。
# 从 child clusters 的角度看，其 child clusters 应该只来源于其自身。
# 即随着 resolution 的增加，一个 stable cluster 应该要么保持不变，
# 要么干净的分裂为自己的 child clusters（child clusters 只来源自身）。
resolutions <- seq(0.5, 2, 0.2)
for (resolution in resolutions) {
    seu_obj <- FindClusters(seu_obj, resolution = resolution)
}
p <- clustree(seu_obj[[]], prefix = "SCT_snn_res.", node_label_size = 10, show_axis = TRUE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".lo.filtered.resolutions.cluster_tree.pdf"),
    width = 1500,
    height = 700
)

seu_obj <- RunUMAP(seu_obj, dims = pcs)
for (resolution_column in paste0("SCT_snn_res.", resolutions)) {
    p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = resolution_column, label = TRUE) + NoLegend()
    ugd_save_inline(
        {
            print(p)
        },
        file = paste0(n, ".lo.filtered.umap.", resolution_column, ".pdf"),
        width = 800,
        height = 800
    )
}

resolution <- 0.7

## Use scDblFinder to identify heterotypic doublets (not homotypic doublets)
# linear interpolation
# please contact your single-cell service provider to obtain
# the relationship between the number of recovered cells and the estimated doublet rate
estimated_doublet_rate <- (0.03 - 0.025) / (6000 - 5000) * (nrow(seu_obj[[]]) - 5000) + 0.025

Idents(seu_obj) <- paste0("SCT_snn_res.", resolution)
sce <- scDblFinder(
    GetAssayData(seu_obj, assay = "RNA", slot = "counts"),
    clusters = Idents(seu_obj),
    dbr = estimated_doublet_rate
)
seu_obj <- AddMetaData(seu_obj, metadata = as.data.frame(sce@colData))

p <- DimPlot(seu_obj,
    label.size = 6, repel = TRUE, reduction = "umap", label = TRUE,
    group.by = paste0("SCT_snn_res.", resolution),
    cells.highlight = list("Doublet" = row.names(seu_obj[[]])[seu_obj[[]][["scDblFinder.class"]] == "doublet"]),
    cols.highlight = "#EE4000", sizes.highlight = 2, alpha = 0.6
) + ggtitle("scDblFinder.class")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".doublet_labeled.umap.SCT_snn_res.", resolution, ".pdf"),
    width = 700,
    height = 700
)

dbl_stats_df <- inner_join(
    seu_obj[[]] %>%
        select(all_of(paste0("SCT_snn_res.", resolution))) %>%
        group_by(.data[[paste0("SCT_snn_res.", resolution)]]) %>%
        reframe(cluster_n = n()),
    seu_obj[[]] %>%
        select(all_of(c(paste0("SCT_snn_res.", resolution), "scDblFinder.class"))) %>%
        group_by(.data[[paste0("SCT_snn_res.", resolution)]], .data[["scDblFinder.class"]]) %>%
        reframe(n = n()),
    by = paste0("SCT_snn_res.", resolution)
) %>%
    mutate(percent = n / cluster_n * 100) %>%
    arrange(.data[["scDblFinder.class"]], desc(percent))
vroom_write(dbl_stats_df, file = "scDblFinder.doublet_proportion.tsv", col_names = TRUE, append = FALSE)

vroom_write(seu_obj[[]] %>% mutate(cell_barcode = row.names(.)), file = "scDblFinder.seurat_metadata.tsv", col_names = TRUE, append = FALSE)
saveRDS(seu_obj, file = "scDblFinder.seu_obj_to_umap.rds")

Standard Seurat processing workflow: remove low-quality cells/doublets.

library(Seurat)
library(patchwork)
library(YRUtils)
library(unigd)
library(tidyverse)
library(clustree)
library(vroom)
library(magrittr)

work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
sample_dir <- c(
    "Astrocyte" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176043-250818-Astro_20250927_221710/JZ25176043-250818-Astro_outs/filtered_cell_gene_matrix"
)
doubletfinder_metadata_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Astrocyte_DoubletFinder/DoubletFinder.seurat_metadata.tsv"
scdblfinder_metadata_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Astrocyte_scDblFinder/scDblFinder.seurat_metadata.tsv"
doubletfinder_column <- "DF.classifications_0.25_0.14_163"

n <- names(sample_dir)

work_dir <- file.path(work_dir, n)
dir.create(work_dir)
setwd(work_dir)

counts <- Read10X(sample_dir)

doubletfinder_metadata_df <- vroom(doubletfinder_metadata_file) %>%
    select(all_of(c("cell_barcode", doubletfinder_column)))
scdblfinder_metadata_df <- vroom(scdblfinder_metadata_file) %>%
    select(cell_barcode, scDblFinder.class)
doublet_metadata_df <- full_join(doubletfinder_metadata_df, scdblfinder_metadata_df, by = "cell_barcode")

if (any(is.na(doublet_metadata_df$cell_barcode))) {
    stop("some cell barcodes are NAs")
}

doublet_metadata_df <- tibble(cell_barcode = colnames(counts)) %>%
    left_join(doublet_metadata_df, by = "cell_barcode") %>%
    mutate(
        across(all_of(c(doubletfinder_column, "scDblFinder.class")), ~ replace(.x, is.na(.x), "singlet")),
        across(all_of(c(doubletfinder_column, "scDblFinder.class")), str_to_lower),
        merged_doublet_class = if_else(.data[[doubletfinder_column]] == "doublet" & scDblFinder.class == "doublet", "HC_Doublet",
            if_else(.data[[doubletfinder_column]] == "doublet", "DF_Doublet",
                if_else(scDblFinder.class == "doublet", "SDF_Doublet", "Singlet")
            )
        )
    ) %>%
    as.data.frame() %>%
    set_rownames(.[["cell_barcode"]]) %>%
    select(-cell_barcode)

table(doublet_metadata_df$merged_doublet_class)
table(doublet_metadata_df[[doubletfinder_column]], doublet_metadata_df$scDblFinder.class)

seu_obj <- CreateSeuratObject(
    counts = counts, project = n,
    min.cells = 10, meta.data = doublet_metadata_df
)

seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^mt-")

p <- VlnPlot(seu_obj, group.by = "merged_doublet_class", features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 1, assay = "RNA", layer = "counts")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".qc_violin.raw.pdf"),
    width = 500,
    height = 900
)

p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
    {
        print(p1 + p2)
    },
    file = paste0(n, ".qc_scatter.raw.pdf"),
    width = 900,
    height = 400
)

seu_obj <- subset(seu_obj, subset = nCount_RNA > 500 & nFeature_RNA > 250 & percent.mt < 2.5 & merged_doublet_class == "Singlet")

p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".qc_violin.filtered.pdf"),
    width = 800,
    height = 400
)

p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
    {
        print(p1 + p2)
    },
    file = paste0(n, ".qc_scatter.filtered.pdf"),
    width = 900,
    height = 400
)

# quick assessment of cell cycle effects
# seu_obj <- NormalizeData(seu_obj, normalization.method = "LogNormalize", scale.factor = 10000)

# s.genes <- str_to_title(cc.genes$s.genes)
# g2m.genes <- str_to_title(cc.genes$g2m.genes)

# length(intersect(row.names(seu_obj), s.genes)) / length(s.genes)
# length(intersect(row.names(seu_obj), g2m.genes)) / length(g2m.genes)

# 针对单个细胞计算基因集富集分数
# 对于每一个细胞，比较目标基因集的平均表达量与一组随机挑选的、但表达水平相似的对照基因集的平均表达量
# 基于每个基因在所有细胞中的平均表达量对其进行分箱（binning）
# 为目标基因集中的每一个基因匹配 N (e.g., 100) 个对照基因，由此得到对照基因集
# 针对每一个细胞，分别计算目标基因集和对照基因集的平均表达量
# 计算最终得分：score = target_mean - control_mean
# seu_obj <- CellCycleScoring(seu_obj, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)
# seu_obj$CC.Difference <- seu_obj$S.Score - seu_obj$G2M.Score

# seu_obj <- ScaleData(seu_obj, assay = "RNA", features = c(s.genes, g2m.genes))
# seu_obj <- RunPCA(seu_obj, assay = "RNA", features = c(s.genes, g2m.genes), reduction.name = "cell_cycle_genes.pca")
# DimPlot(seu_obj, reduction = "cell_cycle_genes.pca")

seu_obj <- SCTransform(seu_obj, vars.to.regress = "percent.mt", variable.features.n = 3000)
seu_obj <- RunPCA(seu_obj, npcs = 100, features = VariableFeatures(object = seu_obj))

VizDimLoadings(seu_obj, dims = 1:2, reduction = "pca")
DimPlot(seu_obj, reduction = "pca") + NoLegend()
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
    pdf(
        file = paste0(n, ".pc_dim_heatmap.", range[1], "-", range[2], ".pdf"),
        width = 9, height = 12
    )
    DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], cells = 500, balanced = TRUE)
    dev.off()
}

p <- ElbowPlot(seu_obj, ndims = 100) + scale_x_continuous(breaks = scales::breaks_extended(20))
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".elbow.pdf"),
    width = 1000,
    height = 400
)

pcs <- 1:30

seu_obj <- FindNeighbors(seu_obj, dims = pcs)

# Stable clusters:
# 从 parent clusters 的角度看，其不应该是 ≥ 2 个 parent clusters 的混合。
# 从 child clusters 的角度看，其 child clusters 应该只来源于其自身。
# 即随着 resolution 的增加，一个 stable cluster 应该要么保持不变，
# 要么干净的分裂为自己的 child clusters（child clusters 只来源自身）。
resolutions <- seq(0.5, 2, 0.2)
for (resolution in resolutions) {
    seu_obj <- FindClusters(seu_obj, resolution = resolution)
}
p <- clustree(seu_obj[[]], prefix = "SCT_snn_res.", node_label_size = 10, show_axis = TRUE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".resolutions.cluster_tree.pdf"),
    width = 1500,
    height = 700
)

seu_obj <- RunUMAP(seu_obj, dims = pcs)
for (resolution_column in paste0("SCT_snn_res.", resolutions)) {
    p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = resolution_column, label = TRUE) + NoLegend()
    ugd_save_inline(
        {
            print(p)
        },
        file = paste0(n, ".umap.", resolution_column, ".pdf"),
        width = 800,
        height = 800
    )
}

resolution <- 0.5

Idents(seu_obj) <- paste0("SCT_snn_res.", resolution)

saveRDS(seu_obj, file = "seu_obj_to_umap.rds")

Annotate cell types based on canonical markers/DEGs:

library(Seurat)
library(vroom)
library(tidyverse)
library(YRUtils)
library(unigd)

work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
seu_obj_rds_file <- c(
    "Astrocyte" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Astrocyte/seu_obj_to_umap.rds"
)
canonical_markers_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/data/mm_hippocampus_gene_markers.txt"
resolution_column <- "SCT_snn_res.0.5"

n <- names(seu_obj_rds_file)

work_dir <- file.path(work_dir, paste0(n, "_annotate_cell_types"))
dir.create(work_dir)
setwd(work_dir)

seu_obj <- readRDS(seu_obj_rds_file)
Idents(seu_obj) <- resolution_column

DefaultAssay(seu_obj) <- "RNA"
seu_obj <- NormalizeData(seu_obj)
seu_obj <- ScaleData(seu_obj, features = row.names(seu_obj))

seu_obj_markers <- FindAllMarkers(seu_obj, only.pos = TRUE, logfc.threshold = 0.5, min.pct = 0.25)
seu_obj_markers <- seu_obj_markers %>%
    mutate(pct_diff = pct.1 - pct.2) %>%
    arrange(cluster, p_val_adj, desc(avg_log2FC), desc(pct_diff))
vroom_write(seu_obj_markers, file = "seu_obj.pos_markers.tsv", col_names = TRUE, append = FALSE)

top_n <- 10

top_seu_obj_markers <- seu_obj_markers %>%
    group_by(cluster) %>%
    filter(avg_log2FC > 1 & p_val_adj < 0.05) %>%
    slice_head(n = top_n) %>%
    ungroup()
p <- DoHeatmap(seu_obj, features = top_seu_obj_markers$gene, label = TRUE, group.bar.height = 0.005, raster = FALSE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".top", top_n, "_DEGs.heatmap.pdf"),
    width = 1200,
    height = 3200
)

canonical_markers_df <- vroom(canonical_markers_file) %>%
    separate_longer_delim(cols = "marker", delim = "/") %>%
    mutate(marker = str_to_title(trimws(marker))) %>%
    distinct() %>%
    filter(marker %in% row.names(seu_obj)) %>%
    mutate(
        marker = factor(marker, levels = marker),
        rank = 1:n()
    )

table(duplicated(canonical_markers_df$marker))

cell_type_anno_vec <- sapply(
    split(canonical_markers_df, canonical_markers_df$cell_type),
    function(df) sum(df$rank) / nrow(df)
)
cell_type_anno_df <- tibble(cell_type = names(cell_type_anno_vec), x = cell_type_anno_vec)

p1 <- DotPlot(
    seu_obj,
    features = canonical_markers_df$marker,
    cluster.idents = TRUE,
    dot.min = 0.1
) +
    theme(
        axis.text.x.bottom = element_text(angle = 90, hjust = 1, vjust = 0.5),
        axis.title = element_blank(),
        plot.margin = margin(0, 0, 0, 0)
    )
Idents(seu_obj) <- factor(Idents(seu_obj), levels = levels(p1$data$id))

p2 <- ggplot() +
    geom_tile(
        mapping = aes(x = marker, y = 1, fill = cell_type),
        data = canonical_markers_df, color = NA,
        height = 2
    ) +
    geom_text(
        mapping = aes(x, y = -0.5, label = cell_type), data = cell_type_anno_df,
        angle = 90, vjust = 0.5, hjust = 1
    ) +
    scale_y_continuous(
        expand = expansion(0),
        limits = c(-50, 2)
    ) +
    theme_void() +
    theme(
        legend.position = "none",
        plot.margin = margin(0, 0, 0, 0)
    )
p <- aplot::insert_bottom(p1, p2, height = 1)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".canonical_markers.dotplot.pdf"),
    width = 1500,
    height = 1000
)

p <- VlnPlot(
    seu_obj,
    features = canonical_markers_df$marker,
    split.by = "ident",
    stack = TRUE,
    raster = TRUE
) + NoLegend()
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".canonical_markers.stacked_violin.pdf"),
    width = 1400,
    height = 600
)

p1 <- VlnPlot(seu_obj, features = "percent.mt", group.by = resolution_column) +
    geom_hline(yintercept = 2.5) +
    NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p2 <- VlnPlot(seu_obj, features = "nCount_RNA", group.by = resolution_column) +
    NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p3 <- VlnPlot(seu_obj, features = "nFeature_RNA", group.by = resolution_column) +
    NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p <- p1 / p2 / p3
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".nCount_nFeature_MT.qc_violin.cluster-wise.pdf"),
    width = 1000,
    height = 1000
)

cell_types <- c(
    "12" = "OPC", "1" = "Oligodendrocytes", "6" = "Microglia",
    "7" = "Astrocytes", "13" = "VLMCs", "9" = "Sst IN",
    "10" = "IN", "0" = "DG EN", "11" = "EN1", "5" = "EN2",
    "8" = "EN3", "2" = "EN4", "4" = "EN5", "3" = "PLN",
    "14" = "Cajal Retzius EN", "15" = "Unknown"
)

seu_obj <- RenameIdents(seu_obj, cell_types)
seu_obj$cell_types <- Idents(seu_obj)

p <- DimPlot(seu_obj, label.size = 4, repel = TRUE, reduction = "umap", group.by = "cell_types", label = TRUE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".umap.cell_types.pdf"),
    width = 600,
    height = 400
)

saveRDS(seu_obj, file = "seu_obj_to_annotation.rds")

2.1.2 Glioma-treated sample

Predict doublets using DoubletFinder:

library(Seurat)
library(patchwork)
library(YRUtils)
library(unigd)
library(tidyverse)
library(clustree)
library(DoubletFinder)
library(vroom)

set.seed(123456)

work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
sample_dir <- c(
    "Glioma" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176044-250818-Glioma_20250928_004852/JZ25176044-250818-Glioma_outs/filtered_cell_gene_matrix"
)

n <- names(sample_dir)

work_dir <- file.path(work_dir, paste0(n, "_DoubletFinder"))
dir.create(work_dir)
setwd(work_dir)

counts <- Read10X(sample_dir)
seu_obj <- CreateSeuratObject(
    counts = counts, project = n,
    min.cells = 10
)

seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^mt-")

p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".qc_violin.raw.pdf"),
    width = 800,
    height = 400
)

p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
    {
        print(p1 + p2)
    },
    file = paste0(n, ".qc_scatter.raw.pdf"),
    width = 900,
    height = 400
)

seu_obj <- subset(seu_obj, subset = nCount_RNA > 500 & nFeature_RNA > 250 & percent.mt < 10)

p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".qc_violin.lo.filtered.pdf"),
    width = 800,
    height = 400
)

p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
    {
        print(p1 + p2)
    },
    file = paste0(n, ".qc_scatter.lo.filtered.pdf"),
    width = 900,
    height = 400
)

seu_obj <- SCTransform(seu_obj, vars.to.regress = "percent.mt", variable.features.n = 3000)
seu_obj <- RunPCA(seu_obj, npcs = 100, features = VariableFeatures(object = seu_obj))

VizDimLoadings(seu_obj, dims = 1:2, reduction = "pca")
DimPlot(seu_obj, reduction = "pca") + NoLegend()
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
    pdf(
        file = paste0(n, ".lo.filtered.pc_dim_heatmap.", range[1], "-", range[2], ".pdf"),
        width = 9, height = 12
    )
    DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], cells = 500, balanced = TRUE)
    dev.off()
}

p <- ElbowPlot(seu_obj, ndims = 100) + scale_x_continuous(breaks = scales::breaks_extended(20))
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".lo.filtered.elbow.pdf"),
    width = 1000,
    height = 400
)

pcs <- 1:50

seu_obj <- FindNeighbors(seu_obj, dims = pcs)

# Stable clusters:
# 从 parent clusters 的角度看，其不应该是 ≥ 2 个 parent clusters 的混合。
# 从 child clusters 的角度看，其 child clusters 应该只来源于其自身。
# 即随着 resolution 的增加，一个 stable cluster 应该要么保持不变，
# 要么干净的分裂为自己的 child clusters（child clusters 只来源自身）。
resolutions <- seq(0.5, 2, 0.2)
for (resolution in resolutions) {
    seu_obj <- FindClusters(seu_obj, resolution = resolution)
}
p <- clustree(seu_obj[[]], prefix = "SCT_snn_res.", node_label_size = 10, show_axis = TRUE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".lo.filtered.resolutions.cluster_tree.pdf"),
    width = 1500,
    height = 700
)

seu_obj <- RunUMAP(seu_obj, dims = pcs)
for (resolution_column in paste0("SCT_snn_res.", resolutions)) {
    p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = resolution_column, label = TRUE) + NoLegend()
    ugd_save_inline(
        {
            print(p)
        },
        file = paste0(n, ".lo.filtered.umap.", resolution_column, ".pdf"),
        width = 800,
        height = 800
    )
}

resolution <- 0.7

## Use DoubletFinder to identify heterotypic doublets (not homotypic doublets)
# the way DoubletFinder identifies heterotypic doublets can be broken up into 4 steps:
# 1. generate artifical doublets from existing scRNA-seq data;
# 2. pre-process merged real-artificial data;
# 3. perform PCA and use the PC distance matrix to find each cell's proportion of artifical k nearest neighbors (pANN);
# 4. rank order and threshold pANN values based on the expected number of doublets.

# DoubletFinder needs the following arguments:
# 1. seu: a fully-processed Seurat object (i.e., after NormalizeData, FindVariableFeatures, ScaleData, RunPCA, FindNeighbors, FindClusters, and RunUMAP);
# 2. PCs: the number of PCs;
# 3. pN: the number of generated artifical doublets, expressed as a proportion of the merged real-artificial data;
# 4. pK: the PC neighborhood size used to compute pANN, expressed as a proportion of the merged real-artificial data;
# 5. nExp: this defines the pANN threshold used to make final doublet/singlet predictions.

# 关于 DoubletFinder 的输入：
# 不应该用于来源于不同库的混合样本，因为来源于不同库的细胞之间就没有形成 doublets 的可能性（物理隔绝）。
# 分别应用于来源于同一库的样本的子（分割）样本是可以的。
# 确保过滤掉低质量的细胞，如低 nCount_RNA/nFeature_RNA，高 percent.mt。

# pK Identification
# DoubletFinder performance is largely invariant of pN value selection
sweep_res_list <- paramSweep(seu_obj, PCs = pcs, sct = TRUE, num.cores = 80)
sweep_stats <- summarizeSweep(sweep_res_list, GT = FALSE)
bcmvn <- find.pK(sweep_stats)
# maximize AUC for data with ground-truth doublet classifications
# maximize mean-variance normalized bimodality coefficient (BCmvn) for data without ground-truth info
optimal_pK <- as.numeric(as.character(bcmvn$pK[which.max(bcmvn$BCmetric)]))

p <- ggplot(bcmvn, aes(pK, BCmetric)) +
    geom_line(group = 1) +
    geom_point() +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".DoubletFinder.bcmvn.line.pdf"),
    width = 700,
    height = 400
)
vroom_write(bcmvn, file = paste0(n, ".DoubletFinder.bcmvn.tsv"), col_names = TRUE, append = FALSE)

# homotypic doublet proportion estimate
# DoubletFinder is insensitive to homotypic doublets
# it will be subtracted from the expected doublet rate
# linear interpolation
# please contact your single-cell service provider to obtain
# the relationship between the number of recovered cells and the estimated doublet rate
estimated_doublet_rate <- (0.03 - 0.025) / (6000 - 5000) * (nrow(seu_obj[[]]) - 5000) + 0.025

# cell type (state) diversity is used to estimate the homotypic doublet proportion
# under the hypothesis that any two cells may form a doublet with equal chance
homotypic_prop <- modelHomotypic(seu_obj[[]][[paste0("SCT_snn_res.", resolution)]])
nExp_poi <- round(estimated_doublet_rate * nrow(seu_obj[[]]))
nExp_poi_adj <- round(nExp_poi * (1 - homotypic_prop))

# run DoubletFinder with varying classification stringencies
seu_obj <- doubletFinder(seu_obj, PCs = pcs, pN = 0.25, pK = optimal_pK, nExp = nExp_poi, reuse.pANN = NULL, sct = TRUE)
seu_obj <- doubletFinder(seu_obj, PCs = pcs, pN = 0.25, pK = optimal_pK, nExp = nExp_poi_adj, reuse.pANN = paste0("pANN_", 0.25, "_", optimal_pK, "_", nExp_poi), sct = TRUE)

p1 <- DimPlot(seu_obj,
    label.size = 6, repel = TRUE, reduction = "umap", label = TRUE,
    group.by = paste0("SCT_snn_res.", resolution),
    cells.highlight = list("Doublet" = row.names(seu_obj[[]])[seu_obj[[]][[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)]] == "Doublet"]),
    cols.highlight = "#EE4000", sizes.highlight = 2, alpha = 0.6
) + ggtitle(paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi))
p2 <- DimPlot(seu_obj,
    label.size = 6, repel = TRUE, reduction = "umap", label = TRUE,
    group.by = paste0("SCT_snn_res.", resolution),
    cells.highlight = list("Doublet_adj" = row.names(seu_obj[[]])[seu_obj[[]][[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)]] == "Doublet"]),
    cols.highlight = "#EE4000", sizes.highlight = 2, alpha = 0.6
) + ggtitle(paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj))
ugd_save_inline(
    {
        print(p1 | p2)
    },
    file = paste0(n, ".doublet_labeled.umap.SCT_snn_res.", resolution, ".pdf"),
    width = 1400,
    height = 700
)

nExp_poi_dbl_stats_df <- inner_join(
    seu_obj[[]] %>%
        select(all_of(paste0("SCT_snn_res.", resolution))) %>%
        group_by(.data[[paste0("SCT_snn_res.", resolution)]]) %>%
        reframe(cluster_n = n()),
    seu_obj[[]] %>%
        select(all_of(c(paste0("SCT_snn_res.", resolution), paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)))) %>%
        group_by(.data[[paste0("SCT_snn_res.", resolution)]], .data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)]]) %>%
        reframe(n = n()),
    by = paste0("SCT_snn_res.", resolution)
) %>%
    mutate(percent = n / cluster_n * 100) %>%
    arrange(.data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)]], desc(percent))
vroom_write(nExp_poi_dbl_stats_df, file = paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi, ".nExp.doublet_proportion.tsv"), col_names = TRUE, append = FALSE)

nExp_poi_adj_dbl_stats_df <- inner_join(
    seu_obj[[]] %>%
        select(all_of(paste0("SCT_snn_res.", resolution))) %>%
        group_by(.data[[paste0("SCT_snn_res.", resolution)]]) %>%
        reframe(cluster_n = n()),
    seu_obj[[]] %>%
        select(all_of(c(paste0("SCT_snn_res.", resolution), paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)))) %>%
        group_by(.data[[paste0("SCT_snn_res.", resolution)]], .data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)]]) %>%
        reframe(n = n()),
    by = paste0("SCT_snn_res.", resolution)
) %>%
    mutate(percent = n / cluster_n * 100) %>%
    arrange(.data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)]], desc(percent))
vroom_write(nExp_poi_adj_dbl_stats_df, file = paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj, ".nExp_adj.doublet_proportion.tsv"), col_names = TRUE, append = FALSE)

vroom_write(seu_obj[[]] %>% mutate(cell_barcode = row.names(.)), file = "DoubletFinder.seurat_metadata.tsv", col_names = TRUE, append = FALSE)
saveRDS(seu_obj, file = "DoubletFinder.seu_obj_to_umap.rds")

Predict doublets using scDblFinder:

library(Seurat)
library(patchwork)
library(YRUtils)
library(unigd)
library(tidyverse)
library(clustree)
library(scDblFinder)
library(vroom)

set.seed(123456)

work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
sample_dir <- c(
    "Glioma" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176044-250818-Glioma_20250928_004852/JZ25176044-250818-Glioma_outs/filtered_cell_gene_matrix"
)

n <- names(sample_dir)

work_dir <- file.path(work_dir, paste0(n, "_scDblFinder"))
dir.create(work_dir)
setwd(work_dir)

counts <- Read10X(sample_dir)
seu_obj <- CreateSeuratObject(
    counts = counts, project = n,
    min.cells = 10
)

seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^mt-")

p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".qc_violin.raw.pdf"),
    width = 800,
    height = 400
)

p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
    {
        print(p1 + p2)
    },
    file = paste0(n, ".qc_scatter.raw.pdf"),
    width = 900,
    height = 400
)

seu_obj <- subset(seu_obj, subset = nCount_RNA > 500 & nFeature_RNA > 250 & percent.mt < 10)

p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".qc_violin.lo.filtered.pdf"),
    width = 800,
    height = 400
)

p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
    {
        print(p1 + p2)
    },
    file = paste0(n, ".qc_scatter.lo.filtered.pdf"),
    width = 900,
    height = 400
)

seu_obj <- SCTransform(seu_obj, vars.to.regress = "percent.mt", variable.features.n = 3000)
seu_obj <- RunPCA(seu_obj, npcs = 100, features = VariableFeatures(object = seu_obj))

VizDimLoadings(seu_obj, dims = 1:2, reduction = "pca")
DimPlot(seu_obj, reduction = "pca") + NoLegend()
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
    pdf(
        file = paste0(n, ".lo.filtered.pc_dim_heatmap.", range[1], "-", range[2], ".pdf"),
        width = 9, height = 12
    )
    DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], cells = 500, balanced = TRUE)
    dev.off()
}

p <- ElbowPlot(seu_obj, ndims = 100) + scale_x_continuous(breaks = scales::breaks_extended(20))
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".lo.filtered.elbow.pdf"),
    width = 1000,
    height = 400
)

pcs <- 1:50

seu_obj <- FindNeighbors(seu_obj, dims = pcs)

# Stable clusters:
# 从 parent clusters 的角度看，其不应该是 ≥ 2 个 parent clusters 的混合。
# 从 child clusters 的角度看，其 child clusters 应该只来源于其自身。
# 即随着 resolution 的增加，一个 stable cluster 应该要么保持不变，
# 要么干净的分裂为自己的 child clusters（child clusters 只来源自身）。
resolutions <- seq(0.5, 2, 0.2)
for (resolution in resolutions) {
    seu_obj <- FindClusters(seu_obj, resolution = resolution)
}
p <- clustree(seu_obj[[]], prefix = "SCT_snn_res.", node_label_size = 10, show_axis = TRUE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".lo.filtered.resolutions.cluster_tree.pdf"),
    width = 1500,
    height = 700
)

seu_obj <- RunUMAP(seu_obj, dims = pcs)
for (resolution_column in paste0("SCT_snn_res.", resolutions)) {
    p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = resolution_column, label = TRUE) + NoLegend()
    ugd_save_inline(
        {
            print(p)
        },
        file = paste0(n, ".lo.filtered.umap.", resolution_column, ".pdf"),
        width = 800,
        height = 800
    )
}

resolution <- 0.7

## Use scDblFinder to identify heterotypic doublets (not homotypic doublets)
# linear interpolation
# please contact your single-cell service provider to obtain
# the relationship between the number of recovered cells and the estimated doublet rate
estimated_doublet_rate <- (0.03 - 0.025) / (6000 - 5000) * (nrow(seu_obj[[]]) - 5000) + 0.025

Idents(seu_obj) <- paste0("SCT_snn_res.", resolution)
sce <- scDblFinder(
    GetAssayData(seu_obj, assay = "RNA", slot = "counts"),
    clusters = Idents(seu_obj),
    dbr = estimated_doublet_rate
)
seu_obj <- AddMetaData(seu_obj, metadata = as.data.frame(sce@colData))

p <- DimPlot(seu_obj,
    label.size = 6, repel = TRUE, reduction = "umap", label = TRUE,
    group.by = paste0("SCT_snn_res.", resolution),
    cells.highlight = list("Doublet" = row.names(seu_obj[[]])[seu_obj[[]][["scDblFinder.class"]] == "doublet"]),
    cols.highlight = "#EE4000", sizes.highlight = 2, alpha = 0.6
) + ggtitle("scDblFinder.class")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".doublet_labeled.umap.SCT_snn_res.", resolution, ".pdf"),
    width = 700,
    height = 700
)

dbl_stats_df <- inner_join(
    seu_obj[[]] %>%
        select(all_of(paste0("SCT_snn_res.", resolution))) %>%
        group_by(.data[[paste0("SCT_snn_res.", resolution)]]) %>%
        reframe(cluster_n = n()),
    seu_obj[[]] %>%
        select(all_of(c(paste0("SCT_snn_res.", resolution), "scDblFinder.class"))) %>%
        group_by(.data[[paste0("SCT_snn_res.", resolution)]], .data[["scDblFinder.class"]]) %>%
        reframe(n = n()),
    by = paste0("SCT_snn_res.", resolution)
) %>%
    mutate(percent = n / cluster_n * 100) %>%
    arrange(.data[["scDblFinder.class"]], desc(percent))
vroom_write(dbl_stats_df, file = "scDblFinder.doublet_proportion.tsv", col_names = TRUE, append = FALSE)

vroom_write(seu_obj[[]] %>% mutate(cell_barcode = row.names(.)), file = "scDblFinder.seurat_metadata.tsv", col_names = TRUE, append = FALSE)
saveRDS(seu_obj, file = "scDblFinder.seu_obj_to_umap.rds")

Standard Seurat processing workflow: remove low-quality cells/doublets.

library(Seurat)
library(patchwork)
library(YRUtils)
library(unigd)
library(tidyverse)
library(clustree)
library(vroom)
library(magrittr)

work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
sample_dir <- c(
    "Glioma" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176044-250818-Glioma_20250928_004852/JZ25176044-250818-Glioma_outs/filtered_cell_gene_matrix"
)
doubletfinder_metadata_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Glioma_DoubletFinder/DoubletFinder.seurat_metadata.tsv"
scdblfinder_metadata_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Glioma_scDblFinder/scDblFinder.seurat_metadata.tsv"
doubletfinder_column <- "DF.classifications_0.25_0.25_126"

n <- names(sample_dir)

work_dir <- file.path(work_dir, n)
dir.create(work_dir)
setwd(work_dir)

counts <- Read10X(sample_dir)

doubletfinder_metadata_df <- vroom(doubletfinder_metadata_file) %>%
    select(all_of(c("cell_barcode", doubletfinder_column)))
scdblfinder_metadata_df <- vroom(scdblfinder_metadata_file) %>%
    select(cell_barcode, scDblFinder.class)
doublet_metadata_df <- full_join(doubletfinder_metadata_df, scdblfinder_metadata_df, by = "cell_barcode")

if (any(is.na(doublet_metadata_df$cell_barcode))) {
    stop("some cell barcodes are NAs")
}

doublet_metadata_df <- tibble(cell_barcode = colnames(counts)) %>%
    left_join(doublet_metadata_df, by = "cell_barcode") %>%
    mutate(
        across(all_of(c(doubletfinder_column, "scDblFinder.class")), ~ replace(.x, is.na(.x), "singlet")),
        across(all_of(c(doubletfinder_column, "scDblFinder.class")), str_to_lower),
        merged_doublet_class = if_else(.data[[doubletfinder_column]] == "doublet" & scDblFinder.class == "doublet", "HC_Doublet",
            if_else(.data[[doubletfinder_column]] == "doublet", "DF_Doublet",
                if_else(scDblFinder.class == "doublet", "SDF_Doublet", "Singlet")
            )
        )
    ) %>%
    as.data.frame() %>%
    set_rownames(.[["cell_barcode"]]) %>%
    select(-cell_barcode)

table(doublet_metadata_df$merged_doublet_class)
table(doublet_metadata_df[[doubletfinder_column]], doublet_metadata_df$scDblFinder.class)

seu_obj <- CreateSeuratObject(
    counts = counts, project = n,
    min.cells = 10, meta.data = doublet_metadata_df
)

seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^mt-")

p <- VlnPlot(seu_obj, group.by = "merged_doublet_class", features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 1, assay = "RNA", layer = "counts")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".qc_violin.raw.pdf"),
    width = 500,
    height = 900
)

p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
    {
        print(p1 + p2)
    },
    file = paste0(n, ".qc_scatter.raw.pdf"),
    width = 900,
    height = 400
)

seu_obj <- subset(seu_obj, subset = nCount_RNA > 500 & nFeature_RNA > 250 & percent.mt < 2.5 & merged_doublet_class == "Singlet")

p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".qc_violin.filtered.pdf"),
    width = 800,
    height = 400
)

p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
    {
        print(p1 + p2)
    },
    file = paste0(n, ".qc_scatter.filtered.pdf"),
    width = 900,
    height = 400
)

seu_obj <- SCTransform(seu_obj, vars.to.regress = "percent.mt", variable.features.n = 3000)
seu_obj <- RunPCA(seu_obj, npcs = 100, features = VariableFeatures(object = seu_obj))

VizDimLoadings(seu_obj, dims = 1:2, reduction = "pca")
DimPlot(seu_obj, reduction = "pca") + NoLegend()
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
    pdf(
        file = paste0(n, ".pc_dim_heatmap.", range[1], "-", range[2], ".pdf"),
        width = 9, height = 12
    )
    DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], cells = 500, balanced = TRUE)
    dev.off()
}

p <- ElbowPlot(seu_obj, ndims = 100) + scale_x_continuous(breaks = scales::breaks_extended(20))
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".elbow.pdf"),
    width = 1000,
    height = 400
)

pcs <- 1:25

seu_obj <- FindNeighbors(seu_obj, dims = pcs)

# Stable clusters:
# 从 parent clusters 的角度看，其不应该是 ≥ 2 个 parent clusters 的混合。
# 从 child clusters 的角度看，其 child clusters 应该只来源于其自身。
# 即随着 resolution 的增加，一个 stable cluster 应该要么保持不变，
# 要么干净的分裂为自己的 child clusters（child clusters 只来源自身）。
resolutions <- seq(0.5, 2, 0.2)
for (resolution in resolutions) {
    seu_obj <- FindClusters(seu_obj, resolution = resolution)
}
p <- clustree(seu_obj[[]], prefix = "SCT_snn_res.", node_label_size = 10, show_axis = TRUE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".resolutions.cluster_tree.pdf"),
    width = 1500,
    height = 700
)

seu_obj <- RunUMAP(seu_obj, dims = pcs)
for (resolution_column in paste0("SCT_snn_res.", resolutions)) {
    p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = resolution_column, label = TRUE) + NoLegend()
    ugd_save_inline(
        {
            print(p)
        },
        file = paste0(n, ".umap.", resolution_column, ".pdf"),
        width = 800,
        height = 800
    )
}

resolution <- 0.5

Idents(seu_obj) <- paste0("SCT_snn_res.", resolution)

saveRDS(seu_obj, file = "seu_obj_to_umap.rds")

Annotate cell types based on canonical markers/DEGs:

library(Seurat)
library(vroom)
library(tidyverse)
library(YRUtils)
library(unigd)

work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
seu_obj_rds_file <- c(
    "Glioma" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Glioma/seu_obj_to_umap.rds"
)
canonical_markers_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/data/mm_hippocampus_gene_markers.txt"
resolution_column <- "SCT_snn_res.0.5"

n <- names(seu_obj_rds_file)

work_dir <- file.path(work_dir, paste0(n, "_annotate_cell_types"))
dir.create(work_dir)
setwd(work_dir)

seu_obj <- readRDS(seu_obj_rds_file)
Idents(seu_obj) <- resolution_column

DefaultAssay(seu_obj) <- "RNA"
seu_obj <- NormalizeData(seu_obj)
seu_obj <- ScaleData(seu_obj, features = row.names(seu_obj))

seu_obj_markers <- FindAllMarkers(seu_obj, only.pos = TRUE, logfc.threshold = 0.5, min.pct = 0.25)
seu_obj_markers <- seu_obj_markers %>%
    mutate(pct_diff = pct.1 - pct.2) %>%
    arrange(cluster, p_val_adj, desc(avg_log2FC), desc(pct_diff))
vroom_write(seu_obj_markers, file = "seu_obj.pos_markers.tsv", col_names = TRUE, append = FALSE)

top_n <- 10

top_seu_obj_markers <- seu_obj_markers %>%
    group_by(cluster) %>%
    filter(avg_log2FC > 1 & p_val_adj < 0.05) %>%
    slice_head(n = top_n) %>%
    ungroup()
p <- DoHeatmap(seu_obj, features = top_seu_obj_markers$gene, label = TRUE, group.bar.height = 0.005, raster = FALSE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".top", top_n, "_DEGs.heatmap.pdf"),
    width = 1200,
    height = 3200
)

canonical_markers_df <- vroom(canonical_markers_file) %>%
    separate_longer_delim(cols = "marker", delim = "/") %>%
    mutate(marker = str_to_title(trimws(marker))) %>%
    distinct() %>%
    filter(marker %in% row.names(seu_obj)) %>%
    mutate(
        marker = factor(marker, levels = marker),
        rank = 1:n()
    )

table(duplicated(canonical_markers_df$marker))

cell_type_anno_vec <- sapply(
    split(canonical_markers_df, canonical_markers_df$cell_type),
    function(df) sum(df$rank) / nrow(df)
)
cell_type_anno_df <- tibble(cell_type = names(cell_type_anno_vec), x = cell_type_anno_vec)

p1 <- DotPlot(
    seu_obj,
    features = canonical_markers_df$marker,
    cluster.idents = TRUE,
    dot.min = 0.1
) +
    theme(
        axis.text.x.bottom = element_text(angle = 90, hjust = 1, vjust = 0.5),
        axis.title = element_blank(),
        plot.margin = margin(0, 0, 0, 0)
    )
Idents(seu_obj) <- factor(Idents(seu_obj), levels = levels(p1$data$id))

p2 <- ggplot() +
    geom_tile(
        mapping = aes(x = marker, y = 1, fill = cell_type),
        data = canonical_markers_df, color = NA,
        height = 2
    ) +
    geom_text(
        mapping = aes(x, y = -0.5, label = cell_type), data = cell_type_anno_df,
        angle = 90, vjust = 0.5, hjust = 1
    ) +
    scale_y_continuous(
        expand = expansion(0),
        limits = c(-50, 2)
    ) +
    theme_void() +
    theme(
        legend.position = "none",
        plot.margin = margin(0, 0, 0, 0)
    )
p <- aplot::insert_bottom(p1, p2, height = 1)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".canonical_markers.dotplot.pdf"),
    width = 1500,
    height = 1000
)

p <- VlnPlot(
    seu_obj,
    features = canonical_markers_df$marker,
    split.by = "ident",
    stack = TRUE,
    raster = TRUE
) + NoLegend()
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".canonical_markers.stacked_violin.pdf"),
    width = 1400,
    height = 600
)

p1 <- VlnPlot(seu_obj, features = "percent.mt", group.by = resolution_column) +
    geom_hline(yintercept = 2.5) +
    NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p2 <- VlnPlot(seu_obj, features = "nCount_RNA", group.by = resolution_column) +
    NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p3 <- VlnPlot(seu_obj, features = "nFeature_RNA", group.by = resolution_column) +
    NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p <- p1 / p2 / p3
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".nCount_nFeature_MT.qc_violin.cluster-wise.pdf"),
    width = 1000,
    height = 1000
)

cell_types <- c(
    "12" = "OPC", "1" = "Oligodendrocytes", "7" = "Microglia",
    "3" = "Astrocytes", "13" = "VLMCs", "9" = "Sst IN",
    "10" = "Cck IN", "0" = "DG EN", "6" = "PLN",
    "11" = "EN1", "8" = "EN2", "5" = "EN3", "2" = "EN4",
    "4" = "EN5", "14" = "EN6"
)

seu_obj <- RenameIdents(seu_obj, cell_types)
seu_obj$cell_types <- Idents(seu_obj)

p <- DimPlot(seu_obj, label.size = 4, repel = TRUE, reduction = "umap", group.by = "cell_types", label = TRUE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".umap.cell_types.pdf"),
    width = 600,
    height = 400
)

saveRDS(seu_obj, file = "seu_obj_to_annotation.rds")

2.2 Integrative analysis

2.2.1 Integration between astrocyte-treated sample and glioma-treated sample

Integration with CCA:

library(Seurat)
library(patchwork)
library(unigd)
library(tidyverse)
library(clustree)

work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
seu_obj_rds_files <- c(
    "Astrocyte" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Astrocyte_annotate_cell_types/seu_obj_to_annotation.rds",
    "Glioma" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Glioma_annotate_cell_types/seu_obj_to_annotation.rds"
)

work_dir <- file.path(work_dir, paste0(paste0(names(seu_obj_rds_files), collapse = "_"), ".integration"))
dir.create(work_dir)
setwd(work_dir)

seu_obj_ls <- list()
for (n in names(seu_obj_rds_files)) {
    seu_obj_ls[[n]] <- readRDS(seu_obj_rds_files[n])
}
seu_obj <- merge(x = seu_obj_ls[[1]], y = seu_obj_ls[[2]])

n_hvgs <- 3000
seu_obj <- SCTransform(seu_obj, vars.to.regress = "percent.mt", variable.features.n = n_hvgs)
seu_obj <- RunPCA(seu_obj, npcs = 100, features = VariableFeatures(object = seu_obj))

VizDimLoadings(seu_obj, dims = 1:2, reduction = "pca")
DimPlot(seu_obj, reduction = "pca") + NoLegend()
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
    pdf(
        file = paste0(paste0(names(seu_obj_rds_files), collapse = "_"), ".pc_dim_heatmap.", range[1], "-", range[2], ".pdf"),
        width = 9, height = 12
    )
    DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], cells = 500, balanced = TRUE)
    dev.off()
}

p <- ElbowPlot(seu_obj, ndims = 100) + scale_x_continuous(breaks = scales::breaks_extended(20))
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(paste0(names(seu_obj_rds_files), collapse = "_"), ".elbow.pdf"),
    width = 1000,
    height = 400
)

# the number of PCs specified here differs from the number of PCs (controlled by the `dims` parameter) used for dataset integration
# just for checking batch effects
pcs <- 1:30

seu_obj <- RunUMAP(seu_obj, dims = pcs)

p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = c("orig.ident", "cell_types"), label = TRUE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(paste0(names(seu_obj_rds_files), collapse = "_"), ".umap.group.before_integration.pdf"),
    width = 1600,
    height = 800
)

p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", split.by = "orig.ident", label = TRUE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(paste0(names(seu_obj_rds_files), collapse = "_"), ".umap.split.before_integration.pdf"),
    width = 1600,
    height = 800
)

if (inherits(seu_obj[[DefaultAssay(seu_obj)]], what = "SCTAssay")) {
    features <- SelectSCTIntegrationFeatures(seu_obj, assay = DefaultAssay(seu_obj), nfeatures = n_hvgs)
} else {
    stop("please run SCTransform first and set SCT as the default assay")
}

# CCA
# extremely slow
# here, we leave the parameter `dims` unset
seu_obj <- IntegrateLayers(
    seu_obj,
    features = features, assay = DefaultAssay(seu_obj),
    orig.reduction = "pca", new.reduction = "integrated.cca",
    method = CCAIntegration, normalization.method = "SCT"
)
seu_obj[["RNA"]] <- JoinLayers(seu_obj[["RNA"]])

ncol(seu_obj@reductions$integrated.cca)

for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
    pdf(
        file = paste0(paste0(names(seu_obj_rds_files), collapse = "_"), ".cca_dim_heatmap.", range[1], "-", range[2], ".pdf"),
        width = 9, height = 12
    )
    DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], reduction = "integrated.cca", cells = 500, balanced = TRUE)
    dev.off()
}

pcs <- 1:30

seu_obj <- FindNeighbors(seu_obj, dims = pcs, reduction = "integrated.cca")

# Stable clusters:
# 从 parent clusters 的角度看，其不应该是 ≥ 2 个 parent clusters 的混合。
# 从 child clusters 的角度看，其 child clusters 应该只来源于其自身。
# 即随着 resolution 的增加，一个 stable cluster 应该要么保持不变，
# 要么干净的分裂为自己的 child clusters（child clusters 只来源自身）。
resolutions <- seq(0.2, 2, 0.1)
for (resolution in resolutions) {
    seu_obj <- FindClusters(seu_obj, resolution = resolution)
}
p <- clustree(seu_obj[[]], prefix = "SCT_snn_res.", node_label_size = 10, show_axis = TRUE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(paste0(names(seu_obj_rds_files), collapse = "_"), ".resolutions.cluster_tree.pdf"),
    width = 2000,
    height = 1400
)

seu_obj <- RunUMAP(seu_obj, dims = pcs, reduction = "integrated.cca")
for (resolution_column in paste0("SCT_snn_res.", resolutions)) {
    p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = resolution_column, label = TRUE) + NoLegend()
    ugd_save_inline(
        {
            print(p)
        },
        file = paste0(paste0(names(seu_obj_rds_files), collapse = "_"), ".umap.", resolution_column, ".pdf"),
        width = 800,
        height = 800
    )
}

resolution <- 0.5

Idents(seu_obj) <- paste0("SCT_snn_res.", resolution)

saveRDS(seu_obj, file = "seu_obj_to_umap.rds")

Annotate cell types based on canonical markers/DEGs:

library(Seurat)
library(vroom)
library(tidyverse)
library(YRUtils)
library(unigd)

work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
seu_obj_rds_file <- c(
    "Astrocyte_Glioma" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Astrocyte_Glioma.integration/seu_obj_to_umap.rds"
)
canonical_markers_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/data/mm_hippocampus_gene_markers.txt"
resolution_column <- "SCT_snn_res.0.4"

n <- names(seu_obj_rds_file)

work_dir <- file.path(work_dir, paste0(n, ".annotate_cell_types"))
dir.create(work_dir)
setwd(work_dir)

seu_obj <- readRDS(seu_obj_rds_file)
Idents(seu_obj) <- resolution_column

p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = c("orig.ident", "cell_types"), label = TRUE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".umap.group.before_annotation.pdf"),
    width = 1600,
    height = 800
)

p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", split.by = "orig.ident", label = TRUE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".umap.split.before_annotation.pdf"),
    width = 1600,
    height = 800
)

DefaultAssay(seu_obj) <- "RNA"
seu_obj <- NormalizeData(seu_obj)
seu_obj <- ScaleData(seu_obj, features = row.names(seu_obj))

seu_obj_markers <- FindAllMarkers(seu_obj, only.pos = TRUE, logfc.threshold = 0.5, min.pct = 0.25)
seu_obj_markers <- seu_obj_markers %>%
    mutate(pct_diff = pct.1 - pct.2) %>%
    arrange(cluster, p_val_adj, desc(avg_log2FC), desc(pct_diff))
vroom_write(seu_obj_markers, file = "seu_obj.pos_markers.tsv", col_names = TRUE, append = FALSE)

top_n <- 10

top_seu_obj_markers <- seu_obj_markers %>%
    group_by(cluster) %>%
    filter(avg_log2FC > 1 & p_val_adj < 0.05) %>%
    slice_head(n = top_n) %>%
    ungroup()
p <- DoHeatmap(seu_obj, features = top_seu_obj_markers$gene, label = TRUE, group.bar.height = 0.005, raster = FALSE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".top", top_n, "_DEGs.heatmap.pdf"),
    width = 1200,
    height = 3200
)

canonical_markers_df <- vroom(canonical_markers_file) %>%
    separate_longer_delim(cols = "marker", delim = "/") %>%
    mutate(marker = str_to_title(trimws(marker))) %>%
    distinct() %>%
    filter(marker %in% row.names(seu_obj)) %>%
    mutate(
        marker = factor(marker, levels = marker),
        rank = 1:n()
    )

table(duplicated(canonical_markers_df$marker))

cell_type_anno_vec <- sapply(
    split(canonical_markers_df, canonical_markers_df$cell_type),
    function(df) sum(df$rank) / nrow(df)
)
cell_type_anno_df <- tibble(cell_type = names(cell_type_anno_vec), x = cell_type_anno_vec)

p1 <- DotPlot(
    seu_obj,
    features = canonical_markers_df$marker,
    cluster.idents = TRUE,
    dot.min = 0.1
) +
    theme(
        axis.text.x.bottom = element_text(angle = 90, hjust = 1, vjust = 0.5),
        axis.title = element_blank(),
        plot.margin = margin(0, 0, 0, 0)
    )
Idents(seu_obj) <- factor(Idents(seu_obj), levels = levels(p1$data$id))

p2 <- ggplot() +
    geom_tile(
        mapping = aes(x = marker, y = 1, fill = cell_type),
        data = canonical_markers_df, color = NA,
        height = 2
    ) +
    geom_text(
        mapping = aes(x, y = -0.5, label = cell_type), data = cell_type_anno_df,
        angle = 90, vjust = 0.5, hjust = 1
    ) +
    scale_y_continuous(
        expand = expansion(0),
        limits = c(-50, 2)
    ) +
    theme_void() +
    theme(
        legend.position = "none",
        plot.margin = margin(0, 0, 0, 0)
    )
p <- aplot::insert_bottom(p1, p2, height = 1)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".canonical_markers.dotplot.pdf"),
    width = 1500,
    height = 1000
)

p <- VlnPlot(
    seu_obj,
    features = canonical_markers_df$marker,
    split.by = "ident",
    stack = TRUE,
    raster = TRUE
) + NoLegend()
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".canonical_markers.stacked_violin.pdf"),
    width = 1400,
    height = 600
)

p1 <- VlnPlot(seu_obj, features = "percent.mt", group.by = resolution_column) +
    geom_hline(yintercept = 2.5) +
    NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p2 <- VlnPlot(seu_obj, features = "nCount_RNA", group.by = resolution_column) +
    NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p3 <- VlnPlot(seu_obj, features = "nFeature_RNA", group.by = resolution_column) +
    NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p <- p1 / p2 / p3
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".nCount_nFeature_MT.qc_violin.cluster-wise.pdf"),
    width = 1000,
    height = 1000
)

cell_types <- c(
    "12" = "OPC", "1" = "Oligodendrocytes", "8" = "Microglia",
    "4" = "Astrocytes", "14" = "VLMCs", "9" = "Sst IN",
    "10" = "Cck IN", "16" = "Cajal-Retzius EN",
    "0" = "DG EN", "13" = "DG EN", "2" = "CA1 EN1",
    "6" = "CA1 EN2", "7" = "CA1 EN3", "11" = "CA1 EN4",
    "3" = "EN1", "5" = "PLN", "15" = "CA3 EN"
)

seu_obj <- RenameIdents(seu_obj, cell_types)
seu_obj$integrated_cell_types <- Idents(seu_obj)

p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = c("orig.ident", "integrated_cell_types"), label = TRUE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".umap.group.after_annotation.pdf"),
    width = 1400,
    height = 600
)

p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", split.by = "orig.ident", label = TRUE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".umap.split.after_annotation.pdf"),
    width = 1400,
    height = 600
)

saveRDS(seu_obj, file = "seu_obj_to_annotation.rds")

DE analysis of each cell type between conditions: there is only one biological replicate here.

library(Seurat)
library(vroom)
library(tidyverse)
library(YRUtils)
library(unigd)

work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
gene_id_mpt_file <- "/data/database/annotation/release/mus_musculus/Mus_musculus.GRCm39.vM38.gencode.basic.chr.annotation.gene_id_name_mapping_table.tsv"
seu_obj_rds_file <- c(
    "Astrocyte_Glioma" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Astrocyte_Glioma.annotate_cell_types/seu_obj_to_annotation.rds"
)
sample_levels <- c("Glioma", "Astrocyte")
n_th <- 50

n <- names(seu_obj_rds_file)

work_dir <- file.path(work_dir, paste0(n, ".de_analysis"))
dir.create(work_dir)
setwd(work_dir)

seu_obj <- readRDS(seu_obj_rds_file)

prop_df <- seu_obj[[]] %>%
    select(orig.ident, integrated_cell_types) %>%
    group_by(integrated_cell_types, orig.ident) %>%
    reframe(n_condition = n()) %>%
    group_by(integrated_cell_types) %>%
    reframe(orig.ident, n_condition, n_total = sum(n_condition), pct_condition = n_condition / n_total) %>%
    arrange(desc(n_total)) %>%
    mutate(
        integrated_cell_types = factor(integrated_cell_types, levels = unique(integrated_cell_types)),
        orig.ident = factor(orig.ident, levels = sample_levels),
        combined_ident = paste0(integrated_cell_types, "@", orig.ident),
        gt_th = if_else(n_condition >= n_th, TRUE, FALSE)
    )
vroom_write(
    prop_df,
    file = paste0(n, ".condition_prop_for_each_cell_type.tsv"),
    col_names = TRUE, append = FALSE
)

p <- ggplot() +
    geom_bar(
        mapping = aes(integrated_cell_types, n_condition, fill = orig.ident),
        data = prop_df, stat = "identity", color = NA
    ) +
    geom_hline(yintercept = 100) +
    labs(x = "Cell types", y = "Number of cells", fill = "Condition") +
    scale_y_continuous(expand = expansion(0)) +
    theme_classic(base_family = "Arial", base_size = 20) +
    theme(
        axis.text.x.bottom = element_text(angle = 45, vjust = 1, hjust = 1),
        axis.text = element_text(colour = "black")
    )
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".condition_prop_for_each_cell_type.pdf"),
    width = 700,
    height = 400
)

seu_obj$combined_ident <- paste0(seu_obj$integrated_cell_types, "@", seu_obj$orig.ident)
Idents(seu_obj) <- "combined_ident"

gene_id_mpt <- vroom(gene_id_mpt_file)

for (cell_type in unique(prop_df$integrated_cell_types)) {
    tmp_prop_df <- filter(prop_df, integrated_cell_types == cell_type)
    if (nrow(tmp_prop_df) != 2 || !all(tmp_prop_df$gt_th)) {
        message("skipping '", cell_type, "' ...")
        next
    }

    downsample_n <- min(tmp_prop_df$n_condition)
    ident_1 <- tmp_prop_df$combined_ident[tmp_prop_df$orig.ident == sample_levels[1]]
    ident_2 <- tmp_prop_df$combined_ident[tmp_prop_df$orig.ident == sample_levels[2]]
    de_res <- FindMarkers(
        seu_obj,
        ident.1 = ident_1,
        ident.2 = ident_2,
        logfc.threshold = 0,
        test.use = "wilcox",
        min.pct = 0.1,
        min.diff.pct = -Inf,
        only.pos = FALSE,
        max.cells.per.ident = downsample_n,
        random.seed = 123456
    )

    de_res <- de_res %>%
        mutate(gene_name = row.names(.)) %>%
        left_join(gene_id_mpt, by = "gene_name") %>%
        arrange(desc(avg_log2FC), p_val_adj)
    de_res_filtered <- filter(de_res, gene_biotype == "protein_coding") %>%
        group_by(gene_name) %>%
        slice_max(order_by = abs(avg_log2FC), n = 1) %>%
        slice_sample(n = 1) %>%
        arrange(desc(avg_log2FC), p_val_adj)

    vroom_write(
        de_res,
        file = paste0(n, ".", gsub("@", "-", gsub(" ", "_", paste0(ident_1, "_vs_", ident_2))), ".raw.tsv"),
        col_names = TRUE, append = FALSE
    )
    vroom_write(
        de_res_filtered,
        file = paste0(n, ".", gsub("@", "-", gsub(" ", "_", paste0(ident_1, "_vs_", ident_2))), ".unique.tsv"),
        col_names = TRUE, append = FALSE
    )
}

GO enrichment analysis:

library(YRUtils)
library(tidyverse)
library(vroom)
library(clusterProfiler)

work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
orgdb_file <- "/data/database/annotation/release/mus_musculus/org.Mm.eg.db.sqlite"
degs_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Astrocyte_Glioma.de_analysis"
keytype <- "SYMBOL"
sample_levels <- c("Glioma", "Astrocyte")
logfc_th <- 0.5
pval_th <- 0.05
n <- "Astrocyte_Glioma"

work_dir <- file.path(work_dir, paste0(n, ".go_analysis"))
dir.create(work_dir)
setwd(work_dir)

orgdb <- AnnotationDbi::loadDb(orgdb_file)
# go_terms <- AnnotationDbi::Ontology(GO.db::GOTERM)
# go2gene <- AnnotationDbi::mapIds(orgdb, keys = names(go_terms), column = keytype, keytype = "GOALL", multiVals = "list")
# go_anno <- stack(go2gene)
# colnames(go_anno) <- c(keytype, "GOALL")
# go_anno <- unique(go_anno[!is.na(go_anno[, 1]), ])
# go_anno$ONTOLOGYALL <- go_terms[go_anno$GOALL]
# go_genes <- unique(na.omit(go_anno$SYMBOL))

degs_files <- list.files(degs_dir, pattern = "\\.unique\\.tsv$", full.names = TRUE, recursive = FALSE)
for (degs_file in degs_files) {
    degs <- vroom(degs_file)
    sig_degs <- degs %>%
        dplyr::select(gene_name, avg_log2FC, p_val) %>%
        na.omit() %>%
        distinct() %>%
        filter(abs(avg_log2FC) > logfc_th & p_val < pval_th) %>%
        mutate(diff_flag = if_else(avg_log2FC > 0, sample_levels[1], sample_levels[2]))

    if (nrow(sig_degs) > 0) {
        gene_ls <- lapply(split(sig_degs, sig_degs$diff_flag), function(df) df[["gene_name"]])
    } else {
        message("no DEGs found in ", degs_file)
        next
    }

    ego <- compareCluster(
        gene_ls,
        OrgDb = orgdb,
        keyType = keytype,
        fun = "enrichGO",
        ont = "ALL",
        pvalueCutoff = 0.05,
        qvalueCutoff = 0.05,
        pAdjustMethod = "BH",
        minGSSize = 10,
        maxGSSize = 500,
        readable = FALSE,
        pool = FALSE
    )

    if (!is.null(ego)) {
        saveRDS(ego, file = gsub("\\.unique\\.tsv$", ".GO_ALL.rds", basename(degs_file)))
        vroom_write(
            ego@compareClusterResult,
            file = gsub("\\.unique\\.tsv$", ".GO_ALL.tsv", basename(degs_file)),
            col_names = TRUE, append = FALSE
        )
    }
}

3 Appendices

3.1 Remove ambient RNAs using decontX

3.1.1 Astrocyte-treated sample

Remove ambient RNAs using decontX:

library(Seurat)
library(decontX)
library(SingleCellExperiment)

filtered_mat_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176043-250818-Astro_20250927_221710/JZ25176043-250818-Astro_outs/filtered_cell_gene_matrix"
raw_mat_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176043-250818-Astro_20250927_221710/JZ25176043-250818-Astro_outs/raw_cell_gene_matrix"
output_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX/data"
n <- "Astrocyte"

filtered_counts <- Read10X(filtered_mat_dir)
raw_counts <- Read10X(raw_mat_dir)

filtered_sce <- SingleCellExperiment(list(counts = filtered_counts))
raw_sce <- SingleCellExperiment(list(counts = raw_counts))

filtered_sce <- decontX(filtered_sce, background = raw_sce)
filtered_seu_obj <- CreateSeuratObject(round(decontXcounts(filtered_sce)))

saveRDS(filtered_sce, file = file.path(output_dir, paste0(n, ".decontX_filtered.sce_obj.rds")))
saveRDS(filtered_seu_obj, file = file.path(output_dir, paste0(n, ".decontX_filtered.seu_obj.rds")))

Predict doublets using DoubletFinder:

library(Seurat)
library(patchwork)
library(YRUtils)
library(unigd)
library(tidyverse)
library(clustree)
library(DoubletFinder)
library(vroom)

set.seed(123456)

work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX"
ori_seu_obj_rds_file <- c(
    "Astrocyte" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX/data/Astrocyte.decontX_filtered.seu_obj.rds"
)

n <- names(ori_seu_obj_rds_file)

work_dir <- file.path(work_dir, paste0(n, "_DoubletFinder"))
dir.create(work_dir)
setwd(work_dir)

ori_seu_obj <- readRDS(ori_seu_obj_rds_file)
seu_obj <- CreateSeuratObject(
    counts = GetAssayData(ori_seu_obj, assay = "RNA", layer = "counts"),
    project = n,
    min.cells = 10
)

seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^mt-")

p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".qc_violin.raw.pdf"),
    width = 800,
    height = 400
)

p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
    {
        print(p1 + p2)
    },
    file = paste0(n, ".qc_scatter.raw.pdf"),
    width = 900,
    height = 400
)

seu_obj <- subset(seu_obj, subset = nCount_RNA > 500 & nFeature_RNA > 250 & percent.mt < 10)

p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".qc_violin.lo.filtered.pdf"),
    width = 800,
    height = 400
)

p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
    {
        print(p1 + p2)
    },
    file = paste0(n, ".qc_scatter.lo.filtered.pdf"),
    width = 900,
    height = 400
)

seu_obj <- SCTransform(seu_obj, vars.to.regress = "percent.mt", variable.features.n = 3000)
seu_obj <- RunPCA(seu_obj, npcs = 100, features = VariableFeatures(object = seu_obj))

VizDimLoadings(seu_obj, dims = 1:2, reduction = "pca")
DimPlot(seu_obj, reduction = "pca") + NoLegend()
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
    pdf(
        file = paste0(n, ".lo.filtered.pc_dim_heatmap.", range[1], "-", range[2], ".pdf"),
        width = 9, height = 12
    )
    DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], cells = 500, balanced = TRUE)
    dev.off()
}

p <- ElbowPlot(seu_obj, ndims = 100) + scale_x_continuous(breaks = scales::breaks_extended(20))
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".lo.filtered.elbow.pdf"),
    width = 1000,
    height = 400
)

pcs <- 1:50

seu_obj <- FindNeighbors(seu_obj, dims = pcs)

# Stable clusters:
# 从 parent clusters 的角度看，其不应该是 ≥ 2 个 parent clusters 的混合。
# 从 child clusters 的角度看，其 child clusters 应该只来源于其自身。
# 即随着 resolution 的增加，一个 stable cluster 应该要么保持不变，
# 要么干净的分裂为自己的 child clusters（child clusters 只来源自身）。
resolutions <- seq(0.5, 2, 0.2)
for (resolution in resolutions) {
    seu_obj <- FindClusters(seu_obj, resolution = resolution)
}
p <- clustree(seu_obj[[]], prefix = "SCT_snn_res.", node_label_size = 10, show_axis = TRUE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".lo.filtered.resolutions.cluster_tree.pdf"),
    width = 1500,
    height = 700
)

seu_obj <- RunUMAP(seu_obj, dims = pcs)
for (resolution_column in paste0("SCT_snn_res.", resolutions)) {
    p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = resolution_column, label = TRUE) + NoLegend()
    ugd_save_inline(
        {
            print(p)
        },
        file = paste0(n, ".lo.filtered.umap.", resolution_column, ".pdf"),
        width = 800,
        height = 800
    )
}

resolution <- 0.7

## Use DoubletFinder to identify heterotypic doublets (not homotypic doublets)
# the way DoubletFinder identifies heterotypic doublets can be broken up into 4 steps:
# 1. generate artifical doublets from existing scRNA-seq data;
# 2. pre-process merged real-artificial data;
# 3. perform PCA and use the PC distance matrix to find each cell's proportion of artifical k nearest neighbors (pANN);
# 4. rank order and threshold pANN values based on the expected number of doublets.

# DoubletFinder needs the following arguments:
# 1. seu: a fully-processed Seurat object (i.e., after NormalizeData, FindVariableFeatures, ScaleData, RunPCA, FindNeighbors, FindClusters, and RunUMAP);
# 2. PCs: the number of PCs;
# 3. pN: the number of generated artifical doublets, expressed as a proportion of the merged real-artificial data;
# 4. pK: the PC neighborhood size used to compute pANN, expressed as a proportion of the merged real-artificial data;
# 5. nExp: this defines the pANN threshold used to make final doublet/singlet predictions.

# 关于 DoubletFinder 的输入：
# 不应该用于来源于不同库的混合样本，因为来源于不同库的细胞之间就没有形成 doublets 的可能性（物理隔绝）。
# 分别应用于来源于同一库的样本的子（分割）样本是可以的。
# 确保过滤掉低质量的细胞，如低 nCount_RNA/nFeature_RNA，高 percent.mt。

# pK Identification
# DoubletFinder performance is largely invariant of pN value selection
sweep_res_list <- paramSweep(seu_obj, PCs = pcs, sct = TRUE, num.cores = 80)
sweep_stats <- summarizeSweep(sweep_res_list, GT = FALSE)
bcmvn <- find.pK(sweep_stats)
# maximize AUC for data with ground-truth doublet classifications
# maximize mean-variance normalized bimodality coefficient (BCmvn) for data without ground-truth info
optimal_pK <- as.numeric(as.character(bcmvn$pK[which.max(bcmvn$BCmetric)]))

p <- ggplot(bcmvn, aes(pK, BCmetric)) +
    geom_line(group = 1) +
    geom_point() +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".DoubletFinder.bcmvn.line.pdf"),
    width = 700,
    height = 400
)
vroom_write(bcmvn, file = paste0(n, ".DoubletFinder.bcmvn.tsv"), col_names = TRUE, append = FALSE)

# homotypic doublet proportion estimate
# DoubletFinder is insensitive to homotypic doublets
# it will be subtracted from the expected doublet rate
# linear interpolation
# please contact your single-cell service provider to obtain
# the relationship between the number of recovered cells and the estimated doublet rate
estimated_doublet_rate <- (0.03 - 0.025) / (6000 - 5000) * (nrow(seu_obj[[]]) - 5000) + 0.025

# cell type (state) diversity is used to estimate the homotypic doublet proportion
# under the hypothesis that any two cells may form a doublet with equal chance
homotypic_prop <- modelHomotypic(seu_obj[[]][[paste0("SCT_snn_res.", resolution)]])
nExp_poi <- round(estimated_doublet_rate * nrow(seu_obj[[]]))
nExp_poi_adj <- round(nExp_poi * (1 - homotypic_prop))

# run DoubletFinder with varying classification stringencies
seu_obj <- doubletFinder(seu_obj, PCs = pcs, pN = 0.25, pK = optimal_pK, nExp = nExp_poi, reuse.pANN = NULL, sct = TRUE)
seu_obj <- doubletFinder(seu_obj, PCs = pcs, pN = 0.25, pK = optimal_pK, nExp = nExp_poi_adj, reuse.pANN = paste0("pANN_", 0.25, "_", optimal_pK, "_", nExp_poi), sct = TRUE)

p1 <- DimPlot(seu_obj,
    label.size = 6, repel = TRUE, reduction = "umap", label = TRUE,
    group.by = paste0("SCT_snn_res.", resolution),
    cells.highlight = list("Doublet" = row.names(seu_obj[[]])[seu_obj[[]][[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)]] == "Doublet"]),
    cols.highlight = "#EE4000", sizes.highlight = 2, alpha = 0.6
) + ggtitle(paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi))
p2 <- DimPlot(seu_obj,
    label.size = 6, repel = TRUE, reduction = "umap", label = TRUE,
    group.by = paste0("SCT_snn_res.", resolution),
    cells.highlight = list("Doublet_adj" = row.names(seu_obj[[]])[seu_obj[[]][[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)]] == "Doublet"]),
    cols.highlight = "#EE4000", sizes.highlight = 2, alpha = 0.6
) + ggtitle(paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj))
ugd_save_inline(
    {
        print(p1 | p2)
    },
    file = paste0(n, ".doublet_labeled.umap.SCT_snn_res.", resolution, ".pdf"),
    width = 1400,
    height = 700
)

nExp_poi_dbl_stats_df <- inner_join(
    seu_obj[[]] %>%
        select(all_of(paste0("SCT_snn_res.", resolution))) %>%
        group_by(.data[[paste0("SCT_snn_res.", resolution)]]) %>%
        reframe(cluster_n = n()),
    seu_obj[[]] %>%
        select(all_of(c(paste0("SCT_snn_res.", resolution), paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)))) %>%
        group_by(.data[[paste0("SCT_snn_res.", resolution)]], .data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)]]) %>%
        reframe(n = n()),
    by = paste0("SCT_snn_res.", resolution)
) %>%
    mutate(percent = n / cluster_n * 100) %>%
    arrange(.data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi)]], desc(percent))
vroom_write(nExp_poi_dbl_stats_df, file = paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi, ".nExp.doublet_proportion.tsv"), col_names = TRUE, append = FALSE)

nExp_poi_adj_dbl_stats_df <- inner_join(
    seu_obj[[]] %>%
        select(all_of(paste0("SCT_snn_res.", resolution))) %>%
        group_by(.data[[paste0("SCT_snn_res.", resolution)]]) %>%
        reframe(cluster_n = n()),
    seu_obj[[]] %>%
        select(all_of(c(paste0("SCT_snn_res.", resolution), paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)))) %>%
        group_by(.data[[paste0("SCT_snn_res.", resolution)]], .data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)]]) %>%
        reframe(n = n()),
    by = paste0("SCT_snn_res.", resolution)
) %>%
    mutate(percent = n / cluster_n * 100) %>%
    arrange(.data[[paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj)]], desc(percent))
vroom_write(nExp_poi_adj_dbl_stats_df, file = paste0("DF.classifications_", 0.25, "_", optimal_pK, "_", nExp_poi_adj, ".nExp_adj.doublet_proportion.tsv"), col_names = TRUE, append = FALSE)

vroom_write(seu_obj[[]] %>% mutate(cell_barcode = row.names(.)), file = "DoubletFinder.seurat_metadata.tsv", col_names = TRUE, append = FALSE)
saveRDS(seu_obj, file = "DoubletFinder.seu_obj_to_umap.rds")

Predict doublets using scDblFinder:

library(Seurat)
library(patchwork)
library(YRUtils)
library(unigd)
library(tidyverse)
library(clustree)
library(scDblFinder)
library(vroom)

set.seed(123456)

work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX"
ori_seu_obj_rds_file <- c(
    "Astrocyte" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX/data/Astrocyte.decontX_filtered.seu_obj.rds"
)

n <- names(ori_seu_obj_rds_file)

work_dir <- file.path(work_dir, paste0(n, "_scDblFinder"))
dir.create(work_dir)
setwd(work_dir)

ori_seu_obj <- readRDS(ori_seu_obj_rds_file)
seu_obj <- CreateSeuratObject(
    counts = GetAssayData(ori_seu_obj, assay = "RNA", layer = "counts"),
    project = n,
    min.cells = 10
)

seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^mt-")

p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".qc_violin.raw.pdf"),
    width = 800,
    height = 400
)

p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
    {
        print(p1 + p2)
    },
    file = paste0(n, ".qc_scatter.raw.pdf"),
    width = 900,
    height = 400
)

seu_obj <- subset(seu_obj, subset = nCount_RNA > 500 & nFeature_RNA > 250 & percent.mt < 10)

p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".qc_violin.lo.filtered.pdf"),
    width = 800,
    height = 400
)

p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
    {
        print(p1 + p2)
    },
    file = paste0(n, ".qc_scatter.lo.filtered.pdf"),
    width = 900,
    height = 400
)

seu_obj <- SCTransform(seu_obj, vars.to.regress = "percent.mt", variable.features.n = 3000)
seu_obj <- RunPCA(seu_obj, npcs = 100, features = VariableFeatures(object = seu_obj))

VizDimLoadings(seu_obj, dims = 1:2, reduction = "pca")
DimPlot(seu_obj, reduction = "pca") + NoLegend()
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
    pdf(
        file = paste0(n, ".lo.filtered.pc_dim_heatmap.", range[1], "-", range[2], ".pdf"),
        width = 9, height = 12
    )
    DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], cells = 500, balanced = TRUE)
    dev.off()
}

p <- ElbowPlot(seu_obj, ndims = 100) + scale_x_continuous(breaks = scales::breaks_extended(20))
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".lo.filtered.elbow.pdf"),
    width = 1000,
    height = 400
)

pcs <- 1:50

seu_obj <- FindNeighbors(seu_obj, dims = pcs)

# Stable clusters:
# 从 parent clusters 的角度看，其不应该是 ≥ 2 个 parent clusters 的混合。
# 从 child clusters 的角度看，其 child clusters 应该只来源于其自身。
# 即随着 resolution 的增加，一个 stable cluster 应该要么保持不变，
# 要么干净的分裂为自己的 child clusters（child clusters 只来源自身）。
resolutions <- seq(0.5, 2, 0.2)
for (resolution in resolutions) {
    seu_obj <- FindClusters(seu_obj, resolution = resolution)
}
p <- clustree(seu_obj[[]], prefix = "SCT_snn_res.", node_label_size = 10, show_axis = TRUE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".lo.filtered.resolutions.cluster_tree.pdf"),
    width = 1500,
    height = 700
)

seu_obj <- RunUMAP(seu_obj, dims = pcs)
for (resolution_column in paste0("SCT_snn_res.", resolutions)) {
    p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = resolution_column, label = TRUE) + NoLegend()
    ugd_save_inline(
        {
            print(p)
        },
        file = paste0(n, ".lo.filtered.umap.", resolution_column, ".pdf"),
        width = 800,
        height = 800
    )
}

resolution <- 0.7

## Use scDblFinder to identify heterotypic doublets (not homotypic doublets)
# linear interpolation
# please contact your single-cell service provider to obtain
# the relationship between the number of recovered cells and the estimated doublet rate
estimated_doublet_rate <- (0.03 - 0.025) / (6000 - 5000) * (nrow(seu_obj[[]]) - 5000) + 0.025

Idents(seu_obj) <- paste0("SCT_snn_res.", resolution)
sce <- scDblFinder(
    GetAssayData(seu_obj, assay = "RNA", slot = "counts"),
    clusters = Idents(seu_obj),
    dbr = estimated_doublet_rate
)
seu_obj <- AddMetaData(seu_obj, metadata = as.data.frame(sce@colData))

p <- DimPlot(seu_obj,
    label.size = 6, repel = TRUE, reduction = "umap", label = TRUE,
    group.by = paste0("SCT_snn_res.", resolution),
    cells.highlight = list("Doublet" = row.names(seu_obj[[]])[seu_obj[[]][["scDblFinder.class"]] == "doublet"]),
    cols.highlight = "#EE4000", sizes.highlight = 2, alpha = 0.6
) + ggtitle("scDblFinder.class")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".doublet_labeled.umap.SCT_snn_res.", resolution, ".pdf"),
    width = 700,
    height = 700
)

dbl_stats_df <- inner_join(
    seu_obj[[]] %>%
        select(all_of(paste0("SCT_snn_res.", resolution))) %>%
        group_by(.data[[paste0("SCT_snn_res.", resolution)]]) %>%
        reframe(cluster_n = n()),
    seu_obj[[]] %>%
        select(all_of(c(paste0("SCT_snn_res.", resolution), "scDblFinder.class"))) %>%
        group_by(.data[[paste0("SCT_snn_res.", resolution)]], .data[["scDblFinder.class"]]) %>%
        reframe(n = n()),
    by = paste0("SCT_snn_res.", resolution)
) %>%
    mutate(percent = n / cluster_n * 100) %>%
    arrange(.data[["scDblFinder.class"]], desc(percent))
vroom_write(dbl_stats_df, file = "scDblFinder.doublet_proportion.tsv", col_names = TRUE, append = FALSE)

vroom_write(seu_obj[[]] %>% mutate(cell_barcode = row.names(.)), file = "scDblFinder.seurat_metadata.tsv", col_names = TRUE, append = FALSE)
saveRDS(seu_obj, file = "scDblFinder.seu_obj_to_umap.rds")

Standard Seurat processing workflow: remove low-quality cells/doublets.

library(Seurat)
library(patchwork)
library(YRUtils)
library(unigd)
library(tidyverse)
library(clustree)
library(vroom)
library(magrittr)

work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX"
ori_seu_obj_rds_file <- c(
    "Astrocyte" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX/data/Astrocyte.decontX_filtered.seu_obj.rds"
)
doubletfinder_metadata_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX/Astrocyte_DoubletFinder/DoubletFinder.seurat_metadata.tsv"
scdblfinder_metadata_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX/Astrocyte_scDblFinder/scDblFinder.seurat_metadata.tsv"
doubletfinder_column <- "DF.classifications_0.25_0.14_127"

n <- names(ori_seu_obj_rds_file)

work_dir <- file.path(work_dir, n)
dir.create(work_dir)
setwd(work_dir)

ori_seu_obj <- readRDS(ori_seu_obj_rds_file)

doubletfinder_metadata_df <- vroom(doubletfinder_metadata_file) %>%
    select(all_of(c("cell_barcode", doubletfinder_column)))
scdblfinder_metadata_df <- vroom(scdblfinder_metadata_file) %>%
    select(cell_barcode, scDblFinder.class)
doublet_metadata_df <- full_join(doubletfinder_metadata_df, scdblfinder_metadata_df, by = "cell_barcode")

if (any(is.na(doublet_metadata_df$cell_barcode))) {
    stop("some cell barcodes are NAs")
}

doublet_metadata_df <- tibble(cell_barcode = colnames(ori_seu_obj)) %>%
    left_join(doublet_metadata_df, by = "cell_barcode") %>%
    mutate(
        across(all_of(c(doubletfinder_column, "scDblFinder.class")), ~ replace(.x, is.na(.x), "singlet")),
        across(all_of(c(doubletfinder_column, "scDblFinder.class")), str_to_lower),
        merged_doublet_class = if_else(.data[[doubletfinder_column]] == "doublet" & scDblFinder.class == "doublet", "HC_Doublet",
            if_else(.data[[doubletfinder_column]] == "doublet", "DF_Doublet",
                if_else(scDblFinder.class == "doublet", "SDF_Doublet", "Singlet")
            )
        )
    ) %>%
    as.data.frame() %>%
    set_rownames(.[["cell_barcode"]]) %>%
    select(-cell_barcode)

table(doublet_metadata_df$merged_doublet_class)
table(doublet_metadata_df[[doubletfinder_column]], doublet_metadata_df$scDblFinder.class)

seu_obj <- CreateSeuratObject(
    counts = GetAssayData(ori_seu_obj, assay = "RNA", layer = "counts"),
    project = n, min.cells = 10, meta.data = doublet_metadata_df
)

seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^mt-")

p <- VlnPlot(seu_obj, group.by = "merged_doublet_class", features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 1, assay = "RNA", layer = "counts")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".qc_violin.raw.pdf"),
    width = 500,
    height = 900
)

p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
    {
        print(p1 + p2)
    },
    file = paste0(n, ".qc_scatter.raw.pdf"),
    width = 900,
    height = 400
)

seu_obj <- subset(seu_obj, subset = nCount_RNA > 500 & nFeature_RNA > 250 & percent.mt < 1.5 & merged_doublet_class == "Singlet")

p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".qc_violin.filtered.pdf"),
    width = 800,
    height = 400
)

p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
    {
        print(p1 + p2)
    },
    file = paste0(n, ".qc_scatter.filtered.pdf"),
    width = 900,
    height = 400
)

seu_obj <- SCTransform(seu_obj, vars.to.regress = "percent.mt", variable.features.n = 3000)
seu_obj <- RunPCA(seu_obj, npcs = 100, features = VariableFeatures(object = seu_obj))

VizDimLoadings(seu_obj, dims = 1:2, reduction = "pca")
DimPlot(seu_obj, reduction = "pca") + NoLegend()
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
    pdf(
        file = paste0(n, ".pc_dim_heatmap.", range[1], "-", range[2], ".pdf"),
        width = 9, height = 12
    )
    DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], cells = 500, balanced = TRUE)
    dev.off()
}

p <- ElbowPlot(seu_obj, ndims = 100) + scale_x_continuous(breaks = scales::breaks_extended(20))
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".elbow.pdf"),
    width = 1000,
    height = 400
)

pcs <- 1:33

seu_obj <- FindNeighbors(seu_obj, dims = pcs)

# Stable clusters:
# 从 parent clusters 的角度看，其不应该是 ≥ 2 个 parent clusters 的混合。
# 从 child clusters 的角度看，其 child clusters 应该只来源于其自身。
# 即随着 resolution 的增加，一个 stable cluster 应该要么保持不变，
# 要么干净的分裂为自己的 child clusters（child clusters 只来源自身）。
resolutions <- seq(0.5, 2, 0.2)
for (resolution in resolutions) {
    seu_obj <- FindClusters(seu_obj, resolution = resolution)
}
p <- clustree(seu_obj[[]], prefix = "SCT_snn_res.", node_label_size = 10, show_axis = TRUE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".resolutions.cluster_tree.pdf"),
    width = 1500,
    height = 700
)

seu_obj <- RunUMAP(seu_obj, dims = pcs)
for (resolution_column in paste0("SCT_snn_res.", resolutions)) {
    p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = resolution_column, label = TRUE) + NoLegend()
    ugd_save_inline(
        {
            print(p)
        },
        file = paste0(n, ".umap.", resolution_column, ".pdf"),
        width = 800,
        height = 800
    )
}

resolution <- 0.9

Idents(seu_obj) <- paste0("SCT_snn_res.", resolution)

saveRDS(seu_obj, file = "seu_obj_to_umap.rds")

Annotate cell types based on canonical markers/DEGs:

library(Seurat)
library(vroom)
library(tidyverse)
library(YRUtils)
library(unigd)

work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX"
seu_obj_rds_file <- c(
    "Astrocyte" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX/Astrocyte/seu_obj_to_umap.rds"
)
canonical_markers_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/data/mm_hippocampus_gene_markers.txt"
resolution_column <- "SCT_snn_res.1.3"

n <- names(seu_obj_rds_file)

work_dir <- file.path(work_dir, paste0(n, "_annotate_cell_types"))
dir.create(work_dir)
setwd(work_dir)

seu_obj <- readRDS(seu_obj_rds_file)
Idents(seu_obj) <- resolution_column

DefaultAssay(seu_obj) <- "RNA"
seu_obj <- NormalizeData(seu_obj)
seu_obj <- ScaleData(seu_obj, features = row.names(seu_obj))

seu_obj_markers <- FindAllMarkers(seu_obj, only.pos = TRUE, logfc.threshold = 0.5, min.pct = 0.25)
seu_obj_markers <- seu_obj_markers %>%
    mutate(pct_diff = pct.1 - pct.2) %>%
    arrange(cluster, p_val_adj, desc(avg_log2FC), desc(pct_diff))
vroom_write(seu_obj_markers, file = "seu_obj.pos_markers.tsv", col_names = TRUE, append = FALSE)

top_n <- 10

top_seu_obj_markers <- seu_obj_markers %>%
    group_by(cluster) %>%
    filter(avg_log2FC > 1 & p_val_adj < 0.05) %>%
    slice_head(n = top_n) %>%
    ungroup()
p <- DoHeatmap(seu_obj, features = top_seu_obj_markers$gene, label = TRUE, group.bar.height = 0.005, raster = FALSE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".top", top_n, "_DEGs.heatmap.pdf"),
    width = 1200,
    height = 3200
)

canonical_markers_df <- vroom(canonical_markers_file) %>%
    separate_longer_delim(cols = "marker", delim = "/") %>%
    mutate(marker = str_to_title(trimws(marker))) %>%
    distinct() %>%
    filter(marker %in% row.names(seu_obj)) %>%
    mutate(
        marker = factor(marker, levels = marker),
        rank = 1:n()
    )

table(duplicated(canonical_markers_df$marker))

cell_type_anno_vec <- sapply(
    split(canonical_markers_df, canonical_markers_df$cell_type),
    function(df) sum(df$rank) / nrow(df)
)
cell_type_anno_df <- tibble(cell_type = names(cell_type_anno_vec), x = cell_type_anno_vec)

p1 <- DotPlot(
    seu_obj,
    features = canonical_markers_df$marker,
    cluster.idents = TRUE,
    dot.min = 0.1
) +
    theme(
        axis.text.x.bottom = element_text(angle = 90, hjust = 1, vjust = 0.5),
        axis.title = element_blank(),
        plot.margin = margin(0, 0, 0, 0)
    )
Idents(seu_obj) <- factor(Idents(seu_obj), levels = levels(p1$data$id))

p2 <- ggplot() +
    geom_tile(
        mapping = aes(x = marker, y = 1, fill = cell_type),
        data = canonical_markers_df, color = NA,
        height = 2
    ) +
    geom_text(
        mapping = aes(x, y = -0.5, label = cell_type), data = cell_type_anno_df,
        angle = 90, vjust = 0.5, hjust = 1
    ) +
    scale_y_continuous(
        expand = expansion(0),
        limits = c(-50, 2)
    ) +
    theme_void() +
    theme(
        legend.position = "none",
        plot.margin = margin(0, 0, 0, 0)
    )
p <- aplot::insert_bottom(p1, p2, height = 1)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".canonical_markers.dotplot.pdf"),
    width = 1500,
    height = 1000
)

p <- VlnPlot(
    seu_obj,
    features = canonical_markers_df$marker,
    split.by = "ident",
    stack = TRUE,
    raster = TRUE
) + NoLegend()
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".canonical_markers.stacked_violin.pdf"),
    width = 1400,
    height = 600
)

p1 <- VlnPlot(seu_obj, features = "percent.mt", group.by = resolution_column) +
    geom_hline(yintercept = 1.5) +
    NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p2 <- VlnPlot(seu_obj, features = "nCount_RNA", group.by = resolution_column) +
    NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p3 <- VlnPlot(seu_obj, features = "nFeature_RNA", group.by = resolution_column) +
    NoLegend() + theme(axis.title = element_blank(), axis.text.x.bottom = element_text(angle = 0, hjust = 0.5))
p <- p1 / p2 / p3
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".nCount_nFeature_MT.qc_violin.cluster-wise.pdf"),
    width = 1000,
    height = 1000
)

3.1.2 Glioma-treated sample

Remove ambient RNAs using decontX:

library(Seurat)
library(decontX)
library(SingleCellExperiment)

filtered_mat_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176043-250818-Astro_20250927_221710/JZ25176043-250818-Astro_outs/filtered_cell_gene_matrix"
raw_mat_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176043-250818-Astro_20250927_221710/JZ25176043-250818-Astro_outs/raw_cell_gene_matrix"
output_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_corrected_decontX/data"
n <- "Astrocyte"

filtered_counts <- Read10X(filtered_mat_dir)
raw_counts <- Read10X(raw_mat_dir)

filtered_sce <- SingleCellExperiment(list(counts = filtered_counts))
raw_sce <- SingleCellExperiment(list(counts = raw_counts))

filtered_sce <- decontX(filtered_sce, background = raw_sce)
filtered_seu_obj <- CreateSeuratObject(round(decontXcounts(filtered_sce)))

saveRDS(filtered_sce, file = file.path(output_dir, paste0(n, ".decontX_filtered.sce_obj.rds")))
saveRDS(filtered_seu_obj, file = file.path(output_dir, paste0(n, ".decontX_filtered.seu_obj.rds")))

3.2 Create 10X H5 file

library(scCustomize)

raw_data_path <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176044-250818-Glioma_20250928_004852/JZ25176044-250818-Glioma_outs/raw_cell_gene_matrix"

Create_10X_H5(raw_data_path,
    source_type = "10X",
    save_file_path = dirname(raw_data_path),
    save_name = "raw_cell_gene_matrix"
)

3.3 Remove ambient RNAs using CellBender

#!/usr/bin/bash -e

MAMBA_EXE=/home/yangrui/softwares/micromamba/bin/micromamba
raw_feature_bc_matrix_h5=/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176043-250818-Astro_20250927_221710/JZ25176043-250818-Astro_outs/raw_cell_gene_matrix_35380e54375b12.h5
work_dir=/data/tmp/yangrui_tmp

cd "${work_dir}"

output_h5=raw_feature_bc_matrix_cellbender.h5

"${MAMBA_EXE}" run -n cellbender_v1.3.0_py3.11 cellbender remove-background \
  --cuda --input "${raw_feature_bc_matrix_h5}" \
  --output "${output_h5}"

3.4 Visualize feature expressions between conditions

library(Seurat)

seu_obj_rds_file <- c(
    "Astrocyte_Glioma" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected/Astrocyte_Glioma.annotate_cell_types/seu_obj_to_annotation.rds"
)

seu_obj <- readRDS(seu_obj_rds_file)

features <- c("Lgi1")

FeaturePlot(
    seu_obj,
    features = features,
    reduction = "umap",
    split.by = "orig.ident"
)

3.5 Filter cells using adaptive thresholds

3SD or 3MAD:

library(Seurat)
library(patchwork)
library(YRUtils)
library(unigd)
library(tidyverse)
library(clustree)

work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/ambient_RNA_uncorrected"
sample_dir <- c(
    "Astrocyte" = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176043-250818-Astro_20250927_221710/JZ25176043-250818-Astro_outs/filtered_cell_gene_matrix"
)

n <- names(sample_dir)

work_dir <- file.path(work_dir, n)
dir.create(work_dir)
setwd(work_dir)

counts <- Read10X(sample_dir)
seu_obj <- CreateSeuratObject(
    counts = counts, project = n,
    min.cells = 10
)

seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^mt-")

p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".qc_violin.raw.pdf"),
    width = 800,
    height = 400
)

p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
    {
        print(p1 + p2)
    },
    file = paste0(n, ".qc_scatter.raw.pdf"),
    width = 900,
    height = 400
)

seu_obj <- subset(seu_obj, subset = nCount_RNA > 500 & nFeature_RNA > 250 & percent.mt < 10)

# more conservative
# nCount_RNA_th <- median(seu_obj$nCount_RNA) + 3 * mad(seu_obj$nCount_RNA)
# more relax
nCount_RNA_th <- mean(seu_obj$nCount_RNA) + 3 * sd(seu_obj$nCount_RNA)
nFeature_RNA_th <- median(seu_obj$nFeature_RNA) + 3 * mad(seu_obj$nFeature_RNA)

seu_obj <- subset(seu_obj, subset = nCount_RNA < nCount_RNA_th & nFeature_RNA < nFeature_RNA_th)

p <- VlnPlot(seu_obj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"), ncol = 3, assay = "RNA", layer = "counts")
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".qc_violin.filtered.pdf"),
    width = 800,
    height = 400
)

p1 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
p2 <- FeatureScatter(seu_obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
ugd_save_inline(
    {
        print(p1 + p2)
    },
    file = paste0(n, ".qc_scatter.filtered.pdf"),
    width = 900,
    height = 400
)

seu_obj <- SCTransform(seu_obj, vars.to.regress = "percent.mt", variable.features.n = 3000)
seu_obj <- RunPCA(seu_obj, npcs = 100, features = VariableFeatures(object = seu_obj))

VizDimLoadings(seu_obj, dims = 1:2, reduction = "pca")
DimPlot(seu_obj, reduction = "pca") + NoLegend()
for (range in lapply(seq(1, 100, 10), function(i) c(i, i + 9))) {
    pdf(
        file = paste0(n, ".pc_dim_heatmap.", range[1], "-", range[2], ".pdf"),
        width = 9, height = 12
    )
    DimHeatmap(seu_obj, nfeatures = 30, dims = range[1]:range[2], cells = 500, balanced = TRUE)
    dev.off()
}

p <- ElbowPlot(seu_obj, ndims = 100) + scale_x_continuous(breaks = scales::breaks_extended(20))
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".elbow.pdf"),
    width = 1000,
    height = 400
)

pcs <- 1:50

seu_obj <- FindNeighbors(seu_obj, dims = pcs)

# Stable clusters:
# 从 parent clusters 的角度看，其不应该是 ≥ 2 个 parent clusters 的混合。
# 从 child clusters 的角度看，其 child clusters 应该只来源于其自身。
# 即随着 resolution 的增加，一个 stable cluster 应该要么保持不变，
# 要么干净的分裂为自己的 child clusters（child clusters 只来源自身）。
resolutions <- seq(0.5, 2, 0.2)
for (resolution in resolutions) {
    seu_obj <- FindClusters(seu_obj, resolution = resolution)
}
p <- clustree(seu_obj[[]], prefix = "SCT_snn_res.", node_label_size = 10, show_axis = TRUE)
ugd_save_inline(
    {
        print(p)
    },
    file = paste0(n, ".resolutions.cluster_tree.pdf"),
    width = 1500,
    height = 700
)

seu_obj <- RunUMAP(seu_obj, dims = pcs)
for (resolution_column in paste0("SCT_snn_res.", resolutions)) {
    p <- DimPlot(seu_obj, label.size = 6, repel = TRUE, reduction = "umap", group.by = resolution_column, label = TRUE) + NoLegend()
    ugd_save_inline(
        {
            print(p)
        },
        file = paste0(n, ".umap.", resolution_column, ".pdf"),
        width = 800,
        height = 800
    )
}

saveRDS(seu_obj, file = "seu_obj_to_umap.rds")