#!/usr/bin/env python3
# tag_bam_by_mapq_gx.py
import pysam
import sys
import time
def tag_bam(input_bam, output_bam, threads=1):
GENE_TAG = "GX"
NEW_TAG = "GS"
count_total = 0
count_gene = 0
count_intergenic = 0
count_failed = 0
print(f"[INFO] Start dealing with {input_bam} ...")
start_time = time.time()
try:
with pysam.AlignmentFile(input_bam, "rb") as bam_in:
with pysam.AlignmentFile(
output_bam, "wb", template=bam_in, threads=threads
) as bam_out:
for read in bam_in:
count_total += 1
if read.mapping_quality != 255:
read.set_tag(NEW_TAG, "F", value_type="Z")
count_failed += 1
else:
try:
gx_value = read.get_tag(GENE_TAG)
if gx_value != "-":
read.set_tag(NEW_TAG, "G", value_type="Z")
count_gene += 1
else:
read.set_tag(NEW_TAG, "I", value_type="Z")
count_intergenic += 1
except KeyError:
read.set_tag(NEW_TAG, "I", value_type="Z")
count_intergenic += 1
bam_out.write(read)
if count_total % 1000000 == 0:
print(f"[INFO] {count_total:,} reads dealed")
except Exception as e:
print(f"[ERROR] Unknown error occurred: {e}")
sys.exit(1)
end_time = time.time()
print(f"[INFO] Dealing with {input_bam} done!")
print(f"\n[INFO] Elapsed time: {end_time - start_time:.2f} second(s)")
print(f"[INFO] Output file: {output_bam}")
print(f"[INFO] Total reads: {count_total:,}")
print(
f"[INFO] Reads uniquely mapped to genes (G): {count_gene:,} ({count_gene/count_total:.2%})"
)
print(
f"[INFO] Reads uniquely mapped to intergenic regions (I): {count_intergenic:,} ({count_intergenic/count_total:.2%})"
)
print(
f"[INFO] Reads not unqiuely mapped to genome (MAPQ != 255) (F): {count_failed:,} ({count_failed/count_total:.2%})"
)
print(
f"\n[INFO] Next step: you should create the index file for the output BAM file yourself (samtools index)"
)
if __name__ == "__main__":
threads = 1
if len(sys.argv) < 3 or len(sys.argv) > 4:
print(f"\nUsage: python {sys.argv[0]} <input.bam> <output.bam> [<threads>]")
print("threads <INT>: the number of threads/cores used by pysam (default: 1)\n")
sys.exit(1)
input_bam = sys.argv[1]
output_bam = sys.argv[2]
if len(sys.argv) == 4:
try:
threads = int(sys.argv[3])
except ValueError:
print(
f"[ERROR] Argument 'threads' {sys.argv[3]} must be a positive integer"
)
sys.exit(1)
else:
print(f"[INFO] {threads} threads will be used")
else:
print(f"[INFO] Argument 'threads' is not given, using the default: {threads}")
tag_bam(input_bam, output_bam, threads=threads)2 Merge genome annotations
singularity shell -B "${PWD}":/data /data/softwares/agat/agat_1.5.0--pl5321hdfd78af_0.sif
agat_sp_merge_annotations.pl --gff Neovision_vision.v1.v1.NCBI_Pipeline.basic.chr.annotation.gff3 --gff Neovision_vision.v1.v1.KIZ_ShengLab.basic.chr.annotation.gff3 --out merged.gff3
# phase correction, etc.
gffread merged.gff3 -o tidy.gff3 -O -F --keep-exon-attrs --keep-genes3 Download SRA files from NCBI GEO in batch
For detailed info, see https://github.com/ncbi/sra-tools/wiki and https://github.com/ncbi/sra-tools/wiki/08.-prefetch-and-fasterq-dump.
SRR_Acc_File=SRR_Acc_List.tsv
OUT_DIR=tmp_downloads
mkdir ${OUT_DIR}
while read -r SRR_Acc; do cmd="prefetch ${SRR_Acc} --max-size u --progress --output-directory ${OUT_DIR}"; echo -e "Running command: ${cmd} ..."; eval "${cmd}"; echo -e "Successfully running command: ${cmd}\n"; done < "${SRR_Acc_File}"4 Extract FASTQ files from SRA files in batch
Note:
For
fasterq-dump,--split-3and--skip-technicaloptions are enabled by default, but for clarity, you’d better specify them explicitly yourself.You should carefully consider whether you want to include technical reads; the splitting scheme of which you want. For detailed info, see https://github.com/ncbi/sra-tools/wiki and https://github.com/ncbi/sra-tools/wiki/HowTo:-fasterq-dump.
Some examples:
- Standard single- or paired-end protocols (e.g., bulk RNA-, ATAC-, ChIP-seq, etc.) with technical reads omitted: biological reads mated \(\rightarrow\)
*_1.fastq&*_2.fastq; unmated reads \(\rightarrow\)*.fastq.
TMP_DIR=/dev/shm
NUM_THREADS=6
PWD=$(pwd)
shopt -s extglob
for SRR_Acc_Path in SRR+([0-9]); do [[ -d "${SRR_Acc_Path}" ]] || continue; SRR_Acc_Path="${PWD}/${SRR_Acc_Path}"; OUT_DIR="${SRR_Acc_Path}_dump_outs"; cmd="fasterq-dump ${SRR_Acc_Path} --outdir ${OUT_DIR} --temp ${TMP_DIR} --threads ${NUM_THREADS} --progress --skip-technical --split-3"; echo -e "Running command: ${cmd} ..."; eval "${cmd}"; echo -e "Successfully running command: ${cmd}\n"; done- Split the downloaded SRA files into individual, analysis-ready FASTQ (or FASTA) files, preserving any technical reads (e.g., cell-barcode, UMI, sample index, etc.) that are required for single-cell RNA-seq, ATAC-seq, multiome, or other protocols.
TMP_DIR=/dev/shm
NUM_THREADS=6
PWD=$(pwd)
shopt -s extglob
for SRR_Acc_Path in SRR+([0-9]); do [[ -d "${SRR_Acc_Path}" ]] || continue; SRR_Acc_Path="${PWD}/${SRR_Acc_Path}"; OUT_DIR="${SRR_Acc_Path}_dump_outs"; cmd="fasterq-dump ${SRR_Acc_Path} --outdir ${OUT_DIR} --temp ${TMP_DIR} --threads ${NUM_THREADS} --progress --include-technical --split-files"; echo -e "Running command: ${cmd} ..."; eval "${cmd}"; echo -e "Successfully running command: ${cmd}\n"; done5 Extract MT annotations from GTF file
# Extract MT annotations
# Add gene_name field if absent
# Prepend "MT-" to each gene name
gffread GCF_011764305.1_ASM1176430v1.1_genomic.gff -T | awk 'BEGIN{FS=OFS="\t"} $1 == "NC_020638.1"' | gawk 'BEGIN{FS=OFS="\t"} {gsub(/^[ \t]+|[ \t]+$/, "", $9); if ($9 !~ /;$/) {$9 = $9 ";"}; if ($9 ~ /gene_id/ && $9 !~ /gene_name/) {match($9, /gene_id "([^"]+)"/, arr); $9 = $9 " gene_name \"" arr[1] "\";"}; if ($9 ~ /gene_name/) {match($9, /gene_name "([^"]+)"/, arr); gsub(/gene_name "[^"]+"/, "gene_name \"MT-" arr[1] "\"", $9)}; print}' | gffread -T -o target.gtf6 Compress FASTQ files in batch
find "$(pwd)" -regextype gnu-awk -type f -iregex ".+\.(fastq|fq)$" | xargs -I {} pigz {}7 Download chromosome-level primary assembly FASTA files in batch from Ensembl
#!/bin/bash -e
release=release-115
species=rattus_norvegicus
assembly=GRCr8
mask_type=dna_sm
chr_set=( {1..20} X Y MT )
concurrent_jobs=1
merged_file=merged.fa.gz
out_dir="$(date +%Y%m%d_%H%M%S).downloads"
base_url="ftp://ftp.ensembl.org/pub/${release}/fasta/${species}/dna"
file_name="${species^}.${assembly}.${mask_type}.primary_assembly.{}.fa.gz"
mkdir -p "${out_dir}"
cd "${out_dir}"
download_failed_log="download_failed.log"
printf "%s\n" "${chr_set[@]}" | \
xargs -P "${concurrent_jobs}" -I {} \
sh -c 'wget -c "${1}" || echo "${1}" >> "${2}"' _ "${base_url}/${file_name}" "${download_failed_log}"
if [[ "${#chr_set[@]}" -ne "$(ls *.fa.gz 2>/dev/null | wc -l)" || -s "${download_failed_log}" ]]; then echo "ERROR: failed to download the following files:"; cat "${download_failed_log}"; exit 1; else echo "INFO: all files have been downloaded"; fi
test_failed_log="test_failed.log"
printf "%s\n" "${chr_set[@]}" | \
xargs -P "${concurrent_jobs}" -I {} \
sh -c 'pigz -t "${1}" || echo "${1}" >> "${2}"' _ "${file_name}" "${test_failed_log}"
if [[ -s "${test_failed_log}" ]]; then echo "ERROR: the following files are corrupted:"; cat "${test_failed_log}"; exit 1; else echo "INFO: all files are complete!"; fi
pigz -c -k -d *.fa.gz | pigz -nc > "${merged_file}"9 WanLi (v20260331)
import scanpy as sc
import anndata as ad
import decoupler as dc
import os
import numpy as np
import re
import pandas as pd
from scipy import sparse
from scipy.io import mmwrite
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as tickerwork_dir = "/data/users/yangrui/lab_projs/wanli"
os.chdir(work_dir)
h5ad_file = "/home/yangrui/mink_proj_v20260126/data/mouse_rna.raw/DevVIS_scRNA_processed.h5ad"adata = sc.read_h5ad(h5ad_file)df_stats = adata.obs[["subclass_label", "Age"]].value_counts().reset_index(name="Count")
df_stats["Global_Proportion"] = df_stats["Count"] / df_stats["Count"].sum()
df_stats["Age_Proportion"] = df_stats.groupby("Age")["Count"].transform(
lambda x: x / x.sum()
)
df_stats["CellType_Proportion"] = df_stats.groupby("subclass_label")["Count"].transform(
lambda x: x / x.sum()
)
df_stats = df_stats.sort_values(by=["subclass_label", "Age"], ascending=[True, True])
df_stats = df_stats.reset_index(drop=True)
df_stats.to_csv("cell_type_stats.tsv", sep="\t", index=False)filtered_df = df_stats[
(df_stats["subclass_label"].isin(["OPC NN", "Oligo NN"]))
& (df_stats["Count"] > 0)
& (df_stats["Age"].str.startswith("P"))
]
target_ages = filtered_df["Age"].unique().tolist()target_cell_types = {
"Oligo NN": "OL",
"OPC NN": "OPC",
"Astro-TE NN": "AS",
"Microglia NN": "MG",
}adata_subset = adata[
(adata.obs["Age"].isin(target_ages)) &
(adata.obs["subclass_label"].isin(target_cell_types.keys()))
].copy()
adata_subset.obs["cell_type_short"] = adata_subset.obs["subclass_label"].map(target_cell_types)adata_subset.write_h5ad("adata.h5ad")adata_subset.layers["counts"] = adata_subset.X.copy()sc.pp.normalize_total(adata_subset, target_sum=10000)
sc.pp.log1p(adata_subset)target_gene_name = "Fmr1"
target_gene_name in adata.var_namestarget_order = ["OPC", "OL", "AS", "MG"]
actual_order = [
c for c in target_order if c in adata_subset.obs["cell_type_short"].cat.categories
]
adata_subset.obs["cell_type_short"] = adata_subset.obs[
"cell_type_short"
].cat.reorder_categories(actual_order)fig, ax = plt.subplots(figsize=(3, 1.5))
sc.pl.violin(
adata_subset,
keys=target_gene_name,
groupby="cell_type_short",
stripplot=False,
rotation=90,
ax=ax,
show=False,
)
ax.yaxis.set_major_locator(ticker.MaxNLocator(nbins=3, integer=True))
ax.grid(False)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.set_ylim(bottom=0)
for pc in ax.collections:
pc.set_edgecolor(pc.get_facecolor())
plt.tight_layout()
plt.savefig(target_gene_name + ".violin.pdf", dpi=600, bbox_inches="tight")
plt.show()dp = sc.pl.dotplot(
adata_subset,
var_names=[target_gene_name],
groupby="cell_type_short",
standard_scale="var",
colorbar_title="Mean expression",
size_title="Fraction of cells",
linewidth=0,
figsize=(2, 1.5),
show=False,
)
dp["mainplot_ax"].grid(False)
dp["mainplot_ax"].spines["top"].set_visible(False)
dp["mainplot_ax"].spines["right"].set_visible(False)
ax_color = dp["color_legend_ax"]
for spine in ax_color.spines.values():
spine.set_visible(False)
ax_size = dp["size_legend_ax"]
ax_size.tick_params(axis="both", which="both", length=0)
for spine in ax_size.spines.values():
spine.set_visible(False)
plt.savefig(target_gene_name + ".dotplot.pdf", dpi=600, bbox_inches="tight")
plt.show()Keep OPC & OL only:
target_cell_types = {"Oligo NN": "OL", "OPC NN": "OPC"}adata_subset = adata[
(adata.obs["Age"].isin(target_ages)) &
(adata.obs["subclass_label"].isin(target_cell_types.keys()))
].copy()
adata_subset.obs["cell_type_short"] = adata_subset.obs["subclass_label"].map(target_cell_types)adata_subset.write_h5ad("OPC_OL.adata.h5ad")library(Seurat)
library(MuDataSeurat)
library(patchwork)
library(tidyverse)
library(YRUtils)
library(unigd)
library(ggrastr)
library(vroom)
work_dir <- "/data/users/yangrui/lab_projs/wanli"
setwd(work_dir)
seu_file <- "/data/users/yangrui/lab_projs/wanli/OPC_OL.adata.h5ad"
seu <- ReadH5AD(seu_file)
seu_bak <- seu
seu[["global.umap"]] <- NULL
Idents(seu) <- "cell_type_short"
seu <- NormalizeData(seu, normalization.method = "LogNormalize", scale.factor = 10000)
seu <- FindVariableFeatures(seu, selection.method = "vst", nfeatures = 2000)
seu <- ScaleData(seu, features = row.names(seu))
seu <- RunPCA(seu, features = VariableFeatures(object = seu))
p <- ElbowPlot(seu, ndims = 50)
ugd_save_inline(
{
print(p)
},
file = "OPC_OL.elbow.pdf",
width = 400,
height = 200
)
npcs <- 1:20
seu <- FindNeighbors(seu, dims = npcs)
seu <- FindClusters(seu, resolution = 1)
seu <- RunUMAP(seu, dims = npcs)
p <- DimPlot(seu, reduction = "umap", group.by = "Age") + coord_fixed(ratio = 1)
ugd_save_inline(
{
print(p)
},
file = "OPC_OL.age.umap.pdf",
width = 400,
height = 400
)
p <- DimPlot(seu, reduction = "umap", group.by = "cell_type_short") + coord_fixed(ratio = 1)
ugd_save_inline(
{
print(p)
},
file = "OPC_OL.cell_type_short.umap.pdf",
width = 400,
height = 400
)
p <- DimPlot(seu, reduction = "umap", group.by = "seurat_clusters") + coord_fixed(ratio = 1)
ugd_save_inline(
{
print(p)
},
file = "OPC_OL.seurat_clusters.umap.pdf",
width = 400,
height = 400
)
p <- DimPlot(seu, reduction = "umap", group.by = "seurat_clusters", label = TRUE, repel = TRUE) + coord_fixed(ratio = 1)
ugd_save_inline(
{
print(p)
},
file = "OPC_OL.seurat_clusters.umap.labeled.pdf",
width = 400,
height = 400
)
markers <- c(
"Olig1", "Olig2", "Sox2", # univeral
"Cspg4", "Pdgfra", # OPC
"Smarcc1", "Itpr2", "Gpr17", "Bcas1", "Fyn", "Nkx2-2", # COP
"Myrf", "Plp1", "Cnp", "Mbp", "Mag", "Apc", # NFO
"Aspa", "Opalin" # MOL
)
all(markers %in% row.names(seu))
seurat_cluster_levels <- c("0", "1", "3", "4", "9", "11", "12", "13", "18", "19", "20", "6", "7", "8", "10", "14", "21", "22", "2", "5", "15", "16", "17")
seu$seurat_clusters <- factor(seu$seurat_clusters, levels = seurat_cluster_levels)
Idents(seu) <- "seurat_clusters"
p <- DotPlot(
seu,
features = rev(markers),
cluster.idents = FALSE,
dot.min = 0.1
) +
coord_flip()
ugd_save_inline(
{
print(p)
},
file = "OPC_OL.markers.dotplot.pdf",
width = 700,
height = 500
)
p <- VlnPlot(
seu,
features = markers,
split.by = "ident",
sort = FALSE,
stack = TRUE,
raster = TRUE
) + NoLegend()
ugd_save_inline(
{
print(p)
},
file = "OPC_OL.markers.violin.pdf",
width = 900,
height = 600
)
saveRDS(seu, file = "OPC_OL.seu_obj.0.rds")
seu <- readRDS(file = "OPC_OL.seu_obj.0.rds")
seu_0 <- seu
cell_types <- c(
"11" = "OPC", "12" = "OPC", "13" = "OPC", "18" = "OPC",
"3" = "OPC", "4" = "OPC", "9" = "OPC", "20" = "OPC",
"1" = "OPC", "0" = "OPC", "19" = "OPC",
"7" = "COP", "6" = "COP", "8" = "COP",
"14" = "NFO", "21" = "NFO", "22" = "NFO", "10" = "NFO",
"2" = "MOL", "5" = "MOL", "15" = "MOL", "16" = "MOL", "17" = "MOL"
)
# tmp_df <- tibble(
# seurat_clusters = names(cell_types),
# cell_type = cell_types
# )
# tmp_df <- tmp_df %>%
# mutate(
# cell_type = factor(cell_type, levels = c("OPC", "COP", "NFO", "MOL")),
# seurat_clusters = as.integer(seurat_clusters)
# ) %>%
# arrange(cell_type, seurat_clusters)
all(sort(unique(seu$seurat_clusters)) == sort(as.integer(names(cell_types))))
seu <- RenameIdents(seu, cell_types)
seu$opc_ol_cell_subtypes <- Idents(seu)
p <- DimPlot(seu, repel = TRUE, reduction = "umap", group.by = "opc_ol_cell_subtypes")
ugd_save_inline(
{
print(p)
},
file = "OPC_OL.opc_ol_cell_subtypes.umap.pdf",
width = 400,
height = 400
)
saveRDS(seu, file = "OPC_OL.seu_obj.1.rds")
seu <- readRDS(file = "OPC_OL.seu_obj.1.rds")
seu_1 <- seu
target_feature_name <- "Fmr1"
message("Feature name is valid: ", target_feature_name %in% row.names(seu))
cell_types_order <- c("OPC", "COP", "NFO", "MOL")
# dot plot
p <- DotPlot(seu, features = target_feature_name) +
labs(x = NULL, y = NULL) +
scale_y_discrete(limits = rev(cell_types_order)) +
scale_size_continuous(
range = c(0, 10),
limits = c(0, 100),
breaks = seq(0, 100, 25)
) +
theme(
text = element_text(family = "Arial"),
axis.text.x.bottom = element_text(angle = 90),
plot.margin = margin(100, 100, 100, 100)
)
ppreview(
plot = p,
file = paste0("OPC_OL.", target_feature_name, ".dotplot.pdf"),
width = 400, height = 320
)
# violin plot
ident_colors <- scales::hue_pal()(length(unique(cell_types_order)))
p <- VlnPlot(seu, features = target_feature_name, pt.size = 0, layer = "data", cols = ident_colors)
p <- p + rasterize(geom_jitter(mapping = aes(color = ident), data = p$data, size = 0.5),
dpi = 600
) +
scale_x_discrete(limits = unique(cell_types_order)) +
scale_y_continuous(expand = expansion(0), breaks = scales::breaks_extended(3)) +
labs(x = NULL, y = NULL, color = "Cell Type") +
guides(fill = "none") +
theme(
text = element_text(family = "Arial"),
axis.text.x.bottom = element_text(angle = 45),
plot.margin = margin(100, 100, 100, 100)
)
ppreview(
plot = p,
file = paste0("OPC_OL.", target_feature_name, ".violin.pdf"),
width = 420, height = 330
)
p <- VlnPlot(seu, features = target_feature_name, pt.size = 0, layer = "data", cols = ident_colors) +
scale_x_discrete(limits = unique(cell_types_order)) +
scale_y_continuous(expand = expansion(0), breaks = scales::breaks_extended(3)) +
labs(x = NULL, y = NULL, color = "Cell Type") +
guides(fill = "none") +
theme(
text = element_text(family = "Arial"),
axis.text.x.bottom = element_text(angle = 45),
plot.margin = margin(100, 100, 100, 100)
)
ppreview(
plot = p,
file = paste0("OPC_OL.", target_feature_name, ".violin.without_points.pdf"),
width = 350, height = 310
)
# Slingshot
library(slingshot)
library(SingleCellExperiment)
library(tradeSeq)
library(RColorBrewer)
RNGversion(getRversion())
palette(brewer.pal(12, "Set3"))
seu <- readRDS(file = "OPC_OL.seu_obj.1.rds")
sce <- as.SingleCellExperiment(seu)
sce <- slingshot(
sce,
clusterLabels = "opc_ol_cell_subtypes",
reducedDim = "PCA",
start.clus = "OPC",
end.clus = "MOL"
)
saveRDS(sce, file = "OPC_OL.sce_obj.rds")
sds <- embedCurves(sce, "UMAP")
sds <- as.SlingshotDataSet(sds)
counts <- GetAssayData(seu, assay = "RNA", layer = "counts")
min_num_cells <- floor(min(table(seu$opc_ol_cell_subtypes)) * 0.1)
counts <- as.matrix(counts[rowSums(counts > 0) > min_num_cells, ])
cell_types <- seu$opc_ol_cell_subtypes
BPPARAM <- BiocParallel::MulticoreParam(workers = 100, progressbar = TRUE)
# evaluate this multiple times using different seeds
# to check whether the results are reproducible across different gene subsets
set.seed(1234)
icMat <- evaluateK(
counts = counts,
sds = sds,
k = 3:10,
nGenes = 1000,
verbose = TRUE,
parallel = TRUE,
BPPARAM = BPPARAM
)
saveRDS(icMat, file = "OPC_OL.icMat_1234.rds")
set.seed(5678)
icMat2 <- evaluateK(
counts = counts,
sds = sds,
k = 3:10,
nGenes = 1000,
verbose = TRUE,
parallel = TRUE,
BPPARAM = BPPARAM
)
saveRDS(icMat2, file = "OPC_OL.icMat_5678.rds")
k <- 6
set.seed(123456)
sce <- fitGAM(
counts = counts,
sds = sds,
nknots = k,
verbose = TRUE,
parallel = TRUE,
BPPARAM = BPPARAM
)
saveRDS(sce, file = "OPC_OL.fitGAM_k6.rds")
sce <- readRDS(file = "OPC_OL.fitGAM_k6.rds")
sce_bak <- sce
table(rowData(sce)$tradeSeq$converged)
asso_res <- associationTest(sce)
asso_res$padj_fdr <- p.adjust(asso_res$pvalue, method = "BH")
asso_res$padj_bfrn <- p.adjust(asso_res$pvalue, method = "bonferroni")
asso_res$gene_name <- row.names(asso_res)
vroom_write(asso_res, file = "OPC_OL.fitGAM_k6.association_test.tsv")
target_feature_name <- "Fmr1"
target_feature_name %in% row.names(counts)
p <- plotSmoothers(sce,
counts = counts, gene = target_feature_name,
alpha = 0.5, sample = 1, pointCol = cell_types,
curvesCols = "black", lwd = 1
)
ugd_save_inline(
{
print(p)
},
file = paste0("OPC_OL.", target_feature_name, ".smoother.pdf"),
width = 350,
height = 170
)library(vroom)
library(tidyverse)
library(clusterProfiler)
library(AnnotationDbi)
work_dir <- "/data/users/yangrui/lab_projs/wanli"
setwd(work_dir)
go_db_file <- "/data/database/annotation/release/mus_musculus/org.Mm.eg.db.sqlite"
degs_file <- "/data/users/yangrui/lab_projs/wanli/FMRP_KO_DE_proteins.tsv"
degs <- vroom(degs_file)
# upregulated genes in KO vs WT?
genes <- unique(na.omit(degs$Gene[degs$KO_vs_WT_Ratio > 1]))
orgdb <- loadDb(go_db_file)
ego <- enrichGO(
genes,
keyType = "SYMBOL",
OrgDb = orgdb,
ont = "ALL",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
readable = FALSE,
pool = FALSE
)
nrow(ego@result)
saveRDS(ego, file = "GO_ALL.rds")
vroom_write(
ego@result,
file = "GO_ALL.tsv",
col_names = TRUE,
append = FALSE
)library(tidyverse)
library(forcats)
library(unigd)
library(vroom)
work_dir <- "/data/users/yangrui/lab_projs/wanli"
setwd(work_dir)
go_file <- "/data/users/yangrui/lab_projs/wanli/GO_ALL.tsv"
go_data <- vroom(go_file, delim = "\t", col_names = TRUE)
go_data$ONTOLOGY <- factor(go_data$ONTOLOGY, levels = c("BP", "CC", "MF"))
go_data <- go_data %>%
group_by(ONTOLOGY) %>%
arrange(desc(Count)) %>%
ungroup()
plot_data <- go_data %>%
mutate(ONT_Desc = paste(ONTOLOGY, Description, sep = "_")) %>%
mutate(Description = fct_reorder(Description, Count, .desc = FALSE))
p <- ggplot(plot_data, aes(x = Count, y = Description, fill = p.adjust)) +
geom_bar(stat = "identity", width = 0.7) +
scale_fill_gradient(
low = "#E31A1C",
high = "#3182BD",
trans = "log10",
name = "p.adjust"
) +
scale_x_continuous(expand = expansion(mult = c(0, 0.1))) +
facet_grid(rows = vars(ONTOLOGY), scales = "free", space = "free") +
labs(
x = "Fold Enrichment",
y = NULL
) +
theme_classic(base_size = 12, base_family = "Arial")
ugd_save_inline(
{
print(p)
},
file = "GO_ALL.barplot.count.pdf",
width = 900,
height = 20000
)library(tidyverse)
library(forcats)
library(unigd)
library(vroom)
work_dir <- "/data/users/yangrui/lab_projs/wanli"
setwd(work_dir)
go_file <- "/data/users/yangrui/lab_projs/wanli/GO_ALL.tsv"
go_data <- vroom(go_file, delim = "\t", col_names = TRUE)
go_data$ONTOLOGY <- factor(go_data$ONTOLOGY, levels = c("BP", "CC", "MF"))
go_data <- go_data %>%
group_by(ONTOLOGY) %>%
arrange(desc(FoldEnrichment)) %>%
ungroup()
plot_data <- go_data %>%
mutate(ONT_Desc = paste(ONTOLOGY, Description, sep = "_")) %>%
mutate(Description = fct_reorder(Description, FoldEnrichment, .desc = FALSE))
p <- ggplot(plot_data, aes(x = FoldEnrichment, y = Description, fill = p.adjust)) +
geom_bar(stat = "identity", width = 0.7) +
scale_fill_gradient(
low = "#E31A1C",
high = "#3182BD",
trans = "log10",
name = "p.adjust"
) +
scale_x_continuous(expand = expansion(mult = c(0, 0.1))) +
facet_grid(rows = vars(ONTOLOGY), scales = "free", space = "free") +
labs(
x = "Fold Enrichment",
y = NULL
) +
theme_classic(base_size = 12, base_family = "Arial")
ugd_save_inline(
{
print(p)
},
file = "GO_ALL.barplot.fold_enrichment.pdf",
width = 900,
height = 20000
)library(tidyverse)
library(forcats)
library(unigd)
library(vroom)
work_dir <- "/data/users/yangrui/lab_projs/wanli"
setwd(work_dir)
go_file <- "/data/users/yangrui/lab_projs/wanli/GO_ALL.tsv"
select_terms_file <- "/data/users/yangrui/lab_projs/wanli/select_terms.tsv"
select_terms <- vroom(select_terms_file, delim = "\t", col_names = "Description") %>%
distinct()
go_data <- vroom(go_file, delim = "\t", col_names = TRUE)
all(select_terms$Description %in% go_data$Description)
go_data <- go_data %>%
filter(Description %in% select_terms$Description) %>%
arrange(desc(FoldEnrichment))
plot_data <- go_data %>%
mutate(Description = fct_reorder(Description, FoldEnrichment, .desc = FALSE))
p <- ggplot(plot_data, aes(x = FoldEnrichment, y = Description, fill = p.adjust)) +
geom_bar(stat = "identity", width = 0.8) +
scale_fill_gradient(
low = "#E31A1C",
high = "#3182BD",
trans = "log10",
name = "p.adjust",
limits = c(1e-10, 0.05),
oob = scales::squish
) +
scale_x_continuous(
expand = expansion(mult = c(0, 0.1)),
position = "top"
) +
labs(
x = "Fold Enrichment",
y = NULL
) +
theme_classic(base_size = 14, base_family = "Arial")
ugd_save_inline(
{
print(p)
},
file = "select_terms.barplot.fold_enrichment.pdf",
width = 700,
height = 500
)10 LiuYaMin (v20260331)
Adult mouse neocortex from nature:
import scanpy as sc
import anndata as ad
import decoupler as dc
import os
import numpy as np
import re
import pandas as pd
from scipy import sparse
from scipy.io import mmwritework_dir = "/data/users/yangrui/lab_projs/liuyamin"
os.chdir(work_dir)
h5ad_file = "/home/yangrui/mink_proj_v20260126/data/mouse_rna.raw/DevVIS_scRNA_processed.h5ad"adata = sc.read_h5ad(h5ad_file)target_ages = ["P21", "P23", "P25", "P28", "P56"]
target_cell_types = {
"L5 ET CTX Glut": "EN",
"L5 NP CTX Glut": "EN",
"L6 CT CTX Glut": "EN",
"L6b CTX Glut": "EN",
"CLA-EPd-CTX Car3 Glut": "EN",
"L2/3 IT CTX Glut": "EN",
"L4/5 IT CTX Glut": "EN",
"L5 IT CTX Glut": "EN",
"L6 IT CTX Glut": "EN",
"Lamp5 Gaba": "IN",
"Sncg Gaba": "IN",
"Vip Gaba": "IN",
"Lamp5 Lhx6 Gaba": "IN",
"Pvalb Gaba": "IN",
"Pvalb chandelier Gaba": "IN",
"Sst Gaba": "IN",
"Oligo NN": "OL",
"Astro-TE NN": "AS",
"Microglia NN": "MG",
}adata_subset = adata[
(adata.obs["Age"].isin(target_ages)) &
(adata.obs["subclass_label"].isin(target_cell_types.keys()))
].copy()
adata_subset.obs["cell_type_short"] = adata_subset.obs["subclass_label"].map(target_cell_types)adata_subset.write_h5ad("adata.h5ad")# adult mm neocortex
library(Seurat)
library(tidyverse)
library(vroom)
library(YRUtils)
library(MuDataSeurat)
library(ggrastr)
library(aplot)
work_dir <- "/data/users/yangrui/lab_projs/liuyamin"
setwd(work_dir)
seu_file <- "/data/users/yangrui/lab_projs/liuyamin/adata.h5ad"
target_ages <- c("P21", "P23", "P25", "P28", "P56")
target_cell_types <- c(
"L5 ET CTX Glut" = "EN", "L5 NP CTX Glut" = "EN",
"L6 CT CTX Glut" = "EN", "L6b CTX Glut" = "EN",
"CLA-EPd-CTX Car3 Glut" = "EN", "L2/3 IT CTX Glut" = "EN",
"L4/5 IT CTX Glut" = "EN", "L5 IT CTX Glut" = "EN", "L6 IT CTX Glut" = "EN",
"Lamp5 Gaba" = "IN", "Sncg Gaba" = "IN", "Vip Gaba" = "IN",
"Lamp5 Lhx6 Gaba" = "IN", "Pvalb Gaba" = "IN",
"Pvalb chandelier Gaba" = "IN", "Sst Gaba" = "IN",
"Oligo NN" = "OL",
"Astro-TE NN" = "AS", "Microglia NN" = "MG"
)
seu <- ReadH5AD(seu_file)
seu_bak <- seu
Idents(seu) <- "cell_type_short"
table(seu$Age, seu$cell_type_short)
seu <- NormalizeData(seu, normalization.method = "LogNormalize", scale.factor = 10000)
seu <- ScaleData(seu, features = row.names(seu))
features <- c("Ddx3x", "Ddx3y")
features %in% row.names(seu)
# dot plot
p <- DotPlot(seu, features = features) +
labs(x = NULL, y = NULL) +
scale_y_discrete(limits = rev(unique(target_cell_types))) +
theme(
text = element_text(family = "Arial"),
axis.text.x.bottom = element_text(angle = 90),
plot.margin = margin(100, 100, 100, 100)
)
ppreview(
plot = p,
file = paste0(paste0(features, collapse = "_"), ".adult_mm_neocortex.dotplot.pdf"),
width = 410, height = 350
)
# violin plot with points
ident_colors <- scales::hue_pal()(length(unique(target_cell_types)))
px <- VlnPlot(seu, features = "Ddx3x", pt.size = 0, layer = "data", cols = ident_colors)
px <- px + rasterize(geom_jitter(mapping = aes(color = ident), data = px$data, size = 0.5),
dpi = 600
) +
scale_x_discrete(limits = unique(target_cell_types)) +
scale_y_continuous(expand = expansion(0)) +
labs(x = NULL, y = NULL, color = "Cell Type") +
guides(fill = "none") +
theme(
title = element_blank(),
legend.position = "none",
text = element_text(family = "Arial"),
axis.text.x.bottom = element_text(angle = 45),
plot.margin = margin(100, 100, 100, 100)
)
py <- VlnPlot(seu, features = "Ddx3y", pt.size = 0, layer = "data", cols = ident_colors)
py <- py + rasterize(geom_jitter(mapping = aes(color = ident), data = py$data, size = 0.5),
dpi = 600
) +
scale_x_discrete(limits = unique(target_cell_types)) +
scale_y_continuous(expand = expansion(0)) +
labs(x = NULL, y = NULL, color = "Cell Type") +
guides(fill = "none") +
theme(
title = element_blank(),
legend.position = "none",
text = element_text(family = "Arial"),
axis.text.x.bottom = element_text(angle = 45),
plot.margin = margin(100, 100, 100, 100)
)
p <- px %>% insert_bottom(py)
ppreview(
plot = p,
file = "Ddx3x_Ddx3y.adult_mm_neocortex.violin.pdf",
width = 300, height = 300
)
# violin plot without points
px <- VlnPlot(seu, features = "Ddx3x", pt.size = 0, layer = "data", cols = ident_colors) +
scale_x_discrete(limits = unique(target_cell_types)) +
scale_y_continuous(expand = expansion(0)) +
labs(x = NULL, y = NULL, color = "Cell Type") +
guides(fill = "none") +
theme(
title = element_blank(),
legend.position = "none",
text = element_text(family = "Arial"),
axis.text.x.bottom = element_text(angle = 45),
plot.margin = margin(100, 100, 100, 100)
)
py <- VlnPlot(seu, features = "Ddx3y", pt.size = 0, layer = "data", cols = ident_colors) +
scale_x_discrete(limits = unique(target_cell_types)) +
scale_y_continuous(expand = expansion(0)) +
labs(x = NULL, y = NULL, color = "Cell Type") +
guides(fill = "none") +
theme(
title = element_blank(),
legend.position = "none",
text = element_text(family = "Arial"),
axis.text.x.bottom = element_text(angle = 45),
plot.margin = margin(100, 100, 100, 100)
)
p <- px %>% insert_bottom(py)
ppreview(
plot = p,
file = "Ddx3x_Ddx3y.adult_mm_neocortex.violin.without_points.pdf",
width = 150, height = 200
)Adult mouse hippocampus from cell:
# adult mm hippocampus
library(Seurat)
library(tidyverse)
library(vroom)
library(YRUtils)
library(hdf5r)
library(ggrastr)
library(aplot)
work_dir <- "/data/users/yangrui/lab_projs/liuyamin"
setwd(work_dir)
meta_file <- "/data/users/yangrui/tmp_file_transfer/mm_ctx_hippo_10x/metadata.csv"
h5_file <- "/data/users/yangrui/tmp_file_transfer/mm_ctx_hippo_10x/expression_matrix.hdf5"
target_cell_types <- c(
"CA1-ProS" = "EN",
"CA2-IG-FC" = "EN",
"CA3" = "EN",
"DG" = "EN",
"Lamp5" = "IN",
"Pvalb" = "IN",
"Sncg" = "IN",
"Sst" = "IN",
"Vip" = "IN",
"Oligo" = "OL",
"Astro" = "AS",
"Micro-PVM" = "MG"
)
meta <- vroom(meta_file, delim = ",")
meta <- filter(meta, subclass_label %in% names(target_cell_types))
meta <- as.data.frame(meta)
rownames(meta) <- meta$sample_name
h5 <- H5File$new(h5_file, mode = "r")
h5$ls(recursive = TRUE)
counts <- h5[["data/counts"]][, ]
genes <- h5[["data/gene"]][]
samples <- h5[["data/samples"]][]
rownames(counts) <- samples
colnames(counts) <- genes
counts <- counts[rownames(meta), colSums(counts) > 0]
counts <- t(counts)
seu <- CreateSeuratObject(counts = counts, meta.data = meta[colnames(counts), ])
seu_bak <- seu
h5$close_all()
Idents(seu) <- "subclass_label"
seu <- RenameIdents(seu, target_cell_types)
seu$new_cell_type <- Idents(seu)
table(seu$new_cell_type)
seu <- NormalizeData(seu, normalization.method = "LogNormalize", scale.factor = 10000)
seu <- ScaleData(seu, features = row.names(seu))
features <- c("Ddx3x", "Ddx3y")
features %in% row.names(seu)
# dot plot
p <- DotPlot(seu, features = features) +
labs(x = NULL, y = NULL) +
scale_y_discrete(limits = rev(unique(target_cell_types))) +
theme(
text = element_text(family = "Arial"),
axis.text.x.bottom = element_text(angle = 90),
plot.margin = margin(100, 100, 100, 100)
)
ppreview(
plot = p,
file = paste0(paste0(features, collapse = "_"), ".adult_mm_hippocampus.dotplot.pdf"),
width = 410, height = 350
)
# violin plot with points
ident_colors <- scales::hue_pal()(length(unique(target_cell_types)))
px <- VlnPlot(seu, features = "Ddx3x", pt.size = 0, layer = "data", cols = ident_colors)
px <- px + rasterize(geom_jitter(mapping = aes(color = ident), data = px$data, size = 0.5),
dpi = 600
) +
scale_x_discrete(limits = unique(target_cell_types)) +
scale_y_continuous(expand = expansion(0)) +
labs(x = NULL, y = NULL, color = "Cell Type") +
guides(fill = "none") +
theme(
title = element_blank(),
legend.position = "none",
text = element_text(family = "Arial"),
axis.text.x.bottom = element_text(angle = 45),
plot.margin = margin(100, 100, 100, 100)
)
py <- VlnPlot(seu, features = "Ddx3y", pt.size = 0, layer = "data", cols = ident_colors)
py <- py + rasterize(geom_jitter(mapping = aes(color = ident), data = py$data, size = 0.5),
dpi = 600
) +
scale_x_discrete(limits = unique(target_cell_types)) +
scale_y_continuous(expand = expansion(0)) +
labs(x = NULL, y = NULL, color = "Cell Type") +
guides(fill = "none") +
theme(
title = element_blank(),
legend.position = "none",
text = element_text(family = "Arial"),
axis.text.x.bottom = element_text(angle = 45),
plot.margin = margin(100, 100, 100, 100)
)
p <- px %>% insert_bottom(py)
ppreview(
plot = p,
file = "Ddx3x_Ddx3y.adult_mm_hippocampus.violin.pdf",
width = 300, height = 300
)
# violin plot without points
px <- VlnPlot(seu, features = "Ddx3x", pt.size = 0, layer = "data", cols = ident_colors) +
scale_x_discrete(limits = unique(target_cell_types)) +
scale_y_continuous(expand = expansion(0)) +
labs(x = NULL, y = NULL, color = "Cell Type") +
guides(fill = "none") +
theme(
title = element_blank(),
legend.position = "none",
text = element_text(family = "Arial"),
axis.text.x.bottom = element_text(angle = 45),
plot.margin = margin(100, 100, 100, 100)
)
py <- VlnPlot(seu, features = "Ddx3y", pt.size = 0, layer = "data", cols = ident_colors) +
scale_x_discrete(limits = unique(target_cell_types)) +
scale_y_continuous(expand = expansion(0)) +
labs(x = NULL, y = NULL, color = "Cell Type") +
guides(fill = "none") +
theme(
title = element_blank(),
legend.position = "none",
text = element_text(family = "Arial"),
axis.text.x.bottom = element_text(angle = 45),
plot.margin = margin(100, 100, 100, 100)
)
p <- px %>% insert_bottom(py)
ppreview(
plot = p,
file = "Ddx3x_Ddx3y.adult_mm_hippocampus.violin.without_points.pdf",
width = 150, height = 200
)Adult human neocortex from science advances:
# adult hs neocortex
library(Seurat)
library(tidyverse)
library(vroom)
library(YRUtils)
library(aplot)
library(ggrastr)
work_dir <- "/data/users/yangrui/lab_projs/liuyamin"
setwd(work_dir)
seu_file <- "/data/users/yangrui/sc_omics_ref_datasets/human/dataset_1/raw/cellxgene/cellxgene_scrnaseq.rds"
target_ages <- c(
"4-year-old human stage", "6-year-old human stage", "14-year-old human stage",
"20-year-old human stage", "39-year-old human stage"
)
target_cell_types <- c(
"glutamatergic neuron" = "EN",
"medial ganglionic eminence derived interneuron" = "IN",
"caudal ganglionic eminence derived interneuron" = "IN",
"oligodendrocyte" = "OL",
"astrocyte" = "AS",
"microglial cell" = "MG"
)
seu <- readRDS(seu_file)
seu_bak <- seu
seu <- subset(seu_bak, development_stage %in% target_ages & cell_type %in% names(target_cell_types))
Idents(seu) <- "cell_type"
seu <- RenameIdents(seu, target_cell_types)
seu$new_cell_type <- Idents(seu)
table(seu$development_stage, seu$new_cell_type)
seu <- NormalizeData(seu, normalization.method = "LogNormalize", scale.factor = 10000)
seu <- ScaleData(seu, features = row.names(seu))
seu@assays$RNA@meta.features$feature_id <- row.names(seu)
feature_meta <- seu@assays$RNA@meta.features
features <- c("DDX3X", "DDX3Y")
features %in% feature_meta$feature_name
target_feature_id <- filter(feature_meta, feature_name %in% features)
target_feature_id
features <- c(
"ENSG00000215301" = "DDX3X",
"ENSG00000067048" = "DDX3Y"
)
# dot plot
p <- DotPlot(seu, features = names(features)) +
labs(x = NULL, y = NULL) +
scale_y_discrete(limits = rev(unique(target_cell_types))) +
scale_x_discrete(breaks = names(features), labels = features) +
theme(
text = element_text(family = "Arial"),
axis.text.x.bottom = element_text(angle = 90),
plot.margin = margin(100, 100, 100, 100)
)
ppreview(
plot = p,
file = paste0(paste0(features, collapse = "_"), ".adult_hs_neocortex.dotplot.pdf"),
width = 410, height = 350
)
# violin plot with points
ident_colors <- scales::hue_pal()(length(unique(target_cell_types)))
px <- VlnPlot(seu, features = "ENSG00000215301", pt.size = 0, layer = "data", cols = ident_colors)
px <- px + rasterize(geom_jitter(mapping = aes(color = ident), data = px$data, size = 0.5),
dpi = 600
) +
scale_x_discrete(limits = unique(target_cell_types)) +
scale_y_continuous(expand = expansion(0)) +
labs(x = NULL, y = NULL, color = "Cell Type") +
guides(fill = "none") +
theme(
title = element_blank(),
legend.position = "none",
text = element_text(family = "Arial"),
axis.text.x.bottom = element_text(angle = 45),
plot.margin = margin(100, 100, 100, 100)
)
py <- VlnPlot(seu, features = "ENSG00000067048", pt.size = 0, layer = "data", cols = ident_colors)
py <- py + rasterize(geom_jitter(mapping = aes(color = ident), data = py$data, size = 0.5),
dpi = 600
) +
scale_x_discrete(limits = unique(target_cell_types)) +
scale_y_continuous(expand = expansion(0)) +
labs(x = NULL, y = NULL, color = "Cell Type") +
guides(fill = "none") +
theme(
title = element_blank(),
legend.position = "none",
text = element_text(family = "Arial"),
axis.text.x.bottom = element_text(angle = 45),
plot.margin = margin(100, 100, 100, 100)
)
p <- px %>% insert_bottom(py)
ppreview(
plot = p,
file = "Ddx3x_Ddx3y.adult_hs_neocortex.violin.pdf",
width = 300, height = 300
)
# violin plot without points
px <- VlnPlot(seu, features = "ENSG00000215301", pt.size = 0, layer = "data", cols = ident_colors) +
scale_x_discrete(limits = unique(target_cell_types)) +
scale_y_continuous(expand = expansion(0)) +
labs(x = NULL, y = NULL, color = "Cell Type") +
guides(fill = "none") +
theme(
title = element_blank(),
legend.position = "none",
text = element_text(family = "Arial"),
axis.text.x.bottom = element_text(angle = 45),
plot.margin = margin(100, 100, 100, 100)
)
py <- VlnPlot(seu, features = "ENSG00000067048", pt.size = 0, layer = "data", cols = ident_colors) +
scale_x_discrete(limits = unique(target_cell_types)) +
scale_y_continuous(expand = expansion(0)) +
labs(x = NULL, y = NULL, color = "Cell Type") +
guides(fill = "none") +
theme(
title = element_blank(),
legend.position = "none",
text = element_text(family = "Arial"),
axis.text.x.bottom = element_text(angle = 45),
plot.margin = margin(100, 100, 100, 100)
)
p <- px %>% insert_bottom(py)
ppreview(
plot = p,
file = "Ddx3x_Ddx3y.adult_hs_neocortex.violin.without_points.pdf",
width = 150, height = 200
)11 Misc
- Make a directory based on the current date and time
mkdir $(date +%Y%m%d_%H%M%S)- Add a prefix for each directory/file name
# dry-run
prefix=""; ls | awk -v p="${prefix}" '{printf "mv %s %s-%s\n", $0, p, $0}'
# run
prefix=""; ls | awk -v p="${prefix}" '{printf "mv %s %s-%s\n", $0, p, $0}' | sh- Example usage of
seqkit grep
# -P: only scan positive strand
# -i: ignore case
# -C: only count the number of occurrences
seqkit grep -P -C -j 60 -i -s -p "GCTGCAGAAGCTGAAAATCTTCTTCAT" Petaurus_breviceps.v1.self.dna_sm.primary_assembly.fa