#!/usr/bin/env python3
# tag_bam_by_mapq_gx.py
import pysam
import sys
import time
def tag_bam(input_bam, output_bam, threads=1):
    """Annotate every read in a BAM file with a GS tag derived from MAPQ and GX.

    GS values written (type Z):
      "F" - read is not uniquely mapped (MAPQ != 255)
      "G" - uniquely mapped and assigned to a gene (GX present and != "-")
      "I" - uniquely mapped but intergenic (GX missing or "-")

    Parameters
    ----------
    input_bam : str
        Path to the input BAM (opened "rb").
    output_bam : str
        Path to the output BAM (opened "wb", header copied from the input).
    threads : int, optional
        Number of pysam (de)compression threads (default: 1).

    Exits the process with status 1 on any error while streaming the BAM.
    """
    GENE_TAG = "GX"
    NEW_TAG = "GS"
    count_total = 0
    count_gene = 0
    count_intergenic = 0
    count_failed = 0
    print(f"[INFO] Start dealing with {input_bam} ...")
    start_time = time.time()
    try:
        # Apply `threads` to the reader as well so decompression also benefits.
        with pysam.AlignmentFile(input_bam, "rb", threads=threads) as bam_in, \
             pysam.AlignmentFile(
                 output_bam, "wb", template=bam_in, threads=threads
             ) as bam_out:
            for read in bam_in:
                count_total += 1
                if read.mapping_quality != 255:
                    # MAPQ 255 conventionally marks uniquely mapped reads
                    # (e.g. STAR) — TODO confirm for your aligner.
                    read.set_tag(NEW_TAG, "F", value_type="Z")
                    count_failed += 1
                elif read.has_tag(GENE_TAG) and read.get_tag(GENE_TAG) != "-":
                    read.set_tag(NEW_TAG, "G", value_type="Z")
                    count_gene += 1
                else:
                    # No GX tag at all, or GX == "-": treat as intergenic.
                    read.set_tag(NEW_TAG, "I", value_type="Z")
                    count_intergenic += 1
                bam_out.write(read)
                if count_total % 1000000 == 0:
                    print(f"[INFO] {count_total:,} reads dealed")
    except Exception as e:
        print(f"[ERROR] Unknown error occurred: {e}")
        sys.exit(1)
    end_time = time.time()
    print(f"[INFO] Dealing with {input_bam} done!")
    print(f"\n[INFO] Elapsed time: {end_time - start_time:.2f} second(s)")
    print(f"[INFO] Output file: {output_bam}")
    print(f"[INFO] Total reads: {count_total:,}")
    # Guard the percentage maths against an empty BAM (count_total == 0 would
    # otherwise raise ZeroDivisionError after an otherwise successful run).
    denom = count_total if count_total else 1
    print(
        f"[INFO] Reads uniquely mapped to genes (G): {count_gene:,} ({count_gene/denom:.2%})"
    )
    print(
        f"[INFO] Reads uniquely mapped to intergenic regions (I): {count_intergenic:,} ({count_intergenic/denom:.2%})"
    )
    print(
        f"[INFO] Reads not uniquely mapped to genome (MAPQ != 255) (F): {count_failed:,} ({count_failed/denom:.2%})"
    )
    print(
        f"\n[INFO] Next step: you should create the index file for the output BAM file yourself (samtools index)"
    )
if __name__ == "__main__":
    # Default pysam thread count; overridden by the optional third argument.
    threads = 1
    if len(sys.argv) < 3 or len(sys.argv) > 4:
        print(f"\nUsage: python {sys.argv[0]} <input.bam> <output.bam> [<threads>]")
        print("threads <INT>: the number of threads/cores used by pysam (default: 1)\n")
        sys.exit(1)
    input_bam = sys.argv[1]
    output_bam = sys.argv[2]
    if len(sys.argv) == 4:
        try:
            threads = int(sys.argv[3])
            # Enforce what the error message promises: values < 1 are invalid.
            if threads < 1:
                raise ValueError(threads)
        except ValueError:
            print(
                f"[ERROR] Argument 'threads' {sys.argv[3]} must be a positive integer"
            )
            sys.exit(1)
        else:
            print(f"[INFO] {threads} threads will be used")
    else:
        print(f"[INFO] Argument 'threads' is not given, using the default: {threads}")
    tag_bam(input_bam, output_bam, threads=threads)

2 Merge genome annotations
# Enter the AGAT container, binding the current directory to /data inside it.
singularity shell -B "${PWD}":/data /data/softwares/agat/agat_1.5.0--pl5321hdfd78af_0.sif
# Merge the two GFF3 annotation sets (NCBI pipeline + KIZ ShengLab) into one file.
agat_sp_merge_annotations.pl --gff Neovision_vision.v1.v1.NCBI_Pipeline.basic.chr.annotation.gff3 --gff Neovision_vision.v1.v1.KIZ_ShengLab.basic.chr.annotation.gff3 --out merged.gff3
# phase correction, etc.
gffread merged.gff3 -o tidy.gff3 -O -F --keep-exon-attrs --keep-genes
3 Download SRA files from NCBI GEO in batch
For detailed info, see https://github.com/ncbi/sra-tools/wiki and https://github.com/ncbi/sra-tools/wiki/08.-prefetch-and-fasterq-dump.
# Download every SRA accession listed (one per line) in SRR_Acc_File via prefetch.
SRR_Acc_File=SRR_Acc_List.tsv
OUT_DIR=tmp_downloads
# -p keeps reruns from failing when the output directory already exists.
mkdir -p "${OUT_DIR}"
# Skip blank lines so a trailing newline in the list does not trigger a bogus prefetch call.
while read -r SRR_Acc; do [[ -n "${SRR_Acc}" ]] || continue; cmd="prefetch ${SRR_Acc} --max-size u --progress --output-directory ${OUT_DIR}"; echo -e "Running command: ${cmd} ..."; eval "${cmd}"; echo -e "Successfully running command: ${cmd}\n"; done < "${SRR_Acc_File}"
4 Extract FASTQ files from SRA files in batch
Note:
For `fasterq-dump`, the `--split-3` and `--skip-technical` options are enabled by default, but for clarity, you'd better specify them explicitly yourself. You should carefully consider whether you want to include technical reads, and which splitting scheme you want. For detailed info, see https://github.com/ncbi/sra-tools/wiki and https://github.com/ncbi/sra-tools/wiki/HowTo:-fasterq-dump.
Some examples:
- Standard single- or paired-end protocols (e.g., bulk RNA-, ATAC-, ChIP-seq, etc.) with technical reads omitted: mated biological reads \(\rightarrow\) `*_1.fastq` & `*_2.fastq`; unmated reads \(\rightarrow\) `*.fastq`.
# fasterq-dump scratch space; /dev/shm is RAM-backed — make sure it is large enough.
TMP_DIR=/dev/shm
NUM_THREADS=6
# NOTE(review): assigning to PWD shadows the shell's built-in $PWD; it works here,
# but a differently named variable would be safer.
PWD=$(pwd)
# Enable extended globs so SRR+([0-9]) matches "SRR" followed by one or more digits.
shopt -s extglob
# Convert each prefetch output directory into FASTQ files (technical reads skipped, mates split).
for SRR_Acc_Path in SRR+([0-9]); do [[ -d "${SRR_Acc_Path}" ]] || continue; SRR_Acc_Path="${PWD}/${SRR_Acc_Path}"; OUT_DIR="${SRR_Acc_Path}_dump_outs"; cmd="fasterq-dump ${SRR_Acc_Path} --outdir ${OUT_DIR} --temp ${TMP_DIR} --threads ${NUM_THREADS} --progress --skip-technical --split-3"; echo -e "Running command: ${cmd} ..."; eval "${cmd}"; echo -e "Successfully running command: ${cmd}\n"; done
- Split the downloaded SRA files into individual, analysis-ready FASTQ (or FASTA) files, preserving any technical reads (e.g., cell-barcode, UMI, sample index, etc.) that are required for single-cell RNA-seq, ATAC-seq, multiome, or other protocols.
# Same loop as above, but keeps technical reads (barcodes/UMIs) and writes one
# FASTQ per read component (--include-technical --split-files).
TMP_DIR=/dev/shm
NUM_THREADS=6
# NOTE(review): assigning to PWD shadows the shell's built-in $PWD — verify intended.
PWD=$(pwd)
shopt -s extglob
for SRR_Acc_Path in SRR+([0-9]); do [[ -d "${SRR_Acc_Path}" ]] || continue; SRR_Acc_Path="${PWD}/${SRR_Acc_Path}"; OUT_DIR="${SRR_Acc_Path}_dump_outs"; cmd="fasterq-dump ${SRR_Acc_Path} --outdir ${OUT_DIR} --temp ${TMP_DIR} --threads ${NUM_THREADS} --progress --include-technical --split-files"; echo -e "Running command: ${cmd} ..."; eval "${cmd}"; echo -e "Successfully running command: ${cmd}\n"; done
5 Extract MT annotations from GTF file
# Extract MT annotations
# Add gene_name field if absent
# Prepend "MT-" to each gene name
# Pipeline: gffread converts GFF -> GTF records (-T); the first awk keeps only the
# mitochondrial contig (NC_020638.1); gawk rewrites the attribute column ($9):
# trims whitespace, ensures a trailing ";", copies gene_id into gene_name when
# missing, then prefixes every gene_name with "MT-"; the final gffread re-emits
# a clean GTF as target.gtf.
gffread GCF_011764305.1_ASM1176430v1.1_genomic.gff -T | awk 'BEGIN{FS=OFS="\t"} $1 == "NC_020638.1"' | gawk 'BEGIN{FS=OFS="\t"} {gsub(/^[ \t]+|[ \t]+$/, "", $9); if ($9 !~ /;$/) {$9 = $9 ";"}; if ($9 ~ /gene_id/ && $9 !~ /gene_name/) {match($9, /gene_id "([^"]+)"/, arr); $9 = $9 " gene_name \"" arr[1] "\";"}; if ($9 ~ /gene_name/) {match($9, /gene_name "([^"]+)"/, arr); gsub(/gene_name "[^"]+"/, "gene_name \"MT-" arr[1] "\"", $9)}; print}' | gffread -T -o target.gtf
6 Compress FASTQ files in batch
find "$(pwd)" -regextype gnu-awk -type f -iregex ".+\.(fastq|fq)$" | xargs -I {} pigz {}
7 Download chromosome-level primary assembly FASTA files in batch from Ensembl
#!/bin/bash -e
# Download the per-chromosome soft-masked primary-assembly FASTA files for one
# Ensembl release/species, verify each archive, and concatenate into one gzip.
release=release-115
species=rattus_norvegicus
assembly=GRCr8
# dna_sm = soft-masked genomic sequence.
mask_type=dna_sm
chr_set=( {1..20} X Y MT )
concurrent_jobs=1
merged_file=merged.fa.gz
out_dir="$(date +%Y%m%d_%H%M%S).downloads"
base_url="ftp://ftp.ensembl.org/pub/${release}/fasta/${species}/dna"
# The literal "{}" inside file_name is replaced by xargs -I {} with each
# chromosome name when the string is passed through xargs below.
file_name="${species^}.${assembly}.${mask_type}.primary_assembly.{}.fa.gz"
mkdir -p "${out_dir}"
cd "${out_dir}"
download_failed_log="download_failed.log"
# One wget per chromosome (up to concurrent_jobs in parallel); a failed URL is
# appended to the log instead of aborting the whole run.
printf "%s\n" "${chr_set[@]}" | \
xargs -P "${concurrent_jobs}" -I {} \
sh -c 'wget -c "${1}" || echo "${1}" >> "${2}"' _ "${base_url}/${file_name}" "${download_failed_log}"
if [[ "${#chr_set[@]}" -ne "$(ls *.fa.gz 2>/dev/null | wc -l)" || -s "${download_failed_log}" ]]; then echo "ERROR: failed to download the following files:"; cat "${download_failed_log}"; exit 1; else echo "INFO: all files have been downloaded"; fi
test_failed_log="test_failed.log"
# pigz -t test-decompresses each archive to catch truncated/corrupted downloads.
printf "%s\n" "${chr_set[@]}" | \
xargs -P "${concurrent_jobs}" -I {} \
sh -c 'pigz -t "${1}" || echo "${1}" >> "${2}"' _ "${file_name}" "${test_failed_log}"
if [[ -s "${test_failed_log}" ]]; then echo "ERROR: the following files are corrupted:"; cat "${test_failed_log}"; exit 1; else echo "INFO: all files are complete!"; fi
# NOTE(review): the *.fa.gz glob expands in lexical order (1, 10, 11, ..., 2, ...),
# so chromosomes in merged_file are not in natural order — confirm downstream
# tools do not care about contig order.
pigz -c -k -d *.fa.gz | pigz -nc > "${merged_file}"
9 Misc
- Make a directory based on the current date and time
mkdir $(date +%Y%m%d_%H%M%S)
- Add a prefix for each directory/file name
# Prepend ${prefix}- to every name in the current directory.
# The generated names are double-quoted so the mv commands survive file names
# containing spaces. NOTE(review): parsing `ls` is still fragile for names with
# embedded quotes or newlines — acceptable for a quick interactive rename.
# dry-run
prefix=""; ls | awk -v p="${prefix}" '{printf "mv \"%s\" \"%s-%s\"\n", $0, p, $0}'
# run
prefix=""; ls | awk -v p="${prefix}" '{printf "mv \"%s\" \"%s-%s\"\n", $0, p, $0}' | sh
- Example usage of `seqkit grep`
# -P: only scan positive strand
# -i: ignore case
# -C: only count the number of occurrences
# -s: match against the sequence itself (not the record name); -p: the query
# motif; -j: number of worker threads — verify against your seqkit version's docs.
seqkit grep -P -C -j 60 -i -s -p "GCTGCAGAAGCTGAAAATCTTCTTCAT" Petaurus_breviceps.v1.self.dna_sm.primary_assembly.fa