Misc R code – BrainArch

1 Introduction

Used to collect various R code snippets.

2 Calculate gene lengths

2.1 Gene body lengths

Used for RNA-seq counts normalization in intron-included mode.

library(GenomicFeatures)
library(tidyverse)
library(vroom)

gtf_file <- "/data/database/annotation/release/mus_musculus/Mus_musculus.GRCm39.vM38.gencode.basic.chr.annotation.gtf"

txdb <- makeTxDbFromGFF(gtf_file, format = "gtf")
genes_gr <- genes(txdb)
gene_body_lengths <- width(genes_gr)
gene_len_df <- tibble(
    gene_id = names(genes_gr),
    gene_length = gene_body_lengths
)
vroom_write(
    gene_len_df,
    file = paste0(gtf_file, ".gene_lengths.tsv"),
    col_names = TRUE, append = FALSE
)

2.2 Union exon lengths

Used for RNA-seq counts normalization in intron-excluded mode.

library(GenomicFeatures)
library(tidyverse)
library(vroom)

gtf_file <- "/data/database/annotation/release/mus_musculus/gencode.vM23.annotation.gtf"

txdb <- makeTxDbFromGFF(gtf_file, format = "gtf")
exons_list <- exonsBy(txdb, by = "gene")
exons_reduced <- GenomicRanges::reduce(exons_list)
gene_lengths <- sum(width(exons_reduced))
gene_len_df <- tibble(
    gene_id = names(gene_lengths),
    gene_length = gene_lengths
)
vroom_write(
    gene_len_df,
    file = paste0(gtf_file, ".union_exon_lengths.tsv"),
    col_names = TRUE, append = FALSE
)

3 Process one-to-one orthologs of OrthoFinder

library(tidyverse)
library(vroom)

orthogroups_gene_count_file <- "/data/database/annotation/derived/orthologs/20251104_202818/OrthoFinder/Orthogroups/Orthogroups.GeneCount.tsv"
orthologs_file <- "/data/database/annotation/derived/orthologs/20251104_202818/OrthoFinder/Orthologues/Orthologues_NeoVis_KIZ_NCBIv2/NeoVis_KIZ_NCBIv2__v__MusMus_GRCm39_GENCODEvM38.tsv"
species1_tx_id_mpt_file <- "/data/database/annotation/release/neogale_vison/Neogale_vison.v1.v2.KIZ_NCBI.basic.chr.annotation.transcript_id_name_mapping_table.tsv"
species2_tx_id_mpt_file <- "/data/database/annotation/release/mus_musculus/Mus_musculus.GRCm39.vM38.gencode.basic.chr.annotation.transcript_id_name_mapping_table.tsv"
species_columns <- c("NeoVis_KIZ_NCBIv2", "MusMus_GRCm39_GENCODEvM38")

species1_tx_id_mpt_df <- vroom(species1_tx_id_mpt_file)
names(species1_tx_id_mpt_df) <- paste0(species_columns[1], ".", names(species1_tx_id_mpt_df))
species1_tx_id_mpt_df[[paste0(species_columns[1], ".transcript_id")]] <- gsub(":", "_", species1_tx_id_mpt_df[[paste0(species_columns[1], ".transcript_id")]], fixed = TRUE)

species2_tx_id_mpt_df <- vroom(species2_tx_id_mpt_file)
names(species2_tx_id_mpt_df) <- paste0(species_columns[2], ".", names(species2_tx_id_mpt_df))
species2_tx_id_mpt_df[[paste0(species_columns[2], ".transcript_id")]] <- gsub(":", "_", species2_tx_id_mpt_df[[paste0(species_columns[2], ".transcript_id")]], fixed = TRUE)

orthogroups_gene_count_df <- vroom(orthogroups_gene_count_file)
one2one_orthogroups <- orthogroups_gene_count_df$Orthogroup[rowSums(orthogroups_gene_count_df[, species_columns]) == length(species_columns)]
one2one_orthogroups <- unique(na.omit(one2one_orthogroups))

orthologs_df <- vroom(orthologs_file)
orthologs_df <- orthologs_df %>%
    na.omit() %>%
    filter(Orthogroup %in% one2one_orthogroups) %>%
    separate_longer_delim(cols = all_of(species_columns[1]), delim = ",") %>%
    separate_longer_delim(cols = all_of(species_columns[2]), delim = ",") %>%
    mutate(
        !!sym(species_columns[1]) := trimws(.data[[species_columns[1]]]),
        !!sym(species_columns[2]) := trimws(.data[[species_columns[2]]])
    )

table(orthologs_df[[species_columns[1]]] %in% species1_tx_id_mpt_df[[paste0(species_columns[1], ".transcript_id")]])
table(orthologs_df[[species_columns[2]]] %in% species2_tx_id_mpt_df[[paste0(species_columns[2], ".transcript_id")]])

df <- orthologs_df %>%
    inner_join(species1_tx_id_mpt_df, by = setNames(paste0(species_columns[1], ".transcript_id"), species_columns[1])) %>%
    inner_join(species2_tx_id_mpt_df, by = setNames(paste0(species_columns[2], ".transcript_id"), species_columns[2])) %>%
    rename(
        !!sym(paste0(species_columns[1], ".transcript_id")) := !!sym(species_columns[1]),
        !!sym(paste0(species_columns[2], ".transcript_id")) := !!sym(species_columns[2])
    )
df[[paste0(species_columns[1], ".gene_name")]][is.na(df[[paste0(species_columns[1], ".gene_name")]])] <- df[[paste0(species_columns[1], ".gene_id")]][is.na(df[[paste0(species_columns[1], ".gene_name")]])]
df[[paste0(species_columns[2], ".gene_name")]][is.na(df[[paste0(species_columns[2], ".gene_name")]])] <- df[[paste0(species_columns[2], ".gene_id")]][is.na(df[[paste0(species_columns[2], ".gene_name")]])]

vroom_write(
    df,
    file = gsub("\\.(tsv|txt)$", ".with_gene_info.tsv", orthologs_file),
    col_names = TRUE, append = FALSE
)

4 Retrieve data from AnnotationHub

library(AnnotationHub)
library(AnnotationDbi)

work_dir <- "/Users/yangrui/Downloads"
setwd(work_dir)

ah <- AnnotationHub()

# BiocHubsShiny::BiocHubsShiny()

file <- "org.Petaurus_breviceps_papuanus.eg.sqlite"
hubid <- "AH120027"

x <- ah[[hubid]]

saveDb(x, file = file)
# loadDb(file)

5 Plot phylogenetic tree

Case 1

Download example data from here.

library(treeio)
library(ggtree)
library(tidyverse)
library(vroom)
library(patchwork)
library(YRUtils)

font_family <- "Arial"
work_dir <- "/home/yangrui/lab_projs/lyr"

tree_file <- "omm_v12_190tax_CDS_tree.nexus"
gi_file <- "GI.tsv"

gyr_species <- c("Human", "Dolphin", "Elephant")
lis_species <- c("Marmoset", "Cat", "Manatee")

# from top to bottom
tips_order <- c(
    "Human", "Marmoset", "Galago", "Tree shrew", "Europeans hrew", "Mouse",
    "Dolphin", "Cat", "Chinese rufous horseshoe bat", "Black flying fox", "Star-nosed mole", "Hedgehog",
    "Elephant", "Manatee", "Cape elephant shrew", "Tasmanian devil", "Opossum"
)

setwd(work_dir)
gi_df <- vroom(gi_file) %>%
    mutate(
        species_name = factor(species_name, levels = rev(tips_order)),
        label = gsub(" +", "_", scientific_name),
        highlight = if_else(species_name %in% gyr_species, "GYR",
            if_else(species_name %in% lis_species, "LIS", "NA")
        )
    ) %>%
    arrange(species_name) %>%
    # the column matching tip labels in tree object must be in the first column
    select(label, everything())

tree <- read.nexus(tree_file)
tips_to_drop <- unique(na.omit(tree$tip.label[!(tree$tip.label %in% gi_df$label)]))
tree <- drop.tip(tree, tips_to_drop)
tree <- ape::rotateConstr(tree, gi_df$label)

p_tree <- ggtree(
    tree,
    branch.length = "none",
    ladderize = FALSE
) %<+% gi_df +
    geom_tiplab(
        aes(
            label = species_name,
            color = highlight
        ),
        family = font_family,
        show.legend = FALSE,
        size = 6
    ) +
    scale_color_manual(values = c(
        "GYR" = "red",
        "LIS" = "blue",
        "NA" = "black"
    )) +
    scale_x_continuous(position = "top") +
    hexpand(1.1) +
    labs(x = "Species Tree") +
    theme(
        text = element_text(family = font_family),
        axis.title.x = element_text(
            face = "bold",
            size = 20
        ),
        panel.border = element_rect(
            fill = NA,
            color = "grey"
        )
    )

p_line <- ggplot(gi_df, aes(GI, species_name, group = 1)) +
    geom_path() +
    geom_point(
        aes(
            color = highlight,
            size = highlight
        ),
        show.legend = FALSE
    ) +
    scale_x_continuous(
        position = "top",
        expand = expansion(0),
        breaks = 0:ceiling(max(gi_df$GI)),
        limits = c(0, ceiling(max(gi_df$GI)))
    ) +
    scale_color_manual(values = c(
        "GYR" = "red",
        "LIS" = "blue",
        "NA" = "black"
    )) +
    scale_size_manual(
        values = c(
            "GYR" = 5,
            "LIS" = 5,
            "NA" = 3
        )
    ) +
    labs(x = "GI", y = NULL) +
    theme_classic(base_family = font_family) +
    theme(
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.line.y = element_blank(),
        axis.text.x = element_text(size = 6 * 2.845),
        axis.title.x = element_text(
            face = "bold",
            size = 20
        ),
        panel.border = element_rect(
            fill = NA,
            color = "grey"
        )
    )

p <- p_tree + p_line +
    plot_layout(
        ncol = 2,
        width = c(3, 1)
    )
ppreview(p, file = "tree_line.pdf")

6 Plot TF motif logos from PFMs

Case 1

library(ggplot2)
library(ggseqlogo)
library(vroom)
library(unigd)

in_file <- "/data/database/tf_motifs/20251014_153919/all_motifs_rmdup.from_peca2.txt"
out_dir <- "/home/yangrui/temp"
special_chars_regex <- "[\t<>:\"/\\\\|?*[:space:]`&;\\$(){}\\[\\]#~[:punct:]]"

data <- trimws(vroom_lines(in_file, skip_empty_rows = TRUE))
table(data == "")
table(is.na(data))
ls <- list()
for (line in data) {
    if (grepl("^>", line)) {
        if (exists("header") && exists("pfm") && is.matrix(pfm)) {
            row.names(pfm) <- c("A", "C", "G", "T")
            ls[[header]] <- pfm
        }
        header <- gsub("_{2,}", "_", gsub(special_chars_regex, "_", gsub("^>", "", line), perl = TRUE))
        pfm <- NULL
    } else {
        pf <- as.numeric(trimws(strsplit(line, "\\t")[[1]]))
        if (length(pf) != 4 || any(is.na(pf))) {
            stop("malformated line found in ", header)
        }
        if (is.null(pfm)) {
            pfm <- matrix(pf, ncol = 1, byrow = FALSE)
        } else {
            pfm <- cbind(pfm, matrix(pf, ncol = 1, byrow = FALSE))
        }
    }
}
row.names(pfm) <- c("A", "C", "G", "T")
ls[[header]] <- pfm

plot_method <- "bits"
for (header in names(ls)) {
    p <- ggseqlogo(ls[[header]], method = plot_method) +
        scale_y_continuous(breaks = scales::breaks_extended(4)) +
        theme(text = element_text(family = "Arial", size = 24))
    ugd_save_inline(
        {
            print(p)
        },
        file = file.path(out_dir, paste0(header, ".", plot_method, ".pdf")),
        width = 50 + ncol(ls[[header]]) * 30,
        height = 160
    )
}

7 Plot genome tracks using plotgardener

Case 1

Download example data from here.

library(plotgardener)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
library(org.Mm.eg.db)
library(vroom)
library(tidyverse)
library(unigd)

adjacentMeans <- function(x, n = 1) {
    for (i in 1:n) {
        idx <- seq(1, length(x) - 1)
        x <- c(rbind(x[idx], (x[idx] + x[idx + 1]) / 2), x[length(x)])
    }
    return(x)
}

work_dir <- "/home/yangrui/temp"
setwd(work_dir)

ugd()
pageWidth <- 8
pageHeight <- 8
defaultUnits <- "inches"
pageCreate(
    width = pageWidth,
    height = pageHeight,
    default.units = defaultUnits
)

assembly <- "mm10"
chrom <- "chr1"
chromRange <- c(56730000, 57280000)
regionWidth <- 7
defaultJust <- c("left", "top")
defaultFontSize <- 10
defaultFontColor <- "black"
defaultFontFamily <- "Arial"
genomicRegion <- pgParams(
    chrom = chrom,
    assembly = assembly,
    chromstart = chromRange[1],
    chromend = chromRange[2],
    width = regionWidth,
    default.units = defaultUnits,
    just = defaultJust,
    fontsize = defaultFontSize,
    fontcolor = defaultFontColor,
    fontfamily = defaultFontFamily
)

gpX <- 0.5
gpY <- 1.4
gpHeight <- 0.5
genesPlot <- plotGenes(
    params = genomicRegion,
    x = gpX, y = gpY,
    height = gpHeight,
    geneHighlights = data.frame(
        "gene" = c("Satb2", "9130024F11Rik", "BC055402"),
        "color" = c("steelblue", "white", "white")
    ),
    strandLabels = FALSE
)

aglX <- 0.5
aglY <- 0.5
defaultLineWidth <- 1
annoGenomeLabel(
    plot = genesPlot,
    x = aglX, y = aglY,
    scale = "Mb",
    at = adjacentMeans(chromRange, 2),
    lwd = defaultLineWidth,
    params = genomicRegion
)

peaksFile <- "pscnes.bed"
ppX <- 0.5
ppY <- 1
ppHeight <- 0.15
peaksPlot <- plotRanges(
    data = peaksFile,
    x = ppX, y = ppY,
    collapse = TRUE,
    height = ppHeight,
    fill = "black",
    linecolor = "fill",
    params = genomicRegion
)

hicLoopsFile <- "hic_loops.bed"
hicLoops_df <- vroom(hicLoopsFile, col_names = c("chrom1", "start1", "end1", "name")) %>%
    separate_wider_delim(cols = "name", delim = regex("[-:,]"), names = c("chrom2", "start2", "end2", "score")) %>%
    type_convert() %>%
    mutate(
        length = start2 - start1,
        height = length / max(length)
    )

apX <- 0.5
apY <- 2
apHeight <- 0.6
archesPlot <- plotPairsArches(
    data = hicLoops_df,
    params = genomicRegion,
    fill = colorby("score", palette = colorRampPalette(c("#FFE0B2", "#E65100"))),
    linecolor = "fill",
    archHeight = "height",
    alpha = 1,
    x = apX, y = apY,
    height = apHeight,
    flip = TRUE,
    baseline = TRUE,
    baseline.color = "grey25",
    baseline.lwd = 1
)

ahlX <- 1
ahlY <- 3
ahlWidth <- 0.1
ahlHeight <- 1
ahlBreaks <- seq(0, 1.4, 0.35)
annoHeatmapLegend(
    plot = archesPlot,
    x = ahlX, y = ahlY,
    width = ahlWidth, height = ahlHeight,
    orientation = "v",
    fontsize = 0,
    scientific = TRUE,
    ticks = TRUE,
    digits = 2,
    breaks = ahlBreaks,
    params = genomicRegion
)

plotText(
    label = as.character(ahlBreaks),
    x = ahlX + ahlWidth + 0.1,
    y = scales::rescale(ahlBreaks,
        to = c(ahlY + ahlHeight, ahlY)
    ),
    just = c("left", "center"),
    params = genomicRegion
)

plotText(
    label = "score",
    params = genomicRegion,
    x = ahlX + ahlWidth / 2,
    y = ahlY - 0.2,
    just = c("center", "center"),
    fontface = "bold"
)

plotSegments(
    x0 = c(ppX, gpX, apX) - 0.1,
    y0 = c(ppY + ppHeight / 2, gpY + gpHeight / 2, apY + apHeight / 2) - 0.25,
    x1 = c(ppX, gpX, apX) - 0.1,
    y1 = c(ppY + ppHeight / 2, gpY + gpHeight / 2, apY + apHeight / 2) + 0.25,
    lwd = 1,
    linecolor = "black"
)

plotText(
    label = c("PSCNEs", "Genes", "Links"),
    params = genomicRegion,
    x = c(ppX, gpX, apX) - 0.3,
    y = c(ppY + ppHeight / 2, gpY + gpHeight / 2, apY + apHeight / 2),
    rot = 90,
    just = c("center", "center")
)

pageGuideHide()
ugd_save(
    file = "genome_tracks.pdf",
    width = 800, height = 800
)
dev.off()

8 Output FASTA sequences with fixed-width wrapping

str <- paste0(c("ATCG", rep("N", 30), "ATCG"), collapse = "")
width <- 60
first_line <- ">20251013_204905_e8834659-1df1-4ae5-a838-86910aa57e31"
out_file <- "/home/dell/test/test.fa"

width <- ifelse(nchar(str) < width, nchar(str), width)
starts <- seq(1, nchar(str), width)
ends <- seq(width, nchar(str), width)
if (tail(ends, 1) != nchar(str)) ends <- c(ends, nchar(str))
substrs <- c(first_line, substring(str, starts, ends))
writeLines(substrs, con = out_file)

elem_num <- length(starts)
max_length <- nchar(tail(ends, 1))
message(paste0(
    c(
        paste0(stringr::str_pad(head(starts, 2), max_length, side = "right"), " - ", stringr::str_pad(head(ends, 2), max_length, side = "right")),
        if (elem_num > 5) paste0(stringr::str_pad("...", max_length, side = "right"), " - ", stringr::str_pad("...", max_length, side = "right")) else NULL,
        if (elem_num > 5) paste0(stringr::str_pad(tail(starts, 2), max_length, side = "right"), " - ", stringr::str_pad(tail(ends, 2), max_length, side = "right")) else NULL
    ),
    collapse = "\n"
))

9 Plot volcano plot in batch

library(tidyverse)
library(vroom)
library(magrittr)
library(ggrepel)
library(ggprism)
library(YRUtils)

input_dir <- "/home/yangrui/mywd/wutintin_transcriptome/input"
output_dir <- "/home/yangrui/mywd/wutintin_transcriptome/output"
top_n <- 10
gene_id_column <- "gene_name"
diff_flag_column <- "diff_flag"
logfc_column <- "log2FoldChange"
padj_column <- "padj"
color_column <- "diff_flag"
logfc_threshold <- 1
padj_threshold <- 0.05
# Up, Down, No
diff_flag_colors <- c("magenta4", "cyan4", "grey25")
geom_point_alpha <- 0.5
geom_point_size <- 2
geom_line_linewidth <- 1
geom_line_color <- "grey25"
geom_line_linetype <- "dashed"
geom_text_alpha <- 1
min_segment_length <- 3
geom_text_size <- 5
theme_prism_panel_border <- TRUE
font_family <- "Arial"
theme_base_font_size <- 16

files <- list.files(input_dir, pattern = "\\.(tsv|txt|csv)$", full.names = TRUE, recursive = FALSE)
for (file in files) {
    data <- vroom(file) %>%
        select(all_of(c(gene_id_column, diff_flag_column, logfc_column, padj_column))) %>%
        set_colnames(c("gene_id", "diff_flag", "logFC", "padj")) %>%
        distinct()
    anno_data <- data %>%
        filter(diff_flag != "NO") %>%
        group_by(diff_flag) %>%
        slice_max(abs(logFC), n = top_n)

    diff_flag_count_table <- count(data, diff_flag) %>%
        arrange(diff_flag)
    plot_title <- paste0(paste0(
        diff_flag_count_table$diff_flag, ": ",
        diff_flag_count_table$n
    ), collapse = "\n")

    p <- ggplot(data, aes(logFC, -log10(padj), color = diff_flag)) +
        geom_point(alpha = geom_point_alpha, size = geom_point_size) +
        geom_vline(
            xintercept = c(-logfc_threshold, logfc_threshold),
            linewidth = geom_line_linewidth,
            color = geom_line_color,
            linetype = geom_line_linetype
        ) +
        geom_hline(
            yintercept = -log10(padj_threshold),
            linewidth = geom_line_linewidth,
            color = geom_line_color,
            linetype = geom_line_linetype
        ) +
        geom_label_repel(
            data = anno_data,
            mapping = aes(logFC, -log10(padj), color = diff_flag, label = gene_id),
            max.overlaps = 10000, show.legend = FALSE, alpha = geom_text_alpha,
            min.segment.length = min_segment_length, size = geom_text_size
        ) +
        labs(
            title = plot_title,
            x = "logFC", y = "-log10(padj)",
            color = paste0("padj < ", padj_threshold, "\nlogFC > ", logfc_threshold)
        ) +
        theme_prism(border = theme_prism_panel_border, base_family = font_family, base_size = theme_base_font_size) +
        theme(legend.title = element_text()) +
        scale_color_manual(values = setNames(
            diff_flag_colors,
            c(paste0(strsplit(gsub("\\.(tsv|txt|csv)$", "", basename(file)), "_vs_", fixed = TRUE)[[1]], " Up"), "NO")
        ))

    ppreview(p, file = file.path(output_dir, gsub("\\.(tsv|txt|csv)$", ".pdf", basename(file))))
}

10 Visualize scRNA-Seq expression of a single gene using dot plot and violin plot of Seurat

library(Seurat)
library(tidyverse)
library(YRUtils)

rna_rds_file <- "/data/users/yangrui/sc_omics_ref_datasets/human/dataset_2/ana/res/rna_to_umap.rds"
output_dir <- "/home/yangrui/temp"
target_gene_names <- "BTN3A2"
target_cell_types <- c("Exc. GluN1" = "GluN1", "Exc. GluN2" = "GluN2", "Exc. GluN3" = "GluN3", "Exc. GluN4" = "GluN4", "Exc. GluN5" = "GluN5", "Exc. GluN6" = "GluN6", "Exc. GluN7" = "GluN7", "Exc. GluN8" = "GluN8", "CGE IN " = "CGE IN", "MGE IN " = "MGE IN", "Oligo" = "OPC/Oligo", "MG" = "MG")

rna <- readRDS(rna_rds_file)
echo_vec(sort(unique(rna[[]]$Name)))

# dot plot
p <- DotPlot(rna, features = target_gene_names, idents = target_cell_types) +
    RotatedAxis() +
    scale_y_discrete(
        limits = rev(target_cell_types),
        labels = rev(names(target_cell_types))
    ) +
    labs(x = NULL, y = NULL) +
    theme(text = element_text(family = "Arial"))

ppreview(p, file = file.path(output_dir, "dotplot.pdf"))

# violin plot
ident_colors <- scales::hue_pal()(length(target_cell_types))

p <- VlnPlot(rna, features = target_gene_names, idents = target_cell_types, pt.size = 0, cols = ident_colors)
p <- p + geom_jitter(mapping = aes(color = ident), data = p$data, size = 1) +
    scale_x_discrete(
        limits = target_cell_types,
        labels = names(target_cell_types)
    ) +
    scale_y_continuous(
        expand = expansion(0),
        limits = c(0, ceiling(max(p$data[[target_gene_names]])))
    ) +
    labs(x = NULL, y = NULL, color = "Cell Type") +
    guides(fill = "none") +
    theme(text = element_text(family = "Arial"))

ppreview(p, file = file.path(output_dir, "violinplot.pdf"))

11 LaTeX spaces

`a b`	\(a \qquad b\)	Two `m` widths
`a b`	\(a \quad b\)	One `m` width
`a \ b`	\(a \ b\)	\(\frac{1}{3}\) `m` widths
`a \; b`	\(a \; b\)	\(\frac{2}{7}\) `m` widths
`a \, b`	\(a \, b\)	\(\frac{1}{6}\) `m` widths
`ab`	\(ab\)	No space
`a \! b`	\(a \! b\)	\(-\frac{1}{6}\) `m` widths
`\hspace{length}`	\(a \hspace{1cm} b\)	Horizontal space of specified length

12 clusterProfiler enrichment results interpretation

超几何分布：

假定在 \(N\) 件产品中有 \(M\) 件不合理，即不合格率 \(p=\frac{M}{N}\)。

现在产品中随机抽取 \(n\) 件做检查，发现 \(k\) 件不合格品的概率为

\[P(X=k)=\frac{C^k_M C^{n-k}_{N-M}}{C^n_N},\ k=t, t+1, ...,s,\ s=\min(M, n),\ t=n-(N-M) \ge 0\]

在做 GO 富集时，记 \(N\) 为背景基因集大小（例如可用样本中所有表达的基因或 GO 数据库中的所有基因作为背景基因集），\(m\) 为某一通路下的基因数，\(n\) 为输入做富集的基因数，\(k\) 为属于该通路下的基因数，则依据超几何分布，我们可以计算如下指标：

p.adjust: adjusted p value (i.e. adjusted \(P(X=k)\)).
Count: \(k\).
Gene ratio: \(\frac{k}{n}\).
Background ratio: \(\frac{m}{N}\).
Rich factor: \(\frac{k}{m}\).
Fold enrichment: \(\frac{\text{Gene ratio}}{\text{Background ratio}}\).

13 Sort GO terms of two clusters based on `RichFactor` and `p.adjust` and visualize them using dot plot

library(tidyverse)
library(vroom)
library(YRUtils)

go_file <- "go_bp.tsv"
cluster_levels <- c("P2-Sul Up", "P10-Sul Up")
output_dir <- "~/temp"

go_df <- vroom(go_file) %>%
    mutate(Cluster = factor(Cluster, levels = cluster_levels))

# classify terms into five categories based on RichFactor
sample_flag_levels <- c(paste0(cluster_levels[1], c("@Specific", "@Up")), paste0(paste0(cluster_levels, collapse = "_vs_"), "@Equal"), paste0(cluster_levels[2], c("@Up", "@Specific")))
sample_flag_df <- go_df %>%
    pivot_wider(id_cols = "ID", names_from = "Cluster", values_from = "RichFactor") %>%
    mutate(
        SampleFlag = if_else(is.na(.[[cluster_levels[1]]]), paste0(cluster_levels[2], "@Specific"), if_else(is.na(.[[cluster_levels[2]]]), paste0(cluster_levels[1], "@Specific"), if_else(.[[cluster_levels[1]]] > .[[cluster_levels[2]]], paste0(cluster_levels[1], "@Up"), if_else(.[[cluster_levels[1]]] < .[[cluster_levels[2]]], paste0(cluster_levels[2], "@Up"), paste0(cluster_levels[1], "_vs_", cluster_levels[2], "@Equal"))))),
        SampleFlag = factor(SampleFlag, levels = sample_flag_levels)
    ) %>%
    pivot_longer(cols = !all_of(c("ID", "SampleFlag")), names_to = "Cluster", values_to = "RichFactor") %>%
    na.omit() %>%
    distinct()
# sort terms based on RichFactor and p.adjust
df <- inner_join(sample_flag_df, distinct(select(go_df, Cluster, ID, p.adjust)), by = c("Cluster", "ID")) %>%
    arrange(
        SampleFlag,
        case_when(
            Cluster == cluster_levels[1] ~ -RichFactor,
            Cluster == cluster_levels[2] ~ RichFactor,
            TRUE ~ RichFactor
        ),
        case_when(
            Cluster == cluster_levels[1] ~ -p.adjust,
            Cluster == cluster_levels[2] ~ p.adjust,
            TRUE ~ p.adjust
        )
    )

go_df <- go_df %>%
    mutate(ID = factor(ID, levels = unique(df[["ID"]]))) %>%
    arrange(ID) %>%
    mutate(Description = factor(Description, levels = rev(unique(Description))))

p <- ggplot(go_df, aes(Cluster, Description, size = RichFactor, color = p.adjust)) +
    geom_point() +
    scale_color_gradient(low = "#e06663", high = "#327eba") +
    scale_y_discrete(position = "right") +
    guides(x = guide_axis(angle = 90)) +
    labs(x = NULL, y = NULL) +
    theme_bw(base_size = 18, base_family = "Arial")
ppreview(p, file = file.path(output_dir, "temp.pdf"))

14 Retrieve UniProt entries using REST API in batch

library(vroom)
library(tidyverse)
library(glue)
library(rvest)

# no more than 25
max_length <- 20
organism_id <- 9606
url_template <- "https://rest.uniprot.org/uniprotkb/search?query=({accession_str})%20AND%20(active:true)%20AND%20(organism_id:{organism_id})&fields=accession,gene_primary,organism_name,organism_id,protein_name,annotation_score,protein_existence,reviewed,ft_intramem,cc_subcellular_location,ft_topo_dom,ft_transmem,go,go_p,go_c,go_f,cc_interaction&format=tsv"
file <- "test.tsv"

accessions <- vroom(file) %>%
    pull(Entry) %>%
    na.omit() %>%
    unique()

accession_ls <- split(accessions, ceiling(seq_along(accessions) / max_length))
df <- tibble()
for (accession_vec in accession_ls) {
    accession_str <- paste0(paste0("accession:", accession_vec), collapse = "%20OR%20")
    url_instance <- glue(url_template)

    e <- try(item_raw_text <- read_html(url_instance) %>% html_text(), silent = T)
    if ("try-error" %in% class(e)) {
        message("invalid accession string: ", accession_str)
    } else {
        item_df <- vroom(I(item_raw_text), delim = "\t") %>%
            mutate(
                `Organism (ID)` = as.integer(`Organism (ID)`),
                Annotation = as.integer(Annotation)
            )
        df <- bind_rows(df, item_df)
    }
}
df <- distinct(df)

15 Visualize multiple sequence alignments together with phylogenetic tree

For example data, see here.

library(Biostrings)
library(tidyverse)
library(YRUtils)
library(treeio)
library(ggtree)
library(aplot)

work_dir <- "/data/users/yangrui/lab_projs/yeyaxin"
setwd(work_dir)

fa_file <- "pscnes.fa"
tree_file <- "species.nwk"
trim_n <- 100
output_file <- "output.pdf"

# use nwk_name to bridge tree plot and heatmap
species_df <- tibble(
    show_name = c("Homo Sapiens", "Mus Musculus", "Sus Scrofa", "Tursiops Truncatus", "Felis Catus", "Ailuropoda Melanoleuca", "Odobenus Rosmarus Divergens", "Erinaceus Europaeus", "Sorex Araneus", "Loxodonta Africana", "Dasypus Novemcinctus", "Monodelphis Domestica", "Chelonia Mydas", "Columba Livia"),
    fa_name = c("hg38", "mm10", "susScr3", "turTru2", "felCat8", "ailMel1", "odoRosDiv1", "eriEur2", "sorAra2", "loxAfr3", "dasNov3", "monDom5", "cheMyd1", "colLiv1"),
    nwk_name = gsub(" ", "_", str_to_sentence(show_name))
)

tree <- read.newick(tree_file)
tree <- ape::rotateConstr(tree, rev(species_df$nwk_name))

seqs <- as.character(readDNAStringSet(fa_file, format = "fasta"))
seqs <- sapply(seqs, function(seq) substr(seq, trim_n + 1, nchar(seq) - trim_n))

base_mat <- strsplit(seqs, "") %>% do.call(rbind, .)
colnames(base_mat) <- paste0("col", seq_len(ncol(base_mat)))

uniq_bases <- c(sort(unique(as.character(base_mat)), decreasing = TRUE), "=")
uniq_base_colors <- setNames(c(scales::hue_pal()(length(uniq_bases) - 1), "grey"), uniq_bases)

color_mat <- apply(base_mat, 2, function(c) {
    ref_base <- names(sort(-table(c)))[1]
    c[c == ref_base] <- "="
    return(c)
})

df <- inner_join(
    as.data.frame(as.table(base_mat)),
    as.data.frame(as.table(color_mat)),
    by = c("Var1", "Var2"), suffix = c(".base", ".color")
) %>%
    inner_join(species_df, by = c("Var1" = "fa_name")) %>%
    mutate(
        nwk_name = factor(nwk_name, levels = rev(species_df$nwk_name)),
        Freq.color = as.factor(Freq.color)
    )
y_label_df <- distinct(df[, c("show_name", "nwk_name")])

p_base <- ggplot(df, aes(x = Var2, y = nwk_name, fill = Freq.color)) +
    geom_raster(alpha = 0.8) +
    geom_text(aes(label = Freq.base),
        family = "Courier New", fontface = "bold",
        size.unit = "pt", size = 20
    ) +
    scale_y_discrete(
        position = "right",
        labels = setNames(y_label_df$show_name, y_label_df$nwk_name)
    ) +
    scale_fill_manual(values = uniq_base_colors) +
    coord_fixed(ratio = 1.6) +
    theme_minimal(base_family = "Arial") +
    theme(
        axis.text = element_text(size = 20, color = "black"),
        legend.text = element_text(size = 20, family = "Courier New", face = "bold", color = "black"),
        legend.title = element_blank(),
        panel.grid = element_blank(),
        axis.text.x = element_blank(),
        axis.title = element_blank()
    )

p_tree <- ggtree(
    tree,
    ladderize = FALSE
)

p <- p_base %>% insert_left(p_tree, width = 0.6)

ppreview(
    p,
    file = output_file,
    width = 3300, height = 1000
)

16 Get installed packages and install them in batch from and in R

Get installed packages from R

# obtain installed packages and their sources
library(vroom)

work_dir <- "/data/users/yangrui/lab_projs/test"
setwd(work_dir)

pkgs <- as.data.frame(installed.packages())
to_check <- pkgs$Package[is.na(pkgs$Priority)]

source_info <- lapply(to_check, function(p) {
    desc <- packageDescription(p)

    origin <- "Unknown"
    if (!is.null(desc$Repository)) {
        if (grepl("CRAN", desc$Repository, ignore.case = TRUE)) {
            origin <- "CRAN"
        } else if (grepl("Bioconductor", desc$Repository, ignore.case = TRUE)) {
            origin <- "Bioconductor"
        }
    } else if (!is.null(desc$RemoteType)) {
        origin <- desc$RemoteType
    }

    desc_url <- if (!is.null(desc$URL)) {
        desc$URL
    } else if (!is.null(desc$BugReports)) {
        desc$BugReports
    } else if (!is.null(desc$RemoteUrl)) {
        desc$RemoteUrl
    } else {
        ""
    }

    remote_username <- if (!is.null(desc$RemoteUsername)) {
        desc$RemoteUsername
    } else {
        ""
    }

    remote_repo <- if (!is.null(desc$RemoteRepo)) {
        desc$RemoteRepo
    } else {
        ""
    }

    return(data.frame(
        Package = gsub("[[:space:]]", "", p),
        Origin = gsub("[[:space:]]", "", origin),
        URL = gsub("[[:space:]]", "", desc_url),
        RemoteUsername = gsub("[[:space:]]", "", remote_username),
        RemoteRepo = gsub("[[:space:]]", "", remote_repo),
        stringsAsFactors = FALSE
    ))
})
source_df <- do.call(rbind, source_info)
source_df <- source_df[order(source_df$Origin, source_df$Package), ]

vroom_write(source_df, file = "all_packages.tsv", delim = "\t", col_names = TRUE)

Install packages in R in batch

# install packages found but not currently installed
# it is recommended to use the latest CMake
# usethis::create_github_token()
# install.packages("languageserver", Ncpus = 100, quiet = TRUE)

work_dir <- "/data/users/yangrui/lab_projs/test"
setwd(work_dir)

pkgs_file <- "all_packages.tsv"
ncpus <- 100

if (!requireNamespace("vroom", quietly = TRUE)) {
    install.packages("vroom", Ncpus = ncpus, quiet = TRUE)
}
if (!requireNamespace("remotes", quietly = TRUE)) {
    install.packages("remotes", Ncpus = ncpus, quiet = TRUE)
}
if (!require("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager", Ncpus = ncpus, quiet = TRUE)
}

pkgs <- vroom::vroom(pkgs_file, delim = "\t", col_names = TRUE)
pkgs <- pkgs[!(pkgs$Package %in% gsub("[[:space:]]", "", rownames(installed.packages()))), ]

cran_pkgs <- pkgs[grepl("CRAN", pkgs$Origin, ignore.case = TRUE), ]
bioc_pkgs <- pkgs[grepl("Bioconductor", pkgs$Origin, ignore.case = TRUE), ]
github_pkgs <- pkgs[grepl("github", pkgs$Origin, ignore.case = TRUE), ]
gitlab_pkgs <- pkgs[grepl("gitlab", pkgs$Origin, ignore.case = TRUE), ]
xgit_pkgs <- pkgs[grepl("xgit", pkgs$Origin, ignore.case = TRUE), ]
unknown_pkgs <- pkgs[!grepl("(CRAN|Bioconductor|github|gitlab|xgit)", pkgs$Origin, ignore.case = TRUE), ]

failed_log <- data.frame(Package = character(), Source = character(), Error = character(), stringsAsFactors = FALSE)
for (pkg in unique(c(cran_pkgs$Package, bioc_pkgs$Package))) {
    if (pkg %in% gsub("[[:space:]]", "", rownames(installed.packages()))) {
        next
    }

    status <- tryCatch(
        {
            BiocManager::install(pkg, Ncpus = ncpus, update = FALSE, ask = FALSE, quiet = TRUE)
            TRUE
        },
        error = function(e) {
            new_error <- data.frame(Package = pkg, Source = "CRAN/Bioconductor", Error = gsub("[[:space:]]+", " ", trimws(as.character(e$message))), stringsAsFactors = FALSE)
            failed_log <<- rbind(failed_log, new_error)
            return(FALSE)
        }
    )

    if (status) {
        message(paste0("Successfully installed CRAN/Bioconductor package: ", pkg))
    } else {
        message(paste0("Failed to install CRAN/Bioconductor package: ", pkg))
    }
}

for (i in seq_len(nrow(github_pkgs))) {
    tmp_df <- github_pkgs[i, ]
    if (!is.null(tmp_df$RemoteUsername) && tmp_df$RemoteUsername != "" && !is.null(tmp_df$RemoteRepo) && tmp_df$RemoteRepo != "") {
        if (tmp_df$Package %in% gsub("[[:space:]]", "", rownames(installed.packages()))) {
            next
        }

        status <- tryCatch(
            {
                remotes::install_github(paste0(tmp_df$RemoteUsername, "/", tmp_df$RemoteRepo), auth_token = Sys.getenv("GH_TOKEN"), upgrade = "never", quiet = FALSE)
                TRUE
            },
            error = function(e) {
                new_error <- data.frame(Package = tmp_df$Package, Source = "GitHub", Error = gsub("[[:space:]]+", " ", trimws(as.character(e$message))), stringsAsFactors = FALSE)
                failed_log <<- rbind(failed_log, new_error)
                return(FALSE)
            }
        )

        if (status) {
            message(paste0("Successfully installed GitHub package: ", tmp_df$Package))
        } else {
            message(paste0("Failed to install GitHub package: ", tmp_df$Package))
        }
    } else {
        warning(paste0("GitHub package '", tmp_df$Package, "' cannot be installed due to lacking either use name or repo name!"))
        new_error <- data.frame(Package = tmp_df$Package, Source = "GitHub", Error = "Missing username or repo name", stringsAsFactors = FALSE)
        failed_log <<- rbind(failed_log, new_error)
    }
}

for (i in seq_len(nrow(gitlab_pkgs))) {
    tmp_df <- gitlab_pkgs[i, ]
    if (!is.null(tmp_df$RemoteUsername) && tmp_df$RemoteUsername != "" && !is.null(tmp_df$RemoteRepo) && tmp_df$RemoteRepo != "") {
        if (tmp_df$Package %in% gsub("[[:space:]]", "", rownames(installed.packages()))) {
            next
        }

        status <- tryCatch(
            {
                remotes::install_gitlab(paste0(tmp_df$RemoteUsername, "/", tmp_df$RemoteRepo), upgrade = "never", quiet = FALSE)
                TRUE
            },
            error = function(e) {
                new_error <- data.frame(Package = tmp_df$Package, Source = "GitLab", Error = gsub("[[:space:]]+", " ", trimws(as.character(e$message))), stringsAsFactors = FALSE)
                failed_log <<- rbind(failed_log, new_error)
                return(FALSE)
            }
        )

        if (status) {
            message(paste0("Successfully installed GitLab package: ", tmp_df$Package))
        } else {
            message(paste0("Failed to install GitLab package: ", tmp_df$Package))
        }
    } else {
        message(paste0("GitLab package '", tmp_df$Package, "' cannot be installed due to lacking either use name or repo name!"))
        new_error <- data.frame(Package = tmp_df$Package, Source = "GitLab", Error = "Missing username or repo name", stringsAsFactors = FALSE)
        failed_log <<- rbind(failed_log, new_error)
    }
}

for (i in seq_len(nrow(xgit_pkgs))) {
    tmp_df <- xgit_pkgs[i, ]
    if (!is.null(tmp_df$URL) && tmp_df$URL != "" && grepl("\\.git$", tmp_df$URL)) {
        if (tmp_df$Package %in% gsub("[[:space:]]", "", rownames(installed.packages()))) {
            next
        }

        status <- tryCatch(
            {
                remotes::install_git(tmp_df$URL, upgrade = "never", quiet = FALSE)
                TRUE
            },
            error = function(e) {
                new_error <- data.frame(Package = tmp_df$Package, Source = "XGit", Error = gsub("[[:space:]]+", " ", trimws(as.character(e$message))), stringsAsFactors = FALSE)
                failed_log <<- rbind(failed_log, new_error)
                return(FALSE)
            }
        )

        if (status) {
            message(paste0("Successfully installed XGit package: ", tmp_df$Package))
        } else {
            message(paste0("Failed to install XGit package: ", tmp_df$Package, "."))
        }
    } else {
        message(paste0("XGit package '", tmp_df$Package, "' cannot be installed due to lacking git address!"))
        new_error <- data.frame(Package = tmp_df$Package, Source = "XGit", Error = "Missing git address", stringsAsFactors = FALSE)
        failed_log <<- rbind(failed_log, new_error)
    }
}

if (nrow(failed_log) > 0) {
    vroom::vroom_write(failed_log, file = "failed_installations.tsv", delim = "\t", col_names = TRUE)
    message("Some packages failed to install. Please check 'failed_installations.tsv' for details.")
} else {
    message("All packages installed successfully!")
}

unknown_pkgs <- unknown_pkgs[!(unknown_pkgs$Package %in% c(gsub("[[:space:]]", "", rownames(installed.packages())))), ]
if (nrow(unknown_pkgs) > 0) {
    vroom::vroom_write(unknown_pkgs, file = "unknown_source_packages.tsv", delim = "\t", col_names = TRUE)
    message("Some packages have unknown sources and have not been installed. Please check 'unknown_source_packages.tsv' for details.")
} else {
    message("All packages with unknown sources have been installed!")
}