library(GenomicFeatures)
library(tidyverse)
library(vroom)
gtf_file <- "/data/database/annotation/release/mus_musculus/Mus_musculus.GRCm39.vM38.gencode.basic.chr.annotation.gtf"
txdb <- makeTxDbFromGFF(gtf_file, format = "gtf")
genes_gr <- genes(txdb)
gene_body_lengths <- width(genes_gr)
gene_len_df <- tibble(
gene_id = names(genes_gr),
gene_length = gene_body_lengths
)
vroom_write(
gene_len_df,
file = paste0(gtf_file, ".gene_lengths.tsv"),
col_names = TRUE, append = FALSE
)1 Introduction
Used to collect various R code snippets.
2 Calculate gene lengths
2.1 Gene body lengths
Used for RNA-seq counts normalization in intron-included mode.
2.2 Union exon lengths
Used for RNA-seq counts normalization in intron-excluded mode.
library(GenomicFeatures)
library(tidyverse)
library(vroom)
gtf_file <- "/data/database/annotation/release/mus_musculus/gencode.vM23.annotation.gtf"
txdb <- makeTxDbFromGFF(gtf_file, format = "gtf")
exons_list <- exonsBy(txdb, by = "gene")
exons_reduced <- GenomicRanges::reduce(exons_list)
gene_lengths <- sum(width(exons_reduced))
gene_len_df <- tibble(
gene_id = names(gene_lengths),
gene_length = gene_lengths
)
vroom_write(
gene_len_df,
file = paste0(gtf_file, ".union_exon_lengths.tsv"),
col_names = TRUE, append = FALSE
)3 Process one-to-one orthologs of OrthoFinder
library(tidyverse)
library(vroom)
orthogroups_gene_count_file <- "/data/database/annotation/derived/orthologs/20251104_202818/OrthoFinder/Orthogroups/Orthogroups.GeneCount.tsv"
orthologs_file <- "/data/database/annotation/derived/orthologs/20251104_202818/OrthoFinder/Orthologues/Orthologues_NeoVis_KIZ_NCBIv2/NeoVis_KIZ_NCBIv2__v__MusMus_GRCm39_GENCODEvM38.tsv"
species1_tx_id_mpt_file <- "/data/database/annotation/release/neogale_vison/Neogale_vison.v1.v2.KIZ_NCBI.basic.chr.annotation.transcript_id_name_mapping_table.tsv"
species2_tx_id_mpt_file <- "/data/database/annotation/release/mus_musculus/Mus_musculus.GRCm39.vM38.gencode.basic.chr.annotation.transcript_id_name_mapping_table.tsv"
species_columns <- c("NeoVis_KIZ_NCBIv2", "MusMus_GRCm39_GENCODEvM38")
species1_tx_id_mpt_df <- vroom(species1_tx_id_mpt_file)
names(species1_tx_id_mpt_df) <- paste0(species_columns[1], ".", names(species1_tx_id_mpt_df))
species1_tx_id_mpt_df[[paste0(species_columns[1], ".transcript_id")]] <- gsub(":", "_", species1_tx_id_mpt_df[[paste0(species_columns[1], ".transcript_id")]], fixed = TRUE)
species2_tx_id_mpt_df <- vroom(species2_tx_id_mpt_file)
names(species2_tx_id_mpt_df) <- paste0(species_columns[2], ".", names(species2_tx_id_mpt_df))
species2_tx_id_mpt_df[[paste0(species_columns[2], ".transcript_id")]] <- gsub(":", "_", species2_tx_id_mpt_df[[paste0(species_columns[2], ".transcript_id")]], fixed = TRUE)
orthogroups_gene_count_df <- vroom(orthogroups_gene_count_file)
one2one_orthogroups <- orthogroups_gene_count_df$Orthogroup[rowSums(orthogroups_gene_count_df[, species_columns]) == length(species_columns)]
one2one_orthogroups <- unique(na.omit(one2one_orthogroups))
orthologs_df <- vroom(orthologs_file)
orthologs_df <- orthologs_df %>%
na.omit() %>%
filter(Orthogroup %in% one2one_orthogroups) %>%
separate_longer_delim(cols = all_of(species_columns[1]), delim = ",") %>%
separate_longer_delim(cols = all_of(species_columns[2]), delim = ",") %>%
mutate(
!!sym(species_columns[1]) := trimws(.data[[species_columns[1]]]),
!!sym(species_columns[2]) := trimws(.data[[species_columns[2]]])
)
table(orthologs_df[[species_columns[1]]] %in% species1_tx_id_mpt_df[[paste0(species_columns[1], ".transcript_id")]])
table(orthologs_df[[species_columns[2]]] %in% species2_tx_id_mpt_df[[paste0(species_columns[2], ".transcript_id")]])
df <- orthologs_df %>%
inner_join(species1_tx_id_mpt_df, by = setNames(paste0(species_columns[1], ".transcript_id"), species_columns[1])) %>%
inner_join(species2_tx_id_mpt_df, by = setNames(paste0(species_columns[2], ".transcript_id"), species_columns[2])) %>%
rename(
!!sym(paste0(species_columns[1], ".transcript_id")) := !!sym(species_columns[1]),
!!sym(paste0(species_columns[2], ".transcript_id")) := !!sym(species_columns[2])
)
df[[paste0(species_columns[1], ".gene_name")]][is.na(df[[paste0(species_columns[1], ".gene_name")]])] <- df[[paste0(species_columns[1], ".gene_id")]][is.na(df[[paste0(species_columns[1], ".gene_name")]])]
df[[paste0(species_columns[2], ".gene_name")]][is.na(df[[paste0(species_columns[2], ".gene_name")]])] <- df[[paste0(species_columns[2], ".gene_id")]][is.na(df[[paste0(species_columns[2], ".gene_name")]])]
vroom_write(
df,
file = gsub("\\.(tsv|txt)$", ".with_gene_info.tsv", orthologs_file),
col_names = TRUE, append = FALSE
)4 Retrieve data from AnnotationHub
library(AnnotationHub)
library(AnnotationDbi)
work_dir <- "/Users/yangrui/Downloads"
setwd(work_dir)
ah <- AnnotationHub()
# BiocHubsShiny::BiocHubsShiny()
file <- "org.Petaurus_breviceps_papuanus.eg.sqlite"
hubid <- "AH120027"
x <- ah[[hubid]]
saveDb(x, file = file)
# loadDb(file)5 Plot phylogenetic tree
- Case 1
Download example data from here.
library(treeio)
library(ggtree)
library(tidyverse)
library(vroom)
library(patchwork)
library(YRUtils)
font_family <- "Arial"
work_dir <- "/home/yangrui/lab_projs/lyr"
tree_file <- "omm_v12_190tax_CDS_tree.nexus"
gi_file <- "GI.tsv"
gyr_species <- c("Human", "Dolphin", "Elephant")
lis_species <- c("Marmoset", "Cat", "Manatee")
# from top to bottom
tips_order <- c(
"Human", "Marmoset", "Galago", "Tree shrew", "Europeans hrew", "Mouse",
"Dolphin", "Cat", "Chinese rufous horseshoe bat", "Black flying fox", "Star-nosed mole", "Hedgehog",
"Elephant", "Manatee", "Cape elephant shrew", "Tasmanian devil", "Opossum"
)
setwd(work_dir)
gi_df <- vroom(gi_file) %>%
mutate(
species_name = factor(species_name, levels = rev(tips_order)),
label = gsub(" +", "_", scientific_name),
highlight = if_else(species_name %in% gyr_species, "GYR",
if_else(species_name %in% lis_species, "LIS", "NA")
)
) %>%
arrange(species_name) %>%
# the column matching tip labels in tree object must be in the first column
select(label, everything())
tree <- read.nexus(tree_file)
tips_to_drop <- unique(na.omit(tree$tip.label[!(tree$tip.label %in% gi_df$label)]))
tree <- drop.tip(tree, tips_to_drop)
tree <- ape::rotateConstr(tree, gi_df$label)
p_tree <- ggtree(
tree,
branch.length = "none",
ladderize = FALSE
) %<+% gi_df +
geom_tiplab(
aes(
label = species_name,
color = highlight
),
family = font_family,
show.legend = FALSE,
size = 6
) +
scale_color_manual(values = c(
"GYR" = "red",
"LIS" = "blue",
"NA" = "black"
)) +
scale_x_continuous(position = "top") +
hexpand(1.1) +
labs(x = "Species Tree") +
theme(
text = element_text(family = font_family),
axis.title.x = element_text(
face = "bold",
size = 20
),
panel.border = element_rect(
fill = NA,
color = "grey"
)
)
p_line <- ggplot(gi_df, aes(GI, species_name, group = 1)) +
geom_path() +
geom_point(
aes(
color = highlight,
size = highlight
),
show.legend = FALSE
) +
scale_x_continuous(
position = "top",
expand = expansion(0),
breaks = 0:ceiling(max(gi_df$GI)),
limits = c(0, ceiling(max(gi_df$GI)))
) +
scale_color_manual(values = c(
"GYR" = "red",
"LIS" = "blue",
"NA" = "black"
)) +
scale_size_manual(
values = c(
"GYR" = 5,
"LIS" = 5,
"NA" = 3
)
) +
labs(x = "GI", y = NULL) +
theme_classic(base_family = font_family) +
theme(
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank(),
axis.text.x = element_text(size = 6 * 2.845),
axis.title.x = element_text(
face = "bold",
size = 20
),
panel.border = element_rect(
fill = NA,
color = "grey"
)
)
p <- p_tree + p_line +
plot_layout(
ncol = 2,
width = c(3, 1)
)
ppreview(p, file = "tree_line.pdf")6 Plot TF motif logos from PFMs
- Case 1
library(ggplot2)
library(ggseqlogo)
library(vroom)
library(unigd)
in_file <- "/data/database/tf_motifs/20251014_153919/all_motifs_rmdup.from_peca2.txt"
out_dir <- "/home/yangrui/temp"
special_chars_regex <- "[\t<>:\"/\\\\|?*[:space:]`&;\\$(){}\\[\\]#~[:punct:]]"
data <- trimws(vroom_lines(in_file, skip_empty_rows = TRUE))
table(data == "")
table(is.na(data))
ls <- list()
for (line in data) {
if (grepl("^>", line)) {
if (exists("header") && exists("pfm") && is.matrix(pfm)) {
row.names(pfm) <- c("A", "C", "G", "T")
ls[[header]] <- pfm
}
header <- gsub("_{2,}", "_", gsub(special_chars_regex, "_", gsub("^>", "", line), perl = TRUE))
pfm <- NULL
} else {
pf <- as.numeric(trimws(strsplit(line, "\\t")[[1]]))
if (length(pf) != 4 || any(is.na(pf))) {
stop("malformated line found in ", header)
}
if (is.null(pfm)) {
pfm <- matrix(pf, ncol = 1, byrow = FALSE)
} else {
pfm <- cbind(pfm, matrix(pf, ncol = 1, byrow = FALSE))
}
}
}
row.names(pfm) <- c("A", "C", "G", "T")
ls[[header]] <- pfm
plot_method <- "bits"
for (header in names(ls)) {
p <- ggseqlogo(ls[[header]], method = plot_method) +
scale_y_continuous(breaks = scales::breaks_extended(4)) +
theme(text = element_text(family = "Arial", size = 24))
ugd_save_inline(
{
print(p)
},
file = file.path(out_dir, paste0(header, ".", plot_method, ".pdf")),
width = 50 + ncol(ls[[header]]) * 30,
height = 160
)
}7 Plot genome tracks using plotgardener
- Case 1
Download example data from here.
library(plotgardener)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
library(org.Mm.eg.db)
library(vroom)
library(tidyverse)
library(unigd)
adjacentMeans <- function(x, n = 1) {
for (i in 1:n) {
idx <- seq(1, length(x) - 1)
x <- c(rbind(x[idx], (x[idx] + x[idx + 1]) / 2), x[length(x)])
}
return(x)
}
work_dir <- "/home/yangrui/temp"
setwd(work_dir)
ugd()
pageWidth <- 8
pageHeight <- 8
defaultUnits <- "inches"
pageCreate(
width = pageWidth,
height = pageHeight,
default.units = defaultUnits
)
assembly <- "mm10"
chrom <- "chr1"
chromRange <- c(56730000, 57280000)
regionWidth <- 7
defaultJust <- c("left", "top")
defaultFontSize <- 10
defaultFontColor <- "black"
defaultFontFamily <- "Arial"
genomicRegion <- pgParams(
chrom = chrom,
assembly = assembly,
chromstart = chromRange[1],
chromend = chromRange[2],
width = regionWidth,
default.units = defaultUnits,
just = defaultJust,
fontsize = defaultFontSize,
fontcolor = defaultFontColor,
fontfamily = defaultFontFamily
)
gpX <- 0.5
gpY <- 1.4
gpHeight <- 0.5
genesPlot <- plotGenes(
params = genomicRegion,
x = gpX, y = gpY,
height = gpHeight,
geneHighlights = data.frame(
"gene" = c("Satb2", "9130024F11Rik", "BC055402"),
"color" = c("steelblue", "white", "white")
),
strandLabels = FALSE
)
aglX <- 0.5
aglY <- 0.5
defaultLineWidth <- 1
annoGenomeLabel(
plot = genesPlot,
x = aglX, y = aglY,
scale = "Mb",
at = adjacentMeans(chromRange, 2),
lwd = defaultLineWidth,
params = genomicRegion
)
peaksFile <- "pscnes.bed"
ppX <- 0.5
ppY <- 1
ppHeight <- 0.15
peaksPlot <- plotRanges(
data = peaksFile,
x = ppX, y = ppY,
collapse = TRUE,
height = ppHeight,
fill = "black",
linecolor = "fill",
params = genomicRegion
)
hicLoopsFile <- "hic_loops.bed"
hicLoops_df <- vroom(hicLoopsFile, col_names = c("chrom1", "start1", "end1", "name")) %>%
separate_wider_delim(cols = "name", delim = regex("[-:,]"), names = c("chrom2", "start2", "end2", "score")) %>%
type_convert() %>%
mutate(
length = start2 - start1,
height = length / max(length)
)
apX <- 0.5
apY <- 2
apHeight <- 0.6
archesPlot <- plotPairsArches(
data = hicLoops_df,
params = genomicRegion,
fill = colorby("score", palette = colorRampPalette(c("#FFE0B2", "#E65100"))),
linecolor = "fill",
archHeight = "height",
alpha = 1,
x = apX, y = apY,
height = apHeight,
flip = TRUE,
baseline = TRUE,
baseline.color = "grey25",
baseline.lwd = 1
)
ahlX <- 1
ahlY <- 3
ahlWidth <- 0.1
ahlHeight <- 1
ahlBreaks <- seq(0, 1.4, 0.35)
annoHeatmapLegend(
plot = archesPlot,
x = ahlX, y = ahlY,
width = ahlWidth, height = ahlHeight,
orientation = "v",
fontsize = 0,
scientific = TRUE,
ticks = TRUE,
digits = 2,
breaks = ahlBreaks,
params = genomicRegion
)
plotText(
label = as.character(ahlBreaks),
x = ahlX + ahlWidth + 0.1,
y = scales::rescale(ahlBreaks,
to = c(ahlY + ahlHeight, ahlY)
),
just = c("left", "center"),
params = genomicRegion
)
plotText(
label = "score",
params = genomicRegion,
x = ahlX + ahlWidth / 2,
y = ahlY - 0.2,
just = c("center", "center"),
fontface = "bold"
)
plotSegments(
x0 = c(ppX, gpX, apX) - 0.1,
y0 = c(ppY + ppHeight / 2, gpY + gpHeight / 2, apY + apHeight / 2) - 0.25,
x1 = c(ppX, gpX, apX) - 0.1,
y1 = c(ppY + ppHeight / 2, gpY + gpHeight / 2, apY + apHeight / 2) + 0.25,
lwd = 1,
linecolor = "black"
)
plotText(
label = c("PSCNEs", "Genes", "Links"),
params = genomicRegion,
x = c(ppX, gpX, apX) - 0.3,
y = c(ppY + ppHeight / 2, gpY + gpHeight / 2, apY + apHeight / 2),
rot = 90,
just = c("center", "center")
)
pageGuideHide()
ugd_save(
file = "genome_tracks.pdf",
width = 800, height = 800
)
dev.off()8 Output FASTA sequences with fixed-width wrapping
str <- paste0(c("ATCG", rep("N", 30), "ATCG"), collapse = "")
width <- 60
first_line <- ">20251013_204905_e8834659-1df1-4ae5-a838-86910aa57e31"
out_file <- "/home/dell/test/test.fa"
width <- ifelse(nchar(str) < width, nchar(str), width)
starts <- seq(1, nchar(str), width)
ends <- seq(width, nchar(str), width)
if (tail(ends, 1) != nchar(str)) ends <- c(ends, nchar(str))
substrs <- c(first_line, substring(str, starts, ends))
writeLines(substrs, con = out_file)
elem_num <- length(starts)
max_length <- nchar(tail(ends, 1))
message(paste0(
c(
paste0(stringr::str_pad(head(starts, 2), max_length, side = "right"), " - ", stringr::str_pad(head(ends, 2), max_length, side = "right")),
if (elem_num > 5) paste0(stringr::str_pad("...", max_length, side = "right"), " - ", stringr::str_pad("...", max_length, side = "right")) else NULL,
if (elem_num > 5) paste0(stringr::str_pad(tail(starts, 2), max_length, side = "right"), " - ", stringr::str_pad(tail(ends, 2), max_length, side = "right")) else NULL
),
collapse = "\n"
))9 Plot volcano plot in batch
library(tidyverse)
library(vroom)
library(magrittr)
library(ggrepel)
library(ggprism)
library(YRUtils)
input_dir <- "/home/yangrui/mywd/wutintin_transcriptome/input"
output_dir <- "/home/yangrui/mywd/wutintin_transcriptome/output"
top_n <- 10
gene_id_column <- "gene_name"
diff_flag_column <- "diff_flag"
logfc_column <- "log2FoldChange"
padj_column <- "padj"
color_column <- "diff_flag"
logfc_threshold <- 1
padj_threshold <- 0.05
# Up, Down, No
diff_flag_colors <- c("magenta4", "cyan4", "grey25")
geom_point_alpha <- 0.5
geom_point_size <- 2
geom_line_linewidth <- 1
geom_line_color <- "grey25"
geom_line_linetype <- "dashed"
geom_text_alpha <- 1
min_segment_length <- 3
geom_text_size <- 5
theme_prism_panel_border <- TRUE
font_family <- "Arial"
theme_base_font_size <- 16
files <- list.files(input_dir, pattern = "\\.(tsv|txt|csv)$", full.names = TRUE, recursive = FALSE)
for (file in files) {
data <- vroom(file) %>%
select(all_of(c(gene_id_column, diff_flag_column, logfc_column, padj_column))) %>%
set_colnames(c("gene_id", "diff_flag", "logFC", "padj")) %>%
distinct()
anno_data <- data %>%
filter(diff_flag != "NO") %>%
group_by(diff_flag) %>%
slice_max(abs(logFC), n = top_n)
diff_flag_count_table <- count(data, diff_flag) %>%
arrange(diff_flag)
plot_title <- paste0(paste0(
diff_flag_count_table$diff_flag, ": ",
diff_flag_count_table$n
), collapse = "\n")
p <- ggplot(data, aes(logFC, -log10(padj), color = diff_flag)) +
geom_point(alpha = geom_point_alpha, size = geom_point_size) +
geom_vline(
xintercept = c(-logfc_threshold, logfc_threshold),
linewidth = geom_line_linewidth,
color = geom_line_color,
linetype = geom_line_linetype
) +
geom_hline(
yintercept = -log10(padj_threshold),
linewidth = geom_line_linewidth,
color = geom_line_color,
linetype = geom_line_linetype
) +
geom_label_repel(
data = anno_data,
mapping = aes(logFC, -log10(padj), color = diff_flag, label = gene_id),
max.overlaps = 10000, show.legend = FALSE, alpha = geom_text_alpha,
min.segment.length = min_segment_length, size = geom_text_size
) +
labs(
title = plot_title,
x = "logFC", y = "-log10(padj)",
color = paste0("padj < ", padj_threshold, "\nlogFC > ", logfc_threshold)
) +
theme_prism(border = theme_prism_panel_border, base_family = font_family, base_size = theme_base_font_size) +
theme(legend.title = element_text()) +
scale_color_manual(values = setNames(
diff_flag_colors,
c(paste0(strsplit(gsub("\\.(tsv|txt|csv)$", "", basename(file)), "_vs_", fixed = TRUE)[[1]], " Up"), "NO")
))
ppreview(p, file = file.path(output_dir, gsub("\\.(tsv|txt|csv)$", ".pdf", basename(file))))
}10 Visualize scRNA-Seq expression of a single gene using dot plot and violin plot of Seurat
library(Seurat)
library(tidyverse)
library(YRUtils)
rna_rds_file <- "/data/users/yangrui/sc_omics_ref_datasets/human/dataset_2/ana/res/rna_to_umap.rds"
output_dir <- "/home/yangrui/temp"
target_gene_names <- "BTN3A2"
target_cell_types <- c("Exc. GluN1" = "GluN1", "Exc. GluN2" = "GluN2", "Exc. GluN3" = "GluN3", "Exc. GluN4" = "GluN4", "Exc. GluN5" = "GluN5", "Exc. GluN6" = "GluN6", "Exc. GluN7" = "GluN7", "Exc. GluN8" = "GluN8", "CGE IN " = "CGE IN", "MGE IN " = "MGE IN", "Oligo" = "OPC/Oligo", "MG" = "MG")
rna <- readRDS(rna_rds_file)
echo_vec(sort(unique(rna[[]]$Name)))
# dot plot
p <- DotPlot(rna, features = target_gene_names, idents = target_cell_types) +
RotatedAxis() +
scale_y_discrete(
limits = rev(target_cell_types),
labels = rev(names(target_cell_types))
) +
labs(x = NULL, y = NULL) +
theme(text = element_text(family = "Arial"))
ppreview(p, file = file.path(output_dir, "dotplot.pdf"))
# violin plot
ident_colors <- scales::hue_pal()(length(target_cell_types))
p <- VlnPlot(rna, features = target_gene_names, idents = target_cell_types, pt.size = 0, cols = ident_colors)
p <- p + geom_jitter(mapping = aes(color = ident), data = p$data, size = 1) +
scale_x_discrete(
limits = target_cell_types,
labels = names(target_cell_types)
) +
scale_y_continuous(
expand = expansion(0),
limits = c(0, ceiling(max(p$data[[target_gene_names]])))
) +
labs(x = NULL, y = NULL, color = "Cell Type") +
guides(fill = "none") +
theme(text = element_text(family = "Arial"))
ppreview(p, file = file.path(output_dir, "violinplot.pdf"))11 LaTeX spaces
| `a b` | \(a \qquad b\) | Two m widths |
| `a b` | \(a \quad b\) | One m width |
a \ b |
\(a \ b\) | \(\frac{1}{3}\) m widths |
a \; b |
\(a \; b\) | \(\frac{2}{7}\) m widths |
a \, b |
\(a \, b\) | \(\frac{1}{6}\) m widths |
ab |
\(ab\) | No space |
a \! b |
\(a \! b\) | \(-\frac{1}{6}\) m widths |
\hspace{length} |
\(a \hspace{1cm} b\) | Horizontal space of specified length |
12 clusterProfiler enrichment results interpretation
超几何分布:
假定在 \(N\) 件产品中有 \(M\) 件不合理,即不合格率 \(p=\frac{M}{N}\)。
现在产品中随机抽取 \(n\) 件做检查,发现 \(k\) 件不合格品的概率为
\[P(X=k)=\frac{C^k_M C^{n-k}_{N-M}}{C^n_N},\ k=t, t+1, ...,s,\ s=\min(M, n),\ t=n-(N-M) \ge 0\]
在做 GO 富集时,记 \(N\) 为背景基因集大小(例如可用样本中所有表达的基因或 GO 数据库中的所有基因作为背景基因集),\(m\) 为某一通路下的基因数,\(n\) 为输入做富集的基因数,\(k\) 为属于该通路下的基因数,则依据超几何分布,我们可以计算如下指标:
p.adjust: adjusted p value (i.e. adjusted \(P(X=k)\)).Count: \(k\).Gene ratio: \(\frac{k}{n}\).Background ratio: \(\frac{m}{N}\).Rich factor: \(\frac{k}{m}\).Fold enrichment: \(\frac{\text{Gene ratio}}{\text{Background ratio}}\).
13 Sort GO terms of two clusters based on RichFactor and p.adjust and visualize them using dot plot
library(tidyverse)
library(vroom)
library(YRUtils)
go_file <- "go_bp.tsv"
cluster_levels <- c("P2-Sul Up", "P10-Sul Up")
output_dir <- "~/temp"
go_df <- vroom(go_file) %>%
mutate(Cluster = factor(Cluster, levels = cluster_levels))
# classify terms into five categories based on RichFactor
sample_flag_levels <- c(paste0(cluster_levels[1], c("@Specific", "@Up")), paste0(paste0(cluster_levels, collapse = "_vs_"), "@Equal"), paste0(cluster_levels[2], c("@Up", "@Specific")))
sample_flag_df <- go_df %>%
pivot_wider(id_cols = "ID", names_from = "Cluster", values_from = "RichFactor") %>%
mutate(
SampleFlag = if_else(is.na(.[[cluster_levels[1]]]), paste0(cluster_levels[2], "@Specific"), if_else(is.na(.[[cluster_levels[2]]]), paste0(cluster_levels[1], "@Specific"), if_else(.[[cluster_levels[1]]] > .[[cluster_levels[2]]], paste0(cluster_levels[1], "@Up"), if_else(.[[cluster_levels[1]]] < .[[cluster_levels[2]]], paste0(cluster_levels[2], "@Up"), paste0(cluster_levels[1], "_vs_", cluster_levels[2], "@Equal"))))),
SampleFlag = factor(SampleFlag, levels = sample_flag_levels)
) %>%
pivot_longer(cols = !all_of(c("ID", "SampleFlag")), names_to = "Cluster", values_to = "RichFactor") %>%
na.omit() %>%
distinct()
# sort terms based on RichFactor and p.adjust
df <- inner_join(sample_flag_df, distinct(select(go_df, Cluster, ID, p.adjust)), by = c("Cluster", "ID")) %>%
arrange(
SampleFlag,
case_when(
Cluster == cluster_levels[1] ~ -RichFactor,
Cluster == cluster_levels[2] ~ RichFactor,
TRUE ~ RichFactor
),
case_when(
Cluster == cluster_levels[1] ~ -p.adjust,
Cluster == cluster_levels[2] ~ p.adjust,
TRUE ~ p.adjust
)
)
go_df <- go_df %>%
mutate(ID = factor(ID, levels = unique(df[["ID"]]))) %>%
arrange(ID) %>%
mutate(Description = factor(Description, levels = rev(unique(Description))))
p <- ggplot(go_df, aes(Cluster, Description, size = RichFactor, color = p.adjust)) +
geom_point() +
scale_color_gradient(low = "#e06663", high = "#327eba") +
scale_y_discrete(position = "right") +
guides(x = guide_axis(angle = 90)) +
labs(x = NULL, y = NULL) +
theme_bw(base_size = 18, base_family = "Arial")
ppreview(p, file = file.path(output_dir, "temp.pdf"))14 Retrieve UniProt entries using REST API in batch
library(vroom)
library(tidyverse)
library(glue)
library(rvest)
# no more than 25
max_length <- 20
organism_id <- 9606
url_template <- "https://rest.uniprot.org/uniprotkb/search?query=({accession_str})%20AND%20(active:true)%20AND%20(organism_id:{organism_id})&fields=accession,gene_primary,organism_name,organism_id,protein_name,annotation_score,protein_existence,reviewed,ft_intramem,cc_subcellular_location,ft_topo_dom,ft_transmem,go,go_p,go_c,go_f,cc_interaction&format=tsv"
file <- "test.tsv"
accessions <- vroom(file) %>%
pull(Entry) %>%
na.omit() %>%
unique()
accession_ls <- split(accessions, ceiling(seq_along(accessions) / max_length))
df <- tibble()
for (accession_vec in accession_ls) {
accession_str <- paste0(paste0("accession:", accession_vec), collapse = "%20OR%20")
url_instance <- glue(url_template)
e <- try(item_raw_text <- read_html(url_instance) %>% html_text(), silent = T)
if ("try-error" %in% class(e)) {
message("invalid accession string: ", accession_str)
} else {
item_df <- vroom(I(item_raw_text), delim = "\t") %>%
mutate(
`Organism (ID)` = as.integer(`Organism (ID)`),
Annotation = as.integer(Annotation)
)
df <- bind_rows(df, item_df)
}
}
df <- distinct(df)15 Visualize multiple sequence alignments together with phylogenetic tree
For example data, see here.
library(Biostrings)
library(tidyverse)
library(YRUtils)
library(treeio)
library(ggtree)
library(aplot)
work_dir <- "/data/users/yangrui/lab_projs/yeyaxin"
setwd(work_dir)
fa_file <- "pscnes.fa"
tree_file <- "species.nwk"
trim_n <- 100
output_file <- "output.pdf"
# use nwk_name to bridge tree plot and heatmap
species_df <- tibble(
show_name = c("Homo Sapiens", "Mus Musculus", "Sus Scrofa", "Tursiops Truncatus", "Felis Catus", "Ailuropoda Melanoleuca", "Odobenus Rosmarus Divergens", "Erinaceus Europaeus", "Sorex Araneus", "Loxodonta Africana", "Dasypus Novemcinctus", "Monodelphis Domestica", "Chelonia Mydas", "Columba Livia"),
fa_name = c("hg38", "mm10", "susScr3", "turTru2", "felCat8", "ailMel1", "odoRosDiv1", "eriEur2", "sorAra2", "loxAfr3", "dasNov3", "monDom5", "cheMyd1", "colLiv1"),
nwk_name = gsub(" ", "_", str_to_sentence(show_name))
)
tree <- read.newick(tree_file)
tree <- ape::rotateConstr(tree, rev(species_df$nwk_name))
seqs <- as.character(readDNAStringSet(fa_file, format = "fasta"))
seqs <- sapply(seqs, function(seq) substr(seq, trim_n + 1, nchar(seq) - trim_n))
base_mat <- strsplit(seqs, "") %>% do.call(rbind, .)
colnames(base_mat) <- paste0("col", seq_len(ncol(base_mat)))
uniq_bases <- c(sort(unique(as.character(base_mat)), decreasing = TRUE), "=")
uniq_base_colors <- setNames(c(scales::hue_pal()(length(uniq_bases) - 1), "grey"), uniq_bases)
color_mat <- apply(base_mat, 2, function(c) {
ref_base <- names(sort(-table(c)))[1]
c[c == ref_base] <- "="
return(c)
})
df <- inner_join(
as.data.frame(as.table(base_mat)),
as.data.frame(as.table(color_mat)),
by = c("Var1", "Var2"), suffix = c(".base", ".color")
) %>%
inner_join(species_df, by = c("Var1" = "fa_name")) %>%
mutate(
nwk_name = factor(nwk_name, levels = rev(species_df$nwk_name)),
Freq.color = as.factor(Freq.color)
)
y_label_df <- distinct(df[, c("show_name", "nwk_name")])
p_base <- ggplot(df, aes(x = Var2, y = nwk_name, fill = Freq.color)) +
geom_raster(alpha = 0.8) +
geom_text(aes(label = Freq.base),
family = "Courier New", fontface = "bold",
size.unit = "pt", size = 20
) +
scale_y_discrete(
position = "right",
labels = setNames(y_label_df$show_name, y_label_df$nwk_name)
) +
scale_fill_manual(values = uniq_base_colors) +
coord_fixed(ratio = 1.6) +
theme_minimal(base_family = "Arial") +
theme(
axis.text = element_text(size = 20, color = "black"),
legend.text = element_text(size = 20, family = "Courier New", face = "bold", color = "black"),
legend.title = element_blank(),
panel.grid = element_blank(),
axis.text.x = element_blank(),
axis.title = element_blank()
)
p_tree <- ggtree(
tree,
ladderize = FALSE
)
p <- p_base %>% insert_left(p_tree, width = 0.6)
ppreview(
p,
file = output_file,
width = 3300, height = 1000
)