Agilent mRNA microarray analysis using limma

1 Introduction

A little example for how to use limma to analyze Agilent mRNA microarray data.

2 Download ferret (Mustela putorius furo, 9669) reference files

wget -c https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/011/764/305/GCF_011764305.1_ASM1176430v1.1/GCF_011764305.1_ASM1176430v1.1_genomic.fna.gz
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/011/764/305/GCF_011764305.1_ASM1176430v1.1/GCF_011764305.1_ASM1176430v1.1_genomic.gff.gz
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/011/764/305/GCF_011764305.1_ASM1176430v1.1/GCF_011764305.1_ASM1176430v1.1_genomic.gtf.gz

3 Extract transcript sequences

# -w: write a fasta file with spliced exons for each transcript
gffread -w GCF_011764305.1_ASM1176430v1.1_genomic.transcripts.fna -g GCF_011764305.1_ASM1176430v1.1_genomic.fna GCF_011764305.1_ASM1176430v1.1_genomic.gff

4 Build blast database

makeblastdb -in GCF_011764305.1_ASM1176430v1.1_genomic.transcripts.fna -parse_seqids -taxid 9669 -blastdb_version 5 -title "GCF_011764305.1_ASM1176430v1.1 Ferret (Mustela putorius furo) spliced transcripts" -dbtype nucl

5 Prepare mRNA microarray data

library(GEOquery)
library(limma)
library(tidyverse)
library(vroom)

gse_accession <- "GSE60687"
out_dir <- "./data/microarray"

gse <- getGEO(gse_accession, GSEMatrix = T, getGPL = T)
sample_df <- gse[[paste0(gse_accession, "_series_matrix.txt.gz")]]@phenoData@data %>%
    mutate(
        sample_id = paste0(
            str_extract_all(description, "A(17|19)", simplify = T), ".",
            str_extract_all(description, "[A-Z]{2,}", simplify = T), ".",
            str_extract_all(description, "^\\d", simplify = T),
            str_extract_all(description, "_2", simplify = T)
        ),
        sample_file = file.path(out_dir, paste0(sample_id, ".txt.gz")),
        wget_cmd = paste0("wget -c -O ", sample_file, " ", supplementary_file)
    ) %>%
    arrange(sample_id)

sapply(sample_df$wget_cmd, function(x) {
    system(x)
})

vroom_write(sample_df, file = file.path(out_dir, "samples.tsv"))

gpl <- getGEO(gse[[paste0(gse_accession, "_series_matrix.txt.gz")]]@annotation)
gpl_fna <- filter(gpl@dataTable@table, SPOT_ID != "CONTROL") %>%
    mutate(fna = paste0(">", ID, ":", COL, ":", ROW, ":", NAME, ":", CONTROL_TYPE, ":", ACCESSION_STRING, ":", CHROMOSOMAL_LOCATION, "\n", SEQUENCE)) %>%
    pull(fna) %>%
    unique()

vroom_write(gpl@dataTable@table, file = file.path(out_dir, "gpl.tsv"))
vroom_write_lines(gpl_fna, file = file.path(out_dir, "gpl.fna"))

6 Run `blastn`

# GPL probe sequences against transcripts
blastn -task megablast -db ../genome/blastdb/GCF_011764305.1_ASM1176430v1.1_genomic.transcripts.fna -query gpl.fna -outfmt "6 qseqid sseqid evalue bitscore pident qcovs stitle" -dust no -max_target_seqs 1 -num_threads 16 -out gpl.blastn.txt

7 Attach gene symbols to GPL probes

library(rtracklayer)
library(vroom)
library(tidyverse)

gff_file <- "./data/genome/GCF_011764305.1_ASM1176430v1.1_genomic.gff"
gpl_blastn_file <- "./data/microarray/gpl.blastn.txt"
out_dir <- "./data/microarray"

gff <- as.data.frame(import(gff_file, version = "3")) %>%
    select(all_of(c(
        "seqnames", "start", "end", "width", "strand",
        "source", "type", "ID", "Dbxref", "Name", "gbkey",
        "gene", "gene_biotype", "Parent", "transcript_id"
    ))) %>%
    distinct()

gpl_blastn <- vroom(gpl_blastn_file, col_names = c("qseqid", "sseqid", "evalue", "bitscore", "pident", "qcovs", "stitle")) %>%
    distinct() %>%
    group_by(qseqid) %>%
    slice_min(evalue) %>%
    slice_max(bitscore) %>%
    slice_max(pident) %>%
    slice_max(qcovs) %>%
    ungroup()

table(duplicated(gpl_blastn$qseqid))

df <- left_join(gpl_blastn, gff, by = c("sseqid" = "ID"))
df <- left_join(df, filter(gff, type == "gene") %>%
    select(all_of(c("seqnames", "gene", "gene_biotype"))) %>%
    distinct(),
by = join_by(seqnames, gene),
suffix = c(".GPL_blastn", ".Parent_gene")
)

Dbxref_ls <- lapply(df$Dbxref, function(x) {
    if (length(x) > 0) {
        y <- strsplit(x, split = ":", fixed = T) %>%
            do.call(rbind, .)
        setNames(y[, 2], y[, 1]) %>%
            as.list() %>%
            as.data.frame()
    } else {
        data.frame()
    }
})
Dbxref_names <- lapply(Dbxref_ls, names) %>%
    unlist() %>%
    unique()
Dbxref_df <- lapply(Dbxref_ls, function(x) {
    if (nrow(x) == 0) {
        setNames(
            rep(NA, length(Dbxref_names)),
            Dbxref_names
        ) %>%
            as.list() %>%
            as.data.frame()
    } else {
        x
    }
}) %>% do.call(bind_rows, .)

names(Dbxref_df) <- paste0("Dbxref_", names(Dbxref_df))

df <- bind_cols(df, Dbxref_df)

vroom_write(df, file = file.path(out_dir, "gpl.with_gene_symbols.tsv"), delim = "\t")

8 Differential analysis using limma

library(limma)
library(vroom)
library(tidyverse)

sample_file <- "./data/microarray/samples.tsv"
gpl_file <- "./data/microarray/gpl.with_gene_symbols.tsv"
out_dir <- "./data/degs"

dir.create(out_dir)

gpl <- vroom(gpl_file, delim = "\t")
gpl$ProbeName <- sapply(gpl$qseqid, function(x) {
    strsplit(x, ":")[[1]][4]
})
gpl <- gpl %>%
    select(all_of(c(
        "ProbeName", "Dbxref_GeneID", "gene",
        "gene_biotype.GPL_blastn", "gene_biotype.Parent_gene"
    ))) %>%
    mutate(GeneBioType = if_else(is.na(gene_biotype.Parent_gene),
        gene_biotype.GPL_blastn,
        gene_biotype.Parent_gene
    )) %>%
    select(all_of(c("ProbeName", "Dbxref_GeneID", "gene", "GeneBioType"))) %>%
    distinct()
names(gpl) <- c("ProbeName", "EntrezID", "Symbol", "GeneBioType")

table(duplicated(gpl$ProbeName))

sample_df <- vroom(sample_file)

# read in data
# here, we are reading in single-channel Agilent (foreground: median signal; background: median signal) intensity data
# so source = "agilent" and green.only = T
# here, we read in the extra column gIsWellAboveBG, which records whether the intensity of each spot is considered above the background level for that array
x <- read.maimages(
    gsub(
        "~/mywd/agilent_mrna_expression_microarray_analysis_using_limma",
        ".",
        gsub("\\.gz$", "", sample_df$sample_file)
    ),
    source = "agilent", green.only = T,
    other.columns = "gIsWellAboveBG"
)
x_copy <- x

# gene annotation
x$genes <- left_join(x$genes, gpl, by = "ProbeName")
all(x$genes$ProbeName == x_copy$genes$ProbeName)

# background correction and normalization
# at this step, we need control probes to be existed in the dataset
y <- backgroundCorrect(x, method = "normexp")
y <- normalizeBetweenArrays(y, method = "quantile")

# gene filtering
# filter out control probes
Control <- y$genes$ControlType == 1L
# filter out probes without Symbol
NoSymbol <- is.na(y$genes$Symbol)
# keep probes that express in at least 3 arrays (because there are at least 3 replicates in each array)
IsExpr <- rowSums(y$other$gIsWellAboveBG > 0) >= 3

yfilt <- y[!Control & !NoSymbol & IsExpr, ]

genes_colnames <- c("EntrezID", "Symbol")
E_colnames <- colnames(yfilt$E)
exprMat <- bind_cols(
    yfilt$genes[, genes_colnames],
    as.data.frame(yfilt$E)
)
exprMat_dedup <- exprMat %>%
    arrange(Symbol) %>%
    group_by(EntrezID, Symbol) %>%
    reframe(across(everything(), mean))
yfilt$genes <- exprMat_dedup[, genes_colnames]
yfilt$E <- as.matrix(exprMat_dedup[, E_colnames])

# differential expression
treatments <- gsub("\\.[_0-9]+$", "", sample_df$sample_id)
levels <- unique(treatments)
treatments <- factor(treatments, levels = levels)
design <- model.matrix(~ 0 + treatments)
colnames(design) <- levels

fit <- lmFit(yfilt, design = design)
contrast_pairs <- expand.grid(x = levels, y = levels) %>%
    mutate(
        flag = if_else(x != y, T, F),
        pair = paste0(x, "-", y)
    ) %>%
    filter(flag) %>%
    pull(pair) %>%
    unique()
contrast_matrix <- makeContrasts(contrasts = contrast_pairs, levels = design)
fit2 <- contrasts.fit(fit, contrast_matrix)
fit2 <- eBayes(fit2, trend = T, robust = T)

for (pair in contrast_pairs) {
    res <- topTable(fit2, coef = pair, number = Inf, adjust.method = "BH", p.value = 1)
    vroom_write(res, file = file.path(out_dir, paste0(pair, ".tsv")), delim = "\t")
}

9 Filter DEGs and plot volcanos

library(vroom)
library(tidyverse)
library(YRUtils)

set_fonts()

degs_dir <- "./data/degs"
out_dir <- "./data/clean_degs/padj0.05_logfc1"
sig_colors <- c("#FF4757", "#546DE5", "#D2DAE2")
padj_th <- 0.05
logfc_th <- 1

dir.create(out_dir, recursive = T)

degs_files <- list.files(degs_dir, pattern = "\\.tsv$", full.names = T, recursive = F)
for (degs_file in degs_files) {
    pair <- strsplit(gsub("\\.[a-zA-Z0-9]+$", "", basename(degs_file)),
        split = "-", fixed = T
    )[[1]]

    degs_df <- vroom(degs_file, delim = "\t", col_names = T)
    clean_degs_df <- degs_df %>%
        group_by(Symbol) %>%
        slice_min(adj.P.Val, n = 1) %>%
        slice_max(abs(logFC), n = 1) %>%
        slice_sample(n = 1) %>%
        ungroup() %>%
        mutate(
            diff_flag = if_else(adj.P.Val < padj_th,
                if_else(logFC > logfc_th,
                    paste0(pair[1], " Up"),
                    if_else(logFC < -logfc_th,
                        paste0(pair[2], " Up"),
                        "NO"
                    )
                ),
                "NO"
            ),
            diff_flag = factor(diff_flag, levels = c(
                paste0(pair[1], " Up"),
                paste0(pair[2], " Up"),
                "NO"
            ))
        )

    vroom_write(clean_degs_df, file = file.path(out_dir, basename(degs_file)))

    if (any(duplicated(clean_degs_df$Symbol))) {
        stop("Duplicated items still existed after filtering for ", degs_file)
    }

    diff_counts <- count(clean_degs_df, diff_flag) %>%
        mutate(show_text = paste0(diff_flag, ": ", n))
    plot_title <- paste0(
        paste0(pair, collapse = "_vs_"), "\n",
        paste0(diff_counts$show_text, collapse = "    ")
    )

    p <- ggplot(clean_degs_df) +
        geom_point(aes(logFC, -log10(adj.P.Val), color = diff_flag),
            alpha = 0.5, size = 2
        ) +
        geom_vline(
            xintercept = c(-logfc_th, logfc_th),
            linewidth = 1, col = "grey25", linetype = "dashed"
        ) +
        geom_hline(
            yintercept = -log10(padj_th),
            linewidth = 1, col = "grey25", linetype = "dashed"
        ) +
        scale_color_manual(values = setNames(sig_colors, levels(clean_degs_df$diff_flag))) +
        labs(
            title = plot_title,
            x = "log2 Fold Change", y = "-log10(p.adjust)",
            color = paste0("p.adjust < ", padj_th, "\n|log2(FC)| > ", logfc_th)
        ) +
        theme_classic() +
        theme(
            plot.title = element_text(hjust = 0.5),
            axis.title.x = element_text(size = 26),
            axis.title.y = element_text(size = 26),
            axis.text.x = element_text(size = 24),
            axis.text.y = element_text(size = 24),
            legend.text = element_text(size = 24),
            legend.title = element_text(size = 26),
            text = element_text(family = "Arial")
        )

    ppreview(p, file = file.path(
        out_dir,
        gsub("\\.[a-zA-Z0-9]+$", ".pdf", basename(degs_file))
    ))
}

10 GO analysis

10.1 Prepare ferret gene IDs

library(vroom)
library(tidyverse)

gpl_file <- "./data/microarray/gpl.with_gene_symbols.tsv"
out_dir <- "./data/go"

dir.create(out_dir)

gpl <- vroom(gpl_file, delim = "\t")
gpl$ProbeName <- sapply(gpl$qseqid, function(x) {
    strsplit(x, ":")[[1]][4]
})
gpl <- gpl %>%
    select(all_of(c(
        "ProbeName", "Dbxref_GeneID", "gene",
        "gene_biotype.GPL_blastn", "gene_biotype.Parent_gene"
    ))) %>%
    mutate(GeneBioType = if_else(is.na(gene_biotype.Parent_gene),
        gene_biotype.GPL_blastn,
        gene_biotype.Parent_gene
    )) %>%
    select(all_of(c("ProbeName", "Dbxref_GeneID", "gene", "GeneBioType"))) %>%
    distinct()
names(gpl) <- c("ProbeName", "EntrezID", "Symbol", "GeneBioType")

vroom_write_lines(as.character(sort(unique(gpl$EntrezID))),
    file = file.path(out_dir, "ferret_entrezids.txt")
)

10.2 Prepare orthologous gene set between mm10 and ferret

library(vroom)
library(tidyverse)
library(magrittr)

ferret_mm_orthologs_file <- "./data/go/ferret_mm10.orthologs.tsv"
ferret_dataset_file <- "./data/go/ferret_ncbi_dataset.tsv"

ferret_dataset_df <- vroom(ferret_dataset_file) %>%
    select(all_of(c("NCBI GeneID", "Symbol", "Gene Type", "Gene Group Identifier"))) %>%
    set_colnames(c("ferret_EntrezID", "ferret_Symbol", "ferret_GeneType", "Ortholog_Group_Identifier")) %>%
    na.omit() %>%
    mutate_all(as.character) %>%
    distinct()
ferret_mm_orthologs_df <- vroom(ferret_mm_orthologs_file) %>%
    select(all_of(c("NCBI GeneID", "Symbol", "Gene Group Identifier"))) %>%
    set_colnames(c("mm10_EntrezID", "mm10_Symbol", "Ortholog_Group_Identifier")) %>%
    na.omit() %>%
    mutate_all(as.character) %>%
    distinct()
df <- inner_join(ferret_dataset_df, ferret_mm_orthologs_df, by = "Ortholog_Group_Identifier")
vroom_write(df, file = file.path(dirname(ferret_mm_orthologs_file), "ferret_mm10.orthologs.clean.tsv"))

10.3 Attach orthologous mm10 gene IDs to DEGs of ferret

library(vroom)
library(tidyverse)

ferret_mm_orthologs_file <- "./data/go/ferret_mm10.orthologs.clean.tsv"
degs_files <- c(
    "./data/clean_degs/padj0.05_logfc1/A17.VZ-A19.VZ.tsv",
    "./data/clean_degs/padj0.05_logfc1/A17.ISVZ-A19.ISVZ.tsv",
    "./data/clean_degs/padj0.05_logfc1/A17.OSVZ-A19.OSVZ.tsv"
)

ferret_mm_orthologs_df <- vroom(ferret_mm_orthologs_file) %>%
    mutate_all(as.character)
for (degs_file in degs_files) {
    vroom(degs_file) %>%
        select(all_of(c(
            "EntrezID", "Symbol",
            "logFC", "P.Value", "adj.P.Val", "diff_flag"
        ))) %>%
        mutate(EntrezID = as.character(EntrezID)) %>%
        inner_join(ferret_mm_orthologs_df, by = c("EntrezID" = "ferret_EntrezID")) %>%
        arrange(desc(logFC)) %>%
        distinct() %>%
        filter(ferret_GeneType == "PROTEIN_CODING") %>%
        vroom_write(file = gsub("tsv$", "with_mm10_IDs.tsv", degs_file))
}

11 Check the differential expression concordance between samples of mink and samples of ferret

library(vroom)
library(tidyverse)

mink_degs_file <- "H:/ubuntu_ssd_backup/projects/mink/proj/rna/degs/res/only_degs/mm10/P2_Gyr_vs_P2_Sul.txt"
degs_files <- c(
    "./data/clean_degs/padj0.05_logfc1/A17.VZ-A19.VZ.with_mm10_IDs.tsv",
    "./data/clean_degs/padj0.05_logfc1/A17.ISVZ-A19.ISVZ.with_mm10_IDs.tsv",
    "./data/clean_degs/padj0.05_logfc1/A17.OSVZ-A19.OSVZ.with_mm10_IDs.tsv"
)

mink_degs <- vroom(mink_degs_file) %>%
    select(all_of(c("final_gene_name", "diff_flag"))) %>%
    distinct()
degs_df <- tibble()
for (degs_file in degs_files) {
    tmp_df <- vroom(degs_file) %>%
        select(all_of(c("mm10_Symbol", "logFC"))) %>%
        mutate(tissue = gsub(
            "A17\\.[A-Z]+-A19\\.|\\.with_mm10_IDs\\.tsv$",
            "", basename(degs_file)
        )) %>%
        distinct()
    degs_df <- bind_rows(degs_df, tmp_df)
}
df <- inner_join(mink_degs, degs_df, by = c("final_gene_name" = "mm10_Symbol")) %>%
    mutate(
        tissue = factor(tissue, levels = c("VZ", "ISVZ", "OSVZ")),
        logFC_sign = if_else(logFC >= 0, "+", "-")
    )

ggplot(df) +
    geom_jitter(aes(tissue, logFC, color = diff_flag))

1 Introduction

2 Download ferret (Mustela putorius furo, 9669) reference files

3 Extract transcript sequences

4 Build blast database

5 Prepare mRNA microarray data

6 Run blastn

7 Attach gene symbols to GPL probes

8 Differential analysis using limma

9 Filter DEGs and plot volcanos

10 GO analysis

10.1 Prepare ferret gene IDs

10.2 Prepare orthologous gene set between mm10 and ferret

10.3 Attach orthologous mm10 gene IDs to DEGs of ferret

11 Check the differential expression concordance between samples of mink and samples of ferret

6 Run `blastn`