#!/usr/bin/bash -e
# for processing long reads only: add -L
# for processing both short and long reads: add --mix
# for strand-specific library: add either --rf or --fr
ref_anno=/data/database/annotation/release/mustela_putorius_furo/Mustela_putorius_furo.MusPutFur1__0_GCA_000215625__1.MusPutFur1__0__115.Ensembl_release-115.basic.chr.annotation.gtf
read_alns_bam_dir=/data/users/dell/temp/tmp_downloads/encode_bulk_rna_seq_pipeline-6484b718c648ebe4
num_threads=32
out_dir=/data/users/dell/temp/tmp_downloads/stringtie_outs
mkdir -p "${out_dir}"
cd "${out_dir}"
while IFS= read -r -d '' read_alns_bam
do
sample_name=$(basename "${read_alns_bam}" _genome.bam)
out_gtf="${sample_name}".stringtie.gtf
echo "processing sample: ${sample_name}"
echo "input BAM: ${read_alns_bam}"
stringtie -o "${out_gtf}" -p "${num_threads}" -G "${ref_anno}" -l "${sample_name}" "${read_alns_bam}"
done < <(find "${read_alns_bam_dir}" -name "*_genome.bam" -print0)1 Bulk RNA-Seq + STAR + StringTie
Assemble transcripts:
Here, I use ENCODE Bulk RNA-Seq pipeline to generate genome-coordinated BAM files from FASTQ files.
Merge all GTF files:
#!/usr/bin/bash -e
ref_anno=/data/database/annotation/release/mustela_putorius_furo/Mustela_putorius_furo.MusPutFur1__0_GCA_000215625__1.MusPutFur1__0__115.Ensembl_release-115.basic.chr.annotation.gtf
assembly_gtf_dir=/data/users/dell/temp/tmp_downloads/stringtie_outs
out_dir=/data/users/dell/temp/tmp_downloads/stringtie_merged
mkdir -p "${out_dir}"
cd "${out_dir}"
stringtie_gtf_list_file=stringtie_gtf_list.txt
find "${assembly_gtf_dir}" -name "*.stringtie.gtf" > "${stringtie_gtf_list_file}"
num_files=$(wc -l < "${stringtie_gtf_list_file}")
if [ "${num_files}" -eq 0 ]
then
echo "ERROR: no GTF files found in ${assembly_gtf_dir}"
exit 1
fi
echo "${num_files} GTF files found"
out_gtf=stringtie.merged.gtf
stringtie --merge -o "${out_gtf}" -G "${ref_anno}" "${stringtie_gtf_list_file}"Standardize merged GTF file:
library(tidyverse)
library(rtracklayer)
work_dir <- "/data/users/dell/temp/tmp_downloads"
setwd(work_dir)
stringtie_merged_gtf_file <- "stringtie_merged/stringtie.merged.gtf"
ref_anno_gtf_file <- "/data/database/annotation/release/mustela_putorius_furo/Mustela_putorius_furo.MusPutFur1__0_GCA_000215625__1.MusPutFur1__0__115.Ensembl_release-115.basic.chr.annotation.gtf"
stringtie_merged_gtf <- import(stringtie_merged_gtf_file, format = "gtf")
ref_anno_gtf <- import(ref_anno_gtf_file, format = "gtf")
ref_anno_gtf_df <- as_tibble(as.data.frame(ref_anno_gtf))
gene_id_name_df <- distinct(na.omit(ref_anno_gtf_df[, c("gene_id", "gene_name")]))
gene_id_biotye_df <- distinct(na.omit(ref_anno_gtf_df[, c("gene_id", "gene_biotype")]))
stringtie_merged_gtf$gene_id[!is.na(stringtie_merged_gtf$ref_gene_id)] <- stringtie_merged_gtf$ref_gene_id[!is.na(stringtie_merged_gtf$ref_gene_id)]
stringtie_merged_gtf$gene_name <- gene_id_name_df$gene_name[match(stringtie_merged_gtf$gene_id, gene_id_name_df$gene_id)]
stringtie_merged_gtf$gene_name[is.na(stringtie_merged_gtf$gene_name)] <- stringtie_merged_gtf$gene_id[is.na(stringtie_merged_gtf$gene_name)]
stringtie_merged_gtf$gene_biotype <- gene_id_biotye_df$gene_biotype[match(stringtie_merged_gtf$gene_id, gene_id_biotye_df$gene_id)]
stringtie_merged_gtf$gene_biotype[is.na(stringtie_merged_gtf$gene_biotype)] <- "protein_coding"
stringtie_merged_gtf <- stringtie_merged_gtf[stringtie_merged_gtf@strand %in% c("+", "-")]
export(stringtie_merged_gtf, con = gsub("\\.gtf$", ".tidy.gtf", stringtie_merged_gtf_file), format = "gtf")2 Appendixes
2.1 Rename paired-end FASTQ files based on info from NCBI GEO
# rename paired-end FASTQ files based on info from NCBI GEO
library(vroom)
library(tidyverse)
work_dir <- "/data/users/dell/temp/tmp_downloads"
setwd(work_dir)
sra_run_table_file <- "SraRunTable.csv"
fq_name_pattern <- "^SRR[0-9]+_[12]\\.fastq$"
sra_run_table_df <- vroom(sra_run_table_file)
fq_files <- list.files(".", pattern = fq_name_pattern, full.names = TRUE, recursive = TRUE)
fq_df <- tibble(
fq_file = fq_files,
Run = gsub("_[12]\\.fastq$", "", basename(fq_file)),
read_type = gsub("^SRR[0-9]+_|\\.fastq$", "", basename(fq_file))
)
df <- inner_join(fq_df, sra_run_table_df, by = "Run") %>%
mutate(new_fq_file = paste0(bio_sample, ".R", read_type, ".fastq"))
ok <- file.rename(df$fq_file, df$new_fq_file)
if (all(ok)) {
message("all are well done")
} else {
stop("some tasks were failed")
}
ok <- sapply(df$new_fq_file, function(x) system2(command = "pigz", args = x))
if (all(ok == 0)) {
message("all are well done")
} else {
stop("some tasks were failed")
}
vroom_write(df, file = "metadata.tsv")2.2 Use RSeQC to infer the strandedness of RNA-Seq library
# generate reference gene model in BED format
gtfToGenePred Mustela_putorius_furo.MusPutFur1__0_GCA_000215625__1.MusPutFur1__0__115.Ensembl_release-115.basic.chr.annotation.gtf Mustela_putorius_furo.MusPutFur1__0_GCA_000215625__1.MusPutFur1__0__115.Ensembl_release-115.basic.chr.annotation.genePred
genePredToBed Mustela_putorius_furo.MusPutFur1__0_GCA_000215625__1.MusPutFur1__0__115.Ensembl_release-115.basic.chr.annotation.genePred Mustela_putorius_furo.MusPutFur1__0_GCA_000215625__1.MusPutFur1__0__115.Ensembl_release-115.basic.chr.annotation.bed
# infer the strandedness
infer_experiment.py -i /data/users/dell/temp/tmp_downloads/encode_bulk_rna_seq_pipeline-6484b718c648ebe4/neuron/align/rep1/rep1neuron_genome.bam -r /data/database/annotation/release/mustela_putorius_furo/Mustela_putorius_furo.MusPutFur1__0_GCA_000215625__1.MusPutFur1__0__115.Ensembl_release-115.basic.chr.annotation.bed
# results interpretation
# non-strand-specific
This is PairEnd Data
Fraction of reads failed to determine: 0.0257
Fraction of reads explained by "1++,1--,2+-,2-+": 0.4803
Fraction of reads explained by "1+-,1-+,2++,2--": 0.4940
# strand-specific in fr mode
# 其中一个模式占比远超 90%
# 另一个模式几乎为 0
This is PairEnd Data
Fraction of reads failed to determine: 0.0072
Fraction of reads explained by "1++,1--,2+-,2-+": 0.9441
Fraction of reads explained by "1+-,1-+,2++,2--": 0.0487结果解读:
1++ 代表 R1 在 + 链,转录本也在 + 链(R1 以 1st strand cDNA 所在链为模板合成,即其方向与转录本方向总是相同的);2+- 代表 R2 在 + 链,转录本在 - 链(R2 以 mRNA 所在链为模板合成,即其方向与转录本方向总是相反的);其余同理。
对于 strand-specific RNA-Seq library,R1/R2 的合成链是和转录本所在链或其互补链绑定的,即如果 R1 是以 mRNA 所在链为模板合成的,则其方向和转录本的方向相反,也即 1+-/1-+,此时 R2 是以 1st strand cDNA 所在链为模板合成的,则其方向和转录本的方向相同,也即 2++/2–;同理如果反过来,即为 1++/1– 和 2+-/2-+。即对于 strand-specific RNA-Seq,有两种情况:R1 映射到 reverse strand(相对于转录本所在链),即 rf mode;R1 映射到 forward strand,即 fr mode。
对于转录本组装(如 StringTie/Trinity)或定量(如 RSEM/featureCounts)软件,如果是 strand-specific RNA-Seq library,需明确指定。