#!/usr/bin/bash -e
fa_file=/data/database/genome/release/tupaia_belangeri/Tupaia_belangeri.v3.TreeShrewDB.dna.primary_assembly.fa.gz
fa_file_basename=${fa_file%.fa.gz}
fa_file_basename=${fa_file_basename%.fasta.gz}
faCount_base_stats_file="${fa_file_basename}.faCount_base_stats.tsv"
if [ "${fa_file##*.}" == "gz" ]
then
pigz -k -c -d "${fa_file}" | \
faCount stdin > "${faCount_base_stats_file}"
else
faCount "${fa_file}" > "${faCount_base_stats_file}"
fi
egs_file="${fa_file_basename}.effective_genome_sizes.tsv"
cat "${faCount_base_stats_file}" | \
awk -v FS="\t" 'NF{last = $0; nf = NF} END{if (nf == 0) {print "ERROR: file is empty!" > "/dev/stderr"; exit 1} split(last, arr, FS); if (arr[1] == "total") {printf "non-Ns\t%d\n", arr[2] - arr[7]} else {print "ERROR: the 1st field of the last non-empty line is not \"total\"!" > "/dev/stderr"; exit 1}}' >> "${egs_file}"1 Introduction
For more details, refer to ENCODE ATAC-Seq pipeline from GitHub, ENCODE ATAC-Seq pipeline from Google doc, ENCODE ChIP-Seq pipeline from GitHub, and ENCODE ChIP-Seq pipeline from Google doc.
2 Calculate effective genome size
3 Extract chromosome lengths
pigz -c -k -d Rattus_norvegicus.GRCr8.Ensembl_release-115.dna_sm.primary_assembly.fa.gz | faSize -detailed stdin > Rattus_norvegicus.GRCr8.Ensembl_release-115.dna_sm.primary_assembly.chrom.sizes.tsv4 Extract mitochondrial genome if present
Recommended:
bgzip -@ 60 Mustela_putorius_furo.MusPutFur1__0_GCA_000215625__1.Ensembl_release-115.dna_sm.toplevel.fa
samtools faidx -@ 60 Mustela_putorius_furo.MusPutFur1__0_GCA_000215625__1.Ensembl_release-115.dna_sm.toplevel.fa.gz
samtools faidx Mustela_putorius_furo.MusPutFur1__0_GCA_000215625__1.Ensembl_release-115.dna_sm.toplevel.fa.gz NC_020638.1 | bgzip -@ 60 - -o Mustela_putorius_furo.MusPutFur1__0_GCA_000215625__1.Ensembl_release-115.dna_sm.mito.fa.gzAlternative:
pigz -k -c -d Rattus_norvegicus.mRatBN7.2.dna_sm.toplevel.fa.gz | faOneRecord stdin MT > Rattus_norvegicus.mRatBN7.2.dna_sm.toplevel.mito_only.fa5 Prepare Bowtie2 index of species/mitochondrial genome
#!/bin/bash -e
ref_fa=/data/database/genome/release/petaurus_breviceps/Petaurus_breviceps.v1.self.dna_sm.primary_assembly.fa.gz
bowtie2_index_dir=/data/database/ref_idx/encode/epigenome/ATAC_ChIP-seq/Petaurus_breviceps-20251016_123355
bowtie2_index_prefix=petaurus_breviceps
bowtie2_n_threads=80
tmp_dir=/data/tmp/dell_tmp
cd "${bowtie2_index_dir}"
if [ "${ref_fa##*.}" == "gz" ]
then
new_ref_fa="${tmp_dir}/$(basename ${ref_fa} .gz)"
pigz -k -c -d "${ref_fa}" > "${new_ref_fa}"
else
new_ref_fa="${ref_fa}"
fi
bowtie2-build --threads "${bowtie2_n_threads}" -f "${new_ref_fa}" "${bowtie2_index_prefix}"
ref_fa_basename=${ref_fa##*/}
ref_fa_basename=${ref_fa_basename%.fa.gz}
ref_fa_basename=${ref_fa_basename%.fasta.gz}
tar -cvf "${ref_fa_basename}.bt2_idx.tar" *.bt2
rm *.bt2
if [ "${ref_fa##*.}" == "gz" ]
then
rm "${new_ref_fa}"
fi6 Prepare transcription start sites
Note: GFF3 is 1-based coordinate system, and BED is 0-based coordinate system.
library(rtracklayer)
library(vroom)
library(tidyverse)
root_dir <- "/data/database/annotation/release"
# types of either gene or transcript, not both
targets <- c("gene")
id_cols <- c("gene_id", "ID")
# id_cols <- c("transcript_id", "ID")
gff_files <- list.files(root_dir, pattern = "(?i).+\\.(gff3?|gtf)(\\.gz)?$", full.names = TRUE, recursive = TRUE)
for (gff_file in gff_files) {
df <- import(gff_file) %>%
as.data.frame() %>%
as_tibble()
id_col <- id_cols[which(id_cols %in% names(df))][1]
if (any(targets %in% unique(df$type)) && length(id_col) == 1) {
tss_df <- df %>%
filter(type %in% targets) %>%
select(all_of(c("seqnames", "start", "end", id_col, "strand"))) %>%
mutate(
tss_start = if_else(strand %in% c("+", ".", "*"), start - 1, end - 1),
tss_end = if_else(strand %in% c("+", ".", "*"), start, end),
score = 0
) %>%
select(all_of(c("seqnames", "tss_start", "tss_end", id_col, "score", "strand"))) %>%
arrange(seqnames, tss_start, tss_end) %>%
na.omit() %>%
distinct()
output_file <- gsub("(?i)(gff3?|gtf)(\\.gz)?$",
"tss.bed",
gff_file,
perl = TRUE
)
vroom_write(
tss_df,
file = output_file,
col_names = FALSE,
append = FALSE
)
system2("/usr/bin/pigz", output_file)
} else {
warning("GFF3/GTF does not meet the conditions, skipped [", gff_file, "]")
}
}