#!/usr/bin/env python3
# tag_bam_by_mapq_gx.py
import pysam
import sys
import time
def tag_bam(input_bam, output_bam, threads=1):
    """Annotate every read in a BAM file with a GS tag derived from MAPQ and GX.

    GS values written (type Z):
      "F" - read is not uniquely mapped (MAPQ != 255)
      "G" - uniquely mapped and assigned to a gene (GX present and != "-")
      "I" - uniquely mapped but intergenic (GX missing or "-")

    Parameters
    ----------
    input_bam : str
        Path to the input BAM (opened "rb").
    output_bam : str
        Path to the output BAM (opened "wb", header copied from the input).
    threads : int, optional
        Number of pysam (de)compression threads (default: 1).

    Exits the process with status 1 on any error while streaming the BAM.
    """
    GENE_TAG = "GX"
    NEW_TAG = "GS"
    count_total = 0
    count_gene = 0
    count_intergenic = 0
    count_failed = 0
    print(f"[INFO] Start dealing with {input_bam} ...")
    start_time = time.time()
    try:
        # Apply `threads` to the reader as well so decompression also benefits.
        with pysam.AlignmentFile(input_bam, "rb", threads=threads) as bam_in, \
             pysam.AlignmentFile(
                 output_bam, "wb", template=bam_in, threads=threads
             ) as bam_out:
            for read in bam_in:
                count_total += 1
                if read.mapping_quality != 255:
                    # MAPQ 255 conventionally marks uniquely mapped reads
                    # (e.g. STAR) — TODO confirm for your aligner.
                    read.set_tag(NEW_TAG, "F", value_type="Z")
                    count_failed += 1
                elif read.has_tag(GENE_TAG) and read.get_tag(GENE_TAG) != "-":
                    read.set_tag(NEW_TAG, "G", value_type="Z")
                    count_gene += 1
                else:
                    # No GX tag at all, or GX == "-": treat as intergenic.
                    read.set_tag(NEW_TAG, "I", value_type="Z")
                    count_intergenic += 1
                bam_out.write(read)
                if count_total % 1000000 == 0:
                    print(f"[INFO] {count_total:,} reads dealed")
    except Exception as e:
        print(f"[ERROR] Unknown error occurred: {e}")
        sys.exit(1)
    end_time = time.time()
    print(f"[INFO] Dealing with {input_bam} done!")
    print(f"\n[INFO] Elapsed time: {end_time - start_time:.2f} second(s)")
    print(f"[INFO] Output file: {output_bam}")
    print(f"[INFO] Total reads: {count_total:,}")
    # Guard the percentage maths against an empty BAM (count_total == 0 would
    # otherwise raise ZeroDivisionError after an otherwise successful run).
    denom = count_total if count_total else 1
    print(
        f"[INFO] Reads uniquely mapped to genes (G): {count_gene:,} ({count_gene/denom:.2%})"
    )
    print(
        f"[INFO] Reads uniquely mapped to intergenic regions (I): {count_intergenic:,} ({count_intergenic/denom:.2%})"
    )
    print(
        f"[INFO] Reads not uniquely mapped to genome (MAPQ != 255) (F): {count_failed:,} ({count_failed/denom:.2%})"
    )
    print(
        f"\n[INFO] Next step: you should create the index file for the output BAM file yourself (samtools index)"
    )
if __name__ == "__main__":
    # Default pysam thread count; overridden by the optional third argument.
    threads = 1
    if len(sys.argv) < 3 or len(sys.argv) > 4:
        print(f"\nUsage: python {sys.argv[0]} <input.bam> <output.bam> [<threads>]")
        print("threads <INT>: the number of threads/cores used by pysam (default: 1)\n")
        sys.exit(1)
    input_bam = sys.argv[1]
    output_bam = sys.argv[2]
    if len(sys.argv) == 4:
        try:
            threads = int(sys.argv[3])
            # Enforce what the error message promises: values < 1 are invalid.
            if threads < 1:
                raise ValueError(threads)
        except ValueError:
            print(
                f"[ERROR] Argument 'threads' {sys.argv[3]} must be a positive integer"
            )
            sys.exit(1)
        else:
            print(f"[INFO] {threads} threads will be used")
    else:
        print(f"[INFO] Argument 'threads' is not given, using the default: {threads}")
    tag_bam(input_bam, output_bam, threads=threads)

2 Merge genome annotations
# Enter the AGAT container, binding the current directory to /data inside it.
singularity shell -B "${PWD}":/data /data/softwares/agat/agat_1.5.0--pl5321hdfd78af_0.sif
# Merge the two GFF3 annotation sets (NCBI pipeline + KIZ ShengLab) into one file.
agat_sp_merge_annotations.pl --gff Neovision_vision.v1.v1.NCBI_Pipeline.basic.chr.annotation.gff3 --gff Neovision_vision.v1.v1.KIZ_ShengLab.basic.chr.annotation.gff3 --out merged.gff3
# phase correction, etc.
gffread merged.gff3 -o tidy.gff3 -O -F --keep-exon-attrs --keep-genes
3 Download SRA files from NCBI GEO in batch
For detailed info, see https://github.com/ncbi/sra-tools/wiki and https://github.com/ncbi/sra-tools/wiki/08.-prefetch-and-fasterq-dump.
# Download every SRA accession listed (one per line) in SRR_Acc_File via prefetch.
SRR_Acc_File=SRR_Acc_List.tsv
OUT_DIR=tmp_downloads
# -p keeps reruns from failing when the output directory already exists.
mkdir -p "${OUT_DIR}"
# Skip blank lines so a trailing newline in the list does not trigger a bogus prefetch call.
while read -r SRR_Acc; do [[ -n "${SRR_Acc}" ]] || continue; cmd="prefetch ${SRR_Acc} --max-size u --progress --output-directory ${OUT_DIR}"; echo -e "Running command: ${cmd} ..."; eval "${cmd}"; echo -e "Successfully running command: ${cmd}\n"; done < "${SRR_Acc_File}"
4 Extract FASTQ files from SRA files in batch
Note:
For `fasterq-dump`, the `--split-3` and `--skip-technical` options are enabled by default, but for clarity, you'd better specify them explicitly yourself. You should carefully consider whether you want to include technical reads, and which splitting scheme you want. For detailed info, see https://github.com/ncbi/sra-tools/wiki and https://github.com/ncbi/sra-tools/wiki/HowTo:-fasterq-dump.
Some examples:
- Standard single- or paired-end protocols (e.g., bulk RNA-, ATAC-, ChIP-seq, etc.) with technical reads omitted: mated biological reads \(\rightarrow\) `*_1.fastq` & `*_2.fastq`; unmated reads \(\rightarrow\) `*.fastq`.
# fasterq-dump scratch space; /dev/shm is RAM-backed — make sure it is large enough.
TMP_DIR=/dev/shm
NUM_THREADS=6
# NOTE(review): assigning to PWD shadows the shell's built-in $PWD; it works here,
# but a differently named variable would be safer.
PWD=$(pwd)
# Enable extended globs so SRR+([0-9]) matches "SRR" followed by one or more digits.
shopt -s extglob
# Convert each prefetch output directory into FASTQ files (technical reads skipped, mates split).
for SRR_Acc_Path in SRR+([0-9]); do [[ -d "${SRR_Acc_Path}" ]] || continue; SRR_Acc_Path="${PWD}/${SRR_Acc_Path}"; OUT_DIR="${SRR_Acc_Path}_dump_outs"; cmd="fasterq-dump ${SRR_Acc_Path} --outdir ${OUT_DIR} --temp ${TMP_DIR} --threads ${NUM_THREADS} --progress --skip-technical --split-3"; echo -e "Running command: ${cmd} ..."; eval "${cmd}"; echo -e "Successfully running command: ${cmd}\n"; done
- Split the downloaded SRA files into individual, analysis-ready FASTQ (or FASTA) files, preserving any technical reads (e.g., cell-barcode, UMI, sample index, etc.) that are required for single-cell RNA-seq, ATAC-seq, multiome, or other protocols.
# Same loop as above, but keeps technical reads (barcodes/UMIs) and writes one
# FASTQ per read component (--include-technical --split-files).
TMP_DIR=/dev/shm
NUM_THREADS=6
# NOTE(review): assigning to PWD shadows the shell's built-in $PWD — verify intended.
PWD=$(pwd)
shopt -s extglob
for SRR_Acc_Path in SRR+([0-9]); do [[ -d "${SRR_Acc_Path}" ]] || continue; SRR_Acc_Path="${PWD}/${SRR_Acc_Path}"; OUT_DIR="${SRR_Acc_Path}_dump_outs"; cmd="fasterq-dump ${SRR_Acc_Path} --outdir ${OUT_DIR} --temp ${TMP_DIR} --threads ${NUM_THREADS} --progress --include-technical --split-files"; echo -e "Running command: ${cmd} ..."; eval "${cmd}"; echo -e "Successfully running command: ${cmd}\n"; done
5 Extract MT annotations from GTF file
# Extract MT annotations
# Add gene_name field if absent
# Prepend "MT-" to each gene name
# Pipeline: gffread converts GFF -> GTF records (-T); the first awk keeps only the
# mitochondrial contig (NC_020638.1); gawk rewrites the attribute column ($9):
# trims whitespace, ensures a trailing ";", copies gene_id into gene_name when
# missing, then prefixes every gene_name with "MT-"; the final gffread re-emits
# a clean GTF as target.gtf.
gffread GCF_011764305.1_ASM1176430v1.1_genomic.gff -T | awk 'BEGIN{FS=OFS="\t"} $1 == "NC_020638.1"' | gawk 'BEGIN{FS=OFS="\t"} {gsub(/^[ \t]+|[ \t]+$/, "", $9); if ($9 !~ /;$/) {$9 = $9 ";"}; if ($9 ~ /gene_id/ && $9 !~ /gene_name/) {match($9, /gene_id "([^"]+)"/, arr); $9 = $9 " gene_name \"" arr[1] "\";"}; if ($9 ~ /gene_name/) {match($9, /gene_name "([^"]+)"/, arr); gsub(/gene_name "[^"]+"/, "gene_name \"MT-" arr[1] "\"", $9)}; print}' | gffread -T -o target.gtf
6 Compress FASTQ files in batch
find "$(pwd)" -regextype gnu-awk -type f -iregex ".+\.(fastq|fq)$" | xargs -I {} pigz {}
7 Download chromosome-level primary assembly FASTA files in batch from Ensembl
#!/bin/bash -e
# Download the per-chromosome soft-masked primary-assembly FASTA files for one
# Ensembl release/species, verify each archive, and concatenate into one gzip.
release=release-115
species=rattus_norvegicus
assembly=GRCr8
# dna_sm = soft-masked genomic sequence.
mask_type=dna_sm
chr_set=( {1..20} X Y MT )
concurrent_jobs=1
merged_file=merged.fa.gz
out_dir="$(date +%Y%m%d_%H%M%S).downloads"
base_url="ftp://ftp.ensembl.org/pub/${release}/fasta/${species}/dna"
# The literal "{}" inside file_name is replaced by xargs -I {} with each
# chromosome name when the string is passed through xargs below.
file_name="${species^}.${assembly}.${mask_type}.primary_assembly.{}.fa.gz"
mkdir -p "${out_dir}"
cd "${out_dir}"
download_failed_log="download_failed.log"
# One wget per chromosome (up to concurrent_jobs in parallel); a failed URL is
# appended to the log instead of aborting the whole run.
printf "%s\n" "${chr_set[@]}" | \
xargs -P "${concurrent_jobs}" -I {} \
sh -c 'wget -c "${1}" || echo "${1}" >> "${2}"' _ "${base_url}/${file_name}" "${download_failed_log}"
if [[ "${#chr_set[@]}" -ne "$(ls *.fa.gz 2>/dev/null | wc -l)" || -s "${download_failed_log}" ]]; then echo "ERROR: failed to download the following files:"; cat "${download_failed_log}"; exit 1; else echo "INFO: all files have been downloaded"; fi
test_failed_log="test_failed.log"
# pigz -t test-decompresses each archive to catch truncated/corrupted downloads.
printf "%s\n" "${chr_set[@]}" | \
xargs -P "${concurrent_jobs}" -I {} \
sh -c 'pigz -t "${1}" || echo "${1}" >> "${2}"' _ "${file_name}" "${test_failed_log}"
if [[ -s "${test_failed_log}" ]]; then echo "ERROR: the following files are corrupted:"; cat "${test_failed_log}"; exit 1; else echo "INFO: all files are complete!"; fi
# NOTE(review): the *.fa.gz glob expands in lexical order (1, 10, 11, ..., 2, ...),
# so chromosomes in merged_file are not in natural order — confirm downstream
# tools do not care about contig order.
pigz -c -k -d *.fa.gz | pigz -nc > "${merged_file}"
9 Misc
- Make a directory based on the current date and time
mkdir $(date +%Y%m%d_%H%M%S)
- Add a prefix for each directory/file name
# Prepend ${prefix}- to every name in the current directory.
# The generated names are double-quoted so the mv commands survive file names
# containing spaces. NOTE(review): parsing `ls` is still fragile for names with
# embedded quotes or newlines — acceptable for a quick interactive rename.
# dry-run
prefix=""; ls | awk -v p="${prefix}" '{printf "mv \"%s\" \"%s-%s\"\n", $0, p, $0}'
# run
prefix=""; ls | awk -v p="${prefix}" '{printf "mv \"%s\" \"%s-%s\"\n", $0, p, $0}' | sh
- Example usage of `seqkit grep`
# -P: only scan positive strand
# -i: ignore case
# -C: only count the number of occurrences
# -s: match against the sequence itself (not the record name); -p: the query
# motif; -j: number of worker threads — verify against your seqkit version's docs.
seqkit grep -P -C -j 60 -i -s -p "GCTGCAGAAGCTGAAAATCTTCTTCAT" Petaurus_breviceps.v1.self.dna_sm.primary_assembly.fa