1 TD2
For detailed info, see https://github.com/Markusjsommer/TD2.
- Extract transcript DNA sequences.
samtools faidx -@ 60 Petaurus_breviceps.v1.self.dna_sm.primary_assembly.fa
gffread -w Petaurus_breviceps.v1.v1.self.basic.chr.annotation.transcript_sequences.fa -g Petaurus_breviceps.v1.self.dna_sm.primary_assembly.fa Petaurus_breviceps.v1.v1.self.basic.chr.annotation.gtf
- Predict the longest open reading frame of each transcript.
# -G: genetic code (select the correct one for your species/organelle)
# -S: strand-specific scanning
TD2.LongOrfs -t Petaurus_breviceps.v1.v1.self.basic.chr.annotation.transcript_sequences.fa -O TD2_LongOrfs_outs --complete-orfs-only -G 1
- Identify ORFs with homology to known proteins via MMSeqs2.
conda activate mmseqs2
# blastp vs. NCBI NR: complete, classic
# MMseqs2 vs. UniProtKB/Swiss-Prot: curated, high quality
# HMMER3 vs. Pfam: identify functional domains for partial- and/or full-length protein sequences
mmseqs databases UniProtKB/Swiss-Prot swissprot tmp
mmseqs easy-search TD2_LongOrfs_outs/longest_orfs.pep ~/softwares/mmseqs2_dbs/swissprot alnRes.m8 tmp -s 7
- Predict the likely coding regions.
TD2.Predict -t Petaurus_breviceps.v1.v1.self.basic.chr.annotation.transcript_sequences.fa --retain-mmseqs-hits alnRes.m8 --complete-orfs-only -G 1 -O TD2_LongOrfs_outs
- Generate the new GFF3 annotation file.
/home/yangrui/softwares/TD2/util/gtf_to_alignment_gff3.pl Petaurus_breviceps.v1.v1.self.basic.chr.annotation.gtf > Petaurus_breviceps.v1.v1.self.basic.chr.annotation.aln_gff3_fmt.gff3
/home/yangrui/softwares/TD2/util/cdna_alignment_orf_to_genome_orf.pl Petaurus_breviceps.v1.v1.self.basic.chr.annotation.transcript_sequences.fa.TD2.gff3 Petaurus_breviceps.v1.v1.self.basic.chr.annotation.aln_gff3_fmt.gff3 Petaurus_breviceps.v1.v1.self.basic.chr.annotation.transcript_sequences.fa > Petaurus_breviceps.v1.v1.self.basic.chr.annotation.TD2.gff3
- (optional) Validate the GFF3 file.
conda activate genometools
gt gff3validator Petaurus_breviceps.v1.v2.KIZ_ShengLab.basic.chr.annotation.gff3
- (optional) Trim suffixes of IDs
# Trim trailing appendices from IDs.
library(rtracklayer)
library(tidyverse)
library(vroom)
seqinfo_file <- "/data/database/genome/release/petaurus_breviceps/Petaurus_breviceps.v1.self.dna_sm.primary_assembly.chrom.sizes.tsv"
genome <- "petaurus_breviceps.v1"
input_gff3_file <- "/home/yangrui/data/Petaurus_breviceps.v1.v1.self.basic.chr.annotation.TD2.gff3"
output_gff3_file <- "/home/yangrui/data/Petaurus_breviceps.v1.v2.KIZ_ShengLab.basic.chr.annotation.gff3"
seqinfo <- vroom(seqinfo_file, col_names = c("seqnames", "length")) %>%
mutate(is_circular = FALSE)
seqinfo <- Seqinfo(
seqnames = seqinfo$seqnames,
seqlengths = seqinfo$length,
isCircular = seqinfo$is_circular,
genome = genome
)
gff3 <- import.gff3(input_gff3_file)
df <- as_tibble(as.data.frame(gff3))
ls <- split(df, df$type)
table(is.na(ls$gene$ID))
table(is.na(ls$gene$Name))
table(duplicated(ls$gene$ID))
table(duplicated(ls$gene$Name))
table(sapply(ls$gene$Parent, length))
names <- setNames(trimws(gsub("^\"| type:complete len:[0-9]+ \\((\\+|-)\\)\"$", "", ls$gene$Name, perl = TRUE)), ls$gene$ID)
ids <- setNames(trimws(gsub("\\^(LG|Contig)[0-9]+\\^(\\+|-)$", "", ls$gene$ID, perl = TRUE)), ls$gene$Name)
ls$gene$ID <- trimws(gsub("\\^(LG|Contig)[0-9]+\\^(\\+|-)$", "", ls$gene$ID, perl = TRUE))
ls$gene$Name <- trimws(gsub("^\"| type:complete len:[0-9]+ \\((\\+|-)\\)\"$", "", ls$gene$Name, perl = TRUE))
ls$gene <- ls$gene %>%
mutate(
TMP_COL = ID,
ID = Name,
Name = TMP_COL
) %>%
select(-TMP_COL)
table(duplicated(ls$gene$ID))
table(duplicated(ls$gene$Name))
table(is.na(ls$mRNA$ID))
table(is.na(ls$mRNA$Name))
table(duplicated(ls$mRNA$ID))
table(duplicated(ls$mRNA$Name))
table(sapply(ls$mRNA$Parent, length))
table(sapply(ls$mRNA$Parent, is.na))
ls$mRNA <- ls$mRNA %>%
mutate(
Name = sapply(Name, function(x) setNames(ids[x], NULL), simplify = TRUE, USE.NAMES = FALSE),
Parent = lapply(Parent, function(x) setNames(names[x[1]], NULL))
)
table(is.na(ls$mRNA$ID))
table(is.na(ls$mRNA$Name))
table(duplicated(ls$mRNA$ID))
table(duplicated(ls$mRNA$Name))
table(sapply(ls$mRNA$Parent, length))
table(sapply(ls$mRNA$Parent, is.na))
fi_df <- bind_rows(ls) %>%
arrange(seqnames, start, type, end)
fi_gff3 <- makeGRangesFromDataFrame(
fi_df,
seqinfo = seqinfo,
keep.extra.columns = TRUE,
seqnames.field = "seqnames",
start.field = "start",
end.field = "end",
strand.field = "strand"
)
mcols(fi_gff3)$Parent <- CharacterList(mcols(fi_gff3)$Parent)
export.gff3(fi_gff3, output_gff3_file)