Predict CDSs from transcripts de novo

1 TD2

For detailed info, see https://github.com/Markusjsommer/TD2.

Extract transcript DNA sequences.

samtools faidx -@ 60 Petaurus_breviceps.v1.self.dna_sm.primary_assembly.fa
gffread -w Petaurus_breviceps.v1.v1.self.basic.chr.annotation.transcript_sequences.fa -g Petaurus_breviceps.v1.self.dna_sm.primary_assembly.fa Petaurus_breviceps.v1.v1.self.basic.chr.annotation.gtf

Predict the longest open reading frame of each transcript.

# -G: genetic code (select the correct one for your species/organelle)
# -S: strand-specific scanning
TD2.LongOrfs -t Petaurus_breviceps.v1.v1.self.basic.chr.annotation.transcript_sequences.fa -O TD2_LongOrfs_outs --complete-orfs-only -G 1

Identify ORFs with homology to known proteins via MMSeqs2.

conda activate mmseqs2

# blastp vs. NCBI NR: complete, classic
# MMseqs2 vs. UniProtKB/Swiss-Prot: curated, high quality
# HMMER3 vs. Pfam: identify functional domains for partial- and/or full-length protein sequences
mmseqs databases UniProtKB/Swiss-Prot swissprot tmp

mmseqs easy-search TD2_LongOrfs_outs/longest_orfs.pep ~/softwares/mmseqs2_dbs/swissprot alnRes.m8 tmp -s 7

Predict the likely coding regions.

TD2.Predict -t Petaurus_breviceps.v1.v1.self.basic.chr.annotation.transcript_sequences.fa --retain-mmseqs-hits alnRes.m8 --complete-orfs-only -G 1 -O TD2_LongOrfs_outs

Generate the new GFF3 annotation file.

/home/yangrui/softwares/TD2/util/gtf_to_alignment_gff3.pl Petaurus_breviceps.v1.v1.self.basic.chr.annotation.gtf > Petaurus_breviceps.v1.v1.self.basic.chr.annotation.aln_gff3_fmt.gff3

/home/yangrui/softwares/TD2/util/cdna_alignment_orf_to_genome_orf.pl Petaurus_breviceps.v1.v1.self.basic.chr.annotation.transcript_sequences.fa.TD2.gff3 Petaurus_breviceps.v1.v1.self.basic.chr.annotation.aln_gff3_fmt.gff3 Petaurus_breviceps.v1.v1.self.basic.chr.annotation.transcript_sequences.fa > Petaurus_breviceps.v1.v1.self.basic.chr.annotation.TD2.gff3

(optional) Validate the GFF3 file.

conda activate genometools

gt gff3validator Petaurus_breviceps.v1.v2.KIZ_ShengLab.basic.chr.annotation.gff3

(optional) Trim suffixes of IDs

# Trim trailing appendices from IDs.
library(rtracklayer)
library(tidyverse)
library(vroom)

seqinfo_file <- "/data/database/genome/release/petaurus_breviceps/Petaurus_breviceps.v1.self.dna_sm.primary_assembly.chrom.sizes.tsv"
genome <- "petaurus_breviceps.v1"
input_gff3_file <- "/home/yangrui/data/Petaurus_breviceps.v1.v1.self.basic.chr.annotation.TD2.gff3"
output_gff3_file <- "/home/yangrui/data/Petaurus_breviceps.v1.v2.KIZ_ShengLab.basic.chr.annotation.gff3"

seqinfo <- vroom(seqinfo_file, col_names = c("seqnames", "length")) %>%
    mutate(is_circular = FALSE)
seqinfo <- Seqinfo(
    seqnames = seqinfo$seqnames,
    seqlengths = seqinfo$length,
    isCircular = seqinfo$is_circular,
    genome = genome
)

gff3 <- import.gff3(input_gff3_file)
df <- as_tibble(as.data.frame(gff3))
ls <- split(df, df$type)

table(is.na(ls$gene$ID))
table(is.na(ls$gene$Name))
table(duplicated(ls$gene$ID))
table(duplicated(ls$gene$Name))
table(sapply(ls$gene$Parent, length))

names <- setNames(trimws(gsub("^\"| type:complete len:[0-9]+ \\((\\+|-)\\)\"$", "", ls$gene$Name, perl = TRUE)), ls$gene$ID)
ids <- setNames(trimws(gsub("\\^(LG|Contig)[0-9]+\\^(\\+|-)$", "", ls$gene$ID, perl = TRUE)), ls$gene$Name)

ls$gene$ID <- trimws(gsub("\\^(LG|Contig)[0-9]+\\^(\\+|-)$", "", ls$gene$ID, perl = TRUE))
ls$gene$Name <- trimws(gsub("^\"| type:complete len:[0-9]+ \\((\\+|-)\\)\"$", "", ls$gene$Name, perl = TRUE))
ls$gene <- ls$gene %>%
    mutate(
        TMP_COL = ID,
        ID = Name,
        Name = TMP_COL
    ) %>%
    select(-TMP_COL)

table(duplicated(ls$gene$ID))
table(duplicated(ls$gene$Name))

table(is.na(ls$mRNA$ID))
table(is.na(ls$mRNA$Name))
table(duplicated(ls$mRNA$ID))
table(duplicated(ls$mRNA$Name))
table(sapply(ls$mRNA$Parent, length))
table(sapply(ls$mRNA$Parent, is.na))

ls$mRNA <- ls$mRNA %>%
    mutate(
        Name = sapply(Name, function(x) setNames(ids[x], NULL), simplify = TRUE, USE.NAMES = FALSE),
        Parent = lapply(Parent, function(x) setNames(names[x[1]], NULL))
    )

table(is.na(ls$mRNA$ID))
table(is.na(ls$mRNA$Name))
table(duplicated(ls$mRNA$ID))
table(duplicated(ls$mRNA$Name))
table(sapply(ls$mRNA$Parent, length))
table(sapply(ls$mRNA$Parent, is.na))

fi_df <- bind_rows(ls) %>%
    arrange(seqnames, start, type, end)
fi_gff3 <- makeGRangesFromDataFrame(
    fi_df,
    seqinfo = seqinfo,
    keep.extra.columns = TRUE,
    seqnames.field = "seqnames",
    start.field = "start",
    end.field = "end",
    strand.field = "strand"
)
mcols(fi_gff3)$Parent <- CharacterList(mcols(fi_gff3)$Parent)
export.gff3(fi_gff3, output_gff3_file)