## 3. Attach gene name/ID to protein sequence header line
using FASTX, CSV, DataFrames
fa_file = "/data/biodatabase/species/hg38/genome/anno/gencode.v29.primary_assembly.annotation_UCSC_names.protein_sequences.fasta"
tx_mpt_file = "/data/biodatabase/species/hg38/genome/anno/gencode.v29.trna.ercc.phix.gtf.gz.transcript_id_name_mapping_table.tsv"
tx_mpt_df = CSV.read(tx_mpt_file, DataFrame; delim="\t", header=true)
tx_mpt_dict = Dict(r.transcript_id => (r.transcript_id, r.gene_id, r.gene_name) for r in eachrow(tx_mpt_df))
FASTAReader(open(fa_file)) do reader
record = FASTARecord()
FASTAWriter(open(replace(fa_file, r"(fa|fasta)$" => "with_gene_names.fasta"), "w"); width=60) do writer
while !eof(reader)
empty!(record)
read!(reader, record)
seq_id = identifier(record)
if haskey(tx_mpt_dict, seq_id)
write(writer, FASTARecord(string(tx_mpt_dict[seq_id][1], "@", tx_mpt_dict[seq_id][2], "@", tx_mpt_dict[seq_id][3]), sequence(record)))
end
end
end
end1 Basic concepts
Pairwise sequence alignments:
Multiple sequence alignments:
Alignment types:
Global alignment
Local alignment
2 Algorithms/tools for pairwise sequence alignments
2.1 Slow but optimal
2.1.1 Dynamic programming
Needleman-Wunsch (global)
Smith-Waterman (local)
2.2 Fast but approximate
2.2.1 BLAST
3 Algorithms/tools for multiple sequence alignments
3.1 MAFFT
3.2 Clustal Omega
3.2.1 Example 1: perform multiple protein sequence alignments across species
## 1. Use AGAT to extract the longest transcript for each gene
singularity run \
-B /data/biodatabase/species \
/data/softwares/agat/agat_1.5.0--pl5321hdfd78af_0.sif
agat_sp_keep_longest_isoform.pl \
--gff /data/biodatabase/species/mm10/genome/anno/gencode.vM21.primary_assembly.annotation_UCSC_names.gtf \
-o /data/biodatabase/species/mm10/genome/anno/gencode.vM21.primary_assembly.annotation_UCSC_names.longest_isoforms.gff3
## 2. Use gffread to get the protein sequence for each coding transcript
# Generate FASTA index file to accelerate gffread
samtools faidx -@ 60 /data/biodatabase/species/mm10/genome/genome/mm10_no_alt_analysis_set_ENCODE.fasta
gffread -y gencode.vM21.primary_assembly.annotation_UCSC_names.protein_sequences.fasta \
-g /data/biodatabase/species/mm10/genome/genome/mm10_no_alt_analysis_set_ENCODE.fasta \
gencode.vM21.primary_assembly.annotation_UCSC_names.longest_isoforms.gff3
## 4. Prepare FASTA files for alignment
using FASTX
work_dir = "/home/dell/test"
cd(work_dir)
gene_names = ("LYAR", "MYBL1", "SPC25")
fa_files = Dict("hg38" => "/data/biodatabase/species/hg38/genome/anno/gencode.v29.primary_assembly.annotation_UCSC_names.protein_sequences.with_gene_names.fasta",
"mink" => "/data/biodatabase/species/mink_slb_v1/genome/anno/mink.protein_sequences.with_gene_names.fasta",
"mm10" => "/data/biodatabase/species/mm10/genome/anno/gencode.vM21.primary_assembly.annotation_UCSC_names.protein_sequences.with_gene_names.fasta")
gene_names = uppercase.(gene_names)
dict = Dict(k => Tuple{String,String}[] for k in gene_names)
for (k, v) in fa_files
FASTAReader(open(v)) do reader
record = FASTARecord()
while !eof(reader)
global dict
empty!(record)
read!(reader, record)
record_gene_name = uppercase(split(identifier(record), "@")[3])
matched_gene_names = gene_names[collect(in.(gene_names, Ref((record_gene_name,))))]
if length(matched_gene_names) == 1
push!(dict[matched_gene_names[1]], (string(k, "@", matched_gene_names[1]), sequence(record)))
end
end
end
end
for (k, v) in dict
FASTAWriter(open(string(k, ".fasta"), "w"); width=60) do writer
for (seq_id, seq) in v
write(writer, FASTARecord(seq_id, seq))
end
end
end## 5. Perform multiple protein sequence alignments using Clustal Omega
using FASTX, YRUtils
input_dir = "/home/dell/test"
align_seq_type = "Protein"
align_nthreads = 100
fa_files = YRUtils.BaseUtils.list_files(input_dir, r"\.(fa|fasta)$"; recursive=false, full_name=true)
clustalo_path = YRUtils.ShellUtils.find_cmd("clustalo")
YRUtils.ShellUtils.cmd_valid(`$(clustalo_path) --version`)
output_fa_files = String[]
for fa_file in fa_files
global output_fa_files
output_fa_file = replace(fa_file, r"\.(fa|fasta)$" => ".aligned.fasta")
push!(output_fa_files, output_fa_file)
cmd = `clustalo -i $(fa_file) -t $(align_seq_type) --infmt=fasta -o $(output_fa_file) --outfmt=fasta --output-order=tree-order --threads=$(align_nthreads)`
@info string("running ", cmd, " ...")
run(cmd; wait=true)
end
dict = Dict{String,Vector{String}}()
for output_fa_file in output_fa_files
global dict
seq_id_vec = String[]
seq_vec = String[]
FASTAReader(open(output_fa_file)) do reader
record = FASTARecord()
while !eof(reader)
empty!(record)
read!(reader, record)
push!(seq_id_vec, identifier(record))
push!(seq_vec, sequence(record))
end
end
dict[join(seq_id_vec, " --> ")] = seq_vec
end
YRUtils.BioUtils.show_align(dict, joinpath(input_dir, "alignment_vis.html"); wrap_width=80)