1 IQ-TREE
For detailed info, see https://iqtree.github.io/ and https://github.com/iqtree/iqtree3.
- Pull Cactus singularity image.
singularity pull cactus_v3.0.1_gpu.sif docker://quay.io/comparative-genomics-toolkit/cactus:v3.0.1-gpu
singularity shell --nv -B "${PWD}":/data /data/softwares/cactus/cactus_v3.0.1_gpu.sif
- Extract MAF file of each chromosome.
#!/usr/bin/bash -e
hal_file=msa.hal
ref_seqnames_file=NeoVis_seqnames.txt
ref_genome=NeoVis
output_dir=NeoVis_chr_mafs
mkdir "${output_dir}"
while IFS= read -r seqname
do
hal2maf --hdf5InMemory --noDupes --onlyOrthologs --unique --noAncestors --refGenome "${ref_genome}" --refSequence "${seqname}" "${hal_file}" "${output_dir}/${seqname}.maf"
done < "${ref_seqnames_file}"
- Extract MAF file of each peak.
#!/usr/bin/bash -e
mafs_dir=NeoVis_chr_mafs
bed_file=P2-Sul_core_network.acc_cnes.rodent.new_mink_atac_acc_intersect.bed
output_dir=p2_sul
mkdir "${output_dir}"
mafsInRegion -outDir -keepInitialGaps "${bed_file}" "${output_dir}" "${mafs_dir}"/*.maf
- Convert MAF files to FASTA files.
library(vroom)
library(tidyverse)
input_dir <- "/home/yangrui/test/cnes_proj/mink_atac_acc_intersect_regions"
max_n <- 7
maf_files <- list.files(input_dir, pattern = ".+\\.maf$", full.names = TRUE, recursive = TRUE)
for (maf_file in maf_files) {
lines <- trimws(vroom_lines(maf_file))
lines <- lines[grepl("^(a|s)", lines)]
a_idx <- which(grepl("^a", lines))
s_idx <- which(grepl("^s", lines))
ls <- vector("list", length(a_idx))
for (i in seq_along(a_idx)) {
if (i == length(a_idx)) {
s_sub_idx <- seq(a_idx[i] + 1, length(lines), 1)
} else {
s_sub_idx <- seq(a_idx[i] + 1, a_idx[i + 1] - 1, 1)
}
if (all(s_sub_idx %in% s_idx)) {
ls[[i]] <- vroom(I(paste0(gsub("[ \t]+", "\t", lines[s_sub_idx]), collapse = "\n")),
delim = "\t", col_names = FALSE
)
if (ncol(ls[[i]]) != 7) {
stop("the number of columns is not 7 (", maf_file, ")")
}
} else {
stop("unknown error occurred (", maf_file, ")")
}
}
if (any(sapply(ls, nrow) > max_n)) {
stop(
"row counts > max_n in blocks [",
paste0(which(sapply(ls, nrow) > max_n), collapse = ", "),
"] out of ", length(ls), " (", maf_file, ")"
)
}
ls <- lapply(ls, `[`, c("X2", "X7"))
for (i in seq_along(ls)) {
if (i == 1) {
df <- ls[[i]]
} else {
df <- full_join(df, ls[[i]], by = "X2", suffix = paste0(".", c(i - 1, i)))
}
}
seq_col_names <- names(df)[grepl("^X7(\\.\\d+)?$", names(df))]
for (n in seq_col_names) {
if (all(is.na(df[[n]]))) {
stop("all values in column 7 are NAs (", maf_file, ")")
}
non_na_idx <- which(!is.na(df[[n]]))
non_na_seq_length <- unique(nchar(df[[n]][non_na_idx]))
if (length(non_na_seq_length) != 1) {
stop("the lengths of values in column 7 are not identical across species (", maf_file, ")")
}
na_idx <- which(is.na(df[[n]]))
if (length(na_idx) > 0) {
df[[n]][na_idx] <- paste0(rep("-", non_na_seq_length), collapse = "")
}
}
df <- unite(df, col = "concat_seq", all_of(seq_col_names), sep = "")
if (length(unique(nchar(df[["concat_seq"]]))) != 1) {
stop("the lengths of values in column concat_seq are not identical across species (", maf_file, ")")
}
df <- mutate(df, fa_fmt = paste0(">", X2, "\n", concat_seq))
vroom_write_lines(df[["fa_fmt"]], file = gsub("maf$", "fa", maf_file))
}
- Tree construction using IQ-TREE.
#!/usr/bin/bash -e
root_dir=/home/yangrui/test/cnes_proj/mink_atac_acc_intersect_regions
outgroup=PetBre
shopt -s extglob
for aln_file in "${root_dir}"/**/*.fa; do
detected_outgroup=$(grep -P "^>${outgroup}" "${aln_file}")
detected_outgroup="${detected_outgroup#>}"
if [[ -n "${detected_outgroup}" ]]; then
iqtree3 -s "${aln_file}" -m MFP -T AUTO -B 1000 --bnni --alrt 1000 -o "${detected_outgroup}"
echo "running IQ-Tree for ${aln_file} done!"
else
iqtree3 -s "${aln_file}" -m MFP -T AUTO -B 1000 --bnni --alrt 1000
echo "running IQ-Tree for ${aln_file} done!"
fi
done