1 Genome assembly QC using QUAST
#!/usr/bin/bash -e
# Genome assembly QC with QUAST on a gzipped FASTA.
#
# `conda activate` does not work in a non-interactive script unless conda's
# shell hook has been sourced first; with `-e` the silent failure would abort
# the script or leave the wrong environment active.
source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate quast

fa_file="/data/database/genome/release/neovision_vision/Neovision_vision.v1.KIZ_ShengLab.dna_sm.primary_assembly.fa.gz"
num_threads=100
tmp_dir="/data/tmp/yangrui_tmp"

cd "${tmp_dir}"
fa_file_basename=$(basename "${fa_file}")
# Strip the .fa.gz / .fasta.gz suffix with bash parameter expansion
# (no awk subprocess needed).
stem="${fa_file_basename%.gz}"
stem="${stem%.fa}"
stem="${stem%.fasta}"
output_dir="${stem}.quast_out"
tmp_fa_file="${stem}.fa"
# -k keeps the original .gz; decompress a working copy for QUAST.
pigz -k -c -d "${fa_file}" > "${tmp_fa_file}"
quast -o "${output_dir}" -t "${num_threads}" "${tmp_fa_file}"
2 BUSCO for both genome assembly and annotation QC
You should first extract the longest transcript of each gene, then translate those transcripts into protein sequences before running BUSCO in `proteins` mode.
#!/usr/bin/bash -e
# Run BUSCO (via micromamba) on a genome / protein / transcript FASTA.
MICROMAMBA_EXE=/home/yangrui/softwares/micromamba/bin/micromamba

# FASTA format
# genome assembly/longest protein sequence of each gene/longest transcript sequence of each gene
input_file=/data/database/annotation/release/neovision_vision/Neovision_vision.v1.v2.KIZ_NCBI.basic.chr.annotation.longest_isoforms.period.protein_sequences.fa
# genome/proteins/transcriptome
mode=proteins
# select the nearest dataset for your species
lineage_dataset=/home/yangrui/softwares/busco_datasets/busco_downloads/lineages/carnivora_odb12
num_threads=60
tmp_dir=/data/tmp/yangrui_tmp

cd "${tmp_dir}"
# Name the output directory after the input file: drop the directory part
# and the trailing .fa suffix, then append ".busco_out".
input_name="${input_file##*/}"
output_dir="${input_name%.fa}.busco_out"
"${MICROMAMBA_EXE}" run -n busco-v6 busco -i "${input_file}" -m "${mode}" -l "${lineage_dataset}" -c "${num_threads}" -o "${output_dir}"
Gather the BUSCO short-summary statistics from multiple runs into a single plot:
# Gather BUSCO short-summary JSON files under root_dir and draw one stacked
# horizontal bar per run (percent complete-single / complete-duplicated /
# fragmented / missing), annotated with BUSCO's one-line summary.
library(tidyverse)
library(YRUtils)
library(jsonlite)

root_dir <- "/data/tmp/yangrui_tmp"
# Strip the "short_summary.specific.<lineage>_odbNN." prefix and the
# ".busco_out.json" suffix, leaving only the sample label.
trim_file_pattern <- "^short_summary\\.specific\\.[a-zA-Z0-9]+_odb[0-9]+\\.|\\.busco_out\\.json$"
font_family <- "Arial"
font_size <- 24

json_files <- list.files(root_dir, pattern = "^short_summary\\..+\\.json$", full.names = TRUE, recursive = TRUE)

# Build one tibble per JSON file and combine once at the end instead of
# growing a tibble with bind_rows() inside a loop (which copies the whole
# frame on every iteration).
df <- list_rbind(lapply(json_files, function(json_file) {
  json_obj <- fromJSON(json_file)
  tibble(
    # "<sample>\n<lineage>" axis label.
    name = paste0(gsub(trim_file_pattern, "", basename(json_file)), "\n", json_obj$lineage_dataset$name),
    # C&S = complete & single-copy, C&D = complete & duplicated,
    # F = fragmented, M = missing; levels reversed so C&S ends up at the
    # left of each bar after coord_flip().
    category = factor(c("C&S", "C&D", "F", "M"), levels = rev(c("C&S", "C&D", "F", "M"))),
    percent = c(json_obj$results[["Single copy percentage"]], json_obj$results[["Multi copy percentage"]], json_obj$results[["Fragmented percentage"]], json_obj$results[["Missing percentage"]]),
    one_line_summary = json_obj$results[["one_line_summary"]]
  )
}))

p <- ggplot() +
  # geom_col() is the documented shorthand for geom_bar(stat = "identity").
  geom_col(aes(name, percent, fill = category), data = df) +
  # One summary string per run, anchored near the bar base (y = 2).
  geom_text(aes(name, y, label = one_line_summary),
    data = mutate(distinct(select(df, name, one_line_summary)), y = 2),
    hjust = 0,
    size.unit = "pt",
    size = font_size,
    family = font_family
  ) +
  coord_flip() +
  theme_classic() +
  # axis/legend elements have their own defaults, so set them explicitly
  # rather than relying on inheritance from `text`.
  theme(
    text = element_text(size = font_size, family = font_family),
    axis.text = element_text(size = font_size, family = font_family),
    axis.title = element_text(size = font_size, family = font_family),
    legend.text = element_text(size = font_size, family = font_family),
    legend.title = element_text(size = font_size, family = font_family)
  )
ppreview(p, file = file.path(root_dir, "busco_short_summaries.pdf"))
BUSCO 质量评估参考标准:
| 质量等级 | C% (Complete) (S+D) | S% (Single-Copy) | F% (Fragmented) | M% (Missing) | 描述 |
|---|---|---|---|---|---|
| 优秀 (Excellent) | > 95% | > 90-95% | < 3-5% | < 3-5% | 接近完整 (Near-complete)。通常是发表“参考基因组”级别的目标。 |
| 良好 (Good) | > 90% | > 85-90% | < 5-10% | < 5-10% | 高质量草图 (High-quality draft)。完全足够用于大多数下游分析。 |
| 中等 (Moderate) | 80% - 90% | > 75% | < 10-15% | < 10-15% | 可用草图 (Usable draft)。对于非模式生物或特定研究目的可能已足够。 |
| 较差 (Poor) | < 80% | < 75% | > 15% | > 15% | 高度片段化或不完整。表明组装或注释存在严重问题,可能需要重新测序或重新注释。 |
Note: miscellaneous scripts used to locate the BUSCO lineage dataset closest to your species.
# Download the NCBI taxonomy dump (-c resumes a partial download),
# unpack it, then delete the archive.
wget -c ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
tar -xzvf taxdump.tar.gz
rm taxdump.tar.gz
library(taxizedb)
library(taxize)
library(vroom)
library(tidyverse)
# Translate a character vector of taxonomic names from English to Chinese
# via an OpenAI-compatible chat-completions endpoint.
#
# x           character vector of English taxonomic names
# llm_model   model identifier sent in the request body
# llm_api_url chat-completions endpoint URL
# llm_api_key bearer token (defaults to the LLM_KM_KEY environment variable)
#
# Returns a named character vector: names are the inputs `x`, values are the
# translations. Stops if the model returns a different number of items.
llm_tax_trans_en2cn <- function(
    x,
    llm_model = "kimi-k2-0905-preview",
    llm_api_url = "https://api.moonshot.cn/v1/chat/completions",
    llm_api_key = Sys.getenv("LLM_KM_KEY")) {
  # Single prompt: instructions plus the tab-separated input names; the model
  # is asked to answer with tab-separated translations in the same order.
  prompt <- paste(
    "You are a professional biological taxonomist.",
    "请将下列生物分类学名称翻译为中文,给出每个名称在中国大陆学术文献中最常用、最公认的中文译名;",
    "若尚无公认译名,则用能反映英文原意的中文译名;仍无法翻译时,原样保留英文名称。",
    "严禁输出任何解释、编号、换行或标点,仅用制表符分隔中文结果,输出顺序与输入顺序严格对应。",
    paste0(x, collapse = "\t"),
    sep = "\n"
  )

  api_request <- httr2::request(llm_api_url)
  api_request <- httr2::req_headers(
    api_request,
    "Content-Type" = "application/json",
    "Authorization" = paste("Bearer", llm_api_key)
  )
  api_request <- httr2::req_body_json(api_request, list(
    model = llm_model,
    messages = list(list(
      role = "user",
      content = prompt
    )),
    temperature = 0.2,
    # Rough output budget: ~5 characters per name, ~3 tokens per character.
    max_tokens = length(x) * 5 * 3
  ))

  api_response <- httr2::req_perform(api_request)
  payload <- httr2::resp_body_json(api_response)

  # Split the tab-separated answer back into one translation per input.
  translations <- trimws(strsplit(payload$choices[[1]]$message$content, "\t")[[1]])
  if (length(translations) != length(x)) {
    stop("the length of the result is not exactly the same as the input, and you may need to try it again")
  }
  setNames(translations, x)
}
# Baidu translate credentials for the fanyi package, read from the
# environment so they are never hard-coded in the script.
fy_appid <- Sys.getenv("BD_FY_ID")
fy_key <- Sys.getenv("BD_FY_KEY")
fy_source <- "baidu"
fanyi::set_translate_option(appid = fy_appid, key = fy_key, source = fy_source)
# download NCBI database for offline use
# db_download_ncbi()
# NCBI taxonomy ID only
target_taxid <- 452646
# Raw NCBI taxdump files: merged.dmp maps retired tax IDs to their current
# replacements; nodes.dmp lists all current tax IDs.
ncbi_tax_merged_file <- "/home/yangrui/softwares/ncbi_taxonomy_dbs/merged.dmp"
ncbi_tax_nodes_file <- "/home/yangrui/softwares/ncbi_taxonomy_dbs/nodes.dmp"
# Offline taxizedb snapshot of the NCBI taxonomy database.
ncbi_db_file <- "/home/yangrui/softwares/taxizedbs/NCBI_2025-10-30.sql"
busco_datasets_dir <- "/home/yangrui/softwares/busco_datasets/busco_downloads/lineages"
work_dir <- "/home/yangrui/tmpwd"
src <- src_ncbi(path = ncbi_db_file)
# .dmp files are "|"-delimited with a trailing separator, so vroom emits one
# extra column at the end; drop it (X3 for merged.dmp, X14 for nodes.dmp).
# NOTE(review): column counts assume the standard taxdump layout — confirm
# against the downloaded dump.
ncbi_tax_merged_df <- vroom(ncbi_tax_merged_file, delim = "|", trim_ws = TRUE, col_names = FALSE) %>%
select(-X3) %>%
distinct()
ncbi_tax_nodes_df <- vroom(ncbi_tax_nodes_file, delim = "|", trim_ws = TRUE, col_names = FALSE) %>%
select(-X14) %>%
distinct()
# Report whether the query tax ID is current (in nodes.dmp) or has been
# retired and merged into another ID (in merged.dmp).
if (target_taxid %in% ncbi_tax_nodes_df$X1) {
message("query ID ", target_taxid, " is the latest")
} else if (target_taxid %in% ncbi_tax_merged_df$X1) {
message("query ID ", target_taxid, " is NOT the latest\nthis may cause some problems")
# merged.dmp row for this ID: X1 = old tax ID, X2 = the ID it merged into.
target_taxids <- filter(ncbi_tax_merged_df, X1 == target_taxid) %>%
as.list() %>%
unlist()
message("consider replacing the old [", target_taxids["X1"], "] with the latest [", target_taxids["X2"], "]")
}
# Each BUSCO lineage directory ships an info/species.info table listing the
# species used to build it; collect them all into one tibble tagged with the
# lineage directory name.
busco_datasets_species_files <- list.files(busco_datasets_dir, pattern = "^species\\.info$", full.names = TRUE, recursive = TRUE)
# The lineage name is the directory two levels above info/species.info.
names(busco_datasets_species_files) <- basename(gsub("/info/species\\.info$", "", busco_datasets_species_files))

# Build one tibble per lineage and combine once at the end instead of growing
# a tibble with bind_rows() inside a loop (quadratic copying).
busco_datasets_species_df <- list_rbind(lapply(names(busco_datasets_species_files), function(n) {
  vroom(busco_datasets_species_files[n], col_names = c("tax_id", "species_name")) %>%
    mutate(busco_lineage_name = n) %>%
    na.omit() %>%
    distinct()
}))
# Flag every BUSCO tax ID as current (NEW), retired-and-merged (OLD), or
# absent from both taxdump tables (UNKNOWN).
busco_taxids_df <- busco_datasets_species_df %>%
  select(tax_id) %>%
  distinct() %>%
  mutate(
    # %in% already returns a logical; if_else(cond, TRUE, FALSE) is redundant.
    is_new = tax_id %in% ncbi_tax_nodes_df$X1,
    is_old = tax_id %in% ncbi_tax_merged_df$X1,
    # case_when() reads better than nested if_else(); the first match wins,
    # so an ID present in both tables is flagged "NEW", exactly as before.
    flag = case_when(
      is_new ~ "NEW",
      is_old ~ "OLD",
      .default = "UNKNOWN"
    )
  )
busco_unknown_taxids_df <- filter(busco_taxids_df, flag == "UNKNOWN")
if (nrow(busco_unknown_taxids_df) > 0) {
  warning("these NCBI taxonomy IDs are neither new nor old, and will be removed: ", paste0(busco_unknown_taxids_df$tax_id, collapse = " "))
  busco_taxids_df <- filter(busco_taxids_df, flag != "UNKNOWN")
}
# Map retired IDs to their replacement (merged.dmp: X1 = old, X2 = current).
busco_taxids_df <- busco_taxids_df %>%
  left_join(ncbi_tax_merged_df, by = c("tax_id" = "X1")) %>%
  mutate(final_tax_id = if_else(flag == "OLD", X2, tax_id)) %>%
  select(tax_id, final_tax_id)
# Rewrite the species table so every row carries the current tax ID.
busco_datasets_species_df <- busco_taxids_df %>%
  inner_join(busco_datasets_species_df, by = "tax_id") %>%
  select(-tax_id) %>%
  rename(tax_id = final_tax_id)
# One sub-table of species per BUSCO lineage.
busco_datasets_species_ls <- split(busco_datasets_species_df, busco_datasets_species_df$busco_lineage_name)
busco_taxids <- as.character(unique(na.omit(busco_datasets_species_df$tax_id)))
# Full taxonomic classification path for the target plus every BUSCO species,
# fetched once up front so the per-lineage loop below can reuse them.
tax_paths <- taxizedb::classification(unique(c(target_taxid, busco_taxids)), db = "ncbi")
# Any NA in a classification result means the lookup failed for that ID.
tax_paths_na_idx <- which(sapply(tax_paths, function(df) any(is.na(df))))
if (length(tax_paths_na_idx) > 0) {
stop("elements with numerical indices [", paste0(tax_paths_na_idx, collapse = ", "), "] are NAs in the results returned by taxizedb::classification\nplease check them in detail or delete them before continuing")
}
# For each lineage, find the lowest common ancestor (LCA) of the target
# species and that lineage's species set.
lca_df <- tibble()
for (busco_lineage_name in names(busco_datasets_species_ls)) {
message("dealing with ", busco_lineage_name, " ...")
busco_lineage_taxids <- as.character(unique(na.omit(busco_datasets_species_ls[[busco_lineage_name]]$tax_id)))
tmp_lca_df <- lowest_common(unique(c(target_taxid, busco_lineage_taxids)), class_list = tax_paths)
# NOTE(review): lowest_common appears to return NA when no LCA is found —
# such lineages are silently skipped here; confirm against taxize docs.
if (!any(is.na(tmp_lca_df))) {
lca_df <- bind_rows(lca_df, mutate(tmp_lca_df, busco_lineage_name = busco_lineage_name))
}
}
# Rank the candidate lineages by the depth of their LCA with the target
# (depth = number of ranks in the LCA's own classification path), then add
# Chinese translations of the names via Baidu and via the LLM helper.
lca_df <- lca_df %>%
  mutate(
    lca_depth = vapply(
      id,
      function(taxid) nrow(taxizedb::classification(as.character(taxid), db = "ncbi")[[1]]),
      integer(1)
    )
  ) %>%
  arrange(desc(lca_depth)) %>%
  mutate(
    # "carnivora_odb12" -> "carnivora" before translation.
    clean_busco_lineage_name = gsub("_odb\\d+$", "", busco_lineage_name),
    clean_busco_lineage_name_zh_baidu = fanyi::baidu_translate(clean_busco_lineage_name),
    name_zh_baidu = fanyi::baidu_translate(name)
  )

# LLM translations come back as named vectors; enframe() turns them into
# two-column tibbles for joining.
name_zh_map <- llm_tax_trans_en2cn(unique(lca_df$name))
lca_df <- left_join(
  lca_df,
  enframe(name_zh_map, name = "name", value = "name_zh_llm"),
  by = "name"
)
lineage_zh_map <- llm_tax_trans_en2cn(unique(lca_df$clean_busco_lineage_name))
lca_df <- left_join(
  lca_df,
  enframe(lineage_zh_map, name = "clean_busco_lineage_name", value = "clean_busco_lineage_name_zh_llm"),
  by = "clean_busco_lineage_name"
)

vroom_write(lca_df, file = file.path(work_dir, paste0(target_taxid, ".busco_lca.tsv")))