1 Genome assembly QC using QUAST
#!/usr/bin/bash -e
# Genome assembly QC with QUAST on a gzipped FASTA.
#
# `conda activate` does not work in a non-interactive script unless conda's
# shell hook has been sourced first; with `-e` the silent failure would abort
# the script or leave the wrong environment active.
source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate quast

fa_file="/data/database/genome/release/neovision_vision/Neovision_vision.v1.KIZ_ShengLab.dna_sm.primary_assembly.fa.gz"
num_threads=100
tmp_dir="/data/tmp/yangrui_tmp"

cd "${tmp_dir}"
fa_file_basename=$(basename "${fa_file}")
# Strip the .fa.gz / .fasta.gz suffix with bash parameter expansion
# (no awk subprocess needed).
stem="${fa_file_basename%.gz}"
stem="${stem%.fa}"
stem="${stem%.fasta}"
output_dir="${stem}.quast_out"
tmp_fa_file="${stem}.fa"
# -k keeps the original .gz; decompress a working copy for QUAST.
pigz -k -c -d "${fa_file}" > "${tmp_fa_file}"
quast -o "${output_dir}" -t "${num_threads}" "${tmp_fa_file}"
2 BUSCO for both genome assembly and annotation QC
You should first extract the longest transcript of each gene, then translate those transcripts into protein sequences before running BUSCO in `proteins` mode.
#!/usr/bin/bash -e
# Run BUSCO (via micromamba) on a genome / protein / transcript FASTA.
MICROMAMBA_EXE=/home/yangrui/softwares/micromamba/bin/micromamba

# FASTA format
# genome assembly/longest protein sequence of each gene/longest transcript sequence of each gene
input_file=/data/database/annotation/release/neovision_vision/Neovision_vision.v1.v2.KIZ_NCBI.basic.chr.annotation.longest_isoforms.period.protein_sequences.fa
# genome/proteins/transcriptome
mode=proteins
# select the nearest dataset for your species
lineage_dataset=/home/yangrui/softwares/busco_datasets/busco_downloads/lineages/carnivora_odb12
num_threads=60
tmp_dir=/data/tmp/yangrui_tmp

cd "${tmp_dir}"
# Name the output directory after the input file: drop the directory part
# and the trailing .fa suffix, then append ".busco_out".
input_name="${input_file##*/}"
output_dir="${input_name%.fa}.busco_out"
"${MICROMAMBA_EXE}" run -n busco-v6 busco -i "${input_file}" -m "${mode}" -l "${lineage_dataset}" -c "${num_threads}" -o "${output_dir}"
Gather the BUSCO short-summary statistics from multiple runs into a single plot:
# Gather BUSCO short-summary JSON files under root_dir and draw one stacked
# horizontal bar per run (percent complete-single / complete-duplicated /
# fragmented / missing), annotated with BUSCO's one-line summary.
library(tidyverse)
library(YRUtils)
library(jsonlite)

root_dir <- "/data/tmp/yangrui_tmp"
# Strip the "short_summary.specific.<lineage>_odbNN." prefix and the
# ".busco_out.json" suffix, leaving only the sample label.
trim_file_pattern <- "^short_summary\\.specific\\.[a-zA-Z0-9]+_odb[0-9]+\\.|\\.busco_out\\.json$"
font_family <- "Arial"
font_size <- 24

json_files <- list.files(root_dir, pattern = "^short_summary\\..+\\.json$", full.names = TRUE, recursive = TRUE)

# Build one tibble per JSON file and combine once at the end instead of
# growing a tibble with bind_rows() inside a loop (which copies the whole
# frame on every iteration).
df <- list_rbind(lapply(json_files, function(json_file) {
  json_obj <- fromJSON(json_file)
  tibble(
    # "<sample>\n<lineage>" axis label.
    name = paste0(gsub(trim_file_pattern, "", basename(json_file)), "\n", json_obj$lineage_dataset$name),
    # C&S = complete & single-copy, C&D = complete & duplicated,
    # F = fragmented, M = missing; levels reversed so C&S ends up at the
    # left of each bar after coord_flip().
    category = factor(c("C&S", "C&D", "F", "M"), levels = rev(c("C&S", "C&D", "F", "M"))),
    percent = c(json_obj$results[["Single copy percentage"]], json_obj$results[["Multi copy percentage"]], json_obj$results[["Fragmented percentage"]], json_obj$results[["Missing percentage"]]),
    one_line_summary = json_obj$results[["one_line_summary"]]
  )
}))

p <- ggplot() +
  # geom_col() is the documented shorthand for geom_bar(stat = "identity").
  geom_col(aes(name, percent, fill = category), data = df) +
  # One summary string per run, anchored near the bar base (y = 2).
  geom_text(aes(name, y, label = one_line_summary),
    data = mutate(distinct(select(df, name, one_line_summary)), y = 2),
    hjust = 0,
    size.unit = "pt",
    size = font_size,
    family = font_family
  ) +
  coord_flip() +
  theme_classic() +
  # axis/legend elements have their own defaults, so set them explicitly
  # rather than relying on inheritance from `text`.
  theme(
    text = element_text(size = font_size, family = font_family),
    axis.text = element_text(size = font_size, family = font_family),
    axis.title = element_text(size = font_size, family = font_family),
    legend.text = element_text(size = font_size, family = font_family),
    legend.title = element_text(size = font_size, family = font_family)
  )
ppreview(p, file = file.path(root_dir, "busco_short_summaries.pdf"))
BUSCO 质量评估参考标准:
| 质量等级 | C% (Complete) (S+D) | S% (Single-Copy) | F% (Fragmented) | M% (Missing) | 描述 |
|---|---|---|---|---|---|
| 优秀 (Excellent) | > 95% | > 90-95% | < 3-5% | < 3-5% | 接近完整 (Near-complete)。通常是发表“参考基因组”级别的目标。 |
| 良好 (Good) | > 90% | > 85-90% | < 5-10% | < 5-10% | 高质量草图 (High-quality draft)。完全足够用于大多数下游分析。 |
| 中等 (Moderate) | 80% - 90% | > 75% | < 10-15% | < 10-15% | 可用草图 (Usable draft)。对于非模式生物或特定研究目的可能已足够。 |
| 较差 (Poor) | < 80% | < 75% | > 15% | > 15% | 高度片段化或不完整。表明组装或注释存在严重问题,可能需要重新测序或重新注释。 |
Note: miscellaneous scripts used to locate the BUSCO lineage dataset closest to your species.
# Download the NCBI taxonomy dump (-c resumes a partial download),
# unpack it, then delete the archive.
wget -c ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
tar -xzvf taxdump.tar.gz
rm taxdump.tar.gz
library(taxizedb)
library(taxize)
library(vroom)
library(tidyverse)
# Translate a character vector of taxonomic names from English to Chinese
# via an OpenAI-compatible chat-completions endpoint.
#
# x           character vector of English taxonomic names
# llm_model   model identifier sent in the request body
# llm_api_url chat-completions endpoint URL
# llm_api_key bearer token (defaults to the LLM_KM_KEY environment variable)
#
# Returns a named character vector: names are the inputs `x`, values are the
# translations. Stops if the model returns a different number of items.
llm_tax_trans_en2cn <- function(
    x,
    llm_model = "kimi-k2-0905-preview",
    llm_api_url = "https://api.moonshot.cn/v1/chat/completions",
    llm_api_key = Sys.getenv("LLM_KM_KEY")) {
  # Single prompt: instructions plus the tab-separated input names; the model
  # is asked to answer with tab-separated translations in the same order.
  prompt <- paste(
    "You are a professional biological taxonomist.",
    "请将下列生物分类学名称翻译为中文,给出每个名称在中国大陆学术文献中最常用、最公认的中文译名;",
    "若尚无公认译名,则用能反映英文原意的中文译名;仍无法翻译时,原样保留英文名称。",
    "严禁输出任何解释、编号、换行或标点,仅用制表符分隔中文结果,输出顺序与输入顺序严格对应。",
    paste0(x, collapse = "\t"),
    sep = "\n"
  )

  api_request <- httr2::request(llm_api_url)
  api_request <- httr2::req_headers(
    api_request,
    "Content-Type" = "application/json",
    "Authorization" = paste("Bearer", llm_api_key)
  )
  api_request <- httr2::req_body_json(api_request, list(
    model = llm_model,
    messages = list(list(
      role = "user",
      content = prompt
    )),
    temperature = 0.2,
    # Rough output budget: ~5 characters per name, ~3 tokens per character.
    max_tokens = length(x) * 5 * 3
  ))

  api_response <- httr2::req_perform(api_request)
  payload <- httr2::resp_body_json(api_response)

  # Split the tab-separated answer back into one translation per input.
  translations <- trimws(strsplit(payload$choices[[1]]$message$content, "\t")[[1]])
  if (length(translations) != length(x)) {
    stop("the length of the result is not exactly the same as the input, and you may need to try it again")
  }
  setNames(translations, x)
}
# Baidu translate credentials for the fanyi package, read from the
# environment so they are never hard-coded in the script.
fy_appid <- Sys.getenv("BD_FY_ID")
fy_key <- Sys.getenv("BD_FY_KEY")
fy_source <- "baidu"
fanyi::set_translate_option(appid = fy_appid, key = fy_key, source = fy_source)
# download NCBI database for offline use
# db_download_ncbi()
# NCBI taxonomy ID only
target_taxid <- 452646
# Raw NCBI taxdump files: merged.dmp maps retired tax IDs to their current
# replacements; nodes.dmp lists all current tax IDs.
ncbi_tax_merged_file <- "/home/yangrui/softwares/ncbi_taxonomy_dbs/merged.dmp"
ncbi_tax_nodes_file <- "/home/yangrui/softwares/ncbi_taxonomy_dbs/nodes.dmp"
# Offline taxizedb snapshot of the NCBI taxonomy database.
ncbi_db_file <- "/home/yangrui/softwares/taxizedbs/NCBI_2025-10-30.sql"
busco_datasets_dir <- "/home/yangrui/softwares/busco_datasets/busco_downloads/lineages"
work_dir <- "/home/yangrui/tmpwd"
src <- src_ncbi(path = ncbi_db_file)
# .dmp files are "|"-delimited with a trailing separator, so vroom emits one
# extra column at the end; drop it (X3 for merged.dmp, X14 for nodes.dmp).
# NOTE(review): column counts assume the standard taxdump layout — confirm
# against the downloaded dump.
ncbi_tax_merged_df <- vroom(ncbi_tax_merged_file, delim = "|", trim_ws = TRUE, col_names = FALSE) %>%
select(-X3) %>%
distinct()
ncbi_tax_nodes_df <- vroom(ncbi_tax_nodes_file, delim = "|", trim_ws = TRUE, col_names = FALSE) %>%
select(-X14) %>%
distinct()
# Report whether the query tax ID is current (in nodes.dmp) or has been
# retired and merged into another ID (in merged.dmp).
if (target_taxid %in% ncbi_tax_nodes_df$X1) {
message("query ID ", target_taxid, " is the latest")
} else if (target_taxid %in% ncbi_tax_merged_df$X1) {
message("query ID ", target_taxid, " is NOT the latest\nthis may cause some problems")
# merged.dmp row for this ID: X1 = old tax ID, X2 = the ID it merged into.
target_taxids <- filter(ncbi_tax_merged_df, X1 == target_taxid) %>%
as.list() %>%
unlist()
message("consider replacing the old [", target_taxids["X1"], "] with the latest [", target_taxids["X2"], "]")
}
# Each BUSCO lineage directory ships an info/species.info table listing the
# species used to build it; collect them all into one tibble tagged with the
# lineage directory name.
busco_datasets_species_files <- list.files(busco_datasets_dir, pattern = "^species\\.info$", full.names = TRUE, recursive = TRUE)
# The lineage name is the directory two levels above info/species.info.
names(busco_datasets_species_files) <- basename(gsub("/info/species\\.info$", "", busco_datasets_species_files))

# Build one tibble per lineage and combine once at the end instead of growing
# a tibble with bind_rows() inside a loop (quadratic copying).
busco_datasets_species_df <- list_rbind(lapply(names(busco_datasets_species_files), function(n) {
  vroom(busco_datasets_species_files[n], col_names = c("tax_id", "species_name")) %>%
    mutate(busco_lineage_name = n) %>%
    na.omit() %>%
    distinct()
}))
# Flag every BUSCO tax ID as current (NEW), retired-and-merged (OLD), or
# absent from both taxdump tables (UNKNOWN).
busco_taxids_df <- busco_datasets_species_df %>%
  select(tax_id) %>%
  distinct() %>%
  mutate(
    # %in% already returns a logical; if_else(cond, TRUE, FALSE) is redundant.
    is_new = tax_id %in% ncbi_tax_nodes_df$X1,
    is_old = tax_id %in% ncbi_tax_merged_df$X1,
    # case_when() reads better than nested if_else(); the first match wins,
    # so an ID present in both tables is flagged "NEW", exactly as before.
    flag = case_when(
      is_new ~ "NEW",
      is_old ~ "OLD",
      .default = "UNKNOWN"
    )
  )
busco_unknown_taxids_df <- filter(busco_taxids_df, flag == "UNKNOWN")
if (nrow(busco_unknown_taxids_df) > 0) {
  warning("these NCBI taxonomy IDs are neither new nor old, and will be removed: ", paste0(busco_unknown_taxids_df$tax_id, collapse = " "))
  busco_taxids_df <- filter(busco_taxids_df, flag != "UNKNOWN")
}
# Map retired IDs to their replacement (merged.dmp: X1 = old, X2 = current).
busco_taxids_df <- busco_taxids_df %>%
  left_join(ncbi_tax_merged_df, by = c("tax_id" = "X1")) %>%
  mutate(final_tax_id = if_else(flag == "OLD", X2, tax_id)) %>%
  select(tax_id, final_tax_id)
# Rewrite the species table so every row carries the current tax ID.
busco_datasets_species_df <- busco_taxids_df %>%
  inner_join(busco_datasets_species_df, by = "tax_id") %>%
  select(-tax_id) %>%
  rename(tax_id = final_tax_id)
# One sub-table of species per BUSCO lineage.
busco_datasets_species_ls <- split(busco_datasets_species_df, busco_datasets_species_df$busco_lineage_name)
busco_taxids <- as.character(unique(na.omit(busco_datasets_species_df$tax_id)))
# Full taxonomic classification path for the target plus every BUSCO species,
# fetched once up front so the per-lineage loop below can reuse them.
tax_paths <- taxizedb::classification(unique(c(target_taxid, busco_taxids)), db = "ncbi")
# Any NA in a classification result means the lookup failed for that ID.
tax_paths_na_idx <- which(sapply(tax_paths, function(df) any(is.na(df))))
if (length(tax_paths_na_idx) > 0) {
stop("elements with numerical indices [", paste0(tax_paths_na_idx, collapse = ", "), "] are NAs in the results returned by taxizedb::classification\nplease check them in detail or delete them before continuing")
}
# For each lineage, find the lowest common ancestor (LCA) of the target
# species and that lineage's species set.
lca_df <- tibble()
for (busco_lineage_name in names(busco_datasets_species_ls)) {
message("dealing with ", busco_lineage_name, " ...")
busco_lineage_taxids <- as.character(unique(na.omit(busco_datasets_species_ls[[busco_lineage_name]]$tax_id)))
tmp_lca_df <- lowest_common(unique(c(target_taxid, busco_lineage_taxids)), class_list = tax_paths)
# NOTE(review): lowest_common appears to return NA when no LCA is found —
# such lineages are silently skipped here; confirm against taxize docs.
if (!any(is.na(tmp_lca_df))) {
lca_df <- bind_rows(lca_df, mutate(tmp_lca_df, busco_lineage_name = busco_lineage_name))
}
}
# Rank the candidate lineages by the depth of their LCA with the target
# (depth = number of ranks in the LCA's own classification path), then add
# Chinese translations of the names via Baidu and via the LLM helper.
lca_df <- lca_df %>%
  mutate(
    lca_depth = vapply(
      id,
      function(taxid) nrow(taxizedb::classification(as.character(taxid), db = "ncbi")[[1]]),
      integer(1)
    )
  ) %>%
  arrange(desc(lca_depth)) %>%
  mutate(
    # "carnivora_odb12" -> "carnivora" before translation.
    clean_busco_lineage_name = gsub("_odb\\d+$", "", busco_lineage_name),
    clean_busco_lineage_name_zh_baidu = fanyi::baidu_translate(clean_busco_lineage_name),
    name_zh_baidu = fanyi::baidu_translate(name)
  )

# LLM translations come back as named vectors; enframe() turns them into
# two-column tibbles for joining.
name_zh_map <- llm_tax_trans_en2cn(unique(lca_df$name))
lca_df <- left_join(
  lca_df,
  enframe(name_zh_map, name = "name", value = "name_zh_llm"),
  by = "name"
)
lineage_zh_map <- llm_tax_trans_en2cn(unique(lca_df$clean_busco_lineage_name))
lca_df <- left_join(
  lca_df,
  enframe(lineage_zh_map, name = "clean_busco_lineage_name", value = "clean_busco_lineage_name_zh_llm"),
  by = "clean_busco_lineage_name"
)

vroom_write(lca_df, file = file.path(work_dir, paste0(target_taxid, ".busco_lca.tsv")))