Build reference genome index and align/quantify reads

1 MobiVision

Caution: mobivision is not really robust for annotations from non-model organisms (it seems fine for Ensembl annotations). It’s highly recommended that you use cellranger instead of mobivison.

Relationships between the number of cells recovered and the expected doublet rates (MobiVision platform):

捕获细胞数	墨卓双胞率
5000	2.50%
6000	3.00%
7000	3.50%
8000	4.00%
9000	4.50%
10000	5.00%
11000	5.50%
12000	6.00%
13000	6.50%
14000	7.00%
15000	7.50%
16000	8.00%
17000	8.50%
18000	9.00%
19000	9.50%
20000	10.00%
21000	10.50%

1.1 Transcriptome

1.1.1 Build transcriptomic reference genome index

One genome file and one annotation file (single mode)

# !/usr/local/bin/Rscript
# check_fix_gtf_for_cellranger.R

args <- commandArgs(trailingOnly = TRUE)
if (length(args) != 1 || !file.exists(args[1])) {
    message("check_fix_gtf_for_cellranger.R example.gtf\n\nImportant: the input file i.e., example.gtf here will be overwritten, so pass a copy of your original file.")
    quit(save = "no", status = 1)
} else {
    message("dealing with ", args[1])
}

suppressMessages(library(rtracklayer))
suppressMessages(library(tidyverse))

gene_biotype_fields <- c("gene_type", "gene_biotype", "biotype")
gene_biotype_default <- "protein_coding"
keep_cols <- c(
    "source", "type", "score", "phase",
    "gene_id", "transcript_id", "gene_name", "gene_biotype",
    "transcript_biotype", "exon_number", "protein_id"
)

gtf_file <- args[1]

gtf <- import(gtf_file, format = "gtf")
df <- as_tibble(as.data.frame(gtf))

exon_df <- filter(df, type == "exon")
if (nrow(exon_df) == 0 || !("gene_id" %in% names(exon_df)) || !("transcript_id" %in% names(exon_df)) || any(is.na(exon_df$gene_id)) || any(is.na(exon_df$transcript_id)) || any(exon_df$gene_id == "") || any(exon_df$transcript_id == "")) {
    stop("some exons don't have 'gene_id' and/or 'transcript_id' attributes")
}

gtf$gene_id <- trimws(gtf$gene_id)
if (any(gtf$gene_id == "") || any(grepl("[ \t\r\n\"]", gtf$gene_id))) {
    stop("some gene_id values are empty")
}

gene_biotype_field <- gene_biotype_fields[which(gene_biotype_fields %in% names(df))][1]
if (!is.na(gene_biotype_field)) {
    gene_id_biotype_df <- df %>%
        mutate(gene_biotype = .data[[gene_biotype_field]]) %>%
        select(gene_id, gene_biotype) %>%
        na.omit() %>%
        distinct()
    gtf$gene_biotype <- gene_id_biotype_df$gene_biotype[match(gtf$gene_id, gene_id_biotype_df$gene_id)]
    gtf$gene_biotype[is.na(gtf$gene_biotype)] <- gene_biotype_default
} else {
    gtf$gene_biotype <- gene_biotype_default
}

if ("gene_name" %in% names(mcols(gtf))) {
    gtf$gene_name[is.na(gtf$gene_name)] <- gtf$gene_id[is.na(gtf$gene_name)]
} else {
    gtf$gene_name <- gtf$gene_id
}
gtf$gene_name[grepl("[ \t\r\n\"]", gtf$gene_name)] <- gtf$gene_id[grepl("[ \t\r\n\"]", gtf$gene_name)]

final_cols <- intersect(keep_cols, names(mcols(gtf)))
mcols(gtf) <- mcols(gtf)[, final_cols]

export(gtf, con = gtf_file, format = "gtf")
message("dealing with ", gtf_file, " done!")

#!/usr/bin/bash -e
uuid_str=$(</proc/sys/kernel/random/uuid)
work_dir=/data/tmp/dell_tmp

cd "${work_dir}"
vim "${uuid_str}".run_mobivision_mkindex.sh
chmod +x "${uuid_str}".run_mobivision_mkindex.sh

log_file="${uuid_str}".run_mobivision_mkindex.log
pid_file="${uuid_str}".run_mobivision_mkindex.pid
nohup ./"${uuid_str}".run_mobivision_mkindex.sh &> "${log_file}" &
echo $! > "${pid_file}"
kill -0 "$(<"${pid_file}")" 2>/dev/null && echo running || echo finished
ps -p "$(<"${pid_file}")" >/dev/null && echo running || echo finished

#!/usr/bin/bash -e
mobivision_source_path=/data/softwares/mobivision-v4.1/source.sh
# <genome build version>_<annotation build version>
build_names=(
  PetaurusBrevicepsV1_PetaurusBrevicepsV3
)
# Gzipped FASTA only
genomes=(
  /data/database/genome/release/petaurus_breviceps/Petaurus_breviceps.v1.self.dna_sm.primary_assembly.fa.gz
)
# Gzipped GTF only
annotations=(
  /data/database/annotation/release/petaurus_breviceps/Petaurus_breviceps.v1.v3.KIZ_NCBI.basic.chr.annotation.gtf.gz
)
output_dirs=(
  /data/database/ref_idx/mobivision_v4.1/transcriptome
)
# true
# any other values will be considered as false
check_gtf=true
root_work_dir=/data/tmp/dell_tmp

# shellcheck source=/dev/null
source ${mobivision_source_path}

(( ${#build_names[@]} == ${#genomes[@]} && ${#genomes[@]} == ${#annotations[@]} && ${#annotations[@]} == ${#output_dirs[@]} )) || {
  echo "ERROR: array lengths are not equal" >&2
  exit 1
}

echo "start running pipeline ..."
for((i=0;i<${#build_names[@]};i++)); do
  uuid_str=$(</proc/sys/kernel/random/uuid)
  work_dir="${root_work_dir}/${uuid_str}"
  mkdir -p "${work_dir}"
  cd "${work_dir}" || exit

  build_name="${build_names[i]}"
  genome="${genomes[i]}"
  annotation="${annotations[i]}"
  output_dir="${output_dirs[i]}"

  log_file="${build_name}".run_mobivision_mkindex.log

  # Generally, avoid running `mobivision fmt_gtf` unless absolutely necessary
  tmp_annotation_fixed=$(basename "${annotation}" .gtf.gz).fixed.gtf
  tmp_annotation=$(basename "${annotation}" .gtf.gz).rm_comments.gtf

  pigz -k -c -d "${annotation}" > "${tmp_annotation_fixed}"
  if [[ "${check_gtf}" == "true" ]]; then
    /data/softwares/misc_r_scripts/check_fix_gtf_for_cellranger.R "${tmp_annotation_fixed}"
  fi

  rm_comments_cmd="grep -v -P '^#' '${tmp_annotation_fixed}' > '${tmp_annotation}'"
  echo "running command: ${rm_comments_cmd}" &> "${log_file}"
  eval "${rm_comments_cmd}"
  ret=$?
  if (( ret != 0 )); then
    echo "ERROR: running ${rm_comments_cmd} failed with exit code ${ret}" &>> "${log_file}"
  else
    echo "successfully running command: ${rm_comments_cmd}" &>> "${log_file}"
  fi

  mobivision_mkindex_cmd="mobivision mkindex -n '${build_name}' -f '${genome}' -g '${tmp_annotation}'"
  echo "running command: ${mobivision_mkindex_cmd}" &>> "${log_file}"
  eval "${mobivision_mkindex_cmd}" &>> "${log_file}"
  ret=$?
  if (( ret != 0 )); then
    echo "ERROR: running ${mobivision_mkindex_cmd} failed with exit code ${ret}" &>> "${log_file}"
  else
    echo "successfully running command: ${mobivision_mkindex_cmd}" &>> "${log_file}"
  fi

  build_date=$(date +%Y%m%d_%H%M%S)
  output_dir="${output_dir}/${build_date}"
  mkdir -p "${output_dir}"
  mv_cmd="mv ${build_name} ${output_dir}/"
  echo "running command: ${mv_cmd}" &>> "${log_file}"
  eval "${mv_cmd}"
  ret=$?
  if (( ret != 0 )); then
    echo "ERROR: running ${mv_cmd} failed with exit code ${ret}" &>> "${log_file}"
  else
    echo "successfully running command: ${mv_cmd}" &>> "${log_file}"
  fi

  metadata_file="${output_dir}/${build_name}".metadata.json
  echo "writing metadata to ${metadata_file}" &>> "${log_file}"
cat <<EOF |
{
  "build_name": "${build_name}",
  "build_date": "${build_date}",
  "software": "mobivision",
  "software_version": "v4.1",
  "library_type": "transcriptome",
  "reference_type": "single",
  "genomes": ["${genome}"],
  "annotations": ["${annotation}"],
  "cmd": ["${rm_comments_cmd}", "${mobivision_mkindex_cmd}", "${mv_cmd}"]
}
EOF
jq '.' > "${metadata_file}"
  echo "successfully writing metadata to ${metadata_file}" &>> "${log_file}"
  mv "${log_file}" "${output_dir}/"

  cd ${root_work_dir} || exit
  rm -rf "${work_dir}"
done
echo "running pipeline done!"

Multiple genome assemblies and their corresponding annotation files (concatenate mode)

Merge all genome assemblies into a single reference sequence and all annotations into a unified annotation set, then map the reads to this consolidated reference.

#!/usr/bin/bash -e
uuid_str=$(</proc/sys/kernel/random/uuid)
work_dir=/data/tmp/dell_tmp

cd "${work_dir}"
vim "${uuid_str}".run_mobivision_mkindex.sh
chmod +x "${uuid_str}".run_mobivision_mkindex.sh

log_file="${uuid_str}".run_mobivision_mkindex.log
pid_file="${uuid_str}".run_mobivision_mkindex.pid
nohup ./"${uuid_str}".run_mobivision_mkindex.sh &> "${log_file}" &
echo $! > "${pid_file}"
kill -0 "$(<"${pid_file}")" 2>/dev/null && echo running || echo finished
ps -p "$(<"${pid_file}")" >/dev/null && echo running || echo finished

#!/usr/bin/bash -e
mobivision_source_path=/data/softwares/mobivision-v4.1/source.sh
build_names=(
    TupaiaBelangeriV3_HTNV76118
    GRCm39GENCODEvM38_PRV531PlasmidV1
)
# Gzipped FASTA only
# Do not use colons (:) in any path
# Separate multiple paths for the same reference with colons
genomes=(
  /data/database/genome/release/tupaia_belangeri/Tupaia_belangeri.v3.TreeShrewDB.dna.primary_assembly.fa.gz:/data/database/genome/release/hantaan_virus_strain_76-118/Hantaan_virus_strain_76-118.KT885047-9__1.NCBI.dna.primary_assembly.fa.gz
  /data/database/genome/release/mus_musculus/Mus_musculus.GRCm39.gencode.dna.primary_assembly.fa.gz:/data/database/genome/release/prv531_plasmid/prv531_plasmid.v1.concatenated_curated_TRINITY_fragments.dna.primary_assembly.fa.gz
)
# Gzipped GTF only
# Do not use colons (:) in any path
# Separate multiple paths for the same reference with colons
annotations=(
  /data/database/annotation/release/tupaia_belangeri/Tupaia_belangeri.v3.v3.TreeShrewDB.basic.chr.annotation.gtf.gz:/data/database/annotation/release/hantaan_virus_strain_76-118/Hantaan_virus_strain_76-118.KT885047-9__1.76118_yanglab.NCBI.basic.chr.annotation.gtf.gz
  /data/database/annotation/release/mus_musculus/Mus_musculus.GRCm39.vM38.gencode.basic.chr.annotation.gtf.gz:/data/database/annotation/release/prv531_plasmid/prv531_plasmid.v1.v1.TRINITY.basic.chr.annotation.gtf.gz
)
output_dirs=(
    /data/database/ref_idx/mobivision_v4.1/transcriptome
    /data/database/ref_idx/mobivision_v4.1/transcriptome
)
root_work_dir=/data/tmp/dell_tmp

# shellcheck source=/dev/null
source ${mobivision_source_path}

(( ${#build_names[@]} == ${#genomes[@]} && ${#genomes[@]} == ${#annotations[@]} && ${#annotations[@]} == ${#output_dirs[@]} )) || {
  echo "ERROR: array lengths are not equal" >&2
  exit 1
}

echo "start running pipeline ..."
for((i=0;i<${#build_names[@]};i++)); do
  uuid_str=$(</proc/sys/kernel/random/uuid)
  work_dir="${root_work_dir}/${uuid_str}"
  mkdir -p "${work_dir}"
  cd "${work_dir}" || exit

  build_name="${build_names[i]}"
  IFS=: read -ra genome <<<"${genomes[i]}"
  IFS=: read -ra annotation <<<"${annotations[i]}"
  output_dir="${output_dirs[i]}"

  log_file="${build_name}".run_mobivision_mkindex.log

  merged_genome="${build_name}".fa
  merge_genome_cmd="pigz -k -c -d ${genome[@]} > '${merged_genome}'"
  echo "running command: ${merge_genome_cmd}" &> "${log_file}"
  eval "${merge_genome_cmd}"
  ret=$?
  if (( ret != 0 )); then
    echo "ERROR: running ${merge_genome_cmd} failed with exit code ${ret}" &>> "${log_file}"
  else
    echo "successfully running command: ${merge_genome_cmd}" &>> "${log_file}"
  fi

  seqnames=$(awk '/^>/ {print $1}' "${merged_genome}")
  seqnames_num=$(wc -l <<<"${seqnames}")
  seqnames_dedup_num=$(sort -u <<<"${seqnames}" | wc -l)
  if [[ ${seqnames_num} -eq ${seqnames_dedup_num} ]]; then
    echo "All chromosome names are unique" &>> "${log_file}"
  else
    echo "ERROR: some chromosome names are duplicated" &>> "${log_file}"
    echo "Duplicate IDs:" &>> "${log_file}"
    sort <<<"${seqnames}" | uniq -d &>> "${log_file}"
    exit 1
  fi

  # Generally, avoid running `mobivision fmt_gtf` unless absolutely necessary
  merged_annotation="${build_name}".rm_comments.gtf
  merge_annotation_cmd="pigz -k -c -d ${annotation[@]} | grep -v -P '^#' > '${merged_annotation}'"
  echo "running command: ${merge_annotation_cmd}" &>> "${log_file}"
  eval "${merge_annotation_cmd}"
  ret=$?
  if (( ret != 0 )); then
    echo "ERROR: running ${merge_annotation_cmd} failed with exit code ${ret}" &>> "${log_file}"
  else
    echo "successfully running command: ${merge_annotation_cmd}" &>> "${log_file}"
  fi

  mobivision_mkindex_cmd="mobivision mkindex -n '${build_name}' -f '${merged_genome}' -g '${merged_annotation}'"
  echo "running command: ${mobivision_mkindex_cmd}" &>> "${log_file}"
  eval "${mobivision_mkindex_cmd}" &>> "${log_file}"
  ret=$?
  if (( ret != 0 )); then
    echo "ERROR: running ${mobivision_mkindex_cmd} failed with exit code ${ret}" &>> "${log_file}"
  else
    echo "successfully running command: ${mobivision_mkindex_cmd}" &>> "${log_file}"
  fi

  build_date=$(date +%Y%m%d_%H%M%S)
  output_dir="${output_dir}/${build_date}"
  mkdir -p "${output_dir}"
  mv_cmd="mv ${build_name} ${output_dir}/"
  echo "running command: ${mv_cmd}" &>> "${log_file}"
  eval "${mv_cmd}"
  ret=$?
  if (( ret != 0 )); then
    echo "ERROR: running ${mv_cmd} failed with exit code ${ret}" &>> "${log_file}"
  else
    echo "successfully running command: ${mv_cmd}" &>> "${log_file}"
  fi

  metadata_file="${output_dir}/${build_name}".metadata.json
  echo "writing metadata to ${metadata_file}" &>> "${log_file}"
  genomes_json=$(printf '%s\n' "${genome[@]}" | jq -R '.' | jq -s '.')
  annotations_json=$(printf '%s\n' "${annotation[@]}" | jq -R '.' | jq -s '.')
cat <<EOF |
{
  "build_name": "${build_name}",
  "build_date": "${build_date}",
  "software": "mobivision",
  "software_version": "v4.1",
  "library_type": "transcriptome",
  "reference_type": "concatenate",
  "genomes": ${genomes_json},
  "annotations": ${annotations_json},
  "cmd": ["${merge_genome_cmd}", "${merge_annotation_cmd}", "${mobivision_mkindex_cmd}", "${mv_cmd}"]
}
EOF
jq '.' > "${metadata_file}"
  echo "successfully writing metadata to ${metadata_file}" &>> "${log_file}"
  mv "${log_file}" "${output_dir}/"

  cd ${root_work_dir} || exit
  rm -rf "${work_dir}"
done
echo "running pipeline done!"

Multiple genome assemblies and their corresponding annotation files (multiple mode)

Map reads to multiple species genomes simultaneously.

#!/usr/bin/bash -e
uuid_str=$(</proc/sys/kernel/random/uuid)
work_dir=/data/tmp/dell_tmp

cd "${work_dir}"
vim "${uuid_str}".run_mobivision_mkindex.sh
chmod +x "${uuid_str}".run_mobivision_mkindex.sh

log_file="${uuid_str}".run_mobivision_mkindex.log
pid_file="${uuid_str}".run_mobivision_mkindex.pid
nohup ./"${uuid_str}".run_mobivision_mkindex.sh &> "${log_file}" &
echo $! > "${pid_file}"
kill -0 "$(<"${pid_file}")" 2>/dev/null && echo running || echo finished
ps -p "$(<"${pid_file}")" >/dev/null && echo running || echo finished

#!/usr/bin/bash -e
mobivision_source_path=/data/softwares/mobivision-v4.1/source.sh
# <genome build version>_<annotation build version>
# Do not use colons (:) in any name
# Separate multiple names for the same reference with colons
build_names=(
  GRCh38_GENCODEv49:GRCm39_GENCODEvM38
)
# Gzipped FASTA only
# Do not use colons (:) in any path
# Separate multiple paths for the same reference with colons
genomes=(
  /data/database/genome/release/homo_sapiens/Homo_sapiens.GRCh38.gencode.dna.primary_assembly.fa.gz:/data/database/genome/release/mus_musculus/Mus_musculus.GRCm39.gencode.dna.primary_assembly.fa.gz
)
# Gzipped GTF only
# Do not use colons (:) in any path
# Separate multiple paths for the same reference with colons
annotations=(
  /data/database/annotation/release/homo_sapiens/Homo_sapiens.GRCh38.v49.gencode.basic.chr.annotation.gtf.gz:/data/database/annotation/release/mus_musculus/Mus_musculus.GRCm39.vM38.gencode.basic.chr.annotation.gtf.gz
)
output_dirs=(
    /data/database/ref_idx/mobivision_v4.1/transcriptome
)
root_work_dir=/data/tmp/dell_tmp

# shellcheck source=/dev/null
source ${mobivision_source_path}

(( ${#build_names[@]} == ${#genomes[@]} && ${#genomes[@]} == ${#annotations[@]} && ${#annotations[@]} == ${#output_dirs[@]} )) || {
  echo "ERROR: array lengths are not equal" >&2
  exit 1
}

echo "start running pipeline ..."
for((i=0;i<${#build_names[@]};i++)); do
  uuid_str=$(</proc/sys/kernel/random/uuid)
  work_dir="${root_work_dir}/${uuid_str}"
  mkdir -p "${work_dir}"
  cd "${work_dir}" || exit

  IFS=: read -ra build_name <<<"${build_names[i]}"
  IFS=: read -ra genome <<<"${genomes[i]}"
  IFS=: read -ra annotation <<<"${annotations[i]}"
  output_dir="${output_dirs[i]}"

  merged_build_name=$(printf '%s_and_' "${build_name[@]}" | sed 's/_and_$//')
  log_file="${merged_build_name}".run_mobivision_mkindex.log
  : > "${log_file}"

  # Generally, avoid running `mobivision fmt_gtf` unless absolutely necessary
  rm_comments_annotations=()
  rm_comments_cmds=()
  for anno in "${annotation[@]}"; do
    tmp_anno=$(basename "${anno}" .gtf.gz).rm_comments.gtf
    rm_comments_cmd="pigz -k -c -d '${anno}' | grep -v -P '^#' > '${tmp_anno}'"
    echo "running command: ${rm_comments_cmd}" &>> "${log_file}"
    eval "${rm_comments_cmd}"
    ret=$?
    if (( ret != 0 )); then
      echo "ERROR: running ${rm_comments_cmd} failed with exit code ${ret}" &>> "${log_file}"
    else
      echo "successfully running command: ${rm_comments_cmd}" &>> "${log_file}"
    fi

    rm_comments_annotations+=("${tmp_anno}")
    rm_comments_cmds+=("${rm_comments_cmd}")
  done

  args=()
  for ((j=0; j<${#genome[@]}; j++)); do
    args+=(-n "${build_name[j]}" -f "${genome[j]}" -g "${rm_comments_annotations[j]}")
  done

  mobivision_mkindex_cmd="mobivision mkindex ${args[@]}"
  echo "running command: ${mobivision_mkindex_cmd}" &>> "${log_file}"
  eval "${mobivision_mkindex_cmd}" &>> "${log_file}"
  ret=$?
  if (( ret != 0 )); then
    echo "ERROR: running ${mobivision_mkindex_cmd} failed with exit code ${ret}" &>> "${log_file}"
  else
    echo "successfully running command: ${mobivision_mkindex_cmd}" &>> "${log_file}"
  fi

  build_date=$(date +%Y%m%d_%H%M%S)
  output_dir="${output_dir}/${build_date}"
  mkdir -p "${output_dir}"
  mv_cmd="mv ${merged_build_name} ${output_dir}/"
  echo "running command: ${mv_cmd}" &>> "${log_file}"
  eval "${mv_cmd}"
  ret=$?
  if (( ret != 0 )); then
    echo "ERROR: running ${mv_cmd} failed with exit code ${ret}" &>> "${log_file}"
  else
    echo "successfully running command: ${mv_cmd}" &>> "${log_file}"
  fi

  metadata_file="${output_dir}/${merged_build_name}".metadata.json
  echo "writing metadata to ${metadata_file}" &>> "${log_file}"
  build_names_json=$(printf '%s\n' "${build_name[@]}" | jq -R '.' | jq -s '.')
  genomes_json=$(printf '%s\n' "${genome[@]}" | jq -R '.' | jq -s '.')
  annotations_json=$(printf '%s\n' "${annotation[@]}" | jq -R '.' | jq -s '.')
  rm_comments_cmds_json=$(printf '"%s", ' "${rm_comments_cmds[@]}" | sed 's/, $//')
cat <<EOF |
{
  "build_name": "${merged_build_name}",
  "build_names": ${build_names_json},
  "build_date": "${build_date}",
  "software": "mobivision",
  "software_version": "v4.1",
  "library_type": "transcriptome",
  "reference_type": "multiple",
  "genomes": ${genomes_json},
  "annotations": ${annotations_json},
  "cmd": [${rm_comments_cmds_json}, "${mobivision_mkindex_cmd}", "${mv_cmd}"]
}
EOF
jq '.' > "${metadata_file}"
  echo "successfully writing metadata to ${metadata_file}" &>> "${log_file}"
  mv "${log_file}" "${output_dir}/"

  cd ${root_work_dir} || exit
  rm -rf "${work_dir}"
done
echo "running pipeline done!"

1.1.2 Quantify transcriptome

#!/usr/bin/bash -e
uuid_str=$(</proc/sys/kernel/random/uuid)
work_dir=/data/tmp/dell_tmp

cd "${work_dir}"
vim "${uuid_str}".run_mobivision_quantify.sh
chmod +x "${uuid_str}".run_mobivision_quantify.sh

log_file="${uuid_str}".run_mobivision_quantify.log
pid_file="${uuid_str}".run_mobivision_quantify.pid
nohup ./"${uuid_str}".run_mobivision_quantify.sh &> "${log_file}" &
echo $! > "${pid_file}"
kill -0 "$(<"${pid_file}")" 2>/dev/null && echo running || echo finished
ps -p "$(<"${pid_file}")" >/dev/null && echo running || echo finished

#!/usr/bin/bash -e
mobivision_source_path=/data/softwares/mobivision-v4.1/source.sh
# Each directory must contain data from only one biological sample
# Multiple read pair files within a single directory will be automatically merged before pipeline execution
# Directory names serve as unique sample identifiers
# Ensure all directory basenames are globally unique across your entire project to prevent sample ID conflicts
fq_dirs=(
    /data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/raw_fq/JZ25176043-250818-Astro
    /data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/raw_fq/JZ25176044-250818-Glioma
    /data/users/yangrui/lab_projs/wuyuechun/TbelangeriKidneyOrganoid_HTNV76118_snRNA-seq_v20250921/raw_fq/JZ25142338-250707-NC
    /data/users/yangrui/lab_projs/wuyuechun/TbelangeriKidneyOrganoid_HTNV76118_snRNA-seq_v20250921/raw_fq/JZ25142339-250707-PC
)
ref_idx_dirs=(
    /data/database/ref_idx/mobivision_v4.1/transcriptome/20250927_000227/GRCm39_GENCODEvM38
    /data/database/ref_idx/mobivision_v4.1/transcriptome/20250927_000227/GRCm39_GENCODEvM38
    /data/database/ref_idx/mobivision_v4.1/transcriptome/20250927_125947/TupaiaBelangeriV3_HTNV76118
    /data/database/ref_idx/mobivision_v4.1/transcriptome/20250927_125947/TupaiaBelangeriV3_HTNV76118
)
output_dirs=(
    /data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res
    /data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res
    /data/users/yangrui/lab_projs/wuyuechun/TbelangeriKidneyOrganoid_HTNV76118_snRNA-seq_v20250921/quant_res
    /data/users/yangrui/lab_projs/wuyuechun/TbelangeriKidneyOrganoid_HTNV76118_snRNA-seq_v20250921/quant_res
)
nthreads=100
root_work_dir=/data/tmp/dell_tmp

# shellcheck source=/dev/null
source ${mobivision_source_path}

(( ${#fq_dirs[@]} == ${#ref_idx_dirs[@]} && ${#ref_idx_dirs[@]} == ${#output_dirs[@]} )) || {
  echo "ERROR: array lengths are not equal" >&2
  exit 1
}

echo "start running pipeline ..."
for ((i=0; i<${#fq_dirs[@]}; i++)); do
  uuid_str=$(</proc/sys/kernel/random/uuid)
  work_dir="${root_work_dir}/${uuid_str}"
  mkdir -p "${work_dir}"
  cd "${work_dir}" || exit

  fq_dir="${fq_dirs[i]}"
  ref_idx_dir="${ref_idx_dirs[i]}"
  output_dir="${output_dirs[i]}"

  run_date=$(date +%Y%m%d_%H%M%S)
  fq_dir_basename=$(basename "${fq_dir}")
  sub_dir="${fq_dir_basename}_${run_date}"
  output_dir="${output_dir}/${sub_dir}"
  mkdir -p "${output_dir}"

  log_file="${output_dir}/${sub_dir}".run_mobivision_quantify.log

  mobivision_quantify_cmd="mobivision quantify -i '${ref_idx_dir}' -t '${nthreads}' -f '${fq_dir}' -s '${fq_dir_basename}' -o '${output_dir}'"
  echo "running command: ${mobivision_quantify_cmd}" &> "${log_file}"
  eval "${mobivision_quantify_cmd}" &>> "${log_file}"
  ret=$?
  if (( ret != 0 )); then
    echo "ERROR: running ${mobivision_quantify_cmd} failed with exit code ${ret}" &>> "${log_file}"
  else
    echo "successfully running command: ${mobivision_quantify_cmd}" &>> "${log_file}"
  fi

  metadata_file="${output_dir}/${sub_dir}".metadata.json
  echo "writing metadata to ${metadata_file}" &>> "${log_file}"
  fastqs=$(find "${fq_dir}" -maxdepth 1 -type f -name "*.fastq.gz" | jq -R '.' | jq -s '.')
cat <<EOF |
{
"quantify_name": "${sub_dir}",
"quantify_date": "${run_date}",
"software": "mobivision",
"software_version": "v4.1",
"library_type": "transcriptome",
"fastqs": ${fastqs},
"reference_index": "${ref_idx_dir}",
"cmd": ["${mobivision_quantify_cmd}"]
}
EOF
jq '.' > "${metadata_file}"
  echo "successfully writing metadata to ${metadata_file}" &>> "${log_file}"

  cd ${root_work_dir} || exit
  rm -rf "${work_dir}"
done
echo "running pipeline done!"

1.2 Epigenome

1.2.1 Build epigenomic reference genome index

One genome file and one annotation file (single mode)

#!/usr/bin/bash -e
uuid_str=$(</proc/sys/kernel/random/uuid)
work_dir=/data/tmp/dell_tmp

cd "${work_dir}"
vim "${uuid_str}".run_mobivision_mk_chip_ref.sh
chmod +x "${uuid_str}".run_mobivision_mk_chip_ref.sh

log_file="${uuid_str}".run_mobivision_mk_chip_ref.log
pid_file="${uuid_str}".run_mobivision_mk_chip_ref.pid
nohup ./"${uuid_str}".run_mobivision_mk_chip_ref.sh &> "${log_file}" &
echo $! > "${pid_file}"
kill -0 "$(<"${pid_file}")" 2>/dev/null && echo running || echo finished
ps -p "$(<"${pid_file}")" >/dev/null && echo running || echo finished

#!/usr/bin/bash -e
mobivision_source_path=/data/softwares/mobivision-v4.1/source.sh
# <genome build version>_<annotation build version>
build_names=(
  PetaurusBrevicepsV1_PetaurusBrevicepsV3
)
# Gzipped FASTA only
genomes=(
  /data/database/genome/release/petaurus_breviceps/Petaurus_breviceps.v1.self.dna_sm.primary_assembly.fa.gz
)
# Gzipped GTF only
annotations=(
  /data/database/annotation/release/petaurus_breviceps/Petaurus_breviceps.v1.v3.KIZ_NCBI.basic.chr.annotation.gtf.gz
)
output_dirs=(
  /data/database/ref_idx/mobivision_v4.1/epigenome
)
# true
# any other values will be considered as false
check_gtf=true
root_work_dir=/data/tmp/dell_tmp

# shellcheck source=/dev/null
source ${mobivision_source_path}

(( ${#build_names[@]} == ${#genomes[@]} && ${#genomes[@]} == ${#annotations[@]} && ${#annotations[@]} == ${#output_dirs[@]} )) || {
  echo "ERROR: array lengths are not equal" >&2
  exit 1
}

echo "start running pipeline ..."
for((i=0;i<${#build_names[@]};i++)); do
  uuid_str=$(</proc/sys/kernel/random/uuid)
  work_dir="${root_work_dir}/${uuid_str}"
  mkdir -p "${work_dir}"
  cd "${work_dir}" || exit

  build_name="${build_names[i]}"
  genome="${genomes[i]}"
  annotation="${annotations[i]}"
  output_dir="${output_dirs[i]}"

  log_file="${build_name}".run_mobivision_mkindex.log

  # Generally, avoid running `mobivision fmt_gtf` unless absolutely necessary
  tmp_annotation_fixed=$(basename "${annotation}" .gtf.gz).fixed.gtf
  tmp_annotation=$(basename "${annotation}" .gtf.gz).rm_comments.gtf

  pigz -k -c -d "${annotation}" > "${tmp_annotation_fixed}"
  if [[ "${check_gtf}" == "true" ]]; then
    /data/softwares/misc_r_scripts/check_fix_gtf_for_cellranger.R "${tmp_annotation_fixed}"
  fi

  rm_comments_cmd="grep -v -P '^#' '${tmp_annotation_fixed}' > '${tmp_annotation}'"
  echo "running command: ${rm_comments_cmd}" &> "${log_file}"
  eval "${rm_comments_cmd}"
  ret=$?
  if (( ret != 0 )); then
    echo "ERROR: running ${rm_comments_cmd} failed with exit code ${ret}" &>> "${log_file}"
  else
    echo "successfully running command: ${rm_comments_cmd}" &>> "${log_file}"
  fi

  mobivision_mk_chip_ref_cmd="mobivision mk_chip_ref -n '${build_name}' -f '${genome}' -g '${tmp_annotation}' -r '${work_dir}'"
  echo "running command: ${mobivision_mk_chip_ref_cmd}" &>> "${log_file}"
  eval "${mobivision_mk_chip_ref_cmd}" &>> "${log_file}"
  ret=$?
  if (( ret != 0 )); then
    echo "ERROR: running ${mobivision_mk_chip_ref_cmd} failed with exit code ${ret}" &>> "${log_file}"
  else
    echo "successfully running command: ${mobivision_mk_chip_ref_cmd}" &>> "${log_file}"
  fi

  build_date=$(date +%Y%m%d_%H%M%S)
  output_dir="${output_dir}/${build_date}"
  mkdir -p "${output_dir}"
  mv_cmd="mv ${build_name} ${output_dir}/"
  echo "running command: ${mv_cmd}" &>> "${log_file}"
  eval "${mv_cmd}"
  ret=$?
  if (( ret != 0 )); then
    echo "ERROR: running ${mv_cmd} failed with exit code ${ret}" &>> "${log_file}"
  else
    echo "successfully running command: ${mv_cmd}" &>> "${log_file}"
  fi

  metadata_file="${output_dir}/${build_name}".metadata.json
  echo "writing metadata to ${metadata_file}" &>> "${log_file}"
cat <<EOF |
{
  "build_name": "${build_name}",
  "build_date": "${build_date}",
  "software": "mobivision",
  "software_version": "v4.1",
  "library_type": "epigenome",
  "reference_type": "single",
  "genomes": ["${genome}"],
  "annotations": ["${annotation}"],
  "cmd": ["${rm_comments_cmd}", "${mobivision_mk_chip_ref_cmd}", "${mv_cmd}"]
}
EOF
jq '.' > "${metadata_file}"
  echo "successfully writing metadata to ${metadata_file}" &>> "${log_file}"
  mv "${log_file}" "${output_dir}/"

  cd ${root_work_dir} || exit
  rm -rf "${work_dir}"
done
echo "running pipeline done!"

1.2.2 Align epigenome

#!/usr/bin/bash -e
uuid_str=$(</proc/sys/kernel/random/uuid)
work_dir=/data/tmp/dell_tmp

cd "${work_dir}"
vim "${uuid_str}".run_mobivision_atac.sh
chmod +x "${uuid_str}".run_mobivision_atac.sh

log_file="${uuid_str}".run_mobivision_atac.log
pid_file="${uuid_str}".run_mobivision_atac.pid
nohup ./"${uuid_str}".run_mobivision_atac.sh &> "${log_file}" &
echo $! > "${pid_file}"
kill -0 "$(<"${pid_file}")" 2>/dev/null && echo running || echo finished
ps -p "$(<"${pid_file}")" >/dev/null && echo running || echo finished

#!/usr/bin/bash -e
mobivision_source_path=/data/softwares/mobivision-v4.1/source.sh
# Each directory must contain data from only one biological sample
# Multiple read pair files within a single directory will be automatically merged before pipeline execution
# Directory names serve as unique sample identifiers
# Ensure all directory basenames are globally unique across your entire project to prevent sample ID conflicts
fq_dirs=(
  /data/database/data/raw/fastq/20250923_163558/snATAC-seq/Sample_JZ25133309-250714A-B-WYL-ZN-250714A-B-WYL-ZN
  /data/database/data/raw/fastq/20250923_163558/snATAC-seq/Sample_JZ25133310-250714A-B-WYL-WZT-250714A-B-WYL-WZT
  /data/database/data/raw/fastq/20250923_163558/snATAC-seq/Sample_JZ25133311-250714A-B-WYL-PC-250714A-B-WYL-PC
  /data/database/data/raw/fastq/20250923_163558/snATAC-seq/Sample_JZ25133312-250714A-B-LYY-HM-250714A-B-LYY-HM
  /data/database/data/raw/fastq/20250923_163558/snATAC-seq/Sample_JZ25133313-250714A-B-LYY-XQN-250714A-B-LYY-XQN
  /data/database/data/raw/fastq/20250923_163558/snATAC-seq/Sample_JZ25133315-250714A-B-LYY-HN-250714A-B-LYY-HN
  /data/database/data/raw/fastq/20250923_163558/snATAC-seq/Sample_JZ25133314-250714A-B-LYY-YP-250714A-B-LYY-YP
)
ref_idx_dirs=(
  /data/database/ref_idx/mobivision_v4.1/epigenome/20250927_175726/GRCm39_GENCODEvM38
  /data/database/ref_idx/mobivision_v4.1/epigenome/20250927_175726/GRCm39_GENCODEvM38
  /data/database/ref_idx/mobivision_v4.1/epigenome/20250927_175726/GRCm39_GENCODEvM38
  /data/database/ref_idx/mobivision_v4.1/epigenome/20250927_175726/GRCm39_GENCODEvM38
  /data/database/ref_idx/mobivision_v4.1/epigenome/20250927_175726/GRCm39_GENCODEvM38
  /data/database/ref_idx/mobivision_v4.1/epigenome/20250927_175726/GRCm39_GENCODEvM38
  /data/database/ref_idx/mobivision_v4.1/epigenome/20250927_190526/PetaurusBrevicepsV1_PetaurusBrevicepsV1
)
output_dirs=(
  /data/database/data/processed/epigenome/snATAC-seq/align_results/20250928_152816
  /data/database/data/processed/epigenome/snATAC-seq/align_results/20250928_152816
  /data/database/data/processed/epigenome/snATAC-seq/align_results/20250928_152816
  /data/database/data/processed/epigenome/snATAC-seq/align_results/20250928_152816
  /data/database/data/processed/epigenome/snATAC-seq/align_results/20250928_152816
  /data/database/data/processed/epigenome/snATAC-seq/align_results/20250928_152816
  /data/database/data/processed/epigenome/snATAC-seq/align_results/20250928_152816
)
# Can be either 'broad' or 'narrow'
peak_type=narrow
nthreads=100
root_work_dir=/data/tmp/dell_tmp

# shellcheck source=/dev/null
source ${mobivision_source_path}

(( ${#fq_dirs[@]} == ${#ref_idx_dirs[@]} && ${#ref_idx_dirs[@]} == ${#output_dirs[@]} )) || {
  echo "ERROR: array lengths are not equal" >&2
  exit 1
}

echo "start running pipeline ..."
for ((i=0; i<${#fq_dirs[@]}; i++)); do
  uuid_str=$(</proc/sys/kernel/random/uuid)
  work_dir="${root_work_dir}/${uuid_str}"
  mkdir -p "${work_dir}"
  cd "${work_dir}" || exit

  fq_dir="${fq_dirs[i]}"
  ref_idx_dir="${ref_idx_dirs[i]}"
  output_dir="${output_dirs[i]}"

  run_date=$(date +%Y%m%d_%H%M%S)
  fq_dir_basename=$(basename "${fq_dir}")
  sub_dir="${fq_dir_basename}_${peak_type}_${run_date}"
  output_dir="${output_dir}/${sub_dir}"
  mkdir -p "${output_dir}"

  log_file="${output_dir}/${sub_dir}".run_mobivision_quantify.log

  mobivision_atac_cmd="mobivision atac --peaktype '${peak_type}' -i '${ref_idx_dir}' -t '${nthreads}' -f '${fq_dir}' -o '${output_dir}'"
  echo "running command: ${mobivision_atac_cmd}" &> "${log_file}"
  eval "${mobivision_atac_cmd}" &>> "${log_file}"
  ret=$?
  if (( ret != 0 )); then
    echo "ERROR: running ${mobivision_atac_cmd} failed with exit code ${ret}" &>> "${log_file}"
  else
    echo "successfully running command: ${mobivision_atac_cmd}" &>> "${log_file}"
  fi

  metadata_file="${output_dir}/${sub_dir}".metadata.json
  echo "writing metadata to ${metadata_file}" &>> "${log_file}"
  fastqs=$(find "${fq_dir}" -maxdepth 1 -type f -name "*.fastq.gz" | jq -R '.' | jq -s '.')
cat <<EOF |
{
"quantify_name": "${sub_dir}",
"quantify_date": "${run_date}",
"software": "mobivision",
"software_version": "v4.1",
"library_type": "epigenome",
"fastqs": ${fastqs},
"reference_index": "${ref_idx_dir}",
"cmd": ["${mobivision_atac_cmd}"]
}
EOF
jq '.' > "${metadata_file}"
  echo "successfully writing metadata to ${metadata_file}" &>> "${log_file}"

  cd ${root_work_dir} || exit
  rm -rf "${work_dir}"
done
echo "running pipeline done!"

2 CellRanger

2.1 Transcriptome

2.1.1 Build transcriptomic reference genome index

# !/usr/local/bin/Rscript
# check_fix_gtf_for_cellranger.R

args <- commandArgs(trailingOnly = TRUE)
if (length(args) != 1 || !file.exists(args[1])) {
    message("check_fix_gtf_for_cellranger.R example.gtf\n\nImportant: the input file i.e., example.gtf here will be overwritten, so pass a copy of your original file.")
    quit(save = "no", status = 1)
} else {
    message("dealing with ", args[1])
}

suppressMessages(library(rtracklayer))
suppressMessages(library(tidyverse))

gene_biotype_fields <- c("gene_type", "gene_biotype", "biotype")
gene_biotype_default <- "protein_coding"
keep_cols <- c(
    "source", "type", "score", "phase",
    "gene_id", "transcript_id", "gene_name", "gene_biotype",
    "transcript_biotype", "exon_number", "protein_id"
)

gtf_file <- args[1]

gtf <- import(gtf_file, format = "gtf")
df <- as_tibble(as.data.frame(gtf))

exon_df <- filter(df, type == "exon")
if (nrow(exon_df) == 0 || !("gene_id" %in% names(exon_df)) || !("transcript_id" %in% names(exon_df)) || any(is.na(exon_df$gene_id)) || any(is.na(exon_df$transcript_id)) || any(exon_df$gene_id == "") || any(exon_df$transcript_id == "")) {
    stop("some exons don't have 'gene_id' and/or 'transcript_id' attributes")
}

gtf$gene_id <- trimws(gtf$gene_id)
if (any(gtf$gene_id == "") || any(grepl("[ \t\r\n\"]", gtf$gene_id))) {
    stop("some gene_id values are empty")
}

gene_biotype_field <- gene_biotype_fields[which(gene_biotype_fields %in% names(df))][1]
if (!is.na(gene_biotype_field)) {
    gene_id_biotype_df <- df %>%
        mutate(gene_biotype = .data[[gene_biotype_field]]) %>%
        select(gene_id, gene_biotype) %>%
        na.omit() %>%
        distinct()
    gtf$gene_biotype <- gene_id_biotype_df$gene_biotype[match(gtf$gene_id, gene_id_biotype_df$gene_id)]
    gtf$gene_biotype[is.na(gtf$gene_biotype)] <- gene_biotype_default
} else {
    gtf$gene_biotype <- gene_biotype_default
}

if ("gene_name" %in% names(mcols(gtf))) {
    gtf$gene_name[is.na(gtf$gene_name)] <- gtf$gene_id[is.na(gtf$gene_name)]
} else {
    gtf$gene_name <- gtf$gene_id
}
gtf$gene_name[grepl("[ \t\r\n\"]", gtf$gene_name)] <- gtf$gene_id[grepl("[ \t\r\n\"]", gtf$gene_name)]

final_cols <- intersect(keep_cols, names(mcols(gtf)))
mcols(gtf) <- mcols(gtf)[, final_cols]

export(gtf, con = gtf_file, format = "gtf")
message("dealing with ", gtf_file, " done!")

#!/usr/bin/bash -e
annotation=/data/database/annotation/release/petaurus_breviceps/Petaurus_breviceps.v1.v3.KIZ_NCBI.basic.chr.annotation.gtf.gz
genome=/data/database/genome/release/petaurus_breviceps/Petaurus_breviceps.v1.self.dna_sm.primary_assembly.fa.gz
build_name=PetaurusBrevicepsV1_PetaurusBrevicepsV3
nthreads=120
memgb=1024
# true
# any other values will be considered as false
check_gtf=true
work_dir=/data/tmp/dell_tmp

cd "${work_dir}"

tmp_genome=genome.fa
pigz -k -c -d "${genome}" > "${tmp_genome}"

tmp_annotation=annotation.gtf
pigz -k -c -d "${annotation}" > "${tmp_annotation}"
if [[ "${check_gtf}" == "true" ]]; then
  /data/softwares/misc_r_scripts/check_fix_gtf_for_cellranger.R "${tmp_annotation}"
fi

cellranger mkref --genome "${build_name}" --fasta "${tmp_genome}" \
  --genes "${tmp_annotation}" --nthreads "${nthreads}" --memgb "${memgb}" \
  --localcores "${nthreads}" --localmem "${memgb}"

metadata_file="${build_name}".metadata.json
cat <<EOF |
{
  "build_name": "${build_name}",
  "check_gtf": "${check_gtf}",
  "genomes": ["${genome}"],
  "annotations": ["${annotation}"]
}
EOF
jq '.' > "${metadata_file}"

rm -f "${tmp_genome}" "${tmp_annotation}"

2.1.2 Quantify transcriptome for a single sample

对于单细胞转录组，应根据你的实验设计、建库与测序方案决定在哪一个阶段合并样本，例如：对于同一样本、同一库、不同测序芯片多次测序，应在比对定量之前合并 FASTQ 文件；而对于同一样本、不同库、同一或不同测序芯片，应在比对定量后进行数据整合（以库为最小整合单位，即 technical replicates）。

#!/usr/bin/bash -e
ulimit -Sn 16384

ref_idx=/data/database/ref_idx/cellranger_v8.0.1/transcriptome/20251110_124502/PetaurusBrevicepsV1_PetaurusBrevicepsV3
fastqs_path=/data/database/data/raw/fastq/20250923_163558/snRNA-seq/Sample_JZ25136889-250710-YP-250710-YP/mobiTo10x/tenx
id=Sample_JZ25136889-250710-YP-250710-YP
sample=YP
nthreads=120
memgb=1024
work_dir=/data/tmp/dell_tmp

cd "${work_dir}"

log_file="${id}".run_10x_cellranger_count.log

cellranger count --id="${id}" --transcriptome="${ref_idx}" \
  --fastqs="${fastqs_path}" --sample="${sample}" \
  --create-bam=true --localcores="${nthreads}" \
  --localmem="${memgb}" &>> "${log_file}"

metadata_file="${id}".metadata.json
fastqs=$(find "${fastqs_path}" -maxdepth 1 -type f -name "*.fastq.gz" | jq -R '.' | jq -s '.')
cat <<EOF |
{
"software": "cellranger",
"software_version": "v8.0.1",
"library_type": "transcriptome",
"fastqs": ${fastqs},
"reference_index": "${ref_idx}"
}
EOF
jq '.' > "${metadata_file}"

2.1.3 Quantify transcriptome in batch

#!/usr/bin/bash -e
ulimit -Sn 16384

ref_idx=/data/database/ref_idx/cellranger_v8.0.1/transcriptome/20251206_104658/MustelaPutoriusFuroV1_MustelaPutoriusFuroV1
fastqs_path=/data/tmp/dell_tmp/ferret_10x_fastq
sample_file=/data/users/dell/temp/sample.txt
nthreads=120
memgb=1024
work_dir=/data/tmp/dell_tmp

cd "${work_dir}"

while IFS= read -r sample
do
  echo "processing ${sample} ..."
  log_file="${sample}".run_10x_cellranger_count.log

  cellranger count --id="${sample}" --transcriptome="${ref_idx}" \
    --fastqs="${fastqs_path}" --sample="${sample}" \
    --create-bam=true --localcores="${nthreads}" \
    --localmem="${memgb}" &>> "${log_file}"

  metadata_file="${sample}".metadata.json
  fastqs=$(find "${fastqs_path}" -maxdepth 1 -type f -name "${sample}*.fastq.gz" | jq -R '.' | jq -s '.')
  cellranger_version=$(cellranger --version)
cat <<EOF |
{
"software": "cellranger",
"software_version": "${cellranger_version}",
"library_type": "transcriptome",
"fastqs": ${fastqs},
"reference_index": "${ref_idx}"
}
EOF
jq '.' > "${metadata_file}"
  echo "processing ${sample} done!"
done < "${sample_file}"

2.2 Epigenome

2.2.1 Build epigenomic reference genome index

2.2.2 Align epigenome

2.3 Multiome

2.3.1 Build multiomic reference genome index

2.3.2 Quantify and align multiome

3 Misc

3.1 Convert Mobi 3’ transcriptome reads to 10X-compatible format

/data/softwares/mobiTo10x/convert_3RNA_mobi_to_tenx -r1 JZ25136889-250710-YP-250710-YP_combined_R1.fastq.gz -r2 JZ25136889-250710-YP-250710-YP_combined_R2.fastq.gz -o mobiTo10x -p YP -l /data/softwares/mobiTo10x/3M-february-2018.txt

3.2 Judge read types (R1/R2) based on 10X barcode inclusion list

Only support two read files (i.e., R1 & R2).

If your read files include I1, I2, etc. you should identify and remove them first.

using CodecZlib
using YRUtils
using DataFrames
using CSV
using FASTX

barcodes_dict_files = Dict(
    Symbol("sc_3p_v2") => "/data/softwares/cellranger-8.0.1/lib/python/cellranger/barcodes/737K-august-2016.txt",
    Symbol("sc_3p_v3") => "/data/softwares/cellranger-8.0.1/lib/python/cellranger/barcodes/3M-february-2018.txt.gz",
)
barcodes_length_dict = Dict(
    Symbol("sc_3p_v2") => 16,
    Symbol("sc_3p_v3") => 16,
)
sra_dump_dir = "/data/tmp/dell_tmp/tmp_downloads"
output_dir = "/home/yangrui/mywd/minkproj_v20251001/test"
sample_num = 1000000

barcodes_dict = Dict(k => Set{String}() for k in keys(barcodes_dict_files))
for k in keys(barcodes_dict_files)
    if endswith(barcodes_dict_files[k], ".gz")
        barcodes = open(barcodes_dict_files[k], "r") do io
            readlines(GzipDecompressorStream(io))
        end
    else
        barcodes = open(barcodes_dict_files[k], "r") do io
            readlines(io)
        end
    end

    barcodes = strip.(barcodes)
    barcodes = convert(Vector{String}, barcodes)
    barcodes = barcodes[.!isempty.(barcodes)]
    barcodes_dict[k] = Set(barcodes)
end

fq_files = YRUtils.BaseUtils.list_files(sra_dump_dir, r"\.(fq|fastq)(\.gz)?$"; recursive = true, full_name = true)
fq_files_hit_dict = Dict(Symbol(k) => Dict(k => 0 for k in keys(barcodes_dict_files)) for k in fq_files)
fq_files_count_dict = Dict(Symbol(k) => 0 for k in fq_files)
for fq_file in keys(fq_files_hit_dict)
    if endswith(string(fq_file), ".gz")
        FASTQReader(GzipDecompressorStream(open(string(fq_file), "r"))) do reader
            record = FASTQ.Record()
            i = 0
            while !eof(reader) && i < sample_num
                i += 1
                empty!(record)
                read!(reader, record)

                seq = sequence(record)
                for barcodes_type in keys(barcodes_dict_files)
                    if length(seq) >= barcodes_length_dict[barcodes_type]
                        fq_files_hit_dict[fq_file][barcodes_type] += seq[firstindex(seq):nextind(seq, firstindex(seq), barcodes_length_dict[barcodes_type] - 1)] in barcodes_dict[barcodes_type]
                    end
                end
            end
            fq_files_count_dict[fq_file] = i
        end
    else
        FASTQReader(open(string(fq_file), "r")) do reader
            record = FASTQ.Record()
            i = 0
            while !eof(reader) && i < sample_num
                i += 1
                empty!(record)
                read!(reader, record)

                seq = sequence(record)
                for barcodes_type in keys(barcodes_dict_files)
                    if length(seq) >= barcodes_length_dict[barcodes_type]
                        fq_files_hit_dict[fq_file][barcodes_type] += seq[firstindex(seq):nextind(seq, firstindex(seq), barcodes_length_dict[barcodes_type] - 1)] in barcodes_dict[barcodes_type]
                    end
                end
            end
            fq_files_count_dict[fq_file] = i
        end
    end
end

res_vec = Any[]
for fq_file in keys(fq_files_hit_dict)
    push!(res_vec, (fq_file = string(fq_file), fq_files_hit_dict[fq_file]..., total_count = fq_files_count_dict[fq_file]))
end
res_df = DataFrame(res_vec)
CSV.write(joinpath(output_dir, "barcodes_eval.tsv"), res_df; header = true, delim = "\t", append = false)

3.3 Examples to rename FASTQ files downloaded from NCBI GEO

library(tidyverse)
library(vroom)

work_dir <- "/data/users/dell/temp"
setwd(work_dir)

file_process_meta_file <- "file_processing_metadata.tsv"
sra_run_table_file <- "SraRunTable.csv"
srr_acc_list_file <- "SRR_Acc_List.txt"

barcodes_eval_file <- "barcodes_eval.tsv"
barcodes_type_cols <- c("sc_3p_v2", "sc_3p_v3")
barcodes_pct_th <- 90

output_dir <- "/data/tmp/dell_tmp/tmp_downloads"

file_process_meta_df <- vroom(file_process_meta_file)
sra_run_table_df <- vroom(sra_run_table_file)
srr_acc_list_df <- vroom(srr_acc_list_file, col_names = "Run", delim = "\t")

if (all(srr_acc_list_df$Run %in% sra_run_table_df$Run) && all(sra_run_table_df$Run %in% srr_acc_list_df$Run)) {
    message("all runs are valid: OK")
} else {
    warning("all runs are valid: NO")
}

file_process_meta_df <- file_process_meta_df %>%
    select(geo_gsm_acc, stage, tissue, layer, bio_rep, sub_dir) %>%
    rename(area = tissue, bio_sample = sub_dir) %>%
    distinct()

barcodes_eval_df <- vroom(barcodes_eval_file)
barcodes_eval_df <- barcodes_eval_df %>%
    pivot_longer(cols = all_of(barcodes_type_cols), names_to = "barcodes_type", values_to = "barcodes_num") %>%
    mutate(
        barcodes_pct = barcodes_num / total_count * 100,
        Run = str_extract(basename(fq_file), "^SRR[0-9]+")
    )
best_barcodes_type_df <- barcodes_eval_df %>%
    group_by(Run) %>%
    slice_max(order_by = barcodes_pct, n = 1) %>%
    slice_sample(n = 1) %>%
    mutate(barcodes_type_validity = if_else(barcodes_pct > barcodes_pct_th, TRUE, FALSE)) %>%
    select(barcodes_type, Run, barcodes_type_validity) %>%
    distinct()

suspicious_runs <- best_barcodes_type_df$Run[!best_barcodes_type_df$barcodes_type_validity]
warning("these runs are suspicious: ", paste0(suspicious_runs, collapse = " "))
filter(barcodes_eval_df, Run %in% suspicious_runs) %>%
    arrange(Run, fq_file, desc(barcodes_pct)) %>%
    as.data.frame()

barcodes_eval_df <- inner_join(barcodes_eval_df, best_barcodes_type_df, by = c("Run", "barcodes_type")) %>%
    arrange(Run, fq_file, desc(barcodes_pct))
barcodes_eval_df <- lapply(split(barcodes_eval_df, barcodes_eval_df$Run), function(x) {
    if (nrow(x) != 2) {
        stop("only two files supported")
    }
    x %>%
        arrange(desc(barcodes_pct)) %>%
        mutate(read_type = seq_len(n()))
}) %>% bind_rows()

keep_cols <- c(
    "Run", "Developmental_Stage", "Region", "germinal_zone",
    "litter", "tissue", "Sample Name", "Assay Type",
    "BioProject", "BioSample", "Experiment",
    "Instrument", "Library Name", "LibraryLayout",
    "LibrarySelection", "LibrarySource", "Organism", "Platform",
    "sequencing_flowcell", "source_name", "SRA Study"
)
df <- sra_run_table_df %>%
    select(all_of(keep_cols)) %>%
    inner_join(file_process_meta_df, by = c("Sample Name" = "geo_gsm_acc")) %>%
    arrange(stage, area, layer, bio_rep) %>%
    group_by(bio_sample) %>%
    mutate(
        tech_rep = seq_len(n()),
        tech_sample = paste0(bio_sample, "_part", tech_rep)
    ) %>%
    ungroup() %>%
    inner_join(barcodes_eval_df, by = "Run") %>%
    mutate(
        new_fq_file_basename = paste0(bio_sample, "_S1_L00", tech_rep, "_R", read_type, "_001.fastq"),
        new_fq_file = file.path(output_dir, new_fq_file_basename)
    ) %>%
    arrange(Run, new_fq_file)

vroom_write(df, file = "metadata.tsv")

ok <- file.rename(df$fq_file, df$new_fq_file)
if (all(ok)) {
    message("all are well done")
} else {
    stop("some tasks were failed")
}
ok <- sapply(df$new_fq_file, function(x) system2(command = "pigz", args = x))
if (all(ok == 0)) {
    message("all are well done")
} else {
    stop("some tasks were failed")
}