using CodecZlib
using YRUtils
using DataFrames
using CSV
using FASTX
barcodes_dict_files = Dict(
Symbol("sc_3p_v2") => "/data/softwares/cellranger-8.0.1/lib/python/cellranger/barcodes/737K-august-2016.txt",
Symbol("sc_3p_v3") => "/data/softwares/cellranger-8.0.1/lib/python/cellranger/barcodes/3M-february-2018.txt.gz",
)
barcodes_length_dict = Dict(
Symbol("sc_3p_v2") => 16,
Symbol("sc_3p_v3") => 16,
)
sra_dump_dir = "/data/tmp/dell_tmp/tmp_downloads"
output_dir = "/home/yangrui/mywd/minkproj_v20251001/test"
sample_num = 1000000
barcodes_dict = Dict(k => Set{String}() for k in keys(barcodes_dict_files))
for k in keys(barcodes_dict_files)
if endswith(barcodes_dict_files[k], ".gz")
barcodes = open(barcodes_dict_files[k], "r") do io
readlines(GzipDecompressorStream(io))
end
else
barcodes = open(barcodes_dict_files[k], "r") do io
readlines(io)
end
end
barcodes = strip.(barcodes)
barcodes = convert(Vector{String}, barcodes)
barcodes = barcodes[.!isempty.(barcodes)]
barcodes_dict[k] = Set(barcodes)
end
fq_files = YRUtils.BaseUtils.list_files(sra_dump_dir, r"\.(fq|fastq)(\.gz)?$"; recursive = true, full_name = true)
fq_files_hit_dict = Dict(Symbol(k) => Dict(k => 0 for k in keys(barcodes_dict_files)) for k in fq_files)
fq_files_count_dict = Dict(Symbol(k) => 0 for k in fq_files)
for fq_file in keys(fq_files_hit_dict)
if endswith(string(fq_file), ".gz")
FASTQReader(GzipDecompressorStream(open(string(fq_file), "r"))) do reader
record = FASTQ.Record()
i = 0
while !eof(reader) && i < sample_num
i += 1
empty!(record)
read!(reader, record)
seq = sequence(record)
for barcodes_type in keys(barcodes_dict_files)
if length(seq) >= barcodes_length_dict[barcodes_type]
fq_files_hit_dict[fq_file][barcodes_type] += seq[firstindex(seq):nextind(seq, firstindex(seq), barcodes_length_dict[barcodes_type] - 1)] in barcodes_dict[barcodes_type]
end
end
end
fq_files_count_dict[fq_file] = i
end
else
FASTQReader(open(string(fq_file), "r")) do reader
record = FASTQ.Record()
i = 0
while !eof(reader) && i < sample_num
i += 1
empty!(record)
read!(reader, record)
seq = sequence(record)
for barcodes_type in keys(barcodes_dict_files)
if length(seq) >= barcodes_length_dict[barcodes_type]
fq_files_hit_dict[fq_file][barcodes_type] += seq[firstindex(seq):nextind(seq, firstindex(seq), barcodes_length_dict[barcodes_type] - 1)] in barcodes_dict[barcodes_type]
end
end
end
fq_files_count_dict[fq_file] = i
end
end
end
res_vec = Any[]
for fq_file in keys(fq_files_hit_dict)
push!(res_vec, (fq_file = string(fq_file), fq_files_hit_dict[fq_file]..., total_count = fq_files_count_dict[fq_file]))
end
res_df = DataFrame(res_vec)
CSV.write(joinpath(output_dir, "barcodes_eval.tsv"), res_df; header = true, delim = "\t", append = false)1 MobiVision
Caution: mobivision is not really robust for annotations from non-model organisms (it seems fine for Ensembl annotations). It’s highly recommended that you use cellranger instead of mobivison.
Relationships between the number of cells recovered and the expected doublet rates (MobiVision platform):
| 捕获细胞数 | 墨卓双胞率 |
|---|---|
| 5000 | 2.50% |
| 6000 | 3.00% |
| 7000 | 3.50% |
| 8000 | 4.00% |
| 9000 | 4.50% |
| 10000 | 5.00% |
| 11000 | 5.50% |
| 12000 | 6.00% |
| 13000 | 6.50% |
| 14000 | 7.00% |
| 15000 | 7.50% |
| 16000 | 8.00% |
| 17000 | 8.50% |
| 18000 | 9.00% |
| 19000 | 9.50% |
| 20000 | 10.00% |
| 21000 | 10.50% |
1.1 Transcriptome
1.1.1 Build transcriptomic reference genome index
- One genome file and one annotation file (
singlemode)
# !/usr/local/bin/Rscript
# check_fix_gtf_for_cellranger.R
args <- commandArgs(trailingOnly = TRUE)
if (length(args) != 1 || !file.exists(args[1])) {
message("check_fix_gtf_for_cellranger.R example.gtf\n\nImportant: the input file i.e., example.gtf here will be overwritten, so pass a copy of your original file.")
quit(save = "no", status = 1)
} else {
message("dealing with ", args[1])
}
suppressMessages(library(rtracklayer))
suppressMessages(library(tidyverse))
gene_biotype_fields <- c("gene_type", "gene_biotype", "biotype")
gene_biotype_default <- "protein_coding"
keep_cols <- c(
"source", "type", "score", "phase",
"gene_id", "transcript_id", "gene_name", "gene_biotype",
"transcript_biotype", "exon_number", "protein_id"
)
gtf_file <- args[1]
gtf <- import(gtf_file, format = "gtf")
df <- as_tibble(as.data.frame(gtf))
exon_df <- filter(df, type == "exon")
if (nrow(exon_df) == 0 || !("gene_id" %in% names(exon_df)) || !("transcript_id" %in% names(exon_df)) || any(is.na(exon_df$gene_id)) || any(is.na(exon_df$transcript_id)) || any(exon_df$gene_id == "") || any(exon_df$transcript_id == "")) {
stop("some exons don't have 'gene_id' and/or 'transcript_id' attributes")
}
gtf$gene_id <- trimws(gtf$gene_id)
if (any(gtf$gene_id == "") || any(grepl("[ \t\r\n\"]", gtf$gene_id))) {
stop("some gene_id values are empty")
}
gene_biotype_field <- gene_biotype_fields[which(gene_biotype_fields %in% names(df))][1]
if (!is.na(gene_biotype_field)) {
gene_id_biotype_df <- df %>%
mutate(gene_biotype = .data[[gene_biotype_field]]) %>%
select(gene_id, gene_biotype) %>%
na.omit() %>%
distinct()
gtf$gene_biotype <- gene_id_biotype_df$gene_biotype[match(gtf$gene_id, gene_id_biotype_df$gene_id)]
gtf$gene_biotype[is.na(gtf$gene_biotype)] <- gene_biotype_default
} else {
gtf$gene_biotype <- gene_biotype_default
}
if ("gene_name" %in% names(mcols(gtf))) {
gtf$gene_name[is.na(gtf$gene_name)] <- gtf$gene_id[is.na(gtf$gene_name)]
} else {
gtf$gene_name <- gtf$gene_id
}
gtf$gene_name[grepl("[ \t\r\n\"]", gtf$gene_name)] <- gtf$gene_id[grepl("[ \t\r\n\"]", gtf$gene_name)]
final_cols <- intersect(keep_cols, names(mcols(gtf)))
mcols(gtf) <- mcols(gtf)[, final_cols]
export(gtf, con = gtf_file, format = "gtf")
message("dealing with ", gtf_file, " done!")
#!/usr/bin/bash -e
uuid_str=$(</proc/sys/kernel/random/uuid)
work_dir=/data/tmp/dell_tmp
cd "${work_dir}"
vim "${uuid_str}".run_mobivision_mkindex.sh
chmod +x "${uuid_str}".run_mobivision_mkindex.sh
log_file="${uuid_str}".run_mobivision_mkindex.log
pid_file="${uuid_str}".run_mobivision_mkindex.pid
nohup ./"${uuid_str}".run_mobivision_mkindex.sh &> "${log_file}" &
echo $! > "${pid_file}"
kill -0 "$(<"${pid_file}")" 2>/dev/null && echo running || echo finished
ps -p "$(<"${pid_file}")" >/dev/null && echo running || echo finished
#!/usr/bin/bash -e
mobivision_source_path=/data/softwares/mobivision-v4.1/source.sh
# <genome build version>_<annotation build version>
build_names=(
PetaurusBrevicepsV1_PetaurusBrevicepsV3
)
# Gzipped FASTA only
genomes=(
/data/database/genome/release/petaurus_breviceps/Petaurus_breviceps.v1.self.dna_sm.primary_assembly.fa.gz
)
# Gzipped GTF only
annotations=(
/data/database/annotation/release/petaurus_breviceps/Petaurus_breviceps.v1.v3.KIZ_NCBI.basic.chr.annotation.gtf.gz
)
output_dirs=(
/data/database/ref_idx/mobivision_v4.1/transcriptome
)
# true
# any other values will be considered as false
check_gtf=true
root_work_dir=/data/tmp/dell_tmp
# shellcheck source=/dev/null
source ${mobivision_source_path}
(( ${#build_names[@]} == ${#genomes[@]} && ${#genomes[@]} == ${#annotations[@]} && ${#annotations[@]} == ${#output_dirs[@]} )) || {
echo "ERROR: array lengths are not equal" >&2
exit 1
}
echo "start running pipeline ..."
for((i=0;i<${#build_names[@]};i++)); do
uuid_str=$(</proc/sys/kernel/random/uuid)
work_dir="${root_work_dir}/${uuid_str}"
mkdir -p "${work_dir}"
cd "${work_dir}" || exit
build_name="${build_names[i]}"
genome="${genomes[i]}"
annotation="${annotations[i]}"
output_dir="${output_dirs[i]}"
log_file="${build_name}".run_mobivision_mkindex.log
# Generally, avoid running `mobivision fmt_gtf` unless absolutely necessary
tmp_annotation_fixed=$(basename "${annotation}" .gtf.gz).fixed.gtf
tmp_annotation=$(basename "${annotation}" .gtf.gz).rm_comments.gtf
pigz -k -c -d "${annotation}" > "${tmp_annotation_fixed}"
if [[ "${check_gtf}" == "true" ]]; then
/data/softwares/misc_r_scripts/check_fix_gtf_for_cellranger.R "${tmp_annotation_fixed}"
fi
rm_comments_cmd="grep -v -P '^#' '${tmp_annotation_fixed}' > '${tmp_annotation}'"
echo "running command: ${rm_comments_cmd}" &> "${log_file}"
eval "${rm_comments_cmd}"
ret=$?
if (( ret != 0 )); then
echo "ERROR: running ${rm_comments_cmd} failed with exit code ${ret}" &>> "${log_file}"
else
echo "successfully running command: ${rm_comments_cmd}" &>> "${log_file}"
fi
mobivision_mkindex_cmd="mobivision mkindex -n '${build_name}' -f '${genome}' -g '${tmp_annotation}'"
echo "running command: ${mobivision_mkindex_cmd}" &>> "${log_file}"
eval "${mobivision_mkindex_cmd}" &>> "${log_file}"
ret=$?
if (( ret != 0 )); then
echo "ERROR: running ${mobivision_mkindex_cmd} failed with exit code ${ret}" &>> "${log_file}"
else
echo "successfully running command: ${mobivision_mkindex_cmd}" &>> "${log_file}"
fi
build_date=$(date +%Y%m%d_%H%M%S)
output_dir="${output_dir}/${build_date}"
mkdir -p "${output_dir}"
mv_cmd="mv ${build_name} ${output_dir}/"
echo "running command: ${mv_cmd}" &>> "${log_file}"
eval "${mv_cmd}"
ret=$?
if (( ret != 0 )); then
echo "ERROR: running ${mv_cmd} failed with exit code ${ret}" &>> "${log_file}"
else
echo "successfully running command: ${mv_cmd}" &>> "${log_file}"
fi
metadata_file="${output_dir}/${build_name}".metadata.json
echo "writing metadata to ${metadata_file}" &>> "${log_file}"
cat <<EOF |
{
"build_name": "${build_name}",
"build_date": "${build_date}",
"software": "mobivision",
"software_version": "v4.1",
"library_type": "transcriptome",
"reference_type": "single",
"genomes": ["${genome}"],
"annotations": ["${annotation}"],
"cmd": ["${rm_comments_cmd}", "${mobivision_mkindex_cmd}", "${mv_cmd}"]
}
EOF
jq '.' > "${metadata_file}"
echo "successfully writing metadata to ${metadata_file}" &>> "${log_file}"
mv "${log_file}" "${output_dir}/"
cd ${root_work_dir} || exit
rm -rf "${work_dir}"
done
echo "running pipeline done!"
- Multiple genome assemblies and their corresponding annotation files (
concatenatemode)
Merge all genome assemblies into a single reference sequence and all annotations into a unified annotation set, then map the reads to this consolidated reference.
#!/usr/bin/bash -e
uuid_str=$(</proc/sys/kernel/random/uuid)
work_dir=/data/tmp/dell_tmp
cd "${work_dir}"
vim "${uuid_str}".run_mobivision_mkindex.sh
chmod +x "${uuid_str}".run_mobivision_mkindex.sh
log_file="${uuid_str}".run_mobivision_mkindex.log
pid_file="${uuid_str}".run_mobivision_mkindex.pid
nohup ./"${uuid_str}".run_mobivision_mkindex.sh &> "${log_file}" &
echo $! > "${pid_file}"
kill -0 "$(<"${pid_file}")" 2>/dev/null && echo running || echo finished
ps -p "$(<"${pid_file}")" >/dev/null && echo running || echo finished
#!/usr/bin/bash -e
mobivision_source_path=/data/softwares/mobivision-v4.1/source.sh
build_names=(
TupaiaBelangeriV3_HTNV76118
GRCm39GENCODEvM38_PRV531PlasmidV1
)
# Gzipped FASTA only
# Do not use colons (:) in any path
# Separate multiple paths for the same reference with colons
genomes=(
/data/database/genome/release/tupaia_belangeri/Tupaia_belangeri.v3.TreeShrewDB.dna.primary_assembly.fa.gz:/data/database/genome/release/hantaan_virus_strain_76-118/Hantaan_virus_strain_76-118.KT885047-9__1.NCBI.dna.primary_assembly.fa.gz
/data/database/genome/release/mus_musculus/Mus_musculus.GRCm39.gencode.dna.primary_assembly.fa.gz:/data/database/genome/release/prv531_plasmid/prv531_plasmid.v1.concatenated_curated_TRINITY_fragments.dna.primary_assembly.fa.gz
)
# Gzipped GTF only
# Do not use colons (:) in any path
# Separate multiple paths for the same reference with colons
annotations=(
/data/database/annotation/release/tupaia_belangeri/Tupaia_belangeri.v3.v3.TreeShrewDB.basic.chr.annotation.gtf.gz:/data/database/annotation/release/hantaan_virus_strain_76-118/Hantaan_virus_strain_76-118.KT885047-9__1.76118_yanglab.NCBI.basic.chr.annotation.gtf.gz
/data/database/annotation/release/mus_musculus/Mus_musculus.GRCm39.vM38.gencode.basic.chr.annotation.gtf.gz:/data/database/annotation/release/prv531_plasmid/prv531_plasmid.v1.v1.TRINITY.basic.chr.annotation.gtf.gz
)
output_dirs=(
/data/database/ref_idx/mobivision_v4.1/transcriptome
/data/database/ref_idx/mobivision_v4.1/transcriptome
)
root_work_dir=/data/tmp/dell_tmp
# shellcheck source=/dev/null
source ${mobivision_source_path}
(( ${#build_names[@]} == ${#genomes[@]} && ${#genomes[@]} == ${#annotations[@]} && ${#annotations[@]} == ${#output_dirs[@]} )) || {
echo "ERROR: array lengths are not equal" >&2
exit 1
}
echo "start running pipeline ..."
for((i=0;i<${#build_names[@]};i++)); do
uuid_str=$(</proc/sys/kernel/random/uuid)
work_dir="${root_work_dir}/${uuid_str}"
mkdir -p "${work_dir}"
cd "${work_dir}" || exit
build_name="${build_names[i]}"
IFS=: read -ra genome <<<"${genomes[i]}"
IFS=: read -ra annotation <<<"${annotations[i]}"
output_dir="${output_dirs[i]}"
log_file="${build_name}".run_mobivision_mkindex.log
merged_genome="${build_name}".fa
merge_genome_cmd="pigz -k -c -d ${genome[@]} > '${merged_genome}'"
echo "running command: ${merge_genome_cmd}" &> "${log_file}"
eval "${merge_genome_cmd}"
ret=$?
if (( ret != 0 )); then
echo "ERROR: running ${merge_genome_cmd} failed with exit code ${ret}" &>> "${log_file}"
else
echo "successfully running command: ${merge_genome_cmd}" &>> "${log_file}"
fi
seqnames=$(awk '/^>/ {print $1}' "${merged_genome}")
seqnames_num=$(wc -l <<<"${seqnames}")
seqnames_dedup_num=$(sort -u <<<"${seqnames}" | wc -l)
if [[ ${seqnames_num} -eq ${seqnames_dedup_num} ]]; then
echo "All chromosome names are unique" &>> "${log_file}"
else
echo "ERROR: some chromosome names are duplicated" &>> "${log_file}"
echo "Duplicate IDs:" &>> "${log_file}"
sort <<<"${seqnames}" | uniq -d &>> "${log_file}"
exit 1
fi
# Generally, avoid running `mobivision fmt_gtf` unless absolutely necessary
merged_annotation="${build_name}".rm_comments.gtf
merge_annotation_cmd="pigz -k -c -d ${annotation[@]} | grep -v -P '^#' > '${merged_annotation}'"
echo "running command: ${merge_annotation_cmd}" &>> "${log_file}"
eval "${merge_annotation_cmd}"
ret=$?
if (( ret != 0 )); then
echo "ERROR: running ${merge_annotation_cmd} failed with exit code ${ret}" &>> "${log_file}"
else
echo "successfully running command: ${merge_annotation_cmd}" &>> "${log_file}"
fi
mobivision_mkindex_cmd="mobivision mkindex -n '${build_name}' -f '${merged_genome}' -g '${merged_annotation}'"
echo "running command: ${mobivision_mkindex_cmd}" &>> "${log_file}"
eval "${mobivision_mkindex_cmd}" &>> "${log_file}"
ret=$?
if (( ret != 0 )); then
echo "ERROR: running ${mobivision_mkindex_cmd} failed with exit code ${ret}" &>> "${log_file}"
else
echo "successfully running command: ${mobivision_mkindex_cmd}" &>> "${log_file}"
fi
build_date=$(date +%Y%m%d_%H%M%S)
output_dir="${output_dir}/${build_date}"
mkdir -p "${output_dir}"
mv_cmd="mv ${build_name} ${output_dir}/"
echo "running command: ${mv_cmd}" &>> "${log_file}"
eval "${mv_cmd}"
ret=$?
if (( ret != 0 )); then
echo "ERROR: running ${mv_cmd} failed with exit code ${ret}" &>> "${log_file}"
else
echo "successfully running command: ${mv_cmd}" &>> "${log_file}"
fi
metadata_file="${output_dir}/${build_name}".metadata.json
echo "writing metadata to ${metadata_file}" &>> "${log_file}"
genomes_json=$(printf '%s\n' "${genome[@]}" | jq -R '.' | jq -s '.')
annotations_json=$(printf '%s\n' "${annotation[@]}" | jq -R '.' | jq -s '.')
cat <<EOF |
{
"build_name": "${build_name}",
"build_date": "${build_date}",
"software": "mobivision",
"software_version": "v4.1",
"library_type": "transcriptome",
"reference_type": "concatenate",
"genomes": ${genomes_json},
"annotations": ${annotations_json},
"cmd": ["${merge_genome_cmd}", "${merge_annotation_cmd}", "${mobivision_mkindex_cmd}", "${mv_cmd}"]
}
EOF
jq '.' > "${metadata_file}"
echo "successfully writing metadata to ${metadata_file}" &>> "${log_file}"
mv "${log_file}" "${output_dir}/"
cd ${root_work_dir} || exit
rm -rf "${work_dir}"
done
echo "running pipeline done!"
- Multiple genome assemblies and their corresponding annotation files (
multiplemode)
Map reads to multiple species genomes simultaneously.
#!/usr/bin/bash -e
uuid_str=$(</proc/sys/kernel/random/uuid)
work_dir=/data/tmp/dell_tmp
cd "${work_dir}"
vim "${uuid_str}".run_mobivision_mkindex.sh
chmod +x "${uuid_str}".run_mobivision_mkindex.sh
log_file="${uuid_str}".run_mobivision_mkindex.log
pid_file="${uuid_str}".run_mobivision_mkindex.pid
nohup ./"${uuid_str}".run_mobivision_mkindex.sh &> "${log_file}" &
echo $! > "${pid_file}"
kill -0 "$(<"${pid_file}")" 2>/dev/null && echo running || echo finished
ps -p "$(<"${pid_file}")" >/dev/null && echo running || echo finished
#!/usr/bin/bash -e
mobivision_source_path=/data/softwares/mobivision-v4.1/source.sh
# <genome build version>_<annotation build version>
# Do not use colons (:) in any name
# Separate multiple names for the same reference with colons
build_names=(
GRCh38_GENCODEv49:GRCm39_GENCODEvM38
)
# Gzipped FASTA only
# Do not use colons (:) in any path
# Separate multiple paths for the same reference with colons
genomes=(
/data/database/genome/release/homo_sapiens/Homo_sapiens.GRCh38.gencode.dna.primary_assembly.fa.gz:/data/database/genome/release/mus_musculus/Mus_musculus.GRCm39.gencode.dna.primary_assembly.fa.gz
)
# Gzipped GTF only
# Do not use colons (:) in any path
# Separate multiple paths for the same reference with colons
annotations=(
/data/database/annotation/release/homo_sapiens/Homo_sapiens.GRCh38.v49.gencode.basic.chr.annotation.gtf.gz:/data/database/annotation/release/mus_musculus/Mus_musculus.GRCm39.vM38.gencode.basic.chr.annotation.gtf.gz
)
output_dirs=(
/data/database/ref_idx/mobivision_v4.1/transcriptome
)
root_work_dir=/data/tmp/dell_tmp
# shellcheck source=/dev/null
source ${mobivision_source_path}
(( ${#build_names[@]} == ${#genomes[@]} && ${#genomes[@]} == ${#annotations[@]} && ${#annotations[@]} == ${#output_dirs[@]} )) || {
echo "ERROR: array lengths are not equal" >&2
exit 1
}
echo "start running pipeline ..."
for((i=0;i<${#build_names[@]};i++)); do
uuid_str=$(</proc/sys/kernel/random/uuid)
work_dir="${root_work_dir}/${uuid_str}"
mkdir -p "${work_dir}"
cd "${work_dir}" || exit
IFS=: read -ra build_name <<<"${build_names[i]}"
IFS=: read -ra genome <<<"${genomes[i]}"
IFS=: read -ra annotation <<<"${annotations[i]}"
output_dir="${output_dirs[i]}"
merged_build_name=$(printf '%s_and_' "${build_name[@]}" | sed 's/_and_$//')
log_file="${merged_build_name}".run_mobivision_mkindex.log
: > "${log_file}"
# Generally, avoid running `mobivision fmt_gtf` unless absolutely necessary
rm_comments_annotations=()
rm_comments_cmds=()
for anno in "${annotation[@]}"; do
tmp_anno=$(basename "${anno}" .gtf.gz).rm_comments.gtf
rm_comments_cmd="pigz -k -c -d '${anno}' | grep -v -P '^#' > '${tmp_anno}'"
echo "running command: ${rm_comments_cmd}" &>> "${log_file}"
eval "${rm_comments_cmd}"
ret=$?
if (( ret != 0 )); then
echo "ERROR: running ${rm_comments_cmd} failed with exit code ${ret}" &>> "${log_file}"
else
echo "successfully running command: ${rm_comments_cmd}" &>> "${log_file}"
fi
rm_comments_annotations+=("${tmp_anno}")
rm_comments_cmds+=("${rm_comments_cmd}")
done
args=()
for ((j=0; j<${#genome[@]}; j++)); do
args+=(-n "${build_name[j]}" -f "${genome[j]}" -g "${rm_comments_annotations[j]}")
done
mobivision_mkindex_cmd="mobivision mkindex ${args[@]}"
echo "running command: ${mobivision_mkindex_cmd}" &>> "${log_file}"
eval "${mobivision_mkindex_cmd}" &>> "${log_file}"
ret=$?
if (( ret != 0 )); then
echo "ERROR: running ${mobivision_mkindex_cmd} failed with exit code ${ret}" &>> "${log_file}"
else
echo "successfully running command: ${mobivision_mkindex_cmd}" &>> "${log_file}"
fi
build_date=$(date +%Y%m%d_%H%M%S)
output_dir="${output_dir}/${build_date}"
mkdir -p "${output_dir}"
mv_cmd="mv ${merged_build_name} ${output_dir}/"
echo "running command: ${mv_cmd}" &>> "${log_file}"
eval "${mv_cmd}"
ret=$?
if (( ret != 0 )); then
echo "ERROR: running ${mv_cmd} failed with exit code ${ret}" &>> "${log_file}"
else
echo "successfully running command: ${mv_cmd}" &>> "${log_file}"
fi
metadata_file="${output_dir}/${merged_build_name}".metadata.json
echo "writing metadata to ${metadata_file}" &>> "${log_file}"
build_names_json=$(printf '%s\n' "${build_name[@]}" | jq -R '.' | jq -s '.')
genomes_json=$(printf '%s\n' "${genome[@]}" | jq -R '.' | jq -s '.')
annotations_json=$(printf '%s\n' "${annotation[@]}" | jq -R '.' | jq -s '.')
rm_comments_cmds_json=$(printf '"%s", ' "${rm_comments_cmds[@]}" | sed 's/, $//')
cat <<EOF |
{
"build_name": "${merged_build_name}",
"build_names": ${build_names_json},
"build_date": "${build_date}",
"software": "mobivision",
"software_version": "v4.1",
"library_type": "transcriptome",
"reference_type": "multiple",
"genomes": ${genomes_json},
"annotations": ${annotations_json},
"cmd": [${rm_comments_cmds_json}, "${mobivision_mkindex_cmd}", "${mv_cmd}"]
}
EOF
jq '.' > "${metadata_file}"
echo "successfully writing metadata to ${metadata_file}" &>> "${log_file}"
mv "${log_file}" "${output_dir}/"
cd ${root_work_dir} || exit
rm -rf "${work_dir}"
done
echo "running pipeline done!"
1.1.2 Quantify transcriptome
#!/usr/bin/bash -e
uuid_str=$(</proc/sys/kernel/random/uuid)
work_dir=/data/tmp/dell_tmp
cd "${work_dir}"
vim "${uuid_str}".run_mobivision_quantify.sh
chmod +x "${uuid_str}".run_mobivision_quantify.sh
log_file="${uuid_str}".run_mobivision_quantify.log
pid_file="${uuid_str}".run_mobivision_quantify.pid
nohup ./"${uuid_str}".run_mobivision_quantify.sh &> "${log_file}" &
echo $! > "${pid_file}"
kill -0 "$(<"${pid_file}")" 2>/dev/null && echo running || echo finished
ps -p "$(<"${pid_file}")" >/dev/null && echo running || echo finished
#!/usr/bin/bash -e
mobivision_source_path=/data/softwares/mobivision-v4.1/source.sh
# Each directory must contain data from only one biological sample
# Multiple read pair files within a single directory will be automatically merged before pipeline execution
# Directory names serve as unique sample identifiers
# Ensure all directory basenames are globally unique across your entire project to prevent sample ID conflicts
fq_dirs=(
/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/raw_fq/JZ25176043-250818-Astro
/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/raw_fq/JZ25176044-250818-Glioma
/data/users/yangrui/lab_projs/wuyuechun/TbelangeriKidneyOrganoid_HTNV76118_snRNA-seq_v20250921/raw_fq/JZ25142338-250707-NC
/data/users/yangrui/lab_projs/wuyuechun/TbelangeriKidneyOrganoid_HTNV76118_snRNA-seq_v20250921/raw_fq/JZ25142339-250707-PC
)
ref_idx_dirs=(
/data/database/ref_idx/mobivision_v4.1/transcriptome/20250927_000227/GRCm39_GENCODEvM38
/data/database/ref_idx/mobivision_v4.1/transcriptome/20250927_000227/GRCm39_GENCODEvM38
/data/database/ref_idx/mobivision_v4.1/transcriptome/20250927_125947/TupaiaBelangeriV3_HTNV76118
/data/database/ref_idx/mobivision_v4.1/transcriptome/20250927_125947/TupaiaBelangeriV3_HTNV76118
)
output_dirs=(
/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res
/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res
/data/users/yangrui/lab_projs/wuyuechun/TbelangeriKidneyOrganoid_HTNV76118_snRNA-seq_v20250921/quant_res
/data/users/yangrui/lab_projs/wuyuechun/TbelangeriKidneyOrganoid_HTNV76118_snRNA-seq_v20250921/quant_res
)
nthreads=100
root_work_dir=/data/tmp/dell_tmp
# shellcheck source=/dev/null
source ${mobivision_source_path}
(( ${#fq_dirs[@]} == ${#ref_idx_dirs[@]} && ${#ref_idx_dirs[@]} == ${#output_dirs[@]} )) || {
echo "ERROR: array lengths are not equal" >&2
exit 1
}
echo "start running pipeline ..."
for ((i=0; i<${#fq_dirs[@]}; i++)); do
uuid_str=$(</proc/sys/kernel/random/uuid)
work_dir="${root_work_dir}/${uuid_str}"
mkdir -p "${work_dir}"
cd "${work_dir}" || exit
fq_dir="${fq_dirs[i]}"
ref_idx_dir="${ref_idx_dirs[i]}"
output_dir="${output_dirs[i]}"
run_date=$(date +%Y%m%d_%H%M%S)
fq_dir_basename=$(basename "${fq_dir}")
sub_dir="${fq_dir_basename}_${run_date}"
output_dir="${output_dir}/${sub_dir}"
mkdir -p "${output_dir}"
log_file="${output_dir}/${sub_dir}".run_mobivision_quantify.log
mobivision_quantify_cmd="mobivision quantify -i '${ref_idx_dir}' -t '${nthreads}' -f '${fq_dir}' -s '${fq_dir_basename}' -o '${output_dir}'"
echo "running command: ${mobivision_quantify_cmd}" &> "${log_file}"
eval "${mobivision_quantify_cmd}" &>> "${log_file}"
ret=$?
if (( ret != 0 )); then
echo "ERROR: running ${mobivision_quantify_cmd} failed with exit code ${ret}" &>> "${log_file}"
else
echo "successfully running command: ${mobivision_quantify_cmd}" &>> "${log_file}"
fi
metadata_file="${output_dir}/${sub_dir}".metadata.json
echo "writing metadata to ${metadata_file}" &>> "${log_file}"
fastqs=$(find "${fq_dir}" -maxdepth 1 -type f -name "*.fastq.gz" | jq -R '.' | jq -s '.')
cat <<EOF |
{
"quantify_name": "${sub_dir}",
"quantify_date": "${run_date}",
"software": "mobivision",
"software_version": "v4.1",
"library_type": "transcriptome",
"fastqs": ${fastqs},
"reference_index": "${ref_idx_dir}",
"cmd": ["${mobivision_quantify_cmd}"]
}
EOF
jq '.' > "${metadata_file}"
echo "successfully writing metadata to ${metadata_file}" &>> "${log_file}"
cd ${root_work_dir} || exit
rm -rf "${work_dir}"
done
echo "running pipeline done!"
1.2 Epigenome
1.2.1 Build epigenomic reference genome index
- One genome file and one annotation file (
singlemode)
#!/usr/bin/bash -e
uuid_str=$(</proc/sys/kernel/random/uuid)
work_dir=/data/tmp/dell_tmp
cd "${work_dir}"
vim "${uuid_str}".run_mobivision_mk_chip_ref.sh
chmod +x "${uuid_str}".run_mobivision_mk_chip_ref.sh
log_file="${uuid_str}".run_mobivision_mk_chip_ref.log
pid_file="${uuid_str}".run_mobivision_mk_chip_ref.pid
nohup ./"${uuid_str}".run_mobivision_mk_chip_ref.sh &> "${log_file}" &
echo $! > "${pid_file}"
kill -0 "$(<"${pid_file}")" 2>/dev/null && echo running || echo finished
ps -p "$(<"${pid_file}")" >/dev/null && echo running || echo finished
#!/usr/bin/bash -e
mobivision_source_path=/data/softwares/mobivision-v4.1/source.sh
# <genome build version>_<annotation build version>
build_names=(
PetaurusBrevicepsV1_PetaurusBrevicepsV3
)
# Gzipped FASTA only
genomes=(
/data/database/genome/release/petaurus_breviceps/Petaurus_breviceps.v1.self.dna_sm.primary_assembly.fa.gz
)
# Gzipped GTF only
annotations=(
/data/database/annotation/release/petaurus_breviceps/Petaurus_breviceps.v1.v3.KIZ_NCBI.basic.chr.annotation.gtf.gz
)
output_dirs=(
/data/database/ref_idx/mobivision_v4.1/epigenome
)
# true
# any other values will be considered as false
check_gtf=true
root_work_dir=/data/tmp/dell_tmp
# shellcheck source=/dev/null
source ${mobivision_source_path}
(( ${#build_names[@]} == ${#genomes[@]} && ${#genomes[@]} == ${#annotations[@]} && ${#annotations[@]} == ${#output_dirs[@]} )) || {
echo "ERROR: array lengths are not equal" >&2
exit 1
}
echo "start running pipeline ..."
for((i=0;i<${#build_names[@]};i++)); do
uuid_str=$(</proc/sys/kernel/random/uuid)
work_dir="${root_work_dir}/${uuid_str}"
mkdir -p "${work_dir}"
cd "${work_dir}" || exit
build_name="${build_names[i]}"
genome="${genomes[i]}"
annotation="${annotations[i]}"
output_dir="${output_dirs[i]}"
log_file="${build_name}".run_mobivision_mkindex.log
# Generally, avoid running `mobivision fmt_gtf` unless absolutely necessary
tmp_annotation_fixed=$(basename "${annotation}" .gtf.gz).fixed.gtf
tmp_annotation=$(basename "${annotation}" .gtf.gz).rm_comments.gtf
pigz -k -c -d "${annotation}" > "${tmp_annotation_fixed}"
if [[ "${check_gtf}" == "true" ]]; then
/data/softwares/misc_r_scripts/check_fix_gtf_for_cellranger.R "${tmp_annotation_fixed}"
fi
rm_comments_cmd="grep -v -P '^#' '${tmp_annotation_fixed}' > '${tmp_annotation}'"
echo "running command: ${rm_comments_cmd}" &> "${log_file}"
eval "${rm_comments_cmd}"
ret=$?
if (( ret != 0 )); then
echo "ERROR: running ${rm_comments_cmd} failed with exit code ${ret}" &>> "${log_file}"
else
echo "successfully running command: ${rm_comments_cmd}" &>> "${log_file}"
fi
mobivision_mk_chip_ref_cmd="mobivision mk_chip_ref -n '${build_name}' -f '${genome}' -g '${tmp_annotation}' -r '${work_dir}'"
echo "running command: ${mobivision_mk_chip_ref_cmd}" &>> "${log_file}"
eval "${mobivision_mk_chip_ref_cmd}" &>> "${log_file}"
ret=$?
if (( ret != 0 )); then
echo "ERROR: running ${mobivision_mk_chip_ref_cmd} failed with exit code ${ret}" &>> "${log_file}"
else
echo "successfully running command: ${mobivision_mk_chip_ref_cmd}" &>> "${log_file}"
fi
build_date=$(date +%Y%m%d_%H%M%S)
output_dir="${output_dir}/${build_date}"
mkdir -p "${output_dir}"
mv_cmd="mv ${build_name} ${output_dir}/"
echo "running command: ${mv_cmd}" &>> "${log_file}"
eval "${mv_cmd}"
ret=$?
if (( ret != 0 )); then
echo "ERROR: running ${mv_cmd} failed with exit code ${ret}" &>> "${log_file}"
else
echo "successfully running command: ${mv_cmd}" &>> "${log_file}"
fi
metadata_file="${output_dir}/${build_name}".metadata.json
echo "writing metadata to ${metadata_file}" &>> "${log_file}"
cat <<EOF |
{
"build_name": "${build_name}",
"build_date": "${build_date}",
"software": "mobivision",
"software_version": "v4.1",
"library_type": "epigenome",
"reference_type": "single",
"genomes": ["${genome}"],
"annotations": ["${annotation}"],
"cmd": ["${rm_comments_cmd}", "${mobivision_mk_chip_ref_cmd}", "${mv_cmd}"]
}
EOF
jq '.' > "${metadata_file}"
echo "successfully writing metadata to ${metadata_file}" &>> "${log_file}"
mv "${log_file}" "${output_dir}/"
cd ${root_work_dir} || exit
rm -rf "${work_dir}"
done
echo "running pipeline done!"
1.2.2 Align epigenome
#!/usr/bin/bash -e
uuid_str=$(</proc/sys/kernel/random/uuid)
work_dir=/data/tmp/dell_tmp
cd "${work_dir}"
vim "${uuid_str}".run_mobivision_atac.sh
chmod +x "${uuid_str}".run_mobivision_atac.sh
log_file="${uuid_str}".run_mobivision_atac.log
pid_file="${uuid_str}".run_mobivision_atac.pid
nohup ./"${uuid_str}".run_mobivision_atac.sh &> "${log_file}" &
echo $! > "${pid_file}"
kill -0 "$(<"${pid_file}")" 2>/dev/null && echo running || echo finished
ps -p "$(<"${pid_file}")" >/dev/null && echo running || echo finished
#!/usr/bin/bash -e
mobivision_source_path=/data/softwares/mobivision-v4.1/source.sh
# Each directory must contain data from only one biological sample
# Multiple read pair files within a single directory will be automatically merged before pipeline execution
# Directory names serve as unique sample identifiers
# Ensure all directory basenames are globally unique across your entire project to prevent sample ID conflicts
fq_dirs=(
/data/database/data/raw/fastq/20250923_163558/snATAC-seq/Sample_JZ25133309-250714A-B-WYL-ZN-250714A-B-WYL-ZN
/data/database/data/raw/fastq/20250923_163558/snATAC-seq/Sample_JZ25133310-250714A-B-WYL-WZT-250714A-B-WYL-WZT
/data/database/data/raw/fastq/20250923_163558/snATAC-seq/Sample_JZ25133311-250714A-B-WYL-PC-250714A-B-WYL-PC
/data/database/data/raw/fastq/20250923_163558/snATAC-seq/Sample_JZ25133312-250714A-B-LYY-HM-250714A-B-LYY-HM
/data/database/data/raw/fastq/20250923_163558/snATAC-seq/Sample_JZ25133313-250714A-B-LYY-XQN-250714A-B-LYY-XQN
/data/database/data/raw/fastq/20250923_163558/snATAC-seq/Sample_JZ25133315-250714A-B-LYY-HN-250714A-B-LYY-HN
/data/database/data/raw/fastq/20250923_163558/snATAC-seq/Sample_JZ25133314-250714A-B-LYY-YP-250714A-B-LYY-YP
)
ref_idx_dirs=(
/data/database/ref_idx/mobivision_v4.1/epigenome/20250927_175726/GRCm39_GENCODEvM38
/data/database/ref_idx/mobivision_v4.1/epigenome/20250927_175726/GRCm39_GENCODEvM38
/data/database/ref_idx/mobivision_v4.1/epigenome/20250927_175726/GRCm39_GENCODEvM38
/data/database/ref_idx/mobivision_v4.1/epigenome/20250927_175726/GRCm39_GENCODEvM38
/data/database/ref_idx/mobivision_v4.1/epigenome/20250927_175726/GRCm39_GENCODEvM38
/data/database/ref_idx/mobivision_v4.1/epigenome/20250927_175726/GRCm39_GENCODEvM38
/data/database/ref_idx/mobivision_v4.1/epigenome/20250927_190526/PetaurusBrevicepsV1_PetaurusBrevicepsV1
)
output_dirs=(
/data/database/data/processed/epigenome/snATAC-seq/align_results/20250928_152816
/data/database/data/processed/epigenome/snATAC-seq/align_results/20250928_152816
/data/database/data/processed/epigenome/snATAC-seq/align_results/20250928_152816
/data/database/data/processed/epigenome/snATAC-seq/align_results/20250928_152816
/data/database/data/processed/epigenome/snATAC-seq/align_results/20250928_152816
/data/database/data/processed/epigenome/snATAC-seq/align_results/20250928_152816
/data/database/data/processed/epigenome/snATAC-seq/align_results/20250928_152816
)
# Can be either 'broad' or 'narrow'
peak_type=narrow
nthreads=100
root_work_dir=/data/tmp/dell_tmp
# shellcheck source=/dev/null
source ${mobivision_source_path}
(( ${#fq_dirs[@]} == ${#ref_idx_dirs[@]} && ${#ref_idx_dirs[@]} == ${#output_dirs[@]} )) || {
echo "ERROR: array lengths are not equal" >&2
exit 1
}
echo "start running pipeline ..."
for ((i=0; i<${#fq_dirs[@]}; i++)); do
uuid_str=$(</proc/sys/kernel/random/uuid)
work_dir="${root_work_dir}/${uuid_str}"
mkdir -p "${work_dir}"
cd "${work_dir}" || exit
fq_dir="${fq_dirs[i]}"
ref_idx_dir="${ref_idx_dirs[i]}"
output_dir="${output_dirs[i]}"
run_date=$(date +%Y%m%d_%H%M%S)
fq_dir_basename=$(basename "${fq_dir}")
sub_dir="${fq_dir_basename}_${peak_type}_${run_date}"
output_dir="${output_dir}/${sub_dir}"
mkdir -p "${output_dir}"
log_file="${output_dir}/${sub_dir}".run_mobivision_quantify.log
mobivision_atac_cmd="mobivision atac --peaktype '${peak_type}' -i '${ref_idx_dir}' -t '${nthreads}' -f '${fq_dir}' -o '${output_dir}'"
echo "running command: ${mobivision_atac_cmd}" &> "${log_file}"
eval "${mobivision_atac_cmd}" &>> "${log_file}"
ret=$?
if (( ret != 0 )); then
echo "ERROR: running ${mobivision_atac_cmd} failed with exit code ${ret}" &>> "${log_file}"
else
echo "successfully running command: ${mobivision_atac_cmd}" &>> "${log_file}"
fi
metadata_file="${output_dir}/${sub_dir}".metadata.json
echo "writing metadata to ${metadata_file}" &>> "${log_file}"
fastqs=$(find "${fq_dir}" -maxdepth 1 -type f -name "*.fastq.gz" | jq -R '.' | jq -s '.')
cat <<EOF |
{
"quantify_name": "${sub_dir}",
"quantify_date": "${run_date}",
"software": "mobivision",
"software_version": "v4.1",
"library_type": "epigenome",
"fastqs": ${fastqs},
"reference_index": "${ref_idx_dir}",
"cmd": ["${mobivision_atac_cmd}"]
}
EOF
jq '.' > "${metadata_file}"
echo "successfully writing metadata to ${metadata_file}" &>> "${log_file}"
cd ${root_work_dir} || exit
rm -rf "${work_dir}"
done
echo "running pipeline done!"
2 CellRanger
2.1 Transcriptome
2.1.1 Build transcriptomic reference genome index
# !/usr/local/bin/Rscript
# check_fix_gtf_for_cellranger.R
args <- commandArgs(trailingOnly = TRUE)
if (length(args) != 1 || !file.exists(args[1])) {
message("check_fix_gtf_for_cellranger.R example.gtf\n\nImportant: the input file i.e., example.gtf here will be overwritten, so pass a copy of your original file.")
quit(save = "no", status = 1)
} else {
message("dealing with ", args[1])
}
suppressMessages(library(rtracklayer))
suppressMessages(library(tidyverse))
gene_biotype_fields <- c("gene_type", "gene_biotype", "biotype")
gene_biotype_default <- "protein_coding"
keep_cols <- c(
"source", "type", "score", "phase",
"gene_id", "transcript_id", "gene_name", "gene_biotype",
"transcript_biotype", "exon_number", "protein_id"
)
gtf_file <- args[1]
gtf <- import(gtf_file, format = "gtf")
df <- as_tibble(as.data.frame(gtf))
exon_df <- filter(df, type == "exon")
if (nrow(exon_df) == 0 || !("gene_id" %in% names(exon_df)) || !("transcript_id" %in% names(exon_df)) || any(is.na(exon_df$gene_id)) || any(is.na(exon_df$transcript_id)) || any(exon_df$gene_id == "") || any(exon_df$transcript_id == "")) {
stop("some exons don't have 'gene_id' and/or 'transcript_id' attributes")
}
gtf$gene_id <- trimws(gtf$gene_id)
if (any(gtf$gene_id == "") || any(grepl("[ \t\r\n\"]", gtf$gene_id))) {
stop("some gene_id values are empty")
}
gene_biotype_field <- gene_biotype_fields[which(gene_biotype_fields %in% names(df))][1]
if (!is.na(gene_biotype_field)) {
gene_id_biotype_df <- df %>%
mutate(gene_biotype = .data[[gene_biotype_field]]) %>%
select(gene_id, gene_biotype) %>%
na.omit() %>%
distinct()
gtf$gene_biotype <- gene_id_biotype_df$gene_biotype[match(gtf$gene_id, gene_id_biotype_df$gene_id)]
gtf$gene_biotype[is.na(gtf$gene_biotype)] <- gene_biotype_default
} else {
gtf$gene_biotype <- gene_biotype_default
}
if ("gene_name" %in% names(mcols(gtf))) {
gtf$gene_name[is.na(gtf$gene_name)] <- gtf$gene_id[is.na(gtf$gene_name)]
} else {
gtf$gene_name <- gtf$gene_id
}
gtf$gene_name[grepl("[ \t\r\n\"]", gtf$gene_name)] <- gtf$gene_id[grepl("[ \t\r\n\"]", gtf$gene_name)]
final_cols <- intersect(keep_cols, names(mcols(gtf)))
mcols(gtf) <- mcols(gtf)[, final_cols]
export(gtf, con = gtf_file, format = "gtf")
message("dealing with ", gtf_file, " done!")
#!/usr/bin/bash -e
annotation=/data/database/annotation/release/petaurus_breviceps/Petaurus_breviceps.v1.v3.KIZ_NCBI.basic.chr.annotation.gtf.gz
genome=/data/database/genome/release/petaurus_breviceps/Petaurus_breviceps.v1.self.dna_sm.primary_assembly.fa.gz
build_name=PetaurusBrevicepsV1_PetaurusBrevicepsV3
nthreads=120
memgb=1024
# true
# any other values will be considered as false
check_gtf=true
work_dir=/data/tmp/dell_tmp
cd "${work_dir}"
tmp_genome=genome.fa
pigz -k -c -d "${genome}" > "${tmp_genome}"
tmp_annotation=annotation.gtf
pigz -k -c -d "${annotation}" > "${tmp_annotation}"
if [[ "${check_gtf}" == "true" ]]; then
/data/softwares/misc_r_scripts/check_fix_gtf_for_cellranger.R "${tmp_annotation}"
fi
cellranger mkref --genome "${build_name}" --fasta "${tmp_genome}" \
--genes "${tmp_annotation}" --nthreads "${nthreads}" --memgb "${memgb}" \
--localcores "${nthreads}" --localmem "${memgb}"
metadata_file="${build_name}".metadata.json
cat <<EOF |
{
"build_name": "${build_name}",
"check_gtf": "${check_gtf}",
"genomes": ["${genome}"],
"annotations": ["${annotation}"]
}
EOF
jq '.' > "${metadata_file}"
rm -f "${tmp_genome}" "${tmp_annotation}"
2.1.2 Quantify transcriptome for a single sample
对于单细胞转录组,应根据你的实验设计、建库与测序方案决定在哪一个阶段合并样本,例如:对于同一样本、同一库、不同测序芯片多次测序,应在比对定量之前合并 FASTQ 文件;而对于同一样本、不同库、同一或不同测序芯片,应在比对定量后进行数据整合(以库为最小整合单位,即 technical replicates)。
#!/usr/bin/bash -e
ulimit -Sn 16384
ref_idx=/data/database/ref_idx/cellranger_v8.0.1/transcriptome/20251110_124502/PetaurusBrevicepsV1_PetaurusBrevicepsV3
fastqs_path=/data/database/data/raw/fastq/20250923_163558/snRNA-seq/Sample_JZ25136889-250710-YP-250710-YP/mobiTo10x/tenx
id=Sample_JZ25136889-250710-YP-250710-YP
sample=YP
nthreads=120
memgb=1024
work_dir=/data/tmp/dell_tmp
cd "${work_dir}"
log_file="${id}".run_10x_cellranger_count.log
cellranger count --id="${id}" --transcriptome="${ref_idx}" \
--fastqs="${fastqs_path}" --sample="${sample}" \
--create-bam=true --localcores="${nthreads}" \
--localmem="${memgb}" &>> "${log_file}"
metadata_file="${id}".metadata.json
fastqs=$(find "${fastqs_path}" -maxdepth 1 -type f -name "*.fastq.gz" | jq -R '.' | jq -s '.')
cat <<EOF |
{
"software": "cellranger",
"software_version": "v8.0.1",
"library_type": "transcriptome",
"fastqs": ${fastqs},
"reference_index": "${ref_idx}"
}
EOF
jq '.' > "${metadata_file}"
2.1.3 Quantify transcriptome in batch
#!/usr/bin/bash -e
ulimit -Sn 16384
ref_idx=/data/database/ref_idx/cellranger_v8.0.1/transcriptome/20251206_104658/MustelaPutoriusFuroV1_MustelaPutoriusFuroV1
fastqs_path=/data/tmp/dell_tmp/ferret_10x_fastq
sample_file=/data/users/dell/temp/sample.txt
nthreads=120
memgb=1024
work_dir=/data/tmp/dell_tmp
cd "${work_dir}"
while IFS= read -r sample
do
echo "processing ${sample} ..."
log_file="${sample}".run_10x_cellranger_count.log
cellranger count --id="${sample}" --transcriptome="${ref_idx}" \
--fastqs="${fastqs_path}" --sample="${sample}" \
--create-bam=true --localcores="${nthreads}" \
--localmem="${memgb}" &>> "${log_file}"
metadata_file="${sample}".metadata.json
fastqs=$(find "${fastqs_path}" -maxdepth 1 -type f -name "${sample}*.fastq.gz" | jq -R '.' | jq -s '.')
cellranger_version=$(cellranger --version)
cat <<EOF |
{
"software": "cellranger",
"software_version": "${cellranger_version}",
"library_type": "transcriptome",
"fastqs": ${fastqs},
"reference_index": "${ref_idx}"
}
EOF
jq '.' > "${metadata_file}"
echo "processing ${sample} done!"
done < "${sample_file}"
2.2 Epigenome
2.2.1 Build epigenomic reference genome index
2.2.2 Align epigenome
2.3 Multiome
2.3.1 Build multiomic reference genome index
2.3.2 Quantify and align multiome
3 Misc
3.1 Convert Mobi 3’ transcriptome reads to 10X-compatible format
/data/softwares/mobiTo10x/convert_3RNA_mobi_to_tenx -r1 JZ25136889-250710-YP-250710-YP_combined_R1.fastq.gz -r2 JZ25136889-250710-YP-250710-YP_combined_R2.fastq.gz -o mobiTo10x -p YP -l /data/softwares/mobiTo10x/3M-february-2018.txt
3.2 Judge read types (R1/R2) based on 10X barcode inclusion list
Only support two read files (i.e., R1 & R2).
If your read files include I1, I2, etc. you should identify and remove them first.
3.3 Examples to rename FASTQ files downloaded from NCBI GEO
library(tidyverse)
library(vroom)
work_dir <- "/data/users/dell/temp"
setwd(work_dir)
file_process_meta_file <- "file_processing_metadata.tsv"
sra_run_table_file <- "SraRunTable.csv"
srr_acc_list_file <- "SRR_Acc_List.txt"
barcodes_eval_file <- "barcodes_eval.tsv"
barcodes_type_cols <- c("sc_3p_v2", "sc_3p_v3")
barcodes_pct_th <- 90
output_dir <- "/data/tmp/dell_tmp/tmp_downloads"
file_process_meta_df <- vroom(file_process_meta_file)
sra_run_table_df <- vroom(sra_run_table_file)
srr_acc_list_df <- vroom(srr_acc_list_file, col_names = "Run", delim = "\t")
if (all(srr_acc_list_df$Run %in% sra_run_table_df$Run) && all(sra_run_table_df$Run %in% srr_acc_list_df$Run)) {
message("all runs are valid: OK")
} else {
warning("all runs are valid: NO")
}
file_process_meta_df <- file_process_meta_df %>%
select(geo_gsm_acc, stage, tissue, layer, bio_rep, sub_dir) %>%
rename(area = tissue, bio_sample = sub_dir) %>%
distinct()
barcodes_eval_df <- vroom(barcodes_eval_file)
barcodes_eval_df <- barcodes_eval_df %>%
pivot_longer(cols = all_of(barcodes_type_cols), names_to = "barcodes_type", values_to = "barcodes_num") %>%
mutate(
barcodes_pct = barcodes_num / total_count * 100,
Run = str_extract(basename(fq_file), "^SRR[0-9]+")
)
best_barcodes_type_df <- barcodes_eval_df %>%
group_by(Run) %>%
slice_max(order_by = barcodes_pct, n = 1) %>%
slice_sample(n = 1) %>%
mutate(barcodes_type_validity = if_else(barcodes_pct > barcodes_pct_th, TRUE, FALSE)) %>%
select(barcodes_type, Run, barcodes_type_validity) %>%
distinct()
suspicious_runs <- best_barcodes_type_df$Run[!best_barcodes_type_df$barcodes_type_validity]
warning("these runs are suspicious: ", paste0(suspicious_runs, collapse = " "))
filter(barcodes_eval_df, Run %in% suspicious_runs) %>%
arrange(Run, fq_file, desc(barcodes_pct)) %>%
as.data.frame()
barcodes_eval_df <- inner_join(barcodes_eval_df, best_barcodes_type_df, by = c("Run", "barcodes_type")) %>%
arrange(Run, fq_file, desc(barcodes_pct))
barcodes_eval_df <- lapply(split(barcodes_eval_df, barcodes_eval_df$Run), function(x) {
if (nrow(x) != 2) {
stop("only two files supported")
}
x %>%
arrange(desc(barcodes_pct)) %>%
mutate(read_type = seq_len(n()))
}) %>% bind_rows()
keep_cols <- c(
"Run", "Developmental_Stage", "Region", "germinal_zone",
"litter", "tissue", "Sample Name", "Assay Type",
"BioProject", "BioSample", "Experiment",
"Instrument", "Library Name", "LibraryLayout",
"LibrarySelection", "LibrarySource", "Organism", "Platform",
"sequencing_flowcell", "source_name", "SRA Study"
)
df <- sra_run_table_df %>%
select(all_of(keep_cols)) %>%
inner_join(file_process_meta_df, by = c("Sample Name" = "geo_gsm_acc")) %>%
arrange(stage, area, layer, bio_rep) %>%
group_by(bio_sample) %>%
mutate(
tech_rep = seq_len(n()),
tech_sample = paste0(bio_sample, "_part", tech_rep)
) %>%
ungroup() %>%
inner_join(barcodes_eval_df, by = "Run") %>%
mutate(
new_fq_file_basename = paste0(bio_sample, "_S1_L00", tech_rep, "_R", read_type, "_001.fastq"),
new_fq_file = file.path(output_dir, new_fq_file_basename)
) %>%
arrange(Run, new_fq_file)
vroom_write(df, file = "metadata.tsv")
ok <- file.rename(df$fq_file, df$new_fq_file)
if (all(ok)) {
message("all are well done")
} else {
stop("some tasks were failed")
}
ok <- sapply(df$new_fq_file, function(x) system2(command = "pigz", args = x))
if (all(ok == 0)) {
message("all are well done")
} else {
stop("some tasks were failed")
}