library(tidyverse)
library(vroom)
#' Guess the compression format of files from their leading magic bytes.
#'
#' Only the first bytes of each file are read; the file extension is ignored.
#'
#' @param files Character vector of paths to existing files.
#' @return An unnamed character vector with one element per input file, each
#'   one of "gz", "bz2", "xz", "zip", "zst" or "unknown". Zero-length input
#'   yields `character(0)`.
#' @details Stops with an error if a path does not exist.
guess_compression_type <- function(files) {
  # Magic-byte signatures, checked in this order.
  signatures <- list(
    gz = as.raw(c(0x1f, 0x8b)),
    bz2 = as.raw(c(0x42, 0x5a, 0x68)),
    xz = as.raw(c(0xfd, 0x37, 0x7a, 0x58, 0x5a, 0x00)),
    zip = as.raw(c(0x50, 0x4b, 0x03, 0x04)),
    zst = as.raw(c(0x28, 0xb5, 0x2f, 0xfd))
  )

  # vapply() guarantees a character vector even when `files` is empty.
  vapply(files, function(f) {
    if (!file.exists(f)) stop("file does not exist: ", f)
    con <- file(f, open = "rb")
    on.exit(close(con), add = TRUE)
    f_header <- readBin(con, what = "raw", n = 32)

    for (type in names(signatures)) {
      magic <- signatures[[type]]
      # Require enough bytes first: indexing a raw vector past its end pads
      # with 00, which would let a truncated header match a signature that
      # ends in 00.
      if (length(f_header) >= length(magic) &&
        identical(f_header[seq_along(magic)], magic)) {
        return(type)
      }
    }
    "unknown"
  }, character(1), USE.NAMES = FALSE)
}
# ---- Configuration and inputs ----

# CSV template whose header row defines the metadata columns.
metadata_template_file <- "/home/dell/YRArchive/NeuroBorder/Blogs/Bioinformatics/posts/Misc/metadata_template/data/raw_omics-file-level_metadata_template.v1.csv"
# Directory that is scanned recursively for data files. The metadata CSVs
# produced at the end of this script are written into this directory as well.
root_dir <- "/data/database/data/raw/fastq/20250923_164603/snRNA-seq"
# Checksum side-car files (*.md5, *.md5.check) are not data files.
exclude_file_pattern <- ".+\\.md5(\\.check)?$"
# Metadata CSVs written by earlier runs of this script
# (*.auto_maintained.<timestamp>.csv, *.manually_curated.<timestamp>.csv).
# They live in root_dir but have no checksum files, so listing them as data
# files would abort every rerun.
generated_file_pattern <-
  "\\.(auto_maintained|manually_curated)\\.[0-9]{8}_[0-9]{6}\\.csv$"

files <- list.files(root_dir, full.names = TRUE, recursive = TRUE)
files <- files[
  !grepl(exclude_file_pattern, files) &
    !grepl(generated_file_pattern, files)
]

# Read the template with every column as character, discard any rows it
# contains, then create one all-NA row per data file.
metadata_template_df <- vroom(metadata_template_file,
  delim = ",", col_names = TRUE,
  col_types = cols(.default = col_character())
)
metadata_template_df <- metadata_template_df[0, ]
metadata_template_df <- metadata_template_df[rep(NA_integer_, length(files)), ]
# Build the file-level metadata table: one row per entry of `files`, filling
# the template columns that can be derived from the file system and from the
# checksum side-car files. Any failed lookup aborts the run via stop().
df <- metadata_template_df %>%
  mutate(
    path = files,
    directory = dirname(path),
    first_level_parent_directory = basename(dirname(path)),
    file = basename(path),
    file_size = as.character(file.size(path)),
    compression_type = guess_compression_type(path),
    # Extension that remains after dropping a trailing ".<compression_type>".
    file_type = map2_chr(file, compression_type, function(f, ct) {
      tools::file_ext(sub(paste0("\\.", ct, "$"), "", f))
    }),
    # Locate "<file>.<hash type>.check" next to the data file. The file name
    # is compared literally as a prefix rather than pasted into a regex, so
    # "." in the name is not a wildcard and a longer sibling name that merely
    # ends with this file name cannot match.
    hash_checksum_checking_file = map2_chr(directory, file, function(d, f) {
      siblings <- list.files(d, full.names = FALSE, recursive = FALSE)
      prefix <- paste0(f, ".")
      remainder <- substring(siblings, nchar(prefix) + 1L)
      is_check <- startsWith(siblings, prefix) &
        grepl("^[a-zA-Z0-9_-]+\\.check$", remainder)
      check_f <- file.path(d, siblings[is_check])
      if (length(check_f) != 1) {
        stop(
          "hash checksum checking file does not exist: ",
          file.path(d, f)
        )
      }
      check_f
    }),
    # The checksum file is the checking file without its last extension.
    hash_checksum_file = map_chr(hash_checksum_checking_file, function(f) {
      checksum_f <- tools::file_path_sans_ext(f)
      if (!file.exists(checksum_f)) {
        stop("hash checksum file does not exist: ", checksum_f)
      }
      checksum_f
    }),
    hash_checksum_type = tools::file_ext(hash_checksum_file),
    # The checksum is the first whitespace-separated token of the single
    # non-empty line in the checksum file.
    hash_checksum = map_chr(hash_checksum_file, function(f) {
      checksum_line <- vroom_lines(f, skip_empty_rows = TRUE)
      if (length(checksum_line) != 1) stop("malformed checksum file: ", f)
      checksum <- strsplit(trimws(checksum_line), "[[:space:]]+")[[1]][1]
      if (is.na(checksum) || checksum == "") {
        stop("parsing checksum failed: ", f)
      }
      checksum
    }),
    # Status is read from the end of the single line in the checking file.
    # A file with zero or several lines is reported as "UNKNOWN" instead of
    # failing in if() on a non-scalar condition.
    hash_checksum_status = map_chr(hash_checksum_checking_file, function(f) {
      check_status <- vroom_lines(f, skip_empty_rows = TRUE)
      if (length(check_status) != 1) {
        "UNKNOWN"
      } else if (grepl("OK$", check_status)) {
        "OK"
      } else if (grepl("FAILED$", check_status)) {
        "FAILED"
      } else {
        "UNKNOWN"
      }
    }),
    id = uuid::UUIDgenerate(n = length(files)),
    date = Sys.Date(),
    # Read segment (R<n> or I<n>) found directly before the file-type and
    # compression extensions, preceded by "-", "_" or "."; NA if absent.
    read_type = pmap_chr(
      list(file, file_type, compression_type),
      function(f, ft, ct) {
        stem <- sub(paste0("\\.", ft, "(\\.", ct, ")?$"), "", f)
        str_extract(stem, pattern = "(?<=[-_.])(R|I)[0-9]+$")
      }
    ),
    # File name without read-type suffix, file type and compression
    # extension. A missing read type is left out of the pattern so that it
    # is not pasted in as the literal text "NA".
    basename = pmap_chr(
      list(file, read_type, file_type, compression_type),
      function(f, rt, ft, ct) {
        read_suffix <- if (is.na(rt)) "" else paste0("([-_.]", rt, ")?")
        sub(paste0(read_suffix, "\\.", ft, "(\\.", ct, ")?$"), "", f)
      }
    )
  ) %>%
  arrange(path)
# TRUE for every column of df that is still entirely NA.
# vapply() pins the result to one logical per column.
cols_status <- vapply(df, function(x) all(is.na(x)), logical(1))
# Identifier columns plus all still-empty columns.
sub_df <- df[, unique(c(
  "path", "first_level_parent_directory", "basename",
  names(cols_status)[cols_status]
))]
# Write the complete metadata table into root_dir. The output file name is the
# template's file name with its trailing csv/tsv/txt replaced by
# "auto_maintained.<timestamp>.csv".
auto_maintained_suffix <- paste0(
  "auto_maintained.", format(Sys.time(), "%Y%m%d_%H%M%S"), ".csv"
)
auto_maintained_name <- sub(
  "(csv|tsv|txt)$",
  auto_maintained_suffix,
  basename(metadata_template_file),
  ignore.case = TRUE
)
vroom_write(
  df,
  file = file.path(root_dir, auto_maintained_name),
  delim = ",",
  na = "",
  col_names = TRUE,
  append = FALSE
)
# Write sub_df (path, first_level_parent_directory, basename and every column
# that is still entirely NA) into root_dir. The output file name is the
# template's file name with its trailing csv/tsv/txt replaced by
# "manually_curated.<timestamp>.csv"; presumably this file is meant to be
# completed by hand.
# NOTE(review): the timestamp is taken from a separate Sys.time() call, so it
# can differ from the one used for the auto_maintained file.
# NOTE: the closing parenthesis of this call is on the next line of the file.
vroom_write(
sub_df,
file = file.path(root_dir, sub(
"(csv|tsv|txt)$",
paste0("manually_curated.", format(Sys.time(), "%Y%m%d_%H%M%S"), ".csv"),
basename(metadata_template_file),
ignore.case = TRUE
)),
delim = ",", na = "",
col_names = TRUE, append = FALSE
)

1 Raw omics-file-level metadata template
metadata_version: v1
1.1 Overview
This CSV template is designed to collect and describe raw omics-file-level metadata (e.g. FASTQ) in a standardized, machine-readable format.
1.2 Field description
| Field | Example | Definition |
|---|---|---|
| project_name | BrainAtlas2025 | Human-readable project title. |
| project_id | PRJNA123456 | Official accession assigned by a public repository (e.g. NCBI BioProject, CNCB GSA). |
| path | /data/fastq/B1_R1.fastq.gz | Absolute path to the file. |
| directory | /data/fastq | Folder containing the file. |
| first_level_parent_directory | fastq | The first-level parent directory of the file. |
| file | B1_R1.fastq.gz | File name only. |
| basename | B1 | The basename of the file (file name without the read-type suffix and the file-type/compression extensions). |
| file_size | 36724821 | Size in bytes (obtained via file.size() in R or stat -c%s in Linux command line). |
| file_type | fastq | Type of data file (fastq, bam, etc.). |
| compression_type | gzip | Compression algorithm (obtained via `file -b B1_R1.fastq.gz \| awk '{print tolower($1)}'`, e.g. gzip, bz2, xz, ascii). |
| hash_checksum_type | md5 | Algorithm used for checksum (e.g. md5, sha256). |
| hash_checksum | 5e4f28b8e215a6a7a1c4f5e3d2c1a0b9 | Hexadecimal checksum string. |
| hash_checksum_status | OK | Either OK, FAILED, or UNKNOWN. |
| hash_checksum_file | /data/fastq/B1_R1.fastq.gz.md5 | Path to hash checksum file. |
| hash_checksum_checking_file | /data/fastq/B1_R1.fastq.gz.md5.check | Path to hash checksum checking file. |
| id | 550e8400-e29b-41d4-a716-446655440000 | Globally unique identifier (generated by calling uuid::UUIDgenerate() in R). |
| source | SRA-SRP4321 | Provenance trace: database link, paper DOI, or lab notebook entry. |
| sample | CTX_B1 | Biological sample name/label. |
| sample_type | primary tissue | Controlled vocabulary: primary organ, primary tissue, primary cell line, organoid, cell line, etc. |
| biological_replicate | 1 | Integer, biological replicate number. |
| technical_replicate | 1 | Integer, technical replicate number. |
| species | mouse | Common name. |
| species_scientific_name | Mus musculus | Binomial nomenclature. |
| species_strain | C57BL/6 | Strain or stock. |
| species_model | autism spectrum disorder | Disease or experimental model (if any). |
| age | P7 | Age stage (e.g. E80, P4, adult, PCW20). |
| sex | male | male / female / mixed / unknown. |
| genotype | WT | Genotype or genetic modification. |
| organ | brain | Organ of origin. |
| tissue | cerebral cortex | Fine-grained tissue region. |
| cell_line | NA | Name of cell line (NA or empty if not applicable). |
| treatment | NA | Experimental treatment applied (drug, virus, etc.). |
| experiment_batch | EXP_batch_01 | Arbitrary batch identifier for wet-lab steps. |
| library_type | snRNA-seq | Sequencing library strategy (controlled vocabulary), e.g. snRNA-seq, snATAC-seq, RNA-seq, ATAC-seq, ChIP-seq (H3K27ac), Cut&Tag (Satb2), etc. |
| library_kit | 10x-3v3 | Commercial kit or in-house protocol name. |
| library_batch | LIB_batch_A | Library preparation batch. |
| omics_type | transcriptome | Genomic layer: transcriptome, epigenome, genome, etc. |
| sequencing_platform | Illumina NovaSeq | Instrument used. |
| sequencing_type | PE:150 | Read layout & length, e.g. PE:150, SE:100. |
| sequencing_batch | SEQ_batch_202509 | Sequencing run batch. |
| read_type | R1 | Read segment: R1, R2, I1, I2. |
| date | 2025-09-23 | Date on which this metadata record was created (ISO format). |
| note | No QC yet, spike-in added | Free-text comments not captured elsewhere. |
| data_status | raw | Lifecycle flag: raw, processed, archived. |
1.3 Usage Notes
Keep the header line exactly as provided.
Use `NA` or leave the cell empty for missing values.
Avoid special characters (comma, semicolon, line break) inside fields.
Store the CSV in UTF-8 encoding.
Increment `metadata_version` in the filename and README when structural changes occur.
1.4 Management scripts
- Extract file information for MobiVision-formatted raw data