Metadata template – BrainArch

1 Raw omics-file-level metadata template

metadata_version: v1

1.1 Overview

This CSV template is designed to collect and describe raw omics-file-level metadata (e.g. FASTQ) in a standardized, machine-readable format.

1.2 Field description

Field	Example	Definition
project_name	BrainAtlas2025	Human-readable project title.
project_id	PRJNA123456	Official accession assigned by a public repository (e.g. NCBI BioProject, CNCB GSA).
path	`/data/fastq/B1_R1.fastq.gz`	Absolute path to the file.
directory	`/data/fastq`	Folder containing the file.
first_level_parent_directory	`fastq`	The first-level parent directory of the file.
file	`B1_R1.fastq.gz`	File name only.
basename	`B1`	File name with the read-type suffix (e.g. `_R1`) and the file-type/compression extensions removed.
file_size	36724821	Size in bytes (obtained via `file.size()` in R or `stat -c%s` in Linux command line).
file_type	`fastq`	Type of data file (`fastq`, `bam`, etc.).
compression_type	`gzip`	Compression algorithm (obtained via `file -b B1_R1.fastq.gz \| awk '{print tolower($1)}'`, e.g. `gzip`, `bz2`, `xz`, `ascii`).
hash_checksum_type	`md5`	Algorithm used for checksum (e.g. `md5`, `sha256`).
hash_checksum	`5e4f28b8e215a6a7a1c4f5e3d2c1a0b9`	Hexadecimal checksum string.
hash_checksum_status	OK	Either OK, FAILED, or UNKNOWN.
hash_checksum_file	`/data/fastq/B1_R1.fastq.gz.md5`	Path to hash checksum file.
hash_checksum_checking_file	`/data/fastq/B1_R1.fastq.gz.md5.check`	Path to hash checksum checking file.
id	`550e8400-e29b-41d4-a716-446655440000`	Globally unique identifier (generated by calling `uuid::UUIDgenerate()` in R).
source	SRA-SRP4321	Provenance trace: database link, paper DOI, or lab notebook entry.
sample	CTX_B1	Biological sample name/label.
sample_type	primary tissue	Controlled vocabulary: primary organ, primary tissue, primary cell line, organoid, cell line, etc.
biological_replicate	1	Integer, biological replicate number.
technical_replicate	1	Integer, technical replicate number.
species	mouse	Common name.
species_scientific_name	Mus musculus	Binomial nomenclature.
species_strain	C57BL/6	Strain or stock.
species_model	autism spectrum disorder	Disease or experimental model (if any).
age	P7	Age stage (e.g. E80, P4, adult, PCW20).
sex	male	male / female / mixed / unknown.
genotype	WT	Genotype or genetic modification.
organ	brain	Organ of origin.
tissue	cerebral cortex	Fine-grained tissue region.
cell_line	NA	Name of cell line (NA or empty if not applicable).
treatment	NA	Experimental treatment applied (drug, virus, etc.).
experiment_batch	EXP_batch_01	Arbitrary batch identifier for wet-lab steps.
library_type	snRNA-seq	Sequencing library strategy (controlled vocabulary), e.g. snRNA-seq, snATAC-seq, RNA-seq, ATAC-seq, ChIP-seq (H3K27ac), Cut&Tag (Satb2), etc.
library_kit	10x-3v3	Commercial kit or in-house protocol name.
library_batch	LIB_batch_A	Library preparation batch.
omics_type	transcriptome	Omics layer: transcriptome, epigenome, genome, etc.
sequencing_platform	Illumina NovaSeq	Instrument used.
sequencing_type	PE:150	Read layout & length, e.g. PE:150, SE:100.
sequencing_batch	SEQ_batch_202509	Sequencing run batch.
read_type	R1	Read segment: R1, R2, I1, I2.
date	2025-09-23	Date on which this metadata record was created (ISO 8601 format, YYYY-MM-DD).
note	No QC yet - spike-in added	Free-text comments not captured elsewhere.
data_status	raw	Lifecycle flag: raw, processed, archived.

1.3 Usage Notes

Keep the header line exactly as provided.
Use NA or leave the cell empty for missing values.
Avoid special characters (comma, semicolon, line break) inside fields.
Store the CSV in UTF-8 encoding.
Increment metadata_version in the filename and README when structural changes occur.

1.4 Management scripts

Extract file information for MobiVision-formatted raw data

library(tidyverse)
library(vroom)

#' Guess the compression format of files from their leading magic bytes.
#'
#' Only the first bytes of each file are read; the file name is ignored.
#'
#' @param files Character vector of paths to existing files.
#' @return An unnamed character vector with one element per entry of `files`,
#'   each being "gz", "bz2", "xz", "zip", "zst" or "unknown". Always a
#'   character vector, including `character(0)` for empty input.
#' @details Stops with an error if any path does not exist.
guess_compression_type <- function(files) {
    # Magic-byte signatures, tested in this order.
    signatures <- list(
        gz = as.raw(c(0x1f, 0x8b)),
        bz2 = as.raw(c(0x42, 0x5a, 0x68)),
        xz = as.raw(c(0xfd, 0x37, 0x7a, 0x58, 0x5a, 0x00)),
        zip = as.raw(c(0x50, 0x4b, 0x03, 0x04)),
        zst = as.raw(c(0x28, 0xb5, 0x2f, 0xfd))
    )
    # vapply() pins the return type; sapply() would yield list() for no files.
    vapply(files, function(f) {
        if (!file.exists(f)) stop("file does not exist: ", f, call. = FALSE)
        con <- file(f, open = "rb")
        on.exit(close(con), add = TRUE)
        f_header <- readBin(con, what = "raw", n = 32)
        for (type in names(signatures)) {
            sig <- signatures[[type]]
            # Require enough bytes: out-of-range raw indexing pads with 0x00,
            # which would let a truncated header match a signature ending in 0x00.
            if (length(f_header) >= length(sig) &&
                identical(f_header[seq_along(sig)], sig)) {
                return(type)
            }
        }
        "unknown"
    }, character(1), USE.NAMES = FALSE)
}

# Template CSV whose header row defines the metadata columns.
metadata_template_file <- "/home/dell/YRArchive/NeuroBorder/Blogs/Bioinformatics/posts/Misc/metadata_template/data/raw_omics-file-level_metadata_template.v1.csv"
# Root of the raw-data tree to index (scanned recursively).
root_dir <- "/data/database/data/raw/fastq/20250923_164603/snRNA-seq"
# Checksum sidecar files (`*.md5`, `*.md5.check`) are not indexed themselves.
exclude_file_pattern <- ".+\\.md5(\\.check)?$"

files <- list.files(root_dir, full.names = TRUE, recursive = TRUE)
files <- files[!grepl(exclude_file_pattern, files)]

# This script writes its metadata tables into root_dir, named
# "<template name minus csv|tsv|txt>(auto_maintained|manually_curated).<YYYYmmdd_HHMMSS>.csv".
# Drop tables left by earlier runs: they have no checksum sidecar files, so
# indexing them would abort the run. The template-derived prefix is compared
# literally (not as a regex) so dots in the name cannot over-match.
generated_file_prefix <- sub(
    "(csv|tsv|txt)$", "",
    basename(metadata_template_file),
    ignore.case = TRUE
)
file_names <- basename(files)
is_generated_file <- startsWith(file_names, generated_file_prefix) &
    grepl(
        "^(auto_maintained|manually_curated)\\.[0-9]{8}_[0-9]{6}\\.csv$",
        substring(file_names, nchar(generated_file_prefix) + 1L)
    )
files <- files[!is_generated_file]

# Read the template with every column as character, keep only its header,
# then expand it to one all-NA row per data file.
metadata_template_df <- vroom(metadata_template_file,
    delim = ",", col_names = TRUE,
    col_types = cols(.default = col_character())
)
metadata_template_df <- metadata_template_df[0, ]
metadata_template_df <- metadata_template_df[rep(NA_integer_, length(files)), ]

# Fill in every column that can be derived from the files themselves.
# Columns left untouched stay NA for manual curation.
df <- metadata_template_df %>%
    mutate(
        path = files,
        directory = dirname(path),
        first_level_parent_directory = basename(dirname(path)),
        file = basename(path),
        file_size = as.character(file.size(path)),
        compression_type = guess_compression_type(path),
        # File type = extension left after removing the compression extension.
        file_type = map2_chr(file, compression_type, function(f, ct) {
            tools::file_ext(sub(paste0("\\.", ct, "$"), "", f))
        }),
        # Locate the single "<file>.<algorithm>.check" sidecar next to the file.
        # The file name is matched literally as a prefix; used as a regex, its
        # dots and missing start anchor would also match other files' sidecars
        # (e.g. "AS1.fq.gz.md5.check" when looking for "S1.fq.gz").
        hash_checksum_checking_file = map2_chr(directory, file, function(d, f) {
            siblings <- list.files(d, full.names = FALSE, recursive = FALSE)
            prefix <- paste0(f, ".")
            suffix <- substring(siblings, nchar(prefix) + 1L)
            is_check <- startsWith(siblings, prefix) &
                grepl("^[a-zA-Z0-9_-]+\\.check$", suffix)
            check_f <- file.path(d, siblings[is_check])
            if (length(check_f) != 1) stop("hash checksum checking file does not exist")
            check_f
        }),
        # The checksum file is the checking file without its ".check" extension.
        hash_checksum_file = map_chr(hash_checksum_checking_file, function(f) {
            checksum_f <- tools::file_path_sans_ext(f)
            if (!file.exists(checksum_f)) stop("hash checksum file does not exist")
            checksum_f
        }),
        hash_checksum_type = tools::file_ext(hash_checksum_file),
        # The checksum is the first space-separated token of the only line.
        hash_checksum = map_chr(hash_checksum_file, function(f) {
            checksum_line <- vroom_lines(f, skip_empty_rows = TRUE)
            if (length(checksum_line) != 1) stop("malformed checksum file")
            checksum <- trimws(strsplit(checksum_line, " +")[[1]][1])
            if (checksum == "") stop("parsing checksum failed")
            checksum
        }),
        hash_checksum_status = map_chr(hash_checksum_checking_file, function(f) {
            check_status <- vroom_lines(f, skip_empty_rows = TRUE)
            if (length(check_status) != 1) {
                # An empty or multi-line checking file cannot be interpreted;
                # `if` needs a scalar condition, so report it as UNKNOWN.
                "UNKNOWN"
            } else if (grepl("OK$", check_status)) {
                "OK"
            } else if (grepl("FAILED$", check_status)) {
                "FAILED"
            } else {
                "UNKNOWN"
            }
        }),
        id = uuid::UUIDgenerate(n = length(files)),
        date = Sys.Date(),
        # Read type = trailing R<n>/I<n> token (after "-", "_" or ".") of the
        # file name once the file-type and compression extensions are removed.
        read_type = pmap_chr(
            list(file, file_type, compression_type),
            function(f, ft, ct) {
                stem <- sub(paste0("\\.", ft, "(\\.", ct, ")?$"), "", f)
                str_extract(stem, pattern = "(?<=[-_.])(R|I)[0-9]+$")
            }
        ),
        # Basename = file name without the read-type suffix and extensions.
        basename = pmap_chr(
            list(file, read_type, file_type, compression_type),
            function(f, rt, ft, ct) {
                sub(paste0("([-_.]", rt, ")?\\.", ft, "(\\.", ct, ")?$"), "", f)
            }
        )
    ) %>%
    arrange(path)

# Flag the columns that are still entirely NA after automatic extraction.
cols_status <- vapply(df, function(x) all(is.na(x)), logical(1))
# Manual-curation table: three identifying columns plus every all-NA column.
sub_df <- df[, unique(c("path", "first_level_parent_directory", "basename", names(cols_status)[cols_status]))]

# One timestamp for both outputs, so the paired files always share a stamp
# (two separate Sys.time() calls can straddle a second boundary).
output_timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")

# Build an output path inside root_dir from the template file name by replacing
# its trailing csv/tsv/txt with "<tag>.<timestamp>.csv".
metadata_output_file <- function(tag) {
    file.path(root_dir, sub(
        "(csv|tsv|txt)$",
        paste0(tag, ".", output_timestamp, ".csv"),
        basename(metadata_template_file),
        ignore.case = TRUE
    ))
}

# Full table with every automatically derived column filled in.
vroom_write(
    df,
    file = metadata_output_file("auto_maintained"),
    delim = ",", na = "",
    col_names = TRUE, append = FALSE
)

# Reduced table containing only the columns left for manual curation.
vroom_write(
    sub_df,
    file = metadata_output_file("manually_curated"),
    delim = ",", na = "",
    col_names = TRUE, append = FALSE
)