diff --git a/image_development/Dockerfile_nextits b/image_development/Dockerfile_nextits index 1b8cce8..c8d35f2 100644 --- a/image_development/Dockerfile_nextits +++ b/image_development/Dockerfile_nextits @@ -9,7 +9,7 @@ # docker build --target test --tag nextits-test --file NextITS.dockerfile . ## Build stage 1 (Rust and Cargo) -FROM rust:1.89.0-slim AS rust +FROM rust:1.92.0-slim AS rust RUN cargo install runiq sd # Stage 2 (Nextflow - minimal stage) @@ -58,14 +58,18 @@ RUN R -e 'BiocManager::install("Biostrings", ask = FALSE)' \ && R -e 'BiocManager::install("phyloseq", ask = FALSE)' \ && rm -rf /tmp/downloaded_packages -RUN install2.r --error --skipinstalled geodist phytools ggdendro gridExtra \ - && R -e 'ok <- tryCatch({ remotes::install_github("mikemc/speedyseq"); TRUE }, error=function(e){ message(e); FALSE }); \ - if (!ok || !requireNamespace("speedyseq", quietly=TRUE)) quit(status=1)' \ - && R -e 'ok <- tryCatch({ remotes::install_github("vmikk/metagMisc"); TRUE }, error=function(e){ message(e); FALSE }); \ - if (!ok || !requireNamespace("metagMisc", quietly=TRUE)) quit(status=1)' \ - && R -e 'ok <- tryCatch({ remotes::install_cran("qs2", type = "source", configure.args = "--with-simd=AVX2"); TRUE }, error=function(e){ message(e); FALSE }); \ - if (!ok || !requireNamespace("qs2", quietly=TRUE)) quit(status=1)' \ - && rm -rf /tmp/downloaded_packages +RUN install2.r --error --skipinstalled \ + geodist \ + phytools \ + ggdendro \ + gridExtra \ + && R -e 'ok <- tryCatch({ remotes::install_github("mikemc/speedyseq"); TRUE }, error=function(e){ message(e); FALSE }); \ + if (!ok || !requireNamespace("speedyseq", quietly=TRUE)) quit(status=1)' \ + && R -e 'ok <- tryCatch({ remotes::install_github("vmikk/metagMisc"); TRUE }, error=function(e){ message(e); FALSE }); \ + if (!ok || !requireNamespace("metagMisc", quietly=TRUE)) quit(status=1)' \ + && R -e 'ok <- tryCatch({ remotes::install_cran("qs", type = "source", configure.args = "--with-simd=AVX2"); TRUE }, 
error=function(e){ message(e); FALSE }); \ + if (!ok || !requireNamespace("qs", quietly=TRUE)) quit(status=1)' \ + && rm -rf /tmp/downloaded_packages ## Install conda RUN mkdir -p /opt/software \ @@ -97,23 +101,23 @@ RUN cd /opt/software \ RUN /opt/software/conda/bin/mamba install -y \ "lima>=2.13.0" \ "pbtk>=3.5.0" \ - "vsearch>=2.30.0" \ - "swarm>=3.1.5" \ - "seqkit>=2.10.1" \ - "seqfu>=1.22.3" \ + "vsearch>=2.30.3" \ + "swarm>=3.1.6" \ + "seqkit>=2.12.0" \ + "seqfu>=1.23.0" \ "fastp>=1.0.1" \ "blast>=2.17.0" \ "bioawk" \ - "miller>=6.13.0" \ + "miller>=6.16.0" \ "xsv>=0.13.0" \ "bedtools>=2.31.1" \ - "parallel>=20250622" \ - "csvtk>=0.34.0" \ - "cutadapt>=5.1" \ + "parallel>=20251122" \ + "csvtk>=0.36.0" \ + "cutadapt>=5.2" \ "itsx>=1.1.3" \ - "bbmap>=39.33" \ - "ripgrep>=14.1.1" \ - "fd-find>=10.2.0" \ + "bbmap>=39.52" \ + "ripgrep>=15.1.0" \ + "fd-find>=10.3.0" \ "mmseqs2" \ "mamba" \ && /opt/software/conda/bin/conda clean --all --yes @@ -128,7 +132,7 @@ RUN cd /opt/software \ && wget https://github.com/vmikk/seqhasher/releases/download/1.1.2/seqhasher \ && chmod +x seqhasher \ && mv seqhasher /opt/software/conda/bin/ \ - && wget https://github.com/vmikk/phredsort/releases/download/1.3.0/phredsort \ + && wget https://github.com/vmikk/phredsort/releases/download/1.4.0/phredsort \ && chmod +x phredsort \ && mv phredsort /opt/software/conda/bin/ \ && wget https://github.com/vmikk/ucs/releases/download/0.8.0/ucs \ @@ -144,7 +148,7 @@ RUN git clone --depth 1 https://github.com/indraniel/fqgrep \ && rm -r fqgrep ## rush -RUN wget https://github.com/shenwei356/rush/releases/download/v0.7.0/rush_linux_amd64.tar.gz \ +RUN wget https://github.com/shenwei356/rush/releases/download/v0.8.0/rush_linux_amd64.tar.gz \ && tar -xzf rush_linux_amd64.tar.gz \ && mv rush /opt/software/conda/bin/ \ && rm rush_linux_amd64.tar.gz @@ -179,7 +183,7 @@ RUN cd /opt/software \ ## Install DuckDB RUN cd /opt/software \ - && curl -L 
https://github.com/duckdb/duckdb/releases/download/v1.3.2/duckdb_cli-linux-amd64.zip -o duckdb_cli-linux-amd64.zip \ + && curl -L https://github.com/duckdb/duckdb/releases/download/v1.4.3/duckdb_cli-linux-amd64.zip -o duckdb_cli-linux-amd64.zip \ && unzip duckdb_cli-linux-amd64.zip -d /opt/software/conda/bin/ \ && rm duckdb_cli-linux-amd64.zip @@ -221,7 +225,7 @@ ENTRYPOINT ["/opt/software/entrypoint.sh"] FROM main AS test # Set environment variable for R version testing -ENV R_VERSION=4.5.1 +ENV R_VERSION=4.5.2 RUN echo "=== Testing R installation and packages ===" \ && R --quiet -e "stopifnot(getRversion() == '${R_VERSION}')" \ @@ -230,7 +234,7 @@ RUN echo "=== Testing R installation and packages ===" \ 'required_packages <- c("optparse", "data.table", "arrow", "duckdb",' \ ' "plyr", "dplyr", "ggplot2", "openxlsx", "yaml",' \ ' "Biostrings", "DECIPHER", "dada2", "phyloseq",' \ - ' "metagMisc", "qs2")' \ + ' "metagMisc", "qs")' \ '' \ 'for(pkg in required_packages) {' \ ' cat("Testing package:", pkg, "... ")' \ @@ -268,4 +272,5 @@ RUN echo "=== Testing R installation and packages ===" \ echo "FAILED - $tool not found in PATH" && exit 1; \ fi; \ done \ - && echo "=== All tests passed! Container looks ready for use ===" \ No newline at end of file + && echo "=== All tests passed! Container looks ready for use ===" + \ No newline at end of file diff --git a/src/pipecraft-core/service_scripts/NextITS/CHANGELOG.md b/src/pipecraft-core/service_scripts/NextITS/CHANGELOG.md index 7f95745..0b92d7a 100644 --- a/src/pipecraft-core/service_scripts/NextITS/CHANGELOG.md +++ b/src/pipecraft-core/service_scripts/NextITS/CHANGELOG.md @@ -7,7 +7,7 @@ For version numbering, we use the following convention: `MAJOR.MINOR.PATCH`. Each element increases numerically (e.g., `1.9.0` -> `1.10.0` -> `1.11.0`). 
-## [1.1.0] - 2025-xx-xx +## [1.1.0] - 2026-01-22 - Fixed handling of unknown barcode combinations (in `dual asymmetric` mode); thanks to Alice Retter for reporting - Refactored and optimized the tag-jump removal step @@ -18,6 +18,8 @@ Each element increases numerically (e.g., `1.9.0` -> `1.10.0` -> `1.11.0`). - `lima_remove_unknown` (default, `false`; if `true`, unknown barcode combinations are removed from demultiplexed data) - `chunking_n` (number of chunks to split the dataset into prior to clustering) - `chunking_id` (minimum sequence identity used for splitting the dataset into chunks) + - `chimera_methods` (specifies which chimera removal methods to use - "ref" for reference-based, "denovo" for de novo, or "ref,denovo" for both; could be also "none" or `null` to disable chimera removal) + - `tj` (specifies whether to run tag-jump removal - "true" or "false") - Added DADA2 denoising (`--preclustering dada2`; also works with `--clustering none`) - Implemented automated documentation for analysis procedures (generates `README_Step1_Methods.txt` and `README_Step2_Methods.txt` in the `pipeline_info` directory) - Refactored parameter validation (using `nf-schema` plugin) diff --git a/src/pipecraft-core/service_scripts/NextITS/CITATION.cff b/src/pipecraft-core/service_scripts/NextITS/CITATION.cff index 00c0105..67541b1 100644 --- a/src/pipecraft-core/service_scripts/NextITS/CITATION.cff +++ b/src/pipecraft-core/service_scripts/NextITS/CITATION.cff @@ -11,7 +11,7 @@ authors: - family-names: "Tedersoo" given-names: "Leho" orcid: "https://orcid.org/0000-0002-1635-1249" -version: 1.0.0 +version: 1.1.0 doi: 10.5281/zenodo.15074881 date-released: 2025-03-24 url: "https://github.com/vmikk/NextITS" diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/ITSx_to_DuckDB.sh b/src/pipecraft-core/service_scripts/NextITS/bin/ITSx_to_DuckDB.sh index d32070f..84b69ae 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/ITSx_to_DuckDB.sh +++ 
b/src/pipecraft-core/service_scripts/NextITS/bin/ITSx_to_DuckDB.sh @@ -1,146 +1,146 @@ -#!/bin/bash - -## Import sequences into DuckDB-compatible tables - -## Input = FASTA formatted sequences (header = "hash;size=...") -## Output = Table in DuckDB-native format or Parquet - -# Define usage function -usage() { - echo "Usage: $0 [-i input_file] [-o output_file] [-f format]" - echo " -i : Input FASTA file (required)" - echo " -o : Output file (optional, defaults to input filename with .db/.parquet extension)" - echo " -f : Output format (optional): 'duckdb' or 'parquet' (default, 'parquet')" - exit 1 -} - -# Parse command line arguments -input_file="" -output_file="" -format="parquet" # format="duckdb" - -while getopts "i:o:f:h" opt; do - case $opt in - i) input_file="$OPTARG" ;; - o) output_file="$OPTARG" ;; - f) format="$OPTARG" ;; - h) usage ;; - ?) usage ;; - esac -done - -# Validate required parameters -if [ -z "$input_file" ]; then - echo "Error: Input file is required" - usage -fi - -# Validate output format -if [ "$format" != "duckdb" ] && [ "$format" != "parquet" ]; then - echo "Error: Format must be either 'duckdb' or 'parquet'" - usage -fi - -## Extract rRNA region name from filename -if [[ $input_file =~ ([^.]+)\.fasta\.gz$ ]]; then - rRNA_part="${BASH_REMATCH[1]}" -else - echo "Error in extracting rRNA region name from filename" - rRNA_part="X" -fi - -## Check if rRNA region name is valid -VALID_PARTS=("full" "SSU" "ITS1" "5_8S" "ITS2" "LSU") -if [[ ! " ${VALID_PARTS[@]} " =~ " ${rRNA_part} " ]]; then - echo "..Error: Invalid rRNA region name. 
Supported names are: ${VALID_PARTS[*]}" - rRNA_part="X" -fi - -## 'full' is a reserved keyword in DuckDB, rename to ITS -if [ "$rRNA_part" == "full" ]; then - rRNA_part="ITS" -fi - -## DuckDB table name cannot start with a number -if [[ "$rRNA_part" == "5_8S" ]]; then - rRNA_part="S58" -fi - -## Extract sample name from filename -sample_name="${input_file/.fasta.gz/}" - -# Set output file if not specified -if [ -z "${output_file}" ]; then - if [ "${format}" == "duckdb" ]; then - output_file="${sample_name}.db" - else - output_file="${sample_name}.parquet" - fi -fi - -## Check if input file exists -if [ ! -f "${input_file}" ]; then - echo "..Error: File ${input_file} not found" - exit 1 -fi - -echo "..Importing ${input_file} into ${output_file} (format: ${format})" - -if [ "$format" == "duckdb" ]; then - seqkit fx2tab "${input_file}" \ - | sed 's/;size=/\t/' \ - | duckdb "${output_file}" \ - " - DROP TABLE IF EXISTS ${rRNA_part}; - CREATE TABLE ${rRNA_part} ( - SeqID VARCHAR PRIMARY KEY, - Abundance INTEGER, - Sequence VARCHAR - ); - - INSERT INTO ${rRNA_part} - SELECT * FROM read_csv( - '/dev/stdin', - header = false, delim = '\t', - columns = { - 'SeqID': 'VARCHAR', - 'Abundance': 'INTEGER', - 'Sequence': 'VARCHAR' - } - );" -else - seqkit fx2tab "${input_file}" \ - | sed 's/;size=/\t/' \ - | duckdb -c " - COPY ( - SELECT * FROM read_csv( - '/dev/stdin', - header = false, delim = '\t', - columns = { - 'SeqID': 'VARCHAR', - 'Abundance': 'INTEGER', - 'Sequence': 'VARCHAR' - } - ) - ) TO '${output_file}' (FORMAT PARQUET, COMPRESSION 'ZSTD', COMPRESSION_LEVEL 12);" -fi - -echo "..Data imported to ${output_file}" - - -#### Check the data -# duckdb "$db_file" -# -# -- Show all tables -# SHOW TABLES; -# SELECT * FROM information_schema.tables; -# -# -- Show all column names and their types -# DESCRIBE ITS1; -# -# -- Show first 10 rows -# SELECT * FROM ITS1 LIMIT 10; -# -# -- Get count of rows -# SELECT COUNT(*) FROM ITS1; +#!/bin/bash + +## Import sequences into 
DuckDB-compatible tables + +## Input = FASTA formatted sequences (header = "hash;size=...") +## Output = Table in DuckDB-native format or Parquet + +# Define usage function +usage() { + echo "Usage: $0 [-i input_file] [-o output_file] [-f format]" + echo " -i : Input FASTA file (required)" + echo " -o : Output file (optional, defaults to input filename with .db/.parquet extension)" + echo " -f : Output format (optional): 'duckdb' or 'parquet' (default, 'parquet')" + exit 1 +} + +# Parse command line arguments +input_file="" +output_file="" +format="parquet" # format="duckdb" + +while getopts "i:o:f:h" opt; do + case $opt in + i) input_file="$OPTARG" ;; + o) output_file="$OPTARG" ;; + f) format="$OPTARG" ;; + h) usage ;; + ?) usage ;; + esac +done + +# Validate required parameters +if [ -z "$input_file" ]; then + echo "Error: Input file is required" + usage +fi + +# Validate output format +if [ "$format" != "duckdb" ] && [ "$format" != "parquet" ]; then + echo "Error: Format must be either 'duckdb' or 'parquet'" + usage +fi + +## Extract rRNA region name from filename +if [[ $input_file =~ ([^.]+)\.fasta\.gz$ ]]; then + rRNA_part="${BASH_REMATCH[1]}" +else + echo "Error in extracting rRNA region name from filename" + rRNA_part="X" +fi + +## Check if rRNA region name is valid +VALID_PARTS=("full" "SSU" "ITS1" "5_8S" "ITS2" "LSU") +if [[ ! " ${VALID_PARTS[@]} " =~ " ${rRNA_part} " ]]; then + echo "..Error: Invalid rRNA region name. 
Supported names are: ${VALID_PARTS[*]}" + rRNA_part="X" +fi + +## 'full' is a reserved keyword in DuckDB, rename to ITS +if [ "$rRNA_part" == "full" ]; then + rRNA_part="ITS" +fi + +## DuckDB table name cannot start with a number +if [[ "$rRNA_part" == "5_8S" ]]; then + rRNA_part="S58" +fi + +## Extract sample name from filename +sample_name="${input_file/.fasta.gz/}" + +# Set output file if not specified +if [ -z "${output_file}" ]; then + if [ "${format}" == "duckdb" ]; then + output_file="${sample_name}.db" + else + output_file="${sample_name}.parquet" + fi +fi + +## Check if input file exists +if [ ! -f "${input_file}" ]; then + echo "..Error: File ${input_file} not found" + exit 1 +fi + +echo "..Importing ${input_file} into ${output_file} (format: ${format})" + +if [ "$format" == "duckdb" ]; then + seqkit fx2tab "${input_file}" \ + | sed 's/;size=/\t/' \ + | duckdb "${output_file}" \ + " + DROP TABLE IF EXISTS ${rRNA_part}; + CREATE TABLE ${rRNA_part} ( + SeqID VARCHAR PRIMARY KEY, + Abundance INTEGER, + Sequence VARCHAR + ); + + INSERT INTO ${rRNA_part} + SELECT * FROM read_csv( + '/dev/stdin', + header = false, delim = '\t', + columns = { + 'SeqID': 'VARCHAR', + 'Abundance': 'INTEGER', + 'Sequence': 'VARCHAR' + } + );" +else + seqkit fx2tab "${input_file}" \ + | sed 's/;size=/\t/' \ + | duckdb -c " + COPY ( + SELECT * FROM read_csv( + '/dev/stdin', + header = false, delim = '\t', + columns = { + 'SeqID': 'VARCHAR', + 'Abundance': 'INTEGER', + 'Sequence': 'VARCHAR' + } + ) + ) TO '${output_file}' (FORMAT PARQUET, COMPRESSION 'ZSTD', COMPRESSION_LEVEL 12);" +fi + +echo "..Data imported to ${output_file}" + + +#### Check the data +# duckdb "$db_file" +# +# -- Show all tables +# SHOW TABLES; +# SELECT * FROM information_schema.tables; +# +# -- Show all column names and their types +# DESCRIBE ITS1; +# +# -- Show first 10 rows +# SELECT * FROM ITS1 LIMIT 10; +# +# -- Get count of rows +# SELECT COUNT(*) FROM ITS1; # \ No newline at end of file diff --git 
a/src/pipecraft-core/service_scripts/NextITS/bin/aggregate_sequences.R b/src/pipecraft-core/service_scripts/NextITS/bin/aggregate_sequences.R index cf5bb74..8f71d9f 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/aggregate_sequences.R +++ b/src/pipecraft-core/service_scripts/NextITS/bin/aggregate_sequences.R @@ -1,173 +1,173 @@ -#!/usr/bin/env Rscript - -## Script to aggregate sequences from multiple runs into a single file (for dereplication and subsequent clustering) -## Also, performs removal of de novo chimeras with high scores (with option to recover sequences that occurred in multiple runs) - -## Do-novo chimera recovery: -# if a sequence identified as putative chimera was observed in the other samples, -# where there is no evidence that it is chimeric, it will be recovered - - -## Function to load packages -load_pckg <- function(pkg = "data.table"){ - suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) - cat(".. ", paste(pkg, packageVersion(pkg), "\n")) -} - -cat("Loading packages:\n") - -load_pckg("optparse") -load_pckg("data.table") -load_pckg("Biostrings") -load_pckg("plyr") -load_pckg("arrow") -# load_pckg("dplyr") - - -cat("Parsing input options and arguments...\n") - -option_list <- list( - make_option("--seqtabs", action="store", default=NA, type='character', help = "Direcotry containing long tables with quality-filtered sequences (Parquet format)"), - make_option("--maxchim", action="store", default=0.6, type='numeric', help = "Maximum de novo chimera score to remove"), - make_option("--recoverdenovo", action="store", default=TRUE, type='logical', help="Recover de-novo chimeras (logical)"), - make_option("--output", action="store", default="Seqs", type='character', help = "Output prefix"), - make_option("--threads", action="store", default=4, type='integer', help = "Number of CPU threads to use") -) - -opt <- parse_args(OptionParser(option_list=option_list)) - -## Function to convert text "NA"s to NA -to_na <- 
function(x){ - if(x %in% c("NA", "null", "Null")){ x <- NA } - return(x) -} - -## Replaces "null"s from Nextflow with NA -opt <- lapply(X = opt, FUN = to_na) - - -## Validation of the required arguments -if (is.na(opt$seqtabs)) { - stop("Input directory with quality-filtered sequences is not specified\n") -} - -## Assign variables -SEQTABS <- opt$seqtabs -MAXCHIM <- opt$maxchim -RECOV_DENOVO <- opt$recoverdenovo -OUTPUT <- opt$output -CPUTHREADS <- as.numeric( opt$threads ) - -## Log assigned variables -cat(paste("Path to sequence tables: ", SEQTABS, "\n", sep="")) -cat(paste("Max de novo chimera score: ", MAXCHIM, "\n", sep="")) -cat(paste("De novo chimera recovery: ", RECOV_DENOVO, "\n", sep="")) -cat(paste("Output prefix: ", OUTPUT, "\n", sep="")) -cat(paste("CPU threads: ", CPUTHREADS, "\n", sep="")) - -cat("\n") - -## Set CPU thread number -cat("Setting number of CPU threads to: ", CPUTHREADS, "\n") -setDTthreads(threads = CPUTHREADS) # for data.table -set_cpu_count(CPUTHREADS) # for arrow - - -###################################### -###################################### Process the data -###################################### - -## Load sequence tables -cat("\n..Looking for sequence tables\n") -TABS <- list.files(path = SEQTABS, pattern = ".parquet", full.names = TRUE, recursive = TRUE) -cat("... Tables found: ", length(TABS), "\n") - -cat("\n..Loading sequence tables\n") -TAB <- alply(.data = TABS, .margins = 1, .fun = function(x){ - res <- arrow::read_parquet(x) - setDT(res) - return(res) -}) -TAB <- rbindlist(TAB, use.names = TRUE, fill = TRUE) -cat("... Total number of records: ", nrow(TAB), "\n") -cat("... Total number unique sequences: ", length(unique(TAB$Sequence)), "\n") -cat("... 
Total number unique samples (fastq files): ", length(unique(TAB$SampleID)), "\n") - - -## Filter sequences by chimeric score (MAXCHIM) -if(!is.na(MAXCHIM)){ - - cat("\n..Filtering data by max de novo chimera score\n") - nrecs <- nrow(TAB) - nabun <- sum(TAB$Abundance, na.rm = TRUE) - - cat("... Max de novo chimera score observed: ", max(TAB$DeNovo_Chimera_Score, na.rm = TRUE), "\n") - - ## If no chimera recovery is required - if(RECOV_DENOVO == FALSE){ - - TAB <- TAB[ DeNovo_Chimera_Score < MAXCHIM | is.na(DeNovo_Chimera_Score) ] - - ## If we need to recover chimeras - } else { - - ## Find putative chimeras - CHIMERAS <- TAB[ DeNovo_Chimera_Score >= MAXCHIM, .(SeqID___SampleID, DeNovo_Chimera_Score, Sequence, Abundance) ] - NONCHIMERAS <- TAB[ ! SeqID___SampleID %in% CHIMERAS$SeqID___SampleID ] - - ## Recover false-positives - chim_seqs <- unique(CHIMERAS$Sequence) - nonchim_seqs <- unique(NONCHIMERAS$Sequence) - fp_chims <- chim_seqs %in% nonchim_seqs - if(any(fp_chims)){ - cat(".... Probably there are a few false-positive chimeras\n") - cat(".... Recovering ", sum(fp_chims), "sequences\n") - fp_seqs <- chim_seqs[ fp_chims ] - CHIMERAS <- CHIMERAS[ ! Sequence %in% fp_seqs ] - rm(fp_seqs) - } - - TAB <- TAB[ ! Sequence %in% CHIMERAS$Sequence ] - rm(CHIMERAS, NONCHIMERAS) - - } # end of chimera recovery - - ## Data summary after filtering - nrecs_delta <- nrecs - nrow(TAB) - nabun_delta <- nabun - sum(TAB$Abundance, na.rm = TRUE) - - cat("... Records removed: ", nrecs_delta, " (", round(nrecs_delta/nrecs * 100, 1), "%)\n") - cat("... 
Reads removed: ", nabun_delta, " (", round(nabun_delta/nabun * 100, 1), "%)\n") - - rm(nrecs_delta, nabun_delta) - -} # end of MAXCHIM filtering - - -cat("\n..Sorting table by abundance, quality score\n") -setorder(x = TAB, -Abundance, -PhredScore, SeqID) - -cat("..Preparing FASTA file\n") - -SQF <- DNAStringSet(x = TAB$Sequence) -names(SQF) <- paste0(TAB$SeqID, ";size=", TAB$Abundance) # , ";sample=", TAB$SampleID, ";" - -## Export FASTA -cat("..Exporting FASTA file with filtered sequences\n") - -writeXStringSet( - x = SQF, - filepath = paste0(OUTPUT, ".fa.gz"), - compress = TRUE, format = "fasta", width = 9999) - -## Export FASTA -cat("..Exporting filtered table\n") - -write_parquet( - x = TAB, - sink = paste0(OUTPUT, ".parquet"), - compression = "zstd", - compression_level = 10, - use_dictionary = TRUE) - -cat("All done.\n") +#!/usr/bin/env Rscript + +## Script to aggregate sequences from multiple runs into a single file (for dereplication and subsequent clustering) +## Also, performs removal of de novo chimeras with high scores (with option to recover sequences that occurred in multiple runs) + +## Do-novo chimera recovery: +# if a sequence identified as putative chimera was observed in the other samples, +# where there is no evidence that it is chimeric, it will be recovered + + +## Function to load packages +load_pckg <- function(pkg = "data.table"){ + suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) + cat(".. 
", paste(pkg, packageVersion(pkg), "\n")) +} + +cat("Loading packages:\n") + +load_pckg("optparse") +load_pckg("data.table") +load_pckg("Biostrings") +load_pckg("plyr") +load_pckg("arrow") +# load_pckg("dplyr") + + +cat("Parsing input options and arguments...\n") + +option_list <- list( + make_option("--seqtabs", action="store", default=NA, type='character', help = "Direcotry containing long tables with quality-filtered sequences (Parquet format)"), + make_option("--maxchim", action="store", default=0.6, type='numeric', help = "Maximum de novo chimera score to remove"), + make_option("--recoverdenovo", action="store", default=TRUE, type='logical', help="Recover de-novo chimeras (logical)"), + make_option("--output", action="store", default="Seqs", type='character', help = "Output prefix"), + make_option("--threads", action="store", default=4, type='integer', help = "Number of CPU threads to use") +) + +opt <- parse_args(OptionParser(option_list=option_list)) + +## Function to convert text "NA"s to NA +to_na <- function(x){ + if(x %in% c("NA", "null", "Null")){ x <- NA } + return(x) +} + +## Replaces "null"s from Nextflow with NA +opt <- lapply(X = opt, FUN = to_na) + + +## Validation of the required arguments +if (is.na(opt$seqtabs)) { + stop("Input directory with quality-filtered sequences is not specified\n") +} + +## Assign variables +SEQTABS <- opt$seqtabs +MAXCHIM <- opt$maxchim +RECOV_DENOVO <- opt$recoverdenovo +OUTPUT <- opt$output +CPUTHREADS <- as.numeric( opt$threads ) + +## Log assigned variables +cat(paste("Path to sequence tables: ", SEQTABS, "\n", sep="")) +cat(paste("Max de novo chimera score: ", MAXCHIM, "\n", sep="")) +cat(paste("De novo chimera recovery: ", RECOV_DENOVO, "\n", sep="")) +cat(paste("Output prefix: ", OUTPUT, "\n", sep="")) +cat(paste("CPU threads: ", CPUTHREADS, "\n", sep="")) + +cat("\n") + +## Set CPU thread number +cat("Setting number of CPU threads to: ", CPUTHREADS, "\n") +setDTthreads(threads = CPUTHREADS) # for data.table 
+set_cpu_count(CPUTHREADS) # for arrow + + +###################################### +###################################### Process the data +###################################### + +## Load sequence tables (files under SEQTABS whose names end in ".parquet", searched recursively; the pattern is a regex, hence the escaped dot and the `$` anchor) +cat("\n..Looking for sequence tables\n") +TABS <- list.files(path = SEQTABS, pattern = "\\.parquet$", full.names = TRUE, recursive = TRUE) +cat("... Tables found: ", length(TABS), "\n") + +cat("\n..Loading sequence tables\n") +TAB <- alply(.data = TABS, .margins = 1, .fun = function(x){ + res <- arrow::read_parquet(x) + setDT(res) + return(res) +}) +TAB <- rbindlist(TAB, use.names = TRUE, fill = TRUE) +cat("... Total number of records: ", nrow(TAB), "\n") +cat("... Total number unique sequences: ", length(unique(TAB$Sequence)), "\n") +cat("... Total number unique samples (fastq files): ", length(unique(TAB$SampleID)), "\n") + + +## Filter sequences by chimeric score (MAXCHIM); skipped entirely when MAXCHIM is NA +if(!is.na(MAXCHIM)){ + + cat("\n..Filtering data by max de novo chimera score\n") + nrecs <- nrow(TAB) + nabun <- sum(TAB$Abundance, na.rm = TRUE) + + cat("... Max de novo chimera score observed: ", max(TAB$DeNovo_Chimera_Score, na.rm = TRUE), "\n") + + ## If no chimera recovery is required (records without a score are kept) + if(RECOV_DENOVO == FALSE){ + + TAB <- TAB[ DeNovo_Chimera_Score < MAXCHIM | is.na(DeNovo_Chimera_Score) ] + + ## If we need to recover chimeras + } else { + + ## Find putative chimeras (records with score >= MAXCHIM) + CHIMERAS <- TAB[ DeNovo_Chimera_Score >= MAXCHIM, .(SeqID___SampleID, DeNovo_Chimera_Score, Sequence, Abundance) ] + NONCHIMERAS <- TAB[ ! SeqID___SampleID %in% CHIMERAS$SeqID___SampleID ] + + ## Recover false-positives (sequences that also occur in records not flagged as chimeric) + chim_seqs <- unique(CHIMERAS$Sequence) + nonchim_seqs <- unique(NONCHIMERAS$Sequence) + fp_chims <- chim_seqs %in% nonchim_seqs + if(any(fp_chims)){ + cat(".... Probably there are a few false-positive chimeras\n") + cat(".... Recovering ", sum(fp_chims), "sequences\n") + fp_seqs <- chim_seqs[ fp_chims ] + CHIMERAS <- CHIMERAS[ ! Sequence %in% fp_seqs ] + rm(fp_seqs) + } + + TAB <- TAB[ ! 
Sequence %in% CHIMERAS$Sequence ] + rm(CHIMERAS, NONCHIMERAS) + + } # end of chimera recovery + + ## Data summary after filtering + nrecs_delta <- nrecs - nrow(TAB) + nabun_delta <- nabun - sum(TAB$Abundance, na.rm = TRUE) + + cat("... Records removed: ", nrecs_delta, " (", round(nrecs_delta/nrecs * 100, 1), "%)\n") + cat("... Reads removed: ", nabun_delta, " (", round(nabun_delta/nabun * 100, 1), "%)\n") + + rm(nrecs_delta, nabun_delta) + +} # end of MAXCHIM filtering + + +cat("\n..Sorting table by abundance, quality score\n") +setorder(x = TAB, -Abundance, -PhredScore, SeqID) + +cat("..Preparing FASTA file\n") + +SQF <- DNAStringSet(x = TAB$Sequence) +names(SQF) <- paste0(TAB$SeqID, ";size=", TAB$Abundance) # , ";sample=", TAB$SampleID, ";" + +## Export FASTA (gzip-compressed, written to "<OUTPUT>.fa.gz") +cat("..Exporting FASTA file with filtered sequences\n") + +writeXStringSet( + x = SQF, + filepath = paste0(OUTPUT, ".fa.gz"), + compress = TRUE, format = "fasta", width = 9999) + +## Export the filtered table (Parquet, zstd-compressed, written to "<OUTPUT>.parquet") +cat("..Exporting filtered table\n") + +write_parquet( + x = TAB, + sink = paste0(OUTPUT, ".parquet"), + compression = "zstd", + compression_level = 10, + use_dictionary = TRUE) + +cat("All done.\n") diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/bucketize_db.R b/src/pipecraft-core/service_scripts/NextITS/bin/bucketize_db.R old mode 100644 new mode 100755 index 18d9c97..e5b0874 --- a/src/pipecraft-core/service_scripts/NextITS/bin/bucketize_db.R +++ b/src/pipecraft-core/service_scripts/NextITS/bin/bucketize_db.R @@ -1,248 +1,248 @@ -#!/usr/bin/env Rscript - -## Aim - evenly distribute sequence clusters across a specified number of buckets. -## The goal is to have the total length of sequences in each bucket as equal as possible. 
- -## Number of buckets can be automatically selected -## (e.g., to avoid the DADA2s' error message `long vectors not supported yet`, related with > 2^31 elements) - -## Usage examples: -# bucketize_db.R \ -# --db stat_clusters.txt \ -# --fasta Input.fa.gz \ -# --summary bucket_summary.txt \ -# --numbuckets 10 \ -# --threads 10 - - -## Check time -start_time <- Sys.time() - -cat("\nParsing input options and arguments...\n") - -suppressPackageStartupMessages(require(optparse)) - -## Parse arguments -option_list <- list( - make_option(c("-d", "--db"), action="store", default="DB_clu.tsv", type='character', help="Clustering database"), - make_option(c("-f", "--fasta"), action="store", default="Input.fa.gz", type='character', help="Input sequences in FASTA format"), - make_option(c("-s", "--summary"), action="store", default="bucket_summary.txt", type='character', help="Output file summary information"), - make_option(c("-n", "--numbuckets"), action="store", default=NA, type='integer', help="Number of output buckets (NA, for automatic selection)"), - make_option(c("-t", "--threads"), action="store", default=4, type='integer', help="Number of CPU threads to use") -) -opt <- parse_args(OptionParser(option_list=option_list)) - -# Validation of the required arguments -if(is.na(opt$fasta)){ - stop("Input file with sequences is not specified\n") -} -if(is.na(opt$db)){ - stop("Clustering results are not specified\n") -} -if(!is.na(opt$numbuckets) & opt$numbuckets <= 1){ - stop("Number of buckets should be > 1\n") -} - - -## Assign variables -DATABASE <- opt$db -FASTA <- opt$fasta -SUMMARY <- opt$summary -NBUCKETS <- opt$numbuckets -THREADS <- opt$threads - -## Log assigned variables -cat("\nParameters specified:\n") -cat(paste("Clustering database: " , DATABASE, "\n", sep = "")) -cat(paste("Input sequences (FASTA): " , FASTA, "\n", sep = "")) -cat(paste("Output with bucket summary: ", SUMMARY, "\n", sep = "")) -if(is.na(NBUCKETS)){ - cat(paste("Number of buckets: ", "auto", 
"\n", sep = "")) -} else { - cat(paste("Number of buckets: ", NBUCKETS, "\n", sep = "")) -} - -cat(paste("CPU threads: ", THREADS, "\n", sep = "")) -cat("\n") - - -############################################## Load packages - -cat("Loading R packages...\n") - -load_pckg <- function(pkg = "data.table"){ - suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) - cat(paste(pkg, packageVersion(pkg), "\n")) -} - -load_pckg("data.table") -load_pckg("Biostrings") -load_pckg("plyr") - -if(THREADS < 1){ THREADS <- 1 } -if(THREADS > 1){ - cat("Preparing multi-threaded setup\n") - - load_pckg("doFuture") - registerDoFuture() - plan(multicore, workers = THREADS) - options(future.globals.maxSize = 6e10) # 60GB - - setDTthreads(threads = THREADS) # for data.table - - parall <- TRUE - -} else { - parall <- FALSE - setDTthreads(threads = 1) -} - -cat("\n") - - -############################################## Workflow - -## Load seq stats -cat("..Loading input sequences\n") -seqs <- readDNAStringSet(filepath = FASTA) - -## Load clustering file -cat("..Loading clustering file\n") -DB <- fread(file = DATABASE, - sep = "\t", header = FALSE, - col.names = c("Cluster", "Member")) - -## Estimate sequence length -cat("..Estimating total length of the sequences\n") -seqt <- data.table(Member = names(seqs), Len = width(seqs)) -DB <- merge(x = DB, y = seqt, by = "Member", all.x = TRUE) -rm(seqt) - -## Estimate number of sequences per cluster and the total length of sequences -cat("..Estimating cluster sizes\n") -datt <- DB[ , .(num_seqs = .N, sum_len = sum(Len, na.rm = TRUE)), by = "Cluster" ] - -## Sort clusters by the number of sequenes in descending order -cat("..Sorting clusters\n") -setorder(datt, -sum_len, -num_seqs) - - -cat("..Bucketizing\n") - -if(is.na(NBUCKETS)){ - cat("...Number of buckets is not specified, using automatic selection\n") - - ## For DADA2, a matrix with quality values is required `as(Biostrings::quality(fq), "matrix")` - ## It should not 
exceed 2^31 (2147483648) elemens, - ## Meaning that `num_seq * len_seq` must be < 2^31 - - ## Calculate approximate estimate for the maximum number of sequences per bucket - maxseqs <- 2^31 / max(DB$Len) # quantile(x = DB$Len, probs = 0.99) - - ## Number of buckets - NBUCKETS <- ceiling(nrow(DB) / maxseqs) - - cat("...The sugested number of buckets is ", NBUCKETS, "\n") -} - - -## Initializing buckets and bucket sizes -buckets <- vector("list", length = NBUCKETS) -bucket_size_numseqs <- numeric(NBUCKETS) -bucket_size_lenseqs <- numeric(NBUCKETS) - -## Distributing files into buckets -## By starting with the largest files and placing each one in the currently smallest bucket, -## we try to prevent any single bucket from becoming significantly larger than the others -for (i in 1:nrow(datt)) { - - ## Find the bucket with the minimum total sequence length - min_bucket_index <- which.min(bucket_size_lenseqs) - - ## Add the cluster ID to the chosen bucket - buckets[[ min_bucket_index ]] <- c( - buckets[[ min_bucket_index ]], - datt[i, ]$Cluster - ) - - # Updating the total sequence length of the chosen bucket - bucket_size_lenseqs[ min_bucket_index ] <- bucket_size_lenseqs[min_bucket_index] + datt[i, ]$sum_len - bucket_size_numseqs[ min_bucket_index ] <- bucket_size_numseqs[min_bucket_index] + datt[i, ]$num_seqs - -} - -cat("..Bucket summary:\n\n") - -## Prepare bucket summary -smr <- data.table( - BucketID = 1:length(buckets), - Num_clusters = laply(.data = buckets, .fun = function(x){ length(x) }), - sum_len = bucket_size_lenseqs, - num_seqs = bucket_size_numseqs) - -print(smr) - -## Add percentages -smr[ , NumClust_Percent := round(Num_clusters / sum(Num_clusters) * 100, 2) ] -smr[ , TotLen_Percent := round(sum_len / sum(sum_len) * 100, 2) ] -smr[ , TotSeqs_Percent := round(num_seqs / sum(num_seqs) * 100, 2) ] - - -cat("\n\n..Exporting FASTA file for each bucket\n") - -## Exporting function -export_bucket <- function(clustnum = 1){ - - cat("...Bucket ", clustnum, 
"\n") - - ## IDs of cluster representatives - clustids <- buckets[[ clustnum ]] - - ## Find sequence IDs to export - ids <- data.table(SeqID = DB[ Cluster %in% clustids ]$Member) - - ## Sort sequences by size - ids[ , Size := tstrsplit(SeqID, split = ";", keep = 2) ] - ids[ , Size := as.numeric( sub(pattern = "size=", replacement = "", x = Size) ) ] - setorder(ids, -Size, SeqID) - - ## Cluster ID with leading zero - cl <- sprintf(paste0("%0", nchar(NBUCKETS), "d"), clustnum) - - ## Extract and export - writeXStringSet( - x = seqs[ ids$SeqID ], - filepath = paste0("bucket_", cl, ".fa.gz"), - compress = TRUE, - format = "fasta", - width = 9999) - -} - -a_ply( - .data = seq_along(buckets), - .margins = 1, - .fun = export_bucket, - .parallel = parall) - - -## Bucket summary -cat("..Exporting bucket summary\n") -fwrite(x = smr, file = SUMMARY, sep = "\t", col.names = TRUE) - - -cat("\nAll done.\n") - - -##################### Session info - -## Check time -end_time <- Sys.time() - -tmm <- as.numeric(difftime(end_time, start_time, units = "min")) -cat("\nElapsed time: ", tmm, " minutes\n") - -cat("\n") -cat("Session info:\n") -sessionInfo() -cat("\n") +#!/usr/bin/env Rscript + +## Aim - evenly distribute sequence clusters across a specified number of buckets. +## The goal is to have the total length of sequences in each bucket as equal as possible. 
+ +## Number of buckets can be automatically selected +## (e.g., to avoid the DADA2s' error message `long vectors not supported yet`, related with > 2^31 elements) + +## Usage examples: +# bucketize_db.R \ +# --db stat_clusters.txt \ +# --fasta Input.fa.gz \ +# --summary bucket_summary.txt \ +# --numbuckets 10 \ +# --threads 10 + + +## Check time +start_time <- Sys.time() + +cat("\nParsing input options and arguments...\n") + +suppressPackageStartupMessages(require(optparse)) + +## Parse arguments +option_list <- list( + make_option(c("-d", "--db"), action="store", default="DB_clu.tsv", type='character', help="Clustering database"), + make_option(c("-f", "--fasta"), action="store", default="Input.fa.gz", type='character', help="Input sequences in FASTA format"), + make_option(c("-s", "--summary"), action="store", default="bucket_summary.txt", type='character', help="Output file summary information"), + make_option(c("-n", "--numbuckets"), action="store", default=NA, type='integer', help="Number of output buckets (NA, for automatic selection)"), + make_option(c("-t", "--threads"), action="store", default=4, type='integer', help="Number of CPU threads to use") +) +opt <- parse_args(OptionParser(option_list=option_list)) + +# Validation of the required arguments +if(is.na(opt$fasta)){ + stop("Input file with sequences is not specified\n") +} +if(is.na(opt$db)){ + stop("Clustering results are not specified\n") +} +if(!is.na(opt$numbuckets) & opt$numbuckets <= 1){ + stop("Number of buckets should be > 1\n") +} + + +## Assign variables +DATABASE <- opt$db +FASTA <- opt$fasta +SUMMARY <- opt$summary +NBUCKETS <- opt$numbuckets +THREADS <- opt$threads + +## Log assigned variables +cat("\nParameters specified:\n") +cat(paste("Clustering database: " , DATABASE, "\n", sep = "")) +cat(paste("Input sequences (FASTA): " , FASTA, "\n", sep = "")) +cat(paste("Output with bucket summary: ", SUMMARY, "\n", sep = "")) +if(is.na(NBUCKETS)){ + cat(paste("Number of buckets: ", "auto", 
"\n", sep = "")) +} else { + cat(paste("Number of buckets: ", NBUCKETS, "\n", sep = "")) +} + +cat(paste("CPU threads: ", THREADS, "\n", sep = "")) +cat("\n") + + +############################################## Load packages + +cat("Loading R packages...\n") + +load_pckg <- function(pkg = "data.table"){ + suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) + cat(paste(pkg, packageVersion(pkg), "\n")) +} + +load_pckg("data.table") +load_pckg("Biostrings") +load_pckg("plyr") + +if(THREADS < 1){ THREADS <- 1 } +if(THREADS > 1){ + cat("Preparing multi-threaded setup\n") + + load_pckg("doFuture") + registerDoFuture() + plan(multicore, workers = THREADS) + options(future.globals.maxSize = 6e10) # 60GB + + setDTthreads(threads = THREADS) # for data.table + + parall <- TRUE + +} else { + parall <- FALSE + setDTthreads(threads = 1) +} + +cat("\n") + + +############################################## Workflow + +## Load seq stats +cat("..Loading input sequences\n") +seqs <- readDNAStringSet(filepath = FASTA) + +## Load clustering file +cat("..Loading clustering file\n") +DB <- fread(file = DATABASE, + sep = "\t", header = FALSE, + col.names = c("Cluster", "Member")) + +## Estimate sequence length +cat("..Estimating total length of the sequences\n") +seqt <- data.table(Member = names(seqs), Len = width(seqs)) +DB <- merge(x = DB, y = seqt, by = "Member", all.x = TRUE) +rm(seqt) + +## Estimate number of sequences per cluster and the total length of sequences +cat("..Estimating cluster sizes\n") +datt <- DB[ , .(num_seqs = .N, sum_len = sum(Len, na.rm = TRUE)), by = "Cluster" ] + +## Sort clusters by the number of sequenes in descending order +cat("..Sorting clusters\n") +setorder(datt, -sum_len, -num_seqs) + + +cat("..Bucketizing\n") + +if(is.na(NBUCKETS)){ + cat("...Number of buckets is not specified, using automatic selection\n") + + ## For DADA2, a matrix with quality values is required `as(Biostrings::quality(fq), "matrix")` + ## It should not 
exceed 2^31 (2147483648) elemens, + ## Meaning that `num_seq * len_seq` must be < 2^31 + + ## Calculate approximate estimate for the maximum number of sequences per bucket + maxseqs <- 2^31 / max(DB$Len) # quantile(x = DB$Len, probs = 0.99) + + ## Number of buckets + NBUCKETS <- ceiling(nrow(DB) / maxseqs) + + cat("...The sugested number of buckets is ", NBUCKETS, "\n") +} + + +## Initializing buckets and bucket sizes +buckets <- vector("list", length = NBUCKETS) +bucket_size_numseqs <- numeric(NBUCKETS) +bucket_size_lenseqs <- numeric(NBUCKETS) + +## Distributing files into buckets +## By starting with the largest files and placing each one in the currently smallest bucket, +## we try to prevent any single bucket from becoming significantly larger than the others +for (i in 1:nrow(datt)) { + + ## Find the bucket with the minimum total sequence length + min_bucket_index <- which.min(bucket_size_lenseqs) + + ## Add the cluster ID to the chosen bucket + buckets[[ min_bucket_index ]] <- c( + buckets[[ min_bucket_index ]], + datt[i, ]$Cluster + ) + + # Updating the total sequence length of the chosen bucket + bucket_size_lenseqs[ min_bucket_index ] <- bucket_size_lenseqs[min_bucket_index] + datt[i, ]$sum_len + bucket_size_numseqs[ min_bucket_index ] <- bucket_size_numseqs[min_bucket_index] + datt[i, ]$num_seqs + +} + +cat("..Bucket summary:\n\n") + +## Prepare bucket summary +smr <- data.table( + BucketID = 1:length(buckets), + Num_clusters = laply(.data = buckets, .fun = function(x){ length(x) }), + sum_len = bucket_size_lenseqs, + num_seqs = bucket_size_numseqs) + +print(smr) + +## Add percentages +smr[ , NumClust_Percent := round(Num_clusters / sum(Num_clusters) * 100, 2) ] +smr[ , TotLen_Percent := round(sum_len / sum(sum_len) * 100, 2) ] +smr[ , TotSeqs_Percent := round(num_seqs / sum(num_seqs) * 100, 2) ] + + +cat("\n\n..Exporting FASTA file for each bucket\n") + +## Exporting function +export_bucket <- function(clustnum = 1){ + + cat("...Bucket ", clustnum, 
"\n") + + ## IDs of cluster representatives + clustids <- buckets[[ clustnum ]] + + ## Find sequence IDs to export + ids <- data.table(SeqID = DB[ Cluster %in% clustids ]$Member) + + ## Sort sequences by size + ids[ , Size := tstrsplit(SeqID, split = ";", keep = 2) ] + ids[ , Size := as.numeric( sub(pattern = "size=", replacement = "", x = Size) ) ] + setorder(ids, -Size, SeqID) + + ## Cluster ID with leading zero + cl <- sprintf(paste0("%0", nchar(NBUCKETS), "d"), clustnum) + + ## Extract and export + writeXStringSet( + x = seqs[ ids$SeqID ], + filepath = paste0("bucket_", cl, ".fa.gz"), + compress = TRUE, + format = "fasta", + width = 9999) + +} + +a_ply( + .data = seq_along(buckets), + .margins = 1, + .fun = export_bucket, + .parallel = parall) + + +## Bucket summary +cat("..Exporting bucket summary\n") +fwrite(x = smr, file = SUMMARY, sep = "\t", col.names = TRUE) + + +cat("\nAll done.\n") + + +##################### Session info + +## Check time +end_time <- Sys.time() + +tmm <- as.numeric(difftime(end_time, start_time, units = "min")) +cat("\nElapsed time: ", tmm, " minutes\n") + +cat("\n") +cat("Session info:\n") +sessionInfo() +cat("\n") diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/chimera_rescue.R b/src/pipecraft-core/service_scripts/NextITS/bin/chimera_rescue.R index 691d8f9..b53f3a3 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/chimera_rescue.R +++ b/src/pipecraft-core/service_scripts/NextITS/bin/chimera_rescue.R @@ -1,64 +1,64 @@ -#!/usr/bin/env Rscript - -## Script to rescue sequences that were annotated as chimeric, -## but have high occurrence within sequenceing run (occurrence > 2) - -# Input is given as positional arguments: -# 1. List of all chimeric sequences (`All_chimeras.txt.gz`) -# 2. Min sequence occurrence to be preserved (e.g., 2) -# 3. 
Output file name (`Rescued_Chimeric_sequences.fa.gz`) - -suppressMessages(library(data.table)); setDTthreads(threads = 1) -suppressMessages(library(Biostrings)) - -args <- commandArgs(trailingOnly = TRUE) - -MINOCC <- as.numeric( args[2] ) - -## Load sequences -cat("..Loading chimeric sequences\n") -CH <- try( - fread(file = args[1], - sep = "\t", header = F, - col.names = c("SeqID", "Seq")) - ) - -if("try-error" %in% class(CH)){ - cat("\nCould not read the file with chimeric sequences\n") - cat("Most likely, the file file is empty (no chimeras)\n") - q(save = "no", status = 0, runLast = FALSE) -} - -cat("..Total number of chimeric records: ", nrow(CH), "\n") - -if(nrow(CH) > 0){ - - ## Extract sample name and sequencing run ID - CH[, SampleID := tstrsplit(x = SeqID, split = ";", keep = 2) ] - CH[, SampleID := gsub(pattern = "sample=", replacement = "", x = SampleID) ] - - ## Estimate sequence frequency - cat("..Estimating chimera occurrence\n") - CF <- CH[, .(Occurrence = .N), by = "Seq"] - - ## Exclude sequences with low occurrence (most probably chimeric) - ## Sequences with higher occurrence should be "real" sequences - CF <- CF[ Occurrence > MINOCC ] - - cat("..Total number of unique chimeric sequences: ", length(unique(CH$Seq)), ".\n") - - ## Export sequences - if(nrow(CF) > 0){ - cat("..There are", nrow(CF), "unique sequence to rescue.\n") - NCH <- CH[ Seq %in% CF$Seq ] - SQS <- DNAStringSet(x = NCH$Seq) - names(SQS) <- NCH$SeqID - - cat("..Exporting rescued sequences\n") - writeXStringSet(x = SQS, filepath = args[3], - compress=TRUE, format="fasta", width=9999) - } else { - cat("..No sequences were rescued.\n") - } - +#!/usr/bin/env Rscript + +## Script to rescue sequences that were annotated as chimeric, +## but have high occurrence within sequenceing run (occurrence > 2) + +# Input is given as positional arguments: +# 1. List of all chimeric sequences (`All_chimeras.txt.gz`) +# 2. Min sequence occurrence to be preserved (e.g., 2) +# 3. 
Output file name (`Rescued_Chimeric_sequences.fa.gz`) + +suppressMessages(library(data.table)); setDTthreads(threads = 1) +suppressMessages(library(Biostrings)) + +args <- commandArgs(trailingOnly = TRUE) + +MINOCC <- as.numeric( args[2] ) + +## Load sequences +cat("..Loading chimeric sequences\n") +CH <- try( + fread(file = args[1], + sep = "\t", header = F, + col.names = c("SeqID", "Seq")) + ) + +if("try-error" %in% class(CH)){ + cat("\nCould not read the file with chimeric sequences\n") + cat("Most likely, the file file is empty (no chimeras)\n") + q(save = "no", status = 0, runLast = FALSE) +} + +cat("..Total number of chimeric records: ", nrow(CH), "\n") + +if(nrow(CH) > 0){ + + ## Extract sample name and sequencing run ID + CH[, SampleID := tstrsplit(x = SeqID, split = ";", keep = 2) ] + CH[, SampleID := gsub(pattern = "sample=", replacement = "", x = SampleID) ] + + ## Estimate sequence frequency + cat("..Estimating chimera occurrence\n") + CF <- CH[, .(Occurrence = .N), by = "Seq"] + + ## Exclude sequences with low occurrence (most probably chimeric) + ## Sequences with higher occurrence should be "real" sequences + CF <- CF[ Occurrence > MINOCC ] + + cat("..Total number of unique chimeric sequences: ", length(unique(CH$Seq)), ".\n") + + ## Export sequences + if(nrow(CF) > 0){ + cat("..There are", nrow(CF), "unique sequence to rescue.\n") + NCH <- CH[ Seq %in% CF$Seq ] + SQS <- DNAStringSet(x = NCH$Seq) + names(SQS) <- NCH$SeqID + + cat("..Exporting rescued sequences\n") + writeXStringSet(x = SQS, filepath = args[3], + compress=TRUE, format="fasta", width=9999) + } else { + cat("..No sequences were rescued.\n") + } + } \ No newline at end of file diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/convert_IUPAC.sh b/src/pipecraft-core/service_scripts/NextITS/bin/convert_IUPAC.sh index 791228f..4a250b7 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/convert_IUPAC.sh +++ b/src/pipecraft-core/service_scripts/NextITS/bin/convert_IUPAC.sh @@ 
-1,42 +1,42 @@ -#!/bin/bash - -## Function to convert IUPAC codes in primers -# Based on PipeCraft2 scripts -# https://github.com/SuvalineVana/pipecraft/blob/main/src/pipecraft-core/service_scripts/submodules/framework.functions.sh -# Git commit 5650545 (Jun 9, 2022) -# Author - Sten Anslan - -echo "$1" | \ -if grep -q -E "R|Y|S|W|K|M|B|D|H|V|N|I" ; then - - ## Define IUPAC codes - R=$"[AG]" - Y=$"[CT]" - S=$"[GC]" - W=$"[AT]" - K=$"[GT]" - M=$"[AC]" - B=$"[CGT]" - D=$"[AGT]" - H=$"[ACT]" - V=$"[ACG]" - N=$"[ATGC]" - I=$"[ATGC]" - - ## Replace IUPAC codes - primer=$(echo "$1" | \ - sed -e "s/R/$R/g; s/Y/$Y/g; \ - s/S/$S/g; s/W/$W/g; s/K/$K/g; \ - s/M/$M/g; s/B/$B/g; s/D/$D/g; \ - s/H/$H/g; s/V/$V/g; s/N/$N/g; \ - s/I/$I/g") - - ## Return convered primer - echo "$primer" -else - ## Return original primer when no IUPAC codes were detected - echo "$1" -fi - -## Example: -# ./convert_IUPAC.sh "CGACCWGCGGARGGATCATTA" # CGACC[AT]GCGGA[AG]GGATCATTA +#!/bin/bash + +## Function to convert IUPAC codes in primers +# Based on PipeCraft2 scripts +# https://github.com/SuvalineVana/pipecraft/blob/main/src/pipecraft-core/service_scripts/submodules/framework.functions.sh +# Git commit 5650545 (Jun 9, 2022) +# Author - Sten Anslan + +echo "$1" | \ +if grep -q -E "R|Y|S|W|K|M|B|D|H|V|N|I" ; then + + ## Define IUPAC codes + R=$"[AG]" + Y=$"[CT]" + S=$"[GC]" + W=$"[AT]" + K=$"[GT]" + M=$"[AC]" + B=$"[CGT]" + D=$"[AGT]" + H=$"[ACT]" + V=$"[ACG]" + N=$"[ATGC]" + I=$"[ATGC]" + + ## Replace IUPAC codes + primer=$(echo "$1" | \ + sed -e "s/R/$R/g; s/Y/$Y/g; \ + s/S/$S/g; s/W/$W/g; s/K/$K/g; \ + s/M/$M/g; s/B/$B/g; s/D/$D/g; \ + s/H/$H/g; s/V/$V/g; s/N/$N/g; \ + s/I/$I/g") + + ## Return convered primer + echo "$primer" +else + ## Return original primer when no IUPAC codes were detected + echo "$1" +fi + +## Example: +# ./convert_IUPAC.sh "CGACCWGCGGARGGATCATTA" # CGACC[AT]GCGGA[AG]GGATCATTA diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/count_homopolymer_stats.sh 
b/src/pipecraft-core/service_scripts/NextITS/bin/count_homopolymer_stats.sh index 2cc9188..c2c4af7 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/count_homopolymer_stats.sh +++ b/src/pipecraft-core/service_scripts/NextITS/bin/count_homopolymer_stats.sh @@ -1,12 +1,12 @@ -#!/bin/bash - -# $1 = input file -# $2 = text to add to the resulting file - -zcat "$1" \ - | awk \ - -F '\t' -v OFS='\t' \ - -v fnm="$2" \ - '$1 ~ /H/ { print fnm , $9 , $10 }' \ - | sed 's/_uch.uc//' - +#!/bin/bash + +# $1 = input file +# $2 = text to add to the resulting file + +zcat "$1" \ + | awk \ + -F '\t' -v OFS='\t' \ + -v fnm="$2" \ + '$1 ~ /H/ { print fnm , $9 , $10 }' \ + | sed 's/_uch.uc//' + diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/count_number_of_reads.sh b/src/pipecraft-core/service_scripts/NextITS/bin/count_number_of_reads.sh index 61ad844..ae81163 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/count_number_of_reads.sh +++ b/src/pipecraft-core/service_scripts/NextITS/bin/count_number_of_reads.sh @@ -1,12 +1,12 @@ -#!/bin/bash - -## Count number of reads in the dereplicated file -## Size annotations should be in USEARCH-style (e.g., size=100) - -# $1 = input file -# $2 = text to add to the resulting file - -seqkit seq --name "$1" \ - | grep -Po ';size=[0-9]+' \ - | sed 's/;size=//g' \ - | awk -F '\t' -v OFS='\t' -v fnm="$2" '{sum+=$1} END {print fnm , sum}' +#!/bin/bash + +## Count number of reads in the dereplicated file +## Size annotations should be in USEARCH-style (e.g., size=100) + +# $1 = input file +# $2 = text to add to the resulting file + +seqkit seq --name "$1" \ + | grep -Po ';size=[0-9]+' \ + | sed 's/;size=//g' \ + | awk -F '\t' -v OFS='\t' -v fnm="$2" '{sum+=$1} END {print fnm , sum}' diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/dada2_no_quals.R b/src/pipecraft-core/service_scripts/NextITS/bin/dada2_no_quals.R index 3127ff3..930c436 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/dada2_no_quals.R 
+++ b/src/pipecraft-core/service_scripts/NextITS/bin/dada2_no_quals.R @@ -1,421 +1,421 @@ -#!/usr/bin/env Rscript - -## Perform sequence denoising with DADA2 - -### Notes: -## - `USE_QUALS = FALSE` will be deprecated -## https://github.com/benjjneb/dada2/issues/816#issuecomment-521836313 -## therefore, use `noqualErrfun` -## - - -## TO DO: -## - Benchmark params (especially BAND_SIZE, DETECT_SINGLETONS, and OMEGA_A) - - -## Usage example: -# dada2_no_quals.R \ -# --input input.fq.gz \ -# --nbases 1e6 \ -# --bandsize 16 \ -# --detectsingletons TRUE \ -# --omegaA 1e-20 \ -# --omegaC 1e-40 \ -# --omegaP 1e-4 \ -# --maxconsist 10 \ -# --match 4 \ -# --mismatch -5 \ -# --gappenalty -8 \ -# --threads 8 - -## Outputs: -# - DADA2_ErrorRates_noqualErrfun.RData -# - DADA2_denoised.fa.gz -# - DADA2_denoised.uc.gz -# - DADA2_UC.qs -# - DADA2_denoising_summary.txt - - -############################################## Parse input parameters - -## Check time -start_time <- Sys.time() - - -cat("\nParsing input options and arguments...\n") - -suppressPackageStartupMessages(require(optparse)) - -## Parse arguments -option_list <- list( - make_option(c("-i", "--input"), action="store", default=NA, type='character', help=""), - make_option(c("-n", "--nbases"), action="store", default=1e6, type='double', help=""), - make_option(c("-b", "--bandsize"), action="store", default=16, type='double', help=""), - make_option(c("-s", "--detectsingletons"), action="store", default=TRUE, type='logical', help=""), - make_option(c("-A", "--omegaA"), action="store", default=1e-20, type='double', help=""), - make_option(c("-C", "--omegaC"), action="store", default=1e-40, type='double', help=""), - make_option(c("-P", "--omegaP"), action="store", default=1e-4, type='double', help=""), - make_option(c("-x", "--maxconsist"), action="store", default=10, type='integer', help=""), - make_option("--match", action="store", default=4, type='double', help=""), - make_option("--mismatch", action="store", 
default=-5, type='double', help=""), - make_option("--gappenalty", action="store", default=-8, type='double', help=""), - make_option("--hpgap", action="store", default=NULL, type='double', help=""), - make_option(c("-t", "--threads"), action="store", default=4L, type='integer', help="Number of CPU threads for arrow, default 4") -) -opt <- parse_args(OptionParser(option_list=option_list)) - -# Validation of the required argiments -if(is.na(opt$input)){ - cat("Input file is not specified: ....\n", file=stderr()) - stop() -} - - -## Function to convert text "NA"s to NA -# to_na <- function(x){ -# if(x %in% c("NA", "null", "Null")){ x <- NA } -# return(x) -# } - -## Assign variables -INPUT <- opt$input -NBASES <- opt$nbases -BAND_SIZE <- opt$bandsize -DETECT_SINGLETONS <- opt$detectsingletons -OMEGA_A <- opt$omegaA -OMEGA_C <- opt$omegaC -OMEGA_P <- opt$omegaP -MAX_CONSIST <- opt$maxconsist -MATCH <- opt$match -MISMATCH <- opt$mismatch -GAP_PENALTY <- opt$gappenalty -HOMOPOLYMER_GAP_PENALTY <- opt$hpgap # PacBio CCS does not make homopolymer errors at a higher rate than normal indels -> NULL -CPUTHREADS <- opt$threads - - -## Log assigned variables -cat("\nParameters specified:\n") -cat(paste("Input file: " , INPUT, "\n", sep="")) -cat(paste("Number of bases to use for error rate learning: ", NBASES, "\n", sep = "")) -cat(paste("Band size for the Needleman-Wunsch alignment: ", BAND_SIZE, "\n", sep = "")) -cat(paste("Singleton detection: ", DETECT_SINGLETONS, "\n", sep = "")) -cat(paste("OMEGA_A: ", OMEGA_A, "\n", sep = "")) -cat(paste("OMEGA_C: ", OMEGA_C, "\n", sep = "")) -cat(paste("OMEGA_P: ", OMEGA_P, "\n", sep = "")) -cat(paste("Number of iterations of the self-consistency loop: ", MAX_CONSIST, "\n", sep = "")) -cat(paste("Alignment for matches: ", MATCH, "\n", sep = "")) -cat(paste("Alignment for mismatches: ", MISMATCH, "\n", sep = "")) -cat(paste("Gap penalty: ", GAP_PENALTY, "\n", sep = "")) -cat(paste("Homopolymer gap penalty: ", HOMOPOLYMER_GAP_PENALTY, 
"\n", sep = "")) -cat(paste("Number of CPU threads to use: ", CPUTHREADS, "\n", sep="")) - -cat("\n") - - - -############################################## Load packages - -cat("Loading R packages...\n") - -load_pckg <- function(pkg = "data.table"){ - suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) - cat(paste(pkg, packageVersion(pkg), "\n")) -} - -load_pckg("Biostrings") -load_pckg("ShortRead") -load_pckg("data.table") -load_pckg("dada2") - -cat("\n") - -## Set CPU thread number -cat("Setting number of CPU threads to: ", CPUTHREADS, "\n") -setDTthreads(threads = CPUTHREADS) # for data.table - -## Set seed -set.seed(111) - -## Set DADA options -cat("Setting DADA2 options\n") -setDadaOpt( - BAND_SIZE = BAND_SIZE, # dada2 default, 16 - DETECT_SINGLETONS = DETECT_SINGLETONS, # dada2 default, FALSE - OMEGA_A = OMEGA_A, # dada2 default, 1e-40 - OMEGA_C = OMEGA_C, # dada2 default, 1e-40 - OMEGA_P = OMEGA_P, # dada2 default, 1e-4 - MAX_CONSIST = MAX_CONSIST, # dada2 default, 10 - GAP_PENALTY = GAP_PENALTY, # dada2 default, -8 - MATCH = MATCH, # dada2 default, 4 - MISMATCH = MISMATCH, # dada2 default, -5 - HOMOPOLYMER_GAP_PENALTY = HOMOPOLYMER_GAP_PENALTY # PacBio CCS does not make homopolymer errors at a higher rate than normal indels - ) - -## Get DADA options -# getDadaOpt() - -############################################## Workflow - - -## Load FASTQ file -cat("\nLoading input data\n") -fq <- readFastq(dirPath = INPUT, qualityType = "FastqQuality") - -## Extract sequence headers -cat("Processing sequences\n") -sq <- as.data.table(fq@id) -setnames(x = sq, new = "SeqName") -sq[ , c("SeqID", "Abundance") := tstrsplit(x = SeqName, split = ";size=", keep = 1:2) ] -sq[ , Abundance := as.numeric(Abundance) ] -sq[ , Sequence := as.character(sread(fq))] - -## Extract sequence qualities -cat("Processing sequence quality scores\n") -seq_quals <- as(quality(fq), "matrix") -# dada2:::qtables2(fq) - -## Summary stats -num_seqs <- nrow(sq) -num_singl 
<- nrow(sq[ Abundance < 2 ]) -num_reads <- sum(sq$Abundance, na.rm = TRUE) -perc_nonsingleton <- round((num_seqs - num_singl) / num_seqs * 100, 2) - -cat("\n") -cat("Number of unique sequences detected: ", num_seqs, "\n") -cat("Number of singleton sequences: ", num_singl, "\n") -cat("Total abundance of sequences: ", num_reads, "\n") -cat("Percentage of non-singleton sequences: ", round(perc_nonsingleton, 2), "\n") - -## Test the rule of thumb, https://github.com/benjjneb/dada2/issues/1663#issuecomment-1359905397 -if(perc_nonsingleton < 10){ - cat("WARNING: <10% of reads are duplicates of other reads,\n") - cat(" meaning that DADA2 might not be the right algorithmic choice\n") -} - - -## Manually create a derep-class object -## See also https://github.com/benjjneb/dada2/blob/004ce26909268e1318a2f68e0ea26807412c7a2d/R/sequenceIO.R#L240-L242 -# https://github.com/benjjneb/dada2/blob/004ce26909268e1318a2f68e0ea26807412c7a2d/R/sequenceIO.R#L45 - -## Prepare derep-class object -cat("\nPreparing derep-class object\n") -uniques <- sq$Abundance -names(uniques) <- as.character(sread(fq)) # names = full amplicon sequence -rownames(seq_quals) <- names(uniques) - -derep <- list( - uniques = uniques, - quals = seq_quals, - map = NULL, - SeqID = sq$SeqID # add allso sequence IDs - ) - -derep <- as(derep, "derep") - -## Clean up -rm(uniques, seq_quals) - - - -## Estimate error rates for each type of transition while ignoring quality scores -cat("\nEstimating error rates\n") -errors <- try( - learnErrors( - fls = derep, - nbases = NBASES, - errorEstimationFunction = noqualErrfun, - qualityType = "FastqQuality", - verbose = 1, - multithread = CPUTHREADS - ) - ) - -## Retry if multithreading failed -if("try-error" %in% class(errors) & CPUTHREADS > 1){ - cat("..Multi-threaded error rate estimation failed\n") - cat("..Trying to resume with a single CPU thread\n") - - errors <- learnErrors( - fls = derep, - nbases = NBASES, - errorEstimationFunction = noqualErrfun, - qualityType = 
"FastqQuality", - verbose = 1, - multithread = 1 - ) -} -if("try-error" %in% class(errors) & CPUTHREADS == 1){ - stop("..Error rate estimation failed\n") -} - - -## Export results -cat("\nExporting error rates\n") -saveRDS(object = errors, - file = "DADA2_ErrorRates_noqualErrfun.RData", - compress = "xz") - - -## Plot observed and estimated error rates -# plotErrors(errors) - - -## Run sample inference with DADA2 -cat("\nRunning sample inference\n") -dadares <- dada( - derep = derep, - err = errors, - errorEstimationFunction = noqualErrfun, - selfConsist = FALSE, - verbose = 1, - multithread = CPUTHREADS) - -cat("\nExporting DADA2 object\n") -saveRDS(object = dadares, - file = "DADA2_InferedSeqs_noqualErrfun.RData", - compress = "xz") - - -## Prepare resulting data -cat("Preparing resulting table\n") -res <- data.table( - Sequence = dadares$sequence, - Abundance = dadares$denoised) - -## Add sequence IDs -res[ , SeqNumID := .I ] -res <- merge( - x = res, - y = sq[, .(SeqID, Sequence)], - by = "Sequence", all.x = TRUE) - -## Sort by abundance -setorder(res, -Abundance, SeqID, na.last = TRUE) - -## Export denoised sequences -cat("Exporting denoised sequences\n") -ASVS <- DNAStringSet(x = res$Sequence) -names(ASVS) <- paste0(res$SeqID, ";size=", res$Abundance) - -writeXStringSet( - x = ASVS, - filepath = "DADA2_denoised.fa.gz", - compress = TRUE, - format = "fasta", - width = 20000) - - - -## Create UC file -cat("Preparing pseudo-UC file\n") -UC <- data.table( - DerepSeqID = derep$SeqID, - SeqNumID = dadares$map, - Abundance = derep$uniques) - -UC <- merge( - x = UC, - y = res[ , .(SeqNumID, SeqID) ], - by = "SeqNumID", all.x = TRUE) - -setorder(UC, SeqNumID, na.last = TRUE) -setnames(x = UC, old = "SeqID", new = "ASV") - -## Export pre-UC file -cat("Exporting pre-UC file\n") -# saveRDS( -# object = UC, -# file = "DADA2_UC.RData", -# compress = "xz") -qs2::qs_save(x = UC, file = "DADA2_UC.qs", - preset = "custom", algorithm = "zstd", compress_level = 15L, nthreads = 
CPUTHREADS) - -## Summary stats -num_asvs <- nrow(res) -num_asvreads <- sum(res$Abundance, na.rm = T) -num_merged <- nrow(UC[ !is.na(SeqNumID) & DerepSeqID != ASV ]) # excluding representative seqs -num_dsc <- nrow(UC[ is.na(SeqNumID) ]) -num_dscreads <- sum(UC[ is.na(SeqNumID) ]$Abundance) -perc_dsc <- round(num_dsc / num_seqs * 100, 2) -perc_dscreads <- round(num_dscreads / num_reads * 100, 2) - -cat("\nRun summary:\n") -cat("Number of ASVs infered: ", num_asvs, "\n") -cat("Number of reads in ASV table: ", num_asvreads , "\n") -cat("Number of sequences merged into ASVs: ", num_merged, "\n") -cat("Number of discarded sequences (%): ", num_dsc, "(", perc_dsc, "% )\n") -cat("Number of reads of discarded sequences (%): ", num_dscreads, "(", perc_dscreads, "% )\n") - - -## Format pseudo-UC file -# 1 Record type S, H, C or N (see table below) -# 2 Cluster number (0-based) -# 3 Sequence length (S, N and H) or cluster size (C) -# 4 For H records, percent identity with target -# 5 For H records, the strand: + or - for nucleotides, . for proteins -# 6 Not used, parsers should ignore this field. Included for backwards compatibility -# 7 Not used, parsers should ignore this field. Included for backwards compatibility -# 8 Compressed alignment or the symbol '=' (equals sign). The = indicates that the query is 100% identical to the target sequence (field 10) -# 9 Label of query sequence (always present) -# 10 Label of target sequence (H records only) - -## Remove noisy sequences -UC <- UC[ ! 
is.na(SeqNumID), .(DerepSeqID, ASV) ] -UC[ , RecordType := fifelse(DerepSeqID == ASV, "C", "H", na = NA) ] -UC[ , `:=` (ClustNum = NA, SeqLen = NA, Ident = NA, Strand = "+", V6 = NA, V7 = NA, ALN = ".") ] - -setcolorder(x = UC, - neworder = c("RecordType", "ClustNum", "SeqLen", "Ident", "Strand", "V6", "V7", "ALN", "DerepSeqID", "ASV")) - -## Export UC file -cat("\nExporting pseudo-UC file\n") -fwrite(x = UC, - file = "DADA2_denoised.uc.gz", - quote = FALSE, sep = "\t", - col.names = FALSE, row.names = FALSE, - compress = "gzip") - - - -## Write summary -cat("Exporting run statistics\n") -smr <- rbind( - data.table(Param = "Number of unique sequences (prior denoising)", Value = num_seqs), - data.table(Param = "Number of singleton sequences (prior denoising)", Value = num_singl), - data.table(Param = "Total abundance of sequences (prior denoising)", Value = num_reads), - data.table(Param = "Percentage of non-singleton sequences (prior denoising)", Value = perc_nonsingleton), - - data.table(Param = "Number of ASVs infered", Value = num_asvs), - data.table(Param = "Number of reads in ASV table", Value = num_asvreads), - data.table(Param = "Number of sequences merged into ASVs (excluding representative seqs)", Value = num_merged), - data.table(Param = "Number of discarded sequences", Value = num_dsc), - data.table(Param = "Percentage of discarded sequences", Value = perc_dsc), - data.table(Param = "Number of reads of discarded sequences", Value = num_dscreads), - data.table(Param = "Percentage of reads of discarded sequences", Value = perc_dscreads) - ) - -fwrite(x = smr, - file = "DADA2_denoising_summary.txt", - quote = FALSE, sep = "\t") - - - -## Construct sequence table (rows = samples, cols = ASVs) -# makeSequenceTable(dadares, orderBy = "abundance") - - - -cat("\nAll done.\n") - - -##################### Session info - -## Check time -end_time <- Sys.time() - -tmm <- as.numeric(difftime(end_time, start_time, units = "min")) -cat("\nElapsed time: ", tmm, " 
minutes\n") - -cat("\n") -cat("Session info:\n") -sessionInfo() -cat("\n") +#!/usr/bin/env Rscript + +## Perform sequence denoising with DADA2 + +### Notes: +## - `USE_QUALS = FALSE` will be deprecated +## https://github.com/benjjneb/dada2/issues/816#issuecomment-521836313 +## therefore, use `noqualErrfun` +## - + +## TO DO: +## - Benchmark params (especially BAND_SIZE, DETECT_SINGLETONS, and OMEGA_A) + + +## Usage example: +# dada2_no_quals.R \ +# --input input.fq.gz \ +# --nbases 1e6 \ +# --bandsize 16 \ +# --detectsingletons TRUE \ +# --omegaA 1e-20 \ +# --omegaC 1e-40 \ +# --omegaP 1e-4 \ +# --maxconsist 10 \ +# --match 4 \ +# --mismatch -5 \ +# --gappenalty -8 \ +# --threads 8 + +## Outputs: +# - DADA2_ErrorRates_noqualErrfun.RData +# - DADA2_denoised.fa.gz +# - DADA2_denoised.uc.gz +# - DADA2_UC.qs +# - DADA2_denoising_summary.txt + + +############################################## Parse input parameters + +## Check time +start_time <- Sys.time() + + +cat("\nParsing input options and arguments...\n") + +suppressPackageStartupMessages(require(optparse)) + +## Parse arguments +option_list <- list( + make_option(c("-i", "--input"), action="store", default=NA, type='character', help=""), + make_option(c("-n", "--nbases"), action="store", default=1e6, type='double', help=""), + make_option(c("-b", "--bandsize"), action="store", default=16, type='double', help=""), + make_option(c("-s", "--detectsingletons"), action="store", default=TRUE, type='logical', help=""), + make_option(c("-A", "--omegaA"), action="store", default=1e-20, type='double', help=""), + make_option(c("-C", "--omegaC"), action="store", default=1e-40, type='double', help=""), + make_option(c("-P", "--omegaP"), action="store", default=1e-4, type='double', help=""), + make_option(c("-x", "--maxconsist"), action="store", default=10, type='integer', help=""), + make_option("--match", action="store", default=4, type='double', help=""), + make_option("--mismatch", action="store", default=-5, 
type='double', help=""), + make_option("--gappenalty", action="store", default=-8, type='double', help=""), + make_option("--hpgap", action="store", default=NULL, type='double', help=""), + make_option(c("-t", "--threads"), action="store", default=4L, type='integer', help="Number of CPU threads for arrow, default 4") +) +opt <- parse_args(OptionParser(option_list=option_list)) + +# Validation of the required argiments +if(is.na(opt$input)){ + cat("Input file is not specified: ....\n", file=stderr()) + stop() +} + + +## Function to convert text "NA"s to NA +# to_na <- function(x){ +# if(x %in% c("NA", "null", "Null")){ x <- NA } +# return(x) +# } + +## Assign variables +INPUT <- opt$input +NBASES <- opt$nbases +BAND_SIZE <- opt$bandsize +DETECT_SINGLETONS <- opt$detectsingletons +OMEGA_A <- opt$omegaA +OMEGA_C <- opt$omegaC +OMEGA_P <- opt$omegaP +MAX_CONSIST <- opt$maxconsist +MATCH <- opt$match +MISMATCH <- opt$mismatch +GAP_PENALTY <- opt$gappenalty +HOMOPOLYMER_GAP_PENALTY <- opt$hpgap # PacBio CCS does not make homopolymer errors at a higher rate than normal indels -> NULL +CPUTHREADS <- opt$threads + + +## Log assigned variables +cat("\nParameters specified:\n") +cat(paste("Input file: " , INPUT, "\n", sep="")) +cat(paste("Number of bases to use for error rate learning: ", NBASES, "\n", sep = "")) +cat(paste("Band size for the Needleman-Wunsch alignment: ", BAND_SIZE, "\n", sep = "")) +cat(paste("Singleton detection: ", DETECT_SINGLETONS, "\n", sep = "")) +cat(paste("OMEGA_A: ", OMEGA_A, "\n", sep = "")) +cat(paste("OMEGA_C: ", OMEGA_C, "\n", sep = "")) +cat(paste("OMEGA_P: ", OMEGA_P, "\n", sep = "")) +cat(paste("Number of iterations of the self-consistency loop: ", MAX_CONSIST, "\n", sep = "")) +cat(paste("Alignment for matches: ", MATCH, "\n", sep = "")) +cat(paste("Alignment for mismatches: ", MISMATCH, "\n", sep = "")) +cat(paste("Gap penalty: ", GAP_PENALTY, "\n", sep = "")) +cat(paste("Homopolymer gap penalty: ", HOMOPOLYMER_GAP_PENALTY, "\n", sep = 
"")) +cat(paste("Number of CPU threads to use: ", CPUTHREADS, "\n", sep="")) + +cat("\n") + + + +############################################## Load packages + +cat("Loading R packages...\n") + +load_pckg <- function(pkg = "data.table"){ + suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) + cat(paste(pkg, packageVersion(pkg), "\n")) +} + +load_pckg("Biostrings") +load_pckg("ShortRead") +load_pckg("data.table") +load_pckg("dada2") + +cat("\n") + +## Set CPU thread number +cat("Setting number of CPU threads to: ", CPUTHREADS, "\n") +setDTthreads(threads = CPUTHREADS) # for data.table + +## Set seed +set.seed(111) + +## Set DADA options +cat("Setting DADA2 options\n") +setDadaOpt( + BAND_SIZE = BAND_SIZE, # dada2 default, 16 + DETECT_SINGLETONS = DETECT_SINGLETONS, # dada2 default, FALSE + OMEGA_A = OMEGA_A, # dada2 default, 1e-40 + OMEGA_C = OMEGA_C, # dada2 default, 1e-40 + OMEGA_P = OMEGA_P, # dada2 default, 1e-4 + MAX_CONSIST = MAX_CONSIST, # dada2 default, 10 + GAP_PENALTY = GAP_PENALTY, # dada2 default, -8 + MATCH = MATCH, # dada2 default, 4 + MISMATCH = MISMATCH, # dada2 default, -5 + HOMOPOLYMER_GAP_PENALTY = HOMOPOLYMER_GAP_PENALTY # PacBio CCS does not make homopolymer errors at a higher rate than normal indels + ) + +## Get DADA options +# getDadaOpt() + +############################################## Workflow + + +## Load FASTQ file +cat("\nLoading input data\n") +fq <- readFastq(dirPath = INPUT, qualityType = "FastqQuality") + +## Extract sequence headers +cat("Processing sequences\n") +sq <- as.data.table(fq@id) +setnames(x = sq, new = "SeqName") +sq[ , c("SeqID", "Abundance") := tstrsplit(x = SeqName, split = ";size=", keep = 1:2) ] +sq[ , Abundance := as.numeric(Abundance) ] +sq[ , Sequence := as.character(sread(fq))] + +## Extract sequence qualities +cat("Processing sequence quality scores\n") +seq_quals <- as(quality(fq), "matrix") +# dada2:::qtables2(fq) + +## Summary stats +num_seqs <- nrow(sq) +num_singl <- nrow(sq[ 
Abundance < 2 ]) +num_reads <- sum(sq$Abundance, na.rm = TRUE) +perc_nonsingleton <- round((num_seqs - num_singl) / num_seqs * 100, 2) + +cat("\n") +cat("Number of unique sequences detected: ", num_seqs, "\n") +cat("Number of singleton sequences: ", num_singl, "\n") +cat("Total abundance of sequences: ", num_reads, "\n") +cat("Percentage of non-singleton sequences: ", round(perc_nonsingleton, 2), "\n") + +## Test the rule of thumb, https://github.com/benjjneb/dada2/issues/1663#issuecomment-1359905397 +if(perc_nonsingleton < 10){ + cat("WARNING: <10% of reads are duplicates of other reads,\n") + cat(" meaning that DADA2 might not be the right algorithmic choice\n") +} + + +## Manually create a derep-class object +## See also https://github.com/benjjneb/dada2/blob/004ce26909268e1318a2f68e0ea26807412c7a2d/R/sequenceIO.R#L240-L242 +# https://github.com/benjjneb/dada2/blob/004ce26909268e1318a2f68e0ea26807412c7a2d/R/sequenceIO.R#L45 + +## Prepare derep-class object +cat("\nPreparing derep-class object\n") +uniques <- sq$Abundance +names(uniques) <- as.character(sread(fq)) # names = full amplicon sequence +rownames(seq_quals) <- names(uniques) + +derep <- list( + uniques = uniques, + quals = seq_quals, + map = NULL, + SeqID = sq$SeqID # add allso sequence IDs + ) + +derep <- as(derep, "derep") + +## Clean up +rm(uniques, seq_quals) + + + +## Estimate error rates for each type of transition while ignoring quality scores +cat("\nEstimating error rates\n") +errors <- try( + learnErrors( + fls = derep, + nbases = NBASES, + errorEstimationFunction = noqualErrfun, + qualityType = "FastqQuality", + verbose = 1, + multithread = CPUTHREADS + ) + ) + +## Retry if multithreading failed +if("try-error" %in% class(errors) & CPUTHREADS > 1){ + cat("..Multi-threaded error rate estimation failed\n") + cat("..Trying to resume with a single CPU thread\n") + + errors <- learnErrors( + fls = derep, + nbases = NBASES, + errorEstimationFunction = noqualErrfun, + qualityType = "FastqQuality", + 
verbose = 1, + multithread = 1 + ) +} +if("try-error" %in% class(errors) & CPUTHREADS == 1){ + stop("..Error rate estimation failed\n") +} + + +## Export results +cat("\nExporting error rates\n") +saveRDS(object = errors, + file = "DADA2_ErrorRates_noqualErrfun.RData", + compress = "xz") + + +## Plot observed and estimated error rates +# plotErrors(errors) + + +## Run sample inference with DADA2 +cat("\nRunning sample inference\n") +dadares <- dada( + derep = derep, + err = errors, + errorEstimationFunction = noqualErrfun, + selfConsist = FALSE, + verbose = 1, + multithread = CPUTHREADS) + +cat("\nExporting DADA2 object\n") +saveRDS(object = dadares, + file = "DADA2_InferedSeqs_noqualErrfun.RData", + compress = "xz") + + +## Prepare resulting data +cat("Preparing resulting table\n") +res <- data.table( + Sequence = dadares$sequence, + Abundance = dadares$denoised) + +## Add sequence IDs +res[ , SeqNumID := .I ] +res <- merge( + x = res, + y = sq[, .(SeqID, Sequence)], + by = "Sequence", all.x = TRUE) + +## Sort by abundance +setorder(res, -Abundance, SeqID, na.last = TRUE) + +## Export denoised sequences +cat("Exporting denoised sequences\n") +ASVS <- DNAStringSet(x = res$Sequence) +names(ASVS) <- paste0(res$SeqID, ";size=", res$Abundance) + +writeXStringSet( + x = ASVS, + filepath = "DADA2_denoised.fa.gz", + compress = TRUE, + format = "fasta", + width = 20000) + + + +## Create UC file +cat("Preparing pseudo-UC file\n") +UC <- data.table( + DerepSeqID = derep$SeqID, + SeqNumID = dadares$map, + Abundance = derep$uniques) + +UC <- merge( + x = UC, + y = res[ , .(SeqNumID, SeqID) ], + by = "SeqNumID", all.x = TRUE) + +setorder(UC, SeqNumID, na.last = TRUE) +setnames(x = UC, old = "SeqID", new = "ASV") + +## Export pre-UC file +cat("Exporting pre-UC file\n") +# saveRDS( +# object = UC, +# file = "DADA2_UC.RData", +# compress = "xz") +qs::qsave(x = UC, file = "DADA2_UC.qs", + preset = "custom", algorithm = "zstd", compress_level = 15L, nthreads = CPUTHREADS) + +## 
Summary stats +num_asvs <- nrow(res) +num_asvreads <- sum(res$Abundance, na.rm = T) +num_merged <- nrow(UC[ !is.na(SeqNumID) & DerepSeqID != ASV ]) # excluding representative seqs +num_dsc <- nrow(UC[ is.na(SeqNumID) ]) +num_dscreads <- sum(UC[ is.na(SeqNumID) ]$Abundance) +perc_dsc <- round(num_dsc / num_seqs * 100, 2) +perc_dscreads <- round(num_dscreads / num_reads * 100, 2) + +cat("\nRun summary:\n") +cat("Number of ASVs infered: ", num_asvs, "\n") +cat("Number of reads in ASV table: ", num_asvreads , "\n") +cat("Number of sequences merged into ASVs: ", num_merged, "\n") +cat("Number of discarded sequences (%): ", num_dsc, "(", perc_dsc, "% )\n") +cat("Number of reads of discarded sequences (%): ", num_dscreads, "(", perc_dscreads, "% )\n") + + +## Format pseudo-UC file +# 1 Record type S, H, C or N (see table below) +# 2 Cluster number (0-based) +# 3 Sequence length (S, N and H) or cluster size (C) +# 4 For H records, percent identity with target +# 5 For H records, the strand: + or - for nucleotides, . for proteins +# 6 Not used, parsers should ignore this field. Included for backwards compatibility +# 7 Not used, parsers should ignore this field. Included for backwards compatibility +# 8 Compressed alignment or the symbol '=' (equals sign). The = indicates that the query is 100% identical to the target sequence (field 10) +# 9 Label of query sequence (always present) +# 10 Label of target sequence (H records only) + +## Remove noisy sequences +UC <- UC[ ! 
is.na(SeqNumID), .(DerepSeqID, ASV) ] +UC[ , RecordType := fifelse(DerepSeqID == ASV, "C", "H", na = NA) ] +UC[ , `:=` (ClustNum = NA, SeqLen = NA, Ident = NA, Strand = "+", V6 = NA, V7 = NA, ALN = ".") ] + +setcolorder(x = UC, + neworder = c("RecordType", "ClustNum", "SeqLen", "Ident", "Strand", "V6", "V7", "ALN", "DerepSeqID", "ASV")) + +## Export UC file +cat("\nExporting pseudo-UC file\n") +fwrite(x = UC, + file = "DADA2_denoised.uc.gz", + quote = FALSE, sep = "\t", + col.names = FALSE, row.names = FALSE, + compress = "gzip") + + + +## Write summary +cat("Exporting run statistics\n") +smr <- rbind( + data.table(Param = "Number of unique sequences (prior denoising)", Value = num_seqs), + data.table(Param = "Number of singleton sequences (prior denoising)", Value = num_singl), + data.table(Param = "Total abundance of sequences (prior denoising)", Value = num_reads), + data.table(Param = "Percentage of non-singleton sequences (prior denoising)", Value = perc_nonsingleton), + + data.table(Param = "Number of ASVs infered", Value = num_asvs), + data.table(Param = "Number of reads in ASV table", Value = num_asvreads), + data.table(Param = "Number of sequences merged into ASVs (excluding representative seqs)", Value = num_merged), + data.table(Param = "Number of discarded sequences", Value = num_dsc), + data.table(Param = "Percentage of discarded sequences", Value = perc_dsc), + data.table(Param = "Number of reads of discarded sequences", Value = num_dscreads), + data.table(Param = "Percentage of reads of discarded sequences", Value = perc_dscreads) + ) + +fwrite(x = smr, + file = "DADA2_denoising_summary.txt", + quote = FALSE, sep = "\t") + + + +## Construct sequence table (rows = samples, cols = ASVs) +# makeSequenceTable(dadares, orderBy = "abundance") + + + +cat("\nAll done.\n") + + +##################### Session info + +## Check time +end_time <- Sys.time() + +tmm <- as.numeric(difftime(end_time, start_time, units = "min")) +cat("\nElapsed time: ", tmm, " 
minutes\n") + +cat("\n") +cat("Session info:\n") +sessionInfo() +cat("\n") diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/disambiguate_primers.R b/src/pipecraft-core/service_scripts/NextITS/bin/disambiguate_primers.R index 076232d..0ba6110 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/disambiguate_primers.R +++ b/src/pipecraft-core/service_scripts/NextITS/bin/disambiguate_primers.R @@ -1,37 +1,37 @@ -#!/usr/bin/env Rscript - -## The script to disambiguate sequences -## (expand ambiguous nucleotides into all combinations) -## Based on IUPAC codes - -# Input is given as positional arguments: -# 1. A text string (e.g., "ACTGNK") -# 2. output file name (e.g., "Primer_F.fasta") - -# Output: -# - FASTA with disambiguated sequences - -args <- commandArgs(trailingOnly = TRUE) - -cat("..Loading packages\n") -suppressMessages(library(DECIPHER)) -suppressMessages(library(Biostrings)) - -## Convert input string into DNAStringSet object -cat("..Preparing DNAStringSet\n") -dna <- DNAStringSet(args[1]) - -## Disambiguate -cat("..Disambiguating\n") -res <- Disambiguate(dna)[[1]] - -## Assign names -names(res) <- paste0("seq", 1:length(res), sep = "") - -## Export FASTA -cat("..Exporting FASTA\n") -writeXStringSet(x = res, - filepath = args[2], - compress=FALSE, format="fasta", width=9999) - -cat("..done\n") +#!/usr/bin/env Rscript + +## The script to disambiguate sequences +## (expand ambiguous nucleotides into all combinations) +## Based on IUPAC codes + +# Input is given as positional arguments: +# 1. A text string (e.g., "ACTGNK") +# 2. 
output file name (e.g., "Primer_F.fasta") + +# Output: +# - FASTA with disambiguated sequences + +args <- commandArgs(trailingOnly = TRUE) + +cat("..Loading packages\n") +suppressMessages(library(DECIPHER)) +suppressMessages(library(Biostrings)) + +## Convert input string into DNAStringSet object +cat("..Preparing DNAStringSet\n") +dna <- DNAStringSet(args[1]) + +## Disambiguate +cat("..Disambiguating\n") +res <- Disambiguate(dna)[[1]] + +## Assign names +names(res) <- paste0("seq", 1:length(res), sep = "") + +## Export FASTA +cat("..Exporting FASTA\n") +writeXStringSet(x = res, + filepath = args[2], + compress=FALSE, format="fasta", width=9999) + +cat("..done\n") diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/document_s1.R b/src/pipecraft-core/service_scripts/NextITS/bin/document_s1.R old mode 100644 new mode 100755 index 7e2d30a..755bb88 --- a/src/pipecraft-core/service_scripts/NextITS/bin/document_s1.R +++ b/src/pipecraft-core/service_scripts/NextITS/bin/document_s1.R @@ -1,337 +1,337 @@ -#!/usr/bin/env Rscript - -## Script to document the Step-1 workflow of the NextITS pipeline. 
- -## Usage: -## Rscript document_s1.R [output_path] - -## Input: -## - software_versions.yml -## - pipeline_params.tsv - -## Output: -## - README_Step1_Methods.txt -## with two sections: methods and references - - -## Function to load packages -load_pckg <- function(pkg = "data.table"){ - suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) - cat(paste(pkg, packageVersion(pkg), "\n")) -} - -load_pckg("glue") -load_pckg("data.table") -load_pckg("yaml") - -## Parse arguments -args <- commandArgs(trailingOnly = TRUE) -if (length(args) < 2) { - cat("Usage: document_s1.R [output_path]\n") - stop() -} - -versions_path <- args[[1]] -params_path <- args[[2]] -output_path <- ifelse(length(args) >= 3, args[[3]], "README_Step1_Methods.txt") - - -## Validation -if(is.null(versions_path) || versions_path == ""){ - stop("Versions YAML not specified") -} -if(is.null(params_path) || params_path == ""){ - stop("Params table not specified") -} - -if(!file.exists(versions_path)){ - stop(glue("Versions YAML not found: {versions_path}")) -} -if(!file.exists(params_path)){ - stop(glue("Params table not found: {params_path}")) -} - - -################################## -################################## Data for debugging -################################## - -# versions_path <- "software_versions.yml" -# params_path <- "pipeline_params.tsv" -# output_path <- "README_Step1_Methods.txt" - - -################################## -################################## References -################################## - -## Citation registry -citation_db <- list( - nextits = "Mikryukov V, Anslan S, Tedersoo L (2025) NextITS - A pipeline for metabarcoding fungi and other eukaryotes with full-length ITS sequenced with PacBio. DOI:10.5281/zenodo.15074882", - nextflow = "Di Tommaso P, et al. (2017) Nextflow enables reproducible computational workflows. 
Nat Biotechnol 35, 316-319, DOI:10.1038/nbt.3820", - lima = "Pacific Biosciences (2025) LIMA - The PacBio barcode demultiplexer and primer remover. URL: https://lima.how/", - seqkit = "Shen W, Sipos B, Zhao L (2024) SeqKit2: A Swiss Army Knife for Sequence and Alignment Processing. iMeta e191. DOI:10.1002/imt2.191", - csvtk = "Shen W (2025) csvtk - a cross-platform, efficient and practical CSV/TSV toolkit. URL: https://github.com/shenwei356/csvtk", -# brename = "Shen W (2025) brename - batch renaming safely, URL: https://github.com/shenwei356/brename", - cutadapt = "Martin M (2011) Cutadapt removes adapter sequences. EMBnet.journal 17(1):10-12, DOI:10.14806/ej.17.1.200", - itsx = "Bengtsson-Palme J, et al (2013) Improved software detection and extraction of ITS1 and ITS2 from ribosomal ITS sequences of fungi and other eukaryotes for analysis of environmental sequencing data. Methods Ecol Evol 4:914-919, DOI:10.1111/2041-210X.12073", - vsearch = "Rognes T, Flouri T, Nichols B, Quince C, Mahé F (2016) VSEARCH: a versatile open source tool for metagenomics. PeerJ 4:e2584. DOI:10.7717/peerj.2584", - uchime2 = "Edgar RC (2016) UCHIME2: improved chimera prediction for amplicon sequencing. bioRxiv 074252. DOI:10.1101/074252", - uncross2 = "Edgar RC (2018) UNCROSS2: identification of cross-talk in 16S rRNA OTU tables. bioRxiv 400762. DOI:10.1101/400762", - chimscore = "Nilsson RH, et al. (2015) A Comprehensive, Automatically Updated Fungal ITS Sequence Dataset for Reference-Based Chimera Control in Environmental Sequencing Efforts. Microbes Environ. 30(2), 145-50. DOI:10.1264/jsme2.ME14121", - bedtools = "Quinlan AR, Hall IM (2010) BEDTools: a flexible suite of utilities for comparing genomic features. Bioinformatics 26:841-842. DOI:10.1093/bioinformatics/btq033", - duckdb = "Raasveldt M, Mühleisen H (2019) DuckDB: an Embeddable Analytical Database. SIGMOD '19: Proceedings of the 2019 International Conference on Management of Data, 1981-1984. 
DOI:10.1145/3299869.332021", - parallel = "Tange O (2011) GNU Parallel: The command-line power tool. Usenix Mag 36 (1), 42", - eukaryome = "Tedersoo L, et al. (2024). EUKARYOME: the rRNA gene reference database for identification of all eukaryotes. Database (Oxford) 12:baae043. DOI:10.1093/database/baae043", - R = "R Core Team (2025) R: A Language and Environment for Statistical Computing. R Foundation for Statistical Computing, Vienna, Austria. URL: https://www.R-project.org/", - arrow = "Richardson N, Cook I, Crane N, Dunnington D, François R, Keane J, Moldovan-Grünfeld D, Ooms J, Wujciak-Jens J, and Apache Arrow (2025) arrow: Integration to Apache Arrow. URL: https://github.com/apache/arrow/", - ggplot2 = "Wickham H (2016) ggplot2: Elegant Graphics for Data Analysis. Springer. DOI:10.1007/978-3-319-24277-4", - biostrings= "Pagès H, Aboyoun P, Gentleman R, DebRoy S (2025) Biostrings: Efficient manipulation of biological strings. DOI:10.18129/B9.bioc.Biostrings", - datatable = "Barrett T, Dowle M, Srinivasan A, Gorecki J, Chirico M, Hocking T, Schwendinger B, Krylov I (2025) data.table: Extension of data.frame. 
URL: " -) - - -################################## -################################## Helpers -################################## - -## Get version number -getv <- function(v, process, tool){ - # v = list (from YAML file) - # process = process name - # tool = tool name - - if(is.null(v[[process]]) || is.null(v[[process]][[tool]])){ return("") } - as.character( v[[process]][[tool]] ) -} -# E.g., getv(versions, "demux", "lima") - - -## Get parameter -getp <- function(p, pname, default = NA){ - # p = table with parameters (two columns: name and value) - # pname = parameter name - # default = default value if parameter is not found - - pp <- p[ name == pname ]$value - if(is.null(pp) || is.na(pp)){ return(default) } - return(pp) -} -# E.g., getp(params, "lima_minscore", 93) - - -## Remove NAs and empty strings (to curate the citations) -trim_na <- function(x){ - x[ !is.na(x) & nzchar(x) ] -} - -################################## -################################## Body builders -################################## - -emit_nextits <- function(v) { - nextits_v <- if(!is.null(v$NextITS$version)){ as.character(v$NextITS$version) } else { "" } - glue("Bioinformatic processing was performed using the \\ - NextITS pipeline v.{nextits_v} (Mikryukov et al., 2025).") -} - -emit_nextflow <- function(v) { - nextflow_v <- if(!is.null(v$Nextflow$version)){ as.character(v$Nextflow$version) } else { "" } - glue("Workflow management was performed using \\ - Nextflow v.{nextflow_v} (Di Tommaso et al., 2017).") -} - -emit_demux_pacbio <- function(p, v) { - ms <- getp(p, "lima_minscore", 93) - mb <- getp(p, "lima_barcodetype", "dual_symmetric") - vs <- getv(v, "demux", "lima") - switch(mb, - "single" = {barcode_type <- "single-end barcodes"}, - "dual_symmetric" = {barcode_type <- "symmetric dual-end barcodes"}, - "dual_asymmetric" = {barcode_type <- "asymmetric dual-end barcodes"}, - "dual" = {barcode_type <- "combination of symmetric and asymmetric dual-end barcodes"}) - - 
glue("Demultiplexed PacBio reads using LIMA v.{vs} (Pacific Biosciences) with min score {ms} and {barcode_type}.") -} - -emit_qc_pacbio <- function(p, v) { - glue("Quality control was performed using \\ - VSEARCH v.{getv(v,'qc_se','vsearch')} (Rognes et al., 2016) and \\ - seqkit v.{getv(v,'qc_se','seqkit')} (Shen et al., 2024). \\ - Reads with the number of ambiguous bases >= {getp(p,'qc_maxn',4)}, \\ - expected error rate >= {getp(p,'qc_maxeerate',0.01)}, \\ - or homopolymer stretches longer than {getp(p,'qc_maxhomopolymerlen',25)} nt were removed.") -} - -# emit_demux_illumina <- function(p, v) { -# c( -# glue("- Illumina PE QC and merging; demultiplexed merged reads with cutadapt v.{getv(v,'primer_check','cutadapt')} using barcode window {getp(p,'barcode_window',30)}, max errors {getp(p,'barcode_errors',1)}, min overlap {getp(p,'barcode_overlap',11)}."), -# glue("- Non-merged reads optionally retained (join padding '{getp(p,'illumina_joinpadgap','NNNNNNNNNN')}').") -# ) -# } - -emit_primer_check <- function(p, v) { - glue("Primers were trimmed using \\ - cutadapt v.{getv(v, 'primer_check', 'cutadapt')} (Martin, 2011) \\ - with <= {getp(p, 'primer_mismatches', 2)} mismatches. \\ - Reads without both primers were discarded.") -} - -emit_itsx <- function(p, v) { - switch(getp(p,'its_region','full'), - "full" = {its_region <- "full-length ITS"}, - "SSU" = {its_region <- "SSU"}, - "ITS1" = {its_region <- "ITS1"}, - # "5_8S" = {its_region <- "5.8S"}, # not-yet-implemented - "ITS2" = {its_region <- "ITS2"}, - "LSU" = {its_region <- "LSU"}, - "ITS1_5.8S_ITS2" = {its_region <- "near-full-length ITS"}) - - - glue("Extraction of rRNA regions ({its_region}) was performed using \\ - ITSx v.{getv(v,'itsx','ITSx')} (Bengtsson-Palme et al., 2013).") -} - -emit_assemble_its <- function(p, v) { - glue("To assemble near-full-length ITS sequences, we ... 
(TODO)") -} - -emit_hp_and_chimeras <- function(p, v, did_hp) { - res <- character() - if(isTRUE(did_hp)){ - res <- c(res, glue( - "Homopolymer correction of sequences was performed using an algorithm implemented in NextITS \\ - with support of VSEARCH v.{getv(v,'homopolymer','vsearch')} and seqkit v.{getv(v,'homopolymer','seqkit')}.") ) - } else { - res <- c(res, "Homopolymer correction of sequences was not performed.") - } - res <- c(res, - glue( - "Two-step chimera detection was done using VSEARCH v.{getv(v,'chimera_denovo','vsearch')}: - - de novo using UCHIME2 algorithm (Edgar, 2016) with max score {getp(p,'max_ChimeraScore',0.6)} (Nilsson et al., 2015), - - then reference-based against the EUKARYOME database (Tedersoo et al., 2024).") - ) - res <- paste0(res, collapse = "\n") - return(res) -} - -emit_tj <- function(p, v) { - glue("Tag-jump detection and removal was performed using \\ - UNCROSS2 algorithm (Edgar, 2018) with the parameter f = {getp(p,'tj_f',0.01)}.") -} - -emit_seqtab <- function(p, v) { - glue("Sequence counts table was generated using \\ - R v.{getv(v,'prep_seqtab','R')} (R Core Team, 2025), \\ - data.table v.{getv(v,'prep_seqtab','data.table')} (Barrett et al., 2025), \\ - and Apache Arrow v.{getv(v,'prep_seqtab','arrow')} (Richardson et al., 2025) \\ - packages.") -} - - - -################################## -################################## Workflow-dependent method descriptions -################################## - -## Function to assembly the workflow description and references -build_docs <- function(versions, params){ - body <- character() - tools_used <- character() - - body <- c(body, emit_nextits(versions)) - tools_used <- c(tools_used, "nextits") - - body <- c(body, emit_nextflow(versions)) - tools_used <- c(tools_used, "nextflow") - - demuxed <- tolower(as.character(getp(params, "demultiplexed", FALSE))) %in% c("true", "t", "1") - platform <- getp(params, "seqplatform", "PacBio") - - if(!demuxed){ - if(platform %in% "PacBio"){ - 
- body <- c(body, emit_demux_pacbio(params, versions)) - tools_used <- c(tools_used, c("lima")) - - body <- c(body, emit_qc_pacbio(params, versions)) - tools_used <- c(tools_used, c("vsearch", "seqkit")) - - } else { - body <- c(body, emit_demux_illumina(params, versions)) - tools_used <- c(tools_used, c("cutadapt")) - } - } else { - if(platform %in% "PacBio"){ - body <- c(body, emit_qc_pacbio(params, versions)) - tools_used <- c(tools_used, c("vsearch", "seqkit")) - } else { - ## TODO - } - } - - ## Primer trimming - body <- c(body, emit_primer_check(params, versions)) - tools_used <- c(tools_used, c("cutadapt")) - - ## ITS extraction - its_region <- getp(params, "its_region", "full") - if(its_region %in% c("full", "ITS1", "ITS2", "SSU", "LSU")){ - body <- c(body, emit_itsx(params, versions)) - tools_used <- c(tools_used, c("itsx", "vsearch", "duckdb", "seqkit", "cutadapt")) - } else if (its_region %in% "ITS1_5.8S_ITS2") { - body <- c(body, emit_itsx(params, versions)) # , emit_assemble_its(params, versions)) - tools_used <- c(tools_used, c("itsx", "vsearch", "duckdb", "seqkit", "cutadapt")) - } - - did_hp <- tolower(as.character(getp(params, "hp", TRUE))) %in% c("true", "t", "1") - body <- c(body, emit_hp_and_chimeras(params, versions, did_hp)) - tools_used <- c(tools_used, c("vsearch", "uchime2", "eukaryome")) - - body <- c(body, emit_tj(params, versions)) - tools_used <- c(tools_used, c("uncross2")) - - body <- c(body, emit_seqtab(params, versions)) - tools_used <- c(tools_used, c("arrow", "datatable", "R")) - - tools_used <- unique(tools_used) - citations <- trim_na( unlist(citation_db[tools_used]) ) - citations <- sort(unique(citations)) - - res <- list( - body = body, - citations = citations) - - return(res) -} - - -################################## -################################## Assemble body and citations -################################## - -## Load inputs -cat("Loading versions YAML...\n") -versions <- yaml::read_yaml(versions_path) - 
-cat("Loading params table...\n") -params <- data.table::fread(params_path, sep = "\t", header = TRUE, na.strings = c("", "NA")) -setnames(params, new = c("name", "value")) - -## Build body and citations -res <- build_docs(versions, params) - -## Write output -con <- file(output_path, open = "wt") - -writeLines("Methods:", con) -writeLines(res$body, con) - -writeLines("", con) - -writeLines("References:", con) -writeLines(paste0("- ", res$citations), con) - -close(con) - -cat("All done.\n") +#!/usr/bin/env Rscript + +## Script to document the Step-1 workflow of the NextITS pipeline. + +## Usage: +## Rscript document_s1.R [output_path] + +## Input: +## - software_versions.yml +## - pipeline_params.tsv + +## Output: +## - README_Step1_Methods.txt +## with two sections: methods and references + + +## Function to load packages +load_pckg <- function(pkg = "data.table"){ + suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) + cat(paste(pkg, packageVersion(pkg), "\n")) +} + +load_pckg("glue") +load_pckg("data.table") +load_pckg("yaml") + +## Parse arguments +args <- commandArgs(trailingOnly = TRUE) +if (length(args) < 2) { + cat("Usage: document_s1.R [output_path]\n") + stop() +} + +versions_path <- args[[1]] +params_path <- args[[2]] +output_path <- ifelse(length(args) >= 3, args[[3]], "README_Step1_Methods.txt") + + +## Validation +if(is.null(versions_path) || versions_path == ""){ + stop("Versions YAML not specified") +} +if(is.null(params_path) || params_path == ""){ + stop("Params table not specified") +} + +if(!file.exists(versions_path)){ + stop(glue("Versions YAML not found: {versions_path}")) +} +if(!file.exists(params_path)){ + stop(glue("Params table not found: {params_path}")) +} + + +################################## +################################## Data for debugging +################################## + +# versions_path <- "software_versions.yml" +# params_path <- "pipeline_params.tsv" +# output_path <- 
"README_Step1_Methods.txt" + + +################################## +################################## References +################################## + +## Citation registry +citation_db <- list( + nextits = "Mikryukov V, Anslan S, Tedersoo L (2025) NextITS - A pipeline for metabarcoding fungi and other eukaryotes with full-length ITS sequenced with PacBio. DOI:10.5281/zenodo.15074882", + nextflow = "Di Tommaso P, et al. (2017) Nextflow enables reproducible computational workflows. Nat Biotechnol 35, 316-319, DOI:10.1038/nbt.3820", + lima = "Pacific Biosciences (2025) LIMA - The PacBio barcode demultiplexer and primer remover. URL: https://lima.how/", + seqkit = "Shen W, Sipos B, Zhao L (2024) SeqKit2: A Swiss Army Knife for Sequence and Alignment Processing. iMeta e191. DOI:10.1002/imt2.191", + csvtk = "Shen W (2025) csvtk - a cross-platform, efficient and practical CSV/TSV toolkit. URL: https://github.com/shenwei356/csvtk", +# brename = "Shen W (2025) brename - batch renaming safely, URL: https://github.com/shenwei356/brename", + cutadapt = "Martin M (2011) Cutadapt removes adapter sequences. EMBnet.journal 17(1):10-12, DOI:10.14806/ej.17.1.200", + itsx = "Bengtsson-Palme J, et al (2013) Improved software detection and extraction of ITS1 and ITS2 from ribosomal ITS sequences of fungi and other eukaryotes for analysis of environmental sequencing data. Methods Ecol Evol 4:914-919, DOI:10.1111/2041-210X.12073", + vsearch = "Rognes T, Flouri T, Nichols B, Quince C, Mahé F (2016) VSEARCH: a versatile open source tool for metagenomics. PeerJ 4:e2584. DOI:10.7717/peerj.2584", + uchime2 = "Edgar RC (2016) UCHIME2: improved chimera prediction for amplicon sequencing. bioRxiv 074252. DOI:10.1101/074252", + uncross2 = "Edgar RC (2018) UNCROSS2: identification of cross-talk in 16S rRNA OTU tables. bioRxiv 400762. DOI:10.1101/400762", + chimscore = "Nilsson RH, et al. 
(2015) A Comprehensive, Automatically Updated Fungal ITS Sequence Dataset for Reference-Based Chimera Control in Environmental Sequencing Efforts. Microbes Environ. 30(2), 145-50. DOI:10.1264/jsme2.ME14121", + bedtools = "Quinlan AR, Hall IM (2010) BEDTools: a flexible suite of utilities for comparing genomic features. Bioinformatics 26:841-842. DOI:10.1093/bioinformatics/btq033", + duckdb = "Raasveldt M, Mühleisen H (2019) DuckDB: an Embeddable Analytical Database. SIGMOD '19: Proceedings of the 2019 International Conference on Management of Data, 1981-1984. DOI:10.1145/3299869.332021", + parallel = "Tange O (2011) GNU Parallel: The command-line power tool. Usenix Mag 36 (1), 42", + eukaryome = "Tedersoo L, et al. (2024). EUKARYOME: the rRNA gene reference database for identification of all eukaryotes. Database (Oxford) 12:baae043. DOI:10.1093/database/baae043", + R = "R Core Team (2025) R: A Language and Environment for Statistical Computing. R Foundation for Statistical Computing, Vienna, Austria. URL: https://www.R-project.org/", + arrow = "Richardson N, Cook I, Crane N, Dunnington D, François R, Keane J, Moldovan-Grünfeld D, Ooms J, Wujciak-Jens J, and Apache Arrow (2025) arrow: Integration to Apache Arrow. URL: https://github.com/apache/arrow/", + ggplot2 = "Wickham H (2016) ggplot2: Elegant Graphics for Data Analysis. Springer. DOI:10.1007/978-3-319-24277-4", + biostrings= "Pagès H, Aboyoun P, Gentleman R, DebRoy S (2025) Biostrings: Efficient manipulation of biological strings. DOI:10.18129/B9.bioc.Biostrings", + datatable = "Barrett T, Dowle M, Srinivasan A, Gorecki J, Chirico M, Hocking T, Schwendinger B, Krylov I (2025) data.table: Extension of data.frame. 
URL: " +) + + +################################## +################################## Helpers +################################## + +## Get version number +getv <- function(v, process, tool){ + # v = list (from YAML file) + # process = process name + # tool = tool name + + if(is.null(v[[process]]) || is.null(v[[process]][[tool]])){ return("") } + as.character( v[[process]][[tool]] ) +} +# E.g., getv(versions, "demux", "lima") + + +## Get parameter +getp <- function(p, pname, default = NA){ + # p = table with parameters (two columns: name and value) + # pname = parameter name + # default = default value if parameter is not found + + pp <- p[ name == pname ]$value + if(is.null(pp) || is.na(pp)){ return(default) } + return(pp) +} +# E.g., getp(params, "lima_minscore", 93) + + +## Remove NAs and empty strings (to curate the citations) +trim_na <- function(x){ + x[ !is.na(x) & nzchar(x) ] +} + +################################## +################################## Body builders +################################## + +emit_nextits <- function(v) { + nextits_v <- if(!is.null(v$NextITS$version)){ as.character(v$NextITS$version) } else { "" } + glue("Bioinformatic processing was performed using the \\ + NextITS pipeline v.{nextits_v} (Mikryukov et al., 2025).") +} + +emit_nextflow <- function(v) { + nextflow_v <- if(!is.null(v$Nextflow$version)){ as.character(v$Nextflow$version) } else { "" } + glue("Workflow management was performed using \\ + Nextflow v.{nextflow_v} (Di Tommaso et al., 2017).") +} + +emit_demux_pacbio <- function(p, v) { + ms <- getp(p, "lima_minscore", 93) + mb <- getp(p, "lima_barcodetype", "dual_symmetric") + vs <- getv(v, "demux", "lima") + switch(mb, + "single" = {barcode_type <- "single-end barcodes"}, + "dual_symmetric" = {barcode_type <- "symmetric dual-end barcodes"}, + "dual_asymmetric" = {barcode_type <- "asymmetric dual-end barcodes"}, + "dual" = {barcode_type <- "combination of symmetric and asymmetric dual-end barcodes"}) + + 
glue("Demultiplexed PacBio reads using LIMA v.{vs} (Pacific Biosciences) with min score {ms} and {barcode_type}.") +} + +emit_qc_pacbio <- function(p, v) { + glue("Quality control was performed using \\ + VSEARCH v.{getv(v,'qc_se','vsearch')} (Rognes et al., 2016) and \\ + seqkit v.{getv(v,'qc_se','seqkit')} (Shen et al., 2024). \\ + Reads with the number of ambiguous bases >= {getp(p,'qc_maxn',4)}, \\ + expected error rate >= {getp(p,'qc_maxeerate',0.01)}, \\ + or homopolymer stretches longer than {getp(p,'qc_maxhomopolymerlen',25)} nt were removed.") +} + +# emit_demux_illumina <- function(p, v) { +# c( +# glue("- Illumina PE QC and merging; demultiplexed merged reads with cutadapt v.{getv(v,'primer_check','cutadapt')} using barcode window {getp(p,'barcode_window',30)}, max errors {getp(p,'barcode_errors',1)}, min overlap {getp(p,'barcode_overlap',11)}."), +# glue("- Non-merged reads optionally retained (join padding '{getp(p,'illumina_joinpadgap','NNNNNNNNNN')}').") +# ) +# } + +emit_primer_check <- function(p, v) { + glue("Primers were trimmed using \\ + cutadapt v.{getv(v, 'primer_check', 'cutadapt')} (Martin, 2011) \\ + with <= {getp(p, 'primer_mismatches', 2)} mismatches. \\ + Reads without both primers were discarded.") +} + +emit_itsx <- function(p, v) { + switch(getp(p,'its_region','full'), + "full" = {its_region <- "full-length ITS"}, + "SSU" = {its_region <- "SSU"}, + "ITS1" = {its_region <- "ITS1"}, + # "5_8S" = {its_region <- "5.8S"}, # not-yet-implemented + "ITS2" = {its_region <- "ITS2"}, + "LSU" = {its_region <- "LSU"}, + "ITS1_5.8S_ITS2" = {its_region <- "near-full-length ITS"}) + + + glue("Extraction of rRNA regions ({its_region}) was performed using \\ + ITSx v.{getv(v,'itsx','ITSx')} (Bengtsson-Palme et al., 2013).") +} + +emit_assemble_its <- function(p, v) { + glue("To assemble near-full-length ITS sequences, we ... 
(TODO)") +} + +emit_hp_and_chimeras <- function(p, v, did_hp) { + res <- character() + if(isTRUE(did_hp)){ + res <- c(res, glue( + "Homopolymer correction of sequences was performed using an algorithm implemented in NextITS \\ + with support of VSEARCH v.{getv(v,'homopolymer','vsearch')} and seqkit v.{getv(v,'homopolymer','seqkit')}.") ) + } else { + res <- c(res, "Homopolymer correction of sequences was not performed.") + } + res <- c(res, + glue( + "Two-step chimera detection was done using VSEARCH v.{getv(v,'chimera_denovo','vsearch')}: + - de novo using UCHIME2 algorithm (Edgar, 2016) with max score {getp(p,'max_ChimeraScore',0.6)} (Nilsson et al., 2015), + - then reference-based against the EUKARYOME database (Tedersoo et al., 2024).") + ) + res <- paste0(res, collapse = "\n") + return(res) +} + +emit_tj <- function(p, v) { + glue("Tag-jump detection and removal was performed using \\ + UNCROSS2 algorithm (Edgar, 2018) with the parameter f = {getp(p,'tj_f',0.01)}.") +} + +emit_seqtab <- function(p, v) { + glue("Sequence counts table was generated using \\ + R v.{getv(v,'prep_seqtab','R')} (R Core Team, 2025), \\ + data.table v.{getv(v,'prep_seqtab','data.table')} (Barrett et al., 2025), \\ + and Apache Arrow v.{getv(v,'prep_seqtab','arrow')} (Richardson et al., 2025) \\ + packages.") +} + + + +################################## +################################## Workflow-dependent method descriptions +################################## + +## Function to assembly the workflow description and references +build_docs <- function(versions, params){ + body <- character() + tools_used <- character() + + body <- c(body, emit_nextits(versions)) + tools_used <- c(tools_used, "nextits") + + body <- c(body, emit_nextflow(versions)) + tools_used <- c(tools_used, "nextflow") + + demuxed <- tolower(as.character(getp(params, "demultiplexed", FALSE))) %in% c("true", "t", "1") + platform <- getp(params, "seqplatform", "PacBio") + + if(!demuxed){ + if(platform %in% "PacBio"){ + 
+ body <- c(body, emit_demux_pacbio(params, versions)) + tools_used <- c(tools_used, c("lima")) + + body <- c(body, emit_qc_pacbio(params, versions)) + tools_used <- c(tools_used, c("vsearch", "seqkit")) + + } else { + body <- c(body, emit_demux_illumina(params, versions)) + tools_used <- c(tools_used, c("cutadapt")) + } + } else { + if(platform %in% "PacBio"){ + body <- c(body, emit_qc_pacbio(params, versions)) + tools_used <- c(tools_used, c("vsearch", "seqkit")) + } else { + ## TODO + } + } + + ## Primer trimming + body <- c(body, emit_primer_check(params, versions)) + tools_used <- c(tools_used, c("cutadapt")) + + ## ITS extraction + its_region <- getp(params, "its_region", "full") + if(its_region %in% c("full", "ITS1", "ITS2", "SSU", "LSU")){ + body <- c(body, emit_itsx(params, versions)) + tools_used <- c(tools_used, c("itsx", "vsearch", "duckdb", "seqkit", "cutadapt")) + } else if (its_region %in% "ITS1_5.8S_ITS2") { + body <- c(body, emit_itsx(params, versions)) # , emit_assemble_its(params, versions)) + tools_used <- c(tools_used, c("itsx", "vsearch", "duckdb", "seqkit", "cutadapt")) + } + + did_hp <- tolower(as.character(getp(params, "hp", TRUE))) %in% c("true", "t", "1") + body <- c(body, emit_hp_and_chimeras(params, versions, did_hp)) + tools_used <- c(tools_used, c("vsearch", "uchime2", "eukaryome")) + + body <- c(body, emit_tj(params, versions)) + tools_used <- c(tools_used, c("uncross2")) + + body <- c(body, emit_seqtab(params, versions)) + tools_used <- c(tools_used, c("arrow", "datatable", "R")) + + tools_used <- unique(tools_used) + citations <- trim_na( unlist(citation_db[tools_used]) ) + citations <- sort(unique(citations)) + + res <- list( + body = body, + citations = citations) + + return(res) +} + + +################################## +################################## Assemble body and citations +################################## + +## Load inputs +cat("Loading versions YAML...\n") +versions <- yaml::read_yaml(versions_path) + 
+cat("Loading params table...\n") +params <- data.table::fread(params_path, sep = "\t", header = TRUE, na.strings = c("", "NA")) +setnames(params, new = c("name", "value")) + +## Build body and citations +res <- build_docs(versions, params) + +## Write output +con <- file(output_path, open = "wt") + +writeLines("Methods:", con) +writeLines(res$body, con) + +writeLines("", con) + +writeLines("References:", con) +writeLines(paste0("- ", res$citations), con) + +close(con) + +cat("All done.\n") diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/document_s2.R b/src/pipecraft-core/service_scripts/NextITS/bin/document_s2.R old mode 100644 new mode 100755 index 0a7f2a3..1469ba3 --- a/src/pipecraft-core/service_scripts/NextITS/bin/document_s2.R +++ b/src/pipecraft-core/service_scripts/NextITS/bin/document_s2.R @@ -1,339 +1,339 @@ -#!/usr/bin/env Rscript - -## Script to document the Step-2 workflow of the NextITS pipeline. - -## Usage: -## Rscript document_s2.R [output_path] - - - -## Function to load packages -load_pckg <- function(pkg = "data.table"){ - suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) - cat(paste(pkg, packageVersion(pkg), "\n")) -} - -load_pckg("glue") -load_pckg("data.table") -load_pckg("yaml") - -## Parse arguments -args <- commandArgs(trailingOnly = TRUE) -if (length(args) < 2) { - cat("Usage: document_s2.R [output_path]\n") - stop() -} - -versions_path <- args[[1]] -params_path <- args[[2]] -output_path <- ifelse(length(args) >= 3, args[[3]], "README_Step2_Methods.txt") - - -## Validation -if(is.null(versions_path) || versions_path == ""){ - stop("Versions YAML not specified") -} -if(is.null(params_path) || params_path == ""){ - stop("Params table not specified") -} - -if(!file.exists(versions_path)){ - stop(glue("Versions YAML not found: {versions_path}")) -} -if(!file.exists(params_path)){ - stop(glue("Params table not found: {params_path}")) -} - - - -################################## 
-################################## References -################################## - -## Citation registry -citation_db <- list( - nextits = "Mikryukov V, Anslan S, Tedersoo L (2025) NextITS - A pipeline for metabarcoding fungi and other eukaryotes with full-length ITS sequenced with PacBio. DOI:10.5281/zenodo.15074882", - nextflow = "Di Tommaso P, et al. (2017) Nextflow enables reproducible computational workflows. Nat Biotechnol 35, 316-319, DOI:10.1038/nbt.3820", - vsearch = "Rognes T, Flouri T, Nichols B, Quince C, Mahé F (2016) VSEARCH: a versatile open source tool for metagenomics. PeerJ 4:e2584. DOI:10.7717/peerj.2584", - dada2 = "Callahan BJ, et al. (2016) DADA2: High-resolution sample inference from Illumina amplicon data. Nat Methods 13:581-583. DOI:10.1038/nmeth.3869", - unoise = "Edgar RC (2016) UNOISE2: improved error-correction for Illumina 16S and ITS amplicon sequencing. bioRxiv 081257. DOI:10.1101/081257", - swarm = "Mahé F, Czech L, Stamatakis A, Quince C, de Vargas C, Dunthorn M, Rognes T. (2021) Swarm v3: towards tera-scale amplicon clustering. Bioinformatics 38(1), 267-269. DOI:10.1093/bioinformatics/btab493", - lulu = "Frøslev TG, et al. (2017) Algorithm for post-clustering curation of DNA amplicon data yields reliable biodiversity estimates. Nat Commun 8:1188. DOI:10.1038/s41467-017-01312-x", - mumu = "Mahé F (2025) MUMU: C++ implementation of LULU, a R package for post-clustering curation of metabarcoding data. URL: https://github.com/frederic-mahe/mumu", - ucs = "Mikryukov V (2025) ucs - USEARCH cluster file parser. URL: https://github.com/vmikk/ucs", - duckdb = "Raasveldt M, Mühleisen H (2019) DuckDB: an Embeddable Analytical Database. SIGMOD '19: Proceedings of the 2019 International Conference on Management of Data, 1981-1984. DOI:10.1145/3299869.332021", - R = "R Core Team (2025) R: A Language and Environment for Statistical Computing. R Foundation for Statistical Computing, Vienna, Austria. 
URL: https://www.R-project.org/", - arrow = "Richardson N, Cook I, Crane N, Dunnington D, François R, Keane J, Moldovan-Grünfeld D, Ooms J, Wujciak-Jens J, and Apache Arrow (2025) arrow: Integration to Apache Arrow. URL: https://github.com/apache/arrow/", - biostrings= "Pagès H, Aboyoun P, Gentleman R, DebRoy S (2025) Biostrings: Efficient manipulation of biological strings. DOI:10.18129/B9.bioc.Biostrings", - datatable = "Barrett T, Dowle M, Srinivasan A, Gorecki J, Chirico M, Hocking T, Schwendinger B, Krylov I (2025) data.table: Extension of data.frame. URL: " -) - - -################################## -################################## Helpers -################################## - -## Get version number -getv <- function(v, process, tool){ - # v = list (from YAML file) - # process = process name - # tool = tool name - - if(is.null(v[[process]]) || is.null(v[[process]][[tool]])){ return("") } - as.character( v[[process]][[tool]] ) -} -# E.g., getv(versions, "dereplication", "vsearch") - - -## Get parameter -getp <- function(p, pname, default = NA){ - # p = table with parameters (two columns: name and value) - # pname = parameter name - # default = default value if parameter is not found - - pp <- p[ name == pname ]$value - if(is.null(pp) || is.na(pp)){ return(default) } - return(pp) -} -# E.g., getp(params, "otu_id", 0.98) - - -## Remove NAs and empty strings (to curate the citations) -trim_na <- function(x){ - x[ !is.na(x) & nzchar(x) ] -} - -################################## -################################## Body builders -################################## - -emit_nextits <- function(v) { - nextits_v <- if(!is.null(v$NextITS$version)){ as.character(v$NextITS$version) } else { "" } - glue("Bioinformatic processing was performed using the \\ - NextITS pipeline v.{nextits_v} (Mikryukov et al., 2025).") -} - -emit_nextflow <- function(v) { - nextflow_v <- if(!is.null(v$Nextflow$version)){ as.character(v$Nextflow$version) } else { "" } - glue("Workflow 
management was performed using \\ - Nextflow v.{nextflow_v} (Di Tommaso et al., 2017).") -} - -emit_aggregation <- function(p, v) { - glue("Sequences from all sequencing runs were aggregated and \\ - de novo chimeric sequences with chimera score >= {getp(p,'max_ChimeraScore',0.6)} were removed.") -} - -emit_dereplication <- function(p, v) { - minlen <- getp(p, "ampliconlen_min", NA) - maxlen <- getp(p, "ampliconlen_max", NA) - - length_filter <- "" - if(!is.na(minlen) && !is.na(maxlen)){ - length_filter <- glue(" Sequences shorter than {minlen} nt or longer than {maxlen} nt were excluded.") - } else if(!is.na(minlen)){ - length_filter <- glue(" Sequences shorter than {minlen} nt were excluded.") - } else if(!is.na(maxlen)){ - length_filter <- glue(" Sequences longer than {maxlen} nt were excluded.") - } - - glue("Global sequence dereplication was performed using \\ - VSEARCH v.{getv(v,'dereplication','vsearch')} (Rognes et al., 2016).\\ - {length_filter}") -} - -emit_preclustering <- function(p, v) { - preclustering_method <- getp(p, "preclustering", "none") - - res <- switch(preclustering_method, - - "none" = "", # No pre-clustering or denoising was performed - - "homopolymer" = glue( - "Global homopolymer correction was performed using an algorithm implemented in NextITS \\ - with support of VSEARCH v.{getv(v,'homopolymer','vsearch')} (Rognes et al., 2016)."), - - "unoise" = glue( - "Sequence denoising was performed using the UNOISE3 algorithm (Edgar, 2016) \\ - implemented in VSEARCH v.{getv(v,'unoise','vsearch')} (Rognes et al., 2016) \\ - with alpha parameter {getp(p,'unoise_alpha',6.0)} and minimum size {getp(p,'unoise_minsize',1)}."), - - "dada2" = glue( - "Sequence denoising was performed using \\ - DADA2 v.{getv(v,'dada2','dada2')} (Callahan et al., 2016)"), - # using {getp(p,'dada2_pooling','global')} pooling strategy." 
- - "swarm_d1" = glue( - "Pre-clustering was performed using \\ - SWARM v.{getv(v,'precluster_swarm','swarm')} (Mahé et al., 2021) \\ - with d=1 and fastidious option enabled.") - ) - - return(res) -} - -emit_clustering <- function(p, v) { - clustering_method <- getp(p, "clustering", "vsearch") - preclustering_method <- getp(p, "preclustering", "none") - - ## Handle special case where SWARM pre-clustering = SWARM clustering with same d - # if(preclustering_method == "swarm_d1" && clustering_method == "swarm" && getp(p, "swarm_d", 1) == 1){ - # return("No additional clustering was performed (SWARM pre-clustering with d=1 was used as final clustering).") - # } - - cls <- switch(clustering_method, - - "vsearch" = glue( - "VSEARCH v.{getv(v,'cluster_vsearch','vsearch')} (Rognes et al., 2016) \\ - with { as.numeric(getp(p,'otu_id',0.98))*100}% similarity threshold."), - - "swarm" = { - fastidious_text <- if(getp(p, "swarm_fastidious", TRUE) && getp(p, "swarm_d", 1) == 1) { - " with fastidious option enabled" - } else { - "" - } - glue("SWARM v.{getv(v,'cluster_swarm','swarm')} (Mahé et al., 2021) \\ - with parameter d={getp(p,'swarm_d',1)}{fastidious_text}.") - }) - - res <- glue("OTU clustering was performed using {cls}") - return(res) -} - -emit_uc_merging <- function(p, v) { - glue("UC files from dereplication, pre-clustering, and clustering steps were merged using \\ - ucs v.{getv(v,'merge_uc','ucs')} (Mikryukov, 2025) and \\ - DuckDB v.{getv(v,'merge_uc','duckdb')} (Raasveldt & Mühleisen, 2019) \\ - to track sequence membership through all processing steps.") -} - -emit_lulu <- function(p, v) { - glue("Post-clustering curation was performed using \\ - LULU algorithm (Frøslev et al., 2017) \\ - as implemented in MUMU v.{getv(v,'lulu','mumu')} (Mahé, 2025) \\ - with {getp(p,'lulu_match',95.0)}% minimum similarity, \\ - {getp(p,'lulu_ratio',1.0)} minimum abundance ratio, \\ - and {getp(p,'lulu_relcooc',0.95)} minimum relative co-occurrence. 
\\ - Pairwise sequence similarities were calculated using \\ - VSEARCH v.{getv(v,'lulu','vsearch')} (Rognes et al., 2016).") -} - - -################################## -################################## Workflow-dependent method descriptions -################################## - -## Function to assembly the workflow description and references -build_docs <- function(versions, params){ - body <- character() - tools_used <- character() - - ## Pipeline version - body <- c(body, emit_nextits(versions)) - tools_used <- c(tools_used, "nextits") - - ## Nextflow version - body <- c(body, emit_nextflow(versions)) - tools_used <- c(tools_used, "nextflow") - - ## Sequence aggregation - body <- c(body, emit_aggregation(params, versions)) - - ## Sequence dereplication and amplicon length filtering - body <- c(body, emit_dereplication(params, versions)) - tools_used <- c(tools_used, "vsearch") - - ## Conditional: pre-clustering/denoising - preclustering_method <- getp(params, "preclustering", "none") - if(preclustering_method != "none" && !is.na(preclustering_method)){ - body <- c(body, emit_preclustering(params, versions)) - - switch(preclustering_method, - "homopolymer" = { - tools_used <- c(tools_used, "vsearch") - }, - "unoise" = { - tools_used <- c(tools_used, c("vsearch", "unoise")) - }, - "dada2" = { - tools_used <- c(tools_used, "dada2") - }, - "swarm_d1" = { - tools_used <- c(tools_used, "swarm") - } - ) - } - - ## Conditional: clustering - clustering_method <- getp(params, "clustering", "vsearch") - if(clustering_method != "none" && !is.na(clustering_method)){ - ## Skip clustering description if it's redundant with pre-clustering - if(!(preclustering_method == "swarm_d1" && clustering_method == "swarm" && getp(params, "swarm_d", 1) == 1)){ - body <- c(body, emit_clustering(params, versions)) - - switch(clustering_method, - "vsearch" = { - tools_used <- c(tools_used, "vsearch") - }, - "swarm" = { - tools_used <- c(tools_used, "swarm") - } - ) - } - } - - ## UC file 
merging - body <- c(body, emit_uc_merging(params, versions)) - tools_used <- c(tools_used, c("ucs", "duckdb")) - - ## Conditional: LULU post-clustering curation - lulu_enabled <- tolower(as.character(getp(params, "lulu", "true"))) %in% c("true", "t", "1") - if(lulu_enabled){ - body <- c(body, emit_lulu(params, versions)) - tools_used <- c(tools_used, c("mumu", "lulu", "vsearch")) - } - - ## Generate citations - tools_used <- unique(tools_used) - citations <- trim_na( unlist(citation_db[tools_used]) ) - citations <- sort(unique(citations)) - - res <- list( - body = body, - citations = citations) - - return(res) -} - - -################################## -################################## Assemble body and citations -################################## - -## Load inputs -cat("Loading versions YAML...\n") -versions <- yaml::read_yaml(versions_path) - -cat("Loading params table...\n") -params <- data.table::fread(params_path, sep = "\t", header = TRUE, na.strings = c("", "NA")) -setnames(params, new = c("name", "value")) - -## Build body and citations -res <- build_docs(versions, params) - -## Write output -con <- file(output_path, open = "wt") - -writeLines("Methods:", con) -writeLines(res$body, con) - -writeLines("", con) - -writeLines("References:", con) -writeLines(paste0("- ", res$citations), con) - -close(con) - -cat("All done.\n") - +#!/usr/bin/env Rscript + +## Script to document the Step-2 workflow of the NextITS pipeline. 
+ +## Usage: +## Rscript document_s2.R [output_path] + + + +## Function to load packages +load_pckg <- function(pkg = "data.table"){ + suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) + cat(paste(pkg, packageVersion(pkg), "\n")) +} + +load_pckg("glue") +load_pckg("data.table") +load_pckg("yaml") + +## Parse arguments +args <- commandArgs(trailingOnly = TRUE) +if (length(args) < 2) { + cat("Usage: document_s2.R [output_path]\n") + stop() +} + +versions_path <- args[[1]] +params_path <- args[[2]] +output_path <- ifelse(length(args) >= 3, args[[3]], "README_Step2_Methods.txt") + + +## Validation +if(is.null(versions_path) || versions_path == ""){ + stop("Versions YAML not specified") +} +if(is.null(params_path) || params_path == ""){ + stop("Params table not specified") +} + +if(!file.exists(versions_path)){ + stop(glue("Versions YAML not found: {versions_path}")) +} +if(!file.exists(params_path)){ + stop(glue("Params table not found: {params_path}")) +} + + + +################################## +################################## References +################################## + +## Citation registry +citation_db <- list( + nextits = "Mikryukov V, Anslan S, Tedersoo L (2025) NextITS - A pipeline for metabarcoding fungi and other eukaryotes with full-length ITS sequenced with PacBio. DOI:10.5281/zenodo.15074882", + nextflow = "Di Tommaso P, et al. (2017) Nextflow enables reproducible computational workflows. Nat Biotechnol 35, 316-319, DOI:10.1038/nbt.3820", + vsearch = "Rognes T, Flouri T, Nichols B, Quince C, Mahé F (2016) VSEARCH: a versatile open source tool for metagenomics. PeerJ 4:e2584. DOI:10.7717/peerj.2584", + dada2 = "Callahan BJ, et al. (2016) DADA2: High-resolution sample inference from Illumina amplicon data. Nat Methods 13:581-583. DOI:10.1038/nmeth.3869", + unoise = "Edgar RC (2016) UNOISE2: improved error-correction for Illumina 16S and ITS amplicon sequencing. bioRxiv 081257. 
DOI:10.1101/081257", + swarm = "Mahé F, Czech L, Stamatakis A, Quince C, de Vargas C, Dunthorn M, Rognes T. (2021) Swarm v3: towards tera-scale amplicon clustering. Bioinformatics 38(1), 267-269. DOI:10.1093/bioinformatics/btab493", + lulu = "Frøslev TG, et al. (2017) Algorithm for post-clustering curation of DNA amplicon data yields reliable biodiversity estimates. Nat Commun 8:1188. DOI:10.1038/s41467-017-01312-x", + mumu = "Mahé F (2025) MUMU: C++ implementation of LULU, a R package for post-clustering curation of metabarcoding data. URL: https://github.com/frederic-mahe/mumu", + ucs = "Mikryukov V (2025) ucs - USEARCH cluster file parser. URL: https://github.com/vmikk/ucs", + duckdb = "Raasveldt M, Mühleisen H (2019) DuckDB: an Embeddable Analytical Database. SIGMOD '19: Proceedings of the 2019 International Conference on Management of Data, 1981-1984. DOI:10.1145/3299869.332021", + R = "R Core Team (2025) R: A Language and Environment for Statistical Computing. R Foundation for Statistical Computing, Vienna, Austria. URL: https://www.R-project.org/", + arrow = "Richardson N, Cook I, Crane N, Dunnington D, François R, Keane J, Moldovan-Grünfeld D, Ooms J, Wujciak-Jens J, and Apache Arrow (2025) arrow: Integration to Apache Arrow. URL: https://github.com/apache/arrow/", + biostrings= "Pagès H, Aboyoun P, Gentleman R, DebRoy S (2025) Biostrings: Efficient manipulation of biological strings. DOI:10.18129/B9.bioc.Biostrings", + datatable = "Barrett T, Dowle M, Srinivasan A, Gorecki J, Chirico M, Hocking T, Schwendinger B, Krylov I (2025) data.table: Extension of data.frame. 
URL: " +) + + +################################## +################################## Helpers +################################## + +## Get version number +getv <- function(v, process, tool){ + # v = list (from YAML file) + # process = process name + # tool = tool name + + if(is.null(v[[process]]) || is.null(v[[process]][[tool]])){ return("") } + as.character( v[[process]][[tool]] ) +} +# E.g., getv(versions, "dereplication", "vsearch") + + +## Get parameter +getp <- function(p, pname, default = NA){ + # p = table with parameters (two columns: name and value) + # pname = parameter name + # default = default value if parameter is not found + + pp <- p[ name == pname ]$value + if(is.null(pp) || is.na(pp)){ return(default) } + return(pp) +} +# E.g., getp(params, "otu_id", 0.98) + + +## Remove NAs and empty strings (to curate the citations) +trim_na <- function(x){ + x[ !is.na(x) & nzchar(x) ] +} + +################################## +################################## Body builders +################################## + +emit_nextits <- function(v) { + nextits_v <- if(!is.null(v$NextITS$version)){ as.character(v$NextITS$version) } else { "" } + glue("Bioinformatic processing was performed using the \\ + NextITS pipeline v.{nextits_v} (Mikryukov et al., 2025).") +} + +emit_nextflow <- function(v) { + nextflow_v <- if(!is.null(v$Nextflow$version)){ as.character(v$Nextflow$version) } else { "" } + glue("Workflow management was performed using \\ + Nextflow v.{nextflow_v} (Di Tommaso et al., 2017).") +} + +emit_aggregation <- function(p, v) { + glue("Sequences from all sequencing runs were aggregated and \\ + de novo chimeric sequences with chimera score >= {getp(p,'max_ChimeraScore',0.6)} were removed.") +} + +emit_dereplication <- function(p, v) { + minlen <- getp(p, "ampliconlen_min", NA) + maxlen <- getp(p, "ampliconlen_max", NA) + + length_filter <- "" + if(!is.na(minlen) && !is.na(maxlen)){ + length_filter <- glue(" Sequences shorter than {minlen} nt or longer than 
{maxlen} nt were excluded.") + } else if(!is.na(minlen)){ + length_filter <- glue(" Sequences shorter than {minlen} nt were excluded.") + } else if(!is.na(maxlen)){ + length_filter <- glue(" Sequences longer than {maxlen} nt were excluded.") + } + + glue("Global sequence dereplication was performed using \\ + VSEARCH v.{getv(v,'dereplication','vsearch')} (Rognes et al., 2016).\\ + {length_filter}") +} + +emit_preclustering <- function(p, v) { + preclustering_method <- getp(p, "preclustering", "none") + + res <- switch(preclustering_method, + + "none" = "", # No pre-clustering or denoising was performed + + "homopolymer" = glue( + "Global homopolymer correction was performed using an algorithm implemented in NextITS \\ + with support of VSEARCH v.{getv(v,'homopolymer','vsearch')} (Rognes et al., 2016)."), + + "unoise" = glue( + "Sequence denoising was performed using the UNOISE3 algorithm (Edgar, 2016) \\ + implemented in VSEARCH v.{getv(v,'unoise','vsearch')} (Rognes et al., 2016) \\ + with alpha parameter {getp(p,'unoise_alpha',6.0)} and minimum size {getp(p,'unoise_minsize',1)}."), + + "dada2" = glue( + "Sequence denoising was performed using \\ + DADA2 v.{getv(v,'dada2','dada2')} (Callahan et al., 2016)"), + # using {getp(p,'dada2_pooling','global')} pooling strategy." 
+ + "swarm_d1" = glue( + "Pre-clustering was performed using \\ + SWARM v.{getv(v,'precluster_swarm','swarm')} (Mahé et al., 2021) \\ + with d=1 and fastidious option enabled.") + ) + + return(res) +} + +emit_clustering <- function(p, v) { + clustering_method <- getp(p, "clustering", "vsearch") + preclustering_method <- getp(p, "preclustering", "none") + + ## Handle special case where SWARM pre-clustering = SWARM clustering with same d + # if(preclustering_method == "swarm_d1" && clustering_method == "swarm" && getp(p, "swarm_d", 1) == 1){ + # return("No additional clustering was performed (SWARM pre-clustering with d=1 was used as final clustering).") + # } + + cls <- switch(clustering_method, + + "vsearch" = glue( + "VSEARCH v.{getv(v,'cluster_vsearch','vsearch')} (Rognes et al., 2016) \\ + with { as.numeric(getp(p,'otu_id',0.98))*100}% similarity threshold."), + + "swarm" = { + fastidious_text <- if(getp(p, "swarm_fastidious", TRUE) && getp(p, "swarm_d", 1) == 1) { + " with fastidious option enabled" + } else { + "" + } + glue("SWARM v.{getv(v,'cluster_swarm','swarm')} (Mahé et al., 2021) \\ + with parameter d={getp(p,'swarm_d',1)}{fastidious_text}.") + }) + + res <- glue("OTU clustering was performed using {cls}") + return(res) +} + +emit_uc_merging <- function(p, v) { + glue("UC files from dereplication, pre-clustering, and clustering steps were merged using \\ + ucs v.{getv(v,'merge_uc','ucs')} (Mikryukov, 2025) and \\ + DuckDB v.{getv(v,'merge_uc','duckdb')} (Raasveldt & Mühleisen, 2019) \\ + to track sequence membership through all processing steps.") +} + +emit_lulu <- function(p, v) { + glue("Post-clustering curation was performed using \\ + LULU algorithm (Frøslev et al., 2017) \\ + as implemented in MUMU v.{getv(v,'lulu','mumu')} (Mahé, 2025) \\ + with {getp(p,'lulu_match',95.0)}% minimum similarity, \\ + {getp(p,'lulu_ratio',1.0)} minimum abundance ratio, \\ + and {getp(p,'lulu_relcooc',0.95)} minimum relative co-occurrence. 
\\ + Pairwise sequence similarities were calculated using \\ + VSEARCH v.{getv(v,'lulu','vsearch')} (Rognes et al., 2016).") +} + + +################################## +################################## Workflow-dependent method descriptions +################################## + +## Function to assembly the workflow description and references +build_docs <- function(versions, params){ + body <- character() + tools_used <- character() + + ## Pipeline version + body <- c(body, emit_nextits(versions)) + tools_used <- c(tools_used, "nextits") + + ## Nextflow version + body <- c(body, emit_nextflow(versions)) + tools_used <- c(tools_used, "nextflow") + + ## Sequence aggregation + body <- c(body, emit_aggregation(params, versions)) + + ## Sequence dereplication and amplicon length filtering + body <- c(body, emit_dereplication(params, versions)) + tools_used <- c(tools_used, "vsearch") + + ## Conditional: pre-clustering/denoising + preclustering_method <- getp(params, "preclustering", "none") + if(preclustering_method != "none" && !is.na(preclustering_method)){ + body <- c(body, emit_preclustering(params, versions)) + + switch(preclustering_method, + "homopolymer" = { + tools_used <- c(tools_used, "vsearch") + }, + "unoise" = { + tools_used <- c(tools_used, c("vsearch", "unoise")) + }, + "dada2" = { + tools_used <- c(tools_used, "dada2") + }, + "swarm_d1" = { + tools_used <- c(tools_used, "swarm") + } + ) + } + + ## Conditional: clustering + clustering_method <- getp(params, "clustering", "vsearch") + if(clustering_method != "none" && !is.na(clustering_method)){ + ## Skip clustering description if it's redundant with pre-clustering + if(!(preclustering_method == "swarm_d1" && clustering_method == "swarm" && getp(params, "swarm_d", 1) == 1)){ + body <- c(body, emit_clustering(params, versions)) + + switch(clustering_method, + "vsearch" = { + tools_used <- c(tools_used, "vsearch") + }, + "swarm" = { + tools_used <- c(tools_used, "swarm") + } + ) + } + } + + ## UC file 
merging + body <- c(body, emit_uc_merging(params, versions)) + tools_used <- c(tools_used, c("ucs", "duckdb")) + + ## Conditional: LULU post-clustering curation + lulu_enabled <- tolower(as.character(getp(params, "lulu", "true"))) %in% c("true", "t", "1") + if(lulu_enabled){ + body <- c(body, emit_lulu(params, versions)) + tools_used <- c(tools_used, c("mumu", "lulu", "vsearch")) + } + + ## Generate citations + tools_used <- unique(tools_used) + citations <- trim_na( unlist(citation_db[tools_used]) ) + citations <- sort(unique(citations)) + + res <- list( + body = body, + citations = citations) + + return(res) +} + + +################################## +################################## Assemble body and citations +################################## + +## Load inputs +cat("Loading versions YAML...\n") +versions <- yaml::read_yaml(versions_path) + +cat("Loading params table...\n") +params <- data.table::fread(params_path, sep = "\t", header = TRUE, na.strings = c("", "NA")) +setnames(params, new = c("name", "value")) + +## Build body and citations +res <- build_docs(versions, params) + +## Write output +con <- file(output_path, open = "wt") + +writeLines("Methods:", con) +writeLines(res$body, con) + +writeLines("", con) + +writeLines("References:", con) +writeLines(paste0("- ", res$citations), con) + +close(con) + +cat("All done.\n") + diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/hash_sequences.sh b/src/pipecraft-core/service_scripts/NextITS/bin/hash_sequences.sh index 3cb686c..d36f6ba 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/hash_sequences.sh +++ b/src/pipecraft-core/service_scripts/NextITS/bin/hash_sequences.sh @@ -1,17 +1,17 @@ -#!/bin/bash - -awk \ - '{ print $0 "\t" encodeData( $2 ) } - function encodeData( fld, cmd, output ) { - cmd = "printf \047" fld "\047 | sha1sum" - if ( (cmd | getline output) > 0 ) { - sub(/ .*/,"",output) - } - else { - print "failed to hash " fld | "cat>&2" - output = fld - } - close( cmd ) - return 
output - }' \ - "$1" +#!/bin/bash + +awk \ + '{ print $0 "\t" encodeData( $2 ) } + function encodeData( fld, cmd, output ) { + cmd = "printf \047" fld "\047 | sha1sum" + if ( (cmd | getline output) > 0 ) { + sub(/ .*/,"",output) + } + else { + print "failed to hash " fld | "cat>&2" + output = fld + } + close( cmd ) + return output + }' \ + "$1" diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/homopolymer_collapse_len.sh b/src/pipecraft-core/service_scripts/NextITS/bin/homopolymer_collapse_len.sh index 725efdc..c757543 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/homopolymer_collapse_len.sh +++ b/src/pipecraft-core/service_scripts/NextITS/bin/homopolymer_collapse_len.sh @@ -1,58 +1,58 @@ -#!/bin/bash - -## Script to compress homopolymer stretches (e.g., prior to k-mer counting) -## All homopolymer stretches will be collapsed to max H length (default, 1) - -## Input: -# $1 = Parameter H (max homopolymer length) -# $2 = input FASTA file - -## Usage example: -# -# cat > input.fasta <<'EOT' -# >s1 -# AACCCTTTTGGGGG -# >s2 -# ACCTTTGGGGGGGG -# >s3 -# ACTGACTGACTGAC -# EOT -# -# homopolymer_compression.sh 2 input.fasta - - - -awk -v H="$1" '\ - -BEGIN { - if (H < 1) H = 1; -} - -# If the line is a header, print it as is -/^>/ { - print; - next; -} - -# Process sequence lines -{ - sequence = $0; - collapsedSeq = ""; - count = 1; - - for (i = 2; i <= length(sequence); i++) { - if (substr(sequence, i, 1) == substr(sequence, i - 1, 1)) { - count++; - } else { - collapsedSeq = collapsedSeq substr(sequence, i - count, (count > H) ? H : count); - count = 1; - } - } - - # Handle the last homopolymer stretch - collapsedSeq = collapsedSeq substr(sequence, length(sequence) - count + 1, (count > H) ? 
H : count); - - print collapsedSeq; -} -' "$2" - +#!/bin/bash + +## Script to compress homopolymer stretches (e.g., prior to k-mer counting) +## All homopolymer stretches will be collapsed to max H length (default, 1) + +## Input: +# $1 = Parameter H (max homopolymer length) +# $2 = input FASTA file + +## Usage example: +# +# cat > input.fasta <<'EOT' +# >s1 +# AACCCTTTTGGGGG +# >s2 +# ACCTTTGGGGGGGG +# >s3 +# ACTGACTGACTGAC +# EOT +# +# homopolymer_compression.sh 2 input.fasta + + + +awk -v H="$1" '\ + +BEGIN { + if (H < 1) H = 1; +} + +# If the line is a header, print it as is +/^>/ { + print; + next; +} + +# Process sequence lines +{ + sequence = $0; + collapsedSeq = ""; + count = 1; + + for (i = 2; i <= length(sequence); i++) { + if (substr(sequence, i, 1) == substr(sequence, i - 1, 1)) { + count++; + } else { + collapsedSeq = collapsedSeq substr(sequence, i - count, (count > H) ? H : count); + count = 1; + } + } + + # Handle the last homopolymer stretch + collapsedSeq = collapsedSeq substr(sequence, length(sequence) - count + 1, (count > H) ? 
H : count); + + print collapsedSeq; +} +' "$2" + diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/homopolymer_compression.sh b/src/pipecraft-core/service_scripts/NextITS/bin/homopolymer_compression.sh index 86ffeb7..03e66d5 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/homopolymer_compression.sh +++ b/src/pipecraft-core/service_scripts/NextITS/bin/homopolymer_compression.sh @@ -1,7 +1,7 @@ -#!/bin/bash - -## Script to collapse homopolymer stretches to a single letter - -bioawk \ - '{ gsub(/[A]+/,"A");gsub(/[C]+/,"C");gsub(/[T]+/,"T");gsub(/[G]+/,"G");gsub(/[N]+/,"N") }1' \ - "$1" +#!/bin/bash + +## Script to collapse homopolymer stretches to a single letter + +bioawk \ + '{ gsub(/[A]+/,"A");gsub(/[C]+/,"C");gsub(/[T]+/,"T");gsub(/[G]+/,"G");gsub(/[N]+/,"N") }1' \ + "$1" diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/max_ee.R b/src/pipecraft-core/service_scripts/NextITS/bin/max_ee.R index f8468fb..3031d40 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/max_ee.R +++ b/src/pipecraft-core/service_scripts/NextITS/bin/max_ee.R @@ -1,57 +1,57 @@ -#!/usr/bin/env Rscript - -## Script to combine Phred scores and MaxEE estimates - -# Input is given as positional arguments: -# 1. Phred score table (`tmp_hash_table.txt`) -# 2. MaxEE table (`tmp_ee.txt`) -# 3. Output file name (`${sampID}_hash_table.txt`) - - -args <- commandArgs(trailingOnly = TRUE) - -## Debug: -# args <- c( -# "tmp_hash_table.txt", -# "tmp_ee.txt", -# "res_hash_table.txt" -# ) - -suppressMessages(library(data.table)) - - -## Load table with Phred scores -cat("..Loading Phred scores\n") -T1 <- fread( - file = args[1], - sep = "\t", header = FALSE, - col.names = c("SeqID", "SeqHash", "Len", "PhredScore"), - colClasses = c("character", "character", "numeric", "numeric")) - -if(any(is.na(T1$Len))){ - cat("WARNING: non-numeric data detected. 
Maybe there are some empty sequences\n") -} - -## Load table with Phred scores -cat("..Loading MaxEE estimates\n") -T2 <- fread( - file = args[2], - sep = "\t", header = FALSE, - col.names = c("SeqID", "MaxEE")) - -## Merge tables -cat("..Merging tables\n") -TAB <- merge(x = T1, y = T2, by = "SeqID", all.x = TRUE) - -## Estimate the MEEP score (Koparde et al., DOI:10.1504/IJCBDD.2017.10006006) -## Maximum number of probable incorrect base calls per every 100 bases in the read -cat("..Estimating MEEP score\n") -TAB[ , MEEP := 100 * MaxEE / Len ] - -## Export results -cat("..Exporting results\n") -fwrite(x = TAB, - file = args[3], - sep = "\t", - compress = "none") - +#!/usr/bin/env Rscript + +## Script to combine Phred scores and MaxEE estimates + +# Input is given as positional arguments: +# 1. Phred score table (`tmp_hash_table.txt`) +# 2. MaxEE table (`tmp_ee.txt`) +# 3. Output file name (`${sampID}_hash_table.txt`) + + +args <- commandArgs(trailingOnly = TRUE) + +## Debug: +# args <- c( +# "tmp_hash_table.txt", +# "tmp_ee.txt", +# "res_hash_table.txt" +# ) + +suppressMessages(library(data.table)) + + +## Load table with Phred scores +cat("..Loading Phred scores\n") +T1 <- fread( + file = args[1], + sep = "\t", header = FALSE, + col.names = c("SeqID", "SeqHash", "Len", "PhredScore"), + colClasses = c("character", "character", "numeric", "numeric")) + +if(any(is.na(T1$Len))){ + cat("WARNING: non-numeric data detected. 
Maybe there are some empty sequences\n") +} + +## Load table with Phred scores +cat("..Loading MaxEE estimates\n") +T2 <- fread( + file = args[2], + sep = "\t", header = FALSE, + col.names = c("SeqID", "MaxEE")) + +## Merge tables +cat("..Merging tables\n") +TAB <- merge(x = T1, y = T2, by = "SeqID", all.x = TRUE) + +## Estimate the MEEP score (Koparde et al., DOI:10.1504/IJCBDD.2017.10006006) +## Maximum number of probable incorrect base calls per every 100 bases in the read +cat("..Estimating MEEP score\n") +TAB[ , MEEP := 100 * MaxEE / Len ] + +## Export results +cat("..Exporting results\n") +fwrite(x = TAB, + file = args[3], + sep = "\t", + compress = "none") + diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/merge_hash_tables.sh b/src/pipecraft-core/service_scripts/NextITS/bin/merge_hash_tables.sh index 9973a6b..ec5313f 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/merge_hash_tables.sh +++ b/src/pipecraft-core/service_scripts/NextITS/bin/merge_hash_tables.sh @@ -1,131 +1,131 @@ -#!/bin/bash - -## Usage: -# merge_hash_tables.sh \ -# -i '/path/to/input/directory' \ -# -o '/path/to/output.parquet' \ -# -t 4 - -## Input data: -# - Tab-delimited tables with columns: -# SampleID - Hash - PacBioID - AvgPhredScore - MaxEE - MEEP - Sequence - Quality - Length - -## Notes -# - memory constraints might reduce the number of threads used -# - when saving to parquet, the ROW_GROUP_SIZE param might be adjusted to reduce memory usage (but the effect is not very significant): -# default ROW_GROUP_SIZE = 122,880 (with DuckDB's vector size = 2,048 -> 60 row groups) -# here, a half of the default value is used (ROW_GROUP_SIZE = 61,440 -> 30 row groups) - -## Function to display usage information -usage() { - echo "Usage: $0 -i INPUTDIR -o OUTPUT [-t THREADS] [-m MEMORY] [-x TEMP_DIR] [-z COMPRESSION]" - echo " -i INPUTDIR : Input directory with text files" - echo " -o OUTPUT : Output Parquet file path" - echo " -t THREADS : Number of CPU threads to use 
(optional)" - echo " -m MEMORY : Memory limit (e.g., '100GB') (optional)" - echo " -z COMPRESSION : ZSTD compression level (0-22) (optional, default: 12)" - exit 1 -} - -## Initialize variables -INPUT="" -OUTPUT="" -THREADS="" -MEMORY="" -COMPRESSION="12" - -## Parse command-line options -while getopts "i:o:t:m:z:" opt; do - case $opt in - i) INPUT="$OPTARG" ;; - o) OUTPUT="$OPTARG" ;; - t) THREADS="$OPTARG" ;; - m) MEMORY="$OPTARG" ;; - z) COMPRESSION="$OPTARG" ;; - *) usage ;; - esac -done - - -## Validate input parameters -if [[ -z "$INPUT" || -z "$OUTPUT" ]]; then - echo -e "Error: Missing required parameters!\n" - usage -fi - -## Threads should be a positive integer -if [[ -n "$THREADS" && "$THREADS" -le 0 ]]; then - echo -e "Error: Threads must be a positive integer!\n" - usage -fi - -## Validate compression level -if ! [[ "$COMPRESSION" =~ ^[0-9]+$ ]] || [ "$COMPRESSION" -lt 0 ] || [ "$COMPRESSION" -gt 22 ]; then - echo -e "Error: Compression level must be an integer between 0 and 22!\n" - usage -fi - -## View user-supplied parameters -echo -e "\nInput parameters:" -echo "Input directory: $INPUT" -echo "Output file: $OUTPUT" -if [[ -n "$THREADS" ]]; then - echo "Threads: $THREADS" -fi -if [[ -n "$MEMORY" ]]; then - echo "Memory: $MEMORY" -fi -echo "Parquet compression level (ZSTD): $COMPRESSION" - - -SQL_COMMAND="" - -## Add configuration settings (if provided) -if [[ -n "$THREADS" ]]; then - SQL_COMMAND+=" -SET threads TO ${THREADS}; -" -fi - -if [[ -n "$MEMORY" ]]; then - SQL_COMMAND+=" -SET memory_limit = '${MEMORY}'; -" -fi - -SQL_COMMAND+=" -COPY ( - SELECT - column0 as SampleID, - column1 as Hash, - column2 as PacBioID, - column3 as AvgPhredScore, - column4 as MaxEE, - column5 as MEEP, - column6 as Sequence, - column7 as Quality, - column8 as Length - FROM read_csv('${INPUT}/*.txt.gz', - header = false, - delim = '\t', - quote = '', - columns = { - 'column0': 'VARCHAR', - 'column1': 'VARCHAR', - 'column2': 'VARCHAR', - 'column3': 'DOUBLE', - 'column4': 
'DOUBLE', - 'column5': 'DOUBLE', - 'column6': 'VARCHAR', - 'column7': 'VARCHAR', - 'column8': 'INTEGER' - } - ) -) TO '${OUTPUT}' (FORMAT PARQUET, ROW_GROUP_SIZE 61_440, COMPRESSION 'ZSTD', COMPRESSION_LEVEL ${COMPRESSION}); -" - -## Execute the SQL command -echo -e "\nExecuting DuckDB command" - -duckdb -c "${SQL_COMMAND}" - +#!/bin/bash + +## Usage: +# merge_hash_tables.sh \ +# -i '/path/to/input/directory' \ +# -o '/path/to/output.parquet' \ +# -t 4 + +## Input data: +# - Tab-delimited tables with columns: +# SampleID - Hash - PacBioID - AvgPhredScore - MaxEE - MEEP - Sequence - Quality - Length + +## Notes +# - memory constraints might reduce the number of threads used +# - when saving to parquet, the ROW_GROUP_SIZE param might be adjusted to reduce memory usage (but the effect is not very significant): +# default ROW_GROUP_SIZE = 122,880 (with DuckDB's vector size = 2,048 -> 60 row groups) +# here, a half of the default value is used (ROW_GROUP_SIZE = 61,440 -> 30 row groups) + +## Function to display usage information +usage() { + echo "Usage: $0 -i INPUTDIR -o OUTPUT [-t THREADS] [-m MEMORY] [-x TEMP_DIR] [-z COMPRESSION]" + echo " -i INPUTDIR : Input directory with text files" + echo " -o OUTPUT : Output Parquet file path" + echo " -t THREADS : Number of CPU threads to use (optional)" + echo " -m MEMORY : Memory limit (e.g., '100GB') (optional)" + echo " -z COMPRESSION : ZSTD compression level (0-22) (optional, default: 12)" + exit 1 +} + +## Initialize variables +INPUT="" +OUTPUT="" +THREADS="" +MEMORY="" +COMPRESSION="12" + +## Parse command-line options +while getopts "i:o:t:m:z:" opt; do + case $opt in + i) INPUT="$OPTARG" ;; + o) OUTPUT="$OPTARG" ;; + t) THREADS="$OPTARG" ;; + m) MEMORY="$OPTARG" ;; + z) COMPRESSION="$OPTARG" ;; + *) usage ;; + esac +done + + +## Validate input parameters +if [[ -z "$INPUT" || -z "$OUTPUT" ]]; then + echo -e "Error: Missing required parameters!\n" + usage +fi + +## Threads should be a positive integer +if [[ -n 
"$THREADS" && "$THREADS" -le 0 ]]; then + echo -e "Error: Threads must be a positive integer!\n" + usage +fi + +## Validate compression level +if ! [[ "$COMPRESSION" =~ ^[0-9]+$ ]] || [ "$COMPRESSION" -lt 0 ] || [ "$COMPRESSION" -gt 22 ]; then + echo -e "Error: Compression level must be an integer between 0 and 22!\n" + usage +fi + +## View user-supplied parameters +echo -e "\nInput parameters:" +echo "Input directory: $INPUT" +echo "Output file: $OUTPUT" +if [[ -n "$THREADS" ]]; then + echo "Threads: $THREADS" +fi +if [[ -n "$MEMORY" ]]; then + echo "Memory: $MEMORY" +fi +echo "Parquet compression level (ZSTD): $COMPRESSION" + + +SQL_COMMAND="" + +## Add configuration settings (if provided) +if [[ -n "$THREADS" ]]; then + SQL_COMMAND+=" +SET threads TO ${THREADS}; +" +fi + +if [[ -n "$MEMORY" ]]; then + SQL_COMMAND+=" +SET memory_limit = '${MEMORY}'; +" +fi + +SQL_COMMAND+=" +COPY ( + SELECT + column0 as SampleID, + column1 as Hash, + column2 as PacBioID, + column3 as AvgPhredScore, + column4 as MaxEE, + column5 as MEEP, + column6 as Sequence, + column7 as Quality, + column8 as Length + FROM read_csv('${INPUT}/*.txt.gz', + header = false, + delim = '\t', + quote = '', + columns = { + 'column0': 'VARCHAR', + 'column1': 'VARCHAR', + 'column2': 'VARCHAR', + 'column3': 'DOUBLE', + 'column4': 'DOUBLE', + 'column5': 'DOUBLE', + 'column6': 'VARCHAR', + 'column7': 'VARCHAR', + 'column8': 'INTEGER' + } + ) +) TO '${OUTPUT}' (FORMAT PARQUET, ROW_GROUP_SIZE 61_440, COMPRESSION 'ZSTD', COMPRESSION_LEVEL ${COMPRESSION}); +" + +## Execute the SQL command +echo -e "\nExecuting DuckDB command" + +duckdb -c "${SQL_COMMAND}" + diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/merge_sequnce_qualities.sh b/src/pipecraft-core/service_scripts/NextITS/bin/merge_sequnce_qualities.sh index bee6316..c30735c 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/merge_sequnce_qualities.sh +++ b/src/pipecraft-core/service_scripts/NextITS/bin/merge_sequnce_qualities.sh @@ -1,12 
+1,12 @@ -#!/bin/bash - -# $1 = input file -# $2 = text to add to the resulting file - -zcat "$1" \ - | awk \ - -F '\t' -v OFS='\t' \ - -v fnm="$2" \ - 'NR>1 { print fnm , $2 , $3 , $4 , $5 , $6 }' \ - | sed 's/_hash_table.txt//' \ - | sed '1i SampleID\tSeqID\tSeqLen\tPhredScore\tMaxEE\tMEEP' +#!/bin/bash + +# $1 = input file +# $2 = text to add to the resulting file + +zcat "$1" \ + | awk \ + -F '\t' -v OFS='\t' \ + -v fnm="$2" \ + 'NR>1 { print fnm , $2 , $3 , $4 , $5 , $6 }' \ + | sed 's/_hash_table.txt//' \ + | sed '1i SampleID\tSeqID\tSeqLen\tPhredScore\tMaxEE\tMEEP' diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/merge_tj_memberships.sh b/src/pipecraft-core/service_scripts/NextITS/bin/merge_tj_memberships.sh old mode 100644 new mode 100755 index b82e8ea..683b834 --- a/src/pipecraft-core/service_scripts/NextITS/bin/merge_tj_memberships.sh +++ b/src/pipecraft-core/service_scripts/NextITS/bin/merge_tj_memberships.sh @@ -1,87 +1,87 @@ -#!/bin/bash - -## Usage: -# merge_tj_memberships.sh \ -# -d 'Dereplicated.parquet' \ -# -c 'Clustered.parquet' \ -# -o 'TJPreclust.uc.parquet' \ -# -t 4 - -## Input data: -# - Parsed UC file from dereplication (`Dereplicated.parquet`) -# - Parsed UC file from clustering (`Clustered.parquet`) - -## Function to display usage information -usage() { - echo "Usage: $0 -d DEREP -c CLUST -o OUTPUT [-t THREADS]" - echo " -d DEREP : Parquet file from dereplication" - echo " -c CLUST : Parquet file from clustering" - echo " -o OUTPUT : Output Parquet file path" - echo " -t THREADS : Number of CPU threads to use (optional)" - exit 1 -} - -## Initialize variables -DEREP="" -CLUST="" -OUTPUT="TJPreclust.uc.parquet" # default output file name -THREADS="" - -## Parse command-line options -while getopts "d:c:o:t:" opt; do - case $opt in - d) DEREP="$OPTARG" ;; - c) CLUST="$OPTARG" ;; - o) OUTPUT="$OPTARG" ;; - t) THREADS="$OPTARG" ;; - *) usage ;; - esac -done - - -## Validate input parameters -if [[ -z "$DEREP" || -z "$CLUST" ]]; then 
- echo -e "Error: Missing required parameters!\n" - usage -fi - -## Threads should be a positive integer -if [[ -n "$THREADS" && "$THREADS" -le 0 ]]; then - echo -e "Error: Threads must be a positive integer!\n" - usage -fi - -## View user-supplied parameters -echo -e "\nInput parameters:" -echo "Parquet file from dereplication: $DEREP" -echo "Parquet file from clustering: $CLUST" -echo "Output file: $OUTPUT" -if [[ -n "$THREADS" ]]; then - echo "Threads: $THREADS" -fi - -SQL_COMMAND="" - -## Add configuration settings (if provided) -if [[ -n "$THREADS" ]]; then - SQL_COMMAND+=" -SET threads TO ${THREADS}; -" -fi - - -SQL_COMMAND+=" -COPY ( - SELECT - d.query AS SeqID, - c.target AS OTU - FROM read_parquet('${DEREP}') AS d - LEFT JOIN read_parquet('${CLUST}') AS c - ON d.target = c.query -) TO '${OUTPUT}' (FORMAT PARQUET, COMPRESSION 'ZSTD', COMPRESSION_LEVEL 8); -" - -## Execute the SQL command -echo -e "\nExecuting DuckDB command" - -duckdb -c "${SQL_COMMAND}" +#!/bin/bash + +## Usage: +# merge_tj_memberships.sh \ +# -d 'Dereplicated.parquet' \ +# -c 'Clustered.parquet' \ +# -o 'TJPreclust.uc.parquet' \ +# -t 4 + +## Input data: +# - Parsed UC file from dereplication (`Dereplicated.parquet`) +# - Parsed UC file from clustering (`Clustered.parquet`) + +## Function to display usage information +usage() { + echo "Usage: $0 -d DEREP -c CLUST -o OUTPUT [-t THREADS]" + echo " -d DEREP : Parquet file from dereplication" + echo " -c CLUST : Parquet file from clustering" + echo " -o OUTPUT : Output Parquet file path" + echo " -t THREADS : Number of CPU threads to use (optional)" + exit 1 +} + +## Initialize variables +DEREP="" +CLUST="" +OUTPUT="TJPreclust.uc.parquet" # default output file name +THREADS="" + +## Parse command-line options +while getopts "d:c:o:t:" opt; do + case $opt in + d) DEREP="$OPTARG" ;; + c) CLUST="$OPTARG" ;; + o) OUTPUT="$OPTARG" ;; + t) THREADS="$OPTARG" ;; + *) usage ;; + esac +done + + +## Validate input parameters +if [[ -z "$DEREP" || -z 
"$CLUST" ]]; then + echo -e "Error: Missing required parameters!\n" + usage +fi + +## Threads should be a positive integer +if [[ -n "$THREADS" && "$THREADS" -le 0 ]]; then + echo -e "Error: Threads must be a positive integer!\n" + usage +fi + +## View user-supplied parameters +echo -e "\nInput parameters:" +echo "Parquet file from dereplication: $DEREP" +echo "Parquet file from clustering: $CLUST" +echo "Output file: $OUTPUT" +if [[ -n "$THREADS" ]]; then + echo "Threads: $THREADS" +fi + +SQL_COMMAND="" + +## Add configuration settings (if provided) +if [[ -n "$THREADS" ]]; then + SQL_COMMAND+=" +SET threads TO ${THREADS}; +" +fi + + +SQL_COMMAND+=" +COPY ( + SELECT + d.query AS SeqID, + c.target AS OTU + FROM read_parquet('${DEREP}') AS d + LEFT JOIN read_parquet('${CLUST}') AS c + ON d.target = c.query +) TO '${OUTPUT}' (FORMAT PARQUET, COMPRESSION 'ZSTD', COMPRESSION_LEVEL 8); +" + +## Execute the SQL command +echo -e "\nExecuting DuckDB command" + +duckdb -c "${SQL_COMMAND}" diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/merge_uc_files.R b/src/pipecraft-core/service_scripts/NextITS/bin/merge_uc_files.R index 8f67841..94be777 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/merge_uc_files.R +++ b/src/pipecraft-core/service_scripts/NextITS/bin/merge_uc_files.R @@ -1,264 +1,264 @@ -#!/usr/bin/env Rscript - -## Merge UC files from different steps (dereplication, pre-clustering, clustering) into a single file - -cat("Joining parquet files\n\n") - -## Check time -start_time <- Sys.time() - -## Function to load packages -load_pckg <- function(pkg = "data.table"){ - suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) - cat(".. 
", paste(pkg, packageVersion(pkg), "\n")) -} - -cat("Loading packages:\n") -load_pckg("DBI") -load_pckg("duckdb") -# load_pckg("qs") -# load_pckg("data.table") -# load_pckg("arrow") -# load_pckg("dplyr") - -cat("\nParsing input options and arguments...\n") - -suppressPackageStartupMessages(require(optparse)) - -## Parse arguments -option_list <- list( - make_option("--ucderep", action="store", default=NA, type='character', help="UC file from global dereplication"), - make_option("--ucpreclust", action="store", default=NA, type='character', help="UC file from pre-clustering (optional)"), - make_option("--ucclust", action="store", default=NA, type='character', help="UC file from clustering"), - make_option("--output", action="store", default="UC_Pooled.parquet", type='character', help="Output file name"), - make_option(c("-t", "--threads"), action="store", default=4L, type='integer', help="Number of CPU threads for arrow, default 4") -) -opt <- parse_args(OptionParser(option_list=option_list)) - -## Function to convert text "NA"s to NA -to_na <- function(x){ - if(x %in% c("NA", "null", "Null")){ x <- NA } - return(x) -} - -## Replaces "null"s from Nextflow with NA -opt <- lapply(X = opt, FUN = to_na) - - -## Validation of the required argiments -if(is.na(opt$ucderep)){ - cat("Input file is not specified: UC file from global dereplication.\n", file=stderr()) - stop() -} -if(is.na(opt$ucclust)){ - cat("Input file is not specified: UC file from clustering.\n", file=stderr()) - stop() -} - - -## Assign variables -UCDEREP <- opt$ucderep -UCPRECLUST <- opt$ucpreclust -UCCLUST <- opt$ucclust -OUTPUT <- opt$output -CPUTHREADS <- as.numeric( opt$threads ) - -## Log assigned variables -cat("\nParameters specified:\n") -cat(paste(" UC file from global dereplication: ", UCDEREP, "\n", sep="")) -cat(paste(" UC file from pre-clustering or denoising: ", UCPRECLUST, "\n", sep="")) -cat(paste(" UC file from clustering: ", UCCLUST, "\n", sep="")) -cat(paste(" Output file name: ", 
OUTPUT, "\n", sep="")) -cat(paste(" Number of CPU threads to use: ", CPUTHREADS, "\n", sep="")) - -cat("\n") - -## Data for debugging -# UCDEREP <- "UC_derep.parquet" -# UCPRECLUST <- "UC_preclust.parquet" # "NoPrecluster" -# UCCLUST <- "UC_clust.parquet" -# OUTPUT <- "UC_Pooled.parquet" -# CPUTHREADS <- 4 - -if(UCPRECLUST == "NoPrecluster") { - UCPRECLUST <- NA -} - - -###################################### -###################################### Load and process the data [duckdb] -###################################### - -## Initialize DuckDB connection -cat("..Initializing DuckDB connection\n") -con <- DBI::dbConnect(duckdb::duckdb()) -# duckdb::duckdb(dbdir = tempfile(pattern = "nextits.duckdb.") - -## Register parquet files as tables -cat("..Registering parquet files as tables\n") -cat("...Dereplication UC\n") -dbExecute(con, sprintf("CREATE VIEW derep_seqs AS SELECT * FROM parquet_scan('%s')", UCDEREP)) - -if(!is.na(UCPRECLUST)) { - cat("...Pre-clustering UC\n") - dbExecute(con, sprintf("CREATE VIEW preclust_seqs AS SELECT * FROM parquet_scan('%s')", UCPRECLUST)) -} - -cat("...Clustering UC\n") -dbExecute(con, sprintf("CREATE VIEW clust_seqs AS SELECT * FROM parquet_scan('%s')", UCCLUST)) - -## Set number of threads -cat("..Setting number of threads for DuckDB\n") -dbExecute(con, sprintf("SET threads TO %d;", CPUTHREADS)) - - -## Process and merge the data -if(is.na(UCPRECLUST)) { - - ## Two-file merge (no pre-clustering) - cat("..Merging UC files [no pre-clustering or denoising]\n") - dbExecute(con, sprintf(" - COPY ( - WITH derep AS ( - SELECT DISTINCT - query as SeqID, - target as DerepID - FROM derep_seqs - QUALIFY ROW_NUMBER() OVER (PARTITION BY query ORDER BY target) = 1 - ), - clust AS ( - SELECT DISTINCT - query as DerepID, - target as OTU - FROM clust_seqs - QUALIFY ROW_NUMBER() OVER (PARTITION BY query ORDER BY target) = 1 - ) - SELECT - d.SeqID, - d.DerepID, - c.OTU - FROM derep d - LEFT JOIN clust c ON d.DerepID = c.DerepID - ) TO '%s' - (FORMAT 
'parquet', COMPRESSION 'ZSTD', COMPRESSION_LEVEL 8)", - OUTPUT)) - -} else { - - ## Three-file merge (with pre-clustering) - cat("..Merging UC files [with pre-clustering or denoising]\n") - - dbExecute(con, sprintf(" - COPY ( - WITH derep AS ( - SELECT DISTINCT - query as SeqID, - target as DerepID - FROM derep_seqs - QUALIFY ROW_NUMBER() OVER (PARTITION BY query ORDER BY target) = 1 - ), - preclust AS ( - SELECT DISTINCT - query as DerepID, - target as PreclusterID - FROM preclust_seqs - QUALIFY ROW_NUMBER() OVER (PARTITION BY query ORDER BY target) = 1 - ), - clust AS ( - SELECT DISTINCT - query as PreclusterID, - target as OTU - FROM clust_seqs - QUALIFY ROW_NUMBER() OVER (PARTITION BY query ORDER BY target) = 1 - ) - SELECT - d.SeqID, - d.DerepID, - p.PreclusterID, - c.OTU - FROM derep d - LEFT JOIN preclust p ON d.DerepID = p.DerepID - LEFT JOIN clust c ON p.PreclusterID = c.PreclusterID - ) TO '%s' - (FORMAT 'parquet', COMPRESSION 'ZSTD', COMPRESSION_LEVEL 8)", - OUTPUT)) - -} - -## Clean up -cat("..Disconnecting from DuckDB\n") -dbDisconnect(con, shutdown = TRUE) - - -cat("..Done!\n") - - - -###################################### -###################################### Load and process the data [arrow + dplyr + data.table] -###################################### - -# ## Set number of threads for data.table -# cat("..Setting number of threads\n") -# setDTthreads(threads = CPUTHREADS) # for data.table -# set_cpu_count(CPUTHREADS) # for arrow -# -# ## Globally dereplicated sequences (remove multi-target matches) -# cat("..Loading globally dereplicated sequences\n") -# UCA <- open_dataset(UCDEREP) %>% -# rename(SeqID = query, DerepID = target) %>% -# to_duckdb() %>% -# distinct(SeqID, .keep_all = TRUE) %>% -# collect() %>% -# setDT() -# -# ## Pre-clustered sequences (remove multi-target matches) -# if(!is.na(UCPRECLUST)){ -# cat("..Loading pre-clustered sequences\n") -# UCP <- open_dataset(UCPRECLUST) %>% -# rename(DerepID = query, PreclusterID = target) %>% -# 
to_duckdb() %>% -# distinct(DerepID, .keep_all = TRUE) %>% -# collect() %>% -# setDT() -# } -# -# ## Clustered sequences (remove multi-target matches) -# cat("..Loading clustering UC file\n") -# if(!is.na(UCPRECLUST)){ -# UCO <- open_dataset(UCCLUST) %>% -# rename(PreclusterID = query, OTU = target) %>% -# to_duckdb() %>% -# distinct(PreclusterID, .keep_all = TRUE) %>% -# collect() %>% -# setDT() -# } else { -# UCO <- open_dataset(UCCLUST) %>% -# rename(DerepID = query, OTU = target) %>% -# to_duckdb() %>% -# distinct(DerepID, .keep_all = TRUE) %>% -# collect() %>% -# setDT() -# } -# -# ## Merge UC files -# if(is.na(UCPRECLUST)){ -# -# ## No pre-clustering or denoising -# cat("..Merging UC files [no pre-clustering or denoising ]\n") -# RES <- merge(x = UCA, y = UCO, by = "DerepID", all.x = TRUE) -# -# } else { -# -# ## Merge UC files with pre-clustering or denoising -# cat("..Merging UC files [with pre-clustering or denoising ]\n") -# -# cat("... Adding pre-cluster or denoised IDs\n") -# RES <- merge(x = UCA, y = UCP, by = "DerepID", all.x = TRUE) -# -# cat("... Adding clustering IDs\n") -# RES <- merge(x = RES, y = UCO, by = "PreclusterID", all.x = TRUE) -# -# } -# - +#!/usr/bin/env Rscript + +## Merge UC files from different steps (dereplication, pre-clustering, clustering) into a single file + +cat("Joining parquet files\n\n") + +## Check time +start_time <- Sys.time() + +## Function to load packages +load_pckg <- function(pkg = "data.table"){ + suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) + cat(".. 
", paste(pkg, packageVersion(pkg), "\n")) +} + +cat("Loading packages:\n") +load_pckg("DBI") +load_pckg("duckdb") +# load_pckg("qs") +# load_pckg("data.table") +# load_pckg("arrow") +# load_pckg("dplyr") + +cat("\nParsing input options and arguments...\n") + +suppressPackageStartupMessages(require(optparse)) + +## Parse arguments +option_list <- list( + make_option("--ucderep", action="store", default=NA, type='character', help="UC file from global dereplication"), + make_option("--ucpreclust", action="store", default=NA, type='character', help="UC file from pre-clustering (optional)"), + make_option("--ucclust", action="store", default=NA, type='character', help="UC file from clustering"), + make_option("--output", action="store", default="UC_Pooled.parquet", type='character', help="Output file name"), + make_option(c("-t", "--threads"), action="store", default=4L, type='integer', help="Number of CPU threads for arrow, default 4") +) +opt <- parse_args(OptionParser(option_list=option_list)) + +## Function to convert text "NA"s to NA +to_na <- function(x){ + if(x %in% c("NA", "null", "Null")){ x <- NA } + return(x) +} + +## Replaces "null"s from Nextflow with NA +opt <- lapply(X = opt, FUN = to_na) + + +## Validation of the required argiments +if(is.na(opt$ucderep)){ + cat("Input file is not specified: UC file from global dereplication.\n", file=stderr()) + stop() +} +if(is.na(opt$ucclust)){ + cat("Input file is not specified: UC file from clustering.\n", file=stderr()) + stop() +} + + +## Assign variables +UCDEREP <- opt$ucderep +UCPRECLUST <- opt$ucpreclust +UCCLUST <- opt$ucclust +OUTPUT <- opt$output +CPUTHREADS <- as.numeric( opt$threads ) + +## Log assigned variables +cat("\nParameters specified:\n") +cat(paste(" UC file from global dereplication: ", UCDEREP, "\n", sep="")) +cat(paste(" UC file from pre-clustering or denoising: ", UCPRECLUST, "\n", sep="")) +cat(paste(" UC file from clustering: ", UCCLUST, "\n", sep="")) +cat(paste(" Output file name: ", 
OUTPUT, "\n", sep="")) +cat(paste(" Number of CPU threads to use: ", CPUTHREADS, "\n", sep="")) + +cat("\n") + +## Data for debugging +# UCDEREP <- "UC_derep.parquet" +# UCPRECLUST <- "UC_preclust.parquet" # "NoPrecluster" +# UCCLUST <- "UC_clust.parquet" +# OUTPUT <- "UC_Pooled.parquet" +# CPUTHREADS <- 4 + +if(UCPRECLUST == "NoPrecluster") { + UCPRECLUST <- NA +} + + +###################################### +###################################### Load and process the data [duckdb] +###################################### + +## Initialize DuckDB connection +cat("..Initializing DuckDB connection\n") +con <- DBI::dbConnect(duckdb::duckdb()) +# duckdb::duckdb(dbdir = tempfile(pattern = "nextits.duckdb.") + +## Register parquet files as tables +cat("..Registering parquet files as tables\n") +cat("...Dereplication UC\n") +dbExecute(con, sprintf("CREATE VIEW derep_seqs AS SELECT * FROM parquet_scan('%s')", UCDEREP)) + +if(!is.na(UCPRECLUST)) { + cat("...Pre-clustering UC\n") + dbExecute(con, sprintf("CREATE VIEW preclust_seqs AS SELECT * FROM parquet_scan('%s')", UCPRECLUST)) +} + +cat("...Clustering UC\n") +dbExecute(con, sprintf("CREATE VIEW clust_seqs AS SELECT * FROM parquet_scan('%s')", UCCLUST)) + +## Set number of threads +cat("..Setting number of threads for DuckDB\n") +dbExecute(con, sprintf("SET threads TO %d;", CPUTHREADS)) + + +## Process and merge the data +if(is.na(UCPRECLUST)) { + + ## Two-file merge (no pre-clustering) + cat("..Merging UC files [no pre-clustering or denoising]\n") + dbExecute(con, sprintf(" + COPY ( + WITH derep AS ( + SELECT DISTINCT + query as SeqID, + target as DerepID + FROM derep_seqs + QUALIFY ROW_NUMBER() OVER (PARTITION BY query ORDER BY target) = 1 + ), + clust AS ( + SELECT DISTINCT + query as DerepID, + target as OTU + FROM clust_seqs + QUALIFY ROW_NUMBER() OVER (PARTITION BY query ORDER BY target) = 1 + ) + SELECT + d.SeqID, + d.DerepID, + c.OTU + FROM derep d + LEFT JOIN clust c ON d.DerepID = c.DerepID + ) TO '%s' + (FORMAT 
'parquet', COMPRESSION 'ZSTD', COMPRESSION_LEVEL 8)", + OUTPUT)) + +} else { + + ## Three-file merge (with pre-clustering) + cat("..Merging UC files [with pre-clustering or denoising]\n") + + dbExecute(con, sprintf(" + COPY ( + WITH derep AS ( + SELECT DISTINCT + query as SeqID, + target as DerepID + FROM derep_seqs + QUALIFY ROW_NUMBER() OVER (PARTITION BY query ORDER BY target) = 1 + ), + preclust AS ( + SELECT DISTINCT + query as DerepID, + target as PreclusterID + FROM preclust_seqs + QUALIFY ROW_NUMBER() OVER (PARTITION BY query ORDER BY target) = 1 + ), + clust AS ( + SELECT DISTINCT + query as PreclusterID, + target as OTU + FROM clust_seqs + QUALIFY ROW_NUMBER() OVER (PARTITION BY query ORDER BY target) = 1 + ) + SELECT + d.SeqID, + d.DerepID, + p.PreclusterID, + c.OTU + FROM derep d + LEFT JOIN preclust p ON d.DerepID = p.DerepID + LEFT JOIN clust c ON p.PreclusterID = c.PreclusterID + ) TO '%s' + (FORMAT 'parquet', COMPRESSION 'ZSTD', COMPRESSION_LEVEL 8)", + OUTPUT)) + +} + +## Clean up +cat("..Disconnecting from DuckDB\n") +dbDisconnect(con, shutdown = TRUE) + + +cat("..Done!\n") + + + +###################################### +###################################### Load and process the data [arrow + dplyr + data.table] +###################################### + +# ## Set number of threads for data.table +# cat("..Setting number of threads\n") +# setDTthreads(threads = CPUTHREADS) # for data.table +# set_cpu_count(CPUTHREADS) # for arrow +# +# ## Globally dereplicated sequences (remove multi-target matches) +# cat("..Loading globally dereplicated sequences\n") +# UCA <- open_dataset(UCDEREP) %>% +# rename(SeqID = query, DerepID = target) %>% +# to_duckdb() %>% +# distinct(SeqID, .keep_all = TRUE) %>% +# collect() %>% +# setDT() +# +# ## Pre-clustered sequences (remove multi-target matches) +# if(!is.na(UCPRECLUST)){ +# cat("..Loading pre-clustered sequences\n") +# UCP <- open_dataset(UCPRECLUST) %>% +# rename(DerepID = query, PreclusterID = target) %>% +# 
to_duckdb() %>% +# distinct(DerepID, .keep_all = TRUE) %>% +# collect() %>% +# setDT() +# } +# +# ## Clustered sequences (remove multi-target matches) +# cat("..Loading clustering UC file\n") +# if(!is.na(UCPRECLUST)){ +# UCO <- open_dataset(UCCLUST) %>% +# rename(PreclusterID = query, OTU = target) %>% +# to_duckdb() %>% +# distinct(PreclusterID, .keep_all = TRUE) %>% +# collect() %>% +# setDT() +# } else { +# UCO <- open_dataset(UCCLUST) %>% +# rename(DerepID = query, OTU = target) %>% +# to_duckdb() %>% +# distinct(DerepID, .keep_all = TRUE) %>% +# collect() %>% +# setDT() +# } +# +# ## Merge UC files +# if(is.na(UCPRECLUST)){ +# +# ## No pre-clustering or denoising +# cat("..Merging UC files [no pre-clustering or denoising ]\n") +# RES <- merge(x = UCA, y = UCO, by = "DerepID", all.x = TRUE) +# +# } else { +# +# ## Merge UC files with pre-clustering or denoising +# cat("..Merging UC files [with pre-clustering or denoising ]\n") +# +# cat("... Adding pre-cluster or denoised IDs\n") +# RES <- merge(x = UCA, y = UCP, by = "DerepID", all.x = TRUE) +# +# cat("... 
Adding clustering IDs\n") +# RES <- merge(x = RES, y = UCO, by = "PreclusterID", all.x = TRUE) +# +# } +# + diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/pool_seqs_clean_header.sh b/src/pipecraft-core/service_scripts/NextITS/bin/pool_seqs_clean_header.sh index b6cb87e..773efa4 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/pool_seqs_clean_header.sh +++ b/src/pipecraft-core/service_scripts/NextITS/bin/pool_seqs_clean_header.sh @@ -1,11 +1,11 @@ -#!/bin/bash - -# $1 = input file (full path) -# $2 = sample ID (e.g., basename of the input file) - -zcat "${1}" \ - | sed -r '/^>/ s/;sample=[^;]*/;/g ; s/;;/;/g' \ - | sed "s/>.*/&;sample=${2}; / ; s/_NoChimera.fa//g ; s/_RescuedChimera.fa//g ; s/_JoinedPE//g ; s/Rescued_Chimeric_sequences.part_//g" \ - | sed -r '/^>/ s/;;/;/g' - - +#!/bin/bash + +# $1 = input file (full path) +# $2 = sample ID (e.g., basename of the input file) + +zcat "${1}" \ + | sed -r '/^>/ s/;sample=[^;]*/;/g ; s/;;/;/g' \ + | sed "s/>.*/&;sample=${2}; / ; s/_NoChimera.fa//g ; s/_RescuedChimera.fa//g ; s/_JoinedPE//g ; s/Rescued_Chimeric_sequences.part_//g" \ + | sed -r '/^>/ s/;;/;/g' + + diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/quick_stats.R b/src/pipecraft-core/service_scripts/NextITS/bin/quick_stats.R index e90ff8c..a132c36 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/quick_stats.R +++ b/src/pipecraft-core/service_scripts/NextITS/bin/quick_stats.R @@ -1,241 +1,241 @@ -#!/usr/bin/env Rscript - -## Summarise number of reads (demultiplexed and primer-checked) - -# quick_stats.R \ -# --raw Counts_1.RawData.txt \ -# --qc Counts_2.QC.txt \ -# --demuxed Counts_3.Demux.txt \ -# --primer Counts_4.PrimerCheck.txt \ -# --primerartef Counts_4.PrimerArtefacts.txt \ -# --threads 4 - - - -############################################## Parse input parameters - -## Check time -start_time <- Sys.time() - - -cat("Parsing input options and arguments...\n") - 
-suppressPackageStartupMessages(require(optparse)) - -## Parse arguments -option_list <- list( - - make_option("--raw", action="store", default=NA, type='character', help="Raw read counts"), - make_option("--qc", action="store", default=NA, type='character', help="Counts of reads passed QC"), - make_option("--demuxed", action="store", default=NA, type='character', help="Counts of demultiplexed reads"), - make_option("--primer", action="store", default=NA, type='character', help="Counts of reads with both primers detected"), - make_option("--primerartef",action="store", default=NA, type='character', help="Counts of primer artefacts"), - make_option(c("-t", "--threads"), action="store", default=4L, type='integer', help="Number of CPU threads for arrow, default 4") -) -opt <- parse_args(OptionParser(option_list=option_list)) - - -## Validation of the required argiments -# if(is.na(opt$raw)){ -# cat("Input file is not specified: ....\n", file=stderr()) -# stop() -# } - - - -## Function to convert text "NA"s to NA -to_na <- function(x){ - if(x %in% c("NA", "null", "Null")){ x <- NA } - return(x) -} - -## Assign variables -RAW <- opt$raw -QC <- opt$qc -DEMUXED <- opt$demuxed -PRIMER <- opt$primer -PRIMERARTEF <- opt$primerartef -CPUTHREADS <- as.numeric( opt$threads ) - -## Log assigned variables -cat(paste("Counts - RawData: " , RAW, "\n", sep="")) -cat(paste("Counts - QC: " , QC, "\n", sep="")) -cat(paste("Counts - Demux: " , DEMUXED, "\n", sep="")) -cat(paste("Counts - PrimerCheck: " , PRIMER, "\n", sep="")) -cat(paste("Counts - Primer Artefacts: " , PRIMERARTEF, "\n", sep="")) -cat(paste("Number of CPU threads to use: ", CPUTHREADS, "\n", sep="")) - -cat("\n") - - -############################################## data for debuging - -# RAW <- "Counts_1.RawData.txt" -# QC <- "Counts_2.QC.txt" -# DEMUXED <- "Counts_3.Demux.txt" -# PRIMER <- "Counts_4.PrimerCheck.txt" -# PRIMERARTEF <- "Counts_4.PrimerArtefacts.txt" -# CPUTHREADS <- 6 - - 
-############################################## Load packages and data - -cat("Loading R packages...\n") - -load_pckg <- function(pkg = "data.table"){ - suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) - cat(paste(pkg, packageVersion(pkg), "\n")) -} - -load_pckg("data.table") -load_pckg("plyr") -load_pckg("metagMisc") -load_pckg("openxlsx") - -cat("\n") - -## Set CPU thread number -cat("Setting number of CPU threads to: ", CPUTHREADS, "\n") -setDTthreads(threads = CPUTHREADS) # for data.table - - -###################################### -###################################### Load the data -###################################### - -cat("\nLoading input data\n") - - -#### Per-dataset stats - -## Load ASV table -cat("..Loading raw counts\n") -RAW <- fread(RAW) - -cat("..Loading QC counts\n") -QC <- fread(QC) - -#### Per-sample stats - -SEQKITCOUNTS <- list() - -cat("..Loading demux counts\n") -SEQKITCOUNTS$DEMUXED <- fread(DEMUXED) - -cat("..Loading primer-checked data counts\n") -SEQKITCOUNTS$PRIMER <- fread(PRIMER) -SEQKITCOUNTS$PRIMERARTEF <- fread(PRIMERARTEF) - - -## Remove NULL-files -null_seqk <- laply(.data = SEQKITCOUNTS, .fun = nrow) - -if(any(null_seqk == 0)){ - cat("Some files with counts are missing:\n") - to_rm <- which(null_seqk == 0) - cat(".. 
", paste(names(SEQKITCOUNTS)[ to_rm ], collapse = ", "), "\n") - SEQKITCOUNTS[ to_rm ] <- NULL - rm(to_rm) -} - - -## Process seqkit counts -seqkit_process <- function(x){ - if(nrow(x) > 0){ - - ## Remove reudndant columns - x <- x[ , .(file, num_seqs) ] - - ## Remove file extensions - x[ , file := sub(pattern = ".fastq.gz", replacement = "", x = file) ] - x[ , file := sub(pattern = ".fq.gz", replacement = "", x = file) ] - x[ , file := sub(pattern = ".fa.gz", replacement = "", x = file) ] - x[ , file := sub(pattern = "_PrimerChecked$", replacement = "", x = file) ] - x[ , file := sub(pattern = "_PrimerArtefacts$",replacement = "", x = file) ] - - } - return(x) -} - - -cat("Processing data\n") -SEQKITCOUNTS <- llply(.data = SEQKITCOUNTS, .fun = seqkit_process) - -## Rename columns -if(!is.null(SEQKITCOUNTS$DEMUXED)){ -setnames(x = SEQKITCOUNTS$DEMUXED, old = "num_seqs", new = "Demultiplexed_Reads", skip_absent = TRUE) -} -if(!is.null(SEQKITCOUNTS$PRIMER)){ -setnames(x = SEQKITCOUNTS$PRIMER, old = "num_seqs", new = "PrimerChecked_Reads", skip_absent = TRUE) -} -if(!is.null(SEQKITCOUNTS$PRIMERARTEF)){ -setnames(x = SEQKITCOUNTS$PRIMERARTEF, old = "num_seqs", new = "PrimerArtefacts_Reads", skip_absent = TRUE) -} - -## Merge seqkit and custom counts into a single list -cat("Pooling per-sample counts\n") -COUNTS <- SEQKITCOUNTS - -## Pool per-file estimates -merge_dt <- function(x,y){ merge(x, y, by = "file", all = TRUE) } -PER_SAMPLE_COUNTS_merged <- Reduce(f = merge_dt, x = COUNTS) - -## Estimate percentage of multiprimer artefacts -PER_SAMPLE_COUNTS_merged[ , - PrimerArtefacts_Percent := round( - PrimerArtefacts_Reads / (PrimerChecked_Reads + PrimerArtefacts_Reads) * 100, - 2) - ] - - -### ... update -# .. replace NAs with zero -# .. reorder columns -# .. estimate percentages -# .. add tag-jump summary -# .. add final counts from the Seq table -# .. 
add positive / negative counts (based on default sample names) - -## Prepare per-run stats -PER_RUN_COUNTS_merged <- data.table( - Total_Number_Of_Reads = RAW$num_seqs, - Reads_Passed_QC = QC$num_seqs, - Reads_Demultiplexed = sum(PER_SAMPLE_COUNTS_merged$Demultiplexed_Reads, na.rm = TRUE), - Reads_PrimerChecked = sum(PER_SAMPLE_COUNTS_merged$PrimerChecked_Reads, na.rm = TRUE) - ) - -## Estimate percentages -cat("..Estimating per-run percentages\n") -PER_RUN_COUNTS_merged[ , Percentage_QC := - round(Reads_Passed_QC / Total_Number_Of_Reads * 100, 1) ] - -PER_RUN_COUNTS_merged[ , Percentage_Demultiplexed := - round(Reads_Demultiplexed / Total_Number_Of_Reads * 100, 1) ] - -PER_RUN_COUNTS_merged[ , Percentage_Passed := - round(Reads_PrimerChecked / Total_Number_Of_Reads * 100, 1) ] - - -## Export summary stats -write.xlsx(list( - "per_sample" = PER_SAMPLE_COUNTS_merged, - "per_run" = PER_RUN_COUNTS_merged - ), - file = "Run_summary.xlsx", colNames = TRUE) - - -cat("\nAll done.\n") - - -##################### Session info - -## Check time -end_time <- Sys.time() - -tmm <- as.numeric(difftime(end_time, start_time, units = "min")) -cat("\nElapsed time: ", tmm, " minutes\n") - -cat("\n") -cat("Session info:\n") -sessionInfo() -cat("\n") +#!/usr/bin/env Rscript + +## Summarise number of reads (demultiplexed and primer-checked) + +# quick_stats.R \ +# --raw Counts_1.RawData.txt \ +# --qc Counts_2.QC.txt \ +# --demuxed Counts_3.Demux.txt \ +# --primer Counts_4.PrimerCheck.txt \ +# --primerartef Counts_4.PrimerArtefacts.txt \ +# --threads 4 + + + +############################################## Parse input parameters + +## Check time +start_time <- Sys.time() + + +cat("Parsing input options and arguments...\n") + +suppressPackageStartupMessages(require(optparse)) + +## Parse arguments +option_list <- list( + + make_option("--raw", action="store", default=NA, type='character', help="Raw read counts"), + make_option("--qc", action="store", default=NA, type='character', 
help="Counts of reads passed QC"), + make_option("--demuxed", action="store", default=NA, type='character', help="Counts of demultiplexed reads"), + make_option("--primer", action="store", default=NA, type='character', help="Counts of reads with both primers detected"), + make_option("--primerartef",action="store", default=NA, type='character', help="Counts of primer artefacts"), + make_option(c("-t", "--threads"), action="store", default=4L, type='integer', help="Number of CPU threads for arrow, default 4") +) +opt <- parse_args(OptionParser(option_list=option_list)) + + +## Validation of the required argiments +# if(is.na(opt$raw)){ +# cat("Input file is not specified: ....\n", file=stderr()) +# stop() +# } + + + +## Function to convert text "NA"s to NA +to_na <- function(x){ + if(x %in% c("NA", "null", "Null")){ x <- NA } + return(x) +} + +## Assign variables +RAW <- opt$raw +QC <- opt$qc +DEMUXED <- opt$demuxed +PRIMER <- opt$primer +PRIMERARTEF <- opt$primerartef +CPUTHREADS <- as.numeric( opt$threads ) + +## Log assigned variables +cat(paste("Counts - RawData: " , RAW, "\n", sep="")) +cat(paste("Counts - QC: " , QC, "\n", sep="")) +cat(paste("Counts - Demux: " , DEMUXED, "\n", sep="")) +cat(paste("Counts - PrimerCheck: " , PRIMER, "\n", sep="")) +cat(paste("Counts - Primer Artefacts: " , PRIMERARTEF, "\n", sep="")) +cat(paste("Number of CPU threads to use: ", CPUTHREADS, "\n", sep="")) + +cat("\n") + + +############################################## data for debuging + +# RAW <- "Counts_1.RawData.txt" +# QC <- "Counts_2.QC.txt" +# DEMUXED <- "Counts_3.Demux.txt" +# PRIMER <- "Counts_4.PrimerCheck.txt" +# PRIMERARTEF <- "Counts_4.PrimerArtefacts.txt" +# CPUTHREADS <- 6 + + +############################################## Load packages and data + +cat("Loading R packages...\n") + +load_pckg <- function(pkg = "data.table"){ + suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) + cat(paste(pkg, packageVersion(pkg), "\n")) +} + 
+load_pckg("data.table") +load_pckg("plyr") +load_pckg("metagMisc") +load_pckg("openxlsx") + +cat("\n") + +## Set CPU thread number +cat("Setting number of CPU threads to: ", CPUTHREADS, "\n") +setDTthreads(threads = CPUTHREADS) # for data.table + + +###################################### +###################################### Load the data +###################################### + +cat("\nLoading input data\n") + + +#### Per-dataset stats + +## Load ASV table +cat("..Loading raw counts\n") +RAW <- fread(RAW) + +cat("..Loading QC counts\n") +QC <- fread(QC) + +#### Per-sample stats + +SEQKITCOUNTS <- list() + +cat("..Loading demux counts\n") +SEQKITCOUNTS$DEMUXED <- fread(DEMUXED) + +cat("..Loading primer-checked data counts\n") +SEQKITCOUNTS$PRIMER <- fread(PRIMER) +SEQKITCOUNTS$PRIMERARTEF <- fread(PRIMERARTEF) + + +## Remove NULL-files +null_seqk <- laply(.data = SEQKITCOUNTS, .fun = nrow) + +if(any(null_seqk == 0)){ + cat("Some files with counts are missing:\n") + to_rm <- which(null_seqk == 0) + cat(".. 
", paste(names(SEQKITCOUNTS)[ to_rm ], collapse = ", "), "\n") + SEQKITCOUNTS[ to_rm ] <- NULL + rm(to_rm) +} + + +## Process seqkit counts +seqkit_process <- function(x){ + if(nrow(x) > 0){ + + ## Remove reudndant columns + x <- x[ , .(file, num_seqs) ] + + ## Remove file extensions + x[ , file := sub(pattern = ".fastq.gz", replacement = "", x = file) ] + x[ , file := sub(pattern = ".fq.gz", replacement = "", x = file) ] + x[ , file := sub(pattern = ".fa.gz", replacement = "", x = file) ] + x[ , file := sub(pattern = "_PrimerChecked$", replacement = "", x = file) ] + x[ , file := sub(pattern = "_PrimerArtefacts$",replacement = "", x = file) ] + + } + return(x) +} + + +cat("Processing data\n") +SEQKITCOUNTS <- llply(.data = SEQKITCOUNTS, .fun = seqkit_process) + +## Rename columns +if(!is.null(SEQKITCOUNTS$DEMUXED)){ +setnames(x = SEQKITCOUNTS$DEMUXED, old = "num_seqs", new = "Demultiplexed_Reads", skip_absent = TRUE) +} +if(!is.null(SEQKITCOUNTS$PRIMER)){ +setnames(x = SEQKITCOUNTS$PRIMER, old = "num_seqs", new = "PrimerChecked_Reads", skip_absent = TRUE) +} +if(!is.null(SEQKITCOUNTS$PRIMERARTEF)){ +setnames(x = SEQKITCOUNTS$PRIMERARTEF, old = "num_seqs", new = "PrimerArtefacts_Reads", skip_absent = TRUE) +} + +## Merge seqkit and custom counts into a single list +cat("Pooling per-sample counts\n") +COUNTS <- SEQKITCOUNTS + +## Pool per-file estimates +merge_dt <- function(x,y){ merge(x, y, by = "file", all = TRUE) } +PER_SAMPLE_COUNTS_merged <- Reduce(f = merge_dt, x = COUNTS) + +## Estimate percentage of multiprimer artefacts +PER_SAMPLE_COUNTS_merged[ , + PrimerArtefacts_Percent := round( + PrimerArtefacts_Reads / (PrimerChecked_Reads + PrimerArtefacts_Reads) * 100, + 2) + ] + + +### ... update +# .. replace NAs with zero +# .. reorder columns +# .. estimate percentages +# .. add tag-jump summary +# .. add final counts from the Seq table +# .. 
add positive / negative counts (based on default sample names) + +## Prepare per-run stats +PER_RUN_COUNTS_merged <- data.table( + Total_Number_Of_Reads = RAW$num_seqs, + Reads_Passed_QC = QC$num_seqs, + Reads_Demultiplexed = sum(PER_SAMPLE_COUNTS_merged$Demultiplexed_Reads, na.rm = TRUE), + Reads_PrimerChecked = sum(PER_SAMPLE_COUNTS_merged$PrimerChecked_Reads, na.rm = TRUE) + ) + +## Estimate percentages +cat("..Estimating per-run percentages\n") +PER_RUN_COUNTS_merged[ , Percentage_QC := + round(Reads_Passed_QC / Total_Number_Of_Reads * 100, 1) ] + +PER_RUN_COUNTS_merged[ , Percentage_Demultiplexed := + round(Reads_Demultiplexed / Total_Number_Of_Reads * 100, 1) ] + +PER_RUN_COUNTS_merged[ , Percentage_Passed := + round(Reads_PrimerChecked / Total_Number_Of_Reads * 100, 1) ] + + +## Export summary stats +write.xlsx(list( + "per_sample" = PER_SAMPLE_COUNTS_merged, + "per_run" = PER_RUN_COUNTS_merged + ), + file = "Run_summary.xlsx", colNames = TRUE) + + +cat("\nAll done.\n") + + +##################### Session info + +## Check time +end_time <- Sys.time() + +tmm <- as.numeric(difftime(end_time, start_time, units = "min")) +cat("\nElapsed time: ", tmm, " minutes\n") + +cat("\n") +cat("Session info:\n") +sessionInfo() +cat("\n") diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/rc.sh b/src/pipecraft-core/service_scripts/NextITS/bin/rc.sh index 78acb80..77f6ee6 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/rc.sh +++ b/src/pipecraft-core/service_scripts/NextITS/bin/rc.sh @@ -1,9 +1,9 @@ -#!/bin/bash - -## Function to reverse-complement DNA sequences (with the support of IUPAC codes) - -echo "$1" \ - | tr \ - "[ATGCUatgcuNnYyRrSsWwKkMmBbDdHhVv]" \ - "[TACGAtacgaNnRrYySsWwMmKkVvHhDdBb]" \ - | rev +#!/bin/bash + +## Function to reverse-complement DNA sequences (with the support of IUPAC codes) + +echo "$1" \ + | tr \ + "[ATGCUatgcuNnYyRrSsWwKkMmBbDdHhVv]" \ + "[TACGAtacgaNnRrYySsWwMmKkVvHhDdBb]" \ + | rev diff --git 
a/src/pipecraft-core/service_scripts/NextITS/bin/read_count_summary.R b/src/pipecraft-core/service_scripts/NextITS/bin/read_count_summary.R index 9bb4bee..ea399aa 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/read_count_summary.R +++ b/src/pipecraft-core/service_scripts/NextITS/bin/read_count_summary.R @@ -1,500 +1,500 @@ -#!/usr/bin/env Rscript - -## Summarise number of reads per process - -# read_count_summary.R \ -# --raw Counts_1.RawData.txt \ -# --qc Counts_2.QC.txt \ -# --demuxed Counts_3.Demux.txt \ -# --primer Counts_4.PrimerCheck.txt \ -# --primerartef Counts_4.PrimerArtefacts.txt \ -# --itsx Counts_5.ITSx_or_PrimTrim.txt \ -# --homopolymer Counts_5.Homopolymers.txt \ -# --chimrefn Counts_6.ChimRef_reads.txt \ -# --chimrefu Counts_6.ChimRef_uniqs.txt \ -# --chimdenovo Counts_7.ChimDenov.txt \ -# --chimrecovn Counts_8.ChimRecov_reads.txt \ -# --chimrecovu Counts_8.ChimRecov_uniqs.txt \ -# --tj TagJump_scores.qs \ -# --seqtab Seqs.parquet \ -# --maxchim 0.6 \ -# --threads 4 - - -############################################## Parse input parameters - -## Check time -start_time <- Sys.time() - - -cat("\nParsing input options and arguments...\n") - -suppressPackageStartupMessages(require(optparse)) - -## Parse arguments -option_list <- list( - - make_option("--raw", action="store", default=NA, type='character', help="Raw read counts"), - make_option("--qc", action="store", default=NA, type='character', help="Counts of reads passed QC"), - make_option("--demuxed", action="store", default=NA, type='character', help="Counts of demultiplexed reads"), - make_option("--primer", action="store", default=NA, type='character', help="Counts of reads with both primers detected"), - make_option("--primerartef",action="store", default=NA, type='character', help="Counts of primer artefacts"), - make_option("--itsx", action="store", default=NA, type='character', help="Read counts after ITSx or primer removal"), - make_option("--homopolymer",action="store", 
default=NA, type='character', help="Homopolymer correction results"), - make_option("--chimrefn", action="store", default=NA, type='character', help="Number of reads for reference-based chimeras"), - make_option("--chimrefu", action="store", default=NA, type='character', help="Number of unique sequences detected as reference-based chimeras"), - make_option("--chimdenovo", action="store", default=NA, type='character', help="Number of de novo chimeras"), - make_option("--chimrecovn", action="store", default=NA, type='character', help="Number of resued reads for de novo chimeras (false positives)"), - make_option("--chimrecovu", action="store", default=NA, type='character', help="Number of resued unique sequences detected as de novo chimeras (false positives)"), - make_option("--tj", action="store", default=NA, type='character', help="Tag jump removal data (serialized in qs format)"), - make_option("--seqtab", action="store", default=NA, type='character', help="Final seq table (Parquet format)"), - make_option("--maxchim", action="store", default=0.6, type='numeric', help = "Maximum de novo chimera score to remove"), - make_option(c("-t", "--threads"), action="store", default=4L, type='integer', help="Number of CPU threads for arrow, default 4") -) -opt <- parse_args(OptionParser(option_list=option_list)) - - -## Validation of the required argiments -# if(is.na(opt$raw)){ -# cat("Input file is not specified: ....\n", file=stderr()) -# stop() -# } - - - -## Function to convert text "NA"s to NA -to_na <- function(x){ - if(x %in% c("NA", "null", "Null")){ x <- NA } - return(x) -} - -## Assign variables -RAW <- opt$raw -QC <- opt$qc -DEMUXED <- opt$demuxed -PRIMER <- opt$primer -PRIMERARTEF <- opt$primerartef -ITSX <- opt$itsx -HOMOPOLY <- opt$homopolymer -CHIMREFN <- opt$chimrefn -CHIMREFU <- opt$chimrefu -CHIMDENOVO <- opt$chimdenovo -CHIMRECOVN <- opt$chimrecovn -CHIMRECOVU <- opt$chimrecovu -TJ <- opt$tj -SEQTAB <- opt$seqtab -MAXCHIM <- opt$maxchim -CPUTHREADS <- 
as.numeric( opt$threads ) - -## Log assigned variables -cat(paste("Counts - RawData: " , RAW, "\n", sep="")) -cat(paste("Counts - QC: " , QC, "\n", sep="")) -cat(paste("Counts - Demux: " , DEMUXED, "\n", sep="")) -cat(paste("Counts - PrimerCheck: " , PRIMER, "\n", sep="")) -cat(paste("Counts - Primer Artefacts: " , PRIMERARTEF, "\n", sep="")) -cat(paste("Counts - ITSx or Primer Trim: " , ITSX, "\n", sep="")) -cat(paste("Counts - Homopolymer correction results: " , HOMOPOLY, "\n", sep="")) -cat(paste("Counts - Chimera Ref-based, reads: " , CHIMREFN, "\n", sep="")) -cat(paste("Counts - Chimera Ref-based, unique sequences: " , CHIMREFU, "\n", sep="")) -cat(paste("Counts - Chimera de novo: " , CHIMDENOVO, "\n", sep="")) -cat(paste("Counts - Chimera Ref-based recoverd, reads: " , CHIMRECOVN, "\n", sep="")) -cat(paste("Counts - Chimera Ref-based recoverd, unique sequences: " , CHIMRECOVU, "\n", sep="")) -cat(paste("Tag-jump data: " , TJ, "\n", sep="")) -cat(paste("Final sequence table: " , SEQTAB, "\n", sep="")) -cat(paste("Maximum de novo chimera score: ", MAXCHIM, "\n", sep="")) -cat(paste("Number of CPU threads to use: ", CPUTHREADS, "\n", sep="")) - -cat("\n") - - -############################################## data for debuging - -# RAW <- "Counts_1.RawData.txt" -# QC <- "Counts_2.QC.txt" -# DEMUXED <- "Counts_3.Demux.txt" -# PRIMER <- "Counts_4.PrimerCheck.txt" -# PRIMERARTEF <- "Counts_4.PrimerArtefacts.txt" -# ITSX <- "Counts_5.ITSx_or_PrimTrim.txt" -# HOMOPOLY <- "Counts_5.Homopolymers.txt" -# CHIMREFN <- "Counts_6.ChimRef_reads.txt" -# CHIMREFU <- "Counts_6.ChimRef_uniqs.txt" -# CHIMDENOVO <- "Counts_7.ChimDenov.txt" -# CHIMRECOVN <- "Counts_8.ChimRecov_reads.txt" -# CHIMRECOVU <- "Counts_8.ChimRecov_uniqs.txt" -# TJ <- "TagJump_scores.qs" -# SEQTAB <- "Seqs.parquet" -# MAXCHIM <- 0.6 -# CPUTHREADS <- 6 - - -############################################## Load packages and data - -cat("Loading R packages...\n") - -load_pckg <- function(pkg = "data.table"){ - 
suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) - cat(paste(pkg, packageVersion(pkg), "\n")) -} - -load_pckg("data.table") -load_pckg("plyr") -load_pckg("arrow") -# load_pckg("dplyr") -load_pckg("metagMisc") -load_pckg("openxlsx") - -cat("\n") - -## Set CPU thread number -cat("Setting number of CPU threads to: ", CPUTHREADS, "\n") -setDTthreads(threads = CPUTHREADS) # for data.table - - -###################################### -###################################### Load the data -###################################### - -cat("\nLoading input data\n") - - -#### Per-dataset stats - -## Load ASV table -cat("..Loading raw counts\n") -RAW <- fread(RAW) - -cat("..Loading QC counts\n") -QC <- fread(QC) - -#### Per-sample stats - -SEQKITCOUNTS <- list() -CUSTOMCOUNTS <- list() - -cat("..Loading demux counts\n") -SEQKITCOUNTS$DEMUXED <- fread(DEMUXED) - -cat("..Loading primer-checked data counts\n") -SEQKITCOUNTS$PRIMER <- fread(PRIMER) -SEQKITCOUNTS$PRIMERARTEF <- fread(PRIMERARTEF) - -cat("..Loading ITSx or primer trim counts\n") -CUSTOMCOUNTS$ITSX <- fread(ITSX) - -cat("..Loading homopolymer correction results\n") -HOMOPOLY_data <- fread(HOMOPOLY) - -cat("..Loading ref-based chimera counts\n") -CUSTOMCOUNTS$CHIMREFN <- fread(CHIMREFN) -SEQKITCOUNTS$CHIMREFU <- fread(CHIMREFU) - -cat("..Loading de novo chimera counts\n") -CHIMDENOVO <- fread(CHIMDENOVO) # incorporate to the main table - -cat("..Loading rescued ref-based chimera counts\n") -CUSTOMCOUNTS$CHIMRECOVN <- fread(CHIMRECOVN) -SEQKITCOUNTS$CHIMRECOVU <- fread(CHIMRECOVU) - -if(!is.na(TJ) && TJ != "no_tj" && file.exists(TJ)){ - cat("..Loading tag-jump filtration data\n") - TJ <- qs2::qs_read(TJ) - tjdata <- TRUE -} else { - cat("..No tag-jump filtration data found\n") - tjdata <- FALSE -} - -cat("..Loading sequence table\n") -SEQTAB <- arrow::open_dataset(SEQTAB) - - -## Remove NULL-files -null_cust <- laply(.data = CUSTOMCOUNTS, .fun = nrow) -null_seqk <- laply(.data = SEQKITCOUNTS, 
.fun = nrow) - -if(any(null_cust == 0)){ - cat("Some files with counts are missing:\n") - to_rm <- which(null_cust == 0) - cat(".. ", paste(names(CUSTOMCOUNTS)[ to_rm ], collapse = ", "), "\n") - CUSTOMCOUNTS[ to_rm ] <- NULL - rm(to_rm) -} - -if(any(null_seqk == 0)){ - cat("Some files with counts are missing:\n") - to_rm <- which(null_seqk == 0) - cat(".. ", paste(names(SEQKITCOUNTS)[ to_rm ], collapse = ", "), "\n") - SEQKITCOUNTS[ to_rm ] <- NULL - rm(to_rm) -} - - -## Process seqkit counts -seqkit_process <- function(x){ - if(nrow(x) > 0){ - - ## Remove reudndant columns - x <- x[ , .(file, num_seqs) ] - - ## Remove file extensions - x[ , file := sub(pattern = ".fastq.gz$", replacement = "", x = file) ] - x[ , file := sub(pattern = ".fq.gz$", replacement = "", x = file) ] - x[ , file := sub(pattern = ".fa.gz$", replacement = "", x = file) ] - x[ , file := sub(pattern = ".full.fasta$", replacement = "", x = file) ] - x[ , file := sub(pattern = ".ITS1.fasta.gz$", replacement = "", x = file) ] - x[ , file := sub(pattern = ".ITS2.fasta.gz$", replacement = "", x = file) ] - x[ , file := sub(pattern = "_PrimerChecked$", replacement = "", x = file) ] - x[ , file := sub(pattern = "_PrimerArtefacts$", replacement = "", x = file) ] - x[ , file := sub(pattern = "_Chimera$", replacement = "", x = file) ] - x[ , file := sub(pattern = "_RescuedChimera$", replacement = "", x = file) ] - x[ , file := sub(pattern = "^Rescued_Chimeric_sequences.part_", replacement = "", x = file) ] - - } - return(x) -} - -## Process custom counts -custom_process <- function(x){ - if(nrow(x) > 0){ - - ## There should be just two columns - `SampleID` & `NumReads` - - ## Rename "SampleID" into "file" - setnames(x = x, old = "SampleID", new = "file") - - ## Remove file extensions - x[ , file := sub(pattern = ".full.fasta$", replacement = "", x = file) ] - x[ , file := sub(pattern = "_ITS1_58S_ITS2.fasta$", replacement = "", x = file) ] - x[ , file := sub(pattern = "_Chimera.fa$", replacement = "", x 
= file) ] - x[ , file := sub(pattern = "_RescuedChimera.fa$", replacement = "", x = file) ] - x[ , file := sub(pattern = "^Rescued_Chimeric_sequences.part_", replacement = "", x = file) ] - - } - return(x) -} - - -cat("Processing data\n") -SEQKITCOUNTS <- llply(.data = SEQKITCOUNTS, .fun = seqkit_process) -CUSTOMCOUNTS <- llply(.data = CUSTOMCOUNTS, .fun = custom_process) - -cat("Estimating homopolymer stats\n") -if(nrow(HOMOPOLY_data) > 0){ - HOMOPOLY_counts <- HOMOPOLY_data[ , .( - N_UniqSequences_AfterITSx_or_PrimerTrimming = .N, - N_UniqSequences_AfterHomopolymerCorrection = length(unique(Target)) - ), - by = "SampleID" ] -} else { - cat("..No homopolymer correction data found\n") -} - -# HOMOPOLY_counts[, Num_HomopolymerCorrectedSequences := -# N_UniqSequences_AfterITSx_or_PrimerTrimming - N_UniqSequences_AfterHomopolymerCorrection ] - - -## Rename columns -if(!is.null(SEQKITCOUNTS$DEMUXED)){ -setnames(x = SEQKITCOUNTS$DEMUXED, old = "num_seqs", new = "Demultiplexed_Reads", skip_absent = TRUE) -} -if(!is.null(SEQKITCOUNTS$PRIMER)){ -setnames(x = SEQKITCOUNTS$PRIMER, old = "num_seqs", new = "PrimerChecked_Reads", skip_absent = TRUE) -} -if(!is.null(SEQKITCOUNTS$PRIMERARTEF)){ -setnames(x = SEQKITCOUNTS$PRIMERARTEF, old = "num_seqs", new = "PrimerArtefacts_Reads", skip_absent = TRUE) -} -if(!is.null(SEQKITCOUNTS$CHIMREFU)){ -setnames(x = SEQKITCOUNTS$CHIMREFU, old = "num_seqs", new = "ReferenceBasedChimera_NumUniqSequences", skip_absent = TRUE) -} -if(!is.null(SEQKITCOUNTS$CHIMRECOVU)){ -setnames(x = SEQKITCOUNTS$CHIMRECOVU, old = "num_seqs", new = "Recovered_ReferenceBasedChimea_NumUniqSequences", skip_absent = TRUE) -} - -if(!is.null(CUSTOMCOUNTS$ITSX)){ -setnames(x = CUSTOMCOUNTS$ITSX, old = "NumReads", new = "ITSx_Extracted_Reads", skip_absent = TRUE) -} -if(!is.null(CUSTOMCOUNTS$CHIMREFN)){ -setnames(x = CUSTOMCOUNTS$CHIMREFN, old = "NumReads", new = "ReferenceBasedChimera_Reads", skip_absent = TRUE) -} -if(!is.null(CUSTOMCOUNTS$CHIMRECOVN)){ -setnames(x = 
CUSTOMCOUNTS$CHIMRECOVN, old = "NumReads", new = "Recovered_ReferenceBasedChimea_Reads", skip_absent = TRUE) -} - -## Merge seqkit and custom counts into a single list -cat("Pooling per-sample counts\n") -COUNTS <- c(SEQKITCOUNTS, CUSTOMCOUNTS) - -## Pool per-file estimates -merge_dt <- function(x,y){ merge(x, y, by = "file", all = TRUE) } -PER_SAMPLE_COUNTS_merged <- Reduce(f = merge_dt, x = COUNTS) - -## If there are no primer artefacts -if(is.null(SEQKITCOUNTS$PRIMERARTEF)){ - PER_SAMPLE_COUNTS_merged[ , PrimerArtefacts_Reads := 0 ] -} - -## Estimate percentage of primer artefacts -cat("Estimating percentage of primer artefacts\n") -PER_SAMPLE_COUNTS_merged[ , - PrimerArtefacts_Percent := round( - PrimerArtefacts_Reads / (PrimerChecked_Reads + PrimerArtefacts_Reads) * 100, - 2) - ] - -## Estimate tag-jump stats -if(tjdata == TRUE){ - cat("Estimating tag-jump removal yields\n") - TJ_stats <- TJ[ TagJump == TRUE, .( - TagJump_Events = .N, - TagJump_Reads = sum(Abundance, na.rm = TRUE)), - by = "SampleID" ] - - if(nrow(TJ_stats) > 0){ - PER_SAMPLE_COUNTS_merged <- merge( - x = PER_SAMPLE_COUNTS_merged, - y = TJ_stats, - by.x = "file", by.y = "SampleID", all.x = TRUE) - } else { - PER_SAMPLE_COUNTS_merged[ , TagJump_Events := 0 ] - PER_SAMPLE_COUNTS_merged[ , TagJump_Reads := 0 ] - } -} - -## Add homopolymer stats -if(nrow(HOMOPOLY_data) > 0){ - cat("Adding homopolymer stats\n") - PER_SAMPLE_COUNTS_merged <- merge( - x = PER_SAMPLE_COUNTS_merged, - y = HOMOPOLY_counts, - by.x = "file", by.y = "SampleID", all.x = TRUE) -} - -## Add de novo chimera stats -cat("Adding de novo chimera stats\n") -denovo_stats <- SEQTAB |> - dplyr::filter(DeNovo_Chimera_Score >= MAXCHIM) |> - dplyr::group_by(SampleID) |> - dplyr::summarize( - DeNovoChimeras_NumReads = sum(Abundance, na.rm = TRUE), - DeNovoChimeras_NumUniqSeqs = n()) |> - dplyr::collect() |> - setDT() - -PER_SAMPLE_COUNTS_merged <- merge( - x = PER_SAMPLE_COUNTS_merged, - y = denovo_stats, - by.x = "file", by.y = 
"SampleID", all.x = TRUE) - - - -## Number of reads and unique sequences in the sequence table (per sample) -## Excluding putative de novo chimeras (with score >= MAXCHIM) -cat("Adding sequence table stats\n") -seqtab_stats <- SEQTAB |> - dplyr::filter(DeNovo_Chimera_Score < MAXCHIM | is.na(DeNovo_Chimera_Score) ) |> - dplyr::group_by(SampleID) |> - dplyr::summarize( - SeqTable_NumReads = sum(Abundance, na.rm = TRUE), - SeqTable_NumUniqSeqs = n()) |> - dplyr::collect() |> - setDT() - -PER_SAMPLE_COUNTS_merged <- merge( - x = PER_SAMPLE_COUNTS_merged, - y = seqtab_stats, - by.x = "file", by.y = "SampleID", all.x = TRUE) - - -## Replace NAs with zeros -cat("Replacing NAs with zero\n") -for (j in seq_len(ncol(PER_SAMPLE_COUNTS_merged))){ - set(PER_SAMPLE_COUNTS_merged, which(is.na(PER_SAMPLE_COUNTS_merged[[j]])), j, 0) -} -rm(j) - -## Estimate percentage of reads retained (starting from demultiplexed reads) -cat("Estimating percentage of reads retained\n") -PER_SAMPLE_COUNTS_merged[ , Percentage_Reads_Retained := round( SeqTable_NumReads / Demultiplexed_Reads * 100, 2) ] - -## Estimate percentage of reads retained after ITSx -if("ITSx_Extracted_Reads" %in% colnames(PER_SAMPLE_COUNTS_merged)){ - PER_SAMPLE_COUNTS_merged[ , ITSx_Yield_Percent := round( ITSx_Extracted_Reads / PrimerChecked_Reads * 100, 2) ] -} - -### TODO: -# .. estimate percentages -# .. 
add per-run positive / negative counts (based on default sample names) - - -## Reorder columns -setcolorder(PER_SAMPLE_COUNTS_merged, - skip_absent = TRUE, - neworder = c( - "file", "Demultiplexed_Reads", - "PrimerChecked_Reads", "PrimerArtefacts_Reads", "PrimerArtefacts_Percent", - "ReferenceBasedChimera_Reads", "ReferenceBasedChimera_NumUniqSequences", - "Recovered_ReferenceBasedChimea_Reads", "Recovered_ReferenceBasedChimea_NumUniqSequences", - "DeNovoChimeras_NumReads", "DeNovoChimeras_NumUniqSeqs", - "ITSx_Extracted_Reads", "ITSx_Yield_Percent", - "N_UniqSequences_AfterITSx_or_PrimerTrimming", - "N_UniqSequences_AfterHomopolymerCorrection", - # "Num_HomopolymerCorrectedSequences", - "TagJump_Reads", "TagJump_Events", - "SeqTable_NumReads", "SeqTable_NumUniqSeqs", - "Percentage_Reads_Retained")) - - - - -## Prepare per-run stats -cat("Preparing per-run stats\n") -PER_RUN_COUNTS_merged <- data.table( - Total_Number_Of_Reads = sum(RAW$num_seqs, na.rm = TRUE), - Reads_Demultiplexed = sum(PER_SAMPLE_COUNTS_merged$Demultiplexed_Reads, na.rm = TRUE), - Reads_Passed_QC = sum(QC$num_seqs, na.rm = TRUE), - Reads_PrimerChecked = sum(PER_SAMPLE_COUNTS_merged$PrimerChecked_Reads, na.rm = TRUE) - ) - -if("ITSx_Extracted_Reads" %in% colnames(PER_SAMPLE_COUNTS_merged)){ - PER_RUN_COUNTS_merged[ , Reads_ITSx_Extracted := sum(PER_SAMPLE_COUNTS_merged$ITSx_Extracted_Reads, na.rm = TRUE) ] -} - -## Estimate percentage of reads passed primer checking -cat("..Estimating per-run percentages\n") -PER_RUN_COUNTS_merged[ , Percentage_Demultiplexed := - round(Reads_Demultiplexed / Total_Number_Of_Reads * 100, 1) ] - -PER_RUN_COUNTS_merged[ , Percentage_PrimerChecked := - round(Reads_PrimerChecked / Total_Number_Of_Reads * 100, 1) ] - -## Final per-run num reads -PER_RUN_COUNTS_merged[ , SeqTable_NumReads := sum(PER_SAMPLE_COUNTS_merged$SeqTable_NumReads, na.rm = TRUE) ] -PER_RUN_COUNTS_merged[ , Percentage_Reads_Retained := round( SeqTable_NumReads / Total_Number_Of_Reads * 100, 2) ] - 
-NumUniqSeqs <- SEQTAB |> dplyr::select(SeqID) |> dplyr::summarize(N = n()) |> dplyr::collect() -PER_RUN_COUNTS_merged$SeqTable_NumUniqueSequences <- NumUniqSeqs$N - -## Export summary stats -cat("Exporting results\n") -write.xlsx(list( - "per_sample" = PER_SAMPLE_COUNTS_merged, - "per_run" = PER_RUN_COUNTS_merged - ), - file = "Run_summary.xlsx", colNames = TRUE) - - -cat("\nAll done.\n") - - -##################### Session info - -## Check time -end_time <- Sys.time() - -tmm <- as.numeric(difftime(end_time, start_time, units = "min")) -cat("\nElapsed time: ", tmm, " minutes\n") - -cat("\n") -cat("Session info:\n") -sessionInfo() -cat("\n") +#!/usr/bin/env Rscript + +## Summarise number of reads per process + +# read_count_summary.R \ +# --raw Counts_1.RawData.txt \ +# --qc Counts_2.QC.txt \ +# --demuxed Counts_3.Demux.txt \ +# --primer Counts_4.PrimerCheck.txt \ +# --primerartef Counts_4.PrimerArtefacts.txt \ +# --itsx Counts_5.ITSx_or_PrimTrim.txt \ +# --homopolymer Counts_5.Homopolymers.txt \ +# --chimrefn Counts_6.ChimRef_reads.txt \ +# --chimrefu Counts_6.ChimRef_uniqs.txt \ +# --chimdenovo Counts_7.ChimDenov.txt \ +# --chimrecovn Counts_8.ChimRecov_reads.txt \ +# --chimrecovu Counts_8.ChimRecov_uniqs.txt \ +# --tj TagJump_scores.qs \ +# --seqtab Seqs.parquet \ +# --maxchim 0.6 \ +# --threads 4 + + +############################################## Parse input parameters + +## Check time +start_time <- Sys.time() + + +cat("\nParsing input options and arguments...\n") + +suppressPackageStartupMessages(require(optparse)) + +## Parse arguments +option_list <- list( + + make_option("--raw", action="store", default=NA, type='character', help="Raw read counts"), + make_option("--qc", action="store", default=NA, type='character', help="Counts of reads passed QC"), + make_option("--demuxed", action="store", default=NA, type='character', help="Counts of demultiplexed reads"), + make_option("--primer", action="store", default=NA, type='character', help="Counts of reads 
with both primers detected"), + make_option("--primerartef",action="store", default=NA, type='character', help="Counts of primer artefacts"), + make_option("--itsx", action="store", default=NA, type='character', help="Read counts after ITSx or primer removal"), + make_option("--homopolymer",action="store", default=NA, type='character', help="Homopolymer correction results"), + make_option("--chimrefn", action="store", default=NA, type='character', help="Number of reads for reference-based chimeras"), + make_option("--chimrefu", action="store", default=NA, type='character', help="Number of unique sequences detected as reference-based chimeras"), + make_option("--chimdenovo", action="store", default=NA, type='character', help="Number of de novo chimeras"), + make_option("--chimrecovn", action="store", default=NA, type='character', help="Number of resued reads for de novo chimeras (false positives)"), + make_option("--chimrecovu", action="store", default=NA, type='character', help="Number of resued unique sequences detected as de novo chimeras (false positives)"), + make_option("--tj", action="store", default=NA, type='character', help="Tag jump removal data (serialized in qs format)"), + make_option("--seqtab", action="store", default=NA, type='character', help="Final seq table (Parquet format)"), + make_option("--maxchim", action="store", default=0.6, type='numeric', help = "Maximum de novo chimera score to remove"), + make_option(c("-t", "--threads"), action="store", default=4L, type='integer', help="Number of CPU threads for arrow, default 4") +) +opt <- parse_args(OptionParser(option_list=option_list)) + + +## Validation of the required argiments +# if(is.na(opt$raw)){ +# cat("Input file is not specified: ....\n", file=stderr()) +# stop() +# } + + + +## Function to convert text "NA"s to NA +to_na <- function(x){ + if(x %in% c("NA", "null", "Null")){ x <- NA } + return(x) +} + +## Assign variables +RAW <- opt$raw +QC <- opt$qc +DEMUXED <- opt$demuxed +PRIMER <- 
opt$primer +PRIMERARTEF <- opt$primerartef +ITSX <- opt$itsx +HOMOPOLY <- opt$homopolymer +CHIMREFN <- opt$chimrefn +CHIMREFU <- opt$chimrefu +CHIMDENOVO <- opt$chimdenovo +CHIMRECOVN <- opt$chimrecovn +CHIMRECOVU <- opt$chimrecovu +TJ <- opt$tj +SEQTAB <- opt$seqtab +MAXCHIM <- opt$maxchim +CPUTHREADS <- as.numeric( opt$threads ) + +## Log assigned variables +cat(paste("Counts - RawData: " , RAW, "\n", sep="")) +cat(paste("Counts - QC: " , QC, "\n", sep="")) +cat(paste("Counts - Demux: " , DEMUXED, "\n", sep="")) +cat(paste("Counts - PrimerCheck: " , PRIMER, "\n", sep="")) +cat(paste("Counts - Primer Artefacts: " , PRIMERARTEF, "\n", sep="")) +cat(paste("Counts - ITSx or Primer Trim: " , ITSX, "\n", sep="")) +cat(paste("Counts - Homopolymer correction results: " , HOMOPOLY, "\n", sep="")) +cat(paste("Counts - Chimera Ref-based, reads: " , CHIMREFN, "\n", sep="")) +cat(paste("Counts - Chimera Ref-based, unique sequences: " , CHIMREFU, "\n", sep="")) +cat(paste("Counts - Chimera de novo: " , CHIMDENOVO, "\n", sep="")) +cat(paste("Counts - Chimera Ref-based recoverd, reads: " , CHIMRECOVN, "\n", sep="")) +cat(paste("Counts - Chimera Ref-based recoverd, unique sequences: " , CHIMRECOVU, "\n", sep="")) +cat(paste("Tag-jump data: " , TJ, "\n", sep="")) +cat(paste("Final sequence table: " , SEQTAB, "\n", sep="")) +cat(paste("Maximum de novo chimera score: ", MAXCHIM, "\n", sep="")) +cat(paste("Number of CPU threads to use: ", CPUTHREADS, "\n", sep="")) + +cat("\n") + + +############################################## data for debuging + +# RAW <- "Counts_1.RawData.txt" +# QC <- "Counts_2.QC.txt" +# DEMUXED <- "Counts_3.Demux.txt" +# PRIMER <- "Counts_4.PrimerCheck.txt" +# PRIMERARTEF <- "Counts_4.PrimerArtefacts.txt" +# ITSX <- "Counts_5.ITSx_or_PrimTrim.txt" +# HOMOPOLY <- "Counts_5.Homopolymers.txt" +# CHIMREFN <- "Counts_6.ChimRef_reads.txt" +# CHIMREFU <- "Counts_6.ChimRef_uniqs.txt" +# CHIMDENOVO <- "Counts_7.ChimDenov.txt" +# CHIMRECOVN <- 
"Counts_8.ChimRecov_reads.txt" +# CHIMRECOVU <- "Counts_8.ChimRecov_uniqs.txt" +# TJ <- "TagJump_scores.qs" +# SEQTAB <- "Seqs.parquet" +# MAXCHIM <- 0.6 +# CPUTHREADS <- 6 + + +############################################## Load packages and data + +cat("Loading R packages...\n") + +load_pckg <- function(pkg = "data.table"){ + suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) + cat(paste(pkg, packageVersion(pkg), "\n")) +} + +load_pckg("data.table") +load_pckg("plyr") +load_pckg("arrow") +# load_pckg("dplyr") +load_pckg("metagMisc") +load_pckg("openxlsx") + +cat("\n") + +## Set CPU thread number +cat("Setting number of CPU threads to: ", CPUTHREADS, "\n") +setDTthreads(threads = CPUTHREADS) # for data.table + + +###################################### +###################################### Load the data +###################################### + +cat("\nLoading input data\n") + + +#### Per-dataset stats + +## Load ASV table +cat("..Loading raw counts\n") +RAW <- fread(RAW) + +cat("..Loading QC counts\n") +QC <- fread(QC) + +#### Per-sample stats + +SEQKITCOUNTS <- list() +CUSTOMCOUNTS <- list() + +cat("..Loading demux counts\n") +SEQKITCOUNTS$DEMUXED <- fread(DEMUXED) + +cat("..Loading primer-checked data counts\n") +SEQKITCOUNTS$PRIMER <- fread(PRIMER) +SEQKITCOUNTS$PRIMERARTEF <- fread(PRIMERARTEF) + +cat("..Loading ITSx or primer trim counts\n") +CUSTOMCOUNTS$ITSX <- fread(ITSX) + +cat("..Loading homopolymer correction results\n") +HOMOPOLY_data <- fread(HOMOPOLY) + +cat("..Loading ref-based chimera counts\n") +CUSTOMCOUNTS$CHIMREFN <- fread(CHIMREFN) +SEQKITCOUNTS$CHIMREFU <- fread(CHIMREFU) + +cat("..Loading de novo chimera counts\n") +CHIMDENOVO <- fread(CHIMDENOVO) # incorporate to the main table + +cat("..Loading rescued ref-based chimera counts\n") +CUSTOMCOUNTS$CHIMRECOVN <- fread(CHIMRECOVN) +SEQKITCOUNTS$CHIMRECOVU <- fread(CHIMRECOVU) + +if(!is.na(TJ) && TJ != "no_tj" && file.exists(TJ)){ + cat("..Loading tag-jump 
filtration data\n") + TJ <- qs::qread(TJ) + tjdata <- TRUE +} else { + cat("..No tag-jump filtration data found\n") + tjdata <- FALSE +} + +cat("..Loading sequence table\n") +SEQTAB <- arrow::open_dataset(SEQTAB) + + +## Remove NULL-files +null_cust <- laply(.data = CUSTOMCOUNTS, .fun = nrow) +null_seqk <- laply(.data = SEQKITCOUNTS, .fun = nrow) + +if(any(null_cust == 0)){ + cat("Some files with counts are missing:\n") + to_rm <- which(null_cust == 0) + cat(".. ", paste(names(CUSTOMCOUNTS)[ to_rm ], collapse = ", "), "\n") + CUSTOMCOUNTS[ to_rm ] <- NULL + rm(to_rm) +} + +if(any(null_seqk == 0)){ + cat("Some files with counts are missing:\n") + to_rm <- which(null_seqk == 0) + cat(".. ", paste(names(SEQKITCOUNTS)[ to_rm ], collapse = ", "), "\n") + SEQKITCOUNTS[ to_rm ] <- NULL + rm(to_rm) +} + + +## Process seqkit counts +seqkit_process <- function(x){ + if(nrow(x) > 0){ + + ## Remove reudndant columns + x <- x[ , .(file, num_seqs) ] + + ## Remove file extensions + x[ , file := sub(pattern = ".fastq.gz$", replacement = "", x = file) ] + x[ , file := sub(pattern = ".fq.gz$", replacement = "", x = file) ] + x[ , file := sub(pattern = ".fa.gz$", replacement = "", x = file) ] + x[ , file := sub(pattern = ".full.fasta$", replacement = "", x = file) ] + x[ , file := sub(pattern = ".ITS1.fasta.gz$", replacement = "", x = file) ] + x[ , file := sub(pattern = ".ITS2.fasta.gz$", replacement = "", x = file) ] + x[ , file := sub(pattern = "_PrimerChecked$", replacement = "", x = file) ] + x[ , file := sub(pattern = "_PrimerArtefacts$", replacement = "", x = file) ] + x[ , file := sub(pattern = "_Chimera$", replacement = "", x = file) ] + x[ , file := sub(pattern = "_RescuedChimera$", replacement = "", x = file) ] + x[ , file := sub(pattern = "^Rescued_Chimeric_sequences.part_", replacement = "", x = file) ] + + } + return(x) +} + +## Process custom counts +custom_process <- function(x){ + if(nrow(x) > 0){ + + ## There should be just two columns - `SampleID` & `NumReads` + + 
## Rename "SampleID" into "file" + setnames(x = x, old = "SampleID", new = "file") + + ## Remove file extensions + x[ , file := sub(pattern = ".full.fasta$", replacement = "", x = file) ] + x[ , file := sub(pattern = "_ITS1_58S_ITS2.fasta$", replacement = "", x = file) ] + x[ , file := sub(pattern = "_Chimera.fa$", replacement = "", x = file) ] + x[ , file := sub(pattern = "_RescuedChimera.fa$", replacement = "", x = file) ] + x[ , file := sub(pattern = "^Rescued_Chimeric_sequences.part_", replacement = "", x = file) ] + + } + return(x) +} + + +cat("Processing data\n") +SEQKITCOUNTS <- llply(.data = SEQKITCOUNTS, .fun = seqkit_process) +CUSTOMCOUNTS <- llply(.data = CUSTOMCOUNTS, .fun = custom_process) + +cat("Estimating homopolymer stats\n") +if(nrow(HOMOPOLY_data) > 0){ + HOMOPOLY_counts <- HOMOPOLY_data[ , .( + N_UniqSequences_AfterITSx_or_PrimerTrimming = .N, + N_UniqSequences_AfterHomopolymerCorrection = length(unique(Target)) + ), + by = "SampleID" ] +} else { + cat("..No homopolymer correction data found\n") +} + +# HOMOPOLY_counts[, Num_HomopolymerCorrectedSequences := +# N_UniqSequences_AfterITSx_or_PrimerTrimming - N_UniqSequences_AfterHomopolymerCorrection ] + + +## Rename columns +if(!is.null(SEQKITCOUNTS$DEMUXED)){ +setnames(x = SEQKITCOUNTS$DEMUXED, old = "num_seqs", new = "Demultiplexed_Reads", skip_absent = TRUE) +} +if(!is.null(SEQKITCOUNTS$PRIMER)){ +setnames(x = SEQKITCOUNTS$PRIMER, old = "num_seqs", new = "PrimerChecked_Reads", skip_absent = TRUE) +} +if(!is.null(SEQKITCOUNTS$PRIMERARTEF)){ +setnames(x = SEQKITCOUNTS$PRIMERARTEF, old = "num_seqs", new = "PrimerArtefacts_Reads", skip_absent = TRUE) +} +if(!is.null(SEQKITCOUNTS$CHIMREFU)){ +setnames(x = SEQKITCOUNTS$CHIMREFU, old = "num_seqs", new = "ReferenceBasedChimera_NumUniqSequences", skip_absent = TRUE) +} +if(!is.null(SEQKITCOUNTS$CHIMRECOVU)){ +setnames(x = SEQKITCOUNTS$CHIMRECOVU, old = "num_seqs", new = "Recovered_ReferenceBasedChimea_NumUniqSequences", skip_absent = TRUE) +} + 
+if(!is.null(CUSTOMCOUNTS$ITSX)){ +setnames(x = CUSTOMCOUNTS$ITSX, old = "NumReads", new = "ITSx_Extracted_Reads", skip_absent = TRUE) +} +if(!is.null(CUSTOMCOUNTS$CHIMREFN)){ +setnames(x = CUSTOMCOUNTS$CHIMREFN, old = "NumReads", new = "ReferenceBasedChimera_Reads", skip_absent = TRUE) +} +if(!is.null(CUSTOMCOUNTS$CHIMRECOVN)){ +setnames(x = CUSTOMCOUNTS$CHIMRECOVN, old = "NumReads", new = "Recovered_ReferenceBasedChimea_Reads", skip_absent = TRUE) +} + +## Merge seqkit and custom counts into a single list +cat("Pooling per-sample counts\n") +COUNTS <- c(SEQKITCOUNTS, CUSTOMCOUNTS) + +## Pool per-file estimates +merge_dt <- function(x,y){ merge(x, y, by = "file", all = TRUE) } +PER_SAMPLE_COUNTS_merged <- Reduce(f = merge_dt, x = COUNTS) + +## If there are no primer artefacts +if(is.null(SEQKITCOUNTS$PRIMERARTEF)){ + PER_SAMPLE_COUNTS_merged[ , PrimerArtefacts_Reads := 0 ] +} + +## Estimate percentage of primer artefacts +cat("Estimating percentage of primer artefacts\n") +PER_SAMPLE_COUNTS_merged[ , + PrimerArtefacts_Percent := round( + PrimerArtefacts_Reads / (PrimerChecked_Reads + PrimerArtefacts_Reads) * 100, + 2) + ] + +## Estimate tag-jump stats +if(tjdata == TRUE){ + cat("Estimating tag-jump removal yields\n") + TJ_stats <- TJ[ TagJump == TRUE, .( + TagJump_Events = .N, + TagJump_Reads = sum(Abundance, na.rm = TRUE)), + by = "SampleID" ] + + if(nrow(TJ_stats) > 0){ + PER_SAMPLE_COUNTS_merged <- merge( + x = PER_SAMPLE_COUNTS_merged, + y = TJ_stats, + by.x = "file", by.y = "SampleID", all.x = TRUE) + } else { + PER_SAMPLE_COUNTS_merged[ , TagJump_Events := 0 ] + PER_SAMPLE_COUNTS_merged[ , TagJump_Reads := 0 ] + } +} + +## Add homopolymer stats +if(nrow(HOMOPOLY_data) > 0){ + cat("Adding homopolymer stats\n") + PER_SAMPLE_COUNTS_merged <- merge( + x = PER_SAMPLE_COUNTS_merged, + y = HOMOPOLY_counts, + by.x = "file", by.y = "SampleID", all.x = TRUE) +} + +## Add de novo chimera stats +cat("Adding de novo chimera stats\n") +denovo_stats <- SEQTAB |> + 
dplyr::filter(DeNovo_Chimera_Score >= MAXCHIM) |> + dplyr::group_by(SampleID) |> + dplyr::summarize( + DeNovoChimeras_NumReads = sum(Abundance, na.rm = TRUE), + DeNovoChimeras_NumUniqSeqs = n()) |> + dplyr::collect() |> + setDT() + +PER_SAMPLE_COUNTS_merged <- merge( + x = PER_SAMPLE_COUNTS_merged, + y = denovo_stats, + by.x = "file", by.y = "SampleID", all.x = TRUE) + + + +## Number of reads and unique sequences in the sequence table (per sample) +## Excluding putative de novo chimeras (with score >= MAXCHIM) +cat("Adding sequence table stats\n") +seqtab_stats <- SEQTAB |> + dplyr::filter(DeNovo_Chimera_Score < MAXCHIM | is.na(DeNovo_Chimera_Score) ) |> + dplyr::group_by(SampleID) |> + dplyr::summarize( + SeqTable_NumReads = sum(Abundance, na.rm = TRUE), + SeqTable_NumUniqSeqs = n()) |> + dplyr::collect() |> + setDT() + +PER_SAMPLE_COUNTS_merged <- merge( + x = PER_SAMPLE_COUNTS_merged, + y = seqtab_stats, + by.x = "file", by.y = "SampleID", all.x = TRUE) + + +## Replace NAs with zeros +cat("Replacing NAs with zero\n") +for (j in seq_len(ncol(PER_SAMPLE_COUNTS_merged))){ + set(PER_SAMPLE_COUNTS_merged, which(is.na(PER_SAMPLE_COUNTS_merged[[j]])), j, 0) +} +rm(j) + +## Estimate percentage of reads retained (starting from demultiplexed reads) +cat("Estimating percentage of reads retained\n") +PER_SAMPLE_COUNTS_merged[ , Percentage_Reads_Retained := round( SeqTable_NumReads / Demultiplexed_Reads * 100, 2) ] + +## Estimate percentage of reads retained after ITSx +if("ITSx_Extracted_Reads" %in% colnames(PER_SAMPLE_COUNTS_merged)){ + PER_SAMPLE_COUNTS_merged[ , ITSx_Yield_Percent := round( ITSx_Extracted_Reads / PrimerChecked_Reads * 100, 2) ] +} + +### TODO: +# .. estimate percentages +# .. 
add per-run positive / negative counts (based on default sample names) + + +## Reorder columns +setcolorder(PER_SAMPLE_COUNTS_merged, + skip_absent = TRUE, + neworder = c( + "file", "Demultiplexed_Reads", + "PrimerChecked_Reads", "PrimerArtefacts_Reads", "PrimerArtefacts_Percent", + "ReferenceBasedChimera_Reads", "ReferenceBasedChimera_NumUniqSequences", + "Recovered_ReferenceBasedChimea_Reads", "Recovered_ReferenceBasedChimea_NumUniqSequences", + "DeNovoChimeras_NumReads", "DeNovoChimeras_NumUniqSeqs", + "ITSx_Extracted_Reads", "ITSx_Yield_Percent", + "N_UniqSequences_AfterITSx_or_PrimerTrimming", + "N_UniqSequences_AfterHomopolymerCorrection", + # "Num_HomopolymerCorrectedSequences", + "TagJump_Reads", "TagJump_Events", + "SeqTable_NumReads", "SeqTable_NumUniqSeqs", + "Percentage_Reads_Retained")) + + + + +## Prepare per-run stats +cat("Preparing per-run stats\n") +PER_RUN_COUNTS_merged <- data.table( + Total_Number_Of_Reads = sum(RAW$num_seqs, na.rm = TRUE), + Reads_Demultiplexed = sum(PER_SAMPLE_COUNTS_merged$Demultiplexed_Reads, na.rm = TRUE), + Reads_Passed_QC = sum(QC$num_seqs, na.rm = TRUE), + Reads_PrimerChecked = sum(PER_SAMPLE_COUNTS_merged$PrimerChecked_Reads, na.rm = TRUE) + ) + +if("ITSx_Extracted_Reads" %in% colnames(PER_SAMPLE_COUNTS_merged)){ + PER_RUN_COUNTS_merged[ , Reads_ITSx_Extracted := sum(PER_SAMPLE_COUNTS_merged$ITSx_Extracted_Reads, na.rm = TRUE) ] +} + +## Estimate percentage of reads passed primer checking +cat("..Estimating per-run percentages\n") +PER_RUN_COUNTS_merged[ , Percentage_Demultiplexed := + round(Reads_Demultiplexed / Total_Number_Of_Reads * 100, 1) ] + +PER_RUN_COUNTS_merged[ , Percentage_PrimerChecked := + round(Reads_PrimerChecked / Total_Number_Of_Reads * 100, 1) ] + +## Final per-run num reads +PER_RUN_COUNTS_merged[ , SeqTable_NumReads := sum(PER_SAMPLE_COUNTS_merged$SeqTable_NumReads, na.rm = TRUE) ] +PER_RUN_COUNTS_merged[ , Percentage_Reads_Retained := round( SeqTable_NumReads / Total_Number_Of_Reads * 100, 2) ] + 
+NumUniqSeqs <- SEQTAB |> dplyr::select(SeqID) |> dplyr::summarize(N = n()) |> dplyr::collect() +PER_RUN_COUNTS_merged$SeqTable_NumUniqueSequences <- NumUniqSeqs$N + +## Export summary stats +cat("Exporting results\n") +write.xlsx(list( + "per_sample" = PER_SAMPLE_COUNTS_merged, + "per_run" = PER_RUN_COUNTS_merged + ), + file = "Run_summary.xlsx", colNames = TRUE) + + +cat("\nAll done.\n") + + +##################### Session info + +## Check time +end_time <- Sys.time() + +tmm <- as.numeric(difftime(end_time, start_time, units = "min")) +cat("\nElapsed time: ", tmm, " minutes\n") + +cat("\n") +cat("Session info:\n") +sessionInfo() +cat("\n") diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/seq_table_assembly.R b/src/pipecraft-core/service_scripts/NextITS/bin/seq_table_assembly.R index 107670b..161fc59 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/seq_table_assembly.R +++ b/src/pipecraft-core/service_scripts/NextITS/bin/seq_table_assembly.R @@ -1,299 +1,299 @@ -#!/usr/bin/env Rscript - -## Script to perform tag-jump removal - -## To do: -# - add HMM profile ID if ITSx was used - -# Inputs: -# 1. tag-jump-filtered Seq table (`Seq_tab_TagJumpFiltered.txt.gz`) -# 2. Sequences in fasta (`Seq_not_filtered.fa.gz`) -# 3. de novo chimera scores (`DeNovo_Chimera.txt`) -# 4. sequence qualities (`SeqQualities.parquet`) - -# Outputs: -# - FASTA with filtered Seqs `Seqs.fa.gz` -# - Seq table in long format `Seqs.txt.gz` (with additional sequence info) -# - Data in Parquet format `Seqs.parquet` - - -## Function to load packages -load_pckg <- function(pkg = "data.table"){ - suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) - cat(".. 
", paste(pkg, packageVersion(pkg), "\n")) -} - -cat("Loading packages:\n") - -load_pckg("optparse") -load_pckg("data.table") -load_pckg("Biostrings") -load_pckg("plyr") -load_pckg("arrow") -# load_pckg("dplyr") -# load_pckg("openxlsx") - - -cat("\nParsing input options and arguments...\n") - -option_list <- list( - make_option("--seqtab", action="store", default=NA, type='character', help = "Sequence table (tab-delimited, long format)"), - make_option("--fasta", action="store", default=NA, type='character', help = "Sequences in FASTA format"), - make_option("--chimera", action="store", default=NA, type='character', help = "De novo chimera scores"), - make_option("--quality", action="store", default=NA, type='character', help = "Sequence qualities (Parquet format)"), - make_option("--threads", action="store", default=4, type='integer', help = "Number of CPU threads to use") -) - -opt <- parse_args(OptionParser(option_list=option_list)) - -## Function to convert text "NA"s to NA -to_na <- function(x){ - if(x %in% c("NA", "null", "Null")){ x <- NA } - return(x) -} - -## Replaces "null"s from Nextflow with NA -opt <- lapply(X = opt, FUN = to_na) - - -## Validation of the required arguments -required_args <- c("seqtab", "fasta", "quality") -missing_args <- required_args[ sapply(required_args, function(x) is.na(opt[[x]])) ] -if (length(missing_args) > 0) { - stop("Missing required arguments: ", paste(missing_args, collapse=", ")) -} - -## Assign variables -SEQTAB <- opt$seqtab -FASTA <- opt$fasta -CHIMERA <- opt$chimera -QUALITY <- opt$quality -CPUTHREADS <- as.numeric( opt$threads ) - -## Log assigned variables -cat(paste("Input sequence table: ", SEQTAB, "\n", sep="")) -cat(paste("Sequences in FASTA format: ", FASTA, "\n", sep="")) -cat(paste("De novo chimera scores: ", CHIMERA, "\n", sep="")) -cat(paste("Sequence qualities: ", QUALITY, "\n", sep="")) -cat(paste("Number of CPU threads to use: ", CPUTHREADS, "\n", sep="")) - -cat("\n") - - -## Debug: -# SEQTAB <- 
"Seq_tab_TagJumpFiltered.txt.gz" -# FASTA <- "Seq_not_filtered.fa.gz" -# CHIMERA <- "DeNovo_Chimera.txt" -# QUALITY <- "SeqQualities.parquet" -# CPUTHREADS <- 4 - - -## Set CPU thread number -cat("Setting number of CPU threads to: ", CPUTHREADS, "\n") -setDTthreads(threads = CPUTHREADS) # for data.table -set_cpu_count(CPUTHREADS) # for arrow - -###################################### -###################################### Load the data -###################################### - -## Load sequnece table -cat("\n\n..Loading sequence table\n") - -TAB <- fread( - file = SEQTAB, - sep = "\t", header = TRUE) - -## Load sequences in fasta format -cat("..Loading sequneces in FASTA format\n") -SQS <- readDNAStringSet(filepath = FASTA) - -## Load de novo chimera scores -cat("..Loading de novo chimera scores\n") - -CHI <- try( - fread( - file = CHIMERA, - header = FALSE, sep = "\t", - col.names = c("SeqID", "DeNovo_Chimera_Score", "SampleID")) - ) - -if("try-error" %in% class(CHI)){ - cat("\nCould not read the file with de novo chimeric scores\n") - cat("Most likely, the file file is empty (no de novo chimeras)\n") - - ## Initialize empty data table - CHI <- data.table(SeqID = character(), DeNovo_Chimera_Score = numeric(), SampleID = character()) -} - - -## Load sequence quality scores -cat("..Loading sequence quality scores\n") -QLT <- arrow::open_dataset(QUALITY) |> - dplyr::select(Hash, Length, AvgPhredScore, MaxEE, MEEP) |> - dplyr::collect() |> - dplyr::filter(Hash %in% unique(TAB$SeqID)) |> - setDT() - -setnames(QLT, - old = c("Hash", "Length", "AvgPhredScore"), - new = c("SeqID", "SeqLen", "PhredScore")) - -## Quality data: -# old header: c("SampleID", "SeqID", "SeqLen", "PhredScore", "MaxEE", "MEEP") -# new header: c("SampleID", "Hash", "PacBioID", "PhredScore", "MaxEE", "MEEP", "Sequence", "Quality", "Length") - - -## Create SeqID___SampleID column -TAB[, SeqID___SampleID := paste0(SeqID, "___", SampleID) ] -# QLT[, SeqID___SampleID := paste0(SeqID, "___", SampleID) ] 
- - -###################################### -###################################### Add quality scores, for non-singleton use max score -###################################### - -cat("\n\n..Adding quality scores\n") - -cat("...Prepareing quality scores\n") -setorder(QLT, SeqID, -PhredScore) -QLT <- QLT[QLT[, .I[which.max(PhredScore)], by=SeqID]$V1] - -cat("...Adding data to the main table\n") -if(any(! TAB$SeqID %in% QLT$SeqID)){ - cat("WARNING: Some sequences are not present in the quality table\n") -} - -TAB <- merge(x = TAB, y = QLT, by = "SeqID", all.x = TRUE) - -# with(TAB, plot(Abundance, PhredScore)) - - -###################################### -###################################### Add chimera info -###################################### - -cat("..Adding info about de novo chimeric sequences\n") - -if(nrow(CHI) > 0){ - - TAB <- merge(x = TAB, y = CHI, - by = c("SeqID", "SampleID"), all.x = TRUE) - - ## Convert variables to numeric scores - TAB[ , DeNovo_Chimera_Score := as.numeric(DeNovo_Chimera_Score) ] - - ## Classify sequences into putative chimeras - TAB[ !is.na(DeNovo_Chimera_Score), DeNovo_Chimera := TRUE ] - TAB[ is.na(DeNovo_Chimera_Score), DeNovo_Chimera := FALSE ] - - cat("... ", sum( TAB$DeNovo_Chimera), " putative de novo chimeras found\n") - cat("... ", sum(!TAB$DeNovo_Chimera), " non-chimeric sequences\n") - -} else { - - ## No de novo chimeras - - TAB[ , DeNovo_Chimera_Score := as.numeric(NA) ] - TAB[ , DeNovo_Chimera := FALSE ] - - cat("... ", 0, " putative de novo chimeras found\n") - cat("... 
", nrow(TAB), " non-chimeric sequences\n") - -} - - -###################################### -###################################### Add sequences -###################################### - -cat("\n\n..Processing sequences\n") - -SQTAB <- data.table( - SeqHeader = names(SQS), - Sequence = as.character(SQS)) - -## Split the header (`feb76b9;size=1;sample=ABCD;` ) -SQTAB[ , c("SeqID", "SampleID") := tstrsplit(x = SeqHeader, split = ";", keep = c(1,2)) ] -SQTAB[ , SeqHeader := NULL ] -SQTAB[ , SampleID := gsub(pattern = "sample=", replacement = "", x = SampleID) ] - - -SQTAB[ , SeqID___SampleID := paste0(SeqID, "___", SampleID) ] -SQTAB[ , c("SeqID", "SampleID") := NULL ] - -cat("..Adding sequences to the main table\n") - -TAB <- merge(x = TAB, y = SQTAB, - by = c("SeqID___SampleID"), all.x = TRUE) - - -cat("..Sorting table by abundance, quality score, and SampleID\n") - -setorder(x = TAB, -Abundance, -PhredScore, SampleID) - - -cat("..Preparing FASTA file with filtered sequences\n") - -SQF <- DNAStringSet(x = TAB$Sequence) -names(SQF) <- paste0(TAB$SeqID, ";size=", TAB$Abundance, ";sample=", TAB$SampleID, ";") - -## Export FASTA -cat("..Exporting FASTA file with filtered sequences\n") - -writeXStringSet(x = SQF, - filepath = "Seqs.fa.gz", - compress = TRUE, format = "fasta", width = 9999) - - - - -###################################### -###################################### Export results -###################################### - -# cat("..Reshaping sequence table into wide format\n") -# -# TABW <- dcast(data = TAB, -# formula = SeqID ~ SampleID, -# value.var = "Abundance", -# fill = 0) - - -cat("..Exporting result\n") - -setcolorder( - x = TAB, - neworder = c( - "SeqID___SampleID", "SampleID", "SeqID", - "Abundance", "SeqLen", "PhredScore", "MaxEE", "MEEP", - "DeNovo_Chimera", "DeNovo_Chimera_Score", - "Sequence")) - -# cat("...Exporting RData\n") -# saveRDS(object = TAB, file = "Seqs.RData", compress = "xz") - -cat("...Exporting Parquet\n") - -write_parquet( - x = 
TAB, - sink = "Seqs.parquet", - compression = "zstd", - compression_level = 10, - use_dictionary = TRUE) - - -## Long table -cat("...Exporting long table\n") - -TAB[ , SeqID___SampleID := NULL ] - -fwrite(x = TAB, file = "Seqs.txt.gz", sep = "\t", compress = "gzip") - -## Wide table -# cat("...Exporting wide table\n") - -# fwrite(x = TABW, file = "Seq_tab.txt.gz", sep = "\t", compress = "gzip") - - -cat("All done.") +#!/usr/bin/env Rscript + +## Script to perform tag-jump removal + +## To do: +# - add HMM profile ID if ITSx was used + +# Inputs: +# 1. tag-jump-filtered Seq table (`Seq_tab_TagJumpFiltered.txt.gz`) +# 2. Sequences in fasta (`Seq_not_filtered.fa.gz`) +# 3. de novo chimera scores (`DeNovo_Chimera.txt`) +# 4. sequence qualities (`SeqQualities.parquet`) + +# Outputs: +# - FASTA with filtered Seqs `Seqs.fa.gz` +# - Seq table in long format `Seqs.txt.gz` (with additional sequence info) +# - Data in Parquet format `Seqs.parquet` + + +## Function to load packages +load_pckg <- function(pkg = "data.table"){ + suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) + cat(".. 
", paste(pkg, packageVersion(pkg), "\n")) +} + +cat("Loading packages:\n") + +load_pckg("optparse") +load_pckg("data.table") +load_pckg("Biostrings") +load_pckg("plyr") +load_pckg("arrow") +# load_pckg("dplyr") +# load_pckg("openxlsx") + + +cat("\nParsing input options and arguments...\n") + +option_list <- list( + make_option("--seqtab", action="store", default=NA, type='character', help = "Sequence table (tab-delimited, long format)"), + make_option("--fasta", action="store", default=NA, type='character', help = "Sequences in FASTA format"), + make_option("--chimera", action="store", default=NA, type='character', help = "De novo chimera scores"), + make_option("--quality", action="store", default=NA, type='character', help = "Sequence qualities (Parquet format)"), + make_option("--threads", action="store", default=4, type='integer', help = "Number of CPU threads to use") +) + +opt <- parse_args(OptionParser(option_list=option_list)) + +## Function to convert text "NA"s to NA +to_na <- function(x){ + if(x %in% c("NA", "null", "Null")){ x <- NA } + return(x) +} + +## Replaces "null"s from Nextflow with NA +opt <- lapply(X = opt, FUN = to_na) + + +## Validation of the required arguments +required_args <- c("seqtab", "fasta", "quality") +missing_args <- required_args[ sapply(required_args, function(x) is.na(opt[[x]])) ] +if (length(missing_args) > 0) { + stop("Missing required arguments: ", paste(missing_args, collapse=", ")) +} + +## Assign variables +SEQTAB <- opt$seqtab +FASTA <- opt$fasta +CHIMERA <- opt$chimera +QUALITY <- opt$quality +CPUTHREADS <- as.numeric( opt$threads ) + +## Log assigned variables +cat(paste("Input sequence table: ", SEQTAB, "\n", sep="")) +cat(paste("Sequences in FASTA format: ", FASTA, "\n", sep="")) +cat(paste("De novo chimera scores: ", CHIMERA, "\n", sep="")) +cat(paste("Sequence qualities: ", QUALITY, "\n", sep="")) +cat(paste("Number of CPU threads to use: ", CPUTHREADS, "\n", sep="")) + +cat("\n") + + +## Debug: +# SEQTAB <- 
"Seq_tab_TagJumpFiltered.txt.gz" +# FASTA <- "Seq_not_filtered.fa.gz" +# CHIMERA <- "DeNovo_Chimera.txt" +# QUALITY <- "SeqQualities.parquet" +# CPUTHREADS <- 4 + + +## Set CPU thread number +cat("Setting number of CPU threads to: ", CPUTHREADS, "\n") +setDTthreads(threads = CPUTHREADS) # for data.table +set_cpu_count(CPUTHREADS) # for arrow + +###################################### +###################################### Load the data +###################################### + +## Load sequnece table +cat("\n\n..Loading sequence table\n") + +TAB <- fread( + file = SEQTAB, + sep = "\t", header = TRUE) + +## Load sequences in fasta format +cat("..Loading sequences in FASTA format\n") +SQS <- readDNAStringSet(filepath = FASTA) + +## Load de novo chimera scores +cat("..Loading de novo chimera scores\n") + +CHI <- try( + fread( + file = CHIMERA, + header = FALSE, sep = "\t", + col.names = c("SeqID", "DeNovo_Chimera_Score", "SampleID")) + ) + +if("try-error" %in% class(CHI)){ + cat("\nCould not read the file with de novo chimeric scores\n") + cat("Most likely, the file file is empty (no de novo chimeras)\n") + + ## Initialize empty data table + CHI <- data.table(SeqID = character(), DeNovo_Chimera_Score = numeric(), SampleID = character()) +} + + +## Load sequence quality scores +cat("..Loading sequence quality scores\n") +QLT <- arrow::open_dataset(QUALITY) |> + dplyr::select(Hash, Length, AvgPhredScore, MaxEE, MEEP) |> + dplyr::collect() |> + dplyr::filter(Hash %in% unique(TAB$SeqID)) |> + setDT() + +setnames(QLT, + old = c("Hash", "Length", "AvgPhredScore"), + new = c("SeqID", "SeqLen", "PhredScore")) + +## Quality data: +# old header: c("SampleID", "SeqID", "SeqLen", "PhredScore", "MaxEE", "MEEP") +# new header: c("SampleID", "Hash", "PacBioID", "PhredScore", "MaxEE", "MEEP", "Sequence", "Quality", "Length") + + +## Create SeqID___SampleID column +TAB[, SeqID___SampleID := paste0(SeqID, "___", SampleID) ] +# QLT[, SeqID___SampleID := paste0(SeqID, "___", SampleID) ] 
+ + +###################################### +###################################### Add quality scores, for non-singleton use max score +###################################### + +cat("\n\n..Adding quality scores\n") + +cat("...Prepareing quality scores\n") +setorder(QLT, SeqID, -PhredScore) +QLT <- QLT[QLT[, .I[which.max(PhredScore)], by=SeqID]$V1] + +cat("...Adding data to the main table\n") +if(any(! TAB$SeqID %in% QLT$SeqID)){ + cat("WARNING: Some sequences are not present in the quality table\n") +} + +TAB <- merge(x = TAB, y = QLT, by = "SeqID", all.x = TRUE) + +# with(TAB, plot(Abundance, PhredScore)) + + +###################################### +###################################### Add chimera info +###################################### + +cat("..Adding info about de novo chimeric sequences\n") + +if(nrow(CHI) > 0){ + + TAB <- merge(x = TAB, y = CHI, + by = c("SeqID", "SampleID"), all.x = TRUE) + + ## Convert variables to numeric scores + TAB[ , DeNovo_Chimera_Score := as.numeric(DeNovo_Chimera_Score) ] + + ## Classify sequences into putative chimeras + TAB[ !is.na(DeNovo_Chimera_Score), DeNovo_Chimera := TRUE ] + TAB[ is.na(DeNovo_Chimera_Score), DeNovo_Chimera := FALSE ] + + cat("... ", sum( TAB$DeNovo_Chimera), " putative de novo chimeras found\n") + cat("... ", sum(!TAB$DeNovo_Chimera), " non-chimeric sequences\n") + +} else { + + ## No de novo chimeras + + TAB[ , DeNovo_Chimera_Score := as.numeric(NA) ] + TAB[ , DeNovo_Chimera := FALSE ] + + cat("... ", 0, " putative de novo chimeras found\n") + cat("... 
", nrow(TAB), " non-chimeric sequences\n") + +} + + +###################################### +###################################### Add sequences +###################################### + +cat("\n\n..Processing sequences\n") + +SQTAB <- data.table( + SeqHeader = names(SQS), + Sequence = as.character(SQS)) + +## Split the header (`feb76b9;size=1;sample=ABCD;` ) +SQTAB[ , c("SeqID", "SampleID") := tstrsplit(x = SeqHeader, split = ";", keep = c(1,2)) ] +SQTAB[ , SeqHeader := NULL ] +SQTAB[ , SampleID := gsub(pattern = "sample=", replacement = "", x = SampleID) ] + + +SQTAB[ , SeqID___SampleID := paste0(SeqID, "___", SampleID) ] +SQTAB[ , c("SeqID", "SampleID") := NULL ] + +cat("..Adding sequences to the main table\n") + +TAB <- merge(x = TAB, y = SQTAB, + by = c("SeqID___SampleID"), all.x = TRUE) + + +cat("..Sorting table by abundance, quality score, and SampleID\n") + +setorder(x = TAB, -Abundance, -PhredScore, SampleID) + + +cat("..Preparing FASTA file with filtered sequences\n") + +SQF <- DNAStringSet(x = TAB$Sequence) +names(SQF) <- paste0(TAB$SeqID, ";size=", TAB$Abundance, ";sample=", TAB$SampleID, ";") + +## Export FASTA +cat("..Exporting FASTA file with filtered sequences\n") + +writeXStringSet(x = SQF, + filepath = "Seqs.fa.gz", + compress = TRUE, format = "fasta", width = 9999) + + + + +###################################### +###################################### Export results +###################################### + +# cat("..Reshaping sequence table into wide format\n") +# +# TABW <- dcast(data = TAB, +# formula = SeqID ~ SampleID, +# value.var = "Abundance", +# fill = 0) + + +cat("..Exporting result\n") + +setcolorder( + x = TAB, + neworder = c( + "SeqID___SampleID", "SampleID", "SeqID", + "Abundance", "SeqLen", "PhredScore", "MaxEE", "MEEP", + "DeNovo_Chimera", "DeNovo_Chimera_Score", + "Sequence")) + +# cat("...Exporting RData\n") +# saveRDS(object = TAB, file = "Seqs.RData", compress = "xz") + +cat("...Exporting Parquet\n") + +write_parquet( + x = 
TAB, + sink = "Seqs.parquet", + compression = "zstd", + compression_level = 10, + use_dictionary = TRUE) + + +## Long table +cat("...Exporting long table\n") + +TAB[ , SeqID___SampleID := NULL ] + +fwrite(x = TAB, file = "Seqs.txt.gz", sep = "\t", compress = "gzip") + +## Wide table +# cat("...Exporting wide table\n") + +# fwrite(x = TABW, file = "Seq_tab.txt.gz", sep = "\t", compress = "gzip") + + +cat("All done.") diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/substitute_compressed_seqs.R b/src/pipecraft-core/service_scripts/NextITS/bin/substitute_compressed_seqs.R index 6a60fc4..0f66d71 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/substitute_compressed_seqs.R +++ b/src/pipecraft-core/service_scripts/NextITS/bin/substitute_compressed_seqs.R @@ -1,67 +1,67 @@ -#!/usr/bin/env Rscript - -## Script to replace homopolymer-comressed sequences with non-compressed seqs -## + update size annotation - -# Input is given as positional arguments: -# 1. Uncompressed sequences (`inp_tab.txt`) -# 2. Homopolymer-compressed sequences (`clust_tab.txt`) -# 3. 
Name of the output FASTA file (`res.fa`) - -suppressMessages(library(data.table)); setDTthreads(threads = 1) -suppressMessages(library(Biostrings)) - -args <- commandArgs(trailingOnly = TRUE) - -## Load data - Uncompressed (inp_tab.txt) -cat("..Loading original sequences\n") -d1 <- fread(file = args[1], - header=FALSE, sep = "\t", quote = F, col.names = c("SeqID", "Seq_OK"), selec = 1:2) - -## Load data - Compressed (clust_tab.txt) -cat("..Loading compressed sequences\n") -d2 <- fread(file = args[2], - header=FALSE, sep = "\t", quote = F, col.names = c("SeqID", "Seq_Compr"), selec = 1:2) - -cat("...Number of raw sequences: ", nrow(d1), "\n") -cat("...Number of compressed sequences: ", nrow(d2), "\n") - -cat("..Processing data\n") - -## Remove multiple separators -d1[, SeqID := gsub(pattern = ";;", replacement = ";", x = SeqID)] -d2[, SeqID := gsub(pattern = ";;", replacement = ";", x = SeqID)] - -## Split seq ID -d1[, c("Hash", "Size") := tstrsplit(SeqID, ";", keep=1:2)] -d2[, c("Hash", "Size") := tstrsplit(SeqID, ";", keep=1:2)] - -## Drop seq ID -d1[, SeqID := NULL ] -d2[, SeqID := NULL ] - -## Replace seqs -res <- merge( - x = d2[, .(Hash, Size)], - y = d1[, .(Seq_OK, Hash)], - by = "Hash", all.x = TRUE) - -res[, SeqID := do.call(paste, c(.SD, sep = ";")), .SDcols = c("Hash", "Size")] - -## Verify the number of reads - should be the same -# sum(as.numeric(gsub(pattern = "size=", replacement = "", x = d1$Size))) -# sum(as.numeric(gsub(pattern = "size=", replacement = "", x = d2$Size))) -# sum(as.numeric(gsub(pattern = "size=", replacement = "", x = res$Size))) - -cat("...Total number of reads: ", - sum(as.numeric(gsub(pattern = "size=", replacement = "", x = res$Size))), - "\n") - -## Prepare sequences -cat("..Exporting results\n") -sqs <- DNAStringSet(x = res$Seq_OK) -names(sqs) <- res$SeqID - -## Export FASTA -writeXStringSet(x = sqs, filepath = args[3], - compress=FALSE, format="fasta", width=9999) +#!/usr/bin/env Rscript + +## Script to replace 
homopolymer-comressed sequences with non-compressed seqs +## + update size annotation + +# Input is given as positional arguments: +# 1. Uncompressed sequences (`inp_tab.txt`) +# 2. Homopolymer-compressed sequences (`clust_tab.txt`) +# 3. Name of the output FASTA file (`res.fa`) + +suppressMessages(library(data.table)); setDTthreads(threads = 1) +suppressMessages(library(Biostrings)) + +args <- commandArgs(trailingOnly = TRUE) + +## Load data - Uncompressed (inp_tab.txt) +cat("..Loading original sequences\n") +d1 <- fread(file = args[1], + header=FALSE, sep = "\t", quote = F, col.names = c("SeqID", "Seq_OK"), selec = 1:2) + +## Load data - Compressed (clust_tab.txt) +cat("..Loading compressed sequences\n") +d2 <- fread(file = args[2], + header=FALSE, sep = "\t", quote = F, col.names = c("SeqID", "Seq_Compr"), selec = 1:2) + +cat("...Number of raw sequences: ", nrow(d1), "\n") +cat("...Number of compressed sequences: ", nrow(d2), "\n") + +cat("..Processing data\n") + +## Remove multiple separators +d1[, SeqID := gsub(pattern = ";;", replacement = ";", x = SeqID)] +d2[, SeqID := gsub(pattern = ";;", replacement = ";", x = SeqID)] + +## Split seq ID +d1[, c("Hash", "Size") := tstrsplit(SeqID, ";", keep=1:2)] +d2[, c("Hash", "Size") := tstrsplit(SeqID, ";", keep=1:2)] + +## Drop seq ID +d1[, SeqID := NULL ] +d2[, SeqID := NULL ] + +## Replace seqs +res <- merge( + x = d2[, .(Hash, Size)], + y = d1[, .(Seq_OK, Hash)], + by = "Hash", all.x = TRUE) + +res[, SeqID := do.call(paste, c(.SD, sep = ";")), .SDcols = c("Hash", "Size")] + +## Verify the number of reads - should be the same +# sum(as.numeric(gsub(pattern = "size=", replacement = "", x = d1$Size))) +# sum(as.numeric(gsub(pattern = "size=", replacement = "", x = d2$Size))) +# sum(as.numeric(gsub(pattern = "size=", replacement = "", x = res$Size))) + +cat("...Total number of reads: ", + sum(as.numeric(gsub(pattern = "size=", replacement = "", x = res$Size))), + "\n") + +## Prepare sequences +cat("..Exporting 
results\n") +sqs <- DNAStringSet(x = res$Seq_OK) +names(sqs) <- res$SeqID + +## Export FASTA +writeXStringSet(x = sqs, filepath = args[3], + compress=FALSE, format="fasta", width=9999) diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/summarize_clustered_data.R b/src/pipecraft-core/service_scripts/NextITS/bin/summarize_clustered_data.R index b2c43dc..c6e0b6f 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/summarize_clustered_data.R +++ b/src/pipecraft-core/service_scripts/NextITS/bin/summarize_clustered_data.R @@ -1,400 +1,400 @@ -#!/usr/bin/env Rscript - -## Script to pool remove low-quality singletons and summarize sequence abundance at OTU level (per sample) - -# Input: -# 1. Sequence tables in long format with de novo chimeras removed (`Seqs.parquet`) -# 2. UC file (`UC_Pooled.parquet`) -# 3. FASTA file with OTU sequences (`Clustered.fa.gz`) -# 4. Max MEEP score - -# Outputs: -# - OTU table in long format (`OTU_table_long.txt.gz` & `OTU_table_long.RData`) -# - OTU table in wide format (`OTU_table_wide.txt.gz` & `OTU_table_wide.RData`) -# - FASTA file with sequences (`OTUs.fa.gz`) - -## Usage: -# ./summarize_clustered_data.R \ -# --seqtab "Seqs.parquet" \ -# --uc "UC_Pooled.parquet" \ -# --otus "Clustered.fa.gz" \ -# --maxmeep 0.6 \ -# --recoversinglet TRUE \ -# --mergesamples TRUE \ -# --threads 4 - - -## Quality threshold: -# MEEP score of 0.6 corresponds approximately to the average Phred score of 22.2 - -## Singleton recovery: -# If enabled, then singleton OTUs with MEEP score <= 0.6 & will be preserved -# Otherwise, singleton OTUs will be removed - - -############################################## Parse input parameters - -## Check time -start_time <- Sys.time() - - -cat("Parsing input options and arguments:\n") - -suppressPackageStartupMessages(require(optparse)) - -## Parse arguments -option_list <- list( - make_option("--seqtab", action="store", default=NA, type='character', help="Sequence tables in long format with de novo chimeras 
removed (Parquet format)"), - make_option("--uc", action="store", default=NA, type='character', help="UC file (Parquet format)"), - make_option("--otus", action="store", default=NA, type='character', help="FASTA file with OTU sequences"), - make_option("--maxmeep", action="store", default=0.5, type='double', help="Max MEEP score"), - make_option("--recoversinglet", action="store", default=TRUE, type='logical', help="Recover singletons"), - make_option(c("-m", "--mergesamples"), action="store", default=FALSE, type='logical', help="Merge sample replicates (default, false)"), - make_option(c("-t", "--threads"), action="store", default=4L, type='integer', help="Number of CPU threads for arrow, default 4") - # make_option(c("-s", "--scriptdir"), action="store", default=getwd(), type='character', help="Directory containing source scripts") -) -opt <- parse_args(OptionParser(option_list=option_list)) - -## Function to convert text "NA"s to NA -to_na <- function(x){ - if(x %in% c("NA", "null", "Null")){ x <- NA } - return(x) -} - -## Replaces "null"s from Nextflow with NA -opt <- lapply(X = opt, FUN = to_na) - - -## Validation of the required argiments -if(is.na(opt$seqtab)){ - cat("Input file is not specified: sequence tables in Parquet format.\n", file=stderr()) - stop() -} -if(is.na(opt$uc)){ - cat("Input file is not specified: UC file is required.\n", file=stderr()) - stop() -} -if(is.na(opt$otus)){ - cat("Input file is not specified: FASTA file with OTU sequences.\n", file=stderr()) - stop() -} -if(opt$recoversinglet == TRUE && is.na(opt$maxmeep)){ - cat("For singleton recovery, the max MEEP score must be specified.\n", file=stderr()) - stop() -} - -## Assign variables -SEQTAB <- opt$seqtab -UCF <- opt$uc -MAXMEEP <- as.numeric( opt$maxmeep ) -RECOV_SINGLET <- as.logical(opt$recoversinglet) -MERGE_SAMPLES <- as.logical(opt$mergesamples) -OTUS <- opt$otus - -CPUTHREADS <- as.numeric( opt$threads ) -# SCRIPTDIR <- opt$scriptdir - -## Log assigned variables 
-cat(paste("Sequence tables (Parquet format): ", SEQTAB, "\n", sep="")) -cat(paste("UC file (Parquet format): ", UCF, "\n", sep="")) -cat(paste("Max MEEP score: ", MAXMEEP, "\n", sep="")) -cat(paste("Low-quality singleton recovery: ", RECOV_SINGLET, "\n", sep="")) -cat(paste("Merge sample replicates: ", MERGE_SAMPLES, "\n", sep="")) -cat(paste("OTU sequences: ", OTUS, "\n", sep="")) -cat(paste("Number of CPU threads to use: ", CPUTHREADS, "\n", sep="")) -# cat(paste("Directory containing source scripts: ", SCRIPTDIR, "\n", sep="")) - -cat("\n") - - - -############################################## Data for debugging - -# SEQTAB <- "Seqs.parquet" -# UCF <- "UC_Pooled.parquet" -# MAXMEEP <- 0.5 -# RECOV_SINGLET <- TRUE -# MERGE_SAMPLES <- TRUE -# OTUS <- "Clustered.fa.gz" -# CPUTHREADS <- 4 - - -############################################## Load packages and data - -cat("Loading R packages:\n") - -load_pckg <- function(pkg = "data.table"){ - suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) - cat(paste(pkg, packageVersion(pkg), "\n")) -} - -load_pckg("data.table") -load_pckg("plyr") -load_pckg("metagMisc") -load_pckg("Biostrings") -load_pckg("arrow") - - -cat("\n") - - -# cat("Loading additional R funcitons...\n") -# source(file.path(SCRIPTDIR, "R_functions.R")) -# cat("\n") - - -## Set CPU thread number -cat("Setting number of CPU threads to: ", CPUTHREADS, "\n") -setDTthreads(threads = CPUTHREADS) # for data.table -set_cpu_count(CPUTHREADS) # for arrow - -###################################### -###################################### Load the data -###################################### - -## Load sequence tables -cat("\n..Loading sequence tables\n") -TAB <- arrow::read_parquet(SEQTAB) -setDT(TAB) -cat("... Total number of records: ", nrow(TAB), "\n") -cat("... Total number unique sequences: ", length(unique(TAB$Sequence)), "\n") -cat("... 
Total number unique samples (files): ", length(unique(TAB$SampleID)), "\n") - -## Load UC file for globally dereplicated sequences -cat("..Loading pooled UC file\n") -UC <- open_dataset(UCF) |> dplyr::collect() |> setDT() - -## Add OTU IDs to seq table -cat("... Adding OTU IDs to sequence table\n") -cat(".... Number of records in sequence table before merging: ", nrow(TAB), "\n") -TAB <- merge(x = TAB, y = UC, by = "SeqID", all.x = TRUE) -cat(".... Number of records in sequence table after merging: ", nrow(TAB), "\n") - -## Remove NA OTUs -- probably excluded seqs -if(any(is.na(TAB$OTU))){ - cat("WARNING: not all sequences were assigned to OTUs\n") - cat("..Removing missing/excluded sequences\n") - cat(".. ", sum(is.na(TAB$OTU)), " sequences with total abundance ", - sum(TAB[ is.na(OTU) ]$Abundance, na.rm = TRUE), " reads will be excluded\n") - TAB <- TAB[ ! is.na(OTU) ] -} - - -## Find singleton OTUs -cat("\n..Finding singleton OTUs\n") -SINGLETONS <- TAB[ , .(Abundance = sum(Abundance, na.rm = TRUE)), by = .(OTU) ][ Abundance < 2 ] -cat("... Number of singleton OTUs: ", nrow(SINGLETONS), "\n") - -## If singleton recovery is reqired -if(RECOV_SINGLET == TRUE && nrow(SINGLETONS) > 0){ - - ## Add quality scores - SINGLETONS <- merge(x = SINGLETONS, y = TAB[ , .(SeqID, MEEP)], by.x = "OTU", by.y = "SeqID", all.x = TRUE) - - ## Filter by MEEP score - SINGLETONS <- SINGLETONS[ MEEP > MAXMEEP ] - cat("... Number of singleton OTUs after filtering by MEEP score: ", nrow(SINGLETONS), "\n") - -} - -if(nrow(SINGLETONS) > 0){ - cat("..Removing singleton OTUs\n") - TAB <- TAB[ ! OTU %in% SINGLETONS$OTU ] - cat("... Number of records in sequence table after removing singleton OTUs: ", nrow(TAB), "\n") -} - - -## Summarize abundance by sample and OTU -cat("\n..Summarizing OTU abundance\n") -if(MERGE_SAMPLES == TRUE){ - - cat("\n... Merging sample replicates (e.g., re-sequenced samples)\n") - - ## Extract sample names - cat(".... 
Extracting sample names\n") - TAB[ , SampleName := tstrsplit(x = SampleID, split = "__", keep = 2) ] - - cat(".... Summarizing abundance by sample and OTU\n") - RES <- TAB[ , - .( Abundance = sum(Abundance, na.rm = TRUE) ), - by = c("OTU", "SampleName") ] - - setnames(x = RES, old = "SampleName", new = "SampleID") - -} else { - - cat("... Summarizing abundance by sample and OTU\n") - RES <- TAB[ , - .( Abundance = sum(Abundance, na.rm = TRUE) ), - by = c("OTU", "SampleID") ] - -} - -#### Reshape to wide table -cat("\nReshaping table into wide format\n") - -## Check if we can reshape the table in a single pass -n_otu <- length(unique(RES$OTU)) -n_smp <- length(unique(RES$SampleID)) -n_cll <- as.numeric(n_otu) * as.numeric(n_smp) -cat("...In total, there are ", n_otu, " OTUs and ", n_smp, " samples\n") -cat("...The total number of cells in the wide table will be ", n_cll, "\n") - -## Reshape data in one pass -if(n_cll < 50000000){ - REW <- dcast(data = RES, - formula = OTU ~ SampleID, - fun.aggregate = sum, fill = 0, value.var = "Abundance") -} else { -## Split data into chunks, reshape, and merge back - - cat("..The input table is too large to reshape in a single pass, reshaping by chunks\n") - - ## Function to split vector into N chunks - chunk <- function(x, n){ - if(n > 1) { res <- split(x, cut(seq_along(x), n, labels = FALSE)) } - if(n == 1){ res <- list(); res[[1]] <- x } - return(res) - } - - ## Choose the number of chunks - n_chunks <- data.table::fcase( - n_cll < 9e7, 2L, - n_cll >= 9e7 & n_cll < 5e8, 5L, - n_cll >= 5e8 & n_cll < 5e9, 6L, - n_cll >= 5e9 & n_cll < 5e10, 7L, - n_cll >= 5e10, 8L) - - cat("...The number of chunks to process: , ", n_chunks, "\n") - - ch <- chunk(x = sort(unique(RES$SampleID)), n = n_chunks) - - ## Chunk-and-reshape loop - REWL <- plyr::llply( - .data = ch, - .fun = function(x){ - - ## Reshape to wide - res <- dcast( - data = RES[ SampleID %in% x , ], - formula = OTU ~ SampleID, - fill = 0, fun.aggregate = sum, value.var = 
"Abundance") - - ## Create key on a data.table (should improve merging speed) - setkey(res, OTU) - - return(res) - }, - .progress = "text") - - cat("...Chunk reshaping finished\n") - cat("..Merging data into a single wide table\n") - - ## Merge chunks into a single wide table - merge_dt <- function(x,y){ data.table::merge.data.table(x, y, by = "OTU", all = TRUE) } - REW <- Reduce(f = merge_dt, x = REWL) - cat("...Merging finished\n") - - ## Clean up - cat("...Cleaning up\n") - rm(REWL); gc() - - ## Replace NAs with zeros - cat("...Filling missing values with zeros\n") - for (j in seq_len(ncol(REW))){ - set(REW, which(is.na(REW[[j]])), j, 0) - } - -} ## end of reshaping - -cat("...Reshaping to the wide format done!\n") - - -## Reorder OTU rows -cat("\n..Reordering OTU rows by total abundance\n") -otu_tots <- rowSums(REW[, -1], na.rm = TRUE) -REW <- REW[ order(otu_tots, decreasing = T), ] - -## Add attributes if samples were merged -setattr(x = RES, name = "Samples_merged", value = MERGE_SAMPLES) -setattr(x = REW, name = "Samples_merged", value = MERGE_SAMPLES) - - -cat("\nExporting results\n") - -## Export data -saveRDS.gz <- function(object, file, threads = parallel::detectCores()) { - con <- pipe(paste0("pigz -p",threads," > ",file),"wb") - saveRDS(object, file = con) - close(con) -} - -cat("..Exporting long table [R]\n") -saveRDS.gz(object = RES, - file = "OTU_table_long.RData", - threads = CPUTHREADS) - -cat("..Exporting wide table [R]\n") -saveRDS.gz(object = REW, - file = "OTU_table_wide.RData", - threads = CPUTHREADS) - -cat("..Exporting long table [tab-delimited]\n") -fwrite(x = RES, file = "OTU_table_long.txt.gz", sep = "\t", compress = "gzip") - -cat("..Exporting wide table [tab-delimited]\n") -fwrite(x = REW, file = "OTU_table_wide.txt.gz", sep = "\t", compress = "gzip") - - -cat("\nExporting OTU sequences to FASTA\n") - -cat("..Preparing sequences\n") - -## Take sequences from the data (NB! 
there are a several different sequence per OTU) -# SQS <- unique(RES[, .(OTU) ]) -# tmp_OTUs <- unique(TAB[ OTU %in% SQS$OTU & SeqID == OTU , .(OTU, Sequence) ]) -# SQS <- merge(x = SQS, y = tmp_OTUs, by = "OTU", all.x = TRUE) -# rm(tmp_OTUs) -# -# cat("...Preparing XStringSet object\n") -# SQF <- DNAStringSet(x = SQS$Sequence) -# names(SQF) <- SQS$OTU - -## Take sequnces from the OTU file -cat("... Loading FASTA file\n") -SQS <- readDNAStringSet(filepath = OTUS, format="fasta") -cat("... Extracting sequence IDs\n") -names(SQS) <- tstrsplit(x = names(SQS), split = ";", keep = 1)[[1]] - -if(any(duplicated(names(SQS)))){ - cat("WARNING: duplicated OTU names detected!\n") -} - -cat("... Subsetting OTUs\n") -SQF <- SQS[ names(SQS) %in% unique(REW$OTU) ] - -cat("....Total number of OTUs in input sequences: ", length(SQS), "\n") -cat("....Number of OTUs to export: ", length(SQF), "\n") -cat("....Number of OTUs in the OTU table: ", nrow(REW), "\n") - -cat("... Writing FASTA file\n") -writeXStringSet(x = SQF, - filepath = "OTUs.fa.gz", - compress = TRUE, format = "fasta", width = 9999) - - -cat("\nAll done.\n") - - -##################### Session info - -## Check time -end_time <- Sys.time() - -tmm <- as.numeric(difftime(end_time, start_time, units = "min")) -cat("\nElapsed time: ", tmm, " minutes\n") - -cat("\n") -cat("Session info:\n") -sessionInfo() -cat("\n") +#!/usr/bin/env Rscript + +## Script to pool remove low-quality singletons and summarize sequence abundance at OTU level (per sample) + +# Input: +# 1. Sequence tables in long format with de novo chimeras removed (`Seqs.parquet`) +# 2. UC file (`UC_Pooled.parquet`) +# 3. FASTA file with OTU sequences (`Clustered.fa.gz`) +# 4. 
Max MEEP score + +# Outputs: +# - OTU table in long format (`OTU_table_long.txt.gz` & `OTU_table_long.RData`) +# - OTU table in wide format (`OTU_table_wide.txt.gz` & `OTU_table_wide.RData`) +# - FASTA file with sequences (`OTUs.fa.gz`) + +## Usage: +# ./summarize_clustered_data.R \ +# --seqtab "Seqs.parquet" \ +# --uc "UC_Pooled.parquet" \ +# --otus "Clustered.fa.gz" \ +# --maxmeep 0.6 \ +# --recoversinglet TRUE \ +# --mergesamples TRUE \ +# --threads 4 + + +## Quality threshold: +# MEEP score of 0.6 corresponds approximately to the average Phred score of 22.2 + +## Singleton recovery: +# If enabled, then singleton OTUs with MEEP score <= 0.6 & will be preserved +# Otherwise, singleton OTUs will be removed + + +############################################## Parse input parameters + +## Check time +start_time <- Sys.time() + + +cat("Parsing input options and arguments:\n") + +suppressPackageStartupMessages(require(optparse)) + +## Parse arguments +option_list <- list( + make_option("--seqtab", action="store", default=NA, type='character', help="Sequence tables in long format with de novo chimeras removed (Parquet format)"), + make_option("--uc", action="store", default=NA, type='character', help="UC file (Parquet format)"), + make_option("--otus", action="store", default=NA, type='character', help="FASTA file with OTU sequences"), + make_option("--maxmeep", action="store", default=0.5, type='double', help="Max MEEP score"), + make_option("--recoversinglet", action="store", default=TRUE, type='logical', help="Recover singletons"), + make_option(c("-m", "--mergesamples"), action="store", default=FALSE, type='logical', help="Merge sample replicates (default, false)"), + make_option(c("-t", "--threads"), action="store", default=4L, type='integer', help="Number of CPU threads for arrow, default 4") + # make_option(c("-s", "--scriptdir"), action="store", default=getwd(), type='character', help="Directory containing source scripts") +) +opt <- 
parse_args(OptionParser(option_list=option_list)) + +## Function to convert text "NA"s to NA +to_na <- function(x){ + if(x %in% c("NA", "null", "Null")){ x <- NA } + return(x) +} + +## Replaces "null"s from Nextflow with NA +opt <- lapply(X = opt, FUN = to_na) + + +## Validation of the required argiments +if(is.na(opt$seqtab)){ + cat("Input file is not specified: sequence tables in Parquet format.\n", file=stderr()) + stop() +} +if(is.na(opt$uc)){ + cat("Input file is not specified: UC file is required.\n", file=stderr()) + stop() +} +if(is.na(opt$otus)){ + cat("Input file is not specified: FASTA file with OTU sequences.\n", file=stderr()) + stop() +} +if(opt$recoversinglet == TRUE && is.na(opt$maxmeep)){ + cat("For singleton recovery, the max MEEP score must be specified.\n", file=stderr()) + stop() +} + +## Assign variables +SEQTAB <- opt$seqtab +UCF <- opt$uc +MAXMEEP <- as.numeric( opt$maxmeep ) +RECOV_SINGLET <- as.logical(opt$recoversinglet) +MERGE_SAMPLES <- as.logical(opt$mergesamples) +OTUS <- opt$otus + +CPUTHREADS <- as.numeric( opt$threads ) +# SCRIPTDIR <- opt$scriptdir + +## Log assigned variables +cat(paste("Sequence tables (Parquet format): ", SEQTAB, "\n", sep="")) +cat(paste("UC file (Parquet format): ", UCF, "\n", sep="")) +cat(paste("Max MEEP score: ", MAXMEEP, "\n", sep="")) +cat(paste("Low-quality singleton recovery: ", RECOV_SINGLET, "\n", sep="")) +cat(paste("Merge sample replicates: ", MERGE_SAMPLES, "\n", sep="")) +cat(paste("OTU sequences: ", OTUS, "\n", sep="")) +cat(paste("Number of CPU threads to use: ", CPUTHREADS, "\n", sep="")) +# cat(paste("Directory containing source scripts: ", SCRIPTDIR, "\n", sep="")) + +cat("\n") + + + +############################################## Data for debugging + +# SEQTAB <- "Seqs.parquet" +# UCF <- "UC_Pooled.parquet" +# MAXMEEP <- 0.5 +# RECOV_SINGLET <- TRUE +# MERGE_SAMPLES <- TRUE +# OTUS <- "Clustered.fa.gz" +# CPUTHREADS <- 4 + + +############################################## Load packages and 
data + +cat("Loading R packages:\n") + +load_pckg <- function(pkg = "data.table"){ + suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) + cat(paste(pkg, packageVersion(pkg), "\n")) +} + +load_pckg("data.table") +load_pckg("plyr") +load_pckg("metagMisc") +load_pckg("Biostrings") +load_pckg("arrow") + + +cat("\n") + + +# cat("Loading additional R funcitons...\n") +# source(file.path(SCRIPTDIR, "R_functions.R")) +# cat("\n") + + +## Set CPU thread number +cat("Setting number of CPU threads to: ", CPUTHREADS, "\n") +setDTthreads(threads = CPUTHREADS) # for data.table +set_cpu_count(CPUTHREADS) # for arrow + +###################################### +###################################### Load the data +###################################### + +## Load sequence tables +cat("\n..Loading sequence tables\n") +TAB <- arrow::read_parquet(SEQTAB) +setDT(TAB) +cat("... Total number of records: ", nrow(TAB), "\n") +cat("... Total number unique sequences: ", length(unique(TAB$Sequence)), "\n") +cat("... Total number unique samples (files): ", length(unique(TAB$SampleID)), "\n") + +## Load UC file for globally dereplicated sequences +cat("..Loading pooled UC file\n") +UC <- open_dataset(UCF) |> dplyr::collect() |> setDT() + +## Add OTU IDs to seq table +cat("... Adding OTU IDs to sequence table\n") +cat(".... Number of records in sequence table before merging: ", nrow(TAB), "\n") +TAB <- merge(x = TAB, y = UC, by = "SeqID", all.x = TRUE) +cat(".... Number of records in sequence table after merging: ", nrow(TAB), "\n") + +## Remove NA OTUs -- probably excluded seqs +if(any(is.na(TAB$OTU))){ + cat("WARNING: not all sequences were assigned to OTUs\n") + cat("..Removing missing/excluded sequences\n") + cat(".. ", sum(is.na(TAB$OTU)), " sequences with total abundance ", + sum(TAB[ is.na(OTU) ]$Abundance, na.rm = TRUE), " reads will be excluded\n") + TAB <- TAB[ ! 
is.na(OTU) ] +} + + +## Find singleton OTUs +cat("\n..Finding singleton OTUs\n") +SINGLETONS <- TAB[ , .(Abundance = sum(Abundance, na.rm = TRUE)), by = .(OTU) ][ Abundance < 2 ] +cat("... Number of singleton OTUs: ", nrow(SINGLETONS), "\n") + +## If singleton recovery is reqired +if(RECOV_SINGLET == TRUE && nrow(SINGLETONS) > 0){ + + ## Add quality scores + SINGLETONS <- merge(x = SINGLETONS, y = TAB[ , .(SeqID, MEEP)], by.x = "OTU", by.y = "SeqID", all.x = TRUE) + + ## Filter by MEEP score + SINGLETONS <- SINGLETONS[ MEEP > MAXMEEP ] + cat("... Number of singleton OTUs after filtering by MEEP score: ", nrow(SINGLETONS), "\n") + +} + +if(nrow(SINGLETONS) > 0){ + cat("..Removing singleton OTUs\n") + TAB <- TAB[ ! OTU %in% SINGLETONS$OTU ] + cat("... Number of records in sequence table after removing singleton OTUs: ", nrow(TAB), "\n") +} + + +## Summarize abundance by sample and OTU +cat("\n..Summarizing OTU abundance\n") +if(MERGE_SAMPLES == TRUE){ + + cat("\n... Merging sample replicates (e.g., re-sequenced samples)\n") + + ## Extract sample names + cat(".... Extracting sample names\n") + TAB[ , SampleName := tstrsplit(x = SampleID, split = "__", keep = 2) ] + + cat(".... Summarizing abundance by sample and OTU\n") + RES <- TAB[ , + .( Abundance = sum(Abundance, na.rm = TRUE) ), + by = c("OTU", "SampleName") ] + + setnames(x = RES, old = "SampleName", new = "SampleID") + +} else { + + cat("... 
Summarizing abundance by sample and OTU\n") + RES <- TAB[ , + .( Abundance = sum(Abundance, na.rm = TRUE) ), + by = c("OTU", "SampleID") ] + +} + +#### Reshape to wide table +cat("\nReshaping table into wide format\n") + +## Check if we can reshape the table in a single pass +n_otu <- length(unique(RES$OTU)) +n_smp <- length(unique(RES$SampleID)) +n_cll <- as.numeric(n_otu) * as.numeric(n_smp) +cat("...In total, there are ", n_otu, " OTUs and ", n_smp, " samples\n") +cat("...The total number of cells in the wide table will be ", n_cll, "\n") + +## Reshape data in one pass +if(n_cll < 50000000){ + REW <- dcast(data = RES, + formula = OTU ~ SampleID, + fun.aggregate = sum, fill = 0, value.var = "Abundance") +} else { +## Split data into chunks, reshape, and merge back + + cat("..The input table is too large to reshape in a single pass, reshaping by chunks\n") + + ## Function to split vector into N chunks + chunk <- function(x, n){ + if(n > 1) { res <- split(x, cut(seq_along(x), n, labels = FALSE)) } + if(n == 1){ res <- list(); res[[1]] <- x } + return(res) + } + + ## Choose the number of chunks + n_chunks <- data.table::fcase( + n_cll < 9e7, 2L, + n_cll >= 9e7 & n_cll < 5e8, 5L, + n_cll >= 5e8 & n_cll < 5e9, 6L, + n_cll >= 5e9 & n_cll < 5e10, 7L, + n_cll >= 5e10, 8L) + + cat("...The number of chunks to process: , ", n_chunks, "\n") + + ch <- chunk(x = sort(unique(RES$SampleID)), n = n_chunks) + + ## Chunk-and-reshape loop + REWL <- plyr::llply( + .data = ch, + .fun = function(x){ + + ## Reshape to wide + res <- dcast( + data = RES[ SampleID %in% x , ], + formula = OTU ~ SampleID, + fill = 0, fun.aggregate = sum, value.var = "Abundance") + + ## Create key on a data.table (should improve merging speed) + setkey(res, OTU) + + return(res) + }, + .progress = "text") + + cat("...Chunk reshaping finished\n") + cat("..Merging data into a single wide table\n") + + ## Merge chunks into a single wide table + merge_dt <- function(x,y){ data.table::merge.data.table(x, y, by = 
"OTU", all = TRUE) } + REW <- Reduce(f = merge_dt, x = REWL) + cat("...Merging finished\n") + + ## Clean up + cat("...Cleaning up\n") + rm(REWL); gc() + + ## Replace NAs with zeros + cat("...Filling missing values with zeros\n") + for (j in seq_len(ncol(REW))){ + set(REW, which(is.na(REW[[j]])), j, 0) + } + +} ## end of reshaping + +cat("...Reshaping to the wide format done!\n") + + +## Reorder OTU rows +cat("\n..Reordering OTU rows by total abundance\n") +otu_tots <- rowSums(REW[, -1], na.rm = TRUE) +REW <- REW[ order(otu_tots, decreasing = T), ] + +## Add attributes if samples were merged +setattr(x = RES, name = "Samples_merged", value = MERGE_SAMPLES) +setattr(x = REW, name = "Samples_merged", value = MERGE_SAMPLES) + + +cat("\nExporting results\n") + +## Export data +saveRDS.gz <- function(object, file, threads = parallel::detectCores()) { + con <- pipe(paste0("pigz -p",threads," > ",file),"wb") + saveRDS(object, file = con) + close(con) +} + +cat("..Exporting long table [R]\n") +saveRDS.gz(object = RES, + file = "OTU_table_long.RData", + threads = CPUTHREADS) + +cat("..Exporting wide table [R]\n") +saveRDS.gz(object = REW, + file = "OTU_table_wide.RData", + threads = CPUTHREADS) + +cat("..Exporting long table [tab-delimited]\n") +fwrite(x = RES, file = "OTU_table_long.txt.gz", sep = "\t", compress = "gzip") + +cat("..Exporting wide table [tab-delimited]\n") +fwrite(x = REW, file = "OTU_table_wide.txt.gz", sep = "\t", compress = "gzip") + + +cat("\nExporting OTU sequences to FASTA\n") + +cat("..Preparing sequences\n") + +## Take sequences from the data (NB! there are a several different sequence per OTU) +# SQS <- unique(RES[, .(OTU) ]) +# tmp_OTUs <- unique(TAB[ OTU %in% SQS$OTU & SeqID == OTU , .(OTU, Sequence) ]) +# SQS <- merge(x = SQS, y = tmp_OTUs, by = "OTU", all.x = TRUE) +# rm(tmp_OTUs) +# +# cat("...Preparing XStringSet object\n") +# SQF <- DNAStringSet(x = SQS$Sequence) +# names(SQF) <- SQS$OTU + +## Take sequnces from the OTU file +cat("... 
Loading FASTA file\n") +SQS <- readDNAStringSet(filepath = OTUS, format="fasta") +cat("... Extracting sequence IDs\n") +names(SQS) <- tstrsplit(x = names(SQS), split = ";", keep = 1)[[1]] + +if(any(duplicated(names(SQS)))){ + cat("WARNING: duplicated OTU names detected!\n") +} + +cat("... Subsetting OTUs\n") +SQF <- SQS[ names(SQS) %in% unique(REW$OTU) ] + +cat("....Total number of OTUs in input sequences: ", length(SQS), "\n") +cat("....Number of OTUs to export: ", length(SQF), "\n") +cat("....Number of OTUs in the OTU table: ", nrow(REW), "\n") + +cat("... Writing FASTA file\n") +writeXStringSet(x = SQF, + filepath = "OTUs.fa.gz", + compress = TRUE, format = "fasta", width = 9999) + + +cat("\nAll done.\n") + + +##################### Session info + +## Check time +end_time <- Sys.time() + +tmm <- as.numeric(difftime(end_time, start_time, units = "min")) +cat("\nElapsed time: ", tmm, " minutes\n") + +cat("\n") +cat("Session info:\n") +sessionInfo() +cat("\n") diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/summarize_dereplicated_data.R b/src/pipecraft-core/service_scripts/NextITS/bin/summarize_dereplicated_data.R new file mode 100755 index 0000000..5b1bb28 --- /dev/null +++ b/src/pipecraft-core/service_scripts/NextITS/bin/summarize_dereplicated_data.R @@ -0,0 +1,390 @@ +#!/usr/bin/env Rscript + +## Script to pool dereplicated sequences (non-clustered or denoised), remove low-quality data, and summarize sequence abundance per sample + +# Input: +# 1. Sequence tables in long format with de novo chimeras removed (`Seqs.parquet`) +# 2. UC file from dereplication (`UC_Pooled.parquet`) +# 3. FASTA file with sequences (`Dereplicated.fa.gz`) +# 4. 
Max MEEP score
+
+# Outputs:
+# - OTU table in long format (`OTU_table_long.txt.gz` & `OTU_table_long.RData`)
+# - OTU table in wide format (`OTU_table_wide.txt.gz` & `OTU_table_wide.RData`)
+# - FASTA file with sequences (`OTUs.fa.gz`)
+# NB: the `*.RData` files are written with `saveRDS()`, so they must be loaded with `readRDS()`
+
+## Usage:
+# ./summarize_dereplicated_data.R \
+#   --seqtab "Seqs.parquet" \
+#   --uc "UC_Pooled.parquet" \
+#   --seqs "Dereplicated.fa.gz" \
+#   --maxmeep 0.6 \
+#   --recoversinglet TRUE \
+#   --mergesamples TRUE \
+#   --threads 4
+
+
+## Quality threshold:
+# MEEP score of 0.6 corresponds approximately to the average Phred score of 22.2
+
+## Singleton recovery:
+# If enabled, singleton sequences with MEEP score <= `--maxmeep` are preserved
+# (only low-quality singletons, with MEEP score > `--maxmeep`, are removed).
+# Otherwise, all singleton sequences are removed
+
+
+############################################## Parse input parameters
+
+## Check time
+start_time <- Sys.time()
+
+
+cat("Parsing input options and arguments:\n")
+
+## `library()` (unlike `require()`) fails immediately if optparse is not installed
+suppressPackageStartupMessages(library(optparse))
+
+## Parse arguments
+option_list <- list(
+  make_option("--seqtab", action="store", default=NA, type='character', help="Sequence tables in long format with de novo chimeras removed (Parquet format)"),
+  make_option("--uc", action="store", default=NA, type='character', help="UC file (Parquet format)"),
+  make_option("--seqs", action="store", default=NA, type='character', help="FASTA file with sequences"),
+  make_option("--maxmeep", action="store", default=0.5, type='double', help="Max MEEP score"),
+  make_option("--recoversinglet", action="store", default=TRUE, type='logical', help="Recover singletons"),
+  make_option(c("-m", "--mergesamples"), action="store", default=FALSE, type='logical', help="Merge sample replicates (default, false)"),
+  make_option(c("-t", "--threads"), action="store", default=4L, type='integer', help="Number of CPU threads for arrow, default 4")
+  # make_option(c("-s", "--scriptdir"), action="store", default=getwd(), type='character', help="Directory containing source scripts")
+)
+opt <- parse_args(OptionParser(option_list=option_list))
+
+## Function to convert text "NA"s to NA
+## (only scalar values are inspected; anything else is returned unchanged)
+to_na <- function(x){
+  if(length(x) == 1L && x %in% c("NA", "null", "Null")){ x <- NA }
+  return(x)
+}
+
+## Replaces "null"s from Nextflow with NA
+opt <- lapply(X = opt, FUN = to_na)
+
+
+## Validation of the required arguments
+## (the message is passed to `stop()`, so it is reported as the error itself)
+if(is.na(opt$seqtab)){
+  stop("Input file is not specified: sequence tables in Parquet format.", call. = FALSE)
+}
+if(is.na(opt$uc)){
+  stop("Input file is not specified: UC file is required.", call. = FALSE)
+}
+if(is.na(opt$seqs)){
+  stop("Input file is not specified: FASTA file with sequences.", call. = FALSE)
+}
+## `isTRUE()` keeps this check safe when the flag itself is NA
+if(isTRUE(as.logical(opt$recoversinglet)) && is.na(opt$maxmeep)){
+  stop("For singleton recovery, the max MEEP score must be specified.", call. = FALSE)
+}
+
+## Assign variables
+SEQTAB <- opt$seqtab
+UCF <- opt$uc
+MAXMEEP <- as.numeric( opt$maxmeep )
+RECOV_SINGLET <- as.logical(opt$recoversinglet)
+MERGE_SAMPLES <- as.logical(opt$mergesamples)
+SEQS <- opt$seqs
+
+CPUTHREADS <- as.numeric( opt$threads )
+# SCRIPTDIR <- opt$scriptdir
+
+## Flags and thread count are used in `if()` conditions and thread setters below,
+## so reject missing values here with a clear message
+if(length(RECOV_SINGLET) != 1L || is.na(RECOV_SINGLET)){
+  stop("`--recoversinglet` must be TRUE or FALSE.", call. = FALSE)
+}
+if(length(MERGE_SAMPLES) != 1L || is.na(MERGE_SAMPLES)){
+  stop("`--mergesamples` must be TRUE or FALSE.", call. = FALSE)
+}
+if(length(CPUTHREADS) != 1L || is.na(CPUTHREADS) || CPUTHREADS < 1){
+  stop("`--threads` must be a positive integer.", call. = FALSE)
+}
+
+## Log assigned variables
+cat(paste("Sequence tables (Parquet format): ", SEQTAB, "\n", sep=""))
+cat(paste("UC file (Parquet format): ", UCF, "\n", sep=""))
+cat(paste("Max MEEP score: ", MAXMEEP, "\n", sep=""))
+cat(paste("Low-quality singleton recovery: ", RECOV_SINGLET, "\n", sep=""))
+cat(paste("Merge sample replicates: ", MERGE_SAMPLES, "\n", sep=""))
+cat(paste("Sequences: ", SEQS, "\n", sep=""))
+cat(paste("Number of CPU threads to use: ", CPUTHREADS, "\n", sep=""))
+# cat(paste("Directory containing source scripts: ", SCRIPTDIR, "\n", sep=""))
+
+cat("\n")
+
+
+
+############################################## Data for debugging
+
+# SEQTAB <- "Seqs.parquet"
+# UCF <- "UC_Pooled.parquet"
+# MAXMEEP <- 0.5
+# RECOV_SINGLET <- TRUE
+# MERGE_SAMPLES <- TRUE
+# SEQS <- "Dereplicated.fa.gz"
+# CPUTHREADS <- 4
+
+
+############################################## Load packages and data
+
+cat("Loading R packages:\n")
+
+## Attach a package quietly and log its version
+load_pckg <- function(pkg = "data.table"){
+  suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) )
+  cat(paste(pkg, packageVersion(pkg), "\n"))
+}
+
+load_pckg("data.table")
+load_pckg("plyr")
+load_pckg("metagMisc")
+load_pckg("Biostrings")
+load_pckg("arrow")
+
+
+cat("\n")
+
+
+# cat("Loading additional R functions...\n")
+# source(file.path(SCRIPTDIR, "R_functions.R"))
+# cat("\n")
+
+
+## Set CPU thread number
+cat("Setting number of CPU threads to: ", CPUTHREADS, "\n")
+setDTthreads(threads = CPUTHREADS)  # for data.table
+set_cpu_count(CPUTHREADS)           # for arrow
+
+######################################
+###################################### Load the data
+######################################
+
+## Load sequence tables
+cat("\n..Loading sequence tables\n")
+TAB <- arrow::read_parquet(SEQTAB) |> setDT()
+cat("... Total number of records: ", nrow(TAB), "\n")
+cat("... Total number unique sequences: ", length(unique(TAB$Sequence)), "\n")
+cat("... Total number unique samples (files): ", length(unique(TAB$SampleID)), "\n")
+
+## Load UC file for globally dereplicated sequences
+cat("..Loading pooled UC file\n")
+UC <- open_dataset(UCF) |> dplyr::collect() |> setDT()
+setnames(UC, new = c("SeqID", "DerepID"))
+
+## Remove ambiguous mappings (should not happen, but just in case):
+## a sequence mapped to several dereplicated IDs would duplicate its records
+## in the merge below and inflate abundances
+if(any(duplicated(UC$SeqID))){
+  cat("WARNING: ambiguous mapping of sequences in UC file detected - excluding duplicates\n")
+  UC <- unique(UC, by = "SeqID")
+}
+
+## Add dereplicated IDs to seq table
+cat("... Adding dereplicated IDs to sequence table\n")
+cat(".... Number of records in sequence table before merging: ", nrow(TAB), "\n")
+TAB <- merge(x = TAB, y = UC, by = "SeqID", all.x = TRUE)
+cat(".... Number of records in sequence table after merging: ", nrow(TAB), "\n")
+
+## Remove NA IDs -- probably excluded seqs
+if(any(is.na(TAB$DerepID))){
+  cat("WARNING: not all sequences are present in the dereplicated data (could be due to length-filteing)\n")
+  cat("..Removing missing/excluded sequences\n")
+  cat(".. ", sum(is.na(TAB$DerepID)), " sequences with total abundance ",
+    sum(TAB[ is.na(DerepID) ]$Abundance, na.rm = TRUE), " reads will be excluded\n")
+  TAB <- TAB[ ! is.na(DerepID) ]
+}
+
+
+## Find singleton sequences (total abundance across all samples is below 2 reads)
+cat("\n..Finding singleton sequences\n")
+SINGLETONS <- TAB[ , .(Abundance = sum(Abundance, na.rm = TRUE)), by = .(DerepID) ][ Abundance < 2 ]
+cat("... Number of singleton sequences: ", nrow(SINGLETONS), "\n")
+
+## If singleton recovery is required,
+## only low-quality singletons (MEEP > MAXMEEP) stay in the removal list
+if(RECOV_SINGLET == TRUE && nrow(SINGLETONS) > 0){
+
+  ## Add quality scores
+  SINGLETONS <- merge(x = SINGLETONS, y = TAB[ , .(SeqID, MEEP)], by.x = "DerepID", by.y = "SeqID", all.x = TRUE)
+
+  ## Filter by MEEP score
+  SINGLETONS <- SINGLETONS[ MEEP > MAXMEEP ]
+  cat("... Number of singleton sequences after filtering by MEEP score: ", nrow(SINGLETONS), "\n")
+
+}
+
+if(nrow(SINGLETONS) > 0){
+  cat("..Removing singleton sequences\n")
+  TAB <- TAB[ ! DerepID %in% SINGLETONS$DerepID ]
+  cat("... Number of records in sequence table after removing singleton sequences: ", nrow(TAB), "\n")
+}
+
+
+## Summarize abundance by sample and dereplicated ID
+cat("\n..Summarizing sequence abundance\n")
+if(MERGE_SAMPLES == TRUE){
+
+  cat("\n... Merging sample replicates (e.g., re-sequenced samples)\n")
+
+  ## Extract sample names (second `__`-delimited field of SampleID)
+  cat(".... Extracting sample names\n")
+  TAB[ , SampleName := tstrsplit(x = SampleID, split = "__", keep = 2) ]
+
+  cat(".... Summarizing abundance by sample and OTU\n")
+  RES <- TAB[ ,
+    .( Abundance = sum(Abundance, na.rm = TRUE) ),
+    by = c("DerepID", "SampleName") ]
+
+  setnames(x = RES, old = "SampleName", new = "SampleID")
+
+} else {
+
+  cat("... Summarizing abundance by sample and dereplicated ID\n")
+  RES <- TAB[ ,
+    .( Abundance = sum(Abundance, na.rm = TRUE) ),
+    by = c("DerepID", "SampleID") ]
+
+}
+
+#### Reshape to wide table
+cat("\nReshaping table into wide format\n")
+
+## Check if we can reshape the table in a single pass
+n_seq <- length(unique(RES$DerepID))
+n_smp <- length(unique(RES$SampleID))
+n_cll <- as.numeric(n_seq) * as.numeric(n_smp)
+cat("...In total, there are ", n_seq, " sequences and ", n_smp, " samples\n")
+cat("...The total number of cells in the wide table will be ", n_cll, "\n")
+
+## Reshape data in one pass
+if(n_cll < 50000000){
+  REW <- dcast(data = RES,
+    formula = DerepID ~ SampleID,
+    fun.aggregate = sum, fill = 0, value.var = "Abundance")
+} else {
+## Split data into chunks, reshape, and merge back
+
+  cat("..The input table is too large to reshape in a single pass, reshaping by chunks\n")
+
+  ## Function to split vector into N chunks
+  chunk <- function(x, n){
+    if(n > 1) { res <- split(x, cut(seq_along(x), n, labels = FALSE)) }
+    if(n == 1){ res <- list(); res[[1]] <- x }
+    return(res)
+  }
+
+  ## Choose the number of chunks
+  n_chunks <- data.table::fcase(
+    n_cll <  9e7, 2L,
+    n_cll >= 9e7  & n_cll < 5e8,  5L,
+    n_cll >= 5e8  & n_cll < 5e9,  6L,
+    n_cll >= 5e9  & n_cll < 5e10, 7L,
+    n_cll >= 5e10, 8L)
+
+  cat("...The number of chunks to process: ", n_chunks, "\n")
+
+  ## Samples are split into chunks; each chunk is reshaped separately
+  ch <- chunk(x = sort(unique(RES$SampleID)), n = n_chunks)
+
+  ## Chunk-and-reshape loop
+  REWL <- plyr::llply(
+    .data = ch,
+    .fun = function(x){
+
+      ## Reshape to wide
+      res <- dcast(
+        data = RES[ SampleID %in% x , ],
+        formula = DerepID ~ SampleID,
+        fill = 0, fun.aggregate = sum, value.var = "Abundance")
+
+      ## Create key on a data.table (should improve merging speed)
+      setkey(res, DerepID)
+
+      return(res)
+    },
+    .progress = "text")
+
+  cat("...Chunk reshaping finished\n")
+  cat("..Merging data into a single wide table\n")
+
+  ## Merge chunks into a single wide table
+  merge_dt <- function(x,y){ data.table::merge.data.table(x, y, by = "DerepID", all = TRUE) }
+  REW <- Reduce(f = merge_dt, x = REWL)
+  cat("...Merging finished\n")
+
+  ## Clean up
+  cat("...Cleaning up\n")
+  rm(REWL); gc()
+
+  ## Replace NAs with zeros
+  ## (sequences absent from a chunk get NAs after the full outer merge;
+  ##  the first column holds the IDs and is skipped)
+  cat("...Filling missing values with zeros\n")
+  for (j in seq_len(ncol(REW))[-1]){
+    set(REW, which(is.na(REW[[j]])), j, 0)
+  }
+
+} ## end of reshaping
+
+cat("...Reshaping to the wide format done!\n")
+
+
+## Reorder OTU rows
+cat("\n..Reordering OTU rows by total abundance\n")
+otu_tots <- rowSums(REW[, -1], na.rm = TRUE)
+REW <- REW[ order(otu_tots, decreasing = TRUE), ]
+
+## Add attributes if samples were merged
+setattr(x = RES, name = "Samples_merged", value = MERGE_SAMPLES)
+setattr(x = REW, name = "Samples_merged", value = MERGE_SAMPLES)
+
+
+cat("\nExporting results\n")
+
+## Export data
+## Serialize an object with `saveRDS()` and compress the stream with multi-threaded pigz
+##   object  = R object to save
+##   file    = output file name
+##   threads = number of threads for pigz
+saveRDS.gz <- function(object, file, threads = parallel::detectCores()) {
+
+  ## Fall back to the built-in (single-threaded) gzip if pigz is not available
+  if(!nzchar(Sys.which("pigz"))){
+    cat("WARNING: pigz not found, using single-threaded gzip compression\n")
+    saveRDS(object, file = file, compress = "gzip")
+    return(invisible(NULL))
+  }
+
+  ## File name is quoted for the shell; the pipe is closed even if saveRDS() fails
+  con <- pipe(paste0("pigz -p", threads, " > ", shQuote(file)), "wb")
+  on.exit(close(con), add = TRUE)
+  saveRDS(object, file = con)
+  invisible(NULL)
+}
+
+cat("..Exporting long table [R]\n")
+saveRDS.gz(object = RES,
+  file = "OTU_table_long.RData",
+  threads = CPUTHREADS)
+
+cat("..Exporting wide table [R]\n")
+saveRDS.gz(object = REW,
+  file = "OTU_table_wide.RData",
+  threads = CPUTHREADS)
+
+cat("..Exporting long table [tab-delimited]\n")
+fwrite(x = RES, file = "OTU_table_long.txt.gz", sep = "\t", compress = "gzip")
+
+cat("..Exporting wide table [tab-delimited]\n")
+fwrite(x = REW, file = "OTU_table_wide.txt.gz", sep = "\t", compress = "gzip")
+
+
+cat("\nExporting sequences to FASTA\n")
+
+cat("..Preparing sequences\n")
+
+## Take sequences from the FASTA file
+cat("... Loading FASTA file\n")
+SQS <- readDNAStringSet(filepath = SEQS, format="fasta")
+cat("... Extracting sequence IDs\n")
+## Keep only the part of the header before the first `;`
+names(SQS) <- tstrsplit(x = names(SQS), split = ";", keep = 1)[[1]]
+
+if(any(duplicated(names(SQS)))){
+  cat("WARNING: duplicated OTU names detected!\n")
+}
+
+cat("... Subsetting OTUs\n")
+SQF <- SQS[ names(SQS) %in% unique(REW$DerepID) ]
+
+cat("....Total number of sequences in input FASTA: ", length(SQS), "\n")
+cat("....Number of sequences to export: ", length(SQF), "\n")
+cat("....Number of sequences in the abundance table: ", nrow(REW), "\n")
+
+## Every ID of the abundance table is expected to have a sequence
+n_missing <- sum(! REW$DerepID %in% names(SQF))
+if(n_missing > 0){
+  cat("WARNING: ", n_missing, " sequences from the abundance table are missing in the FASTA file\n")
+}
+
+cat("... Writing FASTA file\n")
+writeXStringSet(x = SQF,
+  filepath = "OTUs.fa.gz",
+  compress = TRUE, format = "fasta", width = 9999)
+
+
+cat("\nAll done.\n")
+
+
+##################### Session info
+
+## Check time
+end_time <- Sys.time()
+
+tmm <- as.numeric(difftime(end_time, start_time, units = "mins"))
+cat("\nElapsed time: ", tmm, " minutes\n")
+
+cat("\n")
+cat("Session info:\n")
+sessionInfo()
+cat("\n")
diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/tag_jump_removal.R b/src/pipecraft-core/service_scripts/NextITS/bin/tag_jump_removal.R
index 635d5f9..c6fb3e5 100755
--- a/src/pipecraft-core/service_scripts/NextITS/bin/tag_jump_removal.R
+++ b/src/pipecraft-core/service_scripts/NextITS/bin/tag_jump_removal.R
@@ -1,138 +1,138 @@
-#!/usr/bin/env Rscript
-
-## Script to perform tag-jump removal
-
-# Input is given as positional arguments:
-# 1. OTU table (`OTU_tab_not_filtered.txt.gz`)
-# 2. f-parameter of UNCROSS (e.g., 0.01)
-# 3. 
p-parameter (e.g., 1.0)
-
-# Outputs:
-# - Tag-jumpfiltered OTU table (`OTU_tab_TagJumpFiltered.txt.gz`)
-# - Table with tag-jumps (`TagJump_OTUs.RData`)
-# - Plot (`TagJump_plot.pdf`)
-
-args <- commandArgs(trailingOnly = TRUE)
-
-suppressMessages(library(data.table))
-suppressMessages(library(ggplot2))
-# library(openxlsx)
-
-theme_set(theme_classic(base_size = 14))
-
-## Load OTU table
-cat("..Loading OTU table\n")
-OTUTABW <- fread(
- file = args[1],
- sep = "\t", header = TRUE)
-
-colnames(OTUTABW)[1] <- "OTU"
-
-cat("...Number of OTUs: ", nrow(OTUTABW), "\n")
-cat("...Number of samples: ", ncol(OTUTABW) - 1, "\n")
-
-## Convert to long format
-cat("..Converting OTU table to long format\n")
-OTUTAB <- melt(data = OTUTABW, id.vars = "OTU",
- variable.name = "SampleID", value.name = "Abundance")
-
-## Remove zero-OTUs
-OTUTAB <- OTUTAB[ Abundance > 0 ]
-cat("...Number of non-zero records: ", nrow(OTUTAB), "\n")
-
-
-## Estimate total abundance of sequence per plate
-cat("..Estimating total OTU abundance\n")
-OTUTAB[ , Total := sum(Abundance, na.rm = TRUE), by = "OTU" ]
-
-## UNCROSS score (with original parameter - take a root from the exp in denominator, to make curves more steep)
-uncross_score <- function(x, N, n, f = 0.01, tmin = 0.1, p = 1){
- # x = OTU abundance in a sample
- # N = total OTU abundance
- # n = number of samples
- # f = expected cross-talk rate, e.g. 0.01
- # tmin = min score to be considered as cross-talk
- # p = power to rise the exponent (default, 1; use 1/2 or 1/3 to make cureves more stepp)
-
- z <- f * N / n # Expected treshold
- sc <- 2 / (1 + exp(x/z)^p) # t-score
- res <- data.table(Score = sc, TagJump = sc >= tmin)
- return(res)
-}
-
-## Esimate UNCROSS score
-cat("..Estimating UNCROSS score\n")
-OTUTAB <- cbind(
- OTUTAB,
- uncross_score(
- x = OTUTAB$Abundance,
- N = OTUTAB$Total,
- n = length(unique(OTUTAB$SampleID)),
- f = as.numeric(args[2]),
- p = as.numeric(args[3])
- )
- )
-
-## Truncate singletons with total OTU abundance > 99 reads
-# OTUTAB[ Abundance == 1 & Total > 99 , TagJump := TRUE ]
-# OTUTAB[ Abundance == 2 & Total > 999 , TagJump := TRUE ]
-
-cat("...Number of tag-jumps: ", sum(OTUTAB$TagJump, na.rm = TRUE), "\n")
-
-
-## Plot
-cat("..Making a plot\n")
-PP <- ggplot(data = OTUTAB, aes(x = Total, y = Abundance, color = TagJump)) +
- geom_point() + scale_x_log10() + scale_y_log10() +
- scale_color_manual(values = c("#0C7C59", "#D64933")) +
- labs(x = "Total abundance of OTU, reads", y = "Abundance of OTU in a sample, reads")
-
-cat("..Exporting a plot\n")
-pdf(file = "TagJump_plot.pdf", width = 12, height = 9.5, useDingbats = FALSE)
- PP
-dev.off()
-
-
-## TJ stats
-cat("..Calculating tag-jump summary\n")
-TJ <- data.table(
- Total_reads = sum(OTUTAB$Abundance),
- Number_of_TagJump_Events = sum(OTUTAB$TagJump),
- TagJump_reads = sum(OTUTAB[ TagJump == TRUE ]$Abundance, na.rm = T)
- )
-
-TJ$ReadPercent_removed <- with(TJ, (TagJump_reads / Total_reads * 100))
-
-fwrite(x = TJ, file = "TagJump_stats.txt", sep = "\t")
-
-
-## Exporting tag-jump data
-cat("..Exporting tag-jump data\n")
-JMPS <- OTUTAB[ TagJump == TRUE, .(OTU, SampleID) ]
-
-saveRDS(object = JMPS,
- file = "TagJump_OTUs.RData",
- compress = "xz")
-
-
-## Prepare OTU tables, remove tag-jumps
-cat("..Removing tag-jumps\n")
-
-OTUTAB <- OTUTAB[ TagJump == FALSE ]
-
-## Convert to wide format
-RES <- dcast(data = OTUTAB,
- formula = OTU ~ SampleID,
- value.var = "Abundance", fill = 0)
-
-## Sort rows (by total abundance)
-clz <- colnames(RES)[-1]
-otu_sums <- rowSums(RES[, ..clz], na.rm = TRUE)
-RES <- RES[ order(otu_sums, decreasing = TRUE) ]
-
-
-cat("..Exporting tag-jump filtered table\n")
-
-fwrite(x = RES,
- file = "OTU_tab_TagJumpFiltered.txt.gz",
- sep = "\t", compress = "gzip")
+#!/usr/bin/env Rscript
+
+## Script to perform tag-jump removal
+
+# Input is given as positional arguments:
+# 1. OTU table (`OTU_tab_not_filtered.txt.gz`)
+# 2. f-parameter of UNCROSS (e.g., 0.01)
+# 3. p-parameter (e.g., 1.0)
+
+# Outputs:
+# - Tag-jump-filtered OTU table (`OTU_tab_TagJumpFiltered.txt.gz`)
+# - Table with tag-jumps (`TagJump_OTUs.RData`)
+# - Tag-jump summary statistics (`TagJump_stats.txt`)
+# - Plot (`TagJump_plot.pdf`)
+
+args <- commandArgs(trailingOnly = TRUE)
+
+## All three positional arguments are required;
+## otherwise NA parameters would silently propagate into the scores
+if(length(args) < 3){
+  stop("Usage: tag_jump_removal.R <OTU table> <f-parameter> <p-parameter>", call. = FALSE)
+}
+UNCROSS_F <- as.numeric(args[2])
+UNCROSS_P <- as.numeric(args[3])
+if(is.na(UNCROSS_F) || is.na(UNCROSS_P)){
+  stop("f- and p-parameters of UNCROSS must be numeric", call. = FALSE)
+}
+
+suppressMessages(library(data.table))
+suppressMessages(library(ggplot2))
+# library(openxlsx)
+
+theme_set(theme_classic(base_size = 14))
+
+## Load OTU table
+cat("..Loading OTU table\n")
+OTUTABW <- fread(
+  file = args[1],
+  sep = "\t", header = TRUE)
+
+## The first column holds OTU IDs, the remaining columns are samples
+colnames(OTUTABW)[1] <- "OTU"
+
+cat("...Number of OTUs: ", nrow(OTUTABW), "\n")
+cat("...Number of samples: ", ncol(OTUTABW) - 1, "\n")
+
+## Convert to long format
+cat("..Converting OTU table to long format\n")
+OTUTAB <- melt(data = OTUTABW, id.vars = "OTU",
+  variable.name = "SampleID", value.name = "Abundance")
+
+## Remove zero-OTUs
+OTUTAB <- OTUTAB[ Abundance > 0 ]
+cat("...Number of non-zero records: ", nrow(OTUTAB), "\n")
+
+
+## Estimate total abundance of each OTU (across all samples in the table)
+cat("..Estimating total OTU abundance\n")
+OTUTAB[ , Total := sum(Abundance, na.rm = TRUE), by = "OTU" ]
+
+## UNCROSS score (with original parameter - take a root from the exp in denominator, to make curves more steep)
+## Returns a data.table with columns `Score` and `TagJump` (TRUE if Score >= tmin)
+uncross_score <- function(x, N, n, f = 0.01, tmin = 0.1, p = 1){
+  # x    = OTU abundance in a sample
+  # N    = total OTU abundance
+  # n    = number of samples
+  # f    = expected cross-talk rate, e.g. 0.01
+  # tmin = min score to be considered as cross-talk
+  # p    = power to raise the exponent (default, 1; use 1/2 or 1/3 to make curves more steep)
+
+  z <- f * N / n               # Expected threshold
+  sc <- 2 / (1 + exp(x/z)^p)   # t-score
+  res <- data.table(Score = sc, TagJump = sc >= tmin)
+  return(res)
+}
+
+## Estimate UNCROSS score
+cat("..Estimating UNCROSS score\n")
+OTUTAB <- cbind(
+  OTUTAB,
+  uncross_score(
+    x = OTUTAB$Abundance,
+    N = OTUTAB$Total,
+    n = length(unique(OTUTAB$SampleID)),
+    f = UNCROSS_F,
+    p = UNCROSS_P
+    )
+  )
+
+## Truncate singletons with total OTU abundance > 99 reads
+# OTUTAB[ Abundance == 1 & Total > 99 , TagJump := TRUE ]
+# OTUTAB[ Abundance == 2 & Total > 999 , TagJump := TRUE ]
+
+cat("...Number of tag-jumps: ", sum(OTUTAB$TagJump, na.rm = TRUE), "\n")
+
+
+## Plot
+## (colors are named, so they stay attached to the same class
+##  even if only tag-jumps or only non-tag-jumps are present)
+cat("..Making a plot\n")
+PP <- ggplot(data = OTUTAB, aes(x = Total, y = Abundance, color = TagJump)) +
+  geom_point() + scale_x_log10() + scale_y_log10() +
+  scale_color_manual(values = c("FALSE" = "#0C7C59", "TRUE" = "#D64933")) +
+  labs(x = "Total abundance of OTU, reads", y = "Abundance of OTU in a sample, reads")
+
+cat("..Exporting a plot\n")
+pdf(file = "TagJump_plot.pdf", width = 12, height = 9.5, useDingbats = FALSE)
+  print(PP)   # explicit print, does not rely on top-level auto-printing
+dev.off()
+
+
+## TJ stats
+cat("..Calculating tag-jump summary\n")
+TJ <- data.table(
+  Total_reads = sum(OTUTAB$Abundance, na.rm = TRUE),
+  Number_of_TagJump_Events = sum(OTUTAB$TagJump, na.rm = TRUE),
+  TagJump_reads = sum(OTUTAB[ TagJump == TRUE ]$Abundance, na.rm = TRUE)
+  )
+
+TJ$ReadPercent_removed <- with(TJ, (TagJump_reads / Total_reads * 100))
+
+fwrite(x = TJ, file = "TagJump_stats.txt", sep = "\t")
+
+
+## Exporting tag-jump data
+cat("..Exporting tag-jump data\n")
+JMPS <- OTUTAB[ TagJump == TRUE, .(OTU, SampleID) ]
+
+saveRDS(object = JMPS,
+  file = "TagJump_OTUs.RData",
+  compress = "xz")
+
+
+## Prepare OTU tables, remove tag-jumps
+cat("..Removing tag-jumps\n")
+
+OTUTAB <- OTUTAB[ TagJump == FALSE ]
+
+## Convert to wide format
+RES <- dcast(data = OTUTAB,
+  formula = OTU ~ SampleID,
+  value.var = "Abundance", fill = 0)
+
+## Sort rows (by total abundance)
+clz <- colnames(RES)[-1]
+otu_sums <- rowSums(RES[, ..clz], na.rm = TRUE)
+RES <- RES[ order(otu_sums, decreasing = TRUE) ]
+
+
+cat("..Exporting tag-jump filtered table\n")
+
+fwrite(x = RES,
+  file = "OTU_tab_TagJumpFiltered.txt.gz",
+  sep = "\t", compress = "gzip")
diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/tag_jump_removal_longtab.R b/src/pipecraft-core/service_scripts/NextITS/bin/tag_jump_removal_longtab.R
old mode 100644
new mode 100755
index c498030..c7bd52e
--- a/src/pipecraft-core/service_scripts/NextITS/bin/tag_jump_removal_longtab.R
+++ b/src/pipecraft-core/service_scripts/NextITS/bin/tag_jump_removal_longtab.R
@@ -1,247 +1,247 @@
-#!/usr/bin/env Rscript
-
-## Script to perform tag-jump removal
-
-# Input arguments:
-# 1. Sequence table in long format, no header (`Seq_tab_not_filtered.txt.gz`),
-# with columns: `SeqID`, `Abundance`, `SampleID`
-# 2. Dereplicated or pre-clustered membership table (`TJPreclust.uc.parquet`)
-# 2. f-parameter of UNCROSS (e.g., 0.01)
-# 3. 
p-parameter (e.g., 1.0) - -# Outputs: -# - Tag-jump-filtered sequence table (`Seq_tab_TagJumpFiltered.txt.gz`) -# - Table with tag-jump scores (`TagJump_scores.qs`) -# - Plot (`TagJump_plot.pdf`) - -cat("\nParsing input options and arguments...\n") - -suppressPackageStartupMessages(require(optparse)) - -## Parse arguments -option_list <- list( - make_option(c("-s", "--seqtab"), action="store", default="seqtab.txt.gz", type='character', help="Sequence table in long format"), - make_option(c("-c", "--precls"), action="store", default="precls.txt.gz", type='character', help="Table with pre-clustered sequence membership"), - make_option(c("-f", "--uncross_f"), action="store", default=0.01, type='numeric', help="f-parameter of UNCROSS"), - make_option(c("-p", "--uncross_p"), action="store", default=1, type='numeric', help="Additional p-parameter for UNCROSS") -) -opt <- parse_args(OptionParser(option_list=option_list)) - -## Function to convert text "NA"s to NA -to_na <- function(x){ - if(x %in% c("NA", "null", "Null")){ x <- NA } - return(x) -} - -## Replaces "null"s from Nextflow with NA -opt <- lapply(X = opt, FUN = to_na) - -## Validation of the required arguments -if(is.na(opt$seqtab)){ - stop("Input file with sequence table is not specified\n") -} -if(is.na(opt$precls)){ - stop("Input file with pre-clustered membership table is not specified\n") -} - -## Set default params if not specified -if(is.na(opt$uncross_f) | is.null(opt$uncross_f) | is.nan(opt$uncross_f) | !is.numeric(opt$uncross_f)){ - cat("f-parameter is not specified, using default value: 0.01\n") - opt$uncross_f <- 0.01 -} -if(is.na(opt$uncross_p) | is.null(opt$uncross_p) | is.nan(opt$uncross_p) | !is.numeric(opt$uncross_p)){ - cat("p-parameter is not specified, using default value: 1\n") - opt$uncross_p <- 1 -} - -## Assign variables -SEQTAB <- opt$seqtab -PRECLS <- opt$precls -F <- opt$uncross_f -P <- opt$uncross_p - -## Log assigned variables -cat("\nParameters specified:\n") -cat(paste("Sequence 
table: " , SEQTAB, "\n", sep = "")) -cat(paste("Pre-clustered membership table: ", PRECLS, "\n", sep = "")) -cat(paste("f-parameter of UNCROSS: ", F, "\n", sep = "")) -cat(paste("p-parameter of UNCROSS: ", P, "\n", sep = "")) - -cat("\n") - - -############################################## Data for debugging - -# SEQTAB <- "Seq_tab_not_filtered.txt.gz" -# PRECLS <- "TJPreclust.uc.parquet" -# F <- 0.01 -# P <- 1 - - -############################################## Load packages - -cat("Loading R packages...\n") - -load_pckg <- function(pkg = "data.table"){ - suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) - cat(paste(pkg, packageVersion(pkg), "\n")) -} - -load_pckg("data.table") -load_pckg("arrow") -load_pckg("ggplot2") -# load_pckg("qs") - -theme_set(theme_classic(base_size = 14)) - -cat("\n") - -############################################## Workflow - -## Load sequence table -cat("..Loading sequence table\n") -SEQTAB <- fread(file = SEQTAB, - sep = "\t", header = TRUE) # "SeqID", "SampleID", "Abundance" - -## Load sequence membership table -cat("..Loading sequence membership table\n") -PRECLS <- read_parquet(file = PRECLS) -setDT(PRECLS) -setnames(PRECLS, new = c("SeqID", "OTU")) - -## Remove ambiguous mappings (should not happen, but just in case) -if(any(duplicated(PRECLS$SeqID))){ - cat("WARNING: ambiguous mapping of sequences in membership table detected - excluding duplicates\n") - PRECLS <- unique(PRECLS, by = "SeqID") -} - -## Add cluster membership to the sequence table -SEQTAB <- merge(x = SEQTAB, y = PRECLS, by = "SeqID", all.x = TRUE) - -if(any(is.na(SEQTAB$OTU))){ - cat("WARNING: Sequences without cluster membership detected\n") - cat("WARNING: Excluding these records from the analysis\n") - SEQTAB <- SEQTAB[ !is.na(OTU) ] -} - -cat("...Number of unique sequences: ", length(unique(SEQTAB$SeqID)), "\n") -cat("...Number of clusters: ", length(unique(SEQTAB$OTU)), "\n") -cat("...Number of samples: ", 
length(unique(SEQTAB$SampleID)), "\n") - -## Summarize by sequence clusters -cat("..Summarizing by sequence clusters\n") -OTUTAB <- SEQTAB[ , .(Abundance = sum(Abundance, na.rm = TRUE)), by = c("OTU", "SampleID") ] - -## Estimate total abundance of sequence per plate -cat("..Estimating total OTU abundance\n") -OTUTAB[ , Total := sum(Abundance, na.rm = TRUE), by = "OTU" ] - -## UNCROSS score (with original parameter - take a root from the exp in denominator, to make curves more steep) -uncross_score <- function(x, N, n, f = 0.01, tmin = 0.1, p = 1){ - # x = OTU abundance in a sample - # N = total OTU abundance - # n = number of samples - # f = expected cross-talk rate, e.g. 0.01 - # tmin = min score to be considered as cross-talk - # p = power to rise the exponent (default, 1; use 1/2 or 1/3 to make cureves more stepp) - - z <- f * N / n # Expected treshold - sc <- 2 / (1 + exp(x/z)^p) # t-score - res <- data.table(Score = sc, TagJump = sc >= tmin) - return(res) -} - -## Esimate UNCROSS score -cat("..Estimating UNCROSS score\n") -OTUTAB <- cbind( - OTUTAB, - uncross_score( - x = OTUTAB$Abundance, - N = OTUTAB$Total, - n = length(unique(OTUTAB$SampleID)), - f = as.numeric(F), - p = as.numeric(P) - ) - ) - -## Truncate singletons with total OTU abundance > 99 reads -# OTUTAB[ Abundance == 1 & Total > 99 , TagJump := TRUE ] -# OTUTAB[ Abundance == 2 & Total > 999 , TagJump := TRUE ] - -cat("...Number of tag-jumps: ", sum(OTUTAB$TagJump, na.rm = TRUE), "\n") - -## Export tag-jump scores -setcolorder(OTUTAB, - c("OTU", "SampleID", "TagJump", "Score", "Abundance", "Total")) - -setorder(OTUTAB, OTU, -Abundance, SampleID) - -cat("..Exporting tag-jump scores\n") -qs::qsave(OTUTAB, - "TagJump_scores.qs", - preset = "custom", algorithm = "zstd", compress_level = 5L, nthreads = 1L) - - -## Plot -cat("..Making a plot\n") -PP <- ggplot(data = OTUTAB, aes(x = Total, y = Abundance, color = TagJump)) + - geom_point() + scale_x_log10() + scale_y_log10() + - scale_color_manual(values 
= c("#0C7C59", "#D64933")) + - labs(x = "Total abundance of OTU, reads", y = "Abundance of OTU in a sample, reads") - -cat("..Exporting a plot\n") -pdf(file = "TagJump_plot.pdf", width = 12, height = 9.5, useDingbats = FALSE) - PP -dev.off() - - - -## Exporting tag-jump data -# cat("..Exporting tag-jump data\n") -# JMPS <- OTUTAB[ TagJump == TRUE, .(OTU, SampleID) ] -# -# saveRDS(object = JMPS, -# file = "TagJump_OTUs.RData", -# compress = "xz") - - -## Prepare filtered sequence table, remove tag-jumps -cat("..Removing tag-jumps\n") - -## Add tag-jump info to the sequence table -n1 <- nrow(SEQTAB) - -RES <- merge( - x = SEQTAB, - y = OTUTAB[ , .(OTU, SampleID, TagJump) ], - by = c("OTU", "SampleID"), all.x = TRUE) - -n2 <- nrow(RES) -if(n1 != n2){ - cat("WARNING: merging went wrong likely\n") - cat("WARNING: There might be duplicated sequences\n") -} - - -## TJ stats -cat("..Calculating tag-jump summary\n") -TJ <- data.table( - Total_reads = sum(RES$Abundance), - Number_of_TagJump_Events = sum(RES$TagJump), - TagJump_reads = sum(RES[ TagJump == TRUE ]$Abundance, na.rm = T) - ) - -TJ$ReadPercent_removed <- with(TJ, (TagJump_reads / Total_reads * 100)) - -fwrite(x = TJ, file = "TagJump_stats.txt", sep = "\t") - - -## Keep only non-tag-jump reads -RES <- RES[ TagJump == FALSE , .(SampleID, SeqID, Abundance) ] -setorder(RES, SampleID, -Abundance) - -cat("..Exporting tag-jump filtered table\n") - -fwrite(x = RES, - file = "Seq_tab_TagJumpFiltered.txt.gz", - sep = "\t", compress = "gzip") +#!/usr/bin/env Rscript + +## Script to perform tag-jump removal + +# Input arguments: +# 1. Sequence table in long format, no header (`Seq_tab_not_filtered.txt.gz`), +# with columns: `SeqID`, `Abundance`, `SampleID` +# 2. Dereplicated or pre-clustered membership table (`TJPreclust.uc.parquet`) +# 2. f-parameter of UNCROSS (e.g., 0.01) +# 3. 
p-parameter (e.g., 1.0) + +# Outputs: +# - Tag-jump-filtered sequence table (`Seq_tab_TagJumpFiltered.txt.gz`) +# - Table with tag-jump scores (`TagJump_scores.qs`) +# - Plot (`TagJump_plot.pdf`) + +cat("\nParsing input options and arguments...\n") + +suppressPackageStartupMessages(require(optparse)) + +## Parse arguments +option_list <- list( + make_option(c("-s", "--seqtab"), action="store", default="seqtab.txt.gz", type='character', help="Sequence table in long format"), + make_option(c("-c", "--precls"), action="store", default="precls.txt.gz", type='character', help="Table with pre-clustered sequence membership"), + make_option(c("-f", "--uncross_f"), action="store", default=0.01, type='numeric', help="f-parameter of UNCROSS"), + make_option(c("-p", "--uncross_p"), action="store", default=1, type='numeric', help="Additional p-parameter for UNCROSS") +) +opt <- parse_args(OptionParser(option_list=option_list)) + +## Function to convert text "NA"s to NA +to_na <- function(x){ + if(x %in% c("NA", "null", "Null")){ x <- NA } + return(x) +} + +## Replaces "null"s from Nextflow with NA +opt <- lapply(X = opt, FUN = to_na) + +## Validation of the required arguments +if(is.na(opt$seqtab)){ + stop("Input file with sequence table is not specified\n") +} +if(is.na(opt$precls)){ + stop("Input file with pre-clustered membership table is not specified\n") +} + +## Set default params if not specified +if(is.na(opt$uncross_f) | is.null(opt$uncross_f) | is.nan(opt$uncross_f) | !is.numeric(opt$uncross_f)){ + cat("f-parameter is not specified, using default value: 0.01\n") + opt$uncross_f <- 0.01 +} +if(is.na(opt$uncross_p) | is.null(opt$uncross_p) | is.nan(opt$uncross_p) | !is.numeric(opt$uncross_p)){ + cat("p-parameter is not specified, using default value: 1\n") + opt$uncross_p <- 1 +} + +## Assign variables +SEQTAB <- opt$seqtab +PRECLS <- opt$precls +F <- opt$uncross_f +P <- opt$uncross_p + +## Log assigned variables +cat("\nParameters specified:\n") +cat(paste("Sequence 
table: " , SEQTAB, "\n", sep = "")) +cat(paste("Pre-clustered membership table: ", PRECLS, "\n", sep = "")) +cat(paste("f-parameter of UNCROSS: ", F, "\n", sep = "")) +cat(paste("p-parameter of UNCROSS: ", P, "\n", sep = "")) + +cat("\n") + + +############################################## Data for debugging + +# SEQTAB <- "Seq_tab_not_filtered.txt.gz" +# PRECLS <- "TJPreclust.uc.parquet" +# F <- 0.01 +# P <- 1 + + +############################################## Load packages + +cat("Loading R packages...\n") + +load_pckg <- function(pkg = "data.table"){ + suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) + cat(paste(pkg, packageVersion(pkg), "\n")) +} + +load_pckg("data.table") +load_pckg("arrow") +load_pckg("ggplot2") +# load_pckg("qs") + +theme_set(theme_classic(base_size = 14)) + +cat("\n") + +############################################## Workflow + +## Load sequence table +cat("..Loading sequence table\n") +SEQTAB <- fread(file = SEQTAB, + sep = "\t", header = TRUE) # "SeqID", "SampleID", "Abundance" + +## Load sequence membership table +cat("..Loading sequence membership table\n") +PRECLS <- read_parquet(file = PRECLS) +setDT(PRECLS) +setnames(PRECLS, new = c("SeqID", "OTU")) + +## Remove ambiguous mappings (should not happen, but just in case) +if(any(duplicated(PRECLS$SeqID))){ + cat("WARNING: ambiguous mapping of sequences in membership table detected - excluding duplicates\n") + PRECLS <- unique(PRECLS, by = "SeqID") +} + +## Add cluster membership to the sequence table +SEQTAB <- merge(x = SEQTAB, y = PRECLS, by = "SeqID", all.x = TRUE) + +if(any(is.na(SEQTAB$OTU))){ + cat("WARNING: Sequences without cluster membership detected\n") + cat("WARNING: Excluding these records from the analysis\n") + SEQTAB <- SEQTAB[ !is.na(OTU) ] +} + +cat("...Number of unique sequences: ", length(unique(SEQTAB$SeqID)), "\n") +cat("...Number of clusters: ", length(unique(SEQTAB$OTU)), "\n") +cat("...Number of samples: ", 
length(unique(SEQTAB$SampleID)), "\n") + +## Summarize by sequence clusters +cat("..Summarizing by sequence clusters\n") +OTUTAB <- SEQTAB[ , .(Abundance = sum(Abundance, na.rm = TRUE)), by = c("OTU", "SampleID") ] + +## Estimate total abundance of sequence per plate +cat("..Estimating total OTU abundance\n") +OTUTAB[ , Total := sum(Abundance, na.rm = TRUE), by = "OTU" ] + +## UNCROSS score (with original parameter - take a root from the exp in denominator, to make curves more steep) +uncross_score <- function(x, N, n, f = 0.01, tmin = 0.1, p = 1){ + # x = OTU abundance in a sample + # N = total OTU abundance + # n = number of samples + # f = expected cross-talk rate, e.g. 0.01 + # tmin = min score to be considered as cross-talk + # p = power to rise the exponent (default, 1; use 1/2 or 1/3 to make cureves more stepp) + + z <- f * N / n # Expected treshold + sc <- 2 / (1 + exp(x/z)^p) # t-score + res <- data.table(Score = sc, TagJump = sc >= tmin) + return(res) +} + +## Esimate UNCROSS score +cat("..Estimating UNCROSS score\n") +OTUTAB <- cbind( + OTUTAB, + uncross_score( + x = OTUTAB$Abundance, + N = OTUTAB$Total, + n = length(unique(OTUTAB$SampleID)), + f = as.numeric(F), + p = as.numeric(P) + ) + ) + +## Truncate singletons with total OTU abundance > 99 reads +# OTUTAB[ Abundance == 1 & Total > 99 , TagJump := TRUE ] +# OTUTAB[ Abundance == 2 & Total > 999 , TagJump := TRUE ] + +cat("...Number of tag-jumps: ", sum(OTUTAB$TagJump, na.rm = TRUE), "\n") + +## Export tag-jump scores +setcolorder(OTUTAB, + c("OTU", "SampleID", "TagJump", "Score", "Abundance", "Total")) + +setorder(OTUTAB, OTU, -Abundance, SampleID) + +cat("..Exporting tag-jump scores\n") +qs::qsave(OTUTAB, + "TagJump_scores.qs", + preset = "custom", algorithm = "zstd", compress_level = 5L, nthreads = 1L) + + +## Plot +cat("..Making a plot\n") +PP <- ggplot(data = OTUTAB, aes(x = Total, y = Abundance, color = TagJump)) + + geom_point() + scale_x_log10() + scale_y_log10() + + scale_color_manual(values 
= c("#0C7C59", "#D64933")) + + labs(x = "Total abundance of OTU, reads", y = "Abundance of OTU in a sample, reads") + +cat("..Exporting a plot\n") +pdf(file = "TagJump_plot.pdf", width = 12, height = 9.5, useDingbats = FALSE) + PP +dev.off() + + + +## Exporting tag-jump data +# cat("..Exporting tag-jump data\n") +# JMPS <- OTUTAB[ TagJump == TRUE, .(OTU, SampleID) ] +# +# saveRDS(object = JMPS, +# file = "TagJump_OTUs.RData", +# compress = "xz") + + +## Prepare filtered sequence table, remove tag-jumps +cat("..Removing tag-jumps\n") + +## Add tag-jump info to the sequence table +n1 <- nrow(SEQTAB) + +RES <- merge( + x = SEQTAB, + y = OTUTAB[ , .(OTU, SampleID, TagJump) ], + by = c("OTU", "SampleID"), all.x = TRUE) + +n2 <- nrow(RES) +if(n1 != n2){ + cat("WARNING: merging went wrong likely\n") + cat("WARNING: There might be duplicated sequences\n") +} + + +## TJ stats +cat("..Calculating tag-jump summary\n") +TJ <- data.table( + Total_reads = sum(RES$Abundance), + Number_of_TagJump_Events = sum(RES$TagJump), + TagJump_reads = sum(RES[ TagJump == TRUE ]$Abundance, na.rm = T) + ) + +TJ$ReadPercent_removed <- with(TJ, (TagJump_reads / Total_reads * 100)) + +fwrite(x = TJ, file = "TagJump_stats.txt", sep = "\t") + + +## Keep only non-tag-jump reads +RES <- RES[ TagJump == FALSE , .(SampleID, SeqID, Abundance) ] +setorder(RES, SampleID, -Abundance) + +cat("..Exporting tag-jump filtered table\n") + +fwrite(x = RES, + file = "Seq_tab_TagJumpFiltered.txt.gz", + sep = "\t", compress = "gzip") diff --git a/src/pipecraft-core/service_scripts/NextITS/bin/validate_tags.R b/src/pipecraft-core/service_scripts/NextITS/bin/validate_tags.R index cdfe029..6cb4d4e 100755 --- a/src/pipecraft-core/service_scripts/NextITS/bin/validate_tags.R +++ b/src/pipecraft-core/service_scripts/NextITS/bin/validate_tags.R @@ -1,428 +1,488 @@ -#!/usr/bin/env Rscript - -## Script to validate tags (barcodes) used during sample multiplexing -## - Tags should be unique -## - Tag names should be unique -## 
- Tag names must be alphanumeric (ASCII-only) and must not contain whitespace, dot, comma, semicolon, or dash -## - Sequencing run ID could be present in tag names (before double underscore) -## - Checks the presence of positive and negative controls -## - Estimates number of unqiue tags and their length -## - For dual assymetric tags, -## unique barcodes are converted into a "long" format, -## a biosample tables (`biosamples_asym.csv` and `biosamples_sym.csv`), -## file naming scheme (`file_renaming.tsv`), -## and `unknown_combinations.tsv` are exported as well - -## Usage: -# validate_tags.R \ -# --tags tags.fasta \ -# --output tags_validated.fasta - - - -cat("Parsing input options and arguments...\n") - -suppressPackageStartupMessages(require(optparse)) - -## Parse arguments -option_list <- list( - make_option("--tags", action="store", default=NA, type='character', help="FASTA file with tags"), - make_option("--output", action="store", default=NA, type='character', help="FASTA file with validated tags") -) -opt <- parse_args(OptionParser(option_list=option_list)) - - -## Validation of the required argiments -if(is.na(opt$tags)){ - cat("Input file is not specified!\n", file=stderr()) - stop() -} -if(is.na(opt$output)){ - cat("Output file is not specified!\n", file=stderr()) - stop() -} - -## Assign variables -TAGS <- opt$tags -OUTP <- opt$output - -## Log assigned variables -cat(paste("FASTA file with tags: ", TAGS, "\n", sep="")) -cat(paste("Output file with validated tags: ", OUTP, "\n", sep="")) - -cat("\n") - - - -############## - -cat("Loading R packages...\n") - -load_pckg <- function(pkg = "data.table"){ - suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) - cat(paste(pkg, packageVersion(pkg), "\n")) -} - -load_pckg("data.table") -load_pckg("Biostrings") - -cat("\n") - - -## Load FASTA sequences -cat("..Loading sequence tables\n") -TAGS <- try( readDNAStringSet(filepath = TAGS, format="fasta") ) - -if("try-error" %in% 
class(TAGS)){ - cat("Error in reading FASTA file!\n", file=stderr()) - stop(TAGS) -} - -cat("Number of records in the file: ", length(TAGS), "\n") - - -######################## -######################## Validate sample names -######################## - -cat("\n\n===== Validating sample names =====\n\n") - -cat("Positive control: ", - ifelse(test = any(grepl(pattern = "PosC", x = names(TAGS))), - yes = "Present", no = "Absent"), "\n") - -cat("Negative control: ", - ifelse(test = any(grepl(pattern = "NegC", x = names(TAGS))), - yes = "Present", no = "Absent"), "\n") - -## Validate names -cat("\nValidating tag names\n") -newnames <- names(TAGS) - -cat("..Replacing leading and trailing spaces and tabs\n") -newnames <- trimws(x = newnames, which = "both") - -cat("..Replacing duplicated spaces, dashes, dots, commas, or semicolons\n") -newnames <- gsub(pattern = "\\s+", replacement = " ", x = newnames) -newnames <- gsub(pattern = "\\-+", replacement = "-", x = newnames) -newnames <- gsub(pattern = "\\.+", replacement = ".", x = newnames) -newnames <- gsub(pattern = ",+", replacement = ",", x = newnames) -newnames <- gsub(pattern = ";+", replacement = ";", x = newnames) - -cat("..Replacing disallowed symbols\n") -newnames <- iconv(newnames, from = "UTF-8", to = "ASCII//TRANSLIT") -newnames <- gsub(pattern = "[^[:alnum:]]", replacement = "_", x = newnames) - -cat("..Replacing the second occurrence of double underscore\n") -## need something like `sed 's/__/_/2g'` -newnames <- sub(pattern = "__", replacement = "TEMPTEMPTEMPTEMP", x = newnames) -newnames <- gsub(pattern = "_+", replacement = "_", x = newnames) -newnames <- sub(pattern = "TEMPTEMPTEMPTEMP", replacement = "__", x = newnames) - -## test: c("A__B", "B__C__D", "E__F__G__H", "A___3", "B___4___E") - - -## Find out which names were changed -renamed <- data.table( - OriginalName = names(TAGS), - NewName = newnames) - -renamed[ , Renamed := OriginalName != NewName ] -renamed <- renamed[ Renamed == TRUE ] - 
-if(nrow(renamed) > 0){ - cat("..The following tag names were corrected:\n") - print( - renamed[, .(OriginalName, NewName)], - nrows = nrow(renamed), trunc.cols = FALSE) - - cat("...Exporting renamed tag names\n") - fwrite(x = renamed[ , .(OriginalName, NewName)], - file = "tag_names_renamed.tsv", quote = FALSE, sep = "\t", col.names = FALSE) -} - -names(TAGS) <- newnames - - -## Check tag name uniqness -nuniq <- length(unique(names(TAGS))) -cat("\nAll tag names unique: ", - ifelse(test = (length(TAGS) == nuniq), - yes = "TRUE", no = "FALSE"), "\n") - -if(length(TAGS) != nuniq){ - cat("..Not all tag names are unique!\n") - cat("..Resolving tag name uniqness by adding sequential numbers to non-unique names\n") - - dups <- unique(names(TAGS)[ which(duplicated(names(TAGS))) ]) - cat("..Number of duplicates: ", length(dups), "\n") - cat("..Duplicated names: ", paste(dups, collapse = ", "), "\n") - - dtt <- data.table(ID = 1:length(TAGS), TagName = names(TAGS)) - dtt[ , Duplicated := TagName %in% dups ] - dtt[ , NewName := TagName ] - dtt[ - Duplicated == TRUE, - NewName := paste0(TagName, "_", 1:.N), - by = "TagName" ] - setorder(x = dtt, ID) - - names(TAGS) <- dtt$NewName - - rm(dtt) -} - - -## Check run name -TESTRUN <- grepl(pattern = "__", x = names(TAGS)) - -cat("\nTag names contain sequencing run ID: ", - ifelse(test = any(TESTRUN), - yes = "TRUE", no = "FALSE"), "\n") - -if(any(TESTRUN)){ - - ## Check that all samples contain RunID - cat("\nAll samples contain sequencing run ID: ", - ifelse(test = sum(TESTRUN) == length(names(TAGS)), - yes = "TRUE", no = "FALSE"), "\n") - - ## Check run name uniqness - dtt <- data.table(TagName = names(TAGS)) - dtt[ , c("RunID", "SampleID") := tstrsplit(x = TagName, split = "__", keep = 1:2) ] - - cat("Number of run IDs in tag names (ideally, should be = 1): ", length(unique(dtt$RunID)), "\n") - cat("Run IDs detected: ", paste(unique(dtt$RunID), collapse = ", "), "\n") - - rm(dtt) -} - - -######################## 
-######################## Validate sequences -######################## - -cat("\n\n===== Validating sequences =====\n\n") - - -DUAL <- grepl(pattern = "\\.\\.\\.", x = as.character(TAGS)) - -if(any(DUAL)){ - cat("Barcode type detected: Dual\n") -} else { - cat("Barcode type detected: Single (or dual symmetric)\n") -} - - -##### Single tag - -if(any(DUAL) == FALSE){ - - cat("Tag length: ", paste(unique(width(TAGS)), collapse = ", "), "\n") - - suniq <- length(unique(as.character(TAGS))) - cat("\nAll tag sequences unique: ", - ifelse(test = (length(TAGS) == suniq), - yes = "TRUE", no = "FALSE"), "\n") - - if(length(TAGS) != suniq){ - cat("..Not all tag sequences are unique!\n") - cat("..This should be resolved manually!\n") - - dup_name <- unique( names(TAGS)[ duplicated(as.character(TAGS)) ]) - dup_tags <- as.character(TAGS[ dup_name ]) - - cat("..Number of duplicated tags: ", length(dup_name), "\n") - - dupss <- TAGS[ TAGS %in% dup_tags ] - dups <- data.table( - TagNames = names(dupss), - Tags = as.character(dupss)) - - dup_smr <- dups[ , .( - TagNames = paste0("[ ", paste(TagNames, collapse = ", "), " ]") - ), - by = "Tags"] - - cat("..Duplicates: \n") - print(dup_smr, nrows = length(TAGS), trunc.cols = FALSE) - - stop("\nPlease fix the tag sequences!\n") - } - - ## Export FASTA - cat("Exporting validated tags in FASTA format\n") - - writeXStringSet( - x = TAGS, - filepath = OUTP, - compress = FALSE, - format = "fasta", - width = 9999) - -} # end of single tag - - - -##### Dual tags - -if(any(DUAL) == TRUE){ - - ## Convert to tabular format - dtt <- data.table( - SampleID = names(TAGS), - Tags = as.character(TAGS)) - - ## Split dual tags - dtt[ , c("Tag1", "Tag2") := tstrsplit(x = Tags, split = "\\.\\.\\.", keep = c(1,2)) ] - - ## Check if there are any missing tags - missing_tags <- dtt[ is.na(Tag1) | is.na(Tag2) ] - if(nrow(missing_tags) > 0){ - cat("WARNING: missing dual tags detected!\n") - print(missing_tags) - stop("\nPlease fix the tag sequences!\n") - } - 
- cat("..Forward tag length: ", paste(sort(unique(nchar(dtt$Tag1))), collapse = ", "), "\n") - cat("..Reverse tag length: ", paste(sort(unique(nchar(dtt$Tag2))), collapse = ", "), "\n") - - cat("\n") - cat("..Number of unique forward tags: ", length(unique(dtt$Tag1)), "\n") - cat("..Number of unique reverse tags: ", length(unique(dtt$Tag2)), "\n") - - ## Find unique barcodes - bu <- data.table(Sequence = unique(c(dtt$Tag1, dtt$Tag2))) - - ## Name unique barcodes - len <- nchar(nrow(bu)) - bu[ , ID := .I ] - bu[ , ID := sprintf(paste("%0", len, "d", sep = ""), ID) ] - bu[ , ID := paste0("bc", ID) ] - - ## Convert to FASTA - seqs <- DNAStringSet(x = bu$Sequence) - names(seqs) <- bu$ID - - - - ## Add bacrode IDs - dtt <- merge(x = dtt, y = bu, by.x = "Tag1", by.y = "Sequence", all.x = TRUE) - setnames(x = dtt, old = "ID", new = "ID1") - - dtt <- merge(x = dtt, y = bu, by.x = "Tag2", by.y = "Sequence", all.x = TRUE) - setnames(x = dtt, old = "ID", new = "ID2") - - dtt[ , Barcodes := paste0(ID1, "--", ID2)] - - - dtt[ , TagSymmetry := fifelse(Tag1 == Tag2, yes = "symmetric", no = "asymmetric", na = NA) ] - cat("Number of symmetric tag combinations: ", sum(dtt$TagSymmetry %in% "symmetric"), "\n") - cat("Number of asymmetric tag combinations: ", sum(dtt$TagSymmetry %in% "asymmetric"), "\n") - - ## Validate barcode combination uniqness - if(nrow(dtt) != length(unique(dtt$Barcodes))){ - cat("WARNING: non-unique barcode combination detected!\n") - - dups <- dtt$Barcodes[ duplicated(dtt$Barcodes) ] - print( dtt[ Barcodes %in% dups ] ) - - stop("\nPlease fix the tag sequences!\n") - } - - - ## Prepare biosample table for LIMA - # https://lima.how/faq/biosample.html - cat("\nExporting biosample tables: 'biosamples_sym.csv' and 'biosamples_asym.csv'\n") - - res <- data.table( - Barcodes = dtt$Barcodes, - `Bio Sample` = dtt$SampleID, - TagSymmetry = dtt$TagSymmetry) - - fwrite( - x = res[ TagSymmetry %in% "symmetric", .(Barcodes, `Bio Sample`) ] , - file = "biosamples_sym.csv", - 
quote = FALSE, sep = ",", col.names = TRUE) - - fwrite( - x = res[ TagSymmetry %in% "asymmetric", .(Barcodes, `Bio Sample`) ] , - file = "biosamples_asym.csv", - quote = FALSE, sep = ",", col.names = TRUE) - - - ## File for sample renaming - res[ , OldName := paste0("lima.", Barcodes, ".fq.gz") ] - res[ , NewName := paste0(`Bio Sample`, ".fq.gz") ] - - cat("Exporting file naming scheme: 'file_renaming.tsv'\n") - - fwrite(x = res[ , .(OldName, NewName)], - file = "file_renaming.tsv", quote = F, sep = "\t", col.names = FALSE) - - ## Export unique barcodes - cat("Exporting unique tags in FASTA format\n") - - writeXStringSet( - x = seqs, - filepath = OUTP, - compress = FALSE, - format = "fasta", - width = 9999) - - - ## Prepare unknown combinations - cat("Preparing unknown tag combinations\n") - - UNKN <- CJ( - Tag1 = unique(dtt$Tag1), - Tag2 = unique(dtt$Tag2)) - - UNKN <- merge(x = UNKN, y = bu, by.x = "Tag1", by.y = "Sequence", all.x = TRUE) - setnames(x = UNKN, old = "ID", new = "ID1") - - UNKN <- merge(x = UNKN, y = bu, by.x = "Tag2", by.y = "Sequence", all.x = TRUE) - setnames(x = UNKN, old = "ID", new = "ID2") - - ## Trying to parse RunID from the first sample - if(any(TESTRUN)){ - cat("WARNING: in assumption that there is a single sequencing run, RunID of the first sample will be used!\n") - RUNID <- tstrsplit(dtt$SampleID[1], split = "__", keep = 1)[[1]] - if(is.na(RUNID)){ - cat("WARNING: RunID is not found in the sample name\n") - RUNID <- "unknown" - } - } else { - RUNID <- "unknown" - } - - UNKN[ , IDS := paste0(ID1, "--", ID2) ] - UNKN[ , OldName := paste0("lima.", IDS, ".fq.gz") ] - UNKN[ , Barcodes := paste0(Tag1, "_", Tag2) ] - UNKN[ , NewName := paste0(RUNID, "__", Barcodes, ".fq.gz") ] - - ## Remove known combinations - UNKN <- UNKN[ !IDS %in% res$Barcodes ] - - cat("Number of possible unknown combinations: ", nrow(UNKN), "\n") - - ## Export unknown combinations - cat("Exporting unknown combinations\n") - - fwrite(x = UNKN[ , .(OldName, NewName)], - 
file = "unknown_combinations.tsv", quote = F, sep = "\t", col.names = FALSE) - -} # end of dual tags - - -cat("\nValidation finished\n") - - -##################### Session info - -cat("\nAll done.\n") -cat("\n") -cat("Session info:\n") -sessionInfo() -cat("\n") - +#!/usr/bin/env Rscript + +## Script to validate tags (barcodes) used during sample multiplexing +## - Tags should be unique +## - Tag names should be unique +## - Tag names must be alphanumeric (ASCII-only) and must not contain whitespace, dot, comma, semicolon, or dash +## - Sequencing run ID could be present in tag names (before double underscore) +## - Checks the presence of positive and negative controls +## - Estimates number of unqiue tags and their length +## - For dual assymetric tags, +## unique barcodes are converted into a "long" format, +## a biosample tables (`biosamples_asym.csv` and `biosamples_sym.csv`), +## file naming scheme (`file_renaming.tsv`), +## and `unknown_combinations.tsv` are exported as well + +## Usage: +# validate_tags.R \ +# --tags tags.fasta \ +# --output tags_validated.fasta + + + +cat("Parsing input options and arguments...\n") + +suppressPackageStartupMessages(require(optparse)) + +## Parse arguments +option_list <- list( + make_option("--tags", action="store", default=NA, type='character', help="FASTA file with tags"), + make_option("--output", action="store", default=NA, type='character', help="FASTA file with validated tags") +) +opt <- parse_args(OptionParser(option_list=option_list)) + + +## Validation of the required argiments +if(is.na(opt$tags)){ + cat("Input file is not specified!\n", file=stderr()) + stop() +} +if(is.na(opt$output)){ + cat("Output file is not specified!\n", file=stderr()) + stop() +} + +## Assign variables +TAGS <- opt$tags +OUTP <- opt$output + +## Log assigned variables +cat(paste("FASTA file with tags: ", TAGS, "\n", sep="")) +cat(paste("Output file with validated tags: ", OUTP, "\n", sep="")) + +cat("\n") + + + +############## + 
+cat("Loading R packages...\n") + +load_pckg <- function(pkg = "data.table"){ + suppressPackageStartupMessages( library(package = pkg, character.only = TRUE) ) + cat(paste(pkg, packageVersion(pkg), "\n")) +} + +load_pckg("data.table") +load_pckg("Biostrings") + +cat("\n") + + +## Load FASTA sequences +cat("..Loading sequence tables\n") +TAGS <- try( readDNAStringSet(filepath = TAGS, format="fasta") ) + +if("try-error" %in% class(TAGS)){ + cat("Error in reading FASTA file!\n", file=stderr()) + stop(TAGS) +} + +cat("Number of records in the file: ", length(TAGS), "\n") + + +######################## +######################## Validate sample names +######################## + +cat("\n\n===== Validating sample names =====\n\n") + +cat("Positive control: ", + ifelse(test = any(grepl(pattern = "PosC", x = names(TAGS))), + yes = "Present", no = "Absent"), "\n") + +cat("Negative control: ", + ifelse(test = any(grepl(pattern = "NegC", x = names(TAGS))), + yes = "Present", no = "Absent"), "\n") + +## Validate names +cat("\nValidating tag names\n") +newnames <- names(TAGS) + +cat("..Replacing leading and trailing spaces and tabs\n") +newnames <- trimws(x = newnames, which = "both") + +cat("..Replacing duplicated spaces, dashes, dots, commas, or semicolons\n") +newnames <- gsub(pattern = "\\s+", replacement = " ", x = newnames) +newnames <- gsub(pattern = "\\-+", replacement = "-", x = newnames) +newnames <- gsub(pattern = "\\.+", replacement = ".", x = newnames) +newnames <- gsub(pattern = ",+", replacement = ",", x = newnames) +newnames <- gsub(pattern = ";+", replacement = ";", x = newnames) + +cat("..Replacing disallowed symbols\n") +newnames <- iconv(newnames, from = "UTF-8", to = "ASCII//TRANSLIT") +newnames <- gsub(pattern = "[^[:alnum:]]", replacement = "_", x = newnames) + +cat("..Replacing the second occurrence of double underscore\n") +## need something like `sed 's/__/_/2g'` +newnames <- sub(pattern = "__", replacement = "TEMPTEMPTEMPTEMP", x = newnames) +newnames <- 
gsub(pattern = "_+", replacement = "_", x = newnames) +newnames <- sub(pattern = "TEMPTEMPTEMPTEMP", replacement = "__", x = newnames) + +## test: c("A__B", "B__C__D", "E__F__G__H", "A___3", "B___4___E") + + +## Find out which names were changed +renamed <- data.table( + OriginalName = names(TAGS), + NewName = newnames) + +renamed[ , Renamed := OriginalName != NewName ] +renamed <- renamed[ Renamed == TRUE ] + +if(nrow(renamed) > 0){ + cat("..The following tag names were corrected:\n") + print( + renamed[, .(OriginalName, NewName)], + nrows = nrow(renamed), trunc.cols = FALSE) + + cat("...Exporting renamed tag names\n") + fwrite(x = renamed[ , .(OriginalName, NewName)], + file = "tag_names_renamed.tsv", quote = FALSE, sep = "\t", col.names = FALSE) +} + +names(TAGS) <- newnames + + +## Check tag name uniqness +nuniq <- length(unique(names(TAGS))) +cat("\nAll tag names unique: ", + ifelse(test = (length(TAGS) == nuniq), + yes = "TRUE", no = "FALSE"), "\n") + +if(length(TAGS) != nuniq){ + cat("..Not all tag names are unique!\n") + cat("..Resolving tag name uniqness by adding sequential numbers to non-unique names\n") + + dups <- unique(names(TAGS)[ which(duplicated(names(TAGS))) ]) + cat("..Number of duplicates: ", length(dups), "\n") + cat("..Duplicated names: ", paste(dups, collapse = ", "), "\n") + + dtt <- data.table(ID = 1:length(TAGS), TagName = names(TAGS)) + dtt[ , Duplicated := TagName %in% dups ] + dtt[ , NewName := TagName ] + dtt[ + Duplicated == TRUE, + NewName := paste0(TagName, "_", 1:.N), + by = "TagName" ] + setorder(x = dtt, ID) + + names(TAGS) <- dtt$NewName + + rm(dtt) +} + + +## Check run name +TESTRUN <- grepl(pattern = "__", x = names(TAGS)) + +cat("\nTag names contain sequencing run ID: ", + ifelse(test = any(TESTRUN), + yes = "TRUE", no = "FALSE"), "\n") + +if(any(TESTRUN)){ + + ## Check that all samples contain RunID + cat("\nAll samples contain sequencing run ID: ", + ifelse(test = sum(TESTRUN) == length(names(TAGS)), + yes = "TRUE", no = 
"FALSE"), "\n") + + ## Check run name uniqness + dtt <- data.table(TagName = names(TAGS)) + dtt[ , c("RunID", "SampleID") := tstrsplit(x = TagName, split = "__", keep = 1:2) ] + + cat("Number of run IDs in tag names (ideally, should be = 1): ", length(unique(dtt$RunID)), "\n") + cat("Run IDs detected: ", paste(unique(dtt$RunID), collapse = ", "), "\n") + + rm(dtt) +} + + +######################## +######################## Validate sequences +######################## + +cat("\n\n===== Validating sequences =====\n\n") + + +DUAL <- grepl(pattern = "\\.\\.\\.", x = as.character(TAGS)) + +if(any(DUAL)){ + cat("Barcode type detected: Dual\n") + + if(any(!DUAL)){ + cat("WARNING: mixture of single and dual tags detected!\n") + print(names(TAGS)[ !DUAL ]) + stop("\nPlease fix the tag sequences (remove single tags or add double dots to dual tags)!\n") + } + +} else { + cat("Barcode type detected: Single (or dual symmetric)\n") +} + + +##### Single tag + +if(any(DUAL) == FALSE){ + + cat("Tag length: ", paste(unique(width(TAGS)), collapse = ", "), "\n") + + suniq <- length(unique(as.character(TAGS))) + cat("\nAll tag sequences unique: ", + ifelse(test = (length(TAGS) == suniq), + yes = "TRUE", no = "FALSE"), "\n") + + if(length(TAGS) != suniq){ + cat("..Not all tag sequences are unique!\n") + cat("..This should be resolved manually!\n") + + dup_name <- unique( names(TAGS)[ duplicated(as.character(TAGS)) ]) + dup_tags <- as.character(TAGS[ dup_name ]) + + cat("..Number of duplicated tags: ", length(dup_name), "\n") + + dupss <- TAGS[ TAGS %in% dup_tags ] + dups <- data.table( + TagNames = names(dupss), + Tags = as.character(dupss)) + + dup_smr <- dups[ , .( + TagNames = paste0("[ ", paste(TagNames, collapse = ", "), " ]") + ), + by = "Tags"] + + cat("..Duplicates: \n") + print(dup_smr, nrows = length(TAGS), trunc.cols = FALSE) + + stop("\nPlease fix the tag sequences!\n") + } + + ## Export FASTA + cat("Exporting validated tags in FASTA format\n") + + writeXStringSet( + x = 
TAGS, + filepath = OUTP, + compress = FALSE, + format = "fasta", + width = 9999) + +} # end of single tag + + + +##### Dual tags + +if(any(DUAL) == TRUE){ + + ## Convert to tabular format + dtt <- data.table( + SampleID = names(TAGS), + Tags = as.character(TAGS)) + + ## Split dual tags + dtt[ , c("Tag1", "Tag2") := tstrsplit(x = Tags, split = "\\.\\.\\.", keep = c(1,2)) ] + + ## Check if there are any missing tags + missing_tags <- dtt[ is.na(Tag1) | is.na(Tag2) ] + if(nrow(missing_tags) > 0){ + cat("WARNING: missing dual tags detected!\n") + print(missing_tags) + stop("\nPlease fix the tag sequences!\n") + } + + cat("..Forward tag length: ", paste(sort(unique(nchar(dtt$Tag1))), collapse = ", "), "\n") + cat("..Reverse tag length: ", paste(sort(unique(nchar(dtt$Tag2))), collapse = ", "), "\n") + + cat("\n") + cat("..Number of unique forward tags: ", length(unique(dtt$Tag1)), "\n") + cat("..Number of unique reverse tags: ", length(unique(dtt$Tag2)), "\n") + + ## Find unique barcodes + bu <- data.table(Sequence = unique(c(dtt$Tag1, dtt$Tag2))) + + ## Name unique barcodes + len <- nchar(nrow(bu)) + bu[ , ID := .I ] + bu[ , ID := sprintf(paste("%0", len, "d", sep = ""), ID) ] + bu[ , ID := paste0("bc", ID) ] + + ## Convert to FASTA + seqs <- DNAStringSet(x = bu$Sequence) + names(seqs) <- bu$ID + + + + ## Add bacrode IDs + dtt <- merge(x = dtt, y = bu, by.x = "Tag1", by.y = "Sequence", all.x = TRUE) + setnames(x = dtt, old = "ID", new = "ID1") + + dtt <- merge(x = dtt, y = bu, by.x = "Tag2", by.y = "Sequence", all.x = TRUE) + setnames(x = dtt, old = "ID", new = "ID2") + + dtt[ , Barcodes := paste0(ID1, "--", ID2)] + + + dtt[ , TagSymmetry := fifelse(Tag1 == Tag2, yes = "symmetric", no = "asymmetric", na = NA) ] + cat("Number of symmetric tag combinations: ", sum(dtt$TagSymmetry %in% "symmetric"), "\n") + cat("Number of asymmetric tag combinations: ", sum(dtt$TagSymmetry %in% "asymmetric"), "\n") + + ## Validate barcode combination uniqness + if(nrow(dtt) != 
length(unique(dtt$Barcodes))){ + cat("WARNING: non-unique barcode combination detected!\n") + + dups <- dtt$Barcodes[ duplicated(dtt$Barcodes) ] + print( dtt[ Barcodes %in% dups ] ) + + stop("\nPlease fix the tag sequences!\n") + } + + + ## Find unique barcode combinations (taking into account reverse complements) + dtt[, tag_pair_unordered := paste( + pmin(Tag1, Tag2), + pmax(Tag1, Tag2), + sep="|") ] + + dtt[, tag_pair_ordered := paste(Tag1, Tag2, sep="|") ] + + ## Swapped-pair ambiguity: X-Y exists AND Y-X exists (cannot disambiguate if you don't know direction) + swapped <- dtt[, .( + n_samples = .N, + n_ordered = uniqueN(tag_pair_ordered), + samples = paste(sort(SampleID), collapse=", "), + ordered_set = paste(sort(unique(tag_pair_ordered)), collapse=", ") + ), by = tag_pair_unordered][ n_samples > 1 ] + + if(nrow(swapped) > 0){ + cat("\nWARNING: swapped-pair tag ambiguity detected!\n") + cat("..Number of swapped-pair tag combinations: ", nrow(swapped), "\n") + cat("..Swapped-pair tag combinations: ", "\n") + print(swapped[ , .(samples, tag_pair_unordered) ]) + stop("\nIt is impossible to assign sample ID to sequences with swapped-pair tag combinations!\nPlease fix the tag sequences (e.g., combine primer sequence with the tag sequence!\n") + } + + + ## Prepare biosample table for LIMA + # https://lima.how/faq/biosample.html + cat("\nExporting biosample tables: 'biosamples_sym.csv' and 'biosamples_asym.csv'\n") + + res <- data.table( + Barcodes = dtt$Barcodes, + `Bio Sample` = dtt$SampleID, + TagSymmetry = dtt$TagSymmetry) + + setorder(x = res, `Bio Sample`) + + fwrite( + x = res[ TagSymmetry %in% "symmetric", .(Barcodes, `Bio Sample`) ] , + file = "biosamples_sym.csv", + quote = FALSE, sep = ",", col.names = TRUE) + + fwrite( + x = res[ TagSymmetry %in% "asymmetric", .(Barcodes, `Bio Sample`) ] , + file = "biosamples_asym.csv", + quote = FALSE, sep = ",", col.names = TRUE) + + + ## File for sample renaming + res[ , OldName := paste0("lima.", Barcodes, 
".fq.gz") ] + res[ , NewName := paste0(`Bio Sample`, ".fq.gz") ] + + ## The order of tags can be different in the FASTQ file names + ## Ensure that we keep track of both options (x--y and y--x) + tmp <- copy(res) + tmp[ , c("ID1", "ID2") := tstrsplit(x = Barcodes, split = "--", keep = 1:2) ] + tmp[ , Barcodes := paste0(ID2, "--", ID1) ] + tmp[ , OldName := paste0("lima.", Barcodes, ".fq.gz") ] + tmp[ , ID1 := NULL ] + tmp[ , ID2 := NULL ] + + res <- rbind(res, tmp) + rm(tmp) + res <- unique(res, by = "OldName") + setorder(x = res, Barcodes) + + cat("Exporting file naming scheme: 'file_renaming.tsv'\n") + + fwrite(x = res[ , .(OldName, NewName)], + file = "file_renaming.tsv", quote = F, sep = "\t", col.names = FALSE) + + ## Export unique barcodes + cat("Exporting unique tags in FASTA format\n") + + writeXStringSet( + x = seqs, + filepath = OUTP, + compress = FALSE, + format = "fasta", + width = 9999) + + + ## Prepare unknown combinations + cat("Preparing unknown tag combinations\n") + + UNKN <- CJ( + Tag1 = unique(dtt$Tag1), + Tag2 = unique(dtt$Tag2)) + + UNKN <- merge(x = UNKN, y = bu, by.x = "Tag1", by.y = "Sequence", all.x = TRUE) + setnames(x = UNKN, old = "ID", new = "ID1") + + UNKN <- merge(x = UNKN, y = bu, by.x = "Tag2", by.y = "Sequence", all.x = TRUE) + setnames(x = UNKN, old = "ID", new = "ID2") + + ## Trying to parse RunID from the first sample + if(any(TESTRUN)){ + cat("WARNING: in assumption that there is a single sequencing run, RunID of the first sample will be used!\n") + RUNID <- tstrsplit(dtt$SampleID[1], split = "__", keep = 1)[[1]] + if(is.na(RUNID)){ + cat("WARNING: RunID is not found in the sample name\n") + RUNID <- "unknown" + } + } else { + RUNID <- "unknown" + } + + UNKN1 <- copy(UNKN) + UNKN1[ , IDS := paste0(ID1, "--", ID2) ] + UNKN1[ , OldName := paste0("lima.", IDS, ".fq.gz") ] + UNKN1[ , Barcodes := paste0(Tag1, "_", Tag2) ] + UNKN1[ , NewName := paste0(RUNID, "__", Barcodes, ".fq.gz") ] + + UNKN2 <- copy(UNKN) + UNKN2[ , IDS := 
paste0(ID2, "--", ID1) ] + UNKN2[ , OldName := paste0("lima.", IDS, ".fq.gz") ] + UNKN2[ , Barcodes := paste0(Tag2, "_", Tag1) ] + UNKN2[ , NewName := paste0(RUNID, "__", Barcodes, ".fq.gz") ] + + UNKN <- rbind(UNKN1, UNKN2) + rm(UNKN1, UNKN2) + UNKN <- unique(UNKN, by = "OldName") + setorder(x = UNKN, OldName) + + ## Remove known combinations + UNKN <- UNKN[ !IDS %in% res$Barcodes ] + + cat("Number of possible unknown combinations: ", nrow(UNKN), "\n") + + ## Export unknown combinations + cat("Exporting unknown combinations\n") + + fwrite(x = UNKN[ , .(OldName, NewName)], + file = "unknown_combinations.tsv", quote = F, sep = "\t", col.names = FALSE) + +} # end of dual tags + + +cat("\nValidation finished\n") + + +##################### Session info + +cat("\nAll done.\n") +cat("\n") +cat("Session info:\n") +sessionInfo() +cat("\n") + diff --git a/src/pipecraft-core/service_scripts/NextITS/conf/docker.config b/src/pipecraft-core/service_scripts/NextITS/conf/docker.config index ff66239..16c0eab 100644 --- a/src/pipecraft-core/service_scripts/NextITS/conf/docker.config +++ b/src/pipecraft-core/service_scripts/NextITS/conf/docker.config @@ -3,7 +3,7 @@ process { // Container from Docker Hub - container = 'vmikk/nextits:1.1.0' + container = 'vmikk/nextits:1.2.0' //// Container from Quay.io registry // container = 'docker://quay.io/vmikk/nextits:1.1.0' diff --git a/src/pipecraft-core/service_scripts/NextITS/conf/hpc.config b/src/pipecraft-core/service_scripts/NextITS/conf/hpc.config index 78ec8f7..07877a4 100644 --- a/src/pipecraft-core/service_scripts/NextITS/conf/hpc.config +++ b/src/pipecraft-core/service_scripts/NextITS/conf/hpc.config @@ -190,6 +190,16 @@ process { cpus = 20 } + // Bucketizing workflow - merge chunks into a single file + withName: 'S2:merge_buckets' { + cpus = 4 + } + + // Merge UC files + withName: 'S2:merge_uc' { + cpus = 4 + } + // Summarize sequence abundance by OTU withName: 'S2:summarize' { cpus = 12 diff --git 
a/src/pipecraft-core/service_scripts/NextITS/conf/hpc_utslurm.config b/src/pipecraft-core/service_scripts/NextITS/conf/hpc_utslurm.config index 7473c5b..676654f 100644 --- a/src/pipecraft-core/service_scripts/NextITS/conf/hpc_utslurm.config +++ b/src/pipecraft-core/service_scripts/NextITS/conf/hpc_utslurm.config @@ -275,6 +275,20 @@ process { time = { check_max( 24.h * task.attempt, 'time' ) } } + // Bucketizing workflow - merge chunks into a single file + withName: 'S2:merge_buckets' { + cpus = 4 + memory = { check_max( 40.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } + + // Merge UC files + withName: 'S2:merge_uc' { + cpus = 4 + memory = { check_max( 40.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } + // Summarize sequence abundance by OTU withName: 'S2:summarize' { cpus = 2 diff --git a/src/pipecraft-core/service_scripts/NextITS/conf/params.config b/src/pipecraft-core/service_scripts/NextITS/conf/params.config index 7c82e12..7d34acb 100644 --- a/src/pipecraft-core/service_scripts/NextITS/conf/params.config +++ b/src/pipecraft-core/service_scripts/NextITS/conf/params.config @@ -102,7 +102,8 @@ params { ITSx_tax = "all" ITSx_complement = "F" // "F" (check single strand) or "T" (check both DNA strands for matches to HMM-profiles) /// ITSx_singledomain = true .... 
optional arguments - ITSx_to_parquet = true // convert ITSx output (FASTA files) to Parquet + ITSx_to_parquet = true // convert ITSx output (FASTA files) to Parquet + ITSx_chunk_size = 10000 // chunk size (number of dereplicated sequences per sample) for distributed ITSx processing; set to 0 to disable chunking // Primer trimming (for Illumina) trim_minlen = 10 diff --git a/src/pipecraft-core/service_scripts/NextITS/conf/singularity.config b/src/pipecraft-core/service_scripts/NextITS/conf/singularity.config index a65b3b1..0186233 100644 --- a/src/pipecraft-core/service_scripts/NextITS/conf/singularity.config +++ b/src/pipecraft-core/service_scripts/NextITS/conf/singularity.config @@ -3,14 +3,14 @@ process { // Container from Singularity library - container = 'library://vmiks/nextits/nextits:1-1-0' + container = 'library://vmiks/nextits/nextits:1-2-0' // Container from Quay.io registry [not working yet] - // container = 'quay.io/vmikk/nextits:1.1.0' + // container = 'quay.io/vmikk/nextits:1.2.0' // Container from Docker Hub (will be converted to Singularity image) - // container = 'vmikk/nextits:1.1.0' + // container = 'vmikk/nextits:1.2.0' // Local Singularity image file - // container = '/path/to/local/file/nextits-1-1-0.sif' + // container = '/path/to/local/file/nextits-1-2-0.sif' } diff --git a/src/pipecraft-core/service_scripts/NextITS/containerfiles/NextITS.def b/src/pipecraft-core/service_scripts/NextITS/containerfiles/NextITS.def index c2262ba..86c1a35 100644 --- a/src/pipecraft-core/service_scripts/NextITS/containerfiles/NextITS.def +++ b/src/pipecraft-core/service_scripts/NextITS/containerfiles/NextITS.def @@ -6,26 +6,26 @@ ## Build stage (Rust and Cargo), for runiq Bootstrap: docker -From: rust:1.89.0-slim +From: rust:1.92.0-slim Stage: build %post # rustup --version # 1.28.2 - # rustc --version # 1.89.0 - # cargo --version # 1.89.0 + # rustc --version # 1.92.0 + # cargo --version # 1.92.0 cargo install runiq sd - # runiq --version # 2.0.0 + # runiq 
--version # 2.1.0 # sd --version # 1.0.0 ## Main stage Bootstrap: docker -From: rocker/r-ver:4.5.1 +From: rocker/r-ver:4.5.2 Stage: final %labels Maintainer vladimir.mikryukov@ut.ee - R_Version 4.5.1 - NextITS_container_version 1.1.0 + R_Version 4.5.2 + NextITS_container_version 1.2.0 %runscript exec "$@" @@ -47,7 +47,7 @@ Stage: final ## Install the required dependencies %post - export R_VERSION=4.5.1 + export R_VERSION=4.5.2 apt-get update -qq \ && apt-get -y --no-install-recommends install \ @@ -87,11 +87,17 @@ Stage: final && R -e 'BiocManager::install("phyloseq", ask = FALSE)' \ && rm -rf /tmp/downloaded_packages - install2.r --error --skipinstalled geodist phytools \ + install2.r --error --skipinstalled \ + geodist \ + phytools \ + ggdendro \ + gridExtra \ + && R -e 'ok <- tryCatch({ remotes::install_github("mikemc/speedyseq"); TRUE }, error=function(e){ message(e); FALSE }); \ + if (!ok || !requireNamespace("speedyseq", quietly=TRUE)) quit(status=1)' \ && R -e 'ok <- tryCatch({ remotes::install_github("vmikk/metagMisc"); TRUE }, error=function(e){ message(e); FALSE }); \ if (!ok || !requireNamespace("metagMisc", quietly=TRUE)) quit(status=1)' \ - && R -e 'ok <- tryCatch({ remotes::install_cran("qs2", type = "source", configure.args = "--with-simd=AVX2"); TRUE }, error=function(e){ message(e); FALSE }); \ - if (!ok || !requireNamespace("qs2", quietly=TRUE)) quit(status=1)' \ + && R -e 'ok <- tryCatch({ remotes::install_cran("qs", type = "source", configure.args = "--with-simd=AVX2"); TRUE }, error=function(e){ message(e); FALSE }); \ + if (!ok || !requireNamespace("qs", quietly=TRUE)) quit(status=1)' \ && rm -rf /tmp/downloaded_packages ## Install conda @@ -127,23 +133,23 @@ Stage: final ${conda_prefix}/bin/mamba install --quiet --yes \ "lima>=2.13.0" \ "pbtk>=3.5.0" \ - "vsearch>=2.30.0" \ - "swarm>=3.1.5" \ - "seqkit>=2.10.1" \ - "seqfu>=1.22.3" \ + "vsearch>=2.30.3" \ + "swarm>=3.1.6" \ + "seqkit>=2.12.0" \ + "seqfu>=1.23.0" \ "fastp>=1.0.1" \ 
"blast>=2.17.0" \ "bioawk" \ - "miller>=6.13.0" \ + "miller>=6.16.0" \ "xsv>=0.13.0" \ "bedtools>=2.31.1" \ - "parallel>=20250622" \ - "csvtk>=0.34.0" \ + "parallel>=20251122" \ + "csvtk>=0.36.0" \ + "cutadapt>=5.2" \ "itsx>=1.1.3" \ - "cutadapt>=5.1" \ - "bbmap>=39.33" \ - "ripgrep>=14.1.1" \ - "fd-find>=10.2.0" \ + "bbmap>=39.52" \ + "ripgrep>=15.1.0" \ + "fd-find>=10.3.0" \ "mmseqs2" ## seqhasher @@ -154,7 +160,7 @@ Stage: final ## phredsort # https://github.com/vmikk/phredsort - wget https://github.com/vmikk/phredsort/releases/download/1.3.0/phredsort + wget https://github.com/vmikk/phredsort/releases/download/1.4.0/phredsort chmod +x phredsort mv phredsort ${conda_prefix}/bin/ @@ -175,7 +181,7 @@ Stage: final ## rush # https://github.com/shenwei356/rush - wget https://github.com/shenwei356/rush/releases/download/v0.7.0/rush_linux_amd64.tar.gz + wget https://github.com/shenwei356/rush/releases/download/v0.8.0/rush_linux_amd64.tar.gz tar -xzf rush_linux_amd64.tar.gz mv rush ${conda_prefix}/bin/ rm rush_linux_amd64.tar.gz @@ -197,6 +203,7 @@ Stage: final rm -r mumu ## Get the updated ITSx databases ["Version 2", 5 April 2024, curated by Henrik Nilsson] + ## NB! 
Currently, there is no X.hmm profile (Apusozoa) git clone --depth 1 https://github.com/USDA-ARS-GBRU/ITS_HMMs/ ## Compress and index HMM flatfiles find ITS_HMMs/ITSx_db/HMMs/ -name "*.hmm" | grep -v "N.hmm" \ @@ -211,7 +218,7 @@ Stage: final ## DuckDB # https://duckdb.org/docs/installation/?version=stable - curl -L https://github.com/duckdb/duckdb/releases/download/v1.3.2/duckdb_cli-linux-amd64.zip -o duckdb_cli-linux-amd64.zip \ + curl -L https://github.com/duckdb/duckdb/releases/download/v1.4.3/duckdb_cli-linux-amd64.zip -o duckdb_cli-linux-amd64.zip \ && unzip duckdb_cli-linux-amd64.zip -d ${conda_prefix}/bin/ \ && rm duckdb_cli-linux-amd64.zip @@ -250,7 +257,7 @@ Stage: final required_packages <- c('optparse', 'R.utils', 'data.table', 'arrow', 'duckdb', 'plyr', 'dplyr', 'ggplot2', 'doFuture', 'openxlsx', 'yaml', 'Biostrings', 'ShortRead', 'DECIPHER', 'dada2', 'phyloseq', - 'metagMisc', 'qs2') + 'metagMisc', 'qs') for(pkg in required_packages) { cat('Testing package:', pkg, '... ') diff --git a/src/pipecraft-core/service_scripts/NextITS/containerfiles/NextITS.dockerfile b/src/pipecraft-core/service_scripts/NextITS/containerfiles/NextITS.dockerfile index 36a6d78..99e1e4c 100644 --- a/src/pipecraft-core/service_scripts/NextITS/containerfiles/NextITS.dockerfile +++ b/src/pipecraft-core/service_scripts/NextITS/containerfiles/NextITS.dockerfile @@ -9,17 +9,17 @@ # docker build --target test --tag nextits-test --file NextITS.dockerfile . 
## Build stage 1 (Rust and Cargo) -FROM rust:1.89.0-slim AS rust +FROM rust:1.92.0-slim AS rust RUN cargo install runiq sd ## Build stage 2 - Main -FROM rocker/r-ver:4.5.1 AS main +FROM rocker/r-ver:4.5.2 AS main ENV LANG=C.UTF-8 ENV LC_ALL=C.UTF-8 ENV SHELL=/bin/bash LABEL org.opencontainers.image.authors="vladimir.mikryukov@ut.ee" -LABEL org.opencontainers.image.version="1.1.0" +LABEL org.opencontainers.image.version="1.2.0" RUN apt-get update -qq \ && apt-get -y --no-install-recommends install \ @@ -54,11 +54,17 @@ RUN R -e 'BiocManager::install("Biostrings", ask = FALSE)' \ && R -e 'BiocManager::install("phyloseq", ask = FALSE)' \ && rm -rf /tmp/downloaded_packages -RUN install2.r --error --skipinstalled geodist phytools \ +RUN install2.r --error --skipinstalled \ + geodist \ + phytools \ + ggdendro \ + gridExtra \ + && R -e 'ok <- tryCatch({ remotes::install_github("mikemc/speedyseq"); TRUE }, error=function(e){ message(e); FALSE }); \ + if (!ok || !requireNamespace("speedyseq", quietly=TRUE)) quit(status=1)' \ && R -e 'ok <- tryCatch({ remotes::install_github("vmikk/metagMisc"); TRUE }, error=function(e){ message(e); FALSE }); \ if (!ok || !requireNamespace("metagMisc", quietly=TRUE)) quit(status=1)' \ - && R -e 'ok <- tryCatch({ remotes::install_cran("qs2", type = "source", configure.args = "--with-simd=AVX2"); TRUE }, error=function(e){ message(e); FALSE }); \ - if (!ok || !requireNamespace("qs2", quietly=TRUE)) quit(status=1)' \ + && R -e 'ok <- tryCatch({ remotes::install_cran("qs", type = "source", configure.args = "--with-simd=AVX2"); TRUE }, error=function(e){ message(e); FALSE }); \ + if (!ok || !requireNamespace("qs", quietly=TRUE)) quit(status=1)' \ && rm -rf /tmp/downloaded_packages ## Install conda @@ -91,25 +97,25 @@ RUN cd /opt/software \ RUN /opt/software/conda/bin/mamba install -y \ "lima>=2.13.0" \ "pbtk>=3.5.0" \ - "vsearch>=2.30.0" \ - "swarm>=3.1.5" \ - "seqkit>=2.10.1" \ - "seqfu>=1.22.3" \ + "vsearch>=2.30.3" \ + "swarm>=3.1.6" \ + 
"seqkit>=2.12.0" \ + "seqfu>=1.23.0" \ "fastp>=1.0.1" \ "blast>=2.17.0" \ "bioawk" \ - "miller>=6.13.0" \ + "miller>=6.16.0" \ "xsv>=0.13.0" \ "bedtools>=2.31.1" \ - "parallel>=20250622" \ - "csvtk>=0.34.0" \ - "cutadapt>=5.1" \ + "parallel>=20251122" \ + "csvtk>=0.36.0" \ + "cutadapt>=5.2" \ "itsx>=1.1.3" \ - "bbmap>=39.33" \ - "ripgrep>=14.1.1" \ - "fd-find>=10.2.0" \ + "bbmap>=39.52" \ + "ripgrep>=15.1.0" \ + "fd-find>=10.3.0" \ "mmseqs2" \ - && /opt/software/conda/bin/mamba clean --all --yes + && /opt/software/conda/bin/conda clean --all --yes ## Install cutadapt (with dependencies) from pip - it fails with conda (Python 3.13 confilict) @@ -121,7 +127,7 @@ RUN cd /opt/software \ && wget https://github.com/vmikk/seqhasher/releases/download/1.1.2/seqhasher \ && chmod +x seqhasher \ && mv seqhasher /opt/software/conda/bin/ \ - && wget https://github.com/vmikk/phredsort/releases/download/1.3.0/phredsort \ + && wget https://github.com/vmikk/phredsort/releases/download/1.4.0/phredsort \ && chmod +x phredsort \ && mv phredsort /opt/software/conda/bin/ \ && wget https://github.com/vmikk/ucs/releases/download/0.8.0/ucs \ @@ -137,7 +143,7 @@ RUN git clone --depth 1 https://github.com/indraniel/fqgrep \ && rm -r fqgrep ## rush -RUN wget https://github.com/shenwei356/rush/releases/download/v0.7.0/rush_linux_amd64.tar.gz \ +RUN wget https://github.com/shenwei356/rush/releases/download/v0.8.0/rush_linux_amd64.tar.gz \ && tar -xzf rush_linux_amd64.tar.gz \ && mv rush /opt/software/conda/bin/ \ && rm rush_linux_amd64.tar.gz @@ -172,7 +178,7 @@ RUN cd /opt/software \ ## Install DuckDB RUN cd /opt/software \ - && curl -L https://github.com/duckdb/duckdb/releases/download/v1.3.2/duckdb_cli-linux-amd64.zip -o duckdb_cli-linux-amd64.zip \ + && curl -L https://github.com/duckdb/duckdb/releases/download/v1.4.3/duckdb_cli-linux-amd64.zip -o duckdb_cli-linux-amd64.zip \ && unzip duckdb_cli-linux-amd64.zip -d /opt/software/conda/bin/ \ && rm duckdb_cli-linux-amd64.zip @@ -214,7 +220,7 
@@ ENTRYPOINT ["/opt/software/entrypoint.sh"] FROM main AS test # Set environment variable for R version testing -ENV R_VERSION=4.5.1 +ENV R_VERSION=4.5.2 RUN echo "=== Testing R installation and packages ===" \ && R --quiet -e "stopifnot(getRversion() == '${R_VERSION}')" \ @@ -223,7 +229,7 @@ RUN echo "=== Testing R installation and packages ===" \ 'required_packages <- c("optparse", "data.table", "arrow", "duckdb",' \ ' "plyr", "dplyr", "ggplot2", "openxlsx", "yaml",' \ ' "Biostrings", "DECIPHER", "dada2", "phyloseq",' \ - ' "metagMisc", "qs2")' \ + ' "metagMisc", "qs")' \ '' \ 'for(pkg in required_packages) {' \ ' cat("Testing package:", pkg, "... ")' \ diff --git a/src/pipecraft-core/service_scripts/NextITS/main.nf b/src/pipecraft-core/service_scripts/NextITS/main.nf index 06dd4c0..2da9cc3 100644 --- a/src/pipecraft-core/service_scripts/NextITS/main.nf +++ b/src/pipecraft-core/service_scripts/NextITS/main.nf @@ -232,8 +232,8 @@ if( params.step == "Step1" ) { // Additional parameter validation for Step-2 if( params.step == "Step2" ) { - if (params.preclustering == "none" && params.clustering == "none"){ - println errorMsg("Pre-clustering and clustering could not be both set to 'none'", params.monochrome_logs) + if (params.preclustering == "none" && params.clustering == "none" && params.lulu == true){ + println errorMsg("LULU can not be applied when pre-clustering and clustering are set to 'none'", params.monochrome_logs) exit(1) } diff --git a/src/pipecraft-core/service_scripts/NextITS/nextflow.config b/src/pipecraft-core/service_scripts/NextITS/nextflow.config index 330dc42..a723b6d 100644 --- a/src/pipecraft-core/service_scripts/NextITS/nextflow.config +++ b/src/pipecraft-core/service_scripts/NextITS/nextflow.config @@ -252,6 +252,16 @@ process { cpus = 8 } + // Bucketizing workflow - merge chunks into a single file + withName: 'S2:merge_buckets' { + cpus = 4 + } + + // Merge UC files + withName: 'S2:merge_uc' { + cpus = 4 + } + // Summarize sequence abundance 
by OTU withName: 'S2:summarize' { cpus = 4 diff --git a/src/pipecraft-core/service_scripts/NextITS/nextflow_schema.json b/src/pipecraft-core/service_scripts/NextITS/nextflow_schema.json index 3154109..c19cf23 100644 --- a/src/pipecraft-core/service_scripts/NextITS/nextflow_schema.json +++ b/src/pipecraft-core/service_scripts/NextITS/nextflow_schema.json @@ -197,6 +197,12 @@ "fa_icon": "fas fa-database", "description": "Convert ITSx output (FASTA files) to Parquet" }, + "ITSx_chunk_size": { + "type": "integer", + "default": 10000, + "fa_icon": "fas fa-database", + "description": "Chunk size for distributed ITSx processing (number of dereplicated sequences per sample); set to 0 to disable chunking" + }, "hp": { "type": "boolean", "default": true, diff --git a/src/pipecraft-core/service_scripts/NextITS/subworkflows/clustering_subworkflow.nf b/src/pipecraft-core/service_scripts/NextITS/subworkflows/clustering_subworkflow.nf index 0e4721e..d5e7f1b 100644 --- a/src/pipecraft-core/service_scripts/NextITS/subworkflows/clustering_subworkflow.nf +++ b/src/pipecraft-core/service_scripts/NextITS/subworkflows/clustering_subworkflow.nf @@ -421,12 +421,14 @@ workflow CLUSTERING { if ( params.preclustering == "none" || params.preclustering == null ) { denoise_ch = derep_ch preclustuc_ch = file('NoPrecluster') + preclustaf_ch = file('NoPreclusterFASTA') // Denoise with UNOISE } else if ( params.preclustering == "unoise" ) { unoise(derep_ch) denoise_ch = unoise.out.unoise preclustuc_ch = unoise.out.unoise_uc + preclustaf_ch = denoise_ch // Denoise with DADA2 } else if ( params.preclustering == "dada2" ) { @@ -436,19 +438,42 @@ workflow CLUSTERING { dada2(derep_ch) denoise_ch = dada2.out.dada preclustuc_ch = dada2.out.dada_uc + preclustaf_ch = denoise_ch } + // // Dereplicate and denoise by sequencing run + // if(params.dada2_pooling == "byrun"){ + // + // dereplication_byrun(ch_seqs) + // dada2(dereplication_byrun.out.dereps.flatten()) + // + // dada2pool( + // 
dereplication.out.derep_uc, + // dada2.out.dada_ucr.collect() + // ) + // + // /* + // denoise_ch = dada2pool.out.dada + // preclustuc_ch = dada2pool.out.dada_uc + // preclustaf_ch = file('NoPreclusterFASTA') + // + // */ + // } + + // Precluster with SWARM } else if ( params.preclustering == "swarm_d1" ){ precluster_swarm(derep_ch) denoise_ch = precluster_swarm.out.clust preclustuc_ch = precluster_swarm.out.clust_uc + preclustaf_ch = denoise_ch // Global homopolymer correction } else if ( params.preclustering == "homopolymer" ){ homopolymer(derep_ch) denoise_ch = homopolymer.out.hp preclustuc_ch = homopolymer.out.hp_uc + preclustaf_ch = denoise_ch } @@ -473,6 +498,7 @@ workflow CLUSTERING { cluster_ch = precluster_swarm.out.clust clustuc_ch = precluster_swarm.out.clust_uc preclustuc_ch = file('NoPrecluster') + preclustaf_ch = file('NoPreclusterFASTA') // Otherwise, run SWARM } else { @@ -486,6 +512,7 @@ workflow CLUSTERING { cluster_ch = unoise.out.unoise clustuc_ch = unoise.out.unoise_uc preclustuc_ch = file('NoPrecluster') + preclustaf_ch = file('NoPreclusterFASTA') // Do not cluster, use ASVs from DADA2 } else if ( params.preclustering == "dada2" & params.clustering == "none" ){ @@ -496,17 +523,18 @@ workflow CLUSTERING { } preclustuc_ch = file('NoPrecluster') + preclustaf_ch = file('NoPreclusterFASTA') } else if ( params.preclustering == "none" & params.clustering == "none" ){ println "No pre-clustering or clustering was done" - - // TODO: create table based on dereplicated sequences? 
+ // This is done outside the clustering subworkflow } emit: - preclustuc_ch = preclustuc_ch - cluster_ch = cluster_ch - clustuc_ch = clustuc_ch + preclustuc_ch = preclustuc_ch // UC file for pre-clustering + preclustaf_ch = preclustaf_ch // FASTA file for pre-clustering + cluster_ch = cluster_ch // FASTA file for clustering + clustuc_ch = clustuc_ch // UC file for clustering } // end of subworkflow diff --git a/src/pipecraft-core/service_scripts/NextITS/subworkflows/itsx_subworkflow.nf b/src/pipecraft-core/service_scripts/NextITS/subworkflows/itsx_subworkflow.nf new file mode 100644 index 0000000..629eea0 --- /dev/null +++ b/src/pipecraft-core/service_scripts/NextITS/subworkflows/itsx_subworkflow.nf @@ -0,0 +1,564 @@ +/* +============================================================================ + NextITS: Pipeline to process eukaryotic ITS amplicons +============================================================================ + License: Apache-2.0 + Github : https://github.com/vmikk/NextITS + Website: https://Next-ITS.github.io/ +---------------------------------------------------------------------------- +*/ + +// Subworkflow for ITSx processing +// (which splits large dereplicated FASTAs into chunks). +// The workflow is as follows: +// 1. Trim primers and dereplicate at sample level +// 2. Split the dereplicated primer-trimmed sequences (at sample level) into chunks while preserving metadata +// 3. Run ITSx on each chunk +// 4. 
Group results back by original sample ID and concatenate + convert ITSx output to Parquet + +// Path to the output results +out_3_itsx = params.outdir + "/03_ITSx" +out_3_itsxp = params.outdir + "/03_ITSx_PooledParts" + + +// Trim primers and dereplicate at sample level +process primer_trim { + + label "main_container" + + publishDir "${out_3_itsx}", mode: "${params.storagemode}" + // cpus 2 + + // Add sample ID to the log file + tag "${meta.id}" + + input: + tuple val(meta), path(fastq) + + output: + tuple val(meta), path("${meta.id}_derep.fasta.gz"), emit: derep, optional: true + tuple val(meta), path("${meta.id}_hash_table.txt.gz"), emit: hashes, optional: true + tuple val(meta), path("${meta.id}_uc.uc.gz"), emit: uc, optional: true + tuple val(meta), path("${meta.id}_primertrimmed_sorted.fq.gz"), emit: trimmed_seqs, optional: true + tuple val("${task.process}"), val('cutadapt'), eval('cutadapt --version'), topic: versions + tuple val("${task.process}"), val('vsearch'), eval('vsearch --version 2>&1 | head -n 1 | sed "s/vsearch //g" | sed "s/,.*//g" | sed "s/^v//" | sed "s/_.*//"'), topic: versions + tuple val("${task.process}"), val('seqkit'), eval('seqkit version | sed "s/seqkit v//"'), topic: versions + tuple val("${task.process}"), val('phredsort'), eval('phredsort -v | sed "s/phredsort //"'), topic: versions + tuple val("${task.process}"), val('seqhasher'), eval('seqhasher -v | sed "s/SeqHasher //"'), topic: versions + tuple val("${task.process}"), val('parallel'), eval('parallel --version | head -n 1 | sed "s/GNU parallel //"'), topic: versions + tuple val("${task.process}"), val('brename'), eval('brename --help | head -n 4 | tail -1 | sed "s/Version: //"'), topic: versions + + script: + sampID="${meta.id}" + """ + echo -e "Primer trimming and dereplication at sample level\\n" + echo -e "Input sample: " ${sampID} + + ## Trim primers + echo -e "Trimming primers\\n" + + ## Reverse-complement rev primer + RR=\$(rc.sh ${params.primer_reverse}) + + cutadapt \ + 
-a ${params.primer_forward}";required;min_overlap=${params.primer_foverlap}"..."\$RR"";required;min_overlap=${params.primer_roverlap}" \ + --errors ${params.primer_mismatches} \ + --revcomp --rename "{id}" \ + --discard-untrimmed \ + --minimum-length ${params.trim_minlen} \ + --cores ${task.cpus} \ + --action trim \ + --output ${sampID}_primertrimmed.fq.gz \ + ${fastq} + + echo -e "..Done\\n" + + ## Check if there are sequences in the output + NUMSEQS=\$( seqkit stat --tabular --quiet ${sampID}_primertrimmed.fq.gz | awk -F'\t' 'NR==2 {print \$4}' ) + echo -e "Number of sequences after primer trimming: " \$NUMSEQS + if [ \$NUMSEQS -lt 1 ]; then + echo -e "\\nIt looks like no reads remained after trimming the primers\\n" + exit 0 + fi + + ## Estimate sequence quality and sort sequences by quality + echo -e "\\nSorting by sequence quality" + seqkit replace -p "\\s.+" ${sampID}_primertrimmed.fq.gz \ + | phredsort -i - -o - --metric meep --header avgphred,maxee,meep \ + | gzip -1 > ${sampID}_primertrimmed_sorted.fq.gz + echo -e "..Done" + + ## Hash sequences, add sample ID to the header + ## columns: Sample ID - Hash - PacBioID - AvgPhredScore - MaxEE - MEEP - Sequence - Quality - Length + ## Convert to Parquet format + echo -e "\\nCreating hash table" + seqhasher --hash sha1 --name ${sampID} ${sampID}_primertrimmed_sorted.fq.gz - \ + | seqkit fx2tab --length \ + | sed 's/;/\t/ ; s/;/\t/ ; s/ avgphred=/\t/ ; s/ maxee=/\t/ ; s/ meep=/\t/' \ + > ${sampID}_hash_table.txt + echo -e "..Done" + + ## Check the number of fields per record (should be 9!) 
+ # awk '{print NF}' ${sampID}_hash_table.txt | sort | uniq -c + # awk 'NF > 9 {print \$0 }' ${sampID}_hash_table.txt + + ## Dereplicate at sample level (use quality-sorted sequences to make sure that the representative sequence is with the highest quality) + echo -e "\\nDereplicating at sample level" + seqkit fq2fa -w 0 ${sampID}_primertrimmed_sorted.fq.gz \ + | vsearch \ + --derep_fulllength - \ + --output - \ + --strand both \ + --fasta_width 0 \ + --threads 1 \ + --relabel_sha1 \ + --sizein --sizeout \ + --minseqlength ${params.trim_minlen} \ + --uc ${sampID}_uc.uc \ + --quiet \ + > ${sampID}_derep.fasta + + ## Remove temporary file + rm ${sampID}_primertrimmed.fq.gz + + ## Compress results + echo -e "\\nCompressing results" + parallel -j${task.cpus} "gzip -${params.gzip_compression} {}" ::: \ + ${sampID}_hash_table.txt \ + ${sampID}_uc.uc \ + ${sampID}_derep.fasta + + echo -e "..Done" + + """ +} + + +// Extract ITS region with ITSx +// NB. In input data, sequence header should not contain spaces! +process itsx { + + label "main_container" + + // No need to publish intermediate results for chunked workflow, as they will be concatenated later + publishDir "${out_3_itsx}", + mode: "${params.storagemode}", + enabled: params.ITSx_chunk_size == 0 + + // cpus 2 + + // Add sample ID to the log file + tag { meta.chunk_id != null ? 
"${meta.id}__chunk${meta.chunk_id}" : "${meta.id}" } + + input: + tuple val(meta), path(input) // FASTA file with dereplicated sequences + + output: + tuple val(meta), path( "${meta.id}*.full.fasta.gz"), emit: itsx_full, optional: true + tuple val(meta), path( "${meta.id}*.SSU.fasta.gz"), emit: itsx_ssu, optional: true + tuple val(meta), path( "${meta.id}*.ITS1.fasta.gz"), emit: itsx_its1, optional: true + tuple val(meta), path( "${meta.id}*.5_8S.fasta.gz"), emit: itsx_58s, optional: true + tuple val(meta), path( "${meta.id}*.ITS2.fasta.gz"), emit: itsx_its2, optional: true + tuple val(meta), path( "${meta.id}*.LSU.fasta.gz"), emit: itsx_lsu, optional: true + tuple val(meta), path( "${meta.id}*.positions.txt"), emit: itsx_positions, optional: true + tuple val(meta), path( "${meta.id}*.problematic.txt"), emit: itsx_problematic, optional: true + tuple val(meta), path( "${meta.id}*_no_detections.fasta.gz"), emit: itsx_nondetects, optional: true + tuple val(meta), path( "${meta.id}*.summary.txt"), emit: itsx_summary, optional: true + tuple val(meta), path( "${meta.id}*.extraction.results.gz"), emit: itsx_details, optional: true + tuple val(meta), path( "${meta.id}*.SSU.full_and_partial.fasta.gz"), emit: itsx_ssu_part, optional: true + tuple val(meta), path( "${meta.id}*.ITS1.full_and_partial.fasta.gz"), emit: itsx_its1_part, optional: true + tuple val(meta), path( "${meta.id}*.5_8S.full_and_partial.fasta.gz"), emit: itsx_58s_part, optional: true + tuple val(meta), path( "${meta.id}*.ITS2.full_and_partial.fasta.gz"), emit: itsx_its2_part, optional: true + tuple val(meta), path( "${meta.id}*.LSU.full_and_partial.fasta.gz"), emit: itsx_lsu_part, optional: true + tuple val("${task.process}"), val('ITSx'), eval('ITSx --help 2>&1 | head -n 3 | tail -n 1 | sed "s/Version: //"'), topic: versions + tuple val("${task.process}"), val('cutadapt'), eval('cutadapt --version'), topic: versions + tuple val("${task.process}"), val('vsearch'), eval('vsearch --version 2>&1 | head -n 1 | 
sed "s/vsearch //g" | sed "s/,.*//g" | sed "s/^v//" | sed "s/_.*//"'), topic: versions + tuple val("${task.process}"), val('seqkit'), eval('seqkit version | sed "s/seqkit v//"'), topic: versions + tuple val("${task.process}"), val('phredsort'), eval('phredsort -v | sed "s/phredsort //"'), topic: versions + tuple val("${task.process}"), val('seqhasher'), eval('seqhasher -v | sed "s/SeqHasher //"'), topic: versions + tuple val("${task.process}"), val('parallel'), eval('parallel --version | head -n 1 | sed "s/GNU parallel //"'), topic: versions + tuple val("${task.process}"), val('brename'), eval('brename --help | head -n 4 | tail -1 | sed "s/Version: //"'), topic: versions + tuple val("${task.process}"), val('duckdb'), eval('duckdb --version | cut -d" " -f1 | sed "s/^v//"'), topic: versions + + script: + sampID="${meta.id}" + chunkPrefix="${meta.id}_chunk${meta.chunk_id}" + + // Allow inclusion of sequences that only find a single domain, given that they meet the given E-value and score thresholds, on with parameters 1e-9,0 by default + // singledomain = params.ITSx_singledomain ? "--allow_single_domain 1e-9,0" : "" + + """ + echo -e "Extraction of rRNA regions using ITSx\\n" + echo -e "Input sample: " ${sampID} + echo -e "Chunk ID: " ${meta.chunk_id} + + ## Check if input file is gz-compressed (by magic bytes `1f 8b`) + tmp_created=0 + tmpfile="" + if [[ -f "${input}" ]] && head -c 2 -- "${input}" | LC_ALL=C od -An -tx1 | tr -d ' \n' | grep -qi '^1f8b'; then + echo -e "Input file is gz-compressed, decompressing..." 
+      tmpfile="\$(mktemp "tmp.decompressed.input.XXXXXX")"
+      gunzip -c -- "${input}" > "\$tmpfile"
+      tmp_created=1
+      itsxinput="\$tmpfile"
+      itsxoutput="${sampID}"
+    else
+      itsxinput="${input}"
+      itsxoutput="${chunkPrefix}"
+    fi
+
+    ## ITSx extraction
+    echo -e "\\nITSx extraction"
+    ITSx \
+      -i "\$itsxinput" \
+      --complement ${params.ITSx_complement} \
+      --save_regions all \
+      --graphical F \
+      --detailed_results T \
+      --positions T \
+      --not_found T \
+      -E ${params.ITSx_evalue} \
+      -t ${params.ITSx_tax} \
+      --partial ${params.ITSx_partial} \
+      --cpu ${task.cpus} \
+      --preserve T \
+      -o "\$itsxoutput"
+
+    echo -e "..Done"
+
+    # ITSx.full.fasta
+    # ITSx.SSU.fasta
+    # ITSx.ITS1.fasta
+    # ITSx.5_8S.fasta
+    # ITSx.ITS2.fasta
+    # ITSx.LSU.fasta
+    # ITSx.positions.txt
+    # ITSx.problematic.txt
+    # ITSx_no_detections.fasta
+    # ITSx_no_detections.txt
+    # ITSx.summary.txt
+    # ITSx.extraction.results
+    # ITSx.SSU.full_and_partial.fasta
+    # ITSx.ITS1.full_and_partial.fasta
+    # ITSx.5_8S.full_and_partial.fasta
+    # ITSx.ITS2.full_and_partial.fasta
+    # ITSx.LSU.full_and_partial.fasta
+
+
+    ## If partial sequences were required, remove empty sequences
+    if [ \$(find . -type f -name "*.full_and_partial.fasta" | wc -l) -gt 0 ]; then
+      echo -e "Partial files found, removing empty sequences\\n."
+
+      find . -name "*.full_and_partial.fasta" \
+        | parallel -j${task.cpus} "seqkit seq -m 1 -w 0 {} > {.}_tmp.fasta"
+
+      rm *.full_and_partial.fasta
+      brename -p "_tmp" -r "" -f "_tmp.fasta\$"
+
+    fi
+
+    ## Remove empty files (no sequences)
+    echo -e "\\nRemoving empty files"
+    find . -type f -name "*.fasta" -empty -print -delete
+    echo -e "..Done"
+
+    ## Remove temporary file (if input file was gz-compressed)
+    if (( tmp_created )); then
+      rm -f -- "\$tmpfile"
+    fi
+
+    ## Compress results
+    echo -e "\\nCompressing files"
+
+    ## ITSx results (no symlinked derep input)
+    find . -type f -name "*.fasta" \
+      | parallel -j${task.cpus} "gzip -${params.gzip_compression} {}"
+
+    gzip -${params.gzip_compression} "\$itsxoutput".extraction.results
+
+    echo -e "..Done"
+    """
+}
+
+
+// Concatenate ITSx output from all chunks (per samples)
+// Convert ITSx output to Parquet
+//
+// Input:  sample metadata map (`meta.id` = sample ID) and all per-chunk ITSx
+//         files of that sample, staged under `chunks/`
+// Output: one pooled file per ITSx output type, named `<sample ID><suffix>`
+//         (every output is optional - a type is emitted only if at least one
+//         chunk produced it), plus optional Parquet tables in `parquet/`
+process itsx_concatenate {
+
+    label "main_container"
+
+    publishDir "${out_3_itsx}", mode: "${params.storagemode}"
+
+    tag "${meta.id}"
+
+    input:
+      tuple val(meta), path(fasta_chunks, stageAs: "chunks/")  // all files from ITSx for all chunks for each sample
+
+    output:
+      path( "${meta.id}.full.fasta.gz"),                  emit: itsx_full,        optional: true
+      path( "${meta.id}.SSU.fasta.gz"),                   emit: itsx_ssu,         optional: true
+      path( "${meta.id}.ITS1.fasta.gz"),                  emit: itsx_its1,        optional: true
+      path( "${meta.id}.5_8S.fasta.gz"),                  emit: itsx_58s,         optional: true
+      path( "${meta.id}.ITS2.fasta.gz"),                  emit: itsx_its2,        optional: true
+      path( "${meta.id}.LSU.fasta.gz"),                   emit: itsx_lsu,         optional: true
+      path( "${meta.id}.positions.txt"),                  emit: itsx_positions,   optional: true
+      path( "${meta.id}.problematic.txt"),                emit: itsx_problematic, optional: true
+      path( "${meta.id}_no_detections.fasta.gz"),         emit: itsx_nondetects,  optional: true
+      path( "${meta.id}.summary.txt"),                    emit: itsx_summary,     optional: true
+      path( "${meta.id}.extraction.results.gz"),          emit: itsx_details,     optional: true
+      path( "${meta.id}.SSU.full_and_partial.fasta.gz"),  emit: itsx_ssu_part,    optional: true
+      path( "${meta.id}.ITS1.full_and_partial.fasta.gz"), emit: itsx_its1_part,   optional: true
+      path( "${meta.id}.5_8S.full_and_partial.fasta.gz"), emit: itsx_58s_part,    optional: true
+      path( "${meta.id}.ITS2.full_and_partial.fasta.gz"), emit: itsx_its2_part,   optional: true
+      path( "${meta.id}.LSU.full_and_partial.fasta.gz"),  emit: itsx_lsu_part,    optional: true
+      path( "parquet/*.parquet"),                         emit: parquet,          optional: true
+
+    script:
+    sampID="${meta.id}"
+    """
+
+    echo -e "Concatenating ITSx output from all chunks"
+    echo -e "Input sample: " ${sampID}
+
+    ## With nullglob, a pattern without matches expands to an empty list
+    ## (instead of the literal pattern), so missing output types are skipped
+    shopt -s nullglob
+
+    ## Pool all per-chunk files of one ITSx output type into a per-sample file
+    ##   arg 1 - label used in the log
+    ##   arg 2 - file-name suffix (the pooled file is named <sample ID><suffix>)
+    ## Files are joined byte-wise with cat: plain text is simply appended,
+    ## and concatenated gzip members form a valid gzip file
+    concat_chunks () {
+      local label="\$1"
+      local suffix="\$2"
+      local f
+      local -a files=( chunks/"${sampID}"_chunk*"\${suffix}" )
+
+      echo -e " - \${label}: \${#files[@]}"
+      if [ \${#files[@]} -gt 0 ]; then
+        for f in "\${files[@]}"; do
+          echo -e " \$f"
+        done
+        cat "\${files[@]}" > "${sampID}\${suffix}"
+      fi
+    }
+
+    ## Concatenate ITSx FASTA outputs
+    echo -e "Concatenating:"
+
+    concat_chunks "full ITS sequences"       ".full.fasta.gz"
+    concat_chunks "SSU sequences"            ".SSU.fasta.gz"
+    concat_chunks "ITS1 sequences"           ".ITS1.fasta.gz"
+    concat_chunks "5.8S sequences"           ".5_8S.fasta.gz"
+    concat_chunks "ITS2 sequences"           ".ITS2.fasta.gz"
+    concat_chunks "LSU sequences"            ".LSU.fasta.gz"
+    concat_chunks "no detections sequences"  "_no_detections.fasta.gz"
+
+    ## Concatenate partial outputs if present
+    concat_chunks "SSU partial sequences"    ".SSU.full_and_partial.fasta.gz"
+    concat_chunks "ITS1 partial sequences"   ".ITS1.full_and_partial.fasta.gz"
+    concat_chunks "5.8S partial sequences"   ".5_8S.full_and_partial.fasta.gz"
+    concat_chunks "ITS2 partial sequences"   ".ITS2.full_and_partial.fasta.gz"
+    concat_chunks "LSU partial sequences"    ".LSU.full_and_partial.fasta.gz"
+
+    ## Concatenate text outputs
+    concat_chunks "positions"                ".positions.txt"
+    concat_chunks "problematic sequences"    ".problematic.txt"
+    concat_chunks "ITSx summary reports"     ".summary.txt"
+    concat_chunks "ITSx extraction results"  ".extraction.results.gz"
+
+    echo -e "\\n"
+
+    ## Convert ITSx output to Parquet
+    if [ ${params.ITSx_to_parquet} == true ]; then
+
+      echo -e "\\nConverting ITSx output to Parquet"
+      mkdir -p parquet
+
+      ## Only the per-region FASTA files are converted (and only if they exist)
+      for region in full SSU ITS1 5_8S ITS2 LSU; do
+        if [ -f "${sampID}.\${region}.fasta.gz" ]; then
+          ITSx_to_DuckDB.sh -i "${sampID}.\${region}.fasta.gz" -o "parquet/${sampID}.\${region}.parquet"
+        fi
+      done
+
+      echo -e "Parquet files created\\n"
+
+    fi
+
+    """
+}
+
+
+
+
+// ITSx processing workflow
+//   take: `seqs` - channel of sequence files; the sample ID is the simple file
+//         name with the `_PrimerChecked` part removed
+//   Steps: primer trimming + dereplication (`primer_trim`), optional splitting
+//          of each sample into chunks, ITSx extraction (`itsx`) per chunk
+workflow ITSx {
+
+    take:
+      seqs
+
+    main:
+
+    // Add metadata to the channel (fetch sample ID from the FASTQ file name)
+    ch_seqs = seqs.map { fastq ->
+      def sample_id = fastq.getSimpleName().replaceAll(/_PrimerChecked/, '')
+      def meta = [id: sample_id]
+      [meta, fastq]
+    }
+
+    // Trim primers and dereplicate at sample level
+    primer_trim(ch_seqs)
+
+    // Size of dereplicated input for ITSx
+    //   if null, use default value (currently, 10000)
+    //   if 0, use all sequences in one chunk
+    def chunk_size = (params.ITSx_chunk_size == null ? 10000 : params.ITSx_chunk_size as int)
+
+    if( chunk_size == 0 ) {
+      // Single-chunk workflow (no data splitting)
+      // NB! here, fasta will be gz-compressed -> will be handled in the itsx process
+      chunks_ch = primer_trim.out.derep
+        .map { meta, fasta -> [ meta + [chunk_id: null], fasta ] }
+    }
+    else {
+      // Chunking mode: split the dereplicated primer-trimmed sequences (at sample level) into chunks while preserving metadata
+      // NB! here, fasta will be uncompressed
+      chunks_ch = primer_trim.out.derep
+        .flatMap { meta, fasta ->
+          def chunks = fasta.splitFasta(by: chunk_size, file: true, decompress: true, compress: false)
+          def result = []
+          chunks.eachWithIndex { chunk_file, idx ->
+            result << [ meta + [chunk_id: idx], chunk_file ]
+          }
+          return result
+        }
+    }
+
+    // Run ITSx
+    itsx(chunks_ch)
+
+
+
+} // end of ITSx workflow
diff --git a/src/pipecraft-core/service_scripts/NextITS/workflows/STEP1.nf b/src/pipecraft-core/service_scripts/NextITS/workflows/STEP1.nf
index f21b4ba..f17e2fa 100644
--- a/src/pipecraft-core/service_scripts/NextITS/workflows/STEP1.nf
+++ b/src/pipecraft-core/service_scripts/NextITS/workflows/STEP1.nf
@@ -389,8 +389,9 @@ process demux {
         echo -e "\\n\\nSymmetric barcodes counts\\n\\n" >> LIMA/lima.lima.counts
         cat LIMAs/lima.lima.counts >> LIMA/lima.lima.counts
 
-        echo -e "\\n\\nSymmetric barcodes report\\n\\n" >> LIMA/lima.lima.report
-        cat LIMAs/lima.lima.report >> LIMA/lima.lima.report
+        ## Reports should be identical for symmetric and asymmetric barcodes, so no need to combine them
+        # echo -e "\\n\\nSymmetric barcodes report\\n\\n" >> LIMA/lima.lima.report
+        # cat LIMAs/lima.lima.report >> LIMA/lima.lima.report
       fi
 
     fi # end of
dual logs pooling
@@ -787,7 +788,7 @@ process itsx {
     path "${input.getSimpleName().replaceAll(/_PrimerChecked/, '')}.problematic.txt", optional: true
     path "${input.getSimpleName().replaceAll(/_PrimerChecked/, '')}_no_detections.fasta.gz", emit: itsx_nondetects, optional: true
     path "${input.getSimpleName().replaceAll(/_PrimerChecked/, '')}.summary.txt", emit: itsx_summary, optional: true
-    path "${input.getSimpleName().replaceAll(/_PrimerChecked/, '')}.extraction.results", emit: itsx_details, optional: true
+    path "${input.getSimpleName().replaceAll(/_PrimerChecked/, '')}.extraction.results.gz", emit: itsx_details, optional: true
     path "${input.getSimpleName().replaceAll(/_PrimerChecked/, '')}.SSU.full_and_partial.fasta.gz", emit: itsx_ssu_part, optional: true
     path "${input.getSimpleName().replaceAll(/_PrimerChecked/, '')}.ITS1.full_and_partial.fasta.gz", emit: itsx_its1_part, optional: true
     path "${input.getSimpleName().replaceAll(/_PrimerChecked/, '')}.5_8S.full_and_partial.fasta.gz", emit: itsx_58s_part, optional: true
@@ -946,7 +947,8 @@ process itsx {
     parallel -j${task.cpus} "gzip -${params.gzip_compression} {}" ::: \
       ${sampID}_hash_table.txt \
       ${sampID}_uc.uc \
-      *.fasta
+      *.fasta \
+      ${sampID}.extraction.results
 
     ## Convert ITSx output to Parquet
     if [ ${params.ITSx_to_parquet} == true ]; then
diff --git a/src/pipecraft-core/service_scripts/NextITS/workflows/STEP2.nf b/src/pipecraft-core/service_scripts/NextITS/workflows/STEP2.nf
index 6d3b2fe..75ef3fa 100644
--- a/src/pipecraft-core/service_scripts/NextITS/workflows/STEP2.nf
+++ b/src/pipecraft-core/service_scripts/NextITS/workflows/STEP2.nf
@@ -278,86 +278,61 @@ process merge_buckets {
     publishDir "${params.outdir}/02.Homopolymer",
         mode: "${params.storagemode}",
         enabled: (params.chunking_n != null && params.chunking_n >= 2) && params.preclustering == "homopolymer",
-        pattern: "PreClustered.uc.gz",
-        saveAs: { filename -> filename == "PreClustered.uc.gz" ? "HomopolymerCompressed.uc.gz" : null }
-
-    publishDir "${params.outdir}/02.UNOISE",
-        mode: "${params.storagemode}",
-        enabled: (params.chunking_n != null && params.chunking_n >= 2) && params.preclustering == "unoise",
-        pattern: "PreClustered.uc.gz",
-        saveAs: { filename -> filename == "PreClustered.uc.gz" ? "UNOISE.uc.gz" : null }
-
-    publishDir "${params.outdir}/02.DADA2",
-        mode: "${params.storagemode}",
-        enabled: (params.chunking_n != null && params.chunking_n >= 2) && params.preclustering == "dada2",
-        pattern: "PreClustered.uc.gz",
-        saveAs: { filename -> filename == "PreClustered.uc.gz" ? "DADA2_denoised.uc.gz" : null }
-
-    publishDir "${params.outdir}/02.Preclustered_SWARM_d1",
-        mode: "${params.storagemode}",
-        enabled: (params.chunking_n != null && params.chunking_n >= 2) && params.preclustering == "swarm_d1",
-        pattern: "PreClustered.uc.gz",
-        saveAs: { filename -> filename == "PreClustered.uc.gz" ? "SWARM.uc.gz" : null }
-
-    // Final clustering results - publish to clustering directory if clustering != "none"
-    publishDir "${params.outdir}/03.Clustered_VSEARCH",
-        mode: "${params.storagemode}",
-        enabled: (params.chunking_n != null && params.chunking_n >= 2) && params.clustering == "vsearch",
-        pattern: "Clustered.{fa,uc}.gz"
-        // No saveAs needed - files already have correct names for VSEARCH
-
-    publishDir "${params.outdir}/03.Clustered_SWARM",
-        mode: "${params.storagemode}",
-        enabled: (params.chunking_n != null && params.chunking_n >= 2) && params.clustering == "swarm",
-        pattern: "Clustered.{fa,uc}.gz",
+        pattern: "PreClustered.{fa,uc}.gz",
         saveAs: { filename ->
           switch(filename) {
-            case "Clustered.fa.gz": return "SWARM_representatives.fa.gz"
-            case "Clustered.uc.gz": return "SWARM.uc.gz"
+            case "PreClustered.fa.gz": return "HomopolymerCompressed.fa.gz"
+            case "PreClustered.uc.gz": return "HomopolymerCompressed.uc.gz"
             default: return null
           }
         }
 
-    // Final results when no clustering is done (clustering == "none") - publish to preclustering directory
-    publishDir "${params.outdir}/02.Homopolymer",
+    publishDir "${params.outdir}/02.UNOISE",
         mode: "${params.storagemode}",
-        enabled: (params.chunking_n != null && params.chunking_n >= 2) && params.clustering == "none" && params.preclustering == "homopolymer",
-        pattern: "Clustered.{fa,uc}.gz",
+        enabled: (params.chunking_n != null && params.chunking_n >= 2) && params.preclustering == "unoise",
+        pattern: "PreClustered.{fa,uc}.gz",
         saveAs: { filename ->
           switch(filename) {
-            case "Clustered.fa.gz": return "HomopolymerCompressed.fa.gz"
-            case "Clustered.uc.gz": return "HomopolymerCompressed.uc.gz"
+            case "PreClustered.fa.gz": return "UNOISE.fa.gz"
+            case "PreClustered.uc.gz": return "UNOISE.uc.gz"
             default: return null
           }
         }
 
-    publishDir "${params.outdir}/02.UNOISE",
+    publishDir "${params.outdir}/02.DADA2",
         mode: "${params.storagemode}",
-        enabled: (params.chunking_n != null && params.chunking_n >= 2) && params.clustering == "none" && params.preclustering == "unoise",
-        pattern: "Clustered.{fa,uc}.gz",
+        enabled: (params.chunking_n != null && params.chunking_n >= 2) && params.preclustering == "dada2",
+        pattern: "PreClustered.{fa,uc}.gz",
         saveAs: { filename ->
           switch(filename) {
-            case "Clustered.fa.gz": return "UNOISE.fa.gz"
-            case "Clustered.uc.gz": return "UNOISE.uc.gz"
+            case "PreClustered.fa.gz": return "DADA2_denoised.fa.gz"
+            case "PreClustered.uc.gz": return "DADA2_denoised.uc.gz"
             default: return null
           }
         }
 
-    publishDir "${params.outdir}/02.DADA2",
+    publishDir "${params.outdir}/02.Preclustered_SWARM_d1",
         mode: "${params.storagemode}",
-        enabled: (params.chunking_n != null && params.chunking_n >= 2) && params.clustering == "none" && params.preclustering == "dada2",
-        pattern: "Clustered.{fa,uc}.gz",
+        enabled: (params.chunking_n != null && params.chunking_n >= 2) && params.preclustering == "swarm_d1",
+        pattern: "PreClustered.{fa,uc}.gz",
         saveAs: { filename ->
           switch(filename) {
-            case "Clustered.fa.gz": return "DADA2_denoised.fa.gz"
-            case "Clustered.uc.gz": return "DADA2_denoised.uc.gz"
+            case "PreClustered.fa.gz": return "SWARM.fa.gz"
+            case "PreClustered.uc.gz": return "SWARM.uc.gz"
             default: return null
           }
         }
+
+    // Final clustering results - publish to clustering directory if clustering != "none"
+    publishDir "${params.outdir}/03.Clustered_VSEARCH",
+        mode: "${params.storagemode}",
+        enabled: (params.chunking_n != null && params.chunking_n >= 2) && params.clustering == "vsearch",
+        pattern: "Clustered.{fa,uc}.gz"
+        // No saveAs needed - files already have correct names for VSEARCH
 
-    publishDir "${params.outdir}/02.Preclustered_SWARM_d1",
+    publishDir "${params.outdir}/03.Clustered_SWARM",
         mode: "${params.storagemode}",
-        enabled: (params.chunking_n != null && params.chunking_n >= 2) && params.clustering == "none" && params.preclustering == "swarm_d1",
+        enabled: (params.chunking_n != null && params.chunking_n >= 2) && params.clustering == "swarm",
         pattern: "Clustered.{fa,uc}.gz",
         saveAs: { filename ->
           switch(filename) {
@@ -366,16 +341,18 @@ process merge_buckets {
             default: return null
           }
         }
-    
+
 
     // Since there are name collisions, we need to stage files with unique names
     input:
       path(preclustuc_chunks, stageAs: "pre/?/*")  // UC files for pre-clustering (optional)
+      path(preclustaf_chunks, stageAs: "pre/?/*")  // FASTA files for pre-clustering (optional)
       path(cluster_chunks, stageAs: "cls/?/*")     // Sequence representatives
       path(clustuc_chunks, stageAs: "ucs/?/*")     // UC files for clustering
 
     output:
       path "PreClustered.uc.gz", emit: preclustuc_ch, optional: true
+      path "PreClustered.fa.gz", emit: preclustaf_ch, optional: true
       path "Clustered.fa.gz", emit: cluster_ch
       path "Clustered.uc.gz", emit: clustuc_ch
 
@@ -401,9 +378,23 @@ process merge_buckets {
       echo -e "..Pre-clustering was not performed. Skipping pooling these data\\n"
     else
       echo -e "..Pre-clustering was performed\\n"
+
+      echo -e "..Pooling pre-clustered UC files\\n"
       find pre -name "*.uc.gz" \
         | parallel -j 1 "cat {}" \
         > PreClustered.uc.gz
+
+      echo -e "..Pooling pre-clustered FASTA files\\n"
+      find pre -name "*.fa.gz" \
+        | parallel -j 1 "zcat {}" \
+        | vsearch \
+          --sortbysize - \
+          --sizein --sizeout \
+          --threads 1 \
+          --fasta_width 0 \
+          --output - \
+        | pigz -p ${task.cpus} -${params.gzip_compression} \
+        > PreClustered.fa.gz
     fi
 
     echo -e "..Done\\n"
@@ -505,6 +496,54 @@ process summarize {
 }
 
 
+// Summarize dereplicated data
+process summarize_dereplicated_data {
+
+    label "main_container"
+    publishDir "${params.outdir}/04.PooledResults", mode: "${params.storagemode}"
+    // cpus 4
+
+    input:
+      path(seqtab)    // Sequence tables in long format, parquet
+      path(uc_derep)  // UC file from dereplication
+      path(fasta)     // FASTA file with sequences
+
+    output:
+      path "UC_Pooled.parquet",     emit: uc
+      path "OTU_table_wide.txt.gz", emit: otutabwide
+      path "OTU_table_long.txt.gz", emit: otutablong
+      path "OTU_table_wide.RData",  emit: otutabwider
+      path "OTU_table_long.RData",  emit: otutablongr
+      path "OTUs.fa.gz",            emit: seqs
+      tuple val("${task.process}"), val('ucs'), eval('ucs --version | sed "s/ucs //"'), topic: versions
+      tuple val("${task.process}"), val('R'), eval('Rscript -e "cat(R.version.string)" | sed "s/R version //" | cut -d" " -f1'), topic: versions
+      tuple val("${task.process}"), val('data.table'), eval('Rscript -e "cat(as.character(packageVersion(\'data.table\')))"'), topic: versions
+      tuple val("${task.process}"), val('arrow'), eval('Rscript -e "cat(as.character(packageVersion(\'arrow\')))"'), topic: versions
+      tuple val("${task.process}"), val('Biostrings'), eval('Rscript -e "cat(as.character(packageVersion(\'Biostrings\')))"'), topic: versions
+
+    script:
+    """
+    echo -e "Summarizing dereplicated data\\n"
+
+    ## Parse UC file from dereplication
+    echo -e "..Parsing dereplicated UC file"
+    ucs --input ${uc_derep} --output UC_Pooled.parquet
+
+    ## Summarize sequence abundance by OTU and sample
+    echo -e "\\n..Summarizing sequence abundance by OTU and sample\\n"
+    summarize_dereplicated_data.R \
+      --seqtab ${seqtab} \
+      --uc UC_Pooled.parquet \
+      --seqs ${fasta} \
+      --maxmeep ${params.max_MEEP} \
+      --recoversinglet ${params.recover_lowqsingletons} \
+      --mergesamples ${params.merge_replicates} \
+      --threads ${task.cpus}
+
+    """
+}
+
+
 // Post-clustering curation
 process lulu {
 
@@ -707,6 +746,17 @@ workflow S2 {
   // hash-based dereplication first, then additional round of clustering-based derep.
   // But it would add extra complexity to manage and combine two UC files.
 
+  // Prepare sequence table based on dereplicated sequences
+  // (no clustering, pre-clustering, or denoising)
+  if(params.preclustering == "none" && params.clustering == "none"){
+
+    summarize_dereplicated_data(
+      aggregate_sequences.out.seqs_parquet,
+      derepuc_ch,
+      derep_ch
+    )
+
+  } else {
 
   /*
   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -738,12 +788,14 @@ workflow S2 {
 
     // collect UC and FASTA files from all chunks
     preclustuc_chunks = CLUSTERING.out.preclustuc_ch.collect()
+    preclustaf_chunks = CLUSTERING.out.preclustaf_ch.collect()
     cluster_chunks = CLUSTERING.out.cluster_ch.collect()
     clustuc_chunks = CLUSTERING.out.clustuc_ch.collect()
 
     // Merge buckets into a single file
     merge_buckets(
       preclustuc_chunks,
+      preclustaf_chunks,
       cluster_chunks,
       clustuc_chunks)
 
@@ -784,9 +836,11 @@ workflow S2 {
       )
   }
 
-  // Run statistics
-  // run_summary()
+  } // end of preclustering == "none" && clustering == "none"
+
+  // Run statistics
+  // run_summary()
 
   // Dump the software versions to a file
   ch_versions_yml = software_versions_to_yaml(Channel.topic('versions'))