Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@ __pycache__/
*.py[cod]
*.swp

# Configs
configs/
camisim.cfg
metax_bench_data.config

# C extensions
*.so

Expand Down
12 changes: 12 additions & 0 deletions convert_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,13 @@ def convert_to_nextflow_config(input_file, output_file):
nf_config.write(f" {'verbose'} = false\n")
elif (key == 'distribution_file_paths'):
nf_config.write(f" distribution_files = \"{value}\"\n")
elif (key == 'prioritize_additional_genomes' or key == 'no_replace' or key == 'fill_up'):
if(value == 'True'):
nf_config.write(f" {key} = true\n")
else:
nf_config.write(f" {key} = false\n")
elif (key == 'additional_references' or key == 'additional_genomes_quality_file' or key == 'reference_genomes' or key == 'biom_profile' or key == 'max_rank'):
nf_config.write(f" {key} = \"{value}\"\n")
else:
nf_config.write(f" {key} = {value}\n")

Expand All @@ -84,6 +91,11 @@ def convert_to_nextflow_config(input_file, output_file):
nf_config.write(" no_replace = true\n")
nf_config.write(" no_replace = false\n")
nf_config.write(" additional_references=\"\"\n")
nf_config.write(" prioritize_additional_genomes = false\n")
nf_config.write(" additional_genomes_quality_file=\"\"\n")
nf_config.write(" additional_genomes_max_contamination = 5\n")
nf_config.write(" additional_genomes_min_completeness = 95\n")
nf_config.write(" additional_genomes_max_num_contigs = 200\n")
nf_config.write(" conda.enabled = true\n")
nf_config.write(" conda.useMamba = true\n")
nf_config.write(" conda.cacheDir=\"/home/jfunk/conda_cache\"\n")
Expand Down
31 changes: 30 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,38 @@ params {
config = ""
}

def resolveExternalConfigPath(rawPath) {
    // Normalise the user-supplied value; null or blank means "no external config".
    def candidatePath = rawPath?.toString()?.trim()
    if (!candidatePath) {
        return null
    }

    // Build the list of locations to probe: an absolute path is taken as-is,
    // a relative path is tried against the launch dir, then the project dir.
    def candidates = []
    def asGiven = new File(candidatePath)
    if (asGiven.isAbsolute()) {
        candidates << asGiven
    } else {
        candidates << new File(System.getProperty('user.dir'), candidatePath)
        candidates << new File(projectDir.toString(), candidatePath)
    }

    def resolved = candidates.find { it.exists() }
    if (resolved != null) {
        return resolved.canonicalPath
    }

    throw new IllegalArgumentException(
        "External config file not found: '${candidatePath}'. Checked the provided path, " +
        "launch directory '${System.getProperty('user.dir')}', and project directory '${projectDir}'."
    )
}

if (params.config && params.config.toString().trim()) {

includeConfig params.config
includeConfig resolveExternalConfigPath(params.config)

} else if (params.pipeline == "metagenomic") {

Expand Down
2 changes: 1 addition & 1 deletion pipelines/metagenomic/config/distribution.config
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ params {
gauss_sigma = 1

// Show the used distribution of genomes before simulating.
verbose = "False"
verbose = false

// Total number of simulated genomes
// Difference between genomes_total and genomes_real are simulated by sgEvolver
Expand Down
3 changes: 2 additions & 1 deletion pipelines/metagenomic/config/metagenomic.config
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ params {

// Maximum number of strains drawn from genomes belonging to a single OTU
// OTU is taken from the metadata file
// If the simulation is executed from profile (see parameter biom_profile): max strains need to be greater or equal to 2
// In profile mode this is validated together with min_strains_per_otu.
// If max_strains_per_otu = 1, exactly one genome is drawn per OTU.
max_strains_per_otu=2

// Minimum number of strains drawn from genomes belonging to a single OTU
Expand Down
25 changes: 22 additions & 3 deletions pipelines/metagenomic/config/profile.config
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,30 @@ params {
// If no genomes are found for certain OTUs, fill up with previously unused genomes
fill_up = false

// File containing additional reference genomes, mapped to OTUs from the input profile
// is optional
// Optional file containing additional reference genomes, mapped to taxa from the input profile.
// Accepted columns are either:
// ncbi_taxid <tab> scientific_name <tab> genome_path
// or:
// ncbi_taxid <tab> scientific_name <tab> genome_path <tab> novelty_category
// If novelty_category is omitted, it defaults to known_strain.
additional_references=""

// If true, high-quality additional genomes are selected before public reference genomes.
// If false, public references and high-quality additional genomes are sampled together.
prioritize_additional_genomes = false

// TSV describing quality of genomes from additional_references.
// Required columns: genome_path, completeness, contamination, num_contigs.
// Required when additional_references is set.
additional_genomes_quality_file=""

// Quality thresholds for genomes from additional_references.
// Lower-quality additional genomes are only used as a last resort after the preferred pools are exhausted.
additional_genomes_max_contamination = 5
additional_genomes_min_completeness = 90
additional_genomes_max_num_contigs = 200

// Highest taxonomic rank to draw genomes from. Note: If the rank of an OTU is higher than max_rank and
// fill_up = true, a random genome will be drawn for the OTU
max_rank = "family"
}
}
38 changes: 33 additions & 5 deletions pipelines/metagenomic/from_profile.nf
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,24 @@ shared_scripts_dir = "${projectDir}/pipelines/shared/scripts"
workflow metagenomesimulation_from_profile {

main:

reference_genomes = params.containsKey('reference_genomes') ? params.reference_genomes : "${projectDir}/tools/assembly_summary_complete_genomes.txt"
min_strains_per_otu = params.containsKey('min_strains_per_otu') ? params.min_strains_per_otu : 1
max_strains_per_otu = params.containsKey('max_strains_per_otu') ? params.max_strains_per_otu : 2
no_replace = params.containsKey('no_replace') ? params.no_replace : true
fill_up = params.containsKey('fill_up') ? params.fill_up : false
max_rank = params.containsKey('max_rank') ? params.max_rank : "family"
prioritize_additional_genomes = params.containsKey('prioritize_additional_genomes') ? params.prioritize_additional_genomes : false
additional_genomes_quality_file = params.containsKey('additional_genomes_quality_file') ? params.additional_genomes_quality_file : ""
additional_genomes_max_contamination = params.containsKey('additional_genomes_max_contamination') ? params.additional_genomes_max_contamination : 5
additional_genomes_min_completeness = params.containsKey('additional_genomes_min_completeness') ? params.additional_genomes_min_completeness : 95
additional_genomes_max_num_contigs = params.containsKey('additional_genomes_max_num_contigs') ? params.additional_genomes_max_num_contigs : 200

get_genomes(params.biom_profile, params.number_of_samples, params.reference_genomes, params.seed, params.gauss_mu, params.gauss_sigma,
params.min_strains_per_otu, params.max_strains_per_otu, params.no_replace, params.fill_up, params.ncbi_taxdump_file, params.max_rank)
get_genomes(params.biom_profile, params.number_of_samples, reference_genomes, params.seed, params.gauss_mu, params.gauss_sigma,
min_strains_per_otu, max_strains_per_otu, no_replace, fill_up,
params.ncbi_taxdump_file, max_rank, prioritize_additional_genomes,
additional_genomes_quality_file, additional_genomes_max_contamination,
additional_genomes_min_completeness, additional_genomes_max_num_contigs)

loc_ch = get_genomes.out[0]
abundance_ch = get_genomes.out[1].flatten()
Expand Down Expand Up @@ -53,6 +68,11 @@ process get_genomes {
val(fill_up)
val(ncbi_taxdump_file)
val(max_rank)
val(prioritize_additional_genomes)
val(additional_genomes_quality_file)
val(additional_genomes_max_contamination)
val(additional_genomes_min_completeness)
val(additional_genomes_max_num_contigs)

output:
path "genome_to_id.tsv"
Expand All @@ -62,17 +82,25 @@ process get_genomes {
script:

additional_references = "None"
additional_genomes_quality = "None"

if(!params.additional_references.isEmpty()) {
if(params.containsKey('additional_references') && !params.additional_references.isEmpty()) {
additional_references = params.additional_references
}

if(!additional_genomes_quality_file.isEmpty()) {
additional_genomes_quality = additional_genomes_quality_file
}

"""
mkdir --parents ${params.outdir}/source_genomes/
mkdir --parents ${params.outdir}/internal/
mkdir --parents ${params.outdir}/distributions/

python ${scripts_dir}/get_genomes.py ${biom_profile} ${number_of_samples} ${reference_genomes} ${seed} ${mu} ${sigma} ${min_strains} ${max_strains} False ${no_replace} ${fill_up} ${scripts_dir}/split_fasta.pl ${params.outdir}/source_genomes/ ${additional_references} ${ncbi_taxdump_file} "${max_rank}"
python ${scripts_dir}/get_genomes.py "${biom_profile}" ${number_of_samples} "${reference_genomes}" ${seed} ${mu} ${sigma} ${min_strains} ${max_strains} False ${no_replace} ${fill_up} "${scripts_dir}/split_fasta.pl" "${params.outdir}/source_genomes/" "${additional_references}" "${additional_genomes_quality}" "${ncbi_taxdump_file}" "${max_rank}" ${prioritize_additional_genomes} ${additional_genomes_max_contamination} ${additional_genomes_min_completeness} ${additional_genomes_max_num_contigs}
cp metadata.tsv ${params.outdir}/internal/metadata.tsv
cp genome_to_id.tsv ${params.outdir}/internal/genome_to_id.tsv
cp genome_to_id.tsv ${params.outdir}/internal/genome_locations.tsv
cp abundance_*.tsv ${params.outdir}/distributions/
"""
}
}
21 changes: 16 additions & 5 deletions pipelines/metagenomic/metagenomic.nf
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ workflow metagenomic {
}

// make all sequences from input genomes (also strain simulated ones) unique and move them to an output location
genome_location_file_ch = cleanup_and_filter_sequences(genome_location_file_ch, genome_location_ch.map { it[1] }.collect())
genome_location_file_ch = cleanup_and_filter_sequences(genome_location_file_ch, genome_location_ch.collect(flat: false))

// this channel holds the genome location map (key = genome_id, value = absolute path to genome)
genome_location_ch = genome_location_file_ch
Expand Down Expand Up @@ -547,19 +547,30 @@ process cleanup_and_filter_sequences {

input:
path genome_id_to_file_path
path genomes
val genome_entries

output:
path genome_id_to_file_path
path "internal_*"

script:
normalized_genome_entries = genome_entries.collect { entry ->
if (entry instanceof List || entry instanceof Object[]) {
if (entry.size() >= 2) {
return [entry[0], entry[1]]
}
}
throw new IllegalArgumentException("cleanup_and_filter_sequences expected [genome_id, path] entries but received: ${entry}")
}
absolute_genome_id_to_file_path = normalized_genome_entries.collect { "${it[0]}\t${it[1]}" }.join('\n')
"""
mkdir --parents ${params.outdir}/source_genomes/
mkdir --parents ${params.outdir}/internal/

touch internal_${genome_id_to_file_path}
cat > absolute_${genome_id_to_file_path} <<'EOF'
${absolute_genome_id_to_file_path}
EOF

python ${scripts_dir}/clean_up_sequences.py ${genome_id_to_file_path} ${params.outdir}/source_genomes/ internal_${genome_id_to_file_path}
python ${scripts_dir}/clean_up_sequences.py absolute_${genome_id_to_file_path} ${params.outdir}/source_genomes/ internal_${genome_id_to_file_path}

cp ./out_genomes/* ${params.outdir}/source_genomes/
cp internal_${genome_id_to_file_path} ${params.outdir}/internal/genome_locations.tsv
Expand Down
27 changes: 23 additions & 4 deletions pipelines/metagenomic/scripts/clean_up_sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import sys
import os
import re
from Bio import SeqIO

def get_new_name(name, set_of_sequence_names):
Expand Down Expand Up @@ -81,6 +82,23 @@ def parse_tsv_to_dict(file):
result_dict[key] = value
return result_dict


def get_unique_output_file_path(file_path_input, directory_output, genome_id, used_output_file_names):
    """Choose an output file name that is unique within this run.

    Starts from the input file's base name. On a clash with a name already in
    ``used_output_file_names``, a sanitised genome id is prefixed; if that is
    also taken, an increasing numeric suffix is appended until the name is
    free. The chosen name is recorded in ``used_output_file_names`` (the set
    is mutated in place).

    Returns the chosen name joined onto ``directory_output``.
    """
    base_name = os.path.basename(file_path_input)
    root, ext = os.path.splitext(base_name)
    # Keep only characters that are safe in file names across platforms.
    sanitized_id = re.sub(r'[^A-Za-z0-9._-]+', '_', genome_id)

    chosen = base_name
    if chosen in used_output_file_names:
        chosen = "{}__{}{}".format(sanitized_id, root, ext)
        suffix = 1
        while chosen in used_output_file_names:
            chosen = "{}__{}_{}{}".format(sanitized_id, root, suffix, ext)
            suffix += 1

    used_output_file_names.add(chosen)
    return os.path.join(directory_output, chosen)

def write_genome_id_to_path_map(genome_id_to_path_map, file_path_output, outdir):
"""
Write mapping of genome id to genome file path to a file.
Expand Down Expand Up @@ -118,6 +136,7 @@ def stream_genome_id_to_path_map(stream_out, genome_id_to_path_map, outdir):
set_of_sequence_names = set()
directory_output = "./out_genomes/"
genome_id_to_path_map = dict()
used_output_file_names = set()

# Create the output directory if it does not exist
os.makedirs(directory_output, exist_ok=True)
Expand All @@ -128,12 +147,12 @@ def stream_genome_id_to_path_map(stream_out, genome_id_to_path_map, outdir):
with open(file_path_sequence_map, 'w') as stream_map:

for genome_id, genome_file_path in genome_id_to_file_path.items():
file_name = os.path.basename(genome_file_path)

new_genome_file_path = os.path.join(directory_output, file_name)
new_genome_file_path = get_unique_output_file_path(
genome_file_path, directory_output, genome_id, used_output_file_names
)

move_genome_file(
file_name, new_genome_file_path, stream_map, genome_id, sequence_min_length, set_of_sequence_names)
genome_file_path, new_genome_file_path, stream_map, genome_id, sequence_min_length, set_of_sequence_names)
genome_id_to_path_map[genome_id] = new_genome_file_path


Expand Down
Loading