diff --git a/CHANGELOG.md b/CHANGELOG.md index 32276d2c..7d07b775 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,27 @@ All notable changes to this project will be documented in this file. +## 2.0.0-beta28 - 2026-04-15 + +[cdfe210](https://github.com/WrightonLabCSU/DRAM/commit/cdfe210ca64eb95baf6f1acedb62f91b74630181)...[e07cd74](https://github.com/WrightonLabCSU/DRAM/commit/e07cd74e8d60fca7513f645c04d0956760c74768) + +### Features + +- Add antiSMASH, CARD, RGI, TCDB ([8d08d1f](https://github.com/WrightonLabCSU/DRAM/commit/8d08d1f9d54fb139eb53587754e569c4317ddc37)) + + Add antiSMASH nextflow module, right now just collect antismash + raw output while we work on incorporating raw output into + larger pipeline + Add rgi nextflow module, right now like antiSMASH, only + collect raw output while we work on incorporating + ADD CARD db processing with mmseqs + ADD TCDB processing with mmseqs + +- Add DRAM DB HMMs = ([e07cd74](https://github.com/WrightonLabCSU/DRAM/commit/e07cd74e8d60fca7513f645c04d0956760c74768)) + + Add DRAM team curated HMM database as new annotation db option. + Work in progress and testing database, but can be found on GLOBUS. 
+ ## 2.0.0-beta27 - 2026-03-18 [f03804b](https://github.com/WrightonLabCSU/DRAM/commit/f03804bca43b15e55731316c00b1c34ac328c62c)...[7d9a12d](https://github.com/WrightonLabCSU/DRAM/commit/7d9a12d225c577a6b2fb0c4d7b1ba60a5588e1e8) diff --git a/bin/combine_annotations.py b/bin/combine_annotations.py index 0fed20c2..f3707720 100755 --- a/bin/combine_annotations.py +++ b/bin/combine_annotations.py @@ -170,6 +170,8 @@ def combine_annotations(annotations_dir, genes_dir, output, threads): combined_data[FASTA_COLUMN] = combined_data[FASTA_COLUMN].where( mask, other=combined_data[FASTA_COLUMN + "2"] ) + # TODO: fix the merge so it doesn't make this column + combined_data = combined_data.drop(columns=FASTA_COLUMN + "2") combined_data = convert_bit_scores_to_numeric(combined_data) diff --git a/bin/hmm_parser.py b/bin/hmm_parser.py index 73490b43..6b7e910d 100755 --- a/bin/hmm_parser.py +++ b/bin/hmm_parser.py @@ -196,11 +196,6 @@ def main(hmm_domtbl, hmm_info_path, ec_from_info, gene_locs, db_name, output): hits["perc_cov"] = (hits["model_end"] - hits["model_start"] + 1) / hits[ "query_length" ] - hits[f"{db_name}_id"] = hits["query_name"].str.replace(r".hmm", "", regex=True) - all_hits = get_all_hits(hits, db_name) - all_hits.name = f"{db_name}_ids" - hits = hits.merge(all_hits, how="left", left_on="query_id", right_index=True) - hmm_sheet = False if hmm_info_path is not None: hmm_sheet = True @@ -228,8 +223,11 @@ def main(hmm_domtbl, hmm_info_path, ec_from_info, gene_locs, db_name, output): pass elif "definition" in hmm_info.columns: hmm_info = hmm_info.rename(columns={"definition": "description"}) - elif pd.api.types.is_string_dtype(hmm_info.iloc[:, -1]): - hmm_info = hmm_info.rename(columns={hmm_info.columns[-1]: "description"}) + elif ( + pd.api.types.is_string_dtype(hmm_info.iloc[:, -1]) + and hmm_info.columns[-1] not in merge_cols + ): # don't need to worry about description in merge cols, cause already checked + hmm_info["description"] = 
hmm_info[hmm_info.columns[-1]].copy() else: raise_on_ec = True @@ -243,10 +241,13 @@ def main(hmm_domtbl, hmm_info_path, ec_from_info, gene_locs, db_name, output): ) merge_cols = [col for col in merge_cols if col in hmm_info.columns] - + print(hmm_info.columns) + print(hmm_info) hits = hits.merge( hmm_info[merge_cols], how="left", left_on="query_name", right_index=True ) + print(hits.columns) + print(hits) hits_sig = sig_scores_row_by_row(hits, db_name=db_name) drop_cols = [ col @@ -268,6 +269,15 @@ def main(hmm_domtbl, hmm_info_path, ec_from_info, gene_locs, db_name, output): # df.to_csv(output, index=False) return + hits_sig[f"{db_name}_id"] = hits_sig["query_name"].str.replace( + r".hmm", "", regex=True + ) + all_hits_sig = get_all_hits(hits_sig, db_name) + all_hits_sig.name = f"{db_name}_ids" + hits_sig = hits_sig.merge( + all_hits_sig, how="left", left_on="query_id", right_index=True + ) + # Get the best hit # hits_sig = hits_sig.sort_values(['full_evalue', "domain_ievalue", "perc_cov"], ascending=[True, True, False]).drop_duplicates(subset=["query_id"]) hits_sig = hits_sig.sort_values( diff --git a/bin/hmm_search.py b/bin/hmm_search.py index ed096a37..e48ce7d4 100755 --- a/bin/hmm_search.py +++ b/bin/hmm_search.py @@ -19,18 +19,23 @@ help="Path to the input fasta to search against", ) @click.option("--e_value", type=float, help="e value cutoff for filtering") +@click.option("--t_value", type=float, help="bitscore cutoff for filtering") @click.option( "--output_file", type=click.Path(), help="Path to output file", ) @click.option("--cpus", type=int, help="number of cpu core to run HMMER with") -def main(hmm, input_file, e_value, output_file, cpus): +def main(hmm, input_file, e_value, t_value, output_file, cpus): t1 = time.time() hmm = Path(hmm) - - hmm_paths = hmm.parent.glob(hmm.name) + if hmm.is_dir(): # if directory passed, glob all hmms in dir + hmm = hmm / "*.hmm" + if "*" in str(hmm) or "?" 
in str(hmm): # check if path is glob path + hmm_paths = hmm.parent.glob(hmm.name) + else: + hmm_paths = [hmm] hmms = [] for path in hmm_paths: @@ -38,6 +43,11 @@ def main(hmm, input_file, e_value, output_file, cpus): hmms.extend(hmm_file) print(hmms) + kw = {} + if t_value: + kw["T"] = t_value + elif e_value: + kw["E"] = e_value with open(output_file, "wb") as out_fh: with pyhmmer.easel.SequenceFile( @@ -46,7 +56,7 @@ def main(hmm, input_file, e_value, output_file, cpus): seqs = pyhmmer.easel.DigitalSequenceBlock(alphabet) seqs.extend(sf) first = True - for hits in pyhmmer.hmmer.hmmsearch(hmms, seqs, cpus=cpus, E=e_value): + for hits in pyhmmer.hmmer.hmmsearch(hmms, seqs, cpus=cpus, **kw): hits.write(out_fh, format="domains", header=first) first = False # total = sum(len(hits) for hits in pyhmmer.hmmer.hmmsearch(hmms, seqs, cpus=8, E=1e-15)) diff --git a/bin/utils/click_utils.py b/bin/utils/click_utils.py index e563647e..303e532e 100755 --- a/bin/utils/click_utils.py +++ b/bin/utils/click_utils.py @@ -1,13 +1,21 @@ #!/usr/bin/env python -def validate_comma_separated(ctx, param, value, split=(",", " ")): +def validate_comma_separated(ctx, param, value, split=(",", " "), converter=None): if not value: return [] if isinstance(value, (list, tuple)): s = split if isinstance(split, str) else split[0] value = s.join(value) if isinstance(split, str): + split = [split] return value.split(split) if isinstance(split, (list, tuple)): + sentinel = "|SENTINEL|" for s in split: - value = value.replace(s, ",") - return [val.strip() for val in value.split(",")] + value = value.replace(s, sentinel) + ls = [] + for val in value.split(sentinel): + val = val.strip() + if converter: + val = converter(val) + ls.append(val) + return ls diff --git a/modules.json b/modules.json index 48c04632..918b2ca5 100644 --- a/modules.json +++ b/modules.json @@ -5,10 +5,25 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "antismash/antismash": { + "branch": "master", + 
"git_sha": "96c57dfd98a0641886a67bd449fe33ee2ec0e374", + "installed_by": ["modules"] + }, + "antismash/antismashdownloaddatabases": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"] + }, "multiqc": { "branch": "master", "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d", "installed_by": ["modules"] + }, + "rgi/main": { + "branch": "master", + "git_sha": "5e748ff2b0f990949081c9e49792622eb3fe9ee9", + "installed_by": ["modules"] } } }, diff --git a/modules/local/annotate/hmmsearch.nf b/modules/local/annotate/hmmsearch.nf index 52c9dbd9..1007f28a 100644 --- a/modules/local/annotate/hmmsearch.nf +++ b/modules/local/annotate/hmmsearch.nf @@ -22,12 +22,13 @@ process HMM_SEARCH { script: def args = task.ext.args ?: "" def ec_flag = ec_from_info ? "--ec_from_info" : "" + def cutoff_flag = e_value ? "--e_value ${e_value}" : "" """ hmm_search.py \\ - --hmm ${database_loc}/*.hmm \\ + --hmm ${database_loc} \\ --input_file ${fasta} \\ - --e_value ${e_value} \\ + ${cutoff_flag} \\ --output_file ${input_fasta}_hmmsearch.out \\ --cpus ${task.cpus} diff --git a/modules/local/annotate/mmseqs_search.nf b/modules/local/annotate/mmseqs_search.nf index 5407e3fe..fa2074b3 100644 --- a/modules/local/annotate/mmseqs_search.nf +++ b/modules/local/annotate/mmseqs_search.nf @@ -36,14 +36,14 @@ process MMSEQS_SEARCH { # Perform search mmseqs search query_database/${input_fasta}.mmsdb ${db_name}.mmsdb mmseqs_out/${input_fasta}_${db_name}.mmsdb mmseqs_out/tmp --threads ${task.cpus} - # Filter to only best hit - mmseqs filterdb mmseqs_out/${input_fasta}_${db_name}.mmsdb mmseqs_out/${input_fasta}_${db_name}_tophit.mmsdb --extract-lines 1 - # Filter to only hits with minimum bit score - mmseqs filterdb --filter-column 2 --comparison-operator ge --comparison-value ${bit_score_threshold} --threads ${task.cpus} mmseqs_out/${input_fasta}_${db_name}_tophit.mmsdb 
mmseqs_out/${input_fasta}_${db_name}_tophit_minbitscore${bit_score_threshold}.mmsdb + mmseqs filterdb --filter-column 2 --comparison-operator ge --comparison-value ${bit_score_threshold} --threads ${task.cpus} mmseqs_out/${input_fasta}_${db_name}.mmsdb mmseqs_out/${input_fasta}_${db_name}.mmsdb + + # Filter to only best hit + mmseqs filterdb mmseqs_out/${input_fasta}_${db_name}.mmsdb mmseqs_out/${input_fasta}_${db_name}.mmsdb --extract-lines 1 # Convert results to BLAST outformat 6 - mmseqs convertalis query_database/${input_fasta}.mmsdb ${db_name}.mmsdb mmseqs_out/${input_fasta}_${db_name}_tophit_minbitscore${bit_score_threshold}.mmsdb mmseqs_out/${input_fasta}___mmseqs_${db_name}.tsv --threads ${task.cpus} + mmseqs convertalis query_database/${input_fasta}.mmsdb ${db_name}.mmsdb mmseqs_out/${input_fasta}_${db_name}.mmsdb mmseqs_out/${input_fasta}___mmseqs_${db_name}.tsv --threads ${task.cpus} # if statement for kegg rbh goes here elif [ "${db_name}" == "pfam" ]; then diff --git a/modules/nf-core/antismash/antismash/environment.yml b/modules/nf-core/antismash/antismash/environment.yml new file mode 100644 index 00000000..f03e68e2 --- /dev/null +++ b/modules/nf-core/antismash/antismash/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::antismash=8.0.1" diff --git a/modules/nf-core/antismash/antismash/main.nf b/modules/nf-core/antismash/antismash/main.nf new file mode 100644 index 00000000..42b51699 --- /dev/null +++ b/modules/nf-core/antismash/antismash/main.nf @@ -0,0 +1,85 @@ +process ANTISMASH_ANTISMASH { + tag "${meta.id}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "nf-core/antismash:8.0.1--pyhdfd78af_0" + + input: + tuple val(meta), path(sequence_input) + path databases + path gff + + output: + tuple val(meta), path("${prefix}/{css,images,js}") , 
emit: html_accessory_files + tuple val(meta), path("${prefix}/*.gbk") , emit: gbk_input + tuple val(meta), path("${prefix}/*.json") , emit: json_results + tuple val(meta), path("${prefix}/*.log") , emit: log + tuple val(meta), path("${prefix}/*.zip") , emit: zip + tuple val(meta), path("${prefix}/index.html") , emit: html + tuple val(meta), path("${prefix}/regions.js") , emit: json_sideloading + tuple val(meta), path("${prefix}/clusterblast/*_c*.txt") , emit: clusterblast_file , optional: true + tuple val(meta), path("${prefix}/knownclusterblast/region*/ctg*.html"), emit: knownclusterblast_html , optional: true + tuple val(meta), path("${prefix}/knownclusterblast/") , emit: knownclusterblast_dir , optional: true + tuple val(meta), path("${prefix}/knownclusterblast/*_c*.txt") , emit: knownclusterblast_txt , optional: true + tuple val(meta), path("${prefix}/svg/clusterblast*.svg") , emit: svg_files_clusterblast , optional: true + tuple val(meta), path("${prefix}/svg/knownclusterblast*.svg") , emit: svg_files_knownclusterblast, optional: true + tuple val(meta), path("${prefix}/*region*.gbk") , emit: gbk_results , optional: true + tuple val(meta), path("${prefix}/clusterblastoutput.txt") , emit: clusterblastoutput , optional: true + tuple val(meta), path("${prefix}/knownclusterblastoutput.txt") , emit: knownclusterblastoutput , optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + gff_flag = gff ? 
"--genefinding-gff3 ${gff}" : "" + + """ + ## We specifically do not include on-the-fly annotations (--genefinding-tool none) as + ## this should be run as a separate module for versioning purposes + + antismash \\ + ${args} \\ + ${gff_flag} \\ + -c ${task.cpus} \\ + --output-dir ${prefix} \\ + --output-basename ${prefix} \\ + --genefinding-tool none \\ + --logfile ${prefix}/${prefix}.log \\ + --databases ${databases} \\ + ${sequence_input} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + antismash: \$(echo \$(antismash --version) | sed 's/antiSMASH //;s/-.*//g') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir -p ${prefix}/css + mkdir ${prefix}/images + mkdir ${prefix}/js + touch ${prefix}/NZ_CP069563.1.region001.gbk + touch ${prefix}/NZ_CP069563.1.region002.gbk + touch ${prefix}/css/bacteria.css + touch ${prefix}/genome.gbk + touch ${prefix}/genome.json + touch ${prefix}/genome.zip + touch ${prefix}/images/about.svg + touch ${prefix}/index.html + touch ${prefix}/js/antismash.js + touch ${prefix}/js/jquery.js + touch ${prefix}/regions.js + touch ${prefix}/test.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + antismash: \$(echo \$(antismash --version) | sed 's/antiSMASH //;s/-.*//g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/antismash/antismash/meta.yml b/modules/nf-core/antismash/antismash/meta.yml new file mode 100644 index 00000000..8dcb610b --- /dev/null +++ b/modules/nf-core/antismash/antismash/meta.yml @@ -0,0 +1,245 @@ +name: antismash_antismash +description: | + antiSMASH allows the rapid genome-wide identification, annotation + and analysis of secondary metabolite biosynthesis gene clusters. 
+keywords: + - secondary metabolites + - BGC + - biosynthetic gene cluster + - genome mining + - NRPS + - RiPP + - antibiotics + - prokaryotes + - bacteria + - eukaryotes + - fungi + - antismash +tools: + - antismash: + description: "antiSMASH - the antibiotics and Secondary Metabolite Analysis SHell" + homepage: "https://docs.antismash.secondarymetabolites.org" + documentation: "https://docs.antismash.secondarymetabolites.org" + tool_dev_url: "https://github.com/antismash/antismash" + doi: "10.1093/nar/gkab335" + licence: ["AGPL v3"] + identifier: biotools:antismash +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - sequence_input: + type: file + description: Nucleotide sequence file (annotated) + pattern: "*.{gbk, gb, gbff, genbank, embl, fasta, fna}" + ontologies: [] + - databases: + type: directory + description: | + Downloaded AntiSMASH databases (e.g. in the AntiSMASH installation directory + "data/databases") + pattern: "*/" + - gff: + type: file + description: Optional GFF3 file containing premade annotations of the input sequence + pattern: "*.gff" + ontologies: [] +output: + html_accessory_files: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - ${prefix}/{css,images,js}: + type: directory + description: Accessory files for the HTML output + pattern: "{css/,images/,js/}" + gbk_input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - ${prefix}/*.gbk: + type: file + description: Nucleotide sequence and annotations in GenBank format; converted + from input file + pattern: "*.gbk" + ontologies: [] + json_results: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test' ] + - ${prefix}/*.json: + type: file + description: Nucleotide sequence and annotations in JSON format; converted + from GenBank file (gbk_input) + pattern: "*.json" + ontologies: + - edam: http://edamontology.org/format_3464 # JSON + log: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - ${prefix}/*.log: + type: file + description: Contains all the logging output that antiSMASH produced during + its run + pattern: "*.log" + ontologies: [] + zip: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - ${prefix}/*.zip: + type: file + description: Contains a compressed version of the output folder in zip format + pattern: "*.zip" + ontologies: + - edam: http://edamontology.org/format_3987 # ZIP format + html: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - ${prefix}/index.html: + type: file + description: Graphical web view of results in HTML format + pattern: "index.html" + ontologies: [] + json_sideloading: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - ${prefix}/regions.js: + type: file + description: Sideloaded annotations of protoclusters and/or subregions (see + antiSMASH documentation "Annotation sideloading") + pattern: "regions.js" + ontologies: [] + clusterblast_file: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - ${prefix}/clusterblast/*_c*.txt: + type: file + description: Output of ClusterBlast algorithm + pattern: "clusterblast/*_c*.txt" + ontologies: [] + knownclusterblast_html: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test' ] + - ${prefix}/knownclusterblast/region*/ctg*.html: + type: file + description: Tables with MIBiG hits in HTML format + pattern: "knownclusterblast/region*/ctg*.html" + ontologies: [] + knownclusterblast_dir: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - ${prefix}/knownclusterblast/: + type: directory + description: Directory with MIBiG hits + pattern: "knownclusterblast/" + knownclusterblast_txt: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - ${prefix}/knownclusterblast/*_c*.txt: + type: file + description: Tables with MIBiG hits + pattern: "knownclusterblast/*_c*.txt" + ontologies: [] + svg_files_clusterblast: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - ${prefix}/svg/clusterblast*.svg: + type: file + description: SVG images showing the % identity of the aligned hits against + their queries + pattern: "svg/clusterblast*.svg" + ontologies: [] + svg_files_knownclusterblast: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - ${prefix}/svg/knownclusterblast*.svg: + type: file + description: SVG images showing the % identity of the aligned hits against + their queries + pattern: "svg/knownclusterblast*.svg" + ontologies: [] + gbk_results: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - ${prefix}/*region*.gbk: + type: file + description: Nucleotide sequence and annotations in GenBank format; one file + per antiSMASH hit + pattern: "*region*.gbk" + ontologies: [] + clusterblastoutput: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test' ] + - ${prefix}/clusterblastoutput.txt: + type: file + description: Raw BLAST output of known clusters previously predicted by antiSMASH + using the built-in ClusterBlast algorithm + pattern: "clusterblastoutput.txt" + ontologies: [] + knownclusterblastoutput: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - ${prefix}/knownclusterblastoutput.txt: + type: file + description: Raw BLAST output of known clusters of the MIBiG database + pattern: "knownclusterblastoutput.txt" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@jasmezz" +maintainers: + - "@jasmezz" + - "@jfy133" diff --git a/modules/nf-core/antismash/antismash/tests/main.nf.test b/modules/nf-core/antismash/antismash/tests/main.nf.test new file mode 100644 index 00000000..9cfd3a01 --- /dev/null +++ b/modules/nf-core/antismash/antismash/tests/main.nf.test @@ -0,0 +1,92 @@ +nextflow_process { + + name "Test Process ANTISMASH_ANTISMASH" + script "../main.nf" + process "ANTISMASH_ANTISMASH" + config './nextflow.config' + + tag "modules" + tag "modules_nfcore" + tag "antismash" + tag "antismash/antismash" + tag "antismash/antismashdownloaddatabases" + tag "gunzip" + + + setup { + + run("ANTISMASH_ANTISMASHDOWNLOADDATABASES") { + script "../../../antismash/antismashdownloaddatabases" + process { + """ + """ + } + } + + run("GUNZIP") { + script "../../../gunzip" + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/genome/genome.gbff.gz', checkIfExists: true) + ] + """ + } + } + } + + test("antismash - bacteroides_fragilis - genome") { + + when { + process { + """ + input[0] = GUNZIP.out.gunzip + input[1] = ANTISMASH_ANTISMASHDOWNLOADDATABASES.out.database + input[2] = [] + """ + } + } + + then { + 
assertAll( + { assert process.success }, + { assert snapshot( + path(process.out.gbk_results.get(0).get(1).get(0)).text.contains("##antiSMASH-Data-START##"), + path(process.out.gbk_input.get(0).get(1).get(0)).text.contains("##antiSMASH-Data-END##"), + path(process.out.zip.get(0).get(1)).exists(), + path(process.out.html.get(0).get(1)).text.contains("https://antismash.secondarymetabolites.org/"), + path(process.out.json_sideloading.get(0).get(1)).text.contains("NZ_CP069563.1"), + path(process.out.log.get(0).get(1)).text.contains("antiSMASH status: SUCCESS"), + process.out.html_accessory_files, + process.out.versions, + path(process.out.versions[0]).yaml + ).match()} + ) + } + } + + test("antismash - bacteroides_fragilis - genome - stub") { + options "-stub" + + when { + process { + """ + input[0] = GUNZIP.out.gunzip + input[1] = ANTISMASH_ANTISMASHDOWNLOADDATABASES.out.database + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out, + path(process.out.versions[0]).yaml + ).match()} + ) + } + } +} diff --git a/modules/nf-core/antismash/antismash/tests/main.nf.test.snap b/modules/nf-core/antismash/antismash/tests/main.nf.test.snap new file mode 100644 index 00000000..85735c18 --- /dev/null +++ b/modules/nf-core/antismash/antismash/tests/main.nf.test.snap @@ -0,0 +1,308 @@ +{ + "antismash - bacteroides_fragilis - genome": { + "content": [ + true, + true, + true, + true, + true, + true, + [ + [ + { + "id": "test" + }, + [ + [ + "bacteria.css:md5,e5b4d3ceaa91b03f6393d9b3d5f072e7" + ], + [ + "about.svg:md5,2573f954dd506e2d0878daed04f5420a", + "bacteria_about.png:md5,99cdc2aa09aee37553b10ca86b172170", + "bacteria_antismash_icon.svg:md5,23a265b0e1cf293a4743fe13030b636f", + "bacteria_antismash_logo.svg:md5,f80f639969ee6506571ffda2e197df93", + "bacteria_antismash_white.svg:md5,2c9da15cc168d8f796269d037b5e7f60", + "bacteria_download.png:md5,c3428df1cf17cb97e2897ca6daa93d48", + 
"bacteria_help.png:md5,359b68f90c73208eb389759c0f5c1091", + "bacteria_home.png:md5,6595d97ee49d251fe038207f82012eff", + "bacteria_logo.png:md5,013f84d6dd93cde96f07084ff63d855c", + "contact.svg:md5,53b878c2af4f8a80a647ac30f61e6bf6", + "download.svg:md5,722038156f4ece46747cbf6908501974", + "expand-arrows-alt-solid.svg:md5,21b37749f54320135a455ed266a7fc3a", + "external-link-alt-solid.svg:md5,ca337694c74e57f73d15ca9db30081ba", + "fungi_about.png:md5,4d55bf14df0340dca01a286487fa8448", + "fungi_antismash_icon.svg:md5,2acc19cc91d5d7285a72f0b3912e108a", + "fungi_antismash_icon_white.svg:md5,961f1c41e25036a625f115f209a961c7", + "fungi_antismash_logo.svg:md5,36560983a36f46786c98a05125b15724", + "fungi_download.png:md5,782580852674aab0b69b2b94a94c7615", + "fungi_help.png:md5,0ac06748f3177d150ab90997117c4f64", + "fungi_home.png:md5,880071898062d6dafe989ac73bb7bbea", + "fungi_logo.png:md5,29294392a3953fd1ba12d1a39cebaeeb", + "help.svg:md5,e7565a3cd74893422f2886a0af748df2", + "mail.png:md5,049f51233b29663e4e4e4c8097c2d096", + "minus-circle.svg:md5,b523305570d06b6e34cd7099bed22015", + "nostructure_icon.png:md5,fc982a5b84a1a99db607731625a87f88", + "plant_antismash_icon.svg:md5,e031de9570ef2809e52502481a5e77ea", + "plant_antismash_icon_white.svg:md5,10d25996b023dbdaed4a382471ab4877", + "plus-circle.svg:md5,cba2cdd9ef893274f572228b354718cf", + "question-circle-solid.svg:md5,6dbc83547e29ecedc7f2a5b81354353b", + "search-solid.svg:md5,aeab848c26357f3d120f3e58f1efa8f5" + ], + [ + "antismash.js:md5,b452a926645e2d4dd93f8a685275aa79", + "jquery.js:md5,397754ba49e9e0cf4e7c190da78dda05", + "jquery.tablesorter.min.js:md5,5e9e08cef4d1be0eaa538e6eb28809a7" + ] + ] + ] + ], + [ + "versions.yml:md5,48e6949487e113c0b097dcee63dc894d" + ], + { + "ANTISMASH_ANTISMASH": { + "antismash": "8.0.1" + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-03T08:17:33.268167622" + }, + "antismash - bacteroides_fragilis - genome - stub": { + "content": [ + { + "0": 
[ + [ + { + "id": "test" + }, + [ + [ + "bacteria.css:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + "about.svg:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + "antismash.js:md5,d41d8cd98f00b204e9800998ecf8427e", + "jquery.js:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ] + ], + "1": [ + [ + { + "id": "test" + }, + [ + "NZ_CP069563.1.region001.gbk:md5,d41d8cd98f00b204e9800998ecf8427e", + "NZ_CP069563.1.region002.gbk:md5,d41d8cd98f00b204e9800998ecf8427e", + "genome.gbk:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "10": [ + + ], + "11": [ + + ], + "12": [ + + ], + "13": [ + [ + { + "id": "test" + }, + [ + "NZ_CP069563.1.region001.gbk:md5,d41d8cd98f00b204e9800998ecf8427e", + "NZ_CP069563.1.region002.gbk:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "14": [ + + ], + "15": [ + + ], + "16": [ + "versions.yml:md5,48e6949487e113c0b097dcee63dc894d" + ], + "2": [ + [ + { + "id": "test" + }, + "genome.json:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "test" + }, + "test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + [ + { + "id": "test" + }, + "genome.zip:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "5": [ + [ + { + "id": "test" + }, + "index.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "6": [ + [ + { + "id": "test" + }, + "regions.js:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "7": [ + + ], + "8": [ + + ], + "9": [ + + ], + "clusterblast_file": [ + + ], + "clusterblastoutput": [ + + ], + "gbk_input": [ + [ + { + "id": "test" + }, + [ + "NZ_CP069563.1.region001.gbk:md5,d41d8cd98f00b204e9800998ecf8427e", + "NZ_CP069563.1.region002.gbk:md5,d41d8cd98f00b204e9800998ecf8427e", + "genome.gbk:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "gbk_results": [ + [ + { + "id": "test" + }, + [ + "NZ_CP069563.1.region001.gbk:md5,d41d8cd98f00b204e9800998ecf8427e", + "NZ_CP069563.1.region002.gbk:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "html": [ + [ + { + "id": "test" + }, + 
"index.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "html_accessory_files": [ + [ + { + "id": "test" + }, + [ + [ + "bacteria.css:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + "about.svg:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + "antismash.js:md5,d41d8cd98f00b204e9800998ecf8427e", + "jquery.js:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ] + ], + "json_results": [ + [ + { + "id": "test" + }, + "genome.json:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "json_sideloading": [ + [ + { + "id": "test" + }, + "regions.js:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "knownclusterblast_dir": [ + + ], + "knownclusterblast_html": [ + + ], + "knownclusterblast_txt": [ + + ], + "knownclusterblastoutput": [ + + ], + "log": [ + [ + { + "id": "test" + }, + "test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "svg_files_clusterblast": [ + + ], + "svg_files_knownclusterblast": [ + + ], + "versions": [ + "versions.yml:md5,48e6949487e113c0b097dcee63dc894d" + ], + "zip": [ + [ + { + "id": "test" + }, + "genome.zip:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + }, + { + "ANTISMASH_ANTISMASH": { + "antismash": "8.0.1" + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-01T15:56:34.83807227" + } +} diff --git a/modules/nf-core/antismash/antismash/tests/nextflow.config b/modules/nf-core/antismash/antismash/tests/nextflow.config new file mode 100644 index 00000000..d76b72bd --- /dev/null +++ b/modules/nf-core/antismash/antismash/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: ANTISMASH_ANTISMASH { + memory = 7.GB + } +} diff --git a/modules/nf-core/antismash/antismashdownloaddatabases/environment.yml b/modules/nf-core/antismash/antismashdownloaddatabases/environment.yml new file mode 100644 index 00000000..f03e68e2 --- /dev/null +++ b/modules/nf-core/antismash/antismashdownloaddatabases/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: 
$schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::antismash=8.0.1" diff --git a/modules/nf-core/antismash/antismashdownloaddatabases/main.nf b/modules/nf-core/antismash/antismashdownloaddatabases/main.nf new file mode 100644 index 00000000..3c1d33b0 --- /dev/null +++ b/modules/nf-core/antismash/antismashdownloaddatabases/main.nf @@ -0,0 +1,49 @@ +process ANTISMASH_ANTISMASHDOWNLOADDATABASES { + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "nf-core/antismash:8.0.1--pyhdfd78af_0" + + output: + path "antismash_db", emit: database + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + download-antismash-databases \\ + --database-dir antismash_db \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + antismash: \$(echo \$(antismash --version) | sed 's/antiSMASH //;s/-.*//g') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + """ + echo "download-antismash-databases --database-dir antismash_db ${args}" + + mkdir antismash_db + mkdir antismash_db/as-js + mkdir antismash_db/clusterblast + mkdir antismash_db/clustercompare + mkdir antismash_db/comparippson + mkdir antismash_db/knownclusterblast + mkdir antismash_db/mite + mkdir antismash_db/nrps_pks + mkdir antismash_db/pfam + mkdir antismash_db/resfam + mkdir antismash_db/tigrfam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + antismash: \$(echo \$(antismash --version) | sed 's/antiSMASH //;s/-.*//g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/antismash/antismashdownloaddatabases/meta.yml b/modules/nf-core/antismash/antismashdownloaddatabases/meta.yml new file mode 100644 index 00000000..ad1e6cbc --- /dev/null +++ b/modules/nf-core/antismash/antismashdownloaddatabases/meta.yml @@ -0,0 +1,46 @@ +name: 
antismash_antismashdownloaddatabases +description: antiSMASH allows the rapid genome-wide identification, annotation and + analysis of secondary metabolite biosynthesis gene clusters. This module downloads + the antiSMASH databases for conda and docker/singularity runs. +keywords: + - secondary metabolites + - BGC + - biosynthetic gene cluster + - genome mining + - NRPS + - RiPP + - antibiotics + - prokaryotes + - bacteria + - eukaryotes + - fungi + - antismash + - database +tools: + - antismash: + description: antiSMASH - the antibiotics and Secondary Metabolite Analysis SHell + homepage: https://docs.antismash.secondarymetabolites.org + documentation: https://docs.antismash.secondarymetabolites.org + tool_dev_url: https://github.com/antismash/antismash + doi: "10.1093/nar/gkab335" + licence: ["AGPL v3"] + identifier: biotools:antismash +input: [] +output: + database: + - antismash_db: + type: directory + description: Download directory for antiSMASH databases + pattern: "antismash_db" + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@jasmezz" +maintainers: + - "@jasmezz" + - "@jfy133" diff --git a/modules/nf-core/antismash/antismashdownloaddatabases/tests/main.nf.test b/modules/nf-core/antismash/antismashdownloaddatabases/tests/main.nf.test new file mode 100644 index 00000000..e25dff2a --- /dev/null +++ b/modules/nf-core/antismash/antismashdownloaddatabases/tests/main.nf.test @@ -0,0 +1,57 @@ +nextflow_process { + + name "Test Process ANTISMASH_ANTISMASHDOWNLOADDATABASES" + script "../main.nf" + process "ANTISMASH_ANTISMASHDOWNLOADDATABASES" + config './nextflow.config' + + tag "modules" + tag "modules_nfcore" + tag "antismash" + tag "antismash/antismashdownloaddatabases" + + test("antismash/downloaddatabases") { + + when { + process { + """ + """ + } + } + + then { + assertAll( + { assert process.success }, + { 
assert snapshot( + file(process.out.database.get(0)).list().sort(), + path(process.out.versions[0]).yaml, + file(process.out.versions[0]).name, + ).match() + } + ) + } + } + + test("antismash/downloaddatabases - stub") { + + options "-stub" + + when { + process { + """ + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.database.get(0)).list().sort(), + file(process.out.versions[0]).name, + ).match() + } + ) + } + } +} diff --git a/modules/nf-core/antismash/antismashdownloaddatabases/tests/main.nf.test.snap b/modules/nf-core/antismash/antismashdownloaddatabases/tests/main.nf.test.snap new file mode 100644 index 00000000..6c8a33d7 --- /dev/null +++ b/modules/nf-core/antismash/antismashdownloaddatabases/tests/main.nf.test.snap @@ -0,0 +1,51 @@ +{ + "antismash/downloaddatabases - stub": { + "content": [ + [ + "as-js", + "clusterblast", + "clustercompare", + "comparippson", + "knownclusterblast", + "mite", + "nrps_pks", + "pfam", + "resfam", + "tigrfam" + ], + "versions.yml" + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-05-22T07:52:56.373189968" + }, + "antismash/downloaddatabases": { + "content": [ + [ + "as-js", + "clusterblast", + "clustercompare", + "comparippson", + "knownclusterblast", + "mite", + "nrps_pks", + "pfam", + "resfam", + "tigrfam" + ], + { + "ANTISMASH_ANTISMASHDOWNLOADDATABASES": { + "antismash": "8.0.1" + } + }, + "versions.yml" + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.2" + }, + "timestamp": "2025-06-03T08:25:39.61984576" + } +} diff --git a/modules/nf-core/antismash/antismashdownloaddatabases/tests/nextflow.config b/modules/nf-core/antismash/antismashdownloaddatabases/tests/nextflow.config new file mode 100644 index 00000000..63ec101f --- /dev/null +++ b/modules/nf-core/antismash/antismashdownloaddatabases/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: ANTISMASH_ANTISMASHDOWNLOADDATABASES { + memory = 7.GB + } +} diff 
--git a/modules/nf-core/rgi/main/environment.yml b/modules/nf-core/rgi/main/environment.yml new file mode 100644 index 00000000..b6b2d343 --- /dev/null +++ b/modules/nf-core/rgi/main/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::rgi=6.0.5 diff --git a/modules/nf-core/rgi/main/main.nf b/modules/nf-core/rgi/main/main.nf new file mode 100644 index 00000000..744dbc92 --- /dev/null +++ b/modules/nf-core/rgi/main/main.nf @@ -0,0 +1,91 @@ +process RGI_MAIN { + tag "${meta.id}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 'https://depot.galaxyproject.org/singularity/rgi:6.0.5--pyh05cac1d_0' + : 'biocontainers/rgi:6.0.5--pyh05cac1d_0'}" + + input: + tuple val(meta), path(fasta) + path card + path wildcard + + output: + tuple val(meta), path("*.json"), emit: json + tuple val(meta), path("*.txt"), emit: tsv + tuple val(meta), path("temp/"), emit: tmp + env 'RGI_VERSION', emit: tool_version + env 'DB_VERSION', emit: db_version + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + // This customizes the command: rgi load + def args2 = task.ext.args2 ?: '' + // This customizes the command: rgi main + def prefix = task.ext.prefix ?: "${meta.id}" + def load_wildcard = "" + + if (wildcard) { + load_wildcard = """ \\ + --wildcard_annotation ${wildcard}/wildcard_database_v\$DB_VERSION.fasta \\ + --wildcard_annotation_all_models ${wildcard}/wildcard_database_v\$DB_VERSION\\_all.fasta \\ + --wildcard_index ${wildcard}/wildcard/index-for-model-sequences.txt \\ + --amr_kmers ${wildcard}/wildcard/all_amr_61mers.txt \\ + --kmer_database ${wildcard}/wildcard/61_kmer_db.json \\ + --kmer_size 61 + """ + } + 
+ """ + DB_VERSION=\$(ls ${card}/card_database_*_all.fasta | sed "s/${card}\\/card_database_v\\([0-9].*[0-9]\\).*/\\1/") + + rgi \\ + load \\ + ${args} \\ + --card_json ${card}/card.json \\ + --debug --local \\ + --card_annotation ${card}/card_database_v\$DB_VERSION.fasta \\ + --card_annotation_all_models ${card}/card_database_v\$DB_VERSION\\_all.fasta \\ + ${load_wildcard} + + rgi \\ + main \\ + ${args2} \\ + --num_threads ${task.cpus} \\ + --output_file ${prefix} \\ + --input_sequence ${fasta} + + mkdir temp/ + for FILE in *.xml *.fsa *.{nhr,nin,nsq} *.draft *.potentialGenes *{variant,rrna,protein,predictedGenes,overexpression,homolog}.json; do [[ -e \$FILE ]] && mv \$FILE temp/; done + + RGI_VERSION=\$(rgi main --version) + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + rgi: \$(echo \$RGI_VERSION) + rgi-database: \$(echo \$DB_VERSION) + END_VERSIONS + """ + + stub: + """ + mkdir -p temp + touch test.json + touch test.txt + + RGI_VERSION=\$(rgi main --version) + DB_VERSION=stub_version + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + rgi: \$(echo \$RGI_VERSION) + rgi-database: \$(echo \$DB_VERSION) + END_VERSIONS + """ +} diff --git a/modules/nf-core/rgi/main/meta.yml b/modules/nf-core/rgi/main/meta.yml new file mode 100644 index 00000000..f8b102f6 --- /dev/null +++ b/modules/nf-core/rgi/main/meta.yml @@ -0,0 +1,102 @@ +name: rgi_main +description: Predict antibiotic resistance from protein or nucleotide data +keywords: + - bacteria + - fasta + - antibiotic resistance +tools: + - rgi: + description: This tool provides a preliminary annotation of your DNA sequence(s) + based upon the data available in The Comprehensive Antibiotic Resistance Database + (CARD). Hits to genes tagged with Antibiotic Resistance ontology terms will + be highlighted. 
As CARD expands to include more pathogens, genomes, plasmids, + and ontology terms this tool will grow increasingly powerful in providing first-pass + detection of antibiotic resistance associated genes. See license at CARD website + homepage: https://card.mcmaster.ca + documentation: https://github.com/arpcard/rgi + tool_dev_url: https://github.com/arpcard/rgi + doi: "10.1093/nar/gkz935" + licence: ["https://card.mcmaster.ca/about"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Nucleotide or protein sequences in FASTA format + pattern: "*.{fasta,fasta.gz,fa,fa.gz,fna,fna.gz,faa,faa.gz}" + ontologies: [] + - card: + type: directory + description: Directory containing the CARD database. This is expected to be the + unarchived but otherwise unaltered download folder (see RGI documentation for + download instructions). + pattern: "*/" + - wildcard: + type: directory + description: Directory containing the WildCARD database (optional). This is expected + to be the unarchived but otherwise unaltered download folder (see RGI documentation + for download instructions). + pattern: "*/" +output: + json: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.json": + type: file + description: JSON formatted file with RGI results + pattern: "*.{json}" + ontologies: + - edam: http://edamontology.org/format_3464 # JSON + tsv: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.txt": + type: file + description: Tab-delimited file with RGI results + pattern: "*.{txt}" + ontologies: [] + tmp: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - temp/: + type: directory + description: Directory containing various intermediate files + pattern: "temp/" + tool_version: + - RGI_VERSION: + type: string + description: The version of the tool in string format (useful for downstream + tools such as hAMRronization) + db_version: + - DB_VERSION: + type: string + description: The version of the used database in string format (useful for downstream + tools such as hAMRronization) + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@rpetit3" + - "@jfy133" + - "@jasmezz" +maintainers: + - "@rpetit3" + - "@jfy133" + - "@jasmezz" diff --git a/modules/nf-core/rgi/main/tests/main.nf.test b/modules/nf-core/rgi/main/tests/main.nf.test new file mode 100644 index 00000000..fc4a5616 --- /dev/null +++ b/modules/nf-core/rgi/main/tests/main.nf.test @@ -0,0 +1,94 @@ +nextflow_process { + + name "Test Process RGI_MAIN" + script "../main.nf" + process "RGI_MAIN" + + tag "modules" + tag "modules_nfcore" + tag "rgi" + tag "rgi/main" + tag "rgi/cardannotation" + tag "untar" + + setup { + run("UNTAR") { + script "modules/nf-core/untar/main.nf" + process { + """ + file('https://card.mcmaster.ca/latest/data', checkIfExists: true).copyTo('card-data.tar.bz2') + + input[0] = [ + [ ], + file("card-data.tar.bz2") + ] + """ + } + } + + run("RGI_CARDANNOTATION") { + script "modules/nf-core/rgi/cardannotation" + process { + """ + input[0] = UNTAR.out.untar.map{ it[1] } + """ + } + } + } + + + test("rgi/main - haemophilus_influenzae - genome_fna_gz") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.modules_testdata_base_path + 'genomics/prokaryotes/haemophilus_influenzae/genome/genome.fna.gz', checkIfExists: true) + ] + input[1] = RGI_CARDANNOTATION.out.db + input[2] = [] + """ + } + } + + then { + assertAll( + { assert 
process.success }, + { assert snapshot( + process.out.versions, + process.out.tsv, + process.out.json, + file(process.out.tmp.get(0).get(1)).list().sort(), + process.out.tool_version, + process.out.db_version, + ).match() } + ) + } + } + + test("rgi/main - haemophilus_influenzae - genome_fna_gz - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.modules_testdata_base_path + 'genomics/prokaryotes/haemophilus_influenzae/genome/genome.fna.gz', checkIfExists: true) + ] + input[1] = RGI_CARDANNOTATION.out.db + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/rgi/main/tests/main.nf.test.snap b/modules/nf-core/rgi/main/tests/main.nf.test.snap new file mode 100644 index 00000000..bb326ac6 --- /dev/null +++ b/modules/nf-core/rgi/main/tests/main.nf.test.snap @@ -0,0 +1,143 @@ +{ + "rgi/main - haemophilus_influenzae - genome_fna_gz - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.json:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + [ + + ] + ] + ], + "3": [ + "6.0.5" + ], + "4": [ + "stub_version" + ], + "5": [ + "versions.yml:md5,b0808f9aef5a00d6542969c6dbd1c891" + ], + "db_version": [ + "stub_version" + ], + "json": [ + [ + { + "id": "test", + "single_end": false + }, + "test.json:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tmp": [ + [ + { + "id": "test", + "single_end": false + }, + [ + + ] + ] + ], + "tool_version": [ + "6.0.5" + ], + "tsv": [ + [ + { + "id": "test", + "single_end": false + }, + "test.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,b0808f9aef5a00d6542969c6dbd1c891" + ] + } + ], + "meta": { + 
"nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-09-03T10:03:18.046807023" + }, + "rgi/main - haemophilus_influenzae - genome_fna_gz": { + "content": [ + [ + "versions.yml:md5,1c882aa66647fa7275d0c9fd6d2dda5f" + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.txt:md5,9d7754551163e020beed52a8bc14ce83" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.json:md5,6c403fb8e2f24b7c88be27ba5a30ca70" + ] + ], + [ + "genome.fna.gz.temp.uncompressed.fsa", + "genome.fna.gz.temp.uncompressed.fsa.temp.blastRes.rrna.xml", + "genome.fna.gz.temp.uncompressed.fsa.temp.contig.fsa", + "genome.fna.gz.temp.uncompressed.fsa.temp.contig.fsa.blastRes.xml", + "genome.fna.gz.temp.uncompressed.fsa.temp.contigToORF.fsa", + "genome.fna.gz.temp.uncompressed.fsa.temp.db.nhr", + "genome.fna.gz.temp.uncompressed.fsa.temp.db.nin", + "genome.fna.gz.temp.uncompressed.fsa.temp.db.nsq", + "genome.fna.gz.temp.uncompressed.fsa.temp.draft", + "genome.fna.gz.temp.uncompressed.fsa.temp.homolog.json", + "genome.fna.gz.temp.uncompressed.fsa.temp.overexpression.json", + "genome.fna.gz.temp.uncompressed.fsa.temp.potentialGenes", + "genome.fna.gz.temp.uncompressed.fsa.temp.predictedGenes.json", + "genome.fna.gz.temp.uncompressed.fsa.temp.predictedGenes.protein.json", + "genome.fna.gz.temp.uncompressed.fsa.temp.rrna.json", + "genome.fna.gz.temp.uncompressed.fsa.temp.variant.json" + ], + [ + "6.0.5" + ], + [ + "4.0.1" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-09-03T10:03:00.004608398" + } +} diff --git a/nextflow.config b/nextflow.config index 3dab9ef2..846686e7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -45,6 +45,7 @@ params { /* Annotate Options */ // Annotation Database Flags + use_dram_db = false use_kegg = false use_kofam = false use_dbcan = false @@ -59,6 +60,10 @@ params { use_merops = false use_uniref = false use_metals = false + use_antismash = false + use_tcdb = false + use_rgi = 
false + use_card = false use_vog = false // TODO: Add vog annotation, not well supported currently use_viral = false // TODO: Add viral annotation, not well supported currently // use_viral = false // TODO: Add viral annotation @@ -99,6 +104,9 @@ params { metals_e_value = "1e-3" // Database locations + // DRAM DB + dram_db = "${launchDir}/databases/dram_db/dram_db.hmm" + dram_db_list = "${launchDir}/databases/dram_db/dram_db_scores.tsv" // KEGG kegg_db = "${launchDir}/databases/kegg/" // Uniref @@ -140,6 +148,12 @@ methyl_db = "${launchDir}/databases/methyl/" // Metals metals_db = "${launchDir}/databases/metals/" + // antiSMASH + antismash_db = "${launchDir}/databases/antismash/" + // rgi card + card_db = "${launchDir}/databases/card/" + // TCDB + tcdb_db = "${launchDir}/databases/tcdb" // SQL annotation descriptions database sql_descriptions_db = "${launchDir}/databases/db_descriptions/description_db.sqlite" @@ -481,7 +495,7 @@ manifest { mainScript = 'main.nf' defaultBranch = 'master' nextflowVersion = '!>=24' - version = '2.0.0-beta27' + version = '2.0.0-beta28' doi = '' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 44854bf1..689e2e17 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -134,6 +134,10 @@ "type": "string", "description": "Alternative way to specify database list for annotation. Comma sepeterated list of databases to include in the annotates. Use `all` for all. Example: 'kegg,dbcan,kofam,merops,viral,camper,cant_hyd,fegenie,sulfur,methyl,uniref,pfam,vogdb'. When in doubt, use the name after `use_` for each database. This option overrides individual `use_` database flags. (WARNING, this option name may change in the future)" }, + "use_dram_db": { + "type": "boolean", + "description": "Use the DRAM team specialized databases for annotation." + }, "use_camper": { "type": "boolean", "description": "Use the CAMPer database for annotation." 
@@ -186,6 +190,22 @@ "type": "boolean", "description": "Use the Metals database for annotation." }, + "use_antismash": { + "type": "boolean", + "description": "Use the antiSMASH database, currently experimental. Raw antiSMASH output only." + }, + "use_rgi": { + "type": "boolean", + "description": "Use RGI AMR analysis tool, currently experimental. Raw RGI output only." + }, + "use_card": { + "type": "boolean", + "description": "Use the CARD database for annotation." + }, + "use_tcdb": { + "type": "boolean", + "description": "Use the TCDB database for annotation." + }, "use_vog": { "type": "boolean" }, @@ -325,6 +345,16 @@ "fa_icon": "fas fa-database", "description": "File paths to databases used in the workflow.", "properties": { + "dram_db": { + "type": "string", + "default": "${launchDir}/databases/dram_db/dram_db.hmm", + "hidden": true + }, + "dram_db_list": { + "type": "string", + "default": "${launchDir}/databases/dram_db/dram_db_scores.tsv", + "hidden": true + }, "kegg_db": { "type": "string", "default": "${launchDir}/databases/kegg/", @@ -340,6 +370,21 @@ "default": "${launchDir}/databases/metals/", "hidden": true }, + "antismash_db": { + "type": "string", + "default": "${launchDir}/databases/antismash/", + "hidden": true + }, + "card_db": { + "type": "string", + "default": "${launchDir}/databases/card/", + "hidden": true + }, + "tcdb_db": { + "type": "string", + "default": "${launchDir}/databases/tcdb/", + "hidden": true + }, "pfam_mmseq_db": { "type": "string", "default": "${launchDir}/databases/pfam/mmseqs/", diff --git a/subworkflows/local/annotate.nf b/subworkflows/local/annotate.nf index 3316fb82..e8343162 100644 --- a/subworkflows/local/annotate.nf +++ b/subworkflows/local/annotate.nf @@ -28,6 +28,11 @@ workflow ANNOTATE { use_merops use_uniref use_metals + use_antismash + use_rgi + use_card + use_tcdb + use_dram_db use_vog main: @@ -38,6 +43,9 @@ workflow ANNOTATE { ch_quast_stats = default_sheet ch_collected_fna = default_sheet + ch_gene_gff = default_sheet
+ ch_filtered_fasta = default_sheet + ch_called_genes = default_sheet if (call){ fasta_name = ch_fasta.map { it[0] } @@ -65,6 +73,9 @@ workflow ANNOTATE { ch_gene_locs = CALL.out.ch_gene_locs ch_called_proteins = CALL.out.ch_called_proteins ch_collected_fna = CALL.out.ch_collected_fna + ch_gene_gff = CALL.out.ch_gene_gff + ch_filtered_fasta = CALL.out.ch_filtered_fasta + ch_called_genes = CALL.out.ch_called_genes } else { @@ -98,12 +109,28 @@ workflow ANNOTATE { ch_gene_locs = GENE_LOCS.out.prodigal_locs_tsv // n_fastas = file("$params.input_genes/${params.genes_fmt}").size() } + ch_antismash_map = ch_filtered_fasta + .map { file -> + def meta = [:] + meta.id = file.getBaseName() + tuple(meta, file) + } + ch_rgi_map = ch_called_genes + .map { + file_name, file -> + def meta = [:] + meta.id = file_name + tuple(meta, file) + } if (params.annotate){ DB_SEARCH( ch_gene_locs, ch_called_proteins, + ch_antismash_map, + ch_rgi_map, + ch_gene_gff, default_sheet, use_kegg, use_kofam, @@ -118,6 +145,11 @@ workflow ANNOTATE { use_merops, use_uniref, use_metals, + use_antismash, + use_rgi, + use_card, + use_tcdb, + use_dram_db, use_vog ) ch_combined_annotations = DB_SEARCH.out.ch_combined_annotations diff --git a/subworkflows/local/call.nf b/subworkflows/local/call.nf index fd34c1c4..9e5fd89f 100644 --- a/subworkflows/local/call.nf +++ b/subworkflows/local/call.nf @@ -61,4 +61,6 @@ workflow CALL { ch_collected_faa ch_collected_fna ch_collected_fasta + ch_gene_gff + ch_filtered_fasta } diff --git a/subworkflows/local/db_search.nf b/subworkflows/local/db_search.nf index a4edfec6..bebfda76 100644 --- a/subworkflows/local/db_search.nf +++ b/subworkflows/local/db_search.nf @@ -22,6 +22,8 @@ include { MMSEQS_SEARCH as MMSEQS_SEARCH_CANTHYD } from "../../modules/lo include { MMSEQS_SEARCH as MMSEQS_SEARCH_KEGG } from "../../modules/local/annotate/mmseqs_search.nf" include { MMSEQS_SEARCH as MMSEQS_SEARCH_UNIREF } from "../../modules/local/annotate/mmseqs_search.nf" include { 
MMSEQS_SEARCH as MMSEQS_SEARCH_PFAM } from "../../modules/local/annotate/mmseqs_search.nf" +include { MMSEQS_SEARCH as MMSEQS_SEARCH_CARD } from "../../modules/local/annotate/mmseqs_search.nf" +include { MMSEQS_SEARCH as MMSEQS_SEARCH_TCDB } from "../../modules/local/annotate/mmseqs_search.nf" include { ADD_SQL_DESCRIPTIONS as SQL_UNIREF } from "../../modules/local/annotate/add_sql_descriptions.nf" include { ADD_SQL_DESCRIPTIONS as SQL_VIRAL } from "../../modules/local/annotate/add_sql_descriptions.nf" @@ -32,8 +34,9 @@ include { ADD_SQL_DESCRIPTIONS as SQL_DBCAN } from "../../modules/lo include { HMM_SEARCH as HMM_SEARCH_KOFAM } from "../../modules/local/annotate/hmmsearch.nf" include { HMM_SEARCH as HMM_SEARCH_DBCAN } from "../../modules/local/annotate/hmmsearch.nf" -include { HMM_SEARCH as HMM_SEARCH_DBCAN3 } from "../../modules/local/annotate/hmmsearch.nf" -include { HMM_SEARCH as HMM_SEARCH_DBCAN3_SUB } from "../../modules/local/annotate/hmmsearch.nf" +include { HMM_SEARCH as HMM_SEARCH_DBCAN3 } from "../../modules/local/annotate/hmmsearch.nf" +include { HMM_SEARCH as HMM_SEARCH_DBCAN3_SUB } from "../../modules/local/annotate/hmmsearch.nf" +include { HMM_SEARCH as HMM_SEARCH_DRAM_DB } from "../../modules/local/annotate/hmmsearch.nf" include { HMM_SEARCH as HMM_SEARCH_VOG } from "../../modules/local/annotate/hmmsearch.nf" include { HMM_SEARCH as HMM_SEARCH_CAMPER } from "../../modules/local/annotate/hmmsearch.nf" include { HMM_SEARCH as HMM_SEARCH_CANTHYD } from "../../modules/local/annotate/hmmsearch.nf" @@ -41,6 +44,9 @@ include { HMM_SEARCH as HMM_SEARCH_SULFUR } from "../../modules/lo include { HMM_SEARCH as HMM_SEARCH_FEGENIE } from "../../modules/local/annotate/hmmsearch.nf" include { HMM_SEARCH as HMM_SEARCH_METALS } from "../../modules/local/annotate/hmmsearch.nf" +include { ANTISMASH_ANTISMASH } from '../../modules/nf-core/antismash/antismash/main' +include { RGI_MAIN } from '../../modules/nf-core/rgi/main/main' + /* 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SUBWORKFLOW TO DB_SEARCH @@ -51,6 +57,9 @@ workflow DB_SEARCH { take: ch_gene_locs // channel: path(gene_locs_tsv) ] ch_called_proteins // channel: [ val(input_fasta name), path(called_proteins_file.faa) ] + ch_antismash_map + ch_rgi_map + ch_gene_gff default_sheet // Path to dummy sheet use_kegg use_kofam @@ -65,6 +74,11 @@ workflow DB_SEARCH { use_merops use_uniref use_metals + use_antismash + use_rgi + use_card + use_tcdb + use_dram_db use_vog main: @@ -83,6 +97,11 @@ workflow DB_SEARCH { use_merops, use_uniref, use_metals, + use_antismash, + use_rgi, + use_card, + use_tcdb, + use_dram_db, use_vog ) @@ -94,6 +113,7 @@ workflow DB_SEARCH { ch_vog_list = file(params.vog_list) ch_camper_hmm_list = file(params.camper_hmm_list) ch_canthyd_hmm_list = file(params.cant_hyd_hmm_list) + ch_dram_db_hmm_list = file(params.dram_db_list) kegg_name = "kegg" @@ -112,6 +132,9 @@ workflow DB_SEARCH { pfam_name = "pfam" vogdb_name = "vogdb" metals_name = "metals" + card_name = "card" + tcdb_name = "tcdb" + dram_db_name = "dram_db" def formattedOutputChannels = channel.of() @@ -128,12 +151,11 @@ workflow DB_SEARCH { if (use_kegg) { ch_combined_query_locs_kegg = ch_mmseqs_query.join(ch_gene_locs) MMSEQS_SEARCH_KEGG( ch_combined_query_locs_kegg, DB_CHANNEL_SETUP.out.ch_kegg_db, params.bit_score_threshold, params.rbh_bit_score_threshold, default_sheet, kegg_name ) - ch_kegg_unformatted = MMSEQS_SEARCH_KEGG.out.mmseqs_search_formatted_out - - SQL_KEGG(ch_kegg_unformatted, kegg_name, ch_sql_descriptions_db) - ch_kegg_formatted = SQL_KEGG.out.sql_formatted_hits + ch_mmseqs_unformatted = MMSEQS_SEARCH_KEGG.out.mmseqs_search_formatted_out - formattedOutputChannels = formattedOutputChannels.mix(ch_kegg_formatted) + SQL_KEGG(ch_mmseqs_unformatted, kegg_name, ch_sql_descriptions_db) + ch_mmseqs_formatted = SQL_KEGG.out.sql_formatted_hits + formattedOutputChannels = 
formattedOutputChannels.mix(ch_mmseqs_formatted) } // KOFAM annotation if (use_kofam) { @@ -146,19 +168,18 @@ workflow DB_SEARCH { true, kofam_name ) - ch_kofam_formatted = HMM_SEARCH_KOFAM.out.formatted_hits - formattedOutputChannels = formattedOutputChannels.mix(ch_kofam_formatted) + ch_hmm_formatted = HMM_SEARCH_KOFAM.out.formatted_hits + formattedOutputChannels = formattedOutputChannels.mix(ch_hmm_formatted) } // PFAM annotation if (use_pfam) { ch_combined_query_locs_pfam = ch_mmseqs_query.join(ch_gene_locs) MMSEQS_SEARCH_PFAM( ch_combined_query_locs_pfam, DB_CHANNEL_SETUP.out.ch_pfam_mmseqs_db, params.bit_score_threshold, params.rbh_bit_score_threshold, default_sheet, pfam_name ) - ch_pfam_unformatted = MMSEQS_SEARCH_PFAM.out.mmseqs_search_formatted_out + ch_mmseqs_unformatted = MMSEQS_SEARCH_PFAM.out.mmseqs_search_formatted_out - SQL_PFAM(ch_pfam_unformatted, pfam_name, ch_sql_descriptions_db) - ch_pfam_formatted = SQL_PFAM.out.sql_formatted_hits - - formattedOutputChannels = formattedOutputChannels.mix(ch_pfam_formatted) + SQL_PFAM(ch_mmseqs_unformatted, pfam_name, ch_sql_descriptions_db) + ch_mmseqs_formatted = SQL_PFAM.out.sql_formatted_hits + formattedOutputChannels = formattedOutputChannels.mix(ch_mmseqs_formatted) } // dbCAN annotation if (use_dbcan) { @@ -171,10 +192,10 @@ workflow DB_SEARCH { false, dbcan_name ) - ch_dbcan_unformatted = HMM_SEARCH_DBCAN.out.formatted_hits - SQL_DBCAN(ch_dbcan_unformatted, dbcan_name, ch_sql_descriptions_db) - ch_dbcan_formatted = SQL_DBCAN.out.sql_formatted_hits - formattedOutputChannels = formattedOutputChannels.mix(ch_dbcan_formatted) + ch_hmm_unformatted = HMM_SEARCH_DBCAN.out.formatted_hits + SQL_DBCAN(ch_hmm_unformatted, dbcan_name, ch_sql_descriptions_db) + ch_hmm_formatted = SQL_DBCAN.out.sql_formatted_hits + formattedOutputChannels = formattedOutputChannels.mix(ch_hmm_formatted) } // dbCAN3 annotation if (use_dbcan3) { @@ -187,9 +208,8 @@ workflow DB_SEARCH { false, dbcan3_name ) - ch_dbcan3_formatted = 
HMM_SEARCH_DBCAN3.out.formatted_hits - formattedOutputChannels = formattedOutputChannels.mix(ch_dbcan3_formatted) - + ch_hmm_formatted = HMM_SEARCH_DBCAN3.out.formatted_hits + formattedOutputChannels = formattedOutputChannels.mix(ch_hmm_formatted) HMM_SEARCH_DBCAN3_SUB ( ch_combined_proteins_locs, @@ -199,8 +219,8 @@ workflow DB_SEARCH { false, dbcan3_sub_name ) - ch_dbcan3_sub_formatted = HMM_SEARCH_DBCAN3_SUB.out.formatted_hits - formattedOutputChannels = formattedOutputChannels.mix(ch_dbcan3_sub_formatted) + ch_hmm_formatted = HMM_SEARCH_DBCAN3_SUB.out.formatted_hits + formattedOutputChannels = formattedOutputChannels.mix(ch_hmm_formatted) } // CAMPER annotation if (use_camper) { @@ -214,15 +234,14 @@ workflow DB_SEARCH { false, camper_name ) - ch_camper_hmm_formatted = HMM_SEARCH_CAMPER.out.formatted_hits - formattedOutputChannels = formattedOutputChannels.mix(ch_camper_hmm_formatted) + ch_hmm_formatted = HMM_SEARCH_CAMPER.out.formatted_hits + formattedOutputChannels = formattedOutputChannels.mix(ch_hmm_formatted) // MMseqs ch_combined_query_locs_camper = ch_mmseqs_query.join(ch_gene_locs) MMSEQS_SEARCH_CAMPER( ch_combined_query_locs_camper, DB_CHANNEL_SETUP.out.ch_camper_mmseqs_db, params.bit_score_threshold, params.rbh_bit_score_threshold, DB_CHANNEL_SETUP.out.ch_camper_mmseqs_list, camper_name ) - ch_camper_mmseqs_formatted = MMSEQS_SEARCH_CAMPER.out.mmseqs_search_formatted_out - - formattedOutputChannels = formattedOutputChannels.mix(ch_camper_mmseqs_formatted) + ch_mmseqs_formatted = MMSEQS_SEARCH_CAMPER.out.mmseqs_search_formatted_out + formattedOutputChannels = formattedOutputChannels.mix(ch_mmseqs_formatted) } // FeGenie annotation if (use_fegenie) { @@ -235,25 +254,23 @@ workflow DB_SEARCH { false, fegenie_name ) - ch_fegenie_formatted = HMM_SEARCH_FEGENIE.out.formatted_hits - formattedOutputChannels = formattedOutputChannels.mix(ch_fegenie_formatted) + ch_hmm_formatted = HMM_SEARCH_FEGENIE.out.formatted_hits + formattedOutputChannels = 
formattedOutputChannels.mix(ch_hmm_formatted) } // Methyl annotation if (use_methyl) { ch_combined_query_locs_methyl = ch_mmseqs_query.join(ch_gene_locs) MMSEQS_SEARCH_METHYL( ch_combined_query_locs_methyl, DB_CHANNEL_SETUP.out.ch_methyl_db, params.bit_score_threshold, params.rbh_bit_score_threshold, default_sheet, methyl_name ) - ch_methyl_mmseqs_formatted = MMSEQS_SEARCH_METHYL.out.mmseqs_search_formatted_out - - formattedOutputChannels = formattedOutputChannels.mix(ch_methyl_mmseqs_formatted) + ch_mmseqs_formatted = MMSEQS_SEARCH_METHYL.out.mmseqs_search_formatted_out + formattedOutputChannels = formattedOutputChannels.mix(ch_mmseqs_formatted) } // CANT-HYD annotation if (use_canthyd) { // MMseqs ch_combined_query_locs_canthyd = ch_mmseqs_query.join(ch_gene_locs) MMSEQS_SEARCH_CANTHYD( ch_combined_query_locs_canthyd, DB_CHANNEL_SETUP.out.ch_canthyd_mmseqs_db, params.bit_score_threshold, params.rbh_bit_score_threshold, DB_CHANNEL_SETUP.out.ch_canthyd_mmseqs_list, canthyd_name ) - ch_canthyd_mmseqs_formatted = MMSEQS_SEARCH_CANTHYD.out.mmseqs_search_formatted_out - - formattedOutputChannels = formattedOutputChannels.mix(ch_canthyd_mmseqs_formatted) + ch_mmseqs_formatted = MMSEQS_SEARCH_CANTHYD.out.mmseqs_search_formatted_out + formattedOutputChannels = formattedOutputChannels.mix(ch_mmseqs_formatted) //HMM ch_combined_proteins_locs = ch_called_proteins.join(ch_gene_locs) @@ -265,9 +282,8 @@ workflow DB_SEARCH { false, canthyd_name ) - ch_canthyd_hmm_formatted = HMM_SEARCH_CANTHYD.out.formatted_hits - formattedOutputChannels = formattedOutputChannels.mix(ch_canthyd_hmm_formatted) - + ch_hmm_formatted = HMM_SEARCH_CANTHYD.out.formatted_hits + formattedOutputChannels = formattedOutputChannels.mix(ch_hmm_formatted) } // Sulfur annotation if (use_sulfur) { @@ -280,30 +296,28 @@ workflow DB_SEARCH { false, sulfur_name ) - ch_sulfur_formatted = HMM_SEARCH_SULFUR.out.formatted_hits - formattedOutputChannels = formattedOutputChannels.mix(ch_sulfur_formatted) + 
ch_hmm_formatted = HMM_SEARCH_SULFUR.out.formatted_hits + formattedOutputChannels = formattedOutputChannels.mix(ch_hmm_formatted) } // MEROPS annotation if (use_merops) { ch_combined_query_locs_merops = ch_mmseqs_query.join(ch_gene_locs) MMSEQS_SEARCH_MEROPS( ch_combined_query_locs_merops, DB_CHANNEL_SETUP.out.ch_merops_db, params.bit_score_threshold, params.rbh_bit_score_threshold, default_sheet, merops_name ) - ch_merops_unformatted = MMSEQS_SEARCH_MEROPS.out.mmseqs_search_formatted_out + ch_mmseqs_unformatted = MMSEQS_SEARCH_MEROPS.out.mmseqs_search_formatted_out - SQL_MEROPS(ch_merops_unformatted, merops_name, ch_sql_descriptions_db) - ch_merops_formatted = SQL_MEROPS.out.sql_formatted_hits - - formattedOutputChannels = formattedOutputChannels.mix(ch_merops_formatted) + SQL_MEROPS(ch_mmseqs_unformatted, merops_name, ch_sql_descriptions_db) + ch_mmseqs_formatted = SQL_MEROPS.out.sql_formatted_hits + formattedOutputChannels = formattedOutputChannels.mix(ch_mmseqs_formatted) } // Uniref annotation if (use_uniref) { ch_combined_query_locs_uniref = ch_mmseqs_query.join(ch_gene_locs) MMSEQS_SEARCH_UNIREF( ch_combined_query_locs_uniref, DB_CHANNEL_SETUP.out.ch_uniref_db, params.bit_score_threshold, params.rbh_bit_score_threshold, default_sheet, uniref_name ) - ch_uniref_unformatted = MMSEQS_SEARCH_UNIREF.out.mmseqs_search_formatted_out - - SQL_UNIREF(ch_uniref_unformatted, uniref_name, ch_sql_descriptions_db) - ch_uniref_formatted = SQL_UNIREF.out.sql_formatted_hits + ch_mmseqs_unformatted = MMSEQS_SEARCH_UNIREF.out.mmseqs_search_formatted_out - formattedOutputChannels = formattedOutputChannels.mix(ch_uniref_formatted) + SQL_UNIREF(ch_mmseqs_unformatted, uniref_name, ch_sql_descriptions_db) + ch_mmseqs_formatted = SQL_UNIREF.out.sql_formatted_hits + formattedOutputChannels = formattedOutputChannels.mix(ch_mmseqs_formatted) } // Metals annotation if (use_metals) { @@ -316,8 +330,45 @@ workflow DB_SEARCH { false, metals_name ) - ch_metals_formatted = 
HMM_SEARCH_METALS.out.formatted_hits - formattedOutputChannels = formattedOutputChannels.mix(ch_metals_formatted) + ch_hmm_formatted = HMM_SEARCH_METALS.out.formatted_hits + formattedOutputChannels = formattedOutputChannels.mix(ch_hmm_formatted) + } + // antiSMASH + if (use_antismash) { + ANTISMASH_ANTISMASH(ch_antismash_map, DB_CHANNEL_SETUP.out.ch_antismash_db, ch_gene_gff) + } + // RGI with CARD + if (use_rgi) { + + RGI_MAIN(ch_rgi_map, DB_CHANNEL_SETUP.out.ch_card_db, []) + } + // CARD annotation + if (use_card) { + ch_combined_query_locs_card = ch_mmseqs_query.join(ch_gene_locs) + MMSEQS_SEARCH_CARD( ch_combined_query_locs_card, DB_CHANNEL_SETUP.out.ch_card_db, params.bit_score_threshold, params.rbh_bit_score_threshold, default_sheet, card_name ) + ch_mmseqs_formatted = MMSEQS_SEARCH_CARD.out.mmseqs_search_formatted_out + formattedOutputChannels = formattedOutputChannels.mix(ch_mmseqs_formatted) + } + // TCDB annotation + if (use_tcdb) { + ch_combined_query_locs_tcdb = ch_mmseqs_query.join(ch_gene_locs) + MMSEQS_SEARCH_TCDB( ch_combined_query_locs_tcdb, DB_CHANNEL_SETUP.out.ch_tcdb_db, params.bit_score_threshold, params.rbh_bit_score_threshold, default_sheet, tcdb_name ) + ch_mmseqs_formatted = MMSEQS_SEARCH_TCDB.out.mmseqs_search_formatted_out + formattedOutputChannels = formattedOutputChannels.mix(ch_mmseqs_formatted) + } + // DRAM DB annotation + if (use_dram_db) { + ch_combined_proteins_locs = ch_called_proteins.join(ch_gene_locs) + HMM_SEARCH_DRAM_DB ( + ch_combined_proteins_locs, + "", // No e value, skip e value flag + DB_CHANNEL_SETUP.out.ch_dram_db, + ch_dram_db_hmm_list, + false, + dram_db_name + ) + ch_hmm_formatted = HMM_SEARCH_DRAM_DB.out.formatted_hits + formattedOutputChannels = formattedOutputChannels.mix(ch_hmm_formatted) } // VOGdb annotation if (use_vog) { @@ -330,19 +381,18 @@ workflow DB_SEARCH { false, vogdb_name ) - ch_vog_formatted = HMM_SEARCH_VOG.out.formatted_hits - formattedOutputChannels = 
formattedOutputChannels.mix(ch_vog_formatted) + ch_hmm_formatted = HMM_SEARCH_VOG.out.formatted_hits + formattedOutputChannels = formattedOutputChannels.mix(ch_hmm_formatted) } // Viral annotation if (params.use_viral) { ch_combined_query_locs_viral = ch_mmseqs_query.join(ch_gene_locs) MMSEQS_SEARCH_VIRAL( ch_combined_query_locs_viral, DB_CHANNEL_SETUP.out.ch_viral_db, params.bit_score_threshold, params.rbh_bit_score_threshold,default_sheet, viral_name ) - ch_viral_unformatted = MMSEQS_SEARCH_VIRAL.out.mmseqs_search_formatted_out - - SQL_VIRAL(ch_viral_unformatted, viral_name, ch_sql_descriptions_db) - ch_viral_formatted = SQL_VIRAL.out.sql_formatted_hits + ch_mmseqs_unformatted = MMSEQS_SEARCH_VIRAL.out.mmseqs_search_formatted_out - formattedOutputChannels = formattedOutputChannels.mix(ch_viral_formatted) + SQL_VIRAL(ch_mmseqs_unformatted, viral_name, ch_sql_descriptions_db) + ch_mmseqs_formatted = SQL_VIRAL.out.sql_formatted_hits + formattedOutputChannels = formattedOutputChannels.mix(ch_mmseqs_formatted) } fastas = formattedOutputChannels.map { it[1] }.collect() genes = ch_called_proteins.map { it[1] }.collect() @@ -371,6 +421,11 @@ workflow DB_CHANNEL_SETUP { use_merops use_uniref use_metals + use_antismash + use_rgi + use_card + use_tcdb + use_dram_db use_vog @@ -380,6 +435,8 @@ workflow DB_CHANNEL_SETUP { ch_kegg_db = Channel.empty() ch_kofam_db = Channel.empty() ch_dbcan_db = Channel.empty() + ch_dbcan3_db = Channel.empty() + ch_dbcan3_sub_db = Channel.empty() ch_camper_hmm_db = Channel.empty() ch_camper_mmseqs_db = Channel.empty() ch_camper_mmseqs_list = Channel.empty() @@ -389,6 +446,10 @@ workflow DB_CHANNEL_SETUP { ch_sulfur_db = Channel.empty() ch_uniref_db = Channel.empty() ch_metals_db = Channel.empty() + ch_antismash_db = Channel.empty() + ch_card_db = Channel.empty() + ch_tcdb_db = Channel.empty() + ch_dram_db = Channel.empty() ch_methyl_db = Channel.empty() ch_fegenie_db = Channel.empty() ch_canthyd_hmm_db = Channel.empty() @@ -449,6 +510,33 @@ 
workflow DB_CHANNEL_SETUP { ch_metals_db = file(params.metals_db).exists() ? file(params.metals_db) : error("Error: If using --annotate, you must supply prebuilt databases. METALS database file not found at ${params.metals_db}") } + if (use_antismash) { + ch_antismash_db = file(params.antismash_db).exists() ? file(params.antismash_db) : error("Error: If using --annotate, you must supply prebuilt databases. antismash database file not found at ${params.antismash_db}") + } + + if (use_rgi || use_card) { + ch_card_db = file(params.card_db).exists() ? file(params.card_db) : error("Error: If using --annotate, you must supply prebuilt databases. CARD/RGI database file not found at ${params.card_db}") + // the RGI software uses the raw FASTA, but for the CARD search we use the mmseqs database + if (use_card) { + index_mmseqs = true + } + } + + if (use_tcdb) { + ch_tcdb_db = file(params.tcdb_db).exists() ? file(params.tcdb_db) : error("Error: If using --annotate, you must supply prebuilt databases. tcdb database file not found at ${params.tcdb_db}") + index_mmseqs = true + } + + if (use_dram_db) { + if (!file(params.dram_db).exists()) { + error("Error: If using --annotate, you must supply prebuilt databases. dram database file not found at ${params.dram_db}") + } + // ch_dram_db = [file("${params.dram_db}/dram_db.hmm")] + // ch_dram_db = [file(params.dram_db)] + ch_dram_db = file(params.dram_db) + // ch_dram_db = file(params.dram_db).exists() ? file(params.dram_db) : error("Error: If using --annotate, you must supply prebuilt databases. dram database file not found at ${params.dram_db}") + } + if (use_methyl) { ch_methyl_db = file(params.methyl_db).exists() ? file(params.methyl_db) : error("Error: If using --annotate, you must supply prebuilt databases. 
METHYL database file not found at ${params.methyl_db}") index_mmseqs = true @@ -489,6 +577,10 @@ workflow DB_CHANNEL_SETUP { ch_sulfur_db ch_uniref_db ch_metals_db + ch_antismash_db + ch_card_db + ch_tcdb_db + ch_dram_db ch_methyl_db ch_fegenie_db ch_canthyd_hmm_db diff --git a/subworkflows/local/utils_nfcore_dram_pipeline/main.nf b/subworkflows/local/utils_nfcore_dram_pipeline/main.nf index 58b86c77..c72af0a2 100644 --- a/subworkflows/local/utils_nfcore_dram_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_dram_pipeline/main.nf @@ -67,10 +67,10 @@ workflow PIPELINE_INITIALISATION { if (params.anno_dbs != "") { anno_dbs = params.anno_dbs.tokenize(',').collect { it.trim().toLowerCase() } value_for_all = 'all' - use_kegg = getDBFlag(anno_dbs, 'kegg', value_for_all) - use_kofam = getDBFlag(anno_dbs, 'kofam', value_for_all) - use_fegenie = getDBFlag(anno_dbs, 'fegenie', value_for_all) - use_sulfur = getDBFlag(anno_dbs, 'sulfur', value_for_all) + use_kegg = getDBFlag(anno_dbs, 'kegg', value_for_all, params.kegg_db) + use_kofam = getDBFlag(anno_dbs, 'kofam', value_for_all, params.kofam_db) + use_fegenie = getDBFlag(anno_dbs, 'fegenie', value_for_all, params.fegenie_db) + use_sulfur = getDBFlag(anno_dbs, 'sulfur', value_for_all, params.sulfur_db) // use_pfam = getDBFlag(anno_dbs, 'pfam', value_for_all) // PFAM database is currently disabled in this pipeline due to a bug in the DRAM2 implementation with the PFAM database. It will be re-enabled in a future release. 
} diff --git a/subworkflows/local/utils_pipeline_setup.nf b/subworkflows/local/utils_pipeline_setup.nf index 5e40ab25..e208198d 100644 --- a/subworkflows/local/utils_pipeline_setup.nf +++ b/subworkflows/local/utils_pipeline_setup.nf @@ -8,8 +8,12 @@ include { getWorkflowVersion } from '../nf-core/utils_nfcore_pipeline' */ -def getDBFlag(db_list, db_name, value_for_all) { +def getDBFlag(db_list, db_name, value_for_all, db_path) { if (db_list.contains(value_for_all)) { + if (!file(db_path).exists()) { + log.warn("Database $db_name not found at path $db_path, skipping") + return false + } return true } else if (db_list.contains(db_name)) { return true diff --git a/workflows/dram.nf b/workflows/dram.nf index 53481ba2..ecf903ea 100644 --- a/workflows/dram.nf +++ b/workflows/dram.nf @@ -73,6 +73,7 @@ workflow DRAM { use_kegg = params.use_kegg use_kofam = params.use_kofam use_dbcan = params.use_dbcan + use_dbcan3 = params.use_dbcan3 use_camper = params.use_camper use_fegenie = params.use_fegenie use_methyl = params.use_methyl @@ -82,26 +83,35 @@ workflow DRAM { use_merops = params.use_merops use_uniref = params.use_uniref use_metals = params.use_metals + use_antismash = params.use_antismash + use_rgi = params.use_rgi + use_card = params.use_card + use_tcdb = params.use_tcdb + use_dram_db = params.use_dram_db use_vog = params.use_vog - if (params.anno_dbs != "") { anno_dbs = params.anno_dbs.tokenize(',').collect { it.trim().toLowerCase() } value_for_all = 'all' - use_kegg = getDBFlag(anno_dbs, 'kegg', value_for_all) - use_kofam = getDBFlag(anno_dbs, 'kofam', value_for_all) - use_dbcan = getDBFlag(anno_dbs, 'dbcan', value_for_all) - use_dbcan3 = getDBFlag(anno_dbs, 'dbcan3', value_for_all) - use_camper = getDBFlag(anno_dbs, 'camper', value_for_all) - use_fegenie = getDBFlag(anno_dbs, 'fegenie', value_for_all) - use_methyl = getDBFlag(anno_dbs, 'methyl', value_for_all) - use_canthyd = getDBFlag(anno_dbs, 'canthyd', value_for_all) - use_sulfur = getDBFlag(anno_dbs, 'sulfur', 
value_for_all) + use_kegg = getDBFlag(anno_dbs, 'kegg', value_for_all, params.kegg_db) + use_kofam = getDBFlag(anno_dbs, 'kofam', value_for_all, params.kofam_db) + use_dbcan = getDBFlag(anno_dbs, 'dbcan', value_for_all, params.dbcan_db) + use_dbcan3 = getDBFlag(anno_dbs, 'dbcan3', value_for_all, params.dbcan3_db) + use_camper = getDBFlag(anno_dbs, 'camper', value_for_all, params.camper_hmm_db) + use_fegenie = getDBFlag(anno_dbs, 'fegenie', value_for_all, params.fegenie_db) + use_methyl = getDBFlag(anno_dbs, 'methyl', value_for_all, params.methyl_db) + use_canthyd = getDBFlag(anno_dbs, 'canthyd', value_for_all, params.canthyd_hmm_db) + use_sulfur = getDBFlag(anno_dbs, 'sulfur', value_for_all, params.sulfur_db) // use_pfam = getDBFlag(anno_dbs, 'pfam', value_for_all) // PFAM database is currently disabled in this pipeline due to a bug in the DRAM2 implementation with the PFAM database. It will be re-enabled in a future release. - use_merops = getDBFlag(anno_dbs, 'merops', value_for_all) - use_uniref = getDBFlag(anno_dbs, 'uniref', value_for_all) - use_metals = getDBFlag(anno_dbs, 'metals', value_for_all) - use_vog = getDBFlag(anno_dbs, 'vog', value_for_all) + use_merops = getDBFlag(anno_dbs, 'merops', value_for_all, params.merops_db) + use_uniref = getDBFlag(anno_dbs, 'uniref', value_for_all, params.uniref_db) + use_metals = getDBFlag(anno_dbs, 'metals', value_for_all, params.metals_db) + use_antismash = getDBFlag(anno_dbs, 'antismash', value_for_all, params.antismash_db) + use_rgi = getDBFlag(anno_dbs, 'rgi', value_for_all, params.card_db) + use_card = getDBFlag(anno_dbs, 'card', value_for_all, params.card_db) + use_tcdb = getDBFlag(anno_dbs, 'tcdb', value_for_all, params.tcdb_db) + use_dram_db = getDBFlag(anno_dbs, 'dram_db', value_for_all, params.dram_db) + use_vog = getDBFlag(anno_dbs, 'vog', value_for_all, params.vog_db) } @@ -241,6 +251,11 @@ workflow DRAM { use_merops, use_uniref, use_metals, + use_antismash, + use_rgi, + use_card, + use_tcdb, + use_dram_db, 
use_vog )