Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,27 @@

All notable changes to this project will be documented in this file.

## 2.0.0-beta28 - 2026-04-15

[cdfe210](https://github.com/WrightonLabCSU/DRAM/commit/cdfe210ca64eb95baf6f1acedb62f91b74630181)...[e07cd74](https://github.com/WrightonLabCSU/DRAM/commit/e07cd74e8d60fca7513f645c04d0956760c74768)

### Features

- Add antiSMASH, CARD, RGI, TCDB ([8d08d1f](https://github.com/WrightonLabCSU/DRAM/commit/8d08d1f9d54fb139eb53587754e569c4317ddc37))

Add antiSMASH nextflow module, right now just collect antismash
raw output while we work on incorporating raw output into
larger pipeline
Add rgi nextflow module, right now like antiSMASH, only
collect raw output while we work on incorporating
ADD CARD db processing with mmseqs
ADD TCDB processing with mmseqs

- Add DRAM DB HMMs ([e07cd74](https://github.com/WrightonLabCSU/DRAM/commit/e07cd74e8d60fca7513f645c04d0956760c74768))

Add DRAM team curated HMM database as new annotation db option.
Work in progress and testing database, but can be found on GLOBUS.

## 2.0.0-beta27 - 2026-03-18

[f03804b](https://github.com/WrightonLabCSU/DRAM/commit/f03804bca43b15e55731316c00b1c34ac328c62c)...[7d9a12d](https://github.com/WrightonLabCSU/DRAM/commit/7d9a12d225c577a6b2fb0c4d7b1ba60a5588e1e8)
Expand Down
2 changes: 2 additions & 0 deletions bin/combine_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,8 @@ def combine_annotations(annotations_dir, genes_dir, output, threads):
combined_data[FASTA_COLUMN] = combined_data[FASTA_COLUMN].where(
mask, other=combined_data[FASTA_COLUMN + "2"]
)
# TODO: fix the merge so it doesn't make this column
combined_data = combined_data.drop(columns=FASTA_COLUMN + "2")

combined_data = convert_bit_scores_to_numeric(combined_data)

Expand Down
26 changes: 18 additions & 8 deletions bin/hmm_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,11 +196,6 @@ def main(hmm_domtbl, hmm_info_path, ec_from_info, gene_locs, db_name, output):
hits["perc_cov"] = (hits["model_end"] - hits["model_start"] + 1) / hits[
"query_length"
]
hits[f"{db_name}_id"] = hits["query_name"].str.replace(r".hmm", "", regex=True)
all_hits = get_all_hits(hits, db_name)
all_hits.name = f"{db_name}_ids"
hits = hits.merge(all_hits, how="left", left_on="query_id", right_index=True)

hmm_sheet = False
if hmm_info_path is not None:
hmm_sheet = True
Expand Down Expand Up @@ -228,8 +223,11 @@ def main(hmm_domtbl, hmm_info_path, ec_from_info, gene_locs, db_name, output):
pass
elif "definition" in hmm_info.columns:
hmm_info = hmm_info.rename(columns={"definition": "description"})
elif pd.api.types.is_string_dtype(hmm_info.iloc[:, -1]):
hmm_info = hmm_info.rename(columns={hmm_info.columns[-1]: "description"})
elif (
pd.api.types.is_string_dtype(hmm_info.iloc[:, -1])
and hmm_info.columns[-1] not in merge_cols
): # don't need to worry about description in merge cols, cause already checked
hmm_info["description"] = hmm_info[hmm_info.columns[-1]].copy()
else:
raise_on_ec = True

Expand All @@ -243,10 +241,13 @@ def main(hmm_domtbl, hmm_info_path, ec_from_info, gene_locs, db_name, output):
)

merge_cols = [col for col in merge_cols if col in hmm_info.columns]

print(hmm_info.columns)
print(hmm_info)
hits = hits.merge(
hmm_info[merge_cols], how="left", left_on="query_name", right_index=True
)
print(hits.columns)
print(hits)
hits_sig = sig_scores_row_by_row(hits, db_name=db_name)
drop_cols = [
col
Expand All @@ -268,6 +269,15 @@ def main(hmm_domtbl, hmm_info_path, ec_from_info, gene_locs, db_name, output):
# df.to_csv(output, index=False)
return

hits_sig[f"{db_name}_id"] = hits_sig["query_name"].str.replace(
r".hmm", "", regex=True
)
all_hits_sig = get_all_hits(hits_sig, db_name)
all_hits_sig.name = f"{db_name}_ids"
hits_sig = hits_sig.merge(
all_hits_sig, how="left", left_on="query_id", right_index=True
)

# Get the best hit
# hits_sig = hits_sig.sort_values(['full_evalue', "domain_ievalue", "perc_cov"], ascending=[True, True, False]).drop_duplicates(subset=["query_id"])
hits_sig = hits_sig.sort_values(
Expand Down
18 changes: 14 additions & 4 deletions bin/hmm_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,25 +19,35 @@
help="Path to the input fasta to search against",
)
@click.option("--e_value", type=float, help="e value cutoff for filtering")
@click.option("--t_value", type=float, help="bitscore cutoff for filtering")
@click.option(
"--output_file",
type=click.Path(),
help="Path to output file",
)
@click.option("--cpus", type=int, help="number of cpu core to run HMMER with")
def main(hmm, input_file, e_value, output_file, cpus):
def main(hmm, input_file, e_value, t_value, output_file, cpus):
t1 = time.time()

hmm = Path(hmm)

hmm_paths = hmm.parent.glob(hmm.name)
if hmm.is_dir(): # if directory passed, glob all hmms in dir
hmm = hmm / "*.hmm"
if "*" in str(hmm) or "?" in str(hmm): # check if path is glob path
hmm_paths = hmm.parent.glob(hmm.name)
else:
hmm_paths = [hmm]

hmms = []
for path in hmm_paths:
with pyhmmer.plan7.HMMFile(path) as hmm_file:
hmms.extend(hmm_file)

print(hmms)
kw = {}
if t_value:
kw["T"] = t_value
elif e_value:
kw["E"] = e_value

with open(output_file, "wb") as out_fh:
with pyhmmer.easel.SequenceFile(
Expand All @@ -46,7 +56,7 @@ def main(hmm, input_file, e_value, output_file, cpus):
seqs = pyhmmer.easel.DigitalSequenceBlock(alphabet)
seqs.extend(sf)
first = True
for hits in pyhmmer.hmmer.hmmsearch(hmms, seqs, cpus=cpus, E=e_value):
for hits in pyhmmer.hmmer.hmmsearch(hmms, seqs, cpus=cpus, **kw):
hits.write(out_fh, format="domains", header=first)
first = False
# total = sum(len(hits) for hits in pyhmmer.hmmer.hmmsearch(hmms, seqs, cpus=8, E=1e-15))
Expand Down
14 changes: 11 additions & 3 deletions bin/utils/click_utils.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,21 @@
#!/usr/bin/env python
def validate_comma_separated(ctx, param, value, split=(",", " "), converter=None):
    """Click option callback: parse a delimiter-separated value into a list.

    Parameters
    ----------
    ctx, param :
        Click context and parameter objects; unused, but required by the
        click callback signature.
    value : str | list | tuple | None
        Raw option value. A list/tuple (e.g. from ``multiple=True``) is
        joined and re-split so every input form gets the same normalization.
    split : str | sequence of str
        Delimiter(s). Each one separates items; they may be mixed freely.
    converter : callable, optional
        Applied to each stripped item (e.g. ``int``).

    Returns
    -------
    list
        Parsed items; ``[]`` for an empty/missing value.
    """
    if not value:
        return []
    if isinstance(value, (list, tuple)):
        joiner = split if isinstance(split, str) else split[0]
        value = joiner.join(value)
    if isinstance(split, str):
        split = [split]
    # Collapse every delimiter onto one sentinel so a single split() pass
    # handles any mix of delimiters.
    sentinel = "|SENTINEL|"
    for sep in split:
        value = value.replace(sep, sentinel)
    items = []
    for raw in value.split(sentinel):
        item = raw.strip()
        if not item:
            # Adjacent delimiters (e.g. "a, b" with both "," and " " as
            # separators) would otherwise yield empty tokens, which crash
            # converters like int("").
            continue
        items.append(converter(item) if converter else item)
    return items
15 changes: 15 additions & 0 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,25 @@
"https://github.com/nf-core/modules.git": {
"modules": {
"nf-core": {
"antismash/antismash": {
"branch": "master",
"git_sha": "96c57dfd98a0641886a67bd449fe33ee2ec0e374",
"installed_by": ["modules"]
},
"antismash/antismashdownloaddatabases": {
"branch": "master",
"git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46",
"installed_by": ["modules"]
},
"multiqc": {
"branch": "master",
"git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d",
"installed_by": ["modules"]
},
"rgi/main": {
"branch": "master",
"git_sha": "5e748ff2b0f990949081c9e49792622eb3fe9ee9",
"installed_by": ["modules"]
}
}
},
Expand Down
5 changes: 3 additions & 2 deletions modules/local/annotate/hmmsearch.nf
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,13 @@ process HMM_SEARCH {
script:
def args = task.ext.args ?: ""
def ec_flag = ec_from_info ? "--ec_from_info" : ""
def cutoff_flag = e_value ? "--e_value ${e_value}" : ""

"""
hmm_search.py \\
--hmm ${database_loc}/*.hmm \\
--hmm ${database_loc} \\
--input_file ${fasta} \\
--e_value ${e_value} \\
${cutoff_flag} \\
--output_file ${input_fasta}_hmmsearch.out \\
--cpus ${task.cpus}

Expand Down
10 changes: 5 additions & 5 deletions modules/local/annotate/mmseqs_search.nf
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,14 @@ process MMSEQS_SEARCH {
# Perform search
mmseqs search query_database/${input_fasta}.mmsdb ${db_name}.mmsdb mmseqs_out/${input_fasta}_${db_name}.mmsdb mmseqs_out/tmp --threads ${task.cpus}

# Filter to only best hit
mmseqs filterdb mmseqs_out/${input_fasta}_${db_name}.mmsdb mmseqs_out/${input_fasta}_${db_name}_tophit.mmsdb --extract-lines 1

# Filter to only hits with minimum bit score
mmseqs filterdb --filter-column 2 --comparison-operator ge --comparison-value ${bit_score_threshold} --threads ${task.cpus} mmseqs_out/${input_fasta}_${db_name}_tophit.mmsdb mmseqs_out/${input_fasta}_${db_name}_tophit_minbitscore${bit_score_threshold}.mmsdb
mmseqs filterdb --filter-column 2 --comparison-operator ge --comparison-value ${bit_score_threshold} --threads ${task.cpus} mmseqs_out/${input_fasta}_${db_name}.mmsdb mmseqs_out/${input_fasta}_${db_name}.mmsdb

# Filter to only best hit
mmseqs filterdb mmseqs_out/${input_fasta}_${db_name}.mmsdb mmseqs_out/${input_fasta}_${db_name}.mmsdb --extract-lines 1

# Convert results to BLAST outformat 6
mmseqs convertalis query_database/${input_fasta}.mmsdb ${db_name}.mmsdb mmseqs_out/${input_fasta}_${db_name}_tophit_minbitscore${bit_score_threshold}.mmsdb mmseqs_out/${input_fasta}___mmseqs_${db_name}.tsv --threads ${task.cpus}
mmseqs convertalis query_database/${input_fasta}.mmsdb ${db_name}.mmsdb mmseqs_out/${input_fasta}_${db_name}.mmsdb mmseqs_out/${input_fasta}___mmseqs_${db_name}.tsv --threads ${task.cpus}

# if statement for kegg rbh goes here
elif [ "${db_name}" == "pfam" ]; then
Expand Down
7 changes: 7 additions & 0 deletions modules/nf-core/antismash/antismash/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
channels:
- conda-forge
- bioconda
dependencies:
- "bioconda::antismash=8.0.1"
85 changes: 85 additions & 0 deletions modules/nf-core/antismash/antismash/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
// nf-core module wrapping antiSMASH: detect biosynthetic gene clusters (BGCs)
// in an annotated genome/assembly.
// NOTE(review): on-the-fly gene calling is explicitly disabled
// (--genefinding-tool none), so `sequence_input` must already carry gene
// annotations (e.g. GBK), or gene calls must be supplied via `gff`.
process ANTISMASH_ANTISMASH {
    tag "${meta.id}"
    label 'process_medium'

    conda "${moduleDir}/environment.yml"
    container "nf-core/antismash:8.0.1--pyhdfd78af_0"

    input:
    tuple val(meta), path(sequence_input)  // annotated sequence (GBK) or fasta (pair with `gff`)
    path databases                         // pre-downloaded antiSMASH database directory
    path gff                               // optional GFF3 gene calls; empty input disables the flag

    output:
    // Clusterblast/knownclusterblast outputs are optional: they only appear
    // when the corresponding antiSMASH analyses are enabled via task.ext.args.
    tuple val(meta), path("${prefix}/{css,images,js}")                    , emit: html_accessory_files
    tuple val(meta), path("${prefix}/*.gbk")                              , emit: gbk_input
    tuple val(meta), path("${prefix}/*.json")                             , emit: json_results
    tuple val(meta), path("${prefix}/*.log")                              , emit: log
    tuple val(meta), path("${prefix}/*.zip")                              , emit: zip
    tuple val(meta), path("${prefix}/index.html")                         , emit: html
    tuple val(meta), path("${prefix}/regions.js")                         , emit: json_sideloading
    tuple val(meta), path("${prefix}/clusterblast/*_c*.txt")              , emit: clusterblast_file          , optional: true
    tuple val(meta), path("${prefix}/knownclusterblast/region*/ctg*.html"), emit: knownclusterblast_html     , optional: true
    tuple val(meta), path("${prefix}/knownclusterblast/")                 , emit: knownclusterblast_dir      , optional: true
    tuple val(meta), path("${prefix}/knownclusterblast/*_c*.txt")         , emit: knownclusterblast_txt      , optional: true
    tuple val(meta), path("${prefix}/svg/clusterblast*.svg")              , emit: svg_files_clusterblast     , optional: true
    tuple val(meta), path("${prefix}/svg/knownclusterblast*.svg")         , emit: svg_files_knownclusterblast, optional: true
    tuple val(meta), path("${prefix}/*region*.gbk")                       , emit: gbk_results                , optional: true
    tuple val(meta), path("${prefix}/clusterblastoutput.txt")             , emit: clusterblastoutput         , optional: true
    tuple val(meta), path("${prefix}/knownclusterblastoutput.txt")        , emit: knownclusterblastoutput    , optional: true
    path "versions.yml"                                                   , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    // `prefix` is deliberately not `def`-scoped: the output block above reads it.
    prefix = task.ext.prefix ?: "${meta.id}"
    // Only pass --genefinding-gff3 when a GFF was actually provided.
    gff_flag = gff ? "--genefinding-gff3 ${gff}" : ""

    """
    ## We specifically do not include on-the-fly annotations (--genefinding-tool none) as
    ## this should be run as a separate module for versioning purposes

    antismash \\
        ${args} \\
        ${gff_flag} \\
        -c ${task.cpus} \\
        --output-dir ${prefix} \\
        --output-basename ${prefix} \\
        --genefinding-tool none \\
        --logfile ${prefix}/${prefix}.log \\
        --databases ${databases} \\
        ${sequence_input}

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        antismash: \$(echo \$(antismash --version) | sed 's/antiSMASH //;s/-.*//g')
    END_VERSIONS
    """

    stub:
    prefix = task.ext.prefix ?: "${meta.id}"
    // Touch one representative file per output channel so the stub satisfies
    // every non-optional emit above.
    """
    mkdir -p ${prefix}/css
    mkdir ${prefix}/images
    mkdir ${prefix}/js
    touch ${prefix}/NZ_CP069563.1.region001.gbk
    touch ${prefix}/NZ_CP069563.1.region002.gbk
    touch ${prefix}/css/bacteria.css
    touch ${prefix}/genome.gbk
    touch ${prefix}/genome.json
    touch ${prefix}/genome.zip
    touch ${prefix}/images/about.svg
    touch ${prefix}/index.html
    touch ${prefix}/js/antismash.js
    touch ${prefix}/js/jquery.js
    touch ${prefix}/regions.js
    touch ${prefix}/test.log

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        antismash: \$(echo \$(antismash --version) | sed 's/antiSMASH //;s/-.*//g')
    END_VERSIONS
    """
}
Loading
Loading