Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions conf/modules/bedtools_intersect.config
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
process {
withName: BEDTOOLS_INTERSECT {
ext.args = ''
ext.prefix = { "${intervals1.baseName}" }
ext.suffix = 'targeted.bedGraph'
withName: FILTER_BEDGRAPH_TARGETS {
ext.prefix = { "${bedgraph.baseName}" }
publishDir = [
[
path: { params.aligner == 'bismark' ? "${params.outdir}/bismark/methylation_calls/bedGraph" : "${params.outdir}/methyldackel" },
Expand Down
14 changes: 14 additions & 0 deletions conf/modules/bedtools_intersect_cov.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
process {
withName: BEDTOOLS_INTERSECT_COV {
ext.args = ''
ext.prefix = { "${intervals1.baseName}" }
ext.suffix = 'targeted.cov'
publishDir = [
[
path: { "${params.outdir}/bismark/methylation_calls/methylation_coverage" },
mode: params.publish_dir_mode,
pattern: "*targeted.cov"
]
]
}
}
1 change: 1 addition & 0 deletions conf/subworkflows/targeted_sequencing.config
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
includeConfig "../modules/bedtools_intersect.config"
includeConfig "../modules/bedtools_intersect_cov.config"
includeConfig "../modules/picard_bedtointervallist.config"
includeConfig "../modules/picard_createsequencedictionary.config"
includeConfig "../modules/picard_collecthsmetrics.config"
57 changes: 57 additions & 0 deletions modules/local/filter_bedgraph_targets.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/*
* Filter bedGraph files against target regions with CpG-aware boundary handling.
*
* Bismark bedGraph files use 0-based half-open coordinates for single-C positions
* (e.g., chr1 10788 10789 for a CpG at 1-based position 10789). When a CpG
* straddles a target boundary — C just outside, G just inside — a naive
* intersection drops it because the single-C interval [10788, 10789) does not
* overlap the target [10789, …).
*
* This process extends each bedGraph entry by 1 bp on the right to represent the
* full CpG dinucleotide, intersects with the target BED, then restores the
* original single-base coordinates.
*/
process FILTER_BEDGRAPH_TARGETS {
tag "${meta.id}"
label 'process_single'

conda "bioconda::bedtools=2.31.1"
container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
? 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0'
: 'biocontainers/bedtools:2.31.1--hf5e1c6e_0'}"

input:
tuple val(meta), path(bedgraph), path(targets)

output:
tuple val(meta), path("*.targeted.bedGraph"), emit: intersect
path "versions.yml", emit: versions

when:
task.ext.when == null || task.ext.when

script:
def prefix = task.ext.prefix ?: "${meta.id}"
"""
# Decompress and extend end coordinate by 1 bp so the interval covers the full CpG dinucleotide
# Skip any track/header lines to avoid corrupting them
zcat ${bedgraph} | awk 'BEGIN{OFS="\t"} /^track/{print; next} {if(NF>=3) \$3=\$3+1; print}' > extended.bedGraph

# Intersect with target regions (-wa: write original -a entry; -u: unique hits only)
bedtools intersect \\
-a extended.bedGraph \\
-b ${targets} \\
-wa \\
-u \\
> intersected.bedGraph

# Restore original single-base end coordinates
awk 'BEGIN{OFS="\t"} /^track/{print; next} {if(NF>=3) \$3=\$3-1; print}' intersected.bedGraph \\
> ${prefix}.targeted.bedGraph

cat <<-END_VERSIONS > versions.yml
"${task.process}":
bedtools: \$(bedtools --version | sed -e "s/bedtools v//g")
END_VERSIONS
"""
}
77 changes: 46 additions & 31 deletions subworkflows/local/targeted_sequencing/main.nf
Original file line number Diff line number Diff line change
@@ -1,47 +1,60 @@
/*
* targeted_sequencing subworkflow
*
* Filters bedGraph files with the target_regions BED file so they contain positions only listed in the BED.
* Filters bedGraph and coverage files with the target_regions BED file so they contain positions only listed in the BED.
* If specified, it also generates some performance metrics with Picard CollectHsMetrics, such as Fold-80 Base Penalty,
* HS Library Size, Percent Duplicates, and Percent Off Bait. This is relevant for methylome experiments with targeted seq.
*/

include { BEDTOOLS_INTERSECT } from '../../../modules/nf-core/bedtools/intersect/main'
include { PICARD_CREATESEQUENCEDICTIONARY } from '../../../modules/nf-core/picard/createsequencedictionary/main'
include { PICARD_BEDTOINTERVALLIST } from '../../../modules/nf-core/picard/bedtointervallist/main'
include { PICARD_COLLECTHSMETRICS } from '../../../modules/nf-core/picard/collecthsmetrics/main'
include { FILTER_BEDGRAPH_TARGETS } from '../../../modules/local/filter_bedgraph_targets'
include { BEDTOOLS_INTERSECT as BEDTOOLS_INTERSECT_COV } from '../../../modules/nf-core/bedtools/intersect/main'
include { PICARD_CREATESEQUENCEDICTIONARY } from '../../../modules/nf-core/picard/createsequencedictionary/main'
include { PICARD_BEDTOINTERVALLIST } from '../../../modules/nf-core/picard/bedtointervallist/main'
include { PICARD_COLLECTHSMETRICS } from '../../../modules/nf-core/picard/collecthsmetrics/main'

workflow TARGETED_SEQUENCING {

take:
ch_bedgraph // channel: [ val(meta), [ bedGraph(s) ]] when bwameth, [ val(meta), bedGraph ] when bismark
ch_target_regions // channel: path(target_regions.bed)
ch_fasta // channel: [ [:], /path/to/genome.fa]
ch_fasta_index // channel: [ val(meta), /path/to/genome.fa.fai]
ch_bam // channel: [ val(meta), [ bam ] ] ## BAM from alignment
ch_bai // channel: [ val(meta), [ bai ] ] ## BAI from alignment
ch_gzi // channel: [ val(meta), [ gzi ] ] ## GZI from fasta
collecthsmetrics // boolean: whether to run Picard CollectHsMetrics
ch_bedgraph // channel: [ val(meta), [ bedGraph(s) ]] when bwameth, [ val(meta), bedGraph ] when bismark
ch_coverage // channel: [ val(meta), cov.gz ] from Bismark methylation extractor (empty for bwameth)
ch_target_regions // channel: path(target_regions.bed)
ch_fasta // channel: [ [:], /path/to/genome.fa]
ch_fasta_index // channel: [ val(meta), /path/to/genome.fa.fai]
ch_bam // channel: [ val(meta), [ bam ] ] ## BAM from alignment
ch_bai // channel: [ val(meta), [ bai ] ] ## BAI from alignment
ch_gzi // channel: [ val(meta), [ gzi ] ] ## GZI from fasta
collecthsmetrics // boolean: whether to run Picard CollectHsMetrics

main:

ch_versions = Channel.empty()
ch_picard_metrics = Channel.empty()

/*
* Intersect bedGraph files with target regions
* Ensure ch_bedgraph contains the bedGraph file(s) in an array and split into individual bedGraphs
* Intersect bedGraph files with target regions (CpG-aware boundary handling)
* Ensure ch_bedgraph contains the bedGraph file(s) in an array and split into individual bedGraphs.
* The FILTER_BEDGRAPH_TARGETS process extends single-C intervals by 1 bp before
* intersection so that CpGs straddling a target boundary are not lost.
*/
ch_bedgraphs_target = ch_bedgraph
.map { meta, bedgraphs -> tuple(meta, bedgraphs instanceof List ? bedgraphs : [bedgraphs]) }
.flatMap { meta, bedgraphs -> bedgraphs.collect { bedgraph -> [meta, bedgraph] } }
.combine(ch_target_regions)

BEDTOOLS_INTERSECT(
ch_bedgraphs_target,
[[:], []]
FILTER_BEDGRAPH_TARGETS(ch_bedgraphs_target)
ch_versions = ch_versions.mix(FILTER_BEDGRAPH_TARGETS.out.versions)

/*
* Intersect Bismark coverage files with target regions
* The .cov.gz files are filtered the same way as bedGraph files so that
* downstream tools (methylKit, bsseq, DSS) receive only on-target CpGs.
*/
ch_coverage_target = ch_coverage.combine(ch_target_regions)

BEDTOOLS_INTERSECT_COV(
ch_coverage_target,
[[:], []],
)
ch_versions = ch_versions.mix(BEDTOOLS_INTERSECT.out.versions)
ch_versions = ch_versions.mix(BEDTOOLS_INTERSECT_COV.out.versions)

/*
* Run Picard CollectHSMetrics
Expand All @@ -65,7 +78,7 @@ workflow TARGETED_SEQUENCING {
PICARD_BEDTOINTERVALLIST(
target_regions_with_meta,
ch_sequence_dictionary,
[]
[],
)
ch_intervals = PICARD_BEDTOINTERVALLIST.out.intervallist.map { it[1] }
ch_versions = ch_versions.mix(PICARD_BEDTOINTERVALLIST.out.versions)
Expand All @@ -75,34 +88,36 @@ workflow TARGETED_SEQUENCING {
* Note: Using the same intervals for both target and bait as they are typically
* the same for targeted methylation sequencing experiments
*/
ch_picard_inputs = ch_bam.join(ch_bai)
ch_picard_inputs = ch_bam
.join(ch_bai)
.combine(ch_intervals)
.combine(ch_intervals)
.combine(ch_fasta)
.combine(ch_fasta_index)
.combine(ch_sequence_dictionary)
.combine(ch_gzi)
.multiMap { meta, bam, bai, intervals1, intervals2, meta_fasta, fasta, meta_fasta_index, fasta_index, meta_dict, dict, meta_gzi, gzi ->
bam_etc: [ meta, bam, bai, intervals1, intervals2 ] // intervals: baits, targets
fasta: [ meta_fasta, fasta ]
fasta_index: [ meta_fasta_index, fasta_index ]
dict: [ meta_dict, dict ]
gzi : [ meta_gzi, gzi ]
bam_etc: [meta, bam, bai, intervals1, intervals2]
fasta: [meta_fasta, fasta]
fasta_index: [meta_fasta_index, fasta_index]
dict: [meta_dict, dict]
gzi: [meta_gzi, gzi]
}

PICARD_COLLECTHSMETRICS(
ch_picard_inputs.bam_etc,
ch_picard_inputs.fasta,
ch_picard_inputs.fasta_index,
ch_picard_inputs.dict,
ch_picard_inputs.gzi
ch_picard_inputs.gzi,
)
ch_picard_metrics = PICARD_COLLECTHSMETRICS.out.metrics
ch_versions = ch_versions.mix(PICARD_COLLECTHSMETRICS.out.versions)
}

emit:
bedgraph_filtered = BEDTOOLS_INTERSECT.out.intersect // channel: [ val(meta), path("*.bedGraph") ]
picard_metrics = ch_picard_metrics // channel: [ val(meta), path("*_metrics") ]
versions = ch_versions // channel: path("*.version.txt")
bedgraph_filtered = FILTER_BEDGRAPH_TARGETS.out.intersect // channel: [ val(meta), path("*.targeted.bedGraph") ]
coverage_filtered = BEDTOOLS_INTERSECT_COV.out.intersect // channel: [ val(meta), path("*.cov") ]
picard_metrics = ch_picard_metrics // channel: [ val(meta), path("*_metrics") ]
versions = ch_versions // channel: path("*.version.txt")
}
Loading
Loading