Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
04e740e
refactor: move repetitive variants filter to function (#118)
m-huertasp Dec 16, 2025
78b182b
refactor: follow PR153 code refactoring (#118)
m-huertasp Dec 18, 2025
9123b0e
refactor: add docstrings as in PR153 (#118)
m-huertasp Dec 18, 2025
71fe982
add: output BED of flagged positions from FILTERBATCH step (#118)
m-huertasp Dec 18, 2025
6776d1e
Merge remote-tracking branch 'origin/dev' into 118-reapply-mask-disca…
m-huertasp Jan 9, 2026
6999cc1
refactor: separate CREATEPANELS in two steps
m-huertasp Jan 12, 2026
9bcc9ba
refactor: emit consensus and exons bed from ANNOTATEPANEL (#118)
m-huertasp Jan 13, 2026
7a547cb
refactor: improve comments in ANNOTATEPANEL (#118)
m-huertasp Jan 13, 2026
2e7e866
refactor: correct bed coordinates
m-huertasp Jan 13, 2026
e744435
fix: bug for publishDir
m-huertasp Jan 13, 2026
1d813bd
feature: add filtering step to CREATE PANELS
m-huertasp Jan 14, 2026
8d77fa6
bug: wrongly calling CREATEPANELS output in OMEGA (#118)
m-huertasp Jan 14, 2026
0921f59
refactor: nanoseq masks to flagged bed (#118)
m-huertasp Jan 15, 2026
79332d2
fix: add header to removed variants (#118)
m-huertasp Jan 15, 2026
4db3273
fix: correct column names in removed variants file (#118)
m-huertasp Jan 19, 2026
13d3544
documentation: added documentation for ANNOTATEPANELS (#118)
m-huertasp Jan 19, 2026
9f607bf
refactor: apply copilot's suggestions (#118)
m-huertasp Jan 19, 2026
a206746
refactor: rename ANNOTATEPANELS (#118)
m-huertasp Jan 23, 2026
d0c35bc
refactor: remove sample panels (#118)
m-huertasp Jan 23, 2026
3493e08
refactor: rename removed_variants to removed_sites (#118)
m-huertasp Jan 23, 2026
1aec319
refactor: rename removed_variants to removed_sites (#118)
m-huertasp Jan 27, 2026
bd7e247
refactor: add masking and refactor (#118)
m-huertasp Jan 27, 2026
01d5672
refactor: move functions to utils (#118)
m-huertasp Jan 27, 2026
83c9777
refactor: rename flagged_muts to flagged_positions (#118)
m-huertasp Jan 27, 2026
43e718c
feature: new module to extract flagged positions per sample (#118)
m-huertasp Jan 27, 2026
3a9d3d0
refactor: remove input to annotatedepths (#118)
m-huertasp Jan 27, 2026
ab2df56
bug: removed input.csv from ANNOTATEDEPTHS inputs (#118)
m-huertasp Jan 27, 2026
78086ab
refactor: use filter_criteria(_somatic) to flag positions (#118)
m-huertasp Jan 27, 2026
781cba9
feature: create mask matrix from sample's bed files (#118)
m-huertasp Jan 28, 2026
1041375
refactor: add create mask matrix to MUTPREPROCESSING (#118)
m-huertasp Jan 28, 2026
8d0b1f2
refactor: use matrix instead of bed files for filtering (#118)
m-huertasp Jan 28, 2026
d33e14c
tests: added tests for mask matrix related operations (#118)
m-huertasp Jan 28, 2026
8f8f7ba
refactor: move mask matrix related to same script (#118)
m-huertasp Jan 28, 2026
642178e
refactor: add all filters and the extraction function (#118)
m-huertasp Jan 28, 2026
cdf4b72
refactor: redo filtering strategy to depth = 0 (#118)
m-huertasp Jan 28, 2026
dffa9f3
refactor: rename for repo consistency (#118)
m-huertasp Jan 30, 2026
28a54ba
refactor: extract flagged positions all cohort (#118)
m-huertasp Jan 30, 2026
36cbfb1
refactor: bed file from FILTERBATCH (#118)
m-huertasp Jan 30, 2026
2241b77
refactor: use filter_criteria to extract flagged bed (#118)
m-huertasp Jan 30, 2026
c521f95
refactor: matrix created with sample and cohort positions (#118)
m-huertasp Jan 30, 2026
2b3d5b5
fix: solving empty filters -> no bed (#118)
m-huertasp Jan 30, 2026
478b0b5
refactor: don't publish mask matrix (#118)
m-huertasp Jan 30, 2026
10920c1
refactor: apply copilot's suggestions (#118)
m-huertasp Jan 30, 2026
359e44b
refactor: use filter parameters directly
m-huertasp Feb 2, 2026
9d80fb2
refactor: change from pandas to polars (#118)
m-huertasp Feb 6, 2026
d865490
refactor: obtain masks from write_maf (#118)
m-huertasp Feb 6, 2026
730dc6f
refactor: change flagged bed creation to WRITEMAF (#118)
m-huertasp Feb 10, 2026
4641c74
refactor: move filter_maf to utils_filter (#118)
m-huertasp Feb 11, 2026
a0a0742
refactor: apply review comments (#118)
m-huertasp Feb 11, 2026
3ab3a86
add: output only cohort-wide applied filters in bed (#118)
m-huertasp Feb 11, 2026
c44278c
refactor: adapt code to avoid merge conflicts (#118)
m-huertasp Feb 11, 2026
ee084cd
Merge remote-tracking branch 'origin/dev' into 118-reapply-mask-disca…
m-huertasp Feb 11, 2026
a6bdfaa
fix: double module inclusion (#118)
m-huertasp Feb 11, 2026
5ad035e
chore: change container version (#118)
m-huertasp Feb 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
191 changes: 191 additions & 0 deletions bin/create_mask_matrix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
#!/usr/bin/env python

"""
Create Position per Sample Mask Matrix from BED Files

This script aggregates per-sample flagged position BED files into a single
mask matrix where rows are positions and columns are samples.

Each sample-specific BED file (*.flagged-pos.bed) contains positions that should
be masked only for that particular sample.

The output is a TSV with 1/0 values indicating whether a position should be kept (1)
or masked (0) for each sample.

Command-line Arguments
----------------------
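--bed-files : path (repeatable)
    Sample-specific flagged position BED file (<sample>.flagged-pos.bed).
    Repeat the option once per file; the sample name is taken from the
    file name with the ".flagged-pos.bed" suffix removed.

Usage
-----
create_mask_matrix.py \\
    --bed-files sample1.flagged-pos.bed \\
    --bed-files sample2.flagged-pos.bed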
"""

import logging
from pathlib import Path

import click
import pandas as pd

# Logging: configure the root logger once at import time (INFO level,
# timestamped records) and expose a script-specific logger for this file.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %I:%M:%S %p",
    format="%(asctime)s | %(levelname)s | %(name)s - %(message)s",
)
LOG = logging.getLogger("create_mask_matrix")

def add_bed_positions(bed_df: pd.DataFrame,
                      sample_name: str,
                      masked_positions: set,
                      mask_data: list) -> int:
    """
    Expand BED intervals into per-position mask entries for one sample.

    Every position covered by an interval of ``bed_df`` yields one
    ``{'CHROM', 'POS', 'SAMPLE', 'KEEP': 0}`` record, unless the same
    (CHROM, POS, SAMPLE) combination was already registered.

    Both ``masked_positions`` and ``mask_data`` are modified in place.

    Parameters
    ----------
    bed_df : pd.DataFrame
        BED dataframe exposing CHROM, START and END columns
    sample_name : str
        Sample the expanded positions are attributed to
    masked_positions : set
        (CHROM, POS, SAMPLE) tuples seen so far; new keys are added here
    mask_data : list
        Accumulator receiving one dictionary per newly masked position

    Returns
    -------
    int
        How many records this call appended to ``mask_data``
    """
    size_before = len(mask_data)

    for interval in bed_df.itertuples():
        chrom = interval.CHROM
        # NOTE(review): END is treated as inclusive (START..END). Standard BED
        # is 0-based half-open -- confirm this matches how the BED is written.
        for position in range(interval.START, interval.END + 1):
            entry_key = (chrom, position, sample_name)
            if entry_key in masked_positions:
                # Already recorded for this sample (e.g. overlapping intervals)
                continue
            mask_data.append({
                'CHROM': chrom,
                'POS': position,
                'SAMPLE': sample_name,
                'KEEP': 0,
            })
            masked_positions.add(entry_key)

    # Exactly one record is appended per new key, so the growth of the list
    # equals the number of entries added.
    return len(mask_data) - size_before

def prepare_matrix(mask_data: list) -> pd.DataFrame:
    """
    Build the position per sample mask matrix from collected mask data.

    Parameters
    ----------
    mask_data : list
        List of dictionaries with CHROM, POS, SAMPLE, KEEP keys

    Returns
    -------
    pd.DataFrame
        Wide matrix with CHROM and POS columns followed by one integer
        column per sample present in ``mask_data``. A cell holds the KEEP
        value recorded for that position and sample, or 1 (keep) when the
        position was not recorded for the sample. Rows are sorted by CHROM
        and POS. An empty ``mask_data`` yields an empty frame with only the
        CHROM and POS columns.
    """
    # Nothing collected: there are no CHROM/POS/SAMPLE columns to pivot on
    if not mask_data:
        return pd.DataFrame(columns=['CHROM', 'POS'])

    # Create dataframe from collected data
    mask_df = pd.DataFrame(mask_data)

    # Pivot to matrix format: rows = (CHROM, POS), columns = samples
    mask_matrix = mask_df.pivot_table(
        index=['CHROM', 'POS'],
        columns='SAMPLE',
        values='KEEP',
        fill_value=1,  # Positions not in a sample's BED = we should keep
        aggfunc='first',  # In case of duplicates, take first
    )

    # Pivoting introduces NaN for missing cells, which turns those columns
    # into floats; depending on the pandas version the fill does not cast
    # them back. Force integers so the output always holds 1/0, not 1.0/0.0.
    mask_matrix = mask_matrix.astype(int)

    # Reset index to make CHROM and POS regular columns
    mask_matrix = mask_matrix.reset_index()

    # Sort by chromosome and position for readability
    mask_matrix = mask_matrix.sort_values(['CHROM', 'POS']).reset_index(drop=True)

    return mask_matrix


def create_mask_matrix(bed_files: list,
                       output_path: str = "flagged_positions.mask.tsv.gz") -> None:
    """
    Create a position per sample mask matrix from per-sample BED files.

    Each BED file is read as a headerless, tab-separated table with CHROM,
    START, END and FILTER columns. The sample name is the file name without
    its extension and without the ".flagged-pos" marker. The resulting
    matrix is written as a gzip-compressed TSV.

    Files that cannot be processed are skipped with a warning rather than
    aborting the run. Samples without any flagged position do not get a
    column in the matrix.

    Parameters
    ----------
    bed_files : list
        List of paths to sample-specific BED files
    output_path : str, optional
        Destination of the gzip-compressed TSV matrix
        (default: "flagged_positions.mask.tsv.gz" in the working directory)
    """
    LOG.info(f"Processing {len(bed_files)} BED files...")

    # Track already-masked positions using a set for lookups
    masked_positions = set()  # Set of (CHROM, POS, SAMPLE) tuples to avoid duplicates within sample
    mask_data = []

    # Process sample-specific BED files
    sample_count = 0
    for bed_file in bed_files:
        sample_name = Path(bed_file).stem.replace('.flagged-pos', '')

        try:
            # CHROM is forced to string so that files holding only numeric
            # chromosome names do not yield integer keys that would differ
            # from the string keys of other files (e.g. 1 vs "1").
            bed_df = pd.read_csv(bed_file, sep="\t", header=None,
                                 names=["CHROM", "START", "END", "FILTER"],
                                 dtype={"CHROM": str})

            if bed_df.empty:
                LOG.info(f"No flagged positions for {sample_name}")
                continue

            # Add sample-specific positions
            sample_specific = add_bed_positions(bed_df, sample_name, masked_positions, mask_data)

            LOG.info(f"{sample_name}: {len(bed_df)} positions in BED, {sample_specific} unique entries added")
            sample_count += sample_specific

        except pd.errors.EmptyDataError:
            # A zero-byte BED makes read_csv raise instead of returning an
            # empty frame; it simply means nothing was flagged for the sample.
            LOG.info(f"No flagged positions for {sample_name}")
            continue

        except Exception as e:
            # Best effort: a problematic file must not abort the whole cohort
            LOG.warning(f"Could not process {bed_file}: {e}")
            continue

    LOG.info(f"Total mask entries: {sample_count}")

    # Handle case where no positions need masking
    if not mask_data:
        LOG.info("No positions to mask across all samples, creating empty matrix")
        empty_df = pd.DataFrame(columns=['CHROM', 'POS'])
        empty_df.to_csv(output_path, sep="\t", index=False, compression='gzip')
        return

    # Create matrix from collected data
    mask_matrix = prepare_matrix(mask_data)

    # Save to compressed TSV
    mask_matrix.to_csv(output_path, sep="\t", index=False, compression='gzip')
    LOG.info(f"Mask matrix saved to: {output_path}")


# NOTE: the docstring of `main` is printed by click as the --help text, so it
# is part of the program's output and is kept as a short user-facing sentence.
@click.command()
@click.option(
    '--bed-files',
    type=click.Path(exists=True),
    required=True,
    multiple=True,  # the option is given once per BED file
    help='Sample-specific flagged position BED files',
)
def main(bed_files: tuple):
    """
    Create mask matrix from sample BED files.
    """
    # click collects the repeated option into a tuple; the worker takes a list
    bed_paths = list(bed_files)
    try:
        create_mask_matrix(bed_paths)
    except Exception as err:
        # Record the failure in the log, then let it propagate so the
        # process still terminates with an error.
        LOG.error(f"Error creating mask matrix: {err}")
        raise


if __name__ == '__main__':
    main()
Loading