#!/usr/bin/env python

"""
Create Position per Sample Mask Matrix from BED Files

This script aggregates per-sample flagged position BED files into a single
mask matrix where rows are positions and columns are samples.

Each sample-specific BED file (*.flagged-pos.bed) contains positions that
should be masked only for that particular sample.

The output is a TSV with 1/0 values indicating whether a position should be
kept (1) or masked (0) for each sample.

Command-line Arguments
----------------------
bed-files : str
    Space-separated list of sample-specific BED files

Usage
-----
create_mask_matrix.py \\
    --bed-files sample1.flagged-pos.bed sample2.flagged-pos.bed ...
"""

import logging
from pathlib import Path

import pandas as pd

try:
    import click
except ImportError:  # NOTE(review): lets the helpers be imported without the CLI stack
    click = None

# Logging
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(name)s - %(message)s",
    level=logging.INFO,
    datefmt="%m/%d/%Y %I:%M:%S %p"
)
LOG = logging.getLogger("create_mask_matrix")


def add_bed_positions(bed_df: pd.DataFrame,
                      sample_name: str,
                      masked_positions: set,
                      mask_data: list) -> int:
    """
    Add BED positions to mask data for a specific sample.

    Parameters
    ----------
    bed_df : pd.DataFrame
        BED dataframe with CHROM, START, END, FILTER columns
    sample_name : str
        Sample name to apply masking to
    masked_positions : set
        Set tracking already-masked (CHROM, POS, SAMPLE) tuples; mutated in place
    mask_data : list
        List to append mask entries (dicts with KEEP=0) to; mutated in place

    Returns
    -------
    int
        Number of new entries added
    """
    entries_added = 0

    for row in bed_df.itertuples():
        # NOTE(review): iterates START..END *inclusive*; standard BED intervals
        # are half-open [START, END) — confirm the producer of *.flagged-pos.bed
        # intends both endpoints to be masked.
        for pos in range(row.START, row.END + 1):
            key = (row.CHROM, pos, sample_name)
            if key in masked_positions:
                continue
            mask_data.append({
                'CHROM': row.CHROM,
                'POS': pos,
                'SAMPLE': sample_name,
                'KEEP': 0,
            })
            masked_positions.add(key)
            entries_added += 1

    return entries_added


def prepare_matrix(mask_data: list) -> pd.DataFrame:
    """
    Prepare the mask matrix dataframe from collected mask data.

    Parameters
    ----------
    mask_data : list
        List of dictionaries with CHROM, POS, SAMPLE, KEEP keys

    Returns
    -------
    pd.DataFrame
        Wide matrix with CHROM and POS columns followed by one 0/1 column
        per sample (1 = keep, 0 = mask), sorted by CHROM then POS
    """
    mask_df = pd.DataFrame(mask_data)

    # Pivot to matrix format: rows = (CHROM, POS), columns = samples
    mask_matrix = mask_df.pivot_table(
        index=['CHROM', 'POS'],
        columns='SAMPLE',
        values='KEEP',
        fill_value=1,      # positions not in a sample's BED should be kept
        aggfunc='first'    # in case of duplicates, take first
    )

    # Make CHROM and POS regular columns and sort for readability
    mask_matrix = mask_matrix.reset_index()
    mask_matrix = mask_matrix.sort_values(['CHROM', 'POS']).reset_index(drop=True)

    return mask_matrix


def create_mask_matrix(bed_files: list) -> None:
    """
    Create a position per sample mask matrix from per-sample BED files.

    Writes ``flagged_positions.mask.tsv.gz`` in the working directory. When no
    file contributes positions, an empty (header-only) matrix is written.

    Parameters
    ----------
    bed_files : list
        List of paths to sample-specific BED files
    """
    LOG.info(f"Processing {len(bed_files)} BED files...")

    # Track already-masked (CHROM, POS, SAMPLE) tuples for O(1) duplicate checks
    masked_positions = set()
    mask_data = []

    total_entries = 0  # total mask entries across all samples (not a sample count)
    for bed_file in bed_files:
        # Sample name is the file stem minus the '.flagged-pos' suffix
        sample_name = Path(bed_file).stem.replace('.flagged-pos', '')

        try:
            bed_df = pd.read_csv(bed_file, sep="\t", header=None,
                                 names=["CHROM", "START", "END", "FILTER"])

            if bed_df.empty:
                LOG.info(f"No flagged positions for {sample_name}")
                continue

            entries = add_bed_positions(bed_df, sample_name, masked_positions, mask_data)
            LOG.info(f"{sample_name}: {len(bed_df)} positions in BED, {entries} unique entries added")
            total_entries += entries

        except Exception as e:
            # Best-effort: a malformed file must not abort the whole cohort
            LOG.warning(f"Could not process {bed_file}: {e}")
            continue

    LOG.info(f"Total mask entries: {total_entries}")

    # Handle case where no positions need masking
    if not mask_data:
        LOG.info("No positions to mask across all samples, creating empty matrix")
        empty_df = pd.DataFrame(columns=['CHROM', 'POS'])
        empty_df.to_csv("flagged_positions.mask.tsv.gz", sep="\t", index=False, compression='gzip')
        return

    mask_matrix = prepare_matrix(mask_data)
    mask_matrix.to_csv("flagged_positions.mask.tsv.gz", sep="\t", index=False, compression='gzip')
    LOG.info("Mask matrix saved to: flagged_positions.mask.tsv.gz")


if click is not None:
    @click.command()
    @click.option('--bed-files', multiple=True, required=True, type=click.Path(exists=True),
                  help='Sample-specific flagged position BED files')
    def main(bed_files: tuple):
        """
        Create mask matrix from sample BED files.
        """
        try:
            create_mask_matrix(list(bed_files))
        except Exception as e:
            LOG.error(f"Error creating mask matrix: {e}")
            raise


if __name__ == '__main__':
    if click is None:
        raise SystemExit("click is required to run this script as a CLI")
    main()
#!/usr/bin/env python

"""
Filter Cohort - MAF Processing and Flagging Script

This script processes a Mutation Annotation Format (MAF) file to filter
variants by specific criteria and generates a final filtered MAF.

Command-line Arguments
----------------------
maf_df-file : str
    Path to the gzipped input MAF file.
sample_name : str
    Output sample name.
repetitive-variant-threshold : int
    Minimum occurrences threshold to flag a repetitive variant.
somatic-vaf-boundary : float
    VAF threshold to classify somatic mutations.
n-rich-cohort-proportion : float
    Proportion threshold for n-rich cohort filtering.

Authors
-------
Author : Ferriol Calvet (@FerriolCalvet)
Email : ferriol.calvet@irbbarcelona.org

Contributors
------------
- Raquel Blanco - @rblancomi (raquel.blanco@irbbarcelona.org)
- Federica Brando - @FedericaBrando (federica.brando@irbbarcelona.org)
- Marta Huertas - @m-huertasp (marta.huertas@irbbarcelona.org)

Usage
-----
python filter_cohort.py \\
    --maf-df-file input.maf.tsv.gz \\
    --sample-name sample1 \\
    --repetitive-variant-threshold 5 \\
    --somatic-vaf-boundary 0.4 \\
    --n-rich-cohort-proportion 0.1
"""
import logging
from pathlib import Path

import click
import pandas as pd

from utils import add_filter
from read_utils import custom_na_values
from utils_filter import expand_filter_column

# Logging
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(name)s - %(message)s", level=logging.DEBUG, datefmt="%m/%d/%Y %I:%M:%S %p"
)
LOG = logging.getLogger("filter_cohort")


def flag_repetitive_variants(maf_df: pd.DataFrame,
                             repetitive_variant_threshold: int,
                             somatic_vaf_boundary: float) -> pd.DataFrame:
    """
    Flag repetitive variants in the FILTER column of the MAF dataframe.

    A variant is flagged ``repetitive_variant`` when it appears with a somatic
    VAF in at least ``repetitive_variant_threshold`` samples. Repetitive
    variants that always occur at the same position in the read (a single
    PMEAN value with PSTD == 0) are additionally flagged
    ``repetitive_mapping_variant`` as likely mapping artifacts.

    Parameters
    ----------
    maf_df : pd.DataFrame
        MAF dataframe
    repetitive_variant_threshold : int
        Minimum number of samples a variant must appear in to be considered repetitive
    somatic_vaf_boundary : float
        VAF threshold to classify somatic mutations

    Returns
    -------
    pandas.DataFrame
        MAF dataframe with the FILTER column updated
    """
    max_samples = len(pd.unique(maf_df["SAMPLE_ID"]))
    n_samples = list(range(repetitive_variant_threshold, max_samples + 1))

    # BUGFIX: `n_samples` is a list, so the previous `n_samples == 0` comparison
    # could never be true; test for emptiness instead.
    if not n_samples:
        LOG.warning("Not enough samples to identify potential repetitive variants!")
        return maf_df

    # Work on the somatic subset only (germline-level VAFs are not informative here)
    maf_df_f_somatic = maf_df.loc[maf_df["VAF"] <= somatic_vaf_boundary][
        ["MUT_ID", "SAMPLE_ID", "PMEAN", "PSTD"]
    ].reset_index(drop=True)

    # Count, per variant, the number of (sample) rows it appears in
    variant_counts = maf_df_f_somatic.groupby("MUT_ID").size().reset_index(name="count")

    repetitive_variants = variant_counts[variant_counts["count"] >= repetitive_variant_threshold]["MUT_ID"]
    LOG.info("%s repetitive_variants", len(repetitive_variants))

    # Flag repetitive variants in the original dataframe
    maf_df["repetitive_variant"] = maf_df["MUT_ID"].isin(repetitive_variants)
    maf_df["FILTER"] = maf_df[["FILTER", "repetitive_variant"]].apply(
        lambda x: add_filter(x["FILTER"], x["repetitive_variant"], "repetitive_variant"), axis=1
    )
    maf_df = maf_df.drop("repetitive_variant", axis=1)

    # Entries with usable read-position info: PMEAN present/valid and zero spread
    maf_df_f_somatic_pos_info = maf_df_f_somatic[~(maf_df_f_somatic["PMEAN"].isna()) &
                                                 (maf_df_f_somatic["PMEAN"] != -1) &
                                                 (maf_df_f_somatic["PSTD"] == 0)]

    if maf_df_f_somatic_pos_info.shape[0] == 0:
        LOG.info("No repetitive variants with fixed position found.")
        return maf_df

    # Variants always found in the same read position -> a single PMEAN value
    maf_df_f_somatic_compiled_pos = maf_df_f_somatic_pos_info.groupby("MUT_ID")["PMEAN"].nunique().reset_index()
    variants_with_rep_position = maf_df_f_somatic_compiled_pos[
        maf_df_f_somatic_compiled_pos["PMEAN"] == 1]["MUT_ID"]
    LOG.info("Variants always found in the same position: %d", len(variants_with_rep_position))

    # Only keep those that are also repetitive
    variants_with_rep_position = set(variants_with_rep_position).intersection(set(repetitive_variants))
    LOG.info("Repetitive variants always found in the same position: %d", len(variants_with_rep_position))

    maf_df["repetitive_mapping_variant"] = maf_df["MUT_ID"].isin(variants_with_rep_position)
    LOG.info("%s muts flagged as repetitive_mapping_variant", maf_df["repetitive_mapping_variant"].sum())

    maf_df["FILTER"] = maf_df[["FILTER", "repetitive_mapping_variant"]].apply(
        lambda x: add_filter(x["FILTER"], x["repetitive_mapping_variant"], "repetitive_mapping_variant"), axis=1
    )
    maf_df = maf_df.drop("repetitive_mapping_variant", axis=1)

    return maf_df


def flag_cohort_n_rich(maf_df: pd.DataFrame,
                       n_rich_cohort_proportion: float,
                       somatic_vaf_boundary: float) -> pd.DataFrame:
    """
    Flag FILTER column for cohort_n_rich variants from the MAF dataframe.

    Adds three flags:
    - ``cohort_n_rich``: variant is n_rich in at least
      ``max(2, floor(n_samples * n_rich_cohort_proportion))`` samples
    - ``cohort_n_rich_uni``: variant is n_rich in at least one sample
    - ``cohort_n_rich_threshold``: the row's VAF_Ns reaches the minimum VAF_Ns
      observed among the n_rich occurrences of that variant

    Parameters
    ----------
    maf_df : pandas.DataFrame
        MAF dataframe
    n_rich_cohort_proportion : float
        Proportion of samples to consider a variant as cohort_n_rich
    somatic_vaf_boundary : float
        VAF threshold to classify somatic mutations
        (kept for signature compatibility; currently unused in this function)

    Returns
    -------
    maf_df : pandas.DataFrame
        MAF dataframe with cohort_n_rich variants flagged; the merged
        N_rich_frequency / VAF_Ns_threshold columns remain in the output
    """
    LOG.info("Flagging cohort_n_rich...")
    max_samples = len(pd.unique(maf_df["SAMPLE_ID"]))

    # BUGFIX(review): guard restored — in the refactor the warning/early-return
    # became unconditional, so the function never flagged anything. Verify
    # against the pre-refactor behavior (`if max_samples < 2`).
    if max_samples < 2:
        LOG.warning("Not enough samples to identify cohort_n_rich mutations!")
        return maf_df

    number_of_samples = max(2, (max_samples * n_rich_cohort_proportion) // 1)
    LOG.info(f"Flagging mutations that are n_rich in at least: {number_of_samples} samples as cohort_n_rich")

    # Take only variant and sample info from the df
    maf_df_f = maf_df[["MUT_ID", "SAMPLE_ID", "VAF_Ns", "FILTER"]].reset_index(drop=True)

    # Aggregate n_rich variants: per variant, in how many samples it is n_rich
    # and the smallest VAF_Ns among those occurrences
    n_rich_vars_df = (
        maf_df_f[maf_df_f["FILTER"].str.contains("n_rich")]
        .groupby("MUT_ID")
        .agg(
            N_rich_frequency=('SAMPLE_ID', 'count'),
            VAF_Ns_threshold=('VAF_Ns', 'min')
        )
    )

    # Variants n_rich in at least number_of_samples samples -> cohort_n_rich
    n_rich_vars = set(n_rich_vars_df[n_rich_vars_df['N_rich_frequency'] >= number_of_samples].index)

    maf_df["cohort_n_rich"] = maf_df["MUT_ID"].isin(n_rich_vars)
    LOG.info("%s muts flagged as cohort_n_rich", maf_df["cohort_n_rich"].sum())

    maf_df["FILTER"] = maf_df[["FILTER", "cohort_n_rich"]].apply(
        lambda x: add_filter(x["FILTER"], x["cohort_n_rich"], "cohort_n_rich"), axis=1
    )

    # Variants n_rich in at least 1 sample -> cohort_n_rich_uni in all samples
    n_rich_vars_uni = set(n_rich_vars_df[n_rich_vars_df['N_rich_frequency'] > 0].index)

    maf_df["cohort_n_rich_uni"] = maf_df["MUT_ID"].isin(n_rich_vars_uni)
    LOG.info("%s muts flagged as cohort_n_rich_uni", maf_df["cohort_n_rich_uni"].sum())

    maf_df["FILTER"] = maf_df[["FILTER", "cohort_n_rich_uni"]].apply(
        lambda x: add_filter(x["FILTER"], x["cohort_n_rich_uni"], "cohort_n_rich_uni"), axis=1
    )

    # Rows whose VAF_Ns reaches the per-variant threshold -> cohort_n_rich_threshold
    maf_df = maf_df.merge(n_rich_vars_df, on='MUT_ID', how='left')
    maf_df['N_rich_frequency'] = maf_df['N_rich_frequency'].fillna(0)
    maf_df['VAF_Ns_threshold'] = maf_df['VAF_Ns_threshold'].fillna(1.1)  # 1.1 > any VAF, never triggers

    maf_df["cohort_n_rich_threshold"] = maf_df["VAF_Ns"] >= maf_df['VAF_Ns_threshold']
    LOG.info("%s muts flagged as cohort_n_rich_threshold", maf_df["cohort_n_rich_threshold"].sum())

    maf_df["FILTER"] = maf_df[["FILTER", "cohort_n_rich_threshold"]].apply(
        lambda x: add_filter(x["FILTER"], x["cohort_n_rich_threshold"], "cohort_n_rich_threshold"), axis=1
    )

    # Drop temporary boolean columns
    maf_df = maf_df.drop(["cohort_n_rich", "cohort_n_rich_uni", "cohort_n_rich_threshold"], axis=1)

    return maf_df


def flag_other_samples_snp(maf_df: pd.DataFrame,
                           somatic_vaf_boundary: float) -> pd.DataFrame:
    """
    Flag SNPs from other samples in the MAF dataframe.

    A variant that is germline (VAF, VAF_AM and vd_VAF all above the boundary)
    in any sample is flagged ``other_sample_SNP`` in the rows where it appears
    with a somatic VAF.

    Parameters
    ----------
    maf_df : pandas.DataFrame
        MAF dataframe
    somatic_vaf_boundary : float
        VAF boundary to consider a variant as somatic

    Returns
    -------
    maf_df : pandas.DataFrame
        MAF dataframe with other sample SNPs flagged
    """
    LOG.info("Flagging SNPs from other samples...")
    # Get all germline variants from all samples (unique and non-unique variants)
    germline_vars_all_samples = maf_df.loc[(maf_df["VAF"] > somatic_vaf_boundary) &
                                           (maf_df["VAF_AM"] > somatic_vaf_boundary) &
                                           (maf_df["vd_VAF"] > somatic_vaf_boundary),
                                           "MUT_ID"].unique()

    LOG.info(f"Using all germline variants of all samples, total: {len(germline_vars_all_samples)} variants.")

    # Germline elsewhere but somatic in the current row -> other_sample_SNP
    maf_df["other_sample_SNP"] = False
    maf_df.loc[(maf_df["MUT_ID"].isin(germline_vars_all_samples)) &
               (maf_df["VAF"] <= somatic_vaf_boundary), "other_sample_SNP"] = True
    LOG.info("%s muts flagged as other_sample_SNP", maf_df['other_sample_SNP'].sum())

    maf_df["FILTER"] = maf_df[["FILTER", "other_sample_SNP"]].apply(
        lambda x: add_filter(x["FILTER"], x["other_sample_SNP"], "other_sample_SNP"), axis=1
    )
    maf_df = maf_df.drop("other_sample_SNP", axis=1)

    return maf_df


def flag_gnomad_snp(maf_df: pd.DataFrame) -> pd.DataFrame:
    """
    Flag gnomAD SNPs in the MAF dataframe.

    No-op when the ``gnomAD_SNP`` column is absent.

    Parameters
    ----------
    maf_df : pandas.DataFrame
        MAF dataframe

    Returns
    -------
    maf_df : pandas.DataFrame
        MAF dataframe with gnomAD SNPs flagged
    """
    LOG.info("Flagging gnomAD SNPs...")

    if "gnomAD_SNP" in maf_df.columns:
        # Normalize string/na encodings to plain booleans before flagging
        maf_df["gnomAD_SNP"] = maf_df["gnomAD_SNP"].replace(
            {"True": True, "False": False, '-': False}).fillna(False).astype(bool)
        LOG.info("Out of %d positions, %d are gnomAD SNPs", maf_df["gnomAD_SNP"].shape[0], maf_df["gnomAD_SNP"].sum())

        maf_df["FILTER"] = maf_df[["FILTER", "gnomAD_SNP"]].apply(
            lambda x: add_filter(x["FILTER"], x["gnomAD_SNP"], "gnomAD_SNP"), axis=1
        )
        maf_df = maf_df.drop("gnomAD_SNP", axis=1)

    return maf_df


def flag_vaf_ns_threshold(maf_df: pd.DataFrame, vaf_ns_threshold: float) -> pd.DataFrame:
    """
    Flag variants that have a proportion of Ns higher than vaf_ns_threshold.

    A row is flagged ``high_n_vaf`` when either VAF_Ns or VAF_Ns_AM reaches
    the threshold.

    Parameters
    ----------
    maf_df : pandas.DataFrame
        MAF dataframe
    vaf_ns_threshold : float
        VAF of Ns threshold for filtering variants

    Returns
    -------
    maf_df : pandas.DataFrame
        MAF dataframe with variants exceeding VAF_Ns threshold flagged
    """
    LOG.info("Flagging variants with proportion of Ns higher than VAF_Ns threshold...")

    maf_df["high_n_vaf"] = maf_df[["VAF_Ns", "VAF_Ns_AM"]].ge(vaf_ns_threshold).any(axis=1)
    LOG.info("%s muts flagged as high_n_vaf", maf_df["high_n_vaf"].sum())

    maf_df["FILTER"] = maf_df[["FILTER", "high_n_vaf"]].apply(
        lambda x: add_filter(x["FILTER"], x["high_n_vaf"], "high_n_vaf"), axis=1
    )
    maf_df = maf_df.drop("high_n_vaf", axis=1)

    return maf_df


def flag_distorted_expanded(maf_df: pd.DataFrame) -> pd.DataFrame:
    """
    Flag variants with a distorted VAF distribution.

    No-op when the ``VAF_distorted_expanded_sq`` column is absent.

    Parameters
    ----------
    maf_df : pandas.DataFrame
        MAF dataframe

    Returns
    -------
    maf_df : pandas.DataFrame
        MAF dataframe with variants having distorted VAF distribution flagged
    """
    LOG.info("Flagging variants with distorted VAF distribution...")
    if 'VAF_distorted_expanded_sq' in maf_df.columns:
        maf_df["FILTER"] = maf_df[["FILTER", "VAF_distorted_expanded_sq"]].apply(
            lambda x: add_filter(x["FILTER"], x["VAF_distorted_expanded_sq"], "VAF_distorted_expanded_sq"), axis=1
        )

    return maf_df


def flag_maf(maf_df: pd.DataFrame, sample_name: str,
             repetitive_variant_threshold: int,
             somatic_vaf_boundary: float,
             n_rich_cohort_proportion: float,
             vaf_ns_threshold: float) -> None:
    """
    Run all cohort-level flagging steps on a MAF dataframe and save the result.

    Applies repetitive-variant, cohort_n_rich, high-N-VAF, other-sample SNP,
    gnomAD SNP and (optionally) distorted-VAF flagging, expands the FILTER
    column, and writes ``<sample_name>.cohort.filtered.tsv.gz``.

    Parameters
    ----------
    maf_df : pd.DataFrame
        Input MAF dataframe
    sample_name : str
        Name for the output sample
    repetitive_variant_threshold : int
        Minimum number of occurrences to flag a variant as repetitive
    somatic_vaf_boundary : float
        VAF threshold to distinguish somatic mutations
    n_rich_cohort_proportion : float
        Proportion threshold for n-rich cohort filtering
    vaf_ns_threshold : float
        VAF of Ns threshold for filtering variants
    """
    # Repetitive variants, by frequency and by position-in-read information
    maf_df = flag_repetitive_variants(maf_df, repetitive_variant_threshold, somatic_vaf_boundary)

    # cohort_n_rich variants
    maf_df = flag_cohort_n_rich(maf_df, n_rich_cohort_proportion, somatic_vaf_boundary)

    # Variants with high VAF of Ns
    maf_df = flag_vaf_ns_threshold(maf_df, vaf_ns_threshold)

    # Other samples' SNPs
    maf_df = flag_other_samples_snp(maf_df, somatic_vaf_boundary)

    # gnomAD SNPs
    maf_df = flag_gnomad_snp(maf_df)

    # Distorted VAF distribution (only if the corresponding column is present)
    maf_df = flag_distorted_expanded(maf_df)

    # Expand FILTER column into per-flag boolean columns
    maf_df = expand_filter_column(maf_df)

    # Save final filtered MAF
    maf_df.to_csv(f"{sample_name}.cohort.filtered.tsv.gz", sep="\t", header=True, index=False)

    LOG.info("Cohort flagging complete!")


@click.command()
@click.option('--maf-df-file', required=True, type=click.Path(exists=True), help='Input gzipped MAF file (TSV)')
@click.option('--sample-name', required=True, type=str, help='Sample name for output file')
@click.option('--repetitive-variant-threshold', required=True, type=int, help='Threshold for repetitive variants')
@click.option('--somatic-vaf-boundary', required=True, type=float, help='VAF boundary for somatic variants')
@click.option('--n-rich-cohort-proportion', required=True, type=float, help='Proportion for n-rich cohort filtering')
@click.option('--vaf-ns-threshold', required=False, type=float, default=0.1, help='VAF of Ns threshold for filtering variants')
def main(maf_df_file: str, sample_name: str, repetitive_variant_threshold: int,
         somatic_vaf_boundary: float, n_rich_cohort_proportion: float, vaf_ns_threshold: float):
    """
    CLI wrapper for flag_maf function.
    """
    # Load MAF dataframe
    maf_df = pd.read_csv(maf_df_file, compression='gzip', header=0, sep='\t', na_values=custom_na_values)
    LOG.debug(f"{maf_df_file}")
    # Flag MAF file
    flag_maf(maf_df,
             sample_name,
             repetitive_variant_threshold,
             somatic_vaf_boundary,
             n_rich_cohort_proportion,
             vaf_ns_threshold)


if __name__ == '__main__':
    main()
#!/usr/bin/env python

"""
Merge annotation depths - Depth Annotation and Flagging Script

This script merges a per-position depths table with its annotation table,
optionally applies a position-per-sample mask, and writes per-sample /
per-group / cohort-wide annotated depth files.

Command-line Arguments
----------------------
annotation : str
    Input annotation file (TSV with CHROM, POS, CONTEXT).
depths : str
    Input depths file (TSV with CHROM, POS and one column per sample).
json_file : str, optional
    JSON file mapping group names to lists of sample names.
mask_matrix : str, optional
    Position per sample mask matrix file (1 = keep, 0 = mask).

Authors
-------
Author : Ferriol Calvet (@FerriolCalvet)
Email : ferriol.calvet@irbbarcelona.org

Contributors
------------
- Federica Brando - @FedericaBrando (federica.brando@irbbarcelona.org)
- Marta Huertas - @m-huertasp (marta.huertas@irbbarcelona.org)
"""
import json
import logging
import subprocess

import click
import polars as pl

# Logging
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(name)s - %(message)s", level=logging.DEBUG, datefmt="%m/%d/%Y %I:%M:%S %p"
)
LOG = logging.getLogger("merge_annotation_depths")

# Globals
COLS = ["CHROM", "POS", "CONTEXT"]


def _write_gzipped_tsv(df: pl.DataFrame, basename: str) -> None:
    """
    Write *df* to *basename* as TSV and gzip it in place (producing <basename>.gz).

    Polars' native gzip output is currently not working in this pipeline, so
    the file is written plain and compressed with the external gzip binary.
    """
    df.write_csv(basename, include_header=True, separator="\t")
    subprocess.run(["gzip", "-f", basename], check=True)


def preprocess(annotation_file: str, depths_file: str) -> tuple[pl.DataFrame, list]:
    """
    Merge annotation and depth files.

    Parameters
    ----------
    annotation_file : str
        Path to the annotation file.
    depths_file : str
        Path to the depths file.

    Returns
    -------
    pl.DataFrame
        Merged DataFrame with annotations and depths, columns ordered as
        CHROM, POS, CONTEXT, then one column per sample.
    list
        List of sample column names without the .*.bam suffix.
    """
    LOG.info("Preprocessing annotation and depths files...")
    _depths = pl.read_csv(depths_file, separator="\t")
    _annots = pl.read_csv(annotation_file, separator="\t")

    # Left join keeps every depth row; positions without annotation get '-'
    annot_depth = _depths.join(_annots, on=["CHROM", "POS"], how='left').fill_null(pl.lit('-'))

    sample_columns = [col for col in annot_depth.columns if col not in COLS]

    # Strip the .*.bam suffix from sample column names, when present
    rename_map = {col: col.split('.')[0] if "bam" in col else col for col in sample_columns}

    # All columns except CHROM and CONTEXT must be numeric (POS + depths)
    annot_depth = annot_depth.with_columns([
        pl.col(col).cast(pl.Int64) if col not in ["CHROM", "CONTEXT"] else pl.col(col)
        for col in annot_depth.columns
    ])

    LOG.info("Samples: %s", list(rename_map.values()))

    # CHROM/POS/CONTEXT first, then the (renamed) sample columns
    return annot_depth.select(COLS + sample_columns).rename(rename_map), list(rename_map.values())


def apply_mask_matrix(annotated_depths: pl.DataFrame, mask_matrix_file: str) -> pl.DataFrame:
    """
    Apply position per sample mask matrix to depths.

    Sets depth = 0 where mask value is 0 (position should be masked).
    Keeps depth as-is where mask value is 1 (position should be kept).

    Parameters
    ----------
    annotated_depths : pl.DataFrame
        Depths dataframe with CHROM, POS, CONTEXT, and sample columns
    mask_matrix_file : str
        Path to mask matrix file (TSV, gzipped)

    Returns
    -------
    pl.DataFrame
        Depths with masked positions (where mask=0) set to 0
    """
    LOG.info("Loading mask matrix...")
    mask_df = pl.read_csv(mask_matrix_file, separator="\t")

    if mask_df.is_empty():
        LOG.info("Mask matrix is empty, no masking applied")
        return annotated_depths

    LOG.info(f"Mask matrix loaded: {len(mask_df)} positions")

    # Sample columns present in both annotated_depths and mask_df
    sample_cols = [col for col in mask_df.columns
                   if col in annotated_depths.columns and col not in ["CHROM", "POS"]]

    LOG.info(f"Applying mask to {len(sample_cols)} samples")

    # Join on CHROM and POS; multiply each depth by its 0/1 mask value.
    # Positions absent from the mask keep their original depth (mask -> 1).
    annotated_depths_filtered = (
        annotated_depths.join(
            mask_df.select(["CHROM", "POS", *sample_cols]),
            on=["CHROM", "POS"],
            how="left",
            suffix="_mask"
        )
        .with_columns([
            (pl.col(s) * pl.col(f"{s}_mask").fill_null(1)).alias(s)
            for s in sample_cols
        ])
        .drop([f"{col}_mask" for col in sample_cols])
    )

    return annotated_depths_filtered


def output_annotate_dephts(annotated_depths: pl.DataFrame, json_f: str, samples: list):
    """
    Output annotated depths files.

    Always writes ``all_samples_indv.depths.tsv.gz`` (full per-sample matrix)
    and ``all_samples.depths.annotated.tsv.gz`` (row-wise total over samples).
    When *json_f* is given and readable, additionally writes one summed depths
    file per group; otherwise one file per sample.

    Parameters
    ----------
    annotated_depths : pl.DataFrame
        Annotated depths dataframe
    json_f : str
        JSON file with groups information (may be None/missing)
    samples : list
        List of samples
    """
    LOG.info("Outputting annotated depths file...")

    # Full per-sample matrix
    _write_gzipped_tsv(annotated_depths, "all_samples_indv.depths.tsv")

    if json_f is None:
        groups_info = None
    else:
        try:
            with open(json_f, 'r') as file:
                groups_info = json.load(file)
        except FileNotFoundError:
            groups_info = None

    if groups_info is not None:
        LOG.info("JSON file found. Outputting annotated depths file for each group.")
        # BUGFIX: loop variable renamed so it no longer shadows the `samples`
        # parameter (previously clobbered by `for group_name, samples in ...`).
        for group_name, group_samples in groups_info.items():
            annotated_depths = annotated_depths.with_columns(
                pl.sum_horizontal([pl.col(sample) for sample in group_samples]).alias(group_name)
            )
            _write_gzipped_tsv(annotated_depths.select(COLS + [group_name]),
                               f"{group_name}.depths.annotated.tsv")
    else:
        LOG.warning("JSON file not found. Outputting annotated depths file for each sample.")
        # Per sample
        for sample in samples:
            _write_gzipped_tsv(annotated_depths.select(COLS + [str(sample)]),
                               f"{sample}.depths.annotated.tsv")

    # Cohort-wide total.
    # BUGFIX: sum only the sample columns. Summing pl.exclude(COLS) after the
    # group loop also included the freshly added group columns, double-counting
    # every grouped sample in `all_samples`.
    annotated_depths = annotated_depths.with_columns(
        pl.sum_horizontal([pl.col(str(sample)) for sample in samples]).alias("all_samples"))

    _write_gzipped_tsv(annotated_depths.select(COLS + ["all_samples"]),
                       "all_samples.depths.annotated.tsv")


@click.command()
@click.option('--annotation', type=click.Path(exists=True), help='Input annotation file')
@click.option('--depths', type=click.Path(exists=True), help='Input depths file')
@click.option('--json_file', type=click.Path(exists=True), help='JSON groups file')
@click.option('--mask-matrix', type=click.Path(exists=True), help='Position per sample mask matrix file (1=keep, 0=mask)')
def main(annotation, depths, json_file, mask_matrix):
    """
    Merge annotation and depths, optionally mask positions, and write outputs.
    """
    LOG.info("Annotating depths file...")

    # Preprocess annotation and depths files
    annotated_depths, samples = preprocess(annotation, depths)

    # Apply position masking when a mask matrix is provided.
    # NOTE(review): the tail of this chunk was truncated; the calls below are
    # reconstructed from the visible structure — verify against the original.
    if mask_matrix:
        annotated_depths = apply_mask_matrix(annotated_depths, mask_matrix)

    # Write all output files
    output_annotate_dephts(annotated_depths, json_file, samples)


if __name__ == '__main__':
    main()
annotated_depths = apply_mask_matrix(annotated_depths, mask_matrix) + + output_annotate_dephts(annotated_depths, json_file, samples) -def main(annotation, depths, json_file, input_csv): - click.echo(f"Annotating depths file...") - annotate_depths(annotation, depths, json_file, input_csv) + LOG.info("Done!") if __name__ == '__main__': main() diff --git a/bin/plot_maf.py b/bin/plot_maf.py index b4a70d53..d89287fb 100755 --- a/bin/plot_maf.py +++ b/bin/plot_maf.py @@ -9,7 +9,7 @@ import seaborn as sns from matplotlib.backends.backend_pdf import PdfPages -from utils import filter_maf +from utils_filter import filter_maf from read_utils import custom_na_values diff --git a/bin/subset_maf.py b/bin/subset_maf.py index c4180df7..b01b71ee 100755 --- a/bin/subset_maf.py +++ b/bin/subset_maf.py @@ -5,7 +5,7 @@ import json import pandas as pd -from utils import filter_maf +from utils_filter import filter_maf from read_utils import custom_na_values diff --git a/bin/test/test_mask_matrix.py b/bin/test/test_mask_matrix.py new file mode 100755 index 00000000..6ccc1d7a --- /dev/null +++ b/bin/test/test_mask_matrix.py @@ -0,0 +1,561 @@ +#!/usr/bin/env python3 +""" +Unit tests for create_mask_matrix.py + +Tests the functionality of creating position × sample mask matrices from BED files. 
+""" + +import os +import sys +import tempfile +import shutil +import unittest +from pathlib import Path +import polars as pl + +# Add the bin directory to the path to import the module +sys.path.insert(0, str(Path(__file__).parent.parent)) +from create_mask_matrix import create_mask_matrix +from merge_annotation_depths import apply_mask_matrix + + +class TestCreateMaskMatrix(unittest.TestCase): + """Test suite for create_mask_matrix.py""" + + def setUp(self): + """Set up test fixtures""" + self.test_dir = tempfile.mkdtemp() + self.original_dir = os.getcwd() + os.chdir(self.test_dir) + + def tearDown(self): + """Clean up test fixtures""" + os.chdir(self.original_dir) + shutil.rmtree(self.test_dir) + + def create_mock_bed_file(self, sample_name, positions): + """ + Create a mock BED file with specified positions. + + Parameters + ---------- + sample_name : str + Name of the sample + positions : list of tuples + List of (chrom, start, end, filter) tuples + """ + bed_file = f"{sample_name}.flagged-pos.bed" + with open(bed_file, 'w') as f: + for chrom, start, end, filter_val in positions: + f.write(f"{chrom}\t{start}\t{end}\t{filter_val}\n") + return bed_file + + def read_mask_matrix(self, filename="flagged_positions.mask.tsv.gz"): + """Read and return the mask matrix""" + return pl.read_csv(filename, separator="\t").to_pandas() + + def test_single_sample_single_position(self): + """Test creating mask matrix with one sample and one position""" + bed_files = [ + self.create_mock_bed_file("sample1", [("chr1", 100, 100, "n_rich")]) + ] + + create_mask_matrix(bed_files) + + # Check output exists + self.assertTrue(Path("flagged_positions.mask.tsv.gz").exists()) + + # Check content + mask_df = self.read_mask_matrix() + self.assertEqual(len(mask_df), 1) + self.assertEqual(mask_df.iloc[0]['CHROM'], 'chr1') + self.assertEqual(mask_df.iloc[0]['POS'], 100) + self.assertEqual(mask_df.iloc[0]['sample1'], 0) # Should be masked + + def test_multiple_samples(self): + """Test creating 
mask matrix with multiple samples""" + bed_files = [ + self.create_mock_bed_file("sample1", [("chr1", 100, 100, "n_rich")]), + self.create_mock_bed_file("sample2", [("chr1", 200, 200, "nanoseq_snp")]), + self.create_mock_bed_file("sample3", [("chr2", 300, 300, "nanoseq_noise")]) + ] + + create_mask_matrix(bed_files) + + mask_df = self.read_mask_matrix() + + # Should have 3 positions total + self.assertEqual(len(mask_df), 3) + + # Check that each sample has correct columns + self.assertIn('sample1', mask_df.columns) + self.assertIn('sample2', mask_df.columns) + self.assertIn('sample3', mask_df.columns) + + # Check masking pattern: position should be 0 for its sample, 1 for others + row1 = mask_df[(mask_df['CHROM'] == 'chr1') & (mask_df['POS'] == 100)].iloc[0] + self.assertEqual(row1['sample1'], 0) # Masked + self.assertEqual(row1['sample2'], 1) # Not masked + self.assertEqual(row1['sample3'], 1) # Not masked + + def test_overlapping_positions(self): + """Test when multiple samples have the same position flagged""" + bed_files = [ + self.create_mock_bed_file("sample1", [("chr1", 100, 100, "n_rich")]), + self.create_mock_bed_file("sample2", [("chr1", 100, 100, "nanoseq_snp")]), + ] + + create_mask_matrix(bed_files) + + mask_df = self.read_mask_matrix() + + # Should have only 1 position + self.assertEqual(len(mask_df), 1) + + # Both samples should have 0 for this position + row = mask_df.iloc[0] + self.assertEqual(row['CHROM'], 'chr1') + self.assertEqual(row['POS'], 100) + self.assertEqual(row['sample1'], 0) + self.assertEqual(row['sample2'], 0) + + def test_range_expansion(self): + """Test that ranges are expanded to individual positions (1-based inclusive)""" + bed_files = [ + self.create_mock_bed_file("sample1", [("chr1", 100, 102, "nanoseq_noise")]) + ] + + create_mask_matrix(bed_files) + + mask_df = self.read_mask_matrix() + + # Should have 3 positions: 100, 101, 102 (1-based inclusive) + self.assertEqual(len(mask_df), 3) + positions = 
sorted(mask_df['POS'].tolist()) + self.assertEqual(positions, [100, 101, 102]) + + # All should be masked for sample1 + for pos in [100, 101, 102]: + row = mask_df[mask_df['POS'] == pos].iloc[0] + self.assertEqual(row['sample1'], 0) + + def test_empty_bed_file(self): + """Test handling of empty BED file""" + # Create an empty BED file + bed_file = "sample1.flagged-pos.bed" + with open(bed_file, 'w') as f: + pass # Empty file + + bed_files = [bed_file] + create_mask_matrix(bed_files) + + # Should create empty matrix with just CHROM and POS columns + mask_df = self.read_mask_matrix() + self.assertEqual(len(mask_df), 0) + self.assertIn('CHROM', mask_df.columns) + self.assertIn('POS', mask_df.columns) + + def test_multiple_chromosomes(self): + """Test handling of multiple chromosomes""" + bed_files = [ + self.create_mock_bed_file("sample1", [ + ("chr1", 100, 100, "n_rich"), + ("chr2", 200, 200, "nanoseq_snp"), + ("chrX", 300, 300, "nanoseq_noise") + ]) + ] + + create_mask_matrix(bed_files) + + mask_df = self.read_mask_matrix() + + # Should have 3 positions on different chromosomes + self.assertEqual(len(mask_df), 3) + chroms = sorted(mask_df['CHROM'].unique()) + self.assertIn('chr1', chroms) + self.assertIn('chr2', chroms) + self.assertIn('chrX', chroms) + + def test_sorting(self): + """Test that output is sorted by CHROM and POS""" + bed_files = [ + self.create_mock_bed_file("sample1", [ + ("chr2", 200, 200, "nanoseq_noise"), + ("chr1", 150, 150, "nanoseq_snp"), + ("chr1", 100, 100, "n_rich"), + ]) + ] + + create_mask_matrix(bed_files) + + mask_df = self.read_mask_matrix() + + # Check sorting + chroms = mask_df['CHROM'].tolist() + positions = mask_df['POS'].tolist() + + # chr1 should come before chr2 + chr1_indices = [i for i, c in enumerate(chroms) if c == 'chr1'] + chr2_indices = [i for i, c in enumerate(chroms) if c == 'chr2'] + + if chr1_indices and chr2_indices: + self.assertTrue(max(chr1_indices) < min(chr2_indices)) + + # Within chr1, positions should be sorted 
+ chr1_positions = [positions[i] for i in chr1_indices] + self.assertEqual(chr1_positions, sorted(chr1_positions)) + + def test_complex_scenario(self): + """Test a complex scenario with multiple samples, overlaps, and ranges""" + bed_files = [ + self.create_mock_bed_file("sample1", [ + ("chr1", 100, 102, "n_rich"), # 3 positions + ("chr2", 200, 200, "nanoseq_snp"), # 1 position + ]), + self.create_mock_bed_file("sample2", [ + ("chr1", 101, 101, "nanoseq_noise"), # Overlaps with sample1 + ("chr3", 300, 301, "n_rich"), # 2 positions + ]), + self.create_mock_bed_file("sample3", [ + ("chr2", 200, 200, "nanoseq_snp"), # Overlaps with sample1 + ]) + ] + + create_mask_matrix(bed_files) + + mask_df = self.read_mask_matrix() + + # Total unique positions: chr1:100,101,102 + chr2:200 + chr3:300,301 = 6 + self.assertEqual(len(mask_df), 6) + + # Check specific position: chr1:101 (masked in sample1 and sample2) + row_101 = mask_df[(mask_df['CHROM'] == 'chr1') & (mask_df['POS'] == 101)].iloc[0] + self.assertEqual(row_101['sample1'], 0) + self.assertEqual(row_101['sample2'], 0) + self.assertEqual(row_101['sample3'], 1) + + # Check specific position: chr2:200 (masked in sample1 and sample3) + row_200 = mask_df[(mask_df['CHROM'] == 'chr2') & (mask_df['POS'] == 200)].iloc[0] + self.assertEqual(row_200['sample1'], 0) + self.assertEqual(row_200['sample2'], 1) + self.assertEqual(row_200['sample3'], 0) + + def test_sample_name_extraction(self): + """Test that sample names are correctly extracted from filenames""" + # Create BED file with complex name + bed_file = "complex_sample_name-123.flagged-pos.bed" + with open(bed_file, 'w') as f: + f.write("chr1\t100\t100\tn_rich\n") + + create_mask_matrix([bed_file]) + + mask_df = self.read_mask_matrix() + + # Sample name should be extracted correctly + self.assertIn('complex_sample_name-123', mask_df.columns) + + def test_all_samples_empty(self): + """Test when all samples have empty BED files""" + bed_files = [ + 
self.create_mock_bed_file("sample1", []), + self.create_mock_bed_file("sample2", []), + ] + + create_mask_matrix(bed_files) + + # Should create empty matrix + mask_df = self.read_mask_matrix() + self.assertEqual(len(mask_df), 0) + + +class TestApplyMaskMatrix(unittest.TestCase): + """Test suite for apply_mask_matrix function""" + + def setUp(self): + """Set up test fixtures""" + self.test_dir = tempfile.mkdtemp() + self.original_dir = os.getcwd() + os.chdir(self.test_dir) + + def tearDown(self): + """Clean up test fixtures""" + os.chdir(self.original_dir) + shutil.rmtree(self.test_dir) + + def create_mock_depths(self, data): + """ + Create a mock annotated depths dataframe. + + Parameters + ---------- + data : dict + Dictionary with columns as keys and lists as values + + Returns + ------- + pl.DataFrame + Mock depths dataframe + """ + return pl.DataFrame(data) + + def create_mock_mask_matrix(self, data, filename="mask_matrix.tsv.gz"): + """ + Create a mock mask matrix file. + + Parameters + ---------- + data : dict + Dictionary with columns as keys and lists as values + filename : str + Output filename + """ + mask_df = pl.DataFrame(data) + mask_df.write_csv(filename, separator="\t") + return filename + + def test_basic_masking(self): + """Test basic masking: positions with 0 in mask should have depth=0""" + # Create mock depths + depths = self.create_mock_depths({ + 'CHROM': ['chr1', 'chr1', 'chr1'], + 'POS': [100, 200, 300], + 'CONTEXT': ['ACA', 'TCG', 'GGC'], + 'sample1': [50, 60, 70], + 'sample2': [40, 50, 60] + }) + + # Create mask matrix: sample1 position 200 should be masked + mask_file = self.create_mock_mask_matrix({ + 'CHROM': ['chr1', 'chr1', 'chr1'], + 'POS': [100, 200, 300], + 'sample1': [1, 0, 1], # Position 200 masked + 'sample2': [1, 1, 1] # All kept + }) + + # Apply masking + result = apply_mask_matrix(depths, mask_file) + + # Verify results + self.assertEqual(result.filter(pl.col('POS') == 100)['sample1'][0], 50) # Kept + 
self.assertEqual(result.filter(pl.col('POS') == 200)['sample1'][0], 0) # Masked + self.assertEqual(result.filter(pl.col('POS') == 300)['sample1'][0], 70) # Kept + + # sample2 should remain unchanged + self.assertEqual(result.filter(pl.col('POS') == 100)['sample2'][0], 40) + self.assertEqual(result.filter(pl.col('POS') == 200)['sample2'][0], 50) + self.assertEqual(result.filter(pl.col('POS') == 300)['sample2'][0], 60) + + def test_multiple_samples_masked(self): + """Test masking across multiple samples at the same position""" + depths = self.create_mock_depths({ + 'CHROM': ['chr1', 'chr1'], + 'POS': [100, 200], + 'CONTEXT': ['ACA', 'TCG'], + 'sample1': [50, 60], + 'sample2': [40, 50], + 'sample3': [30, 40] + }) + + # Mask chr1:100 for sample1 and sample2 + mask_file = self.create_mock_mask_matrix({ + 'CHROM': ['chr1', 'chr1'], + 'POS': [100, 200], + 'sample1': [0, 1], # 100 masked + 'sample2': [0, 1], # 100 masked + 'sample3': [1, 1] # All kept + }) + + result = apply_mask_matrix(depths, mask_file) + + # Position 100 should be 0 for sample1 and sample2, kept for sample3 + self.assertEqual(result.filter(pl.col('POS') == 100)['sample1'][0], 0) + self.assertEqual(result.filter(pl.col('POS') == 100)['sample2'][0], 0) + self.assertEqual(result.filter(pl.col('POS') == 100)['sample3'][0], 30) + + # Position 200 should be kept for all + self.assertEqual(result.filter(pl.col('POS') == 200)['sample1'][0], 60) + self.assertEqual(result.filter(pl.col('POS') == 200)['sample2'][0], 50) + self.assertEqual(result.filter(pl.col('POS') == 200)['sample3'][0], 40) + + def test_realistic_example(self): + """Test with realistic example from user's data""" + depths = self.create_mock_depths({ + 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1', 'chr1'], + 'POS': [1071610, 11059842, 19851697, 23495229, 23495334, 26729792], + 'CONTEXT': ['ACA', 'TCG', 'GGC', 'TAT', 'CGC', 'AAA'], + 'P19_0001_BDO_01': [45, 50, 55, 60, 65, 70], + 'P19_0001_BTR_01': [35, 40, 45, 50, 55, 60], + 
'P19_0004_BTR_01': [25, 30, 35, 40, 45, 50], + 'P19_0005_BTR_01': [15, 20, 25, 30, 35, 40] + }) + + # Create mask based on user's example + mask_file = self.create_mock_mask_matrix({ + 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1', 'chr1'], + 'POS': [1071610, 11059842, 19851697, 23495229, 23495334, 26729792], + 'P19_0001_BDO_01': [1, 0, 1, 1, 1, 0], + 'P19_0001_BTR_01': [1, 0, 1, 1, 1, 1], + 'P19_0004_BTR_01': [0, 0, 1, 0, 1, 1], + 'P19_0005_BTR_01': [1, 0, 1, 1, 1, 1] + }) + + result = apply_mask_matrix(depths, mask_file) + + # Check specific positions + # chr1:1071610 - P19_0004_BTR_01 should be masked (0) + self.assertEqual(result.filter(pl.col('POS') == 1071610)['P19_0004_BTR_01'][0], 0) + self.assertEqual(result.filter(pl.col('POS') == 1071610)['P19_0001_BDO_01'][0], 45) + + # chr1:11059842 - All samples should be masked except (P19_0001_BTR_01 and others have 0) + self.assertEqual(result.filter(pl.col('POS') == 11059842)['P19_0001_BDO_01'][0], 0) + self.assertEqual(result.filter(pl.col('POS') == 11059842)['P19_0001_BTR_01'][0], 0) + self.assertEqual(result.filter(pl.col('POS') == 11059842)['P19_0004_BTR_01'][0], 0) + self.assertEqual(result.filter(pl.col('POS') == 11059842)['P19_0005_BTR_01'][0], 0) + + # chr1:23495229 - P19_0004_BTR_01 should be masked + self.assertEqual(result.filter(pl.col('POS') == 23495229)['P19_0004_BTR_01'][0], 0) + self.assertEqual(result.filter(pl.col('POS') == 23495229)['P19_0001_BDO_01'][0], 60) + + # chr1:26729792 - P19_0001_BDO_01 should be masked + self.assertEqual(result.filter(pl.col('POS') == 26729792)['P19_0001_BDO_01'][0], 0) + self.assertEqual(result.filter(pl.col('POS') == 26729792)['P19_0001_BTR_01'][0], 60) + + def test_all_positions_kept(self): + """Test when all positions have mask=1 (nothing masked)""" + depths = self.create_mock_depths({ + 'CHROM': ['chr1', 'chr1'], + 'POS': [100, 200], + 'CONTEXT': ['ACA', 'TCG'], + 'sample1': [50, 60], + 'sample2': [40, 50] + }) + + mask_file = self.create_mock_mask_matrix({ + 
'CHROM': ['chr1', 'chr1'], + 'POS': [100, 200], + 'sample1': [1, 1], + 'sample2': [1, 1] + }) + + result = apply_mask_matrix(depths, mask_file) + + # All depths should remain unchanged + expected = depths.select(['CHROM', 'POS', 'sample1', 'sample2']) + actual = result.select(['CHROM', 'POS', 'sample1', 'sample2']) + self.assertTrue(expected.equals(actual)) + + def test_all_positions_masked(self): + """Test when all positions have mask=0 (everything masked)""" + depths = self.create_mock_depths({ + 'CHROM': ['chr1', 'chr1'], + 'POS': [100, 200], + 'CONTEXT': ['ACA', 'TCG'], + 'sample1': [50, 60], + 'sample2': [40, 50] + }) + + mask_file = self.create_mock_mask_matrix({ + 'CHROM': ['chr1', 'chr1'], + 'POS': [100, 200], + 'sample1': [0, 0], + 'sample2': [0, 0] + }) + + result = apply_mask_matrix(depths, mask_file) + + # All depths should be 0 + self.assertTrue(result['sample1'].to_list() == [0, 0]) + self.assertTrue(result['sample2'].to_list() == [0, 0]) + + def test_empty_mask_matrix(self): + """Test handling of empty mask matrix""" + depths = self.create_mock_depths({ + 'CHROM': ['chr1'], + 'POS': [100], + 'CONTEXT': ['ACA'], + 'sample1': [50] + }) + + # Create empty mask matrix + mask_file = self.create_mock_mask_matrix({ + 'CHROM': [], + 'POS': [] + }) + + result = apply_mask_matrix(depths, mask_file) + + # Should return original depths unchanged + self.assertTrue(result.equals(depths)) + + def test_partial_overlap(self): + """Test when mask matrix has only some positions from depths""" + depths = self.create_mock_depths({ + 'CHROM': ['chr1', 'chr1', 'chr1'], + 'POS': [100, 200, 300], + 'CONTEXT': ['ACA', 'TCG', 'GGC'], + 'sample1': [50, 60, 70] + }) + + # Mask only has positions 100 and 200 + mask_file = self.create_mock_mask_matrix({ + 'CHROM': ['chr1', 'chr1'], + 'POS': [100, 200], + 'sample1': [0, 1] + }) + + result = apply_mask_matrix(depths, mask_file) + + # Position 100 should be masked + self.assertEqual(result.filter(pl.col('POS') == 100)['sample1'][0], 
0) + # Position 200 should be kept + self.assertEqual(result.filter(pl.col('POS') == 200)['sample1'][0], 60) + # Position 300 not in mask, should remain unchanged + self.assertEqual(result.filter(pl.col('POS') == 300)['sample1'][0], 70) + + def test_structure_preservation(self): + """Test that CHROM, POS, and CONTEXT columns are preserved""" + depths = self.create_mock_depths({ + 'CHROM': ['chr1', 'chr2'], + 'POS': [100, 200], + 'CONTEXT': ['ACA', 'TCG'], + 'sample1': [50, 60] + }) + + mask_file = self.create_mock_mask_matrix({ + 'CHROM': ['chr1', 'chr2'], + 'POS': [100, 200], + 'sample1': [0, 1] + }) + + result = apply_mask_matrix(depths, mask_file) + + # Check structure is preserved + self.assertListEqual(list(result.columns), ['CHROM', 'POS', 'CONTEXT', 'sample1']) + self.assertListEqual(result['CHROM'].to_list(), ['chr1', 'chr2']) + self.assertListEqual(result['POS'].to_list(), [100, 200]) + self.assertListEqual(result['CONTEXT'].to_list(), ['ACA', 'TCG']) + + def test_zero_depth_remains_zero(self): + """Test that positions already at depth=0 remain at 0 regardless of mask""" + depths = self.create_mock_depths({ + 'CHROM': ['chr1', 'chr1'], + 'POS': [100, 200], + 'CONTEXT': ['ACA', 'TCG'], + 'sample1': [0, 60] # Already 0 + }) + + mask_file = self.create_mock_mask_matrix({ + 'CHROM': ['chr1', 'chr1'], + 'POS': [100, 200], + 'sample1': [1, 0] # Keep 100, mask 200 + }) + + result = apply_mask_matrix(depths, mask_file) + + # Position 100 was already 0, should remain 0 even with mask=1 + self.assertEqual(result.filter(pl.col('POS') == 100)['sample1'][0], 0) + # Position 200 should be masked to 0 + self.assertEqual(result.filter(pl.col('POS') == 200)['sample1'][0], 0) + +if __name__ == '__main__': + unittest.main() diff --git a/bin/utils.py b/bin/utils.py index 1ea2181b..9418d59a 100755 --- a/bin/utils.py +++ b/bin/utils.py @@ -5,9 +5,16 @@ def add_filter(old_filt, add_filt, filt_name): """ - old filt is the current FILTER field value - add_filt is a boolean, either 
True or False - filt_name is the name that should be added in the FILTER field in case the add_filt value is True + Adds a filter to the FILTER field in the MAF dataframe + + Parameters + ---------- + old_filt : str + The current FILTER field value + add_filt : bool + Either True or False + filt_name : str + The name that should be added in the FILTER field in case the add_filt value is True """ if add_filt: if old_filt == "PASS": @@ -23,68 +30,6 @@ def to_int_if_possible(string): except ValueError: return None - -def filter_maf(maf_df, filter_criteria): - ''' - Filter a MAF dataframe with filtering information coming from a list of tuples. - This can be either a dictionary transformed to list with the .items() method or by directly creating a list of tuples. - [('VAF', 'le 0.3'), ('VAF_AM', 'le 0.3'), ('vd_VAF', 'le 0.3'), - ('DEPTH', 'ge 40'), ('FILTER', 'notcontains n_rich'), - ('FILTER', 'notcontains cohort_n_rich_uni'), ('FILTER', 'notcontains NM20'), - ('FILTER', 'notcontains no_pileup_support'), ('FILTER', 'notcontains other_sample_SNP'), - ('FILTER', 'notcontains low_mappability')] - ''' - - # Define mappings for operators used in criteria - operators = { - 'eq': lambda x, y: x == y, - 'ne': lambda x, y: x != y, - 'lt': lambda x, y: x < y, - 'le': lambda x, y: x <= y, - 'gt': lambda x, y: x > y, - 'ge': lambda x, y: x >= y, - 'not': lambda x, y: x != y, - 'notcontains': lambda x, y: x.apply(lambda z : y not in z.split(";")), # (~maf_df["FILTER"].str.contains("not_in_panel")) - 'contains': lambda x, y: x.apply(lambda z : y in z.split(";")) - } - - # Apply filters based on criteria from the JSON file - for col, criterion in filter_criteria: - - if isinstance(criterion, bool): - pref_len = maf_df.shape[0] - maf_df = maf_df[maf_df[col] == criterion] - print(f"Applying {col}:{criterion} filter implied going from {pref_len} mutations to {maf_df.shape[0]} mutations.") - - elif ' ' in criterion: - operator, value = criterion.split(maxsplit=1) - - if len(operator) 
== 2 and operator in operators: - # 'VAF' : 'le 0.35' - pref_len = maf_df.shape[0] - maf_df = maf_df[operators[operator](maf_df[col], float(value))] - print(f"Applying {col}:{criterion} filter implied going from {pref_len} mutations to {maf_df.shape[0]} mutations.") - - elif operator in operators: - # 'FILTER' : 'notcontains n_rich', - pref_len = maf_df.shape[0] - maf_df = maf_df[operators[operator](maf_df[col], value)] - print(f"Applying {col}:{criterion} filter implied going from {pref_len} mutations to {maf_df.shape[0]} mutations.") - - else: - print(f"We have no filtering criteria defined for {col}:{criterion} filter.") - - - else: - # 'TYPE' : 'SNV' - pref_len = maf_df.shape[0] - maf_df = maf_df[maf_df[col] == criterion] - print(f"Applying {col}:{criterion} filter implied going from {pref_len} mutations to {maf_df.shape[0]} mutations.") - - return maf_df - - - def vartype(x, letters = ['A', 'T', 'C', 'G'], len_SV_lim = 50 diff --git a/bin/utils_filter.py b/bin/utils_filter.py new file mode 100644 index 00000000..8539b29f --- /dev/null +++ b/bin/utils_filter.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python +import logging +import pandas as pd +from pathlib import Path +""" +Utility functions for extracting filters from a MAF DataFrame. +""" + +LOG = logging.getLogger(__name__) + +def filter_maf(maf_df, filter_criteria): + ''' + Filter a MAF dataframe with filtering information coming from a list of tuples. + This can be either a dictionary transformed to list with the .items() method or by directly creating a list of tuples. 
+ [('VAF', 'le 0.3'), ('VAF_AM', 'le 0.3'), ('vd_VAF', 'le 0.3'), + ('DEPTH', 'ge 40'), ('FILTER', 'notcontains n_rich'), + ('FILTER', 'notcontains cohort_n_rich_uni'), ('FILTER', 'notcontains NM20'), + ('FILTER', 'notcontains no_pileup_support'), ('FILTER', 'notcontains other_sample_SNP'), + ('FILTER', 'notcontains low_mappability')] + ''' + + # Define mappings for operators used in criteria + operators = { + 'eq': lambda x, y: x == y, + 'ne': lambda x, y: x != y, + 'lt': lambda x, y: x < y, + 'le': lambda x, y: x <= y, + 'gt': lambda x, y: x > y, + 'ge': lambda x, y: x >= y, + 'not': lambda x, y: x != y, + 'notcontains': lambda x, y: x.apply(lambda z : y not in z.split(";")), # (~maf_df["FILTER"].str.contains("not_in_panel")) + 'contains': lambda x, y: x.apply(lambda z : y in z.split(";")) + } + + # Apply filters based on criteria from the JSON file + for col, criterion in filter_criteria: + + if isinstance(criterion, bool): + pref_len = maf_df.shape[0] + maf_df = maf_df[maf_df[col] == criterion] + print(f"Applying {col}:{criterion} filter implied going from {pref_len} mutations to {maf_df.shape[0]} mutations.") + + elif ' ' in criterion: + operator, value = criterion.split(maxsplit=1) + + if len(operator) == 2 and operator in operators: + # 'VAF' : 'le 0.35' + pref_len = maf_df.shape[0] + maf_df = maf_df[operators[operator](maf_df[col], float(value))] + print(f"Applying {col}:{criterion} filter implied going from {pref_len} mutations to {maf_df.shape[0]} mutations.") + + elif operator in operators: + # 'FILTER' : 'notcontains n_rich', + pref_len = maf_df.shape[0] + maf_df = maf_df[operators[operator](maf_df[col], value)] + print(f"Applying {col}:{criterion} filter implied going from {pref_len} mutations to {maf_df.shape[0]} mutations.") + + else: + print(f"We have no filtering criteria defined for {col}:{criterion} filter.") + + + else: + # 'TYPE' : 'SNV' + pref_len = maf_df.shape[0] + maf_df = maf_df[maf_df[col] == criterion] + print(f"Applying 
{col}:{criterion} filter implied going from {pref_len} mutations to {maf_df.shape[0]} mutations.") + + return maf_df + +def load_filter_criteria(filters: str, somatic_filters: str) -> list[str]: + """ + Parse filter criteria from comma-separated strings. + + Parameters + ---------- + filters : str + Comma-separated list of filter criteria + somatic_filters : str + Comma-separated list of somatic filter criteria + + Returns + ------- + list[str] + List of filter names to apply + """ + # Parse comma-separated strings into lists + filter_list = [f.strip() for f in filters.split(',') if f.strip()] + somatic_filter_list = [f.strip() for f in somatic_filters.split(',') if f.strip()] + + # Combine both lists + all_filters = filter_list + somatic_filter_list + + result = [f.replace("notcontains ", "") for f in all_filters if f.startswith("notcontains ")] + + LOG.info(f"Loaded {len(result)} filter criteria: {result}") + return result + +def expand_filter_column(maf_df: pd.DataFrame) -> pd.DataFrame: + """ + Expands the FILTER column by creating new columns for each unique filter. + Each new column indicates if the corresponding filter is present (True/False). 
+ """ + # Split FILTER column once per row and convert to set for O(1) lookup + filter_sets = maf_df["FILTER"].str.split(";").apply(lambda x: set(x) if x != [''] else set()) + + # Get all unique filter values (excluding empty strings) + all_filters = set( + filter_val + for filter_val in maf_df["FILTER"].str.split(";").explode().unique() + if filter_val and filter_val != '' + ) + + # Ensure "not_covered" and "not_in_exons" exist + required_filters = {"not_covered", "not_in_exons"} + all_filters.update(required_filters) + + # Create boolean columns efficiently + for filt in sorted(all_filters): + maf_df[f"FILTER.{filt}"] = filter_sets.apply(lambda x: filt in x) + + return maf_df + +def extract_flagged_regions_bed(maf_df: pd.DataFrame, name: str, FILTERS: list[str], specification: str = "") -> pd.DataFrame | None: + """ + Returns a BED file with the regions discarded, including the list of filters applied to each mutation. + Creates a properly formatted BED file with 0-based coordinates and half-open intervals. + + Parameters + ---------- + maf_df : pd.DataFrame + Input MAF dataframe with filter columns. POS column should contain 1-based coordinates. + name : str + Sample name to be used in the output BED file name. + FILTERS : list[str] + List of filter criteria to check for in the MAF dataframe. + specification : str, optional + Additional string to include in the output BED file name (e.g., "cohort-"), by default "". + + Returns + ------- + pd.DataFrame + A BED dataframe with discarded mutations and filters applied to each region. + Output coordinates are 0-based with half-open intervals [start, end). + """ + # List of filter columns you want to check for + filter_columns = [f"FILTER.{f}" for f in FILTERS if f in ','.join(list(maf_df.columns))] + + maf_df_filters = maf_df[maf_df[filter_columns].any(axis=1)] if filter_columns else pd.DataFrame() + + if maf_df_filters.empty: + LOG.warning("No mutations were flagged based on the applied filters. 
Creating empty BED file.") + # Create empty BED file to satisfy pipeline requirements + Path(f"{name}.flagged-pos.bed").touch() + return + + # Create BED-like dataframe with filter columns + bed_df = maf_df_filters[["CHROM", "POS"] + filter_columns] + + # Transform to long format + _bed_melt = (pd.melt(bed_df, + id_vars=["CHROM", "POS"], + value_vars=filter_columns, + var_name="FILTERS") + .query("value == True") + ) + + LOG.info("Mutations flagged: %s", _bed_melt.shape[0]) + + # Aggregate filters per position + bed_annotated = ( + _bed_melt + .drop_duplicates() + .sort_values(by=["CHROM", "POS"]) + .groupby(["CHROM","POS"])["FILTERS"] + .agg(','.join) + .reset_index() + .rename(columns={"POS": "START"}) + ) + + # The idea is to filter depth files at these positions, so make END = START (1-based) + bed_annotated["END"] = bed_annotated["START"] + + LOG.info("Unique regions flagged: %s", bed_annotated.shape[0]) + + # Write BED file without header or index + (bed_annotated[["CHROM", "START", "END", "FILTERS"]] + .to_csv(f"{name}.{specification}flagged-pos.bed", sep="\t", header=False, index=False) + ) \ No newline at end of file diff --git a/bin/write_mafs.py b/bin/write_mafs.py index 76f098da..4048b0c7 100755 --- a/bin/write_mafs.py +++ b/bin/write_mafs.py @@ -1,38 +1,141 @@ #!/usr/bin/env python +""" +Write MAFs - Create MAF file per sample and group. Extract flagged regions to BED file. +This script processes a Mutation Annotation Format (MAF) file to separate it +into individual MAF files for group defined in a provided JSON file. +It also extracts regions flagged by specified filters into a BED file for downstream analysis. +We obtain the flagged regions BED for both individual samples and the entire cohort. + +Command-line Arguments +---------------------- +maf-file : str + Path to the gzipped input MAF file. +groups-json : str + Path to the JSON file containing group/sample mapping. +filters : str + Comma-separated list of filter criteria to apply. 
+somatic-filters : str + Comma-separated list of somatic filter criteria to apply. + +Authors +------- +Author : Ferriol Calvet (@FerriolCalvet) +Email : ferriol.calvet@irbbarcelona.org + +Contributors +------------ +- Marta Huertas - @m-huertasp (marta.huertas@irbbarcelona.org) + +Usage +----- +python write_mafs.py \\ + --maf-file input.maf.tsv.gz \\ + --groups-json groups.json \\ + --filters "filter1,filter2,filter3" \\ + --somatic-filters "somatic_filter1,somatic_filter2" +""" +import logging import click import pandas as pd import json from read_utils import custom_na_values +from utils_filter import extract_flagged_regions_bed, load_filter_criteria + +# Logging +logging.basicConfig( + format="%(asctime)s | %(levelname)s | %(name)s - %(message)s", + level=logging.INFO, + datefmt="%m/%d/%Y %I:%M:%S %p" +) +LOG = logging.getLogger("write_mafs") + +# Constants +# Define flags that are applied for all samples equally. +COHORT_FLAGS = ["cohort_n_rich", "cohort_n_rich_uni", "repetitive_variant", "gnomAD_SNP", "nanoseq_snp", "nanoseq_noise", "not_covered", "not_in_exons"] + +def create_group_mafs(maf_df: pd.DataFrame, group_name: str, samples: list[str]) -> None: + """ + Create separate MAF files for each group defined in the groups_info. + + Parameters + ---------- + maf_df : pd.DataFrame + Input MAF DataFrame + group_name : str + Name of the group + samples : list[str] + List of sample IDs belonging to the group + """ + # Create MAF for the group by filtering the samples + group_maf = maf_df[maf_df["SAMPLE_ID"].isin(samples)].sort_values(by=["CHROM", "POS"]) + # Generate output file + group_maf.to_csv(f"{group_name}.filtered.tsv.gz", sep="\t", header=True, index=False, compression='gzip') + LOG.info(f"Written MAF for group '{group_name}' with {len(group_maf)} records.") + +def create_sample_flagged_bed(maf_df: pd.DataFrame, samples: list[str], filter_criteria: list[str]) -> None: + """ + Extract the flagged regions for each sample and write them to BED files. 
+ + Parameters + ---------- + maf_df : pd.DataFrame + Input MAF DataFrame + samples : list[str] + List of sample IDs to process + filter_criteria : list[str] + List of filter criteria to apply when extracting flagged regions + """ + for sample in samples: + sample_maf = maf_df[maf_df["SAMPLE_ID"] == sample] + LOG.info(f"Extracting flagged positions from {sample}.") + extract_flagged_regions_bed(sample_maf, sample, filter_criteria) + @click.command() @click.option('--maf-file', required=True, type=click.Path(exists=True), help='Input gzipped MAF file (TSV)') @click.option('--groups-json', required=True, type=click.Path(exists=True), help='Optional JSON file with group/sample mapping') -def main(maf_file, groups_json): +@click.option('--filters', required=True, type=str, default='', help='Comma-separated list of filter criteria') +@click.option('--somatic-filters', required=True, type=str, default='', help='Comma-separated list of somatic filter criteria') +def main(maf_file, groups_json, filters: str, somatic_filters: str): maf_df = pd.read_csv(maf_file, compression='gzip', header=0, sep='\t', na_values=custom_na_values) maf_df["SAMPLE_ID"] = maf_df["SAMPLE_ID"].astype(str) - + filter_criteria = load_filter_criteria(filters, somatic_filters) + with open(groups_json, 'r') as file: groups_info = json.load(file) - all_samples = set([item for sublist in groups_info.values() for item in sublist]) - available_samples = set(maf_df["SAMPLE_ID"]) + # Obtain the set of all samples to be analyzed + all_samples = set(groups_info.get("all_samples")) + # Obtain the set of samples for which we have information in the MAF file + available_samples = set(maf_df["SAMPLE_ID"].unique()) + requested_n_available_samples = available_samples.intersection(all_samples) + if len(all_samples) != len(requested_n_available_samples): missing_samples = all_samples - available_samples - raise ValueError("Some SAMPLE_IDs listed in the features table have no matching entries in the MAF file." 
- "Missing SAMPLE_IDs: " + ", ".join(missing_samples)) - + error = f"Some SAMPLE_IDs listed in the features table have no matching entries in the MAF file. Missing SAMPLE_IDs: {', '.join(missing_samples)}" + LOG.error(error) + raise ValueError(error) + for group_name, samples in groups_info.items(): samples = [str(x) for x in samples] - maf_df[maf_df["SAMPLE_ID"].isin(samples)].sort_values(by=["CHROM", "POS"]).to_csv( - f"{group_name}.filtered.tsv.gz", - sep="\t", - header=True, - index=False - ) + # Extract group-specific MAFs + create_group_mafs(maf_df, group_name, samples) + + # Extract flagged regions for each sample in the group if not already done + create_sample_flagged_bed(maf_df, list(all_samples), filter_criteria) + + # Extract flagged regions to BED file (cohort-wide, applies to all samples) + LOG.info("Extracting all flagged positions to BED file.") + extract_flagged_regions_bed(maf_df, "all_samples", filter_criteria, "all-") + + # Extract flagged regions applied only for all the cohort, not sample-specific + LOG.info("Extracting only cohort-wide flagged positions to BED file.") + cohort_wide_filters = [f for f in COHORT_FLAGS if f in filter_criteria] + extract_flagged_regions_bed(maf_df, "all_samples", cohort_wide_filters, "cohort-wide-") if __name__ == '__main__': main() diff --git a/conf/modules.config b/conf/modules.config index cd46f8b9..a8a4cf35 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -228,11 +228,19 @@ process { withName: WRITEMAF { + ext.filters = { params.filter_criteria.join(',') } + ext.somatic_filters = { params.filter_criteria_somatic.join(',') } + publishDir = [ [ mode: params.publish_dir_mode, path: { "${params.outdir}/mutations/germline_somatic" }, pattern: "*{tsv.gz}", + ], + [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/flagged_positions" }, + pattern: "*{bed}", ] ] } @@ -311,14 +319,12 @@ process { ext.downsample_prop = params.downsample_proportion } - withName: FILTERBATCH { 
ext.repetitive_variant = params.repetitive_variant_thres ext.germline_threshold = params.germline_threshold + ext.prop_samples_nrich = params.prop_samples_nrich } - - - + withName: "TABLE2GROUP" { ext.unique_identifier = params.features_unique_identifier ext.feature_groups = params.features_groups_list @@ -627,7 +633,7 @@ process { } withLabel : deepcsa_core { - container = "docker.io/bbglab/deepcsa-core:0.0.2-alpha" + container = "docker.io/bbglab/deepcsa-core:0.1.0" } } diff --git a/conf/tools/panels.config b/conf/tools/panels.config index 7f1fc712..85559456 100644 --- a/conf/tools/panels.config +++ b/conf/tools/panels.config @@ -54,7 +54,6 @@ process { ] } - withName: 'BBGTOOLS:DEEPCSA:CREATEPANELS:CREATESAMPLEPANELSALL' { publishDir = [ [ diff --git a/modules/local/annotatedepth/main.nf b/modules/local/annotatedepth/main.nf index 903912cf..cdc9856f 100644 --- a/modules/local/annotatedepth/main.nf +++ b/modules/local/annotatedepth/main.nf @@ -4,12 +4,12 @@ process ANNOTATE_DEPTHS { label 'time_low' label 'deepcsa_core' - + input: tuple val(meta) , path(depths) tuple val(meta2), path(panel_all) path (json_groups) - path (input_csv) + path (mask_matrix) output: // tuple val(meta), path("*.depths.annotated.tsv.gz") , emit: annotated_depths @@ -25,8 +25,8 @@ process ANNOTATE_DEPTHS { --annotation ${panel_all}.contexts \\ --depths ${depths} \\ --json_file ${json_groups} \\ - --input_csv ${input_csv} - + --mask-matrix ${mask_matrix} + cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') diff --git a/modules/local/createmaskmatrix/main.nf b/modules/local/createmaskmatrix/main.nf new file mode 100644 index 00000000..e703c218 --- /dev/null +++ b/modules/local/createmaskmatrix/main.nf @@ -0,0 +1,37 @@ +process CREATE_MASK_MATRIX { + tag "mask_matrix" + + label 'process_low' + label 'time_low' + + container "docker.io/bbglab/deepcsa-core:0.1.0" + + input: + path(bed_files) // List of sample-specific flagged position BED 
files + + output: + path("flagged_positions.mask.tsv.gz") , emit: mask_matrix + path "versions.yml" , topic: versions + + script: + def bed_args = bed_files.collect { "\"${it}\"" }.join(' --bed-files ') + """ + create_mask_matrix.py \\ + --bed-files ${bed_args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ + + stub: + """ + touch flagged_positions.mask.tsv.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/local/createmaskmatrix/meta.yml b/modules/local/createmaskmatrix/meta.yml new file mode 100644 index 00000000..6aace8c2 --- /dev/null +++ b/modules/local/createmaskmatrix/meta.yml @@ -0,0 +1,38 @@ +name: "create_mask_matrix" +description: Create position × sample mask matrix from sample-specific flagged position BED files +keywords: + - masking + - bed + - matrix + - depths +tools: + - pandas: + description: | + Pandas is a Python library providing high-performance, easy-to-use data structures + and data analysis tools. + homepage: https://pandas.pydata.org/ + documentation: https://pandas.pydata.org/docs/ + +input: + - bed_files: + type: file + description: | + List of sample-specific flagged position BED files (*.flagged-pos.bed). + Each file contains regions to be masked for a specific sample. + pattern: "*.flagged-pos.bed" + +output: + - mask_matrix: + type: file + description: | + Position × sample mask matrix (TSV, gzipped). Contains CHROM, POS columns + followed by one 1/0 column per sample indicating whether the position + should be kept (1) or masked (0). 
+ pattern: "*.mask.tsv.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@mhuertas" diff --git a/modules/local/filtermaf/main.nf b/modules/local/filtermaf/main.nf index c070511e..7c0f8a98 100644 --- a/modules/local/filtermaf/main.nf +++ b/modules/local/filtermaf/main.nf @@ -10,23 +10,23 @@ process FILTER_BATCH { tuple val(meta), path(maf) output: - tuple val(meta), path("*.cohort.filtered.tsv.gz") , emit: cohort_maf - path "versions.yml" , topic: versions + tuple val(meta), path("*.cohort.filtered.tsv.gz") , emit: cohort_maf + path "versions.yml" , topic: versions script: def prefix = task.ext.prefix ?: "" prefix = "${meta.id}${prefix}" - def repetitive_variant = task.ext.repetitive_variant ?: "${params.repetitive_variant_thres}" - def germline_threshold = task.ext.germline_threshold ?: "${params.germline_threshold}" - def proportion_samples_nrich = task.ext.prop_samples_nrich ?: "${params.prop_samples_nrich}" + def repetitive_variant = task.ext.repetitive_variant ? "--repetitive-variant-threshold ${task.ext.repetitive_variant}" : "" + def germline_threshold = task.ext.germline_threshold ? "--somatic-vaf-boundary ${task.ext.germline_threshold}" : "" + def proportion_samples_nrich = task.ext.prop_samples_nrich ? 
"--n-rich-cohort-proportion ${task.ext.prop_samples_nrich}" : "" """ filter_cohort.py \\ --maf-df-file ${maf} \\ --sample-name ${prefix} \\ - --repetitive-variant-threshold ${repetitive_variant} \\ - --somatic-vaf-boundary ${germline_threshold} \\ - --n-rich-cohort-proportion ${proportion_samples_nrich} + ${repetitive_variant} \\ + ${germline_threshold} \\ + ${proportion_samples_nrich} cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -36,7 +36,7 @@ process FILTER_BATCH { stub: """ - touch all_samples.cohort.filtered.tsv.gz + touch shared_cohort.filtered.tsv.gz cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/writemaf/main.nf b/modules/local/writemaf/main.nf index dd82a01b..e11f1d0b 100644 --- a/modules/local/writemaf/main.nf +++ b/modules/local/writemaf/main.nf @@ -1,7 +1,7 @@ process WRITE_MAFS { tag "${meta.id}" - label 'process_low' + label 'process_high_memory' label 'deepcsa_core' @@ -10,14 +10,21 @@ process WRITE_MAFS { path (json_groups) output: - path("*.filtered.tsv.gz") , emit: mafs - path "versions.yml" , topic: versions + path("*.filtered.tsv.gz") , emit: mafs + path("*.flagged-pos.bed") , emit: sample_flagged_beds + path("all_samples.all-flagged-pos.bed") + path("all_samples.cohort-wide-flagged-pos.bed") + path "versions.yml" , topic: versions script: + def filters = task.ext.filters ?: "" + def somatic_filters = task.ext.somatic_filters ?: "" """ write_mafs.py \\ --maf-file ${maf} \\ - --groups-json ${json_groups} + --groups-json ${json_groups} \\ + --filters "${filters}" \\ + --somatic-filters "${somatic_filters}" cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -27,7 +34,10 @@ process WRITE_MAFS { stub: """ - touch all_samples.cohort.tsv.gz + touch all_samples.filtered.tsv.gz + touch all_samples.all-flagged-pos.bed + touch all_samples.cohort-wide-flagged-pos.bed + touch *.flagged-pos.bed cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index 
9a2e74e1..4faa435f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -83,8 +83,6 @@ params { use_custom_depths = false custom_depths_table = null - create_sample_panels = false - contamination = false downsample = false diff --git a/subworkflows/local/createpanels/main.nf b/subworkflows/local/createpanels/main.nf index ec86d14f..f8303044 100644 --- a/subworkflows/local/createpanels/main.nf +++ b/subworkflows/local/createpanels/main.nf @@ -1,33 +1,23 @@ -include { SITESFROMPOSITIONS } from '../../../modules/local/sitesfrompositions/main' -include { VCF_ANNOTATE_ENSEMBLVEP as VCFANNOTATEPANEL } from '../../../subworkflows/nf-core/vcf_annotate_ensemblvep_panel/main' +include { SITESFROMPOSITIONS } from '../../../modules/local/sitesfrompositions/main' +include { VCF_ANNOTATE_ENSEMBLVEP as VCFANNOTATEPANEL } from '../../../subworkflows/nf-core/vcf_annotate_ensemblvep_panel/main' -include { POSTPROCESS_VEP_ANNOTATION as POSTPROCESSVEPPANEL } from '../../../modules/local/process_annotation/panel/main' +include { POSTPROCESS_VEP_ANNOTATION as POSTPROCESSVEPPANEL } from '../../../modules/local/process_annotation/panel/main' -include { CUSTOM_ANNOTATION_PROCESSING as CUSTOMPROCESSING } from '../../../modules/local/process_annotation/panelcustom/main' -include { CUSTOM_ANNOTATION_PROCESSING as CUSTOMPROCESSINGRICH } from '../../../modules/local/process_annotation/panelcustom/main' +include { CUSTOM_ANNOTATION_PROCESSING as CUSTOMPROCESSING } from '../../../modules/local/process_annotation/panelcustom/main' +include { CUSTOM_ANNOTATION_PROCESSING as CUSTOMPROCESSINGRICH } from '../../../modules/local/process_annotation/panelcustom/main' -include { DOMAIN_ANNOTATION as DOMAINANNOTATION } from '../../../modules/local/process_annotation/domain/main' +include { DOMAIN_ANNOTATION as DOMAINANNOTATION } from '../../../modules/local/process_annotation/domain/main' +include { CREATECAPTUREDPANELS } from '../../../modules/local/createpanels/captured/main' -include { 
CREATECAPTUREDPANELS } from '../../../modules/local/createpanels/captured/main' - -include { CREATESAMPLEPANELS as CREATESAMPLEPANELSALL } from '../../../modules/local/createpanels/sample/main' -include { CREATESAMPLEPANELS as CREATESAMPLEPANELSPROTAFFECT } from '../../../modules/local/createpanels/sample/main' -include { CREATESAMPLEPANELS as CREATESAMPLEPANELSNONPROTAFFECT } from '../../../modules/local/createpanels/sample/main' -include { CREATESAMPLEPANELS as CREATESAMPLEPANELSEXONS } from '../../../modules/local/createpanels/sample/main' -include { CREATESAMPLEPANELS as CREATESAMPLEPANELSINTRONS } from '../../../modules/local/createpanels/sample/main' -include { CREATESAMPLEPANELS as CREATESAMPLEPANELSSYNONYMOUS } from '../../../modules/local/createpanels/sample/main' - - -include { CREATECONSENSUSPANELS as CREATECONSENSUSPANELSALL } from '../../../modules/local/createpanels/consensus/main' -include { CREATECONSENSUSPANELS as CREATECONSENSUSPANELSPROTAFFECT } from '../../../modules/local/createpanels/consensus/main' -include { CREATECONSENSUSPANELS as CREATECONSENSUSPANELSNONPROTAFFECT } from '../../../modules/local/createpanels/consensus/main' -include { CREATECONSENSUSPANELS as CREATECONSENSUSPANELSEXONS } from '../../../modules/local/createpanels/consensus/main' -include { CREATECONSENSUSPANELS as CREATECONSENSUSPANELSINTRONS } from '../../../modules/local/createpanels/consensus/main' -include { CREATECONSENSUSPANELS as CREATECONSENSUSPANELSSYNONYMOUS } from '../../../modules/local/createpanels/consensus/main' - -include { COMPARE_TRINUCLEOTIDE_PROPORTIONS_PANELS as COMPAREPANELPROPORTIONS } from '../../../modules/local/createpanels/compare/main' +include { CREATECONSENSUSPANELS as CREATECONSENSUSPANELSALL } from '../../../modules/local/createpanels/consensus/main' +include { CREATECONSENSUSPANELS as CREATECONSENSUSPANELSPROTAFFECT } from '../../../modules/local/createpanels/consensus/main' +include { CREATECONSENSUSPANELS as 
CREATECONSENSUSPANELSNONPROTAFFECT } from '../../../modules/local/createpanels/consensus/main' +include { CREATECONSENSUSPANELS as CREATECONSENSUSPANELSEXONS } from '../../../modules/local/createpanels/consensus/main' +include { CREATECONSENSUSPANELS as CREATECONSENSUSPANELSINTRONS } from '../../../modules/local/createpanels/consensus/main' +include { CREATECONSENSUSPANELS as CREATECONSENSUSPANELSSYNONYMOUS } from '../../../modules/local/createpanels/consensus/main' +include { COMPARE_TRINUCLEOTIDE_PROPORTIONS_PANELS as COMPAREPANELPROPORTIONS } from '../../../modules/local/createpanels/compare/main' def restructurePanel(panel) { // return panel.map{ it -> [[id: it[0].name.tokenize('.')[0..1].join('.')], it[1]] } @@ -35,15 +25,6 @@ def restructurePanel(panel) { return panel.map { it -> [[id: it.name.tokenize('.')[1]], it] } } - -def restructureSamplePanel(panel) { - // return panel.map{ it -> [[id: it[0].name.tokenize('.')[0..1].join('.')], it[1]] } - // return panel.map { it -> [[id: it.name.tokenize('.')[0..1].join('.')], it] } - return panel.map { it -> [[id: it.name.tokenize('.')[0]], it] } - } - - - workflow CREATE_PANELS { take: @@ -109,17 +90,6 @@ workflow CREATE_PANELS { restructurePanel(CREATECAPTUREDPANELS.out.captured_panel_synonymous).set{synonymous_panel} restructurePanel(CREATECAPTUREDPANELS.out.captured_panel_synonymous_bed).set{synonymous_bed} - if (params.create_sample_panels){ - // Create sample-specific panels: all modalities - CREATESAMPLEPANELSALL(all_panel, depths, params.sample_panel_min_depth) - CREATESAMPLEPANELSPROTAFFECT(prot_panel, depths, params.sample_panel_min_depth) - CREATESAMPLEPANELSNONPROTAFFECT(nonprot_panel, depths, params.sample_panel_min_depth) - CREATESAMPLEPANELSEXONS(exons_panel, depths, params.sample_panel_min_depth) - CREATESAMPLEPANELSINTRONS(introns_panel, depths, params.sample_panel_min_depth) - CREATESAMPLEPANELSSYNONYMOUS(synonymous_panel, depths, params.sample_panel_min_depth) - } - - // Create consensus panel: all 
modalities CREATECONSENSUSPANELSALL(all_panel, depths) CREATECONSENSUSPANELSPROTAFFECT(prot_panel, depths) @@ -128,6 +98,7 @@ workflow CREATE_PANELS { CREATECONSENSUSPANELSINTRONS(introns_panel, depths) CREATECONSENSUSPANELSSYNONYMOUS(synonymous_panel, depths) + // Compare trinucleotide proportions in the panels to WGS background - QC plots channel.empty() .concat(CREATECONSENSUSPANELSALL.out.consensus_panel.map{ it -> it[1]}.flatten()) .concat(CREATECONSENSUSPANELSPROTAFFECT.out.consensus_panel.map{ it -> it[1]}.flatten()) @@ -140,6 +111,7 @@ workflow CREATE_PANELS { COMPAREPANELPROPORTIONS(all_consensus_panels, wgs_trinucleotides) + emit: full_panel_annotated = VCFANNOTATEPANEL.out.tab.first() all_panel = all_panel.first() @@ -169,6 +141,7 @@ workflow CREATE_PANELS { synonymous_consensus_panel = CREATECONSENSUSPANELSSYNONYMOUS.out.consensus_panel.first() synonymous_consensus_bed = CREATECONSENSUSPANELSSYNONYMOUS.out.consensus_panel_bed.first() + panel_annotated_rich = rich_annotated.first() added_custom_regions = added_regions.first() domains_panel_bed = DOMAINANNOTATION.out.domains_bed.first() @@ -176,15 +149,4 @@ workflow CREATE_PANELS { postprocessed_panel = POSTPROCESSVEPPANEL.out.compact_panel_annotation.first() postprocessed_panel_rich = POSTPROCESSVEPPANEL.out.rich_panel_annotation.first() - // all_sample_panel = restructureSamplePanel(CREATESAMPLEPANELSALL.out.sample_specific_panel.flatten()) - // all_sample_bed = restructureSamplePanel(CREATESAMPLEPANELSALL.out.sample_specific_panel_bed.flatten()) - // prot_sample_panel = restructureSamplePanel(CREATESAMPLEPANELSPROTAFFECT.out.sample_specific_panel.flatten()) - // prot_sample_bed = restructureSamplePanel(CREATESAMPLEPANELSPROTAFFECT.out.sample_specific_panel_bed.flatten()) - // nonprot_sample_panel = restructureSamplePanel(CREATESAMPLEPANELSNONPROTAFFECT.out.sample_specific_panel.flatten()) - // nonprot_sample_bed = 
restructureSamplePanel(CREATESAMPLEPANELSNONPROTAFFECT.out.sample_specific_panel_bed.flatten()) - // exons_sample_panel = restructureSamplePanel(CREATESAMPLEPANELSEXONS.out.sample_specific_panel.flatten()) - // exons_sample_bed = restructureSamplePanel(CREATESAMPLEPANELSEXONS.out.sample_specific_panel_bed.flatten()) - // introns_sample_panel = restructureSamplePanel(CREATESAMPLEPANELSINTRONS.out.sample_specific_panel.flatten()) - // introns_sample_bed = restructureSamplePanel(CREATESAMPLEPANELSINTRONS.out.sample_specific_panel_bed.flatten()) - } diff --git a/subworkflows/local/createpanels/meta.yml b/subworkflows/local/createpanels/meta.yml index e69de29b..45312099 100644 --- a/subworkflows/local/createpanels/meta.yml +++ b/subworkflows/local/createpanels/meta.yml @@ -0,0 +1,213 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "CREATE_PANELS" +description: | + Creates comprehensive genomic panels from depth files through a multi-stage annotation pipeline. + This workflow generates both captured panels (all possible mutations in sequenced regions) and + consensus panels (regions with adequate depth coverage across samples). It performs VEP annotation, + optional custom annotation processing, protein domain mapping, and creates panel variants for + different genomic features (protein-affecting, exons, synonymous, etc.). The outputs are used + throughout the pipeline for mutation filtering, depth analysis, and selection analysis. +keywords: + - annotation + - panel + - vep + - ensembl + - domains + - consensus + - captured regions + - depth coverage +components: + - SITESFROMPOSITIONS + - VCF_ANNOTATE_ENSEMBLVEP + - POSTPROCESS_VEP_ANNOTATION + - CUSTOM_ANNOTATION_PROCESSING + - DOMAIN_ANNOTATION + - CREATECAPTUREDPANELS + - CREATECONSENSUSPANELS + +input: + - depths: + type: file + description: | + Channel containing per-sample depth information files for all captured/sequenced regions. 
+ These files are used to (1) generate all possible mutations within the captured panel and + (2) determine which regions have consensus coverage (adequate depth across the cohort). + pattern: "*.{txt,tsv,bed,gz}" + +output: + - full_panel_annotated: + type: file + description: | + Complete VEP annotation output in tab-separated format (raw VEP output). + Contains all transcript annotations before postprocessing. + pattern: "*.tab" + - complete_annotated_panel: + type: file + description: | + Compact annotated panel with one annotation per mutation (postprocessed). + Optionally includes custom annotation modifications if params.customize_annotation is true. + This is the primary annotation reference used throughout the pipeline. + pattern: "*.tsv" + - panel_annotated_rich: + type: file + description: | + Rich annotated panel with extended VEP annotation fields. + Used for detailed analyses and domain annotation mapping. + pattern: "*.tsv" + - added_custom_regions: + type: file + description: | + Regions added or modified during custom annotation processing. + Empty channel if params.customize_annotation is false. + pattern: "*.tsv" + - domains_panel_bed: + type: file + description: | + BED file with protein domain coordinates mapped to genomic positions. + Used for domain-level selection analysis. + pattern: "*.bed" + - domains_in_panel: + type: file + description: | + Tab-separated file listing all protein domains present in the captured panel. + pattern: "*.tsv" + - postprocessed_panel: + type: file + description: | + Compact panel annotation immediately after VEP postprocessing (before custom annotation). + pattern: "*.tsv" + - postprocessed_panel_rich: + type: file + description: | + Rich panel annotation immediately after VEP postprocessing (before custom annotation). + pattern: "*.tsv" + - all_panel: + type: file + description: | + Captured panel containing all possible mutations in sequenced regions (TSV format). 
+ pattern: "*.tsv" + - all_bed: + type: file + description: | + Captured panel BED file containing all sequenced genomic regions. + pattern: "*.bed" + - prot_panel: + type: file + description: | + Captured panel subset: protein-affecting mutations only (missense, nonsense, etc.). + pattern: "*.tsv" + - prot_bed: + type: file + description: | + Captured panel BED file for protein-affecting regions. + pattern: "*.bed" + - nonprot_panel: + type: file + description: | + Captured panel subset: non-protein-affecting mutations (introns, intergenic, UTRs). + pattern: "*.tsv" + - nonprot_bed: + type: file + description: | + Captured panel BED file for non-protein-affecting regions. + pattern: "*.bed" + - exons_panel: + type: file + description: | + Captured panel subset: exons and splice sites. + pattern: "*.tsv" + - exons_bed: + type: file + description: | + Captured panel BED file for exons and splice sites. Used by FILTEREXONS process. + pattern: "*.bed" + - introns_panel: + type: file + description: | + Captured panel subset: introns and intergenic regions. + pattern: "*.tsv" + - introns_bed: + type: file + description: | + Captured panel BED file for introns and intergenic regions. + pattern: "*.bed" + - synonymous_panel: + type: file + description: | + Captured panel subset: synonymous mutations only. + pattern: "*.tsv" + - synonymous_bed: + type: file + description: | + Captured panel BED file for regions containing synonymous mutations. + pattern: "*.bed" + - all_consensus_panel: + type: file + description: | + Consensus panel: regions with adequate depth coverage across the cohort (all regions). + Used for downstream depth analysis and mutation filtering. + pattern: "*.tsv" + - all_consensus_bed: + type: file + description: | + Consensus panel BED file for all regions with adequate coverage. + Used by FILTERPANEL to mark mutations outside consensus regions as "not_covered". 
+ pattern: "*.bed" + - prot_consensus_panel: + type: file + description: | + Consensus panel subset: protein-affecting regions with adequate coverage. + pattern: "*.tsv" + - prot_consensus_bed: + type: file + description: | + Consensus panel BED file for protein-affecting regions with adequate coverage. + pattern: "*.bed" + - nonprot_consensus_panel: + type: file + description: | + Consensus panel subset: non-protein-affecting regions with adequate coverage. + pattern: "*.tsv" + - nonprot_consensus_bed: + type: file + description: | + Consensus panel BED file for non-protein-affecting regions with adequate coverage. + pattern: "*.bed" + - exons_consensus_panel: + type: file + description: | + Consensus panel subset: exons and splice sites with adequate coverage. + pattern: "*.tsv" + - exons_consensus_bed: + type: file + description: | + Consensus panel BED file for exons and splice sites with adequate coverage. + Used for exon-specific depth plots and analyses. + pattern: "*.bed" + - introns_consensus_panel: + type: file + description: | + Consensus panel subset: introns and intergenic regions with adequate coverage. + pattern: "*.tsv" + - introns_consensus_bed: + type: file + description: | + Consensus panel BED file for introns and intergenic regions with adequate coverage. + pattern: "*.bed" + - synonymous_consensus_panel: + type: file + description: | + Consensus panel subset: synonymous mutation sites with adequate coverage. + pattern: "*.tsv" + - synonymous_consensus_bed: + type: file + description: | + Consensus panel BED file for synonymous mutation sites with adequate coverage. + Used for synonymous-specific selection analyses. 
+ pattern: "*.bed" + +authors: + - Ferriol Calvet (ferriol.calvet@irbbarcelona.org) +maintainers: + - Ferriol Calvet (ferriol.calvet@irbbarcelona.org) + - Marta Huertas (marta.huertas@irbbarcelona.org) \ No newline at end of file diff --git a/subworkflows/local/mutationpreprocessing/main.nf b/subworkflows/local/mutationpreprocessing/main.nf index de7f2918..d5265230 100644 --- a/subworkflows/local/mutationpreprocessing/main.nf +++ b/subworkflows/local/mutationpreprocessing/main.nf @@ -1,25 +1,26 @@ // Annotation -include { VCF_ANNOTATE_ENSEMBLVEP as VCFANNOTATE } from '../../nf-core/vcf_annotate_ensemblvep/main' - - -include { SUMMARIZE_ANNOTATION as SUMANNOTATION } from '../../../modules/local/process_annotation/mutations/main' -include { CUSTOM_MUTATION_PROCESSING as CUSTOMANNOTATION } from '../../../modules/local/process_annotation/mutations_custom/main' -include { VCF2MAF as VCF2MAF } from '../../../modules/local/vcf2maf/main' -include { FILTERBED as FILTERPANEL } from '../../../modules/local/filterbed/main' -include { FILTERBED as FILTEREXONS } from '../../../modules/local/filterbed/main' -include { FILTERBED as FILTERNANOSEQSNP } from '../../../modules/local/filterbed/main' -include { FILTERBED as FILTERNANOSEQNOISE} from '../../../modules/local/filterbed/main' -include { MERGE_BATCH as MERGEBATCH } from '../../../modules/local/mergemafs/main' -include { FILTER_BATCH as FILTERBATCH } from '../../../modules/local/filtermaf/main' -include { WRITE_MAFS as WRITEMAF } from '../../../modules/local/writemaf/main' -include { SUBSET_MAF as SOMATICMUTATIONS } from '../../../modules/local/subsetmaf/main' -include { SUBSET_MAF as CLEANMUTATIONS } from '../../../modules/local/subsetmaf/main' -include { BLACKLIST_MUTATIONS as BLACKLISTMUTS } from '../../../modules/local/blacklistmuts/main' -include { PLOT_MUTATIONS as PLOTMAF } from '../../../modules/local/plot/mutations_summary/main' -include { PLOT_MUTATIONS as PLOTSOMATICMAF } from 
'../../../modules/local/plot/mutations_summary/main' -include { PLOT_NEEDLES as PLOTNEEDLES } from '../../../modules/local/plot/needles/main' -include { DOWNSAMPLE_MUTATIONS as DOWNSAMPLEMUTS } from '../../../modules/local/downsample/mutations/main' -include { COMPUTE_CONTAMINATION as CONTAMINATION } from '../../../modules/local/contamination/main' +include { VCF_ANNOTATE_ENSEMBLVEP as VCFANNOTATE } from '../../nf-core/vcf_annotate_ensemblvep/main' + + +include { SUMMARIZE_ANNOTATION as SUMANNOTATION } from '../../../modules/local/process_annotation/mutations/main' +include { CUSTOM_MUTATION_PROCESSING as CUSTOMANNOTATION } from '../../../modules/local/process_annotation/mutations_custom/main' +include { VCF2MAF as VCF2MAF } from '../../../modules/local/vcf2maf/main' +include { FILTERBED as FILTERPANEL } from '../../../modules/local/filterbed/main' +include { FILTERBED as FILTEREXONS } from '../../../modules/local/filterbed/main' +include { FILTERBED as FILTERNANOSEQSNP } from '../../../modules/local/filterbed/main' +include { FILTERBED as FILTERNANOSEQNOISE } from '../../../modules/local/filterbed/main' +include { CREATE_MASK_MATRIX as CREATEMASKMATRIX } from '../../../modules/local/createmaskmatrix/main' +include { MERGE_BATCH as MERGEBATCH } from '../../../modules/local/mergemafs/main' +include { FILTER_BATCH as FILTERBATCH } from '../../../modules/local/filtermaf/main' +include { WRITE_MAFS as WRITEMAF } from '../../../modules/local/writemaf/main' +include { SUBSET_MAF as SOMATICMUTATIONS } from '../../../modules/local/subsetmaf/main' +include { SUBSET_MAF as CLEANMUTATIONS } from '../../../modules/local/subsetmaf/main' +include { BLACKLIST_MUTATIONS as BLACKLISTMUTS } from '../../../modules/local/blacklistmuts/main' +include { PLOT_MUTATIONS as PLOTMAF } from '../../../modules/local/plot/mutations_summary/main' +include { PLOT_MUTATIONS as PLOTSOMATICMAF } from '../../../modules/local/plot/mutations_summary/main' +include { PLOT_NEEDLES as PLOTNEEDLES } from 
'../../../modules/local/plot/needles/main' +include { DOWNSAMPLE_MUTATIONS as DOWNSAMPLEMUTS } from '../../../modules/local/downsample/mutations/main' +include { COMPUTE_CONTAMINATION as CONTAMINATION } from '../../../modules/local/contamination/main' workflow MUTATION_PREPROCESSING { @@ -88,9 +89,10 @@ workflow MUTATION_PREPROCESSING { filtered_maf_masks = params.nanoseq_noise ? FILTERNANOSEQNOISE.out.maf : filtered_maf_snp } filtered_maf_panels = masks_applied ? filtered_maf_masks : FILTERPANEL.out.maf + // Join all samples' MAFs and put them in a channel to be merged filtered_maf_panels.map{ it -> it[1] }.collect().map{ it -> [[ id:"all_samples" ], it]}.set{ samples_maf } - + MERGEBATCH(samples_maf) FILTERBATCH(MERGEBATCH.out.cohort_maf) @@ -98,6 +100,9 @@ workflow MUTATION_PREPROCESSING { PLOTMAF(FILTERBATCH.out.cohort_maf) WRITEMAF(FILTERBATCH.out.cohort_maf, all_groups) + + // Create the mask matrix used to mask positions in the depths + CREATEMASKMATRIX(WRITEMAF.out.sample_flagged_beds) // Here we flatten the output of the WRITEMAF module to have a channel where each item is a sample-maf pair WRITEMAF.out.mafs.flatten().map{ it -> [ [id : it.name.tokenize('.')[0]] , it] }.set{ named_mafs } @@ -125,7 +130,6 @@ workflow MUTATION_PREPROCESSING { // Keep only somatic mutations SOMATICMUTATIONS(CLEANMUTATIONS.out.mutations) - channel.of([["id": "all_samples"]]) .join(named_mafs).first() @@ -170,6 +174,7 @@ workflow MUTATION_PREPROCESSING { emit: mafs = named_mafs + mask_matrix = CREATEMASKMATRIX.out.mask_matrix somatic_mafs = SOMATICMUTATIONS.out.mutations clean_mafs = CLEANMUTATIONS.out.mutations mutations_all_samples = muts_all_samples diff --git a/workflows/deepcsa.nf b/workflows/deepcsa.nf index b04082f2..7ad9dfb7 100644 --- a/workflows/deepcsa.nf +++ b/workflows/deepcsa.nf @@ -23,7 +23,7 @@ include { PLOT_DEPTHS as PLOTDEPTHSEXONSCONS } from '../subworkfl include { MUTATION_PREPROCESSING as MUT_PREPROCESSING } from 
'../subworkflows/local/mutationpreprocessing/main' -include { ENRICHPANELS as ENRICHPANELS } from '../subworkflows/local/enrichpanels/main' +include { ENRICHPANELS as ENRICHPANELS } from '../subworkflows/local/enrichpanels/main' include { MUTATION_DENSITY as MUTDENSITYALL } from '../subworkflows/local/mutationdensity/main' @@ -194,10 +194,23 @@ workflow DEEPCSA{ // Depth analysis: compute and plots DEPTHANALYSIS(INPUT_CHECK.out.sample_inputs, custom_bed_file) - // Panels generation: all modalities + // Panels annotation CREATEPANELS(DEPTHANALYSIS.out.depths, wgs_trinucs) - ANNOTATEDEPTHS(DEPTHANALYSIS.out.depths, CREATEPANELS.out.all_panel, TABLE2GROUP.out.json_allgroups, file(params.input)) + // Mutation preprocessing + MUT_PREPROCESSING(meta_vcfs_alone, + CREATEPANELS.out.all_consensus_bed, + CREATEPANELS.out.exons_bed, + TABLE2GROUP.out.json_allgroups, + group_keys_ch, + seqinfo_df, + CREATEPANELS.out.added_custom_regions + ) + somatic_mutations = MUT_PREPROCESSING.out.somatic_mafs + + positive_selection_results = somatic_mutations + + ANNOTATEDEPTHS(DEPTHANALYSIS.out.depths, CREATEPANELS.out.all_panel, TABLE2GROUP.out.json_allgroups, MUT_PREPROCESSING.out.mask_matrix) ANNOTATEDEPTHS.out.annotated_depths.flatten().map{ it -> [ [id : it.name.tokenize('.')[0]] , it] }.set{ annotated_depths_full } // if (params.downsample && params.downsample_proportion < 1) { @@ -214,20 +227,6 @@ workflow DEEPCSA{ } PLOTDEPTHSEXONSCONS(ANNOTATEDEPTHS.out.all_samples_depths, CREATEPANELS.out.exons_consensus_bed, CREATEPANELS.out.exons_consensus_panel) - // Mutation preprocessing - MUT_PREPROCESSING(meta_vcfs_alone, - CREATEPANELS.out.all_consensus_bed, - CREATEPANELS.out.exons_bed, - TABLE2GROUP.out.json_allgroups, - group_keys_ch, - seqinfo_df, - CREATEPANELS.out.added_custom_regions - ) - somatic_mutations = MUT_PREPROCESSING.out.somatic_mafs - - positive_selection_results = somatic_mutations - - // Enrich regions in consensus panels 
ENRICHPANELS(MUT_PREPROCESSING.out.mutations_all_samples, ANNOTATEDEPTHS.out.all_samples_depths,