diff --git a/Snakefile b/Snakefile index 3cd39ff8..6b4b0001 100644 --- a/Snakefile +++ b/Snakefile @@ -21,12 +21,15 @@ wildcard_constraints: # without declaration! _config.init_global(config) +def without_keys(d: dict, keys: list): + if set(keys) & set(d.keys()) != set(keys): raise RuntimeError(f"Keys {keys} not fully present in {list(d.keys())}!") + return {k: v for k, v in d.items() if k not in keys} + out_dir = _config.config.out_dir algorithm_params = _config.config.algorithm_params -pca_params = _config.config.pca_params -hac_params = _config.config.hac_params container_settings = _config.config.container_settings -include_aggregate_algo_eval = _config.config.analysis_include_evaluation_aggregate_algo +pca_params = without_keys(vars(_config.config.analysis.pca), ["pca_chosen", "include", "aggregate_per_algorithm"]) +hac_params = without_keys(vars(_config.config.analysis.hac), ["include", "aggregate_per_algorithm"]) # Return the dataset or gold_standard dictionary from the config file given the label def get_dataset(_datasets, label): @@ -70,55 +73,76 @@ def write_dataset_log(dataset, logfile): def make_final_input(wildcards): final_input = [] - if _config.config.analysis_include_summary: + if _config.config.analysis.summary.include: # add summary output file for each pathway # TODO: reuse in the future once we make summary work for mixed graphs. 
See https://github.com/Reed-CompBio/spras/issues/128 # final_input.extend(expand('{out_dir}{sep}{dataset}-{algorithm_params}{sep}summary.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) # add table summarizing all pathways for each dataset final_input.extend(expand('{out_dir}{sep}{dataset}-pathway-summary.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels)) - if _config.config.analysis_include_cytoscape: + if _config.config.analysis.cytoscape.include: final_input.extend(expand('{out_dir}{sep}{dataset}-cytoscape.cys',out_dir=out_dir,sep=SEP,dataset=dataset_labels)) - if _config.config.analysis_include_ml: + if _config.config.analysis.pca.include: final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - 
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}jaccard-matrix.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}jaccard-heatmap.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - if _config.config.analysis_include_ml_aggregate_algo: + if _config.config.analysis.pca.aggregate_per_algorithm: final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) + + if _config.config.analysis.hac.include: + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + + if _config.config.analysis.hac.aggregate_per_algorithm: 
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) + + if _config.config.analysis.ensemble.include: + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + + if _config.config.analysis.ensemble.aggregate_per_algorithm: final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms)) + + if _config.config.analysis.jaccard.include: + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}jaccard-matrix.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}jaccard-heatmap.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + + if _config.config.analysis.jaccard.aggregate_per_algorithm: final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-jaccard-matrix.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-jaccard-heatmap.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms)) - if _config.config.analysis_include_evaluation: + if 
_config.config.analysis.evaluation.include: final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) + # dummy file final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}dummy-edge.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_edge_pairs)) - - if _config.config.analysis_include_evaluation_aggregate_algo: + + if _config.config.analysis.evaluation.aggregate_per_algorithm: final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-for-{algorithm}-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs,algorithm=algorithms)) 
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-for-{algorithm}-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs,algorithm=algorithms)) + + if _config.config.analysis.pca.pca_chosen.include: + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) + + if _config.config.analysis.pca.pca_chosen.aggregate_per_algorithm: final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-per-algorithm-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-pca-chosen-pathway-per-algorithm-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) + + if _config.config.analysis.ensemble.evaluation.include: + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) + final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) + + if _config.config.analysis.ensemble.evaluation.aggregate_per_algorithm: final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes-per-algorithm-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) 
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-curve-ensemble-nodes-per-algorithm-nodes.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_node_pairs)) @@ -460,7 +484,7 @@ rule evaluation_per_algo_pr_per_pathways: run: node_table = Evaluation.from_file(input.node_gold_standard_file).node_table pr_df = Evaluation.node_precision_and_recall(input.pathways, node_table) - Evaluation.precision_and_recall_per_pathway(pr_df, output.node_pr_file, output.node_pr_png, include_aggregate_algo_eval) + Evaluation.precision_and_recall_per_pathway(pr_df, output.node_pr_file, output.node_pr_png, _config.config.analysis.evaluation.aggregate_per_algorithm) # Return pathway summary file per dataset def collect_summary_statistics_per_dataset(wildcards): @@ -508,7 +532,7 @@ rule evaluation_per_algo_pca_chosen: node_table = Evaluation.from_file(input.node_gold_standard_file).node_table pca_chosen_pathways = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, input.pathway_summary_file, out_dir) pr_df = Evaluation.node_precision_and_recall(pca_chosen_pathways, node_table) - Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, output.node_pca_chosen_pr_file, output.node_pca_chosen_pr_png, include_aggregate_algo_eval) + Evaluation.precision_and_recall_pca_chosen_pathway(pr_df, output.node_pca_chosen_pr_file, output.node_pca_chosen_pr_png, _config.config.analysis.pca.pca_chosen.aggregate_per_algorithm) # Return the dataset pickle file for a specific dataset def get_dataset_pickle_file(wildcards): @@ -551,7 +575,7 @@ rule evaluation_per_algo_ensemble_pr_curve: run: node_table = Evaluation.from_file(input.node_gold_standard_file).node_table node_ensembles_dict = Evaluation.edge_frequency_node_ensemble(node_table, input.ensemble_files, input.dataset_file) - Evaluation.precision_recall_curve_node_ensemble(node_ensembles_dict, node_table, output.node_pr_curve_png, output.node_pr_curve_file, include_aggregate_algo_eval) + 
Evaluation.precision_recall_curve_node_ensemble(node_ensembles_dict, node_table, output.node_pr_curve_png, output.node_pr_curve_file, _config.config.analysis.evaluation.aggregate_per_algorithm) rule evaluation_edge_dummy: input: diff --git a/config/config.yaml b/config/config.yaml index fc718e3c..a4595f1e 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -218,33 +218,50 @@ analysis: # Create Cytoscape session file with all pathway graphs for each dataset cytoscape: include: true - # Machine learning analysis (e.g. clustering) of the pathway output files for each dataset - ml: - # ml analysis per dataset + # The following analysis options also have an `aggregate_per_algorithm` option, + # which adds the respective analysis to an algorithm as a whole. + # This will only run if the adjacent `include` is true. + + # Principle component analysis of the pathway output files + pca: include: true - # adds ml analysis per algorithm output - # only runs for algorithms with multiple parameter combinations chosen aggregate_per_algorithm: true + pca_chosen: + include: true + aggregate_per_algorithm: true # specify how many principal components to calculate components: 2 # boolean to show the labels on the pca graph labels: true - # 'ward', 'complete', 'average', 'single' - # if linkage: ward, must use metric: euclidean - linkage: 'ward' - # 'euclidean', 'manhattan', 'cosine' - metric: 'euclidean' # controls whether kernel density estimation (KDE) is computed and visualized on top of PCA plots. # the coordinates of the KDE maximum (kde_peak) are also saved to the PCA coordinates output file. # KDE needs to be run in order to select a parameter combination with PCA because the maximum kernel density is used # to pick the 'best' parameter combination. 
kde: true - # removes empty pathways from consideration in ml analysis (pca only) + # removes empty pathways from consideration in ml analysis remove_empty_pathways: false + # Hierarchical agglomerative clustering analysis of the pathway output files + hac: + include: true + aggregate_per_algorithm: true + # 'ward', 'complete', 'average', 'single' + # if linkage: ward, must use metric: euclidean + linkage: 'ward' + # 'euclidean', 'manhattan', 'cosine' + metric: 'euclidean' + # Ensembling pathway output + ensemble: + include: true + aggregate_per_algorithm: true + evaluation: + include: true + aggregate_per_algorithm: true + # Jaccard pathway output + jaccard: + include: true evaluation: - # evaluation per dataset-goldstandard pair - # evaluation will not run unless ml include is set to true + # evaluation per dataset-goldstandard pair. + # This evaluation specifically generates precision-recall curves: + # to run evaluation on top of the other options, see the respective `evaluation` blocks under the other analyses. 
include: true - # adds evaluation per algorithm per dataset-goldstandard pair - # evaluation per algorithm will not run unless ml include and ml aggregate_per_algorithm are set to true aggregate_per_algorithm: true diff --git a/config/egfr.yaml b/config/egfr.yaml index b93c593c..8c6d2b76 100644 --- a/config/egfr.yaml +++ b/config/egfr.yaml @@ -124,16 +124,33 @@ reconstruction_settings: locations: reconstruction_dir: output/egfr analysis: - cytoscape: - include: true summary: include: true - ml: + cytoscape: + include: true + pca: include: true aggregate_per_algorithm: true + pca_chosen: + include: true + aggregate_per_algorithm: true + components: 2 labels: true kde: true remove_empty_pathways: true - evaluation: + hac: include: true aggregate_per_algorithm: true + linkage: 'ward' + metric: 'euclidean' + ensemble: + include: true + aggregate_per_algorithm: true + evaluation: + include: true + aggregate_per_algorithm: true + jaccard: + include: false + evaluation: + include: false + aggregate_per_algorithm: false diff --git a/docker-wrappers/SPRAS/example_config.yaml b/docker-wrappers/SPRAS/example_config.yaml index 1e7fd69c..adf044d5 100644 --- a/docker-wrappers/SPRAS/example_config.yaml +++ b/docker-wrappers/SPRAS/example_config.yaml @@ -137,18 +137,48 @@ analysis: include: true # Create Cytoscape session file with all pathway graphs for each dataset cytoscape: - include: false - # Machine learning analysis (e.g. clustering) of the pathway output files for each dataset - ml: include: true + # The following analysis options also have an `aggregate_per_algorithm` option, + # which adds the respective analysis to an algorithm as a whole. + # This will only run if the adjacent `include` is true. 
# Principal component analysis of the pathway output files
+ # For example, + jaccard: + include: true + evaluation: + include: true A gold standard dataset must include the following types of keys and files: diff --git a/docs/tutorial/beginner.rst b/docs/tutorial/beginner.rst index a0846666..f6f237b8 100644 --- a/docs/tutorial/beginner.rst +++ b/docs/tutorial/beginner.rst @@ -250,7 +250,7 @@ Analysis include: true cytoscape: include: true - ml: + pca: include: true SPRAS includes multiple downstream analyses that can be toggled on or diff --git a/docs/tutorial/intermediate.rst b/docs/tutorial/intermediate.rst index 1055b879..db7ab0b6 100644 --- a/docs/tutorial/intermediate.rst +++ b/docs/tutorial/intermediate.rst @@ -804,30 +804,35 @@ contains the following reconstructed subnetwork: MK01_HUMAN ERF_HUMAN 1 D MRE11_HUMAN RAD50_HUMAN 1 U -****************************** - Step 3: Use ML post-analysis -****************************** +Step 3: Use ML-related post-analysis +==================================== -3.1 Adding ML post-analysis to the intermediate configuration -============================================================= +3.1 Adding ML-related post-analysis to the intermediate configuration +--------------------------------------------------------------------- -To enable the ML analysis, update the analysis section in your -configuration file by setting ml to true. Your analysis section in the -configuration file should look like this: +To enable ML-related analysis, update the analysis section in your +configuration file by setting your desired ML analyses to true. Your +analysis section in the configuration file should look like this: .. code:: yaml analysis: - ml: + pca: + include: true + hac: + include: true + ensembling: + include: true + jaccard: include: true ... (other parameters preset) -``ml`` will perform unsupervised analyses such as principal component -analysis (PCA), hierarchical agglomerative clustering (HAC), ensembling, -and jaccard similarity comparisons of the pathways. 
+These settings will perform principal component analysis (PCA), +hierarchical agglomerative clustering (HAC), ensembling, and jaccard +similarity comparisons of the pathways, respectively. -- The ``ml`` section includes configurable parameters that let you - adjust the behavior of the analyses performed. +- These sections includes configurable parameters that let you adjust + the behavior of the analyses performed. With these updates, SPRAS will run the full set of unsupervised machine learning analyses across all outputs for a given dataset. diff --git a/spras/config/config.py b/spras/config/config.py index 0a4670a4..411b111e 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -65,29 +65,8 @@ def __init__(self, raw_config: dict[str, Any]): # Only includes algorithms that are set to be run with 'include: true'. self.algorithm_params: dict[str, dict[str, Any]] = dict() # A dict with the analysis settings - self.analysis_params = parsed_raw_config.analysis - # A dict with the evaluation settings - self.evaluation_params = self.analysis_params.evaluation - # A dict with the ML settings - self.ml_params = self.analysis_params.ml - # A Boolean specifying whether to run ML analysis for individual algorithms - self.analysis_include_ml_aggregate_algo = None - # A dict with the PCA settings - self.pca_params = None - # A dict with the hierarchical clustering settings - self.hac_params = None - # A Boolean specifying whether to run the summary analysis - self.analysis_include_summary = None - # A Boolean specifying whether to run the Cytoscape analysis - self.analysis_include_cytoscape = None - # A Boolean specifying whether to run the ML analysis - self.analysis_include_ml = None - # A Boolean specifying whether to run the Evaluation analysis - self.analysis_include_evaluation = None - # A Boolean specifying whether to run the ML per algorithm analysis - self.analysis_include_ml_aggregate_algo = None - # A Boolean specifying whether to run the evaluation 
per algorithm analysis - self.analysis_include_evaluation_aggregate_algo = None + self.analysis = parsed_raw_config.analysis + # Specifies whether the files should be OSDF-immutable (i.e. the file names change when the file itself changes) self.immutable_files = parsed_raw_config.immutable_files @@ -161,7 +140,7 @@ def process_algorithms(self, raw_config: RawConfig): Keys in the parameter dictionary are strings """ prior_params_hashes = set() - self.algorithm_params = dict() + self.algorithm_params: dict[str, Any] = dict() # We copy raw_config.algorithms to avoid mutating the original config # when we attach the SPRAS revision to algorithm names later. for alg in raw_config.algorithms[:]: @@ -218,67 +197,12 @@ def process_algorithms(self, raw_config: RawConfig): self.algorithm_params[alg.name][params_hash] = run_dict - def process_analysis(self, raw_config: RawConfig): - if not raw_config.analysis: - return - - # self.ml_params is a class, pca_params needs to be a dict. - self.pca_params = { - "components": self.ml_params.components, - "labels": self.ml_params.labels, - "kde": self.ml_params.kde, - "remove_empty_pathways": self.ml_params.remove_empty_pathways - } - - self.hac_params = { - "linkage": self.ml_params.linkage, - "metric": self.ml_params.metric - } - - self.analysis_include_summary = raw_config.analysis.summary.include - self.analysis_include_cytoscape = raw_config.analysis.cytoscape.include - self.analysis_include_ml = raw_config.analysis.ml.include - self.analysis_include_evaluation = raw_config.analysis.evaluation.include - - # Only run ML aggregate per algorithm if analysis include ML is set to True - if self.ml_params.aggregate_per_algorithm and self.analysis_include_ml: - self.analysis_include_ml_aggregate_algo = raw_config.analysis.ml.aggregate_per_algorithm - else: - self.analysis_include_ml_aggregate_algo = False - + def process_analysis(self): # Raises an error if Evaluation is enabled but no gold standard data is provided - if 
self.gold_standards == {} and self.analysis_include_evaluation: + if self.gold_standards == {} and self.analysis.evaluation.include: raise ValueError("Evaluation analysis cannot run as gold standard data not provided. " "Please set evaluation include to false or provide gold standard data.") - # Only run Evaluation if ML is set to True - if not self.analysis_include_ml: - self.analysis_include_evaluation = False - - # Only run Evaluation aggregate per algorithm if analysis include ML is set to True - if self.evaluation_params.aggregate_per_algorithm and self.analysis_include_evaluation: - self.analysis_include_evaluation_aggregate_algo = raw_config.analysis.evaluation.aggregate_per_algorithm - else: - self.analysis_include_evaluation_aggregate_algo = False - - # Only run Evaluation per algorithm if ML per algorithm is set to True - if not self.analysis_include_ml_aggregate_algo: - self.analysis_include_evaluation_aggregate_algo = False - - # Set kde to True if Evaluation is set to True - # When Evaluation is True, PCA is used to pick a single parameter combination for all algorithms with multiple - # parameter combinations and KDE is used to choose the parameter combination in the PC space - if self.analysis_include_evaluation and not self.pca_params["kde"]: - self.pca_params["kde"] = True - print("Setting kde to true; Evaluation analysis needs to run KDE for PCA-Chosen parameter selection.") - - # Set summary include to True if Evaluation is set to True - # When a PCA-chosen parameter set is chosen, summary statistics are used to resolve tiebreakers. 
- if self.analysis_include_evaluation and not self.analysis_include_summary: - self.analysis_include_summary = True - print("Setting summary include to true; Evaluation analysis needs to use summary statistics for PCA-Chosen parameter selection.") - - def process_config(self, raw_config: RawConfig): self.out_dir = raw_config.reconstruction_settings.locations.reconstruction_dir @@ -287,4 +211,4 @@ def process_config(self, raw_config: RawConfig): self.process_datasets(raw_config) self.process_algorithms(raw_config) - self.process_analysis(raw_config) + self.process_analysis() diff --git a/spras/config/schema.py b/spras/config/schema.py index 1a965c75..a26bdc44 100644 --- a/spras/config/schema.py +++ b/spras/config/schema.py @@ -10,9 +10,10 @@ - `CaseInsensitiveEnum` (see ./util.py) """ +import warnings from typing import Annotated -from pydantic import AfterValidator, BaseModel, ConfigDict +from pydantic import AfterValidator, BaseModel, ConfigDict, model_validator from spras.config.algorithms import AlgorithmUnion from spras.config.container_schema import ContainerSettings @@ -38,42 +39,74 @@ class CytoscapeAnalysis(BaseModel): # Note that CaseInsensitiveEnum is not pydantic: pydantic # has special support for enums, but we avoid the # pydantic-specific "model_config" key here for this reason. 
-class MlLinkage(CaseInsensitiveEnum): +class HacLinkage(CaseInsensitiveEnum): ward = 'ward' complete = 'complete' average = 'average' single = 'single' -class MlMetric(CaseInsensitiveEnum): +class HacMetric(CaseInsensitiveEnum): euclidean = 'euclidean' manhattan = 'manhattan' cosine = 'cosine' -class MlAnalysis(BaseModel): +def implies(source: bool, target: bool, source_str: str, target_str: str): + if target and not source: + warnings.warn(f"{source_str} is False but {target_str} is True; setting {target_str} to False", stacklevel=2) + return False + return target + +class AggregateAnalysis(BaseModel): include: bool aggregate_per_algorithm: bool = False + + model_config = ConfigDict(extra='forbid') + + @model_validator(mode='after') + def check_aggregate_when_include(self): + self.aggregate_per_algorithm = implies(self.include, self.aggregate_per_algorithm, "include", "aggregate_per_algorithm") + return self + +class EvaluationAnalysis(AggregateAnalysis): pass + +class PcaAnalysis(AggregateAnalysis): components: int = 2 labels: bool = True kde: bool = False remove_empty_pathways: bool = False - linkage: MlLinkage = MlLinkage.ward - metric: MlMetric = MlMetric.euclidean + pca_chosen: EvaluationAnalysis = EvaluationAnalysis(include=False) - model_config = ConfigDict(extra='forbid') + @model_validator(mode='after') + def check_include_when_evaluation_include(self): + self.pca_chosen.include = implies(self.include, self.pca_chosen.include, "include", "pca_chosen.include") + self.pca_chosen.aggregate_per_algorithm = implies(self.aggregate_per_algorithm, self.pca_chosen.aggregate_per_algorithm, "aggregate_per_algorithm", "pca_chosen.aggregate_per_algorithm") + return self -class EvaluationAnalysis(BaseModel): - include: bool - aggregate_per_algorithm: bool = False +class HacAnalysis(AggregateAnalysis): + linkage: HacLinkage = HacLinkage.ward + metric: HacMetric = HacMetric.euclidean - model_config = ConfigDict(extra='forbid') +class EnsembleAnalysis(AggregateAnalysis): 
+ evaluation: EvaluationAnalysis = EvaluationAnalysis(include=False) + + @model_validator(mode='after') + def check_include_when_evaluation_include(self): + self.evaluation.include = implies(self.include, self.evaluation.include, "include", "evaluation.include") + self.evaluation.aggregate_per_algorithm = implies(self.aggregate_per_algorithm, self.evaluation.aggregate_per_algorithm, "aggregate_per_algorithm", "evaluation.aggregate_per_algorithm") + return self +class JaccardAnalysis(AggregateAnalysis): pass class Analysis(BaseModel): summary: SummaryAnalysis = SummaryAnalysis(include=False) cytoscape: CytoscapeAnalysis = CytoscapeAnalysis(include=False) - ml: MlAnalysis = MlAnalysis(include=False) + pca: PcaAnalysis = PcaAnalysis(include=False) + hac: HacAnalysis = HacAnalysis(include=False) + jaccard: JaccardAnalysis = JaccardAnalysis(include=False) + ensemble: EnsembleAnalysis = EnsembleAnalysis(include=False) evaluation: EvaluationAnalysis = EvaluationAnalysis(include=False) + """Enables PR curve evaluation.""" - model_config = ConfigDict(extra='forbid') + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) # The default length of the truncated hash used to identify parameter combinations diff --git a/test/test_config.py b/test/test_config.py index 41551c38..574db249 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -94,13 +94,29 @@ def get_test_config(): "summary": { "include": False }, - "ml": { + "pca": { "include": False, "aggregate_per_algorithm": False, + "pca_chosen": { + "include": False + } + }, + "hac": { + "include": False, + "aggregate_per_algorithm": False + }, + "ensemble": { + "include": False, + "evaluation": { + "include": False + } }, "cytoscape": { "include": False }, + "jaccard": { + "include": False + }, "evaluation": { "include": False, "aggregate_per_algorithm": False @@ -304,54 +320,24 @@ def test_config_values(self): MEOParams(local_search=False, max_path_length=2) ]) - 
@pytest.mark.parametrize("ml_include, eval_include, expected_ml, expected_eval", [ + @pytest.mark.parametrize("include, eval_include, expected_include, expected_eval", [ (True, True, True, True), (True, False, True, False), (False, True, False, False), (False, False, False, False) ]) - def test_eval_ml_coupling(self, ml_include, eval_include, expected_ml, expected_eval): - test_config = get_test_config() - test_config["analysis"]["ml"]["include"] = ml_include - test_config["analysis"]["evaluation"]["include"] = eval_include - config.init_global(test_config) - - assert config.config.analysis_include_ml == expected_ml - assert config.config.analysis_include_evaluation == expected_eval - - @pytest.mark.parametrize("ml_include, ml_agg_include, expected_ml, expected_ml_agg", [ - (True, True, True, True), - (True, False, True, False), - (False, True, False, False), - (False, False, False, False) + @pytest.mark.parametrize("analysis_type, evaluation_type", [ + ("pca", "pca_chosen"), + ("ensemble", "evaluation") ]) - def test_ml_agg_algo_coupling(self, ml_include, ml_agg_include, expected_ml, expected_ml_agg): + def test_eval_pca_coupling(self, include, eval_include, expected_include, expected_eval, analysis_type, evaluation_type): test_config = get_test_config() - test_config["analysis"]["ml"]["include"] = ml_include - test_config["analysis"]["ml"]["aggregate_per_algorithm"] = ml_agg_include + test_config["analysis"][analysis_type]["include"] = include + test_config["analysis"][analysis_type][evaluation_type]["include"] = eval_include config.init_global(test_config) - assert config.config.analysis_include_ml == expected_ml - assert config.config.analysis_include_ml_aggregate_algo == expected_ml_agg - - @pytest.mark.parametrize("eval_include, agg_algo, expected_eval, expected_agg_algo", [ - (True, True, True, True), - (True, False, True, False), - (False, True, False, False), - (False, False, False, False), - ]) - def test_eval_agg_algo_coupling(self, eval_include, 
agg_algo, expected_eval, expected_agg_algo): - test_config = get_test_config() - test_config["analysis"]["ml"]["include"] = True - test_config["analysis"]["ml"]["aggregate_per_algorithm"] = True - - test_config["analysis"]["evaluation"]["include"] = eval_include - test_config["analysis"]["evaluation"]["aggregate_per_algorithm"] = agg_algo - - config.init_global(test_config) - - assert config.config.analysis_include_evaluation == expected_eval - assert config.config.analysis_include_evaluation_aggregate_algo == expected_agg_algo + assert vars(config.config.analysis)[analysis_type].include == expected_include + assert vars(vars(config.config.analysis)[analysis_type])[evaluation_type].include == expected_eval @pytest.mark.parametrize("ml_include, ml_agg, eval_include, eval_agg, expected_ml, expected_ml_agg, expected_eval, expected_eval_agg", [ (False, True, True, True, False, False, False, False), @@ -360,61 +346,24 @@ def test_eval_agg_algo_coupling(self, eval_include, agg_algo, expected_eval, exp (True, True, True, True, True, True, True, True), (True, False, False, False, True, False, False, False), ]) + @pytest.mark.parametrize("analysis_type, evaluation_type", [ + ("pca", "pca_chosen"), + ("ensemble", "evaluation") + ]) def test_eval_ml_agg_algo_coupling(self, ml_include, ml_agg, eval_include, eval_agg, expected_ml, expected_ml_agg, - expected_eval, expected_eval_agg): - # the value of ml include and ml aggregate_per_algorithm can affect the value of evaluation include and + expected_eval, expected_eval_agg, analysis_type, evaluation_type): + # the value of pca include and pca aggregate_per_algorithm can affect the value of evaluation include and # evaluation aggregate_per_algorithm test_config = get_test_config() - test_config["analysis"]["ml"]["include"] = ml_include - test_config["analysis"]["ml"]["aggregate_per_algorithm"] = ml_agg - test_config["analysis"]["evaluation"]["include"] = eval_include - 
test_config["analysis"]["evaluation"]["aggregate_per_algorithm"] = eval_agg - - config.init_global(test_config) - - assert config.config.analysis_include_ml == expected_ml - assert config.config.analysis_include_ml_aggregate_algo == expected_ml_agg - assert config.config.analysis_include_evaluation == expected_eval - assert config.config.analysis_include_evaluation_aggregate_algo == expected_eval_agg - - @pytest.mark.parametrize("eval_include, kde, expected_eval, expected_kde", [ - (True, True, True, True), - (True, False, True, True), - (False, True, False, True), - (False, False, False, False), - ]) - def test_eval_kde_coupling(self, eval_include, kde, expected_eval, expected_kde): - test_config = get_test_config() - test_config["analysis"]["ml"]["include"] = True - # dealing with other coupling issue - test_config["analysis"]["summary"]["include"] = True - - test_config["analysis"]["ml"]["kde"] = kde - test_config["analysis"]["evaluation"]["include"] = eval_include + test_config["analysis"][analysis_type]["include"] = ml_include + test_config["analysis"][analysis_type]["aggregate_per_algorithm"] = ml_agg + test_config["analysis"][analysis_type][evaluation_type]["include"] = eval_include + test_config["analysis"][analysis_type][evaluation_type]["aggregate_per_algorithm"] = eval_agg config.init_global(test_config) - assert config.config.analysis_include_evaluation == expected_eval - assert config.config.pca_params["kde"] == expected_kde - - @pytest.mark.parametrize("eval_include, summary_include, expected_eval, expected_summary", [ - (True, True, True, True), - (True, False, True, True), - (False, True, False, True), - (False, False, False, False), - ]) - def test_eval_summary_coupling(self, eval_include, summary_include, expected_eval, expected_summary): - test_config = get_test_config() - # dealing with other coupling issue - test_config["analysis"]["ml"]["include"] = True - test_config["analysis"]["ml"]["kde"] = True - - 
test_config["analysis"]["summary"]["include"] = summary_include - test_config["analysis"]["evaluation"]["include"] = eval_include - - config.init_global(test_config) - - assert config.config.analysis_include_evaluation == expected_eval - assert config.config.analysis_include_summary == expected_summary - + assert vars(config.config.analysis)[analysis_type].include == expected_ml, f"Include was not {expected_ml}!" + assert vars(config.config.analysis)[analysis_type].aggregate_per_algorithm == expected_ml_agg, f"Aggregate per algorithm was not {expected_ml_agg}!" + assert vars(vars(config.config.analysis)[analysis_type])[evaluation_type].include == expected_eval, f"evaluation include was not {expected_eval}!" + assert vars(vars(config.config.analysis)[analysis_type])[evaluation_type].aggregate_per_algorithm == expected_eval_agg, f"evaluation aggregate per algorithm was not {expected_eval_agg}!"