From 6ec4f6237970a9cbf4cb8defd42ab8620b6d7c92 Mon Sep 17 00:00:00 2001
From: "Tristan F." <LeoDog896@hotmail.com>
Date: Fri, 10 Oct 2025 06:32:06 +0000
Subject: [PATCH 01/17] refactor: separate statistic computation

We also make the computation lazy: only the statistic groups that contain a
requested statistic are computed.
---
 spras/analysis/summary.py | 44 +++-----------------
 spras/statistics.py       | 88 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+), 38 deletions(-)
 create mode 100644 spras/statistics.py

diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py
index c8abc1cad..fd70db8f3 100644
--- a/spras/analysis/summary.py
+++ b/spras/analysis/summary.py
@@ -1,10 +1,11 @@
 from pathlib import Path
-from statistics import median
 from typing import Iterable
 
 import networkx as nx
 import pandas as pd
 
+from spras.statistics import compute_statistics, statistics_options
+
 
 def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict],
                        algo_with_params: list) -> pd.DataFrame:
@@ -47,44 +48,11 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
 
         # Save the network name, number of nodes, number edges, and number of connected components
         nw_name = str(file_path)
-        number_nodes = nw.number_of_nodes()
-        number_edges = nw.number_of_edges()
-        ncc = nx.number_connected_components(nw)
-
-        # Save the max/median degree, average clustering coefficient, and density
-        if number_nodes == 0:
-            max_degree = 0
-            median_degree = 0.0
-            density = 0.0
-        else:
-            degrees = [deg for _, deg in nw.degree()]
-            max_degree = max(degrees)
-            median_degree = median(degrees)
-            density = nx.density(nw)
-
-        cc = list(nx.connected_components(nw))
-        # Save the max diameter
-        # Use diameter only for components with ≥2 nodes (singleton components have diameter 0)
-        diameters = [
-            nx.diameter(nw.subgraph(c).copy()) if len(c) > 1 else 0
-            for c in cc
-        ]
-        max_diameter = max(diameters, default=0)
-
-        # Save the average path lengths
-        # Compute average shortest path length only for components with ≥2 nodes (undefined for singletons, set to 0.0)
-        avg_path_lengths = [
-            nx.average_shortest_path_length(nw.subgraph(c).copy()) if len(c) > 1 else 0.0
-            for c in cc
-        ]
-
-        if len(avg_path_lengths) != 0:
-            avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths)
-        else:
-            avg_path_len = 0.0
+
+        graph_statistics = compute_statistics(nw, statistics_options)
 
         # Initialize list to store current network information
-        cur_nw_info = [nw_name, number_nodes, number_edges, ncc, density, max_degree, median_degree, max_diameter, avg_path_len]
+        cur_nw_info = [nw_name, *graph_statistics]
 
         # Iterate through each node property and save the intersection with the current network
         for node_list in nodes_by_col:
@@ -104,7 +72,7 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
         nw_info.append(cur_nw_info)
 
     # Prepare column names
-    col_names = ['Name', 'Number of nodes', 'Number of edges', 'Number of connected components', 'Density', 'Max degree', 'Median degree', 'Max diameter', 'Average path length']
+    col_names = ['Name', *statistics_options]
     col_names.extend(nodes_by_col_labs)
     col_names.append('Parameter combination')
 
diff --git a/spras/statistics.py b/spras/statistics.py
new file mode 100644
index 000000000..843e5292a
--- /dev/null
+++ b/spras/statistics.py
@@ -0,0 +1,88 @@
+"""
+Graph statistics, used to power summary.py.
+
+We allow for arbitrary computation of any specific statistic on some graph,
+computing more than necessary if we have dependencies. See the top level
+`statistics_computation` dictionary for usage.
+"""
+
+import itertools
+import networkx as nx
+from statistics import median
+from typing import Callable
+
+def compute_degree(graph: nx.DiGraph) -> tuple[int, float]:
+    """
+    Computes the (max, median) degree of a `graph`.
+    """
+    # number_of_nodes is a cheap call
+    if graph.number_of_nodes() == 0:
+        return (0, 0.0)
+    else:
+        degrees = [deg for _, deg in graph.degree()]
+        return max(degrees), median(degrees)
+
+def compute_on_cc(graph: nx.DiGraph) -> tuple[int, float]:
+    cc = list(nx.connected_components(graph))
+    # Save the max diameter
+    # Use diameter only for components with ≥2 nodes (singleton components have diameter 0)
+    diameters = [
+        nx.diameter(graph.subgraph(c).copy()) if len(c) > 1 else 0
+        for c in cc
+    ]
+    max_diameter = max(diameters, default=0)
+
+    # Save the average path lengths
+    # Compute average shortest path length only for components with ≥2 nodes (undefined for singletons, set to 0.0)
+    avg_path_lengths = [
+        nx.average_shortest_path_length(graph.subgraph(c).copy()) if len(c) > 1 else 0.0
+        for c in cc
+    ]
+
+    if len(avg_path_lengths) != 0:
+        avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths)
+    else:
+        avg_path_len = 0.0
+    
+    return max_diameter, avg_path_len
+
+# The type signature on here is quite bad. I would like to say that an n-tuple has n-outputs.
+statistics_computation: dict[tuple[str, ...], Callable[[nx.DiGraph], tuple[float | int, ...]]] = {
+    ('Number of nodes',): lambda graph : (graph.number_of_nodes(),),
+    ('Number of edges',): lambda graph : (graph.number_of_edges(),),
+    ('Number of connected components',): lambda graph : (nx.number_connected_components(graph),),
+    ('Density',): lambda graph : (nx.density(graph),),
+    
+    ('Max degree', 'Median degree'): compute_degree,
+    ('Max diameter', 'Average path length'): compute_on_cc,
+}
+
+# All of the keys inside statistics_computation, flattened.
+statistics_options: list[str] = list(itertools.chain(*(list(key) for key in statistics_computation.keys())))
+
+def compute_statistics(graph: nx.DiGraph, statistics: list[str]) -> dict[str, float | int]:
+    """
+    Computes `statistics` for a graph corresponding to the top-level `statistics` dictionary
+    in this file. 
+    """
+
+    # early-scan cutoff for statistics:
+    # we want to err as soon as possible
+    for stat in statistics:
+        if stat not in statistics_options:
+            raise RuntimeError(f"Statistic {stat} not a computable statistics! Available statistics: {statistics_options}")
+    
+    # now, we can compute statistics only
+    computed_statistics: dict[str, float | int] = dict()
+    for statistic_tuple, compute in statistics_computation.items():
+        # when we want them
+        if not set(statistic_tuple).isdisjoint(set(statistics)):
+            computed_tuple = compute(graph)
+            assert len(statistic_tuple) == computed_tuple, f"bad tuple length for {statistic_tuple}"
+
+            current_computed_statistics = zip(statistic_tuple, computed_tuple)
+            for stat, value in current_computed_statistics:
+                computed_statistics[stat] = value
+
+    # (and return only the statistics we wanted)
+    return {key: computed_statistics[key] for key in statistics}

From 9987189d8e0d9a9006ae1897cd44836500a5c906 Mon Sep 17 00:00:00 2001
From: "Tristan F." <LeoDog896@hotmail.com>
Date: Fri, 10 Oct 2025 06:48:54 +0000
Subject: [PATCH 02/17] fix: correct tuple assumption

---
 spras/statistics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spras/statistics.py b/spras/statistics.py
index 843e5292a..ac91b80a9 100644
--- a/spras/statistics.py
+++ b/spras/statistics.py
@@ -78,7 +78,7 @@ def compute_statistics(graph: nx.DiGraph, statistics: list[str]) -> dict[str, fl
         # when we want them
         if not set(statistic_tuple).isdisjoint(set(statistics)):
             computed_tuple = compute(graph)
-            assert len(statistic_tuple) == computed_tuple, f"bad tuple length for {statistic_tuple}"
+            assert len(statistic_tuple) == len(computed_tuple), f"bad tuple length for {statistic_tuple}"
 
             current_computed_statistics = zip(statistic_tuple, computed_tuple)
             for stat, value in current_computed_statistics:

From 25eef5e72aee4fb7aea6f6b5e9d11dff7fd5be16 Mon Sep 17 00:00:00 2001
From: "Tristan F." <LeoDog896@hotmail.com>
Date: Fri, 10 Oct 2025 07:06:46 +0000
Subject: [PATCH 03/17] fix: stably use graph statistic values

---
 spras/analysis/summary.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py
index fd70db8f3..432dba0a4 100644
--- a/spras/analysis/summary.py
+++ b/spras/analysis/summary.py
@@ -52,7 +52,7 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
         graph_statistics = compute_statistics(nw, statistics_options)
 
         # Initialize list to store current network information
-        cur_nw_info = [nw_name, *graph_statistics]
+        cur_nw_info = [nw_name, *graph_statistics.values()]
 
         # Iterate through each node property and save the intersection with the current network
         for node_list in nodes_by_col:

From cb373c130760c7040b16ec03ba1d2673e343465b Mon Sep 17 00:00:00 2001
From: "Tristan F." <pub.tristanf@gmail.com>
Date: Wed, 29 Oct 2025 17:56:22 -0700
Subject: [PATCH 04/17] style: fmt

---
 spras/config/config.py |  4 ++--
 spras/statistics.py    | 12 +++++++-----
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/spras/config/config.py b/spras/config/config.py
index 22e655941..add815d9d 100644
--- a/spras/config/config.py
+++ b/spras/config/config.py
@@ -71,7 +71,7 @@ def __init__(self, raw_config: dict[str, Any]):
         self.container_prefix: str = DEFAULT_CONTAINER_PREFIX
         # A Boolean specifying whether to unpack singularity containers. Default is False
         self.unpack_singularity = False
-        # A Boolean indiciating whether to enable container runtime profiling (apptainer/singularity only)
+        # A Boolean indicating whether to enable container runtime profiling (apptainer/singularity only)
         self.enable_profiling = False
         # A dictionary to store configured datasets against which SPRAS will be run
         self.datasets = None
@@ -308,7 +308,7 @@ def process_config(self, raw_config: RawConfig):
         if raw_config.container_registry and raw_config.container_registry.base_url != "" and raw_config.container_registry.owner != "":
             self.container_prefix = raw_config.container_registry.base_url + "/" + raw_config.container_registry.owner
 
-        if raw_config.enable_profiling and not raw_config.container_framework in ["singularity", "apptainer"]:
+        if raw_config.enable_profiling and raw_config.container_framework not in ["singularity", "apptainer"]:
             warnings.warn("enable_profiling is set to true, but the container framework is not singularity/apptainer. This setting will have no effect.")
         self.enable_profiling = raw_config.enable_profiling
 
diff --git a/spras/statistics.py b/spras/statistics.py
index ac91b80a9..49ae8b3fc 100644
--- a/spras/statistics.py
+++ b/spras/statistics.py
@@ -7,10 +7,12 @@
 """
 
 import itertools
-import networkx as nx
 from statistics import median
 from typing import Callable
 
+import networkx as nx
+
+
 def compute_degree(graph: nx.DiGraph) -> tuple[int, float]:
     """
     Computes the (max, median) degree of a `graph`.
@@ -43,7 +45,7 @@ def compute_on_cc(graph: nx.DiGraph) -> tuple[int, float]:
         avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths)
     else:
         avg_path_len = 0.0
-    
+
     return max_diameter, avg_path_len
 
 # The type signature on here is quite bad. I would like to say that an n-tuple has n-outputs.
@@ -52,7 +54,7 @@ def compute_on_cc(graph: nx.DiGraph) -> tuple[int, float]:
     ('Number of edges',): lambda graph : (graph.number_of_edges(),),
     ('Number of connected components',): lambda graph : (nx.number_connected_components(graph),),
     ('Density',): lambda graph : (nx.density(graph),),
-    
+
     ('Max degree', 'Median degree'): compute_degree,
     ('Max diameter', 'Average path length'): compute_on_cc,
 }
@@ -63,7 +65,7 @@ def compute_on_cc(graph: nx.DiGraph) -> tuple[int, float]:
 def compute_statistics(graph: nx.DiGraph, statistics: list[str]) -> dict[str, float | int]:
     """
     Computes `statistics` for a graph corresponding to the top-level `statistics` dictionary
-    in this file. 
+    in this file.
     """
 
     # early-scan cutoff for statistics:
@@ -71,7 +73,7 @@ def compute_statistics(graph: nx.DiGraph, statistics: list[str]) -> dict[str, fl
     for stat in statistics:
         if stat not in statistics_options:
             raise RuntimeError(f"Statistic {stat} not a computable statistics! Available statistics: {statistics_options}")
-    
+
     # now, we can compute statistics only
     computed_statistics: dict[str, float | int] = dict()
     for statistic_tuple, compute in statistics_computation.items():

From 898d568a49053467d74af1cb952bdceac400436d Mon Sep 17 00:00:00 2001
From: "Tristan F." <pub.tristanf@gmail.com>
Date: Wed, 29 Oct 2025 18:15:23 -0700
Subject: [PATCH 05/17] style: specify zip strict

---
 spras/statistics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spras/statistics.py b/spras/statistics.py
index 49ae8b3fc..1ebe7cc62 100644
--- a/spras/statistics.py
+++ b/spras/statistics.py
@@ -82,7 +82,7 @@ def compute_statistics(graph: nx.DiGraph, statistics: list[str]) -> dict[str, fl
             computed_tuple = compute(graph)
             assert len(statistic_tuple) == len(computed_tuple), f"bad tuple length for {statistic_tuple}"
 
-            current_computed_statistics = zip(statistic_tuple, computed_tuple)
+            current_computed_statistics = zip(statistic_tuple, computed_tuple, strict=True)
             for stat, value in current_computed_statistics:
                 computed_statistics[stat] = value
 

From c675eced3b62b8a62204d9f6105628e1cdc09045 Mon Sep 17 00:00:00 2001
From: "Tristan F." <LeoDog896@hotmail.com>
Date: Thu, 6 Nov 2025 02:22:45 +0000
Subject: [PATCH 06/17] fix: make undirected for determining number of
 connected components

---
 spras/statistics.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/spras/statistics.py b/spras/statistics.py
index 1ebe7cc62..222051d23 100644
--- a/spras/statistics.py
+++ b/spras/statistics.py
@@ -24,7 +24,8 @@ def compute_degree(graph: nx.DiGraph) -> tuple[int, float]:
         degrees = [deg for _, deg in graph.degree()]
         return max(degrees), median(degrees)
 
-def compute_on_cc(graph: nx.DiGraph) -> tuple[int, float]:
+def compute_on_cc(directed_graph: nx.DiGraph) -> tuple[int, float]:
+    graph: nx.Graph = directed_graph.to_undirected()
     cc = list(nx.connected_components(graph))
     # Save the max diameter
     # Use diameter only for components with ≥2 nodes (singleton components have diameter 0)
@@ -52,7 +53,7 @@ def compute_on_cc(graph: nx.DiGraph) -> tuple[int, float]:
 statistics_computation: dict[tuple[str, ...], Callable[[nx.DiGraph], tuple[float | int, ...]]] = {
     ('Number of nodes',): lambda graph : (graph.number_of_nodes(),),
     ('Number of edges',): lambda graph : (graph.number_of_edges(),),
-    ('Number of connected components',): lambda graph : (nx.number_connected_components(graph),),
+    ('Number of connected components',): lambda graph : (nx.number_connected_components(graph.to_undirected()),),
     ('Density',): lambda graph : (nx.density(graph),),
 
     ('Max degree', 'Median degree'): compute_degree,

From 1ca730e4cd36e0542fbd90496d972997db340d19 Mon Sep 17 00:00:00 2001
From: "Tristan F." <pub.tristanf@gmail.com>
Date: Tue, 13 Jan 2026 12:13:21 -0800
Subject: [PATCH 07/17] feat: snakemake-based summary generation

---
 Snakefile                 | 24 ++++++++++++++++++++----
 spras/analysis/summary.py | 15 +++++++++------
 spras/statistics.py       | 28 ++--------------------------
 3 files changed, 31 insertions(+), 36 deletions(-)

diff --git a/Snakefile b/Snakefile
index cf075b0fa..060696c71 100644
--- a/Snakefile
+++ b/Snakefile
@@ -2,10 +2,11 @@ import os
 from spras import runner
 import shutil
 import yaml
-from spras.dataset import Dataset
-from spras.evaluation import Evaluation
 from spras.analysis import ml, summary, cytoscape
 import spras.config.config as _config
+from spras.dataset import Dataset
+from spras.evaluation import Evaluation
+from spras.statistics import from_edgelist, statistics_computation, statistics_options
 
 # Snakemake updated the behavior in the 6.5.0 release https://github.com/snakemake/snakemake/pull/1037
 # and using the wrong separator prevents Snakemake from matching filenames to the rules that can produce them
@@ -310,18 +311,33 @@ rule viz_cytoscape:
     run:
         cytoscape.run_cytoscape(input.pathways, output.session, container_settings)
 
+for keys, values in statistics_computation.items():
+    pythonic_name = 'generate_' + '_and_'.join([key.lower().replace(' ', '_') for key in keys])
+    rule:
+        name: pythonic_name
+        input: pathway_file = rules.reconstruct.output.pathway_file
+        output: [SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'statistics', f'{key}.txt']) for key in keys]
+        run:
+            (Path(input.pathway_file).parent / 'statistics').mkdir(exist_ok=True)
+            graph = from_edgelist(input.pathway_file)
+            for computed, output in zip(values(graph), output):
+                Path(output).write_text(str(computed))
 
 # Write a single summary table for all pathways for each dataset
 rule summary_table:
     input:
         # Collect all pathways generated for the dataset
         pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params),
-        dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle'])
+        dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle']),
+        # Collect all possible options
+        statistics = expand(
+            '{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}statistics{sep}{statistic}.txt',
+            out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, statistic=statistics_options)
     output: summary_table = SEP.join([out_dir, '{dataset}-pathway-summary.txt'])
     run:
         # Load the node table from the pickled dataset file
         node_table = Dataset.from_file(input.dataset_file).node_table
-        summary_df = summary.summarize_networks(input.pathways, node_table, algorithm_params, algorithms_with_params)
+        summary_df = summary.summarize_networks(input.pathways, node_table, algorithm_params, algorithms_with_params, input.statistics)
         summary_df.to_csv(output.summary_table, sep='\t', index=False)
 
 # Cluster the output pathways for each dataset
diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py
index cdffe0f68..0bd025aa4 100644
--- a/spras/analysis/summary.py
+++ b/spras/analysis/summary.py
@@ -1,14 +1,14 @@
+import ast
 from pathlib import Path
 from typing import Iterable
 
-import networkx as nx
 import pandas as pd
 
-from spras.statistics import compute_statistics, statistics_options
+from spras.statistics import from_edgelist
 
 
 def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict],
-                       algo_with_params: list) -> pd.DataFrame:
+                       algo_with_params: list, statistics_files: list) -> pd.DataFrame:
     """
     Generate a table that aggregates summary information about networks in file_paths, including which nodes are present
     in node_table columns. Network directionality is ignored and all edges are treated as undirected. The order of the
@@ -44,15 +44,16 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
             lines = f.readlines()[1:]  # skip the header line
 
         # directed or mixed graphs are parsed and summarized as an undirected graph
-        nw = nx.read_edgelist(lines, data=(('weight', float), ('Direction', str)))
+        nw = from_edgelist(lines)
 
         # Save the network name, number of nodes, number edges, and number of connected components
         nw_name = str(file_path)
 
-        graph_statistics = compute_statistics(nw, statistics_options)
+        # We use literal_eval here to easily coerce to either ints or floats, depending.
+        graph_statistics = [ast.literal_eval(Path(file).read_text()) for file in statistics_files]
 
         # Initialize list to store current network information
-        cur_nw_info = [nw_name, *graph_statistics.values()]
+        cur_nw_info = [nw_name, *graph_statistics]
 
         # Iterate through each node property and save the intersection with the current network
         for node_list in nodes_by_col:
@@ -73,6 +74,8 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
         # Save the current network information to the network summary list
         nw_info.append(cur_nw_info)
 
+    # Get the list of statistic names by their file names
+    statistics_options = [Path(file).stem for file in statistics_files]
     # Prepare column names
     col_names = ['Name', *statistics_options]
     col_names.extend(nodes_by_col_labs)
diff --git a/spras/statistics.py b/spras/statistics.py
index 222051d23..7bc8253c6 100644
--- a/spras/statistics.py
+++ b/spras/statistics.py
@@ -63,29 +63,5 @@ def compute_on_cc(directed_graph: nx.DiGraph) -> tuple[int, float]:
 # All of the keys inside statistics_computation, flattened.
 statistics_options: list[str] = list(itertools.chain(*(list(key) for key in statistics_computation.keys())))
 
-def compute_statistics(graph: nx.DiGraph, statistics: list[str]) -> dict[str, float | int]:
-    """
-    Computes `statistics` for a graph corresponding to the top-level `statistics` dictionary
-    in this file.
-    """
-
-    # early-scan cutoff for statistics:
-    # we want to err as soon as possible
-    for stat in statistics:
-        if stat not in statistics_options:
-            raise RuntimeError(f"Statistic {stat} not a computable statistics! Available statistics: {statistics_options}")
-
-    # now, we can compute statistics only
-    computed_statistics: dict[str, float | int] = dict()
-    for statistic_tuple, compute in statistics_computation.items():
-        # when we want them
-        if not set(statistic_tuple).isdisjoint(set(statistics)):
-            computed_tuple = compute(graph)
-            assert len(statistic_tuple) == len(computed_tuple), f"bad tuple length for {statistic_tuple}"
-
-            current_computed_statistics = zip(statistic_tuple, computed_tuple, strict=True)
-            for stat, value in current_computed_statistics:
-                computed_statistics[stat] = value
-
-    # (and return only the statistics we wanted)
-    return {key: computed_statistics[key] for key in statistics}
+def from_edgelist(lines) -> nx.Graph:
+    return nx.read_edgelist(lines, data=(('weight', float), ('Direction', str)))

From d67186dcd5679c44264b24836d86f25816aecb52 Mon Sep 17 00:00:00 2001
From: "Tristan F." <pub.tristanf@gmail.com>
Date: Tue, 13 Jan 2026 12:19:43 -0800
Subject: [PATCH 08/17] fix(Snakefile): use parse_output for edgelist parsing

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 060696c71..532be6fe7 100644
--- a/Snakefile
+++ b/Snakefile
@@ -315,7 +315,7 @@ for keys, values in statistics_computation.items():
     pythonic_name = 'generate_' + '_and_'.join([key.lower().replace(' ', '_') for key in keys])
     rule:
         name: pythonic_name
-        input: pathway_file = rules.reconstruct.output.pathway_file
+        input: pathway_file = rules.parse_output.output.standardized_file
         output: [SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'statistics', f'{key}.txt']) for key in keys]
         run:
             (Path(input.pathway_file).parent / 'statistics').mkdir(exist_ok=True)

From fd483c3af9ab15bb5b1717b6a33b1ae338b25472 Mon Sep 17 00:00:00 2001
From: "Tristan F." <pub.tristanf@gmail.com>
Date: Tue, 13 Jan 2026 12:37:46 -0800
Subject: [PATCH 09/17] fix: parse edgelist with rank, embed header skip inside
 from_edgelist

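The previous from_edgelist parsed the second column as a float 'weight' and
left skipping the header line to its callers; this appears to have been
incorrect behavior.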
---
 Snakefile                     |  4 ++--
 spras/analysis/summary.py     |  7 ++-----
 spras/statistics.py           |  7 +++++--
 test/analysis/test_summary.py | 24 ++++++++++++------------
 4 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/Snakefile b/Snakefile
index 532be6fe7..9673b80b0 100644
--- a/Snakefile
+++ b/Snakefile
@@ -6,7 +6,7 @@ from spras.analysis import ml, summary, cytoscape
 import spras.config.config as _config
 from spras.dataset import Dataset
 from spras.evaluation import Evaluation
-from spras.statistics import from_edgelist, statistics_computation, statistics_options
+from spras.statistics import from_output_pathway, statistics_computation, statistics_options
 
 # Snakemake updated the behavior in the 6.5.0 release https://github.com/snakemake/snakemake/pull/1037
 # and using the wrong separator prevents Snakemake from matching filenames to the rules that can produce them
@@ -319,7 +319,7 @@ for keys, values in statistics_computation.items():
         output: [SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'statistics', f'{key}.txt']) for key in keys]
         run:
             (Path(input.pathway_file).parent / 'statistics').mkdir(exist_ok=True)
-            graph = from_edgelist(input.pathway_file)
+            graph = from_output_pathway(input.pathway_file)
             for computed, output in zip(values(graph), output):
                 Path(output).write_text(str(computed))
 
diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py
index 0bd025aa4..1f627493f 100644
--- a/spras/analysis/summary.py
+++ b/spras/analysis/summary.py
@@ -4,7 +4,7 @@
 
 import pandas as pd
 
-from spras.statistics import from_edgelist
+from spras.statistics import from_output_pathway
 
 
 def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict],
@@ -40,11 +40,8 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
 
     # Iterate through each network file path
     for index, file_path in enumerate(sorted(file_paths)):
-        with open(file_path, 'r') as f:
-            lines = f.readlines()[1:]  # skip the header line
-
         # directed or mixed graphs are parsed and summarized as an undirected graph
-        nw = from_edgelist(lines)
+        nw = from_output_pathway(file_path)
 
         # Save the network name, number of nodes, number edges, and number of connected components
         nw_name = str(file_path)
diff --git a/spras/statistics.py b/spras/statistics.py
index 7bc8253c6..5399da390 100644
--- a/spras/statistics.py
+++ b/spras/statistics.py
@@ -63,5 +63,8 @@ def compute_on_cc(directed_graph: nx.DiGraph) -> tuple[int, float]:
 # All of the keys inside statistics_computation, flattened.
 statistics_options: list[str] = list(itertools.chain(*(list(key) for key in statistics_computation.keys())))
 
-def from_edgelist(lines) -> nx.Graph:
-    return nx.read_edgelist(lines, data=(('weight', float), ('Direction', str)))
+def from_output_pathway(lines) -> nx.Graph:
+    with open(lines, 'r') as f:
+        lines = f.readlines()[1:]
+    
+    return nx.read_edgelist(lines, data=(('Rank', int), ('Direction', str)))
diff --git a/test/analysis/test_summary.py b/test/analysis/test_summary.py
index 57f1f6012..8618f0a2f 100644
--- a/test/analysis/test_summary.py
+++ b/test/analysis/test_summary.py
@@ -12,9 +12,9 @@
 # - 'NODEID' is required as the first column label in the node table
 # - file_paths must be an iterable, even if a single file path is passed
 
-INPUT_DIR = 'test/analysis/input/'
-OUT_DIR = 'test/analysis/output/'
-EXPECT_DIR = 'test/analysis/expected_output/'
+INPUT_DIR = Path('test', 'analysis', 'input')
+OUT_DIR = Path('test', 'analysis', 'output')
+EXPECT_DIR = Path('test', 'analysis', 'expected_output')
 
 
 class TestSummary:
@@ -35,14 +35,14 @@ def test_example_networks(self):
                         }
         example_dataset = Dataset(example_dict)
         example_node_table = example_dataset.node_table
-        config.init_from_file(INPUT_DIR + "config.yaml")
+        config.init_from_file(INPUT_DIR / "config.yaml")
         algorithm_params = config.config.algorithm_params
         algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in
                                   algorithm_params.items() for params_hash in param_combos.keys()]
 
-        example_network_files = Path(INPUT_DIR + "example").glob("*.txt")  # must be path to use .glob()
+        example_network_files = Path(INPUT_DIR, "example").glob("*.txt")
 
-        out_path = Path(OUT_DIR + "test_example_summary.txt")
+        out_path = Path(OUT_DIR, "test_example_summary.txt")
         out_path.unlink(missing_ok=True)
         summarize_example = summarize_networks(example_network_files, example_node_table, algorithm_params,
                                                algorithms_with_params)
@@ -51,7 +51,7 @@ def test_example_networks(self):
 
         # Comparing the dataframes directly with equals does not match because of how the parameter
         # combinations column is loaded from disk. Therefore, write both to disk and compare the files.
-        assert filecmp.cmp(out_path, EXPECT_DIR + "expected_example_summary.txt", shallow=False)
+        assert filecmp.cmp(out_path, EXPECT_DIR / "expected_example_summary.txt", shallow=False)
 
     def test_egfr_networks(self):
         """Test data from EGFR workflow"""
@@ -64,14 +64,14 @@ def test_egfr_networks(self):
 
         egfr_dataset = Dataset(egfr_dict)
         egfr_node_table = egfr_dataset.node_table
-        config.init_from_file(INPUT_DIR + "egfr.yaml")
+        config.init_from_file(INPUT_DIR / "egfr.yaml")
         algorithm_params = config.config.algorithm_params
         algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in
                                   algorithm_params.items() for params_hash in param_combos.keys()]
 
-        egfr_network_files = Path(INPUT_DIR + "egfr").glob("*.txt")  # must be path to use .glob()
+        egfr_network_files = Path(INPUT_DIR, "egfr").glob("*.txt")  # must be path to use .glob()
 
-        out_path = Path(OUT_DIR + "test_egfr_summary.txt")
+        out_path = Path(OUT_DIR, "test_egfr_summary.txt")
         out_path.unlink(missing_ok=True)
         summarize_egfr = summarize_networks(egfr_network_files, egfr_node_table, algorithm_params,
                                             algorithms_with_params)
@@ -80,7 +80,7 @@ def test_egfr_networks(self):
 
         # Comparing the dataframes directly with equals does not match because of how the parameter
         # combinations column is loaded from disk. Therefore, write both to disk and compare the files.
-        assert filecmp.cmp(out_path, EXPECT_DIR + "expected_egfr_summary.txt", shallow=False)
+        assert filecmp.cmp(out_path, EXPECT_DIR / "expected_egfr_summary.txt", shallow=False)
 
     def test_load_dataset_dict(self):
         """Test loading files from dataset_dict"""
@@ -95,7 +95,7 @@ def test_load_dataset_dict(self):
 
         # node_table contents are not generated consistently in the same order,
         # so we will check that the contents are the same, but row order doesn't matter
-        expected_node_table = pd.read_csv((EXPECT_DIR + "expected_node_table.txt"), sep="\t")
+        expected_node_table = pd.read_csv((EXPECT_DIR / "expected_node_table.txt"), sep="\t")
 
         # ignore 'NODEID' column because this changes each time upon new generation
         cols_to_compare = [col for col in example_node_table.columns if col != "NODEID"]

From fd5046f165f3ab29e6e154f29f4eab7316a0fb45 Mon Sep 17 00:00:00 2001
From: "Tristan F." <pub.tristanf@gmail.com>
Date: Tue, 13 Jan 2026 12:55:38 -0800
Subject: [PATCH 10/17] style: fmt

---
 spras/statistics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spras/statistics.py b/spras/statistics.py
index 5399da390..342f1a5e2 100644
--- a/spras/statistics.py
+++ b/spras/statistics.py
@@ -66,5 +66,5 @@ def compute_on_cc(directed_graph: nx.DiGraph) -> tuple[int, float]:
 def from_output_pathway(lines) -> nx.Graph:
     with open(lines, 'r') as f:
         lines = f.readlines()[1:]
-    
+
     return nx.read_edgelist(lines, data=(('Rank', int), ('Direction', str)))

From 79cf748b9efe78dff51e69963591ef267a3eb0c8 Mon Sep 17 00:00:00 2001
From: "Tristan F." <pub.tristanf@gmail.com>
Date: Tue, 13 Jan 2026 13:17:48 -0800
Subject: [PATCH 11/17] chore: mention statistics_files param

---
 spras/analysis/summary.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py
index 1f627493f..e5c0b1f73 100644
--- a/spras/analysis/summary.py
+++ b/spras/analysis/summary.py
@@ -18,6 +18,7 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
     @param algo_params: a nested dict mapping algorithm names to dicts that map parameter hashes to parameter
     combinations.
     @param algo_with_params: a list of <algorithm>-params-<params_hash> combinations
+    @param statistics_files: a list of statistic files with the computed statistics.
     @return: pandas DataFrame with summary information
     """
     # Ensure that NODEID is the first column

From 85e0ea8a020133186074b7663c83fa4c1a253a9b Mon Sep 17 00:00:00 2001
From: "Tristan F.-R." <pub.tristanf@gmail.com>
Date: Sat, 14 Feb 2026 01:09:44 +0000
Subject: [PATCH 12/17] docs: more info on summary & statistics

---
 Snakefile                 |  3 ++
 spras/analysis/summary.py | 66 ++-------------------------------------
 spras/statistics.py       | 10 ++++--
 3 files changed, 14 insertions(+), 65 deletions(-)

diff --git a/Snakefile b/Snakefile
index e4b829dee..e6d9204a5 100644
--- a/Snakefile
+++ b/Snakefile
@@ -310,9 +310,12 @@ rule viz_cytoscape:
     run:
         cytoscape.run_cytoscape(input.pathways, output.session, container_settings)
 
+# We generate new Snakemake rules for every statistic
+# to allow parallel and lazy computation of individual statistics
 for keys, values in statistics_computation.items():
     pythonic_name = 'generate_' + '_and_'.join([key.lower().replace(' ', '_') for key in keys])
     rule:
+        # (See https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#procedural-rule-definition)
         name: pythonic_name
         input: pathway_file = rules.parse_output.output.standardized_file
         output: [SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'statistics', f'{key}.txt']) for key in keys]
diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py
index 2907b0e34..d2059bb21 100644
--- a/spras/analysis/summary.py
+++ b/spras/analysis/summary.py
@@ -47,7 +47,8 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
         # Save the network name, number of nodes, number edges, and number of connected components
         nw_name = str(file_path)
 
-        # We use literal_eval here to easily coerce to either ints or floats, depending.
+        # We use ast.literal_eval here to convert statistic file outputs to ints or floats depending on their string representation.
+        # (e.g. "5.0" -> float(5.0), while "5" -> int(5).)
         graph_statistics = [ast.literal_eval(Path(file).read_text()) for file in statistics_files]
 
         # Initialize list to store current network information
@@ -89,65 +90,4 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
     return nw_info
 
 
-def degree(g):
-    return dict(g.degree)
-
-# TODO: redo .run code to work on mixed graphs
-# stats is just a list of functions to apply to the graph.
-# They should take as input a networkx graph or digraph but may have any output.
-# stats = [degree, nx.clustering, nx.betweenness_centrality]
-
-
-# def produce_statistics(g: nx.Graph, s=None) -> dict:
-#     global stats
-#     if s is not None:
-#         stats = s
-#     d = dict()
-#     for s in stats:
-#         sname = s.__name__
-#         d[sname] = s(g)
-#     return d
-
-
-# def load_graph(path: str) -> nx.Graph:
-#     g = nx.read_edgelist(path, data=(('weight', float), ('Direction',str)))
-#     return g
-
-
-# def save(data, pth):
-#     fout = open(pth, 'w')
-#     fout.write('#node\t%s\n' % '\t'.join([s.__name__ for s in stats]))
-#     for node in data[stats[0].__name__]:
-#         row = [data[s.__name__][node] for s in stats]
-#         fout.write('%s\t%s\n' % (node, '\t'.join([str(d) for d in row])))
-#     fout.close()
-
-
-# def run(infile: str, outfile: str) -> None:
-#     """
-#     run function that wraps above functions.
-#     """
-#     # if output directory doesn't exist, make it.
-#     outdir = os.path.dirname(outfile)
-#     if not os.path.exists(outdir):
-#         os.makedirs(outdir)
-
-#     # load graph, produce stats, and write to human-readable file.
-#     g = load_graph(infile)
-#     dat = produce_statistics(g)
-#     save(dat, outfile)
-
-
-# def main(argv):
-#     """
-#     for testing
-#     """
-#     g = load_graph(argv[1])
-#     print(g.nodes)
-#     dat = produce_statistics(g)
-#     print(dat)
-#     save(dat, argv[2])
-
-
-# if __name__ == '__main__':
-#     main(sys.argv)
+# TODO: redo the above code to work on mixed graphs
diff --git a/spras/statistics.py b/spras/statistics.py
index 342f1a5e2..9c510a151 100644
--- a/spras/statistics.py
+++ b/spras/statistics.py
@@ -4,6 +4,10 @@
 We allow for arbitrary computation of any specific statistic on some graph,
 computing more than necessary if we have dependencies. See the top level
 `statistics_computation` dictionary for usage.
+
+To make the statistics allow directed graph input, they will always take 
+in a networkx.DiGraph, which contains even more information, even though
+the underlying graph may be just as easily represented by networkx.Graph.
 """
 
 import itertools
@@ -25,6 +29,9 @@ def compute_degree(graph: nx.DiGraph) -> tuple[int, float]:
         return max(degrees), median(degrees)
 
 def compute_on_cc(directed_graph: nx.DiGraph) -> tuple[int, float]:
+    # We convert our directed_graph to an undirected graph as networkx (reasonably) does
+    # not allow for computing the connected components of a directed graph, but the connected
+    # component count still is a useful statistic for us.
     graph: nx.Graph = directed_graph.to_undirected()
     cc = list(nx.connected_components(graph))
     # Save the max diameter
@@ -49,13 +56,12 @@ def compute_on_cc(directed_graph: nx.DiGraph) -> tuple[int, float]:
 
     return max_diameter, avg_path_len
 
-# The type signature on here is quite bad. I would like to say that an n-tuple has n-outputs.
+# The type signature here is meant to be 'an n-tuple has n-outputs.'
 statistics_computation: dict[tuple[str, ...], Callable[[nx.DiGraph], tuple[float | int, ...]]] = {
     ('Number of nodes',): lambda graph : (graph.number_of_nodes(),),
     ('Number of edges',): lambda graph : (graph.number_of_edges(),),
     ('Number of connected components',): lambda graph : (nx.number_connected_components(graph.to_undirected()),),
     ('Density',): lambda graph : (nx.density(graph),),
-
     ('Max degree', 'Median degree'): compute_degree,
     ('Max diameter', 'Average path length'): compute_on_cc,
 }

From 804849a4c7800d1a62eb67b11d0ded2b996e1e1d Mon Sep 17 00:00:00 2001
From: "Tristan F.-R." <pub.tristanf@gmail.com>
Date: Sat, 14 Feb 2026 01:12:19 +0000
Subject: [PATCH 13/17] style: fmt

---
 spras/statistics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spras/statistics.py b/spras/statistics.py
index 9c510a151..251fecca2 100644
--- a/spras/statistics.py
+++ b/spras/statistics.py
@@ -5,7 +5,7 @@
 computing more than necessary if we have dependencies. See the top level
 `statistics_computation` dictionary for usage.
 
-To make the statistics allow directed graph input, they will always take 
+To make the statistics allow directed graph input, they will always take
 in a networkx.DiGraph, which contains even more information, even though
 the underlying graph may be just as easily represented by networkx.Graph.
 """

From ae61e5752302234e3cd1dc5e9a751bfb33856213 Mon Sep 17 00:00:00 2001
From: "Tristan F.-R." <pub.tristanf@gmail.com>
Date: Fri, 17 Apr 2026 19:27:21 +0000
Subject: [PATCH 14/17] Merge branch 'umain' into generate-all-inputs

---
 Snakefile                        | 2 +-
 spras/analysis/summary.py        | 6 ++----
 test/analysis/input/egfr.yaml    | 3 +++
 test/analysis/input/example.yaml | 3 +++
 test/analysis/test_summary.py    | 7 ++++---
 5 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/Snakefile b/Snakefile
index 166c8b2ff..ea935bec2 100644
--- a/Snakefile
+++ b/Snakefile
@@ -334,7 +334,7 @@ rule summary_table:
         # Collect all pathways generated for the dataset
         pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params),
         dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle']),
-        # Collect all possible options
+        # Collect all possible statistics into a dictionary
         statistics = expand(
             '{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}statistics{sep}{statistic}.txt',
             out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, statistic=statistics_options)
diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py
index 237f7c5ba..cac952403 100644
--- a/spras/analysis/summary.py
+++ b/spras/analysis/summary.py
@@ -1,5 +1,6 @@
 import ast
 import json
+import os
 from pathlib import Path
 from typing import Iterable
 
@@ -9,7 +10,7 @@
 
 
 def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict],
-                       algo_with_params: list[str], statistics_files: list) -> pd.DataFrame:
+                       algo_with_params: list[str], statistics_files: list[str | os.PathLike]) -> pd.DataFrame:
     """
     Generate a table that aggregates summary information about networks in file_paths, including which nodes are present
     in node_table columns. Network directionality is ignored and all edges are treated as undirected. The order of the
@@ -90,6 +91,3 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
     )
 
     return nw_info
-
-
-# TODO: redo the above code to work on mixed graphs
diff --git a/test/analysis/input/egfr.yaml b/test/analysis/input/egfr.yaml
index c9ed5f735..77c3bedf3 100644
--- a/test/analysis/input/egfr.yaml
+++ b/test/analysis/input/egfr.yaml
@@ -31,3 +31,6 @@ datasets:
 reconstruction_settings:
   locations:
     reconstruction_dir: "test/analysis/input/run/egfr"
+analysis:
+  summary:
+    include: true
diff --git a/test/analysis/input/example.yaml b/test/analysis/input/example.yaml
index 1a4514c00..15f4a69b4 100644
--- a/test/analysis/input/example.yaml
+++ b/test/analysis/input/example.yaml
@@ -48,3 +48,6 @@ gold_standards:
 reconstruction_settings:
   locations:
     reconstruction_dir: "test/analysis/input/run/example"
+analysis:
+  summary:
+    include: true
diff --git a/test/analysis/test_summary.py b/test/analysis/test_summary.py
index b548b8087..f6e940cf5 100644
--- a/test/analysis/test_summary.py
+++ b/test/analysis/test_summary.py
@@ -32,7 +32,7 @@ def snakemake_output(request):
     param = request.param
     subprocess.run(["snakemake", "--cores", "1", "--configfile", f"test/analysis/input/{param}.yaml"])
     yield param # this runs the test itself: once this is passed, we go to test cleanup.
-    shutil.rmtree(f"test/analysis/input/run/{param}")
+    # shutil.rmtree(f"test/analysis/input/run/{param}")
 
 class TestSummary:
     @classmethod
@@ -56,11 +56,12 @@ def test_example_networks(self, snakemake_output):
         algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in
                                   algorithm_params.items() for params_hash in param_combos.keys()]
 
-        example_network_files = (INPUT_DIR / "run" / snakemake_output).rglob("pathway.txt")
+        network_files = (INPUT_DIR / "run" / snakemake_output).rglob("pathway.txt")
+        statistics_files = (INPUT_DIR / "run" / snakemake_output).rglob("**/statistics/**")
 
         out_path = Path(OUT_DIR, f"test_{snakemake_output}_summary.txt")
         out_path.unlink(missing_ok=True)
-        summarize_out = summarize_networks(example_network_files, example_node_table, algorithm_params,
+        summarize_out = summarize_networks(network_files, example_node_table, algorithm_params,
                                                algorithms_with_params)
         # We do some post-processing to ensure that we get a stable summarize_out, since the attached hash
         # is subject to variation (especially in testing) whenever the SPRAS commit revision gets changed

From 4fe949d89d23501c3d45edab93800124ce656177 Mon Sep 17 00:00:00 2001
From: "Tristan F." <pub.tristanf@gmail.com>
Date: Sat, 25 Apr 2026 08:24:16 +0000
Subject: [PATCH 15/17] refactor: use dictionaries instead of a flat list

along with proper Snakemake procedural rule usage
---
 Snakefile                     | 26 +++++++++++++++++++-------
 spras/analysis/summary.py     | 24 ++++++++++++++++--------
 spras/statistics.py           |  5 ++---
 test/analysis/test_summary.py |  7 ++++---
 4 files changed, 41 insertions(+), 21 deletions(-)

diff --git a/Snakefile b/Snakefile
index ea935bec2..f294efd1d 100644
--- a/Snakefile
+++ b/Snakefile
@@ -315,34 +315,46 @@ rule viz_cytoscape:
 
 # We generate new Snakemake rules for every statistic
 # to allow parallel and lazy computation of individual statistics
-for keys, values in statistics_computation.items():
+for keys in statistics_computation.keys():
     pythonic_name = 'generate_' + '_and_'.join([key.lower().replace(' ', '_') for key in keys])
     rule:
         # (See https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#procedural-rule-definition)
         name: pythonic_name
         input: pathway_file = rules.parse_output.output.standardized_file
         output: [SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'statistics', f'{key}.txt']) for key in keys]
+        # It is very tempting to use `.items()` instead of `.keys()` above, but
+        # We instead need to pass keys in via parameters, else the job would use the latest values in the statistics_computation.
+        # More info is in the procedural rule link ab
+        params: statistics_names=keys
         run:
             (Path(input.pathway_file).parent / 'statistics').mkdir(exist_ok=True)
             graph = from_output_pathway(input.pathway_file)
-            for computed, output in zip(values(graph), output):
+            for computed, output in zip(statistics_computation[params.statistics_names](graph), output):
                 Path(output).write_text(str(computed))
 
+# We isolate this to a separate input function, as we want to preserve the dictionary structure
+def summary_files(wildcards):
+    return {
+        algorithm_param: expand(
+            '{out_dir}{sep}{dataset}-{algorithm_param}{sep}statistics{sep}{statistic}.txt',
+            out_dir=out_dir, sep=SEP, algorithm_param=algorithm_param, statistic=statistics_options,
+            dataset=wildcards.dataset
+        ) for algorithm_param in algorithms_with_params
+    }
+
 # Write a single summary table for all pathways for each dataset
 rule summary_table:
     input:
         # Collect all pathways generated for the dataset
         pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params),
         dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle']),
-        # Collect all possible statistics into a dictionary
-        statistics = expand(
-            '{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}statistics{sep}{statistic}.txt',
-            out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, statistic=statistics_options)
+        # Collect all possible statistics from the `summary_files` dictionary-based input function
+        statistics = lambda wildcards: flatten(list(summary_files(wildcards).values()))
     output: summary_table = SEP.join([out_dir, '{dataset}-pathway-summary.txt'])
     run:
         # Load the node table from the pickled dataset file
         node_table = Dataset.from_file(input.dataset_file).node_table
-        summary_df = summary.summarize_networks(input.pathways, node_table, algorithm_params, algorithms_with_params, input.statistics)
+        summary_df = summary.summarize_networks(input.pathways, node_table, algorithm_params, algorithms_with_params, summary_files(wildcards))
         summary_df.to_csv(output.summary_table, sep='\t', index=False)
 
 # Cluster the output pathways for each dataset
diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py
index cac952403..bdec9baca 100644
--- a/spras/analysis/summary.py
+++ b/spras/analysis/summary.py
@@ -1,16 +1,17 @@
 import ast
+import itertools
 import json
 import os
 from pathlib import Path
-from typing import Iterable
+from typing import Iterable, Mapping
 
 import pandas as pd
 
-from spras.statistics import from_output_pathway
+from spras.statistics import from_output_pathway, statistics_options
 
 
 def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params: dict[str, dict],
-                       algo_with_params: list[str], statistics_files: list[str | os.PathLike]) -> pd.DataFrame:
+                       algo_with_params: list[str], statistics_files: Mapping[str, Iterable[str | os.PathLike]]) -> pd.DataFrame:
     """
     Generate a table that aggregates summary information about networks in file_paths, including which nodes are present
     in node_table columns. Network directionality is ignored and all edges are treated as undirected. The order of the
@@ -20,7 +21,7 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
     @param algo_params: a nested dict mapping algorithm names to dicts that map parameter hashes to parameter
     combinations.
     @param algo_with_params: a list of <algorithm>-params-<params_hash> combinations
-    @param statistics_files: a list of statistic files with the computed statistics.
+    @param statistics_files: a dictionary from algo_with_params to lists of statistic files with the computed statistics.
     @return: pandas DataFrame with summary information
     """
     # Ensure that NODEID is the first column
@@ -51,7 +52,11 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
 
         # We use ast.literal_eval here to convert statistic file outputs to ints or floats depending on their string representation.
         # (e.g. "5.0" -> float(5.0), while "5" -> int(5).)
-        graph_statistics = [ast.literal_eval(Path(file).read_text()) for file in statistics_files]
+        graph_statistics = [
+            ast.literal_eval(Path(file).read_text()) for file in
+            # along with sorting to keep the output stable (this happens again)
+            sorted(statistics_files[algo_with_params[index]], key=lambda x: statistics_options.index(Path(x).stem))
+        ]
 
         # Initialize list to store current network information
         cur_nw_info = [nw_name, *graph_statistics]
@@ -76,10 +81,13 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
         # Save the current network information to the network summary list
         nw_info.append(cur_nw_info)
 
-    # Get the list of statistic names by their file names
-    statistics_options = [Path(file).stem for file in statistics_files]
+    # Get the list of statistic names by their file names (via finding all requested statistics in the provided files)
+    current_statistics_options = sorted(
+        set(Path(file).stem for file in itertools.chain(*statistics_files.values())),
+        key=lambda x: statistics_options.index(x)
+    )
     # Prepare column names
-    col_names = ['Name', *statistics_options]
+    col_names = ['Name', *current_statistics_options]
     col_names.extend(nodes_by_col_labs)
     col_names.append('Parameter combination')
 
diff --git a/spras/statistics.py b/spras/statistics.py
index 251fecca2..f303c8a8f 100644
--- a/spras/statistics.py
+++ b/spras/statistics.py
@@ -71,6 +71,5 @@ def compute_on_cc(directed_graph: nx.DiGraph) -> tuple[int, float]:
 
 def from_output_pathway(lines) -> nx.Graph:
     with open(lines, 'r') as f:
-        lines = f.readlines()[1:]
-
-    return nx.read_edgelist(lines, data=(('Rank', int), ('Direction', str)))
+        next(f) # skip the header line
+        return nx.read_edgelist(f, data=(('Rank', int), ('Direction', str)), delimiter='\t')
diff --git a/test/analysis/test_summary.py b/test/analysis/test_summary.py
index f6e940cf5..303970997 100644
--- a/test/analysis/test_summary.py
+++ b/test/analysis/test_summary.py
@@ -1,5 +1,4 @@
 import filecmp
-import shutil
 import subprocess
 from pathlib import Path
 
@@ -57,12 +56,14 @@ def test_example_networks(self, snakemake_output):
                                   algorithm_params.items() for params_hash in param_combos.keys()]
 
         network_files = (INPUT_DIR / "run" / snakemake_output).rglob("pathway.txt")
-        statistics_files = (INPUT_DIR / "run" / snakemake_output).rglob("**/statistics/**")
+        statistics_folders = [Path(file) for file in (INPUT_DIR / "run" / snakemake_output).rglob("**/statistics") if Path(file).name == "statistics"]
+        # We do some string fiddling here to make sure the folder matches up with algorithms_with_params. This is a good candidate for a refactor.
+        statistics_files = {"-".join(folder.parent.stem.split("-")[1:]): list(folder.glob("*.txt")) for folder in statistics_folders}
 
         out_path = Path(OUT_DIR, f"test_{snakemake_output}_summary.txt")
         out_path.unlink(missing_ok=True)
         summarize_out = summarize_networks(network_files, example_node_table, algorithm_params,
-                                               algorithms_with_params)
+                                               algorithms_with_params, statistics_files)
         # We do some post-processing to ensure that we get a stable summarize_out, since the attached hash
         # is subject to variation (especially in testing) whenever the SPRAS commit revision gets changed
         summarize_out["Parameter combination"] = summarize_out["Parameter combination"].astype(str)

From a86354ff2cda0ab3e18d22fa3f084ea18b2c1d49 Mon Sep 17 00:00:00 2001
From: "Tristan F." <pub.tristanf@gmail.com>
Date: Mon, 27 Apr 2026 01:04:43 -0700
Subject: [PATCH 16/17] docs: clarification

---
 Snakefile                 | 2 +-
 spras/analysis/summary.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index f294efd1d..40adaa1f4 100644
--- a/Snakefile
+++ b/Snakefile
@@ -324,7 +324,7 @@ for keys in statistics_computation.keys():
         output: [SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'statistics', f'{key}.txt']) for key in keys]
         # It is very tempting to use `.items()` instead of `.keys()` above, but
         # We instead need to pass keys in via parameters, else the job would use the latest values in the statistics_computation.
-        # More info is in the procedural rule link ab
+        # More info is in the procedural rule link above
         params: statistics_names=keys
         run:
             (Path(input.pathway_file).parent / 'statistics').mkdir(exist_ok=True)
diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py
index bdec9baca..9ede86479 100644
--- a/spras/analysis/summary.py
+++ b/spras/analysis/summary.py
@@ -54,7 +54,7 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, alg
         # (e.g. "5.0" -> float(5.0), while "5" -> int(5).)
         graph_statistics = [
             ast.literal_eval(Path(file).read_text()) for file in
-            # along with sorting to keep the output stable (this happens again)
+            # We sort by position in statistics_options to keep the output stable (the same sorting is applied once more later in this function)
             sorted(statistics_files[algo_with_params[index]], key=lambda x: statistics_options.index(Path(x).stem))
         ]
 

From 9053a5984201bb84c08c1f8dedb14aad74d8fbe9 Mon Sep 17 00:00:00 2001
From: "Tristan F." <pub.tristanf@gmail.com>
Date: Wed, 29 Apr 2026 21:02:40 +0000
Subject: [PATCH 17/17] refactor: apply suggestions

---
 spras/statistics.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/spras/statistics.py b/spras/statistics.py
index f303c8a8f..61e876705 100644
--- a/spras/statistics.py
+++ b/spras/statistics.py
@@ -28,10 +28,11 @@ def compute_degree(graph: nx.DiGraph) -> tuple[int, float]:
         degrees = [deg for _, deg in graph.degree()]
         return max(degrees), median(degrees)
 
-def compute_on_cc(directed_graph: nx.DiGraph) -> tuple[int, float]:
+def compute_max_diameter(directed_graph: nx.DiGraph) -> tuple[int,]:
     # We convert our directed_graph to an undirected graph as networkx (reasonably) does
     # not allow for computing the connected components of a directed graph, but the connected
     # component count still is a useful statistic for us.
+    # We do this a few more times throughout the file.
     graph: nx.Graph = directed_graph.to_undirected()
     cc = list(nx.connected_components(graph))
     # Save the max diameter
@@ -40,12 +41,15 @@ def compute_on_cc(directed_graph: nx.DiGraph) -> tuple[int, float]:
         nx.diameter(graph.subgraph(c).copy()) if len(c) > 1 else 0
         for c in cc
     ]
-    max_diameter = max(diameters, default=0)
+    return (max(diameters, default=0),)
 
+def compute_avg_path_lengths(directed_graph: nx.DiGraph) -> tuple[float,]:
+    graph: nx.Graph = directed_graph.to_undirected()
+    cc = list(nx.connected_components(graph))
     # Save the average path lengths
     # Compute average shortest path length only for components with ≥2 nodes (undefined for singletons, set to 0.0)
     avg_path_lengths = [
-        nx.average_shortest_path_length(graph.subgraph(c).copy()) if len(c) > 1 else 0.0
+        nx.average_shortest_path_length(directed_graph.subgraph(c).copy()) if len(c) > 1 else 0.0
         for c in cc
     ]
 
@@ -54,16 +58,17 @@ def compute_on_cc(directed_graph: nx.DiGraph) -> tuple[int, float]:
     else:
         avg_path_len = 0.0
 
-    return max_diameter, avg_path_len
+    return (avg_path_len,)
 
-# The type signature here is meant to be 'an n-tuple has n-outputs.'
+# The intended contract (not expressible in the type signature): a key tuple of n statistic names maps to a function returning an n-tuple of values.
 statistics_computation: dict[tuple[str, ...], Callable[[nx.DiGraph], tuple[float | int, ...]]] = {
     ('Number of nodes',): lambda graph : (graph.number_of_nodes(),),
     ('Number of edges',): lambda graph : (graph.number_of_edges(),),
     ('Number of connected components',): lambda graph : (nx.number_connected_components(graph.to_undirected()),),
     ('Density',): lambda graph : (nx.density(graph),),
     ('Max degree', 'Median degree'): compute_degree,
-    ('Max diameter', 'Average path length'): compute_on_cc,
+    ('Max diameter',): compute_max_diameter,
+    ('Average path length',): compute_avg_path_lengths,
 }
 
 # All of the keys inside statistics_computation, flattened.