diff --git a/.github/workflows/label.yaml b/.github/workflows/label.yaml index ccb7e65..efe19b5 100644 --- a/.github/workflows/label.yaml +++ b/.github/workflows/label.yaml @@ -14,5 +14,5 @@ jobs: - uses: actions/labeler@v5 with: repo-token: ${{ secrets.GITHUB_TOKEN }} - configuration-path: .github/labeler.yml + configuration-path: .github/labeler.yaml sync-labels: true diff --git a/Makefile b/Makefile index e1b5c64..370d1e2 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: all setup check lint typecheck test docs docs-serve clean help +.PHONY: all setup check lint typecheck test stest docs docs-serve clean help UV=uv UVX=uvx @@ -29,6 +29,10 @@ test: @echo '=== Tests ===' $(UV) run $(PYTEST) --cov=hyperbench --cov-report=term-missing +stest: + @echo '=== Test for $(FILE) ===' + $(UV) run $(PYTEST) hyperbench/tests/$(FILE) -v -s + docs: @echo '=== Building docs ===' $(UV) run mkdocs build -f $(MKDOCS_CONFIG) @@ -49,6 +53,7 @@ help: @echo " lint - Run linter" @echo " typecheck - Run type checker" @echo " test - Run tests" + @echo " stest - Run single test" @echo " check - Run lint and typecheck" @echo " docs - Build documentation" @echo " docs-serve - Serve docs locally at http://127.0.0.1:8000" diff --git a/hyperbench/data/dataset.py b/hyperbench/data/dataset.py index 24b04eb..1346696 100644 --- a/hyperbench/data/dataset.py +++ b/hyperbench/data/dataset.py @@ -469,3 +469,11 @@ def __process_x(self, num_nodes: int) -> Tensor: x = torch.ones((num_nodes, 1), dtype=torch.float) return x # shape [num_nodes, num_node_features] + + def stats(self) -> Dict[str, Any]: + """ + Compute statistics for the dataset. + This method currently delegates to the underlying HData's stats method. 
+ """ + + return self.hdata.stats() diff --git a/hyperbench/tests/data/dataset_test.py b/hyperbench/tests/data/dataset_test.py index ca6e62a..75ee8d7 100644 --- a/hyperbench/tests/data/dataset_test.py +++ b/hyperbench/tests/data/dataset_test.py @@ -1022,3 +1022,47 @@ def test_from_hdata_with_explicit_strategy(mock_hdata): assert dataset.sampling_strategy == SamplingStrategy.NODE assert len(dataset) == 3 # mock_hdata has 3 nodes + + +@pytest.fixture +def mock_hdata_stats(): + x = torch.tensor( + [ + [0.0, 1.0, 2.0, 3.0], + [1.0, 2.0, 3.0, 4.0], + [2.0, 3.0, 4.0, 5.0], + [3.0, 4.0, 5.0, 6.0], + ], + dtype=torch.float, + ) + hyperedge_index = torch.tensor( + [ + [0, 1, 2, 2, 3], + [0, 0, 0, 1, 1], + ] + ) + return HData(x=x, hyperedge_index=hyperedge_index) + + +def test_dataset_stats_computation(mock_hdata_stats): + expected_stats = { + "shape_x": torch.Size([4, 4]), + "shape_hyperedge_attr": None, + "num_nodes": 4, + "num_hyperedges": 2, + "avg_degree_node": 1.25, + "avg_degree_hyperedge": 2.5, + "node_degree_max": 2, + "hyperedge_degree_max": 3, + "node_degree_median": 1, + "hyperedge_degree_median": 2, + "distribution_node_degree": [1, 1, 2, 1], + "distribution_hyperedge_size": [3, 2], + "distribution_node_degree_hist": {1: 3, 2: 1}, + "distribution_hyperedge_size_hist": {2: 1, 3: 1}, + } + + dataset = Dataset.from_hdata(mock_hdata_stats) + + stats = dataset.stats() + assert stats == expected_stats diff --git a/hyperbench/tests/mock/hif_stats.hif.json b/hyperbench/tests/mock/hif_stats.hif.json new file mode 100644 index 0000000..38f159f --- /dev/null +++ b/hyperbench/tests/mock/hif_stats.hif.json @@ -0,0 +1,54 @@ +{ + "edges": [ + { + "attrs": {}, + "edges": 0 + }, + { + "attrs": {}, + "edges": 1 + } + ], + "incidences": [ + { + "edge": 0, + "node": "0" + }, + { + "edge": 0, + "node": "1" + }, + { + "edge": 0, + "node": "2" + }, + { + "edge": 1, + "node": "2" + }, + { + "edge": 1, + "node": "3" + } + ], + "metadata": {}, + "network-type": "undirected", + "nodes": 
[ + { + "attrs": {}, + "nodes": "0" + }, + { + "attrs": {}, + "nodes": "1" + }, + { + "attrs": {}, + "nodes": "2" + }, + { + "attrs": {}, + "nodes": "3" + } + ] +} diff --git a/hyperbench/tests/types/hdata_test.py b/hyperbench/tests/types/hdata_test.py index 6741a2e..e815ded 100644 --- a/hyperbench/tests/types/hdata_test.py +++ b/hyperbench/tests/types/hdata_test.py @@ -21,6 +21,26 @@ def mock_hdata(): return HData(x=x, hyperedge_index=hyperedge_index, hyperedge_attr=hyperedge_attr) +@pytest.fixture +def mock_hdata_stats(): + x = torch.tensor( + [ + [0.0, 1.0, 2.0, 3.0], + [1.0, 2.0, 3.0, 4.0], + [2.0, 3.0, 4.0, 5.0], + [3.0, 4.0, 5.0, 6.0], + ], + dtype=torch.float, + ) + hyperedge_index = torch.tensor( + [ + [0, 1, 2, 2, 3], + [0, 0, 0, 1, 1], + ] + ) + return HData(x=x, hyperedge_index=hyperedge_index) + + @pytest.mark.parametrize( "explicit_num_nodes, expected_num_nodes", [ @@ -629,3 +649,51 @@ def test_shuffle_with_no_seed_set(mock_hdata): assert shuffled_hdata1.num_nodes == mock_hdata.num_nodes assert shuffled_hdata1.num_hyperedges == mock_hdata.num_hyperedges assert shuffled_hdata1.hyperedge_index.shape == mock_hdata.hyperedge_index.shape + + +def test_stats_returns_correct_statistics(mock_hdata_stats): + expected_stats = { + "shape_x": torch.Size([4, 4]), + "shape_hyperedge_attr": None, + "num_nodes": 4, + "num_hyperedges": 2, + "avg_degree_node": 1.25, + "avg_degree_hyperedge": 2.5, + "node_degree_max": 2, + "hyperedge_degree_max": 3, + "node_degree_median": 1, + "hyperedge_degree_median": 2, + "distribution_node_degree": [1, 1, 2, 1], + "distribution_hyperedge_size": [3, 2], + "distribution_node_degree_hist": {1: 3, 2: 1}, + "distribution_hyperedge_size_hist": {2: 1, 3: 1}, + } + + stats = mock_hdata_stats.stats() + + assert stats == expected_stats + + +def test_stats_with_empty_hdata(): + empty_hdata = HData.empty() + + expected_stats = { + "shape_x": torch.Size([0, 0]), + "shape_hyperedge_attr": None, + "num_nodes": 0, + "num_hyperedges": 0, + 
"avg_degree_node": 0, + "avg_degree_hyperedge": 0, + "node_degree_max": 0, + "hyperedge_degree_max": 0, + "node_degree_median": 0, + "hyperedge_degree_median": 0, + "distribution_node_degree": [], + "distribution_hyperedge_size": [], + "distribution_node_degree_hist": {}, + "distribution_hyperedge_size_hist": {}, + } + + stats = empty_hdata.stats() + + assert stats == expected_stats diff --git a/hyperbench/tests/types/hypergraph_test.py b/hyperbench/tests/types/hypergraph_test.py index cfabcc0..6fabb18 100644 --- a/hyperbench/tests/types/hypergraph_test.py +++ b/hyperbench/tests/types/hypergraph_test.py @@ -296,6 +296,97 @@ def test_neighbors_of_all(hyperedges, expected_neighbors_map): assert hypergraph.neighbors_of_all() == expected_neighbors_map +@pytest.mark.parametrize( + "hyperedges, expected_stats", + [ + pytest.param( + [], + { + "num_nodes": 0, + "num_hyperedges": 0, + "avg_degree_node": 0.0, + "avg_degree_hyperedge": 0.0, + "node_degree_max": 0, + "hyperedge_degree_max": 0, + "node_degree_median": 0.0, + "hyperedge_degree_median": 0.0, + "distribution_node_degree": [], + "distribution_hyperedge_size": [], + "distribution_node_degree_hist": {}, + "distribution_hyperedge_size_hist": {}, + }, + id="empty_hypergraph", + ), + pytest.param( + [[0, 1]], + { + "num_nodes": 2, + "num_hyperedges": 1, + "avg_degree_node": 1.0, + "avg_degree_hyperedge": 2.0, + "node_degree_max": 1, + "hyperedge_degree_max": 2, + "node_degree_median": 1.0, + "hyperedge_degree_median": 2.0, + "distribution_node_degree": [1, 1], + "distribution_hyperedge_size": [2], + "distribution_node_degree_hist": {1: 2}, + "distribution_hyperedge_size_hist": {2: 1}, + }, + id="single_hyperedge_two_nodes", + ), + pytest.param( + [[0, 1, 2], [2, 3]], + { + "num_nodes": 4, + "num_hyperedges": 2, + "avg_degree_node": 1.25, + "avg_degree_hyperedge": 2.5, + "node_degree_max": 2, + "hyperedge_degree_max": 3, + "node_degree_median": 1.0, + "hyperedge_degree_median": 2.5, + "distribution_node_degree": [1, 1, 
1, 2], + "distribution_hyperedge_size": [3, 2], + "distribution_node_degree_hist": {1: 3, 2: 1}, + "distribution_hyperedge_size_hist": {3: 1, 2: 1}, + }, + id="two_hyperedges_varying_sizes", + ), + ], +) +def test_hypergraph_stats_returns_correct_statistics(hyperedges, expected_stats): + hypergraph = Hypergraph(hyperedges) + stats = hypergraph.stats() + + assert stats == expected_stats + + +def test_hifhypergraph_stats_returns_correct_statistics(): + expected_stats = { + "num_nodes": 4, + "num_hyperedges": 2, + "avg_degree_node": 1.25, + "avg_degree_hyperedge": 2.5, + "node_degree_max": 2, + "hyperedge_degree_max": 3, + "node_degree_median": 1.0, + "hyperedge_degree_median": 2.5, + "distribution_node_degree": [1, 1, 1, 2], + "distribution_hyperedge_size": [2, 3], + "distribution_node_degree_hist": {1: 3, 2: 1}, + "distribution_hyperedge_size_hist": {3: 1, 2: 1}, + } + + with open(f"{MOCK_BASE_PATH}/hif_stats.hif.json", "r") as f: + hiftext = json.load(f) + + hypergraph = HIFHypergraph.from_hif(hiftext) + stats = hypergraph.stats() + + assert stats == expected_stats + + @pytest.mark.parametrize( "hyperedge_index_tensor, hyperedge_id, expected_nodes", [ diff --git a/hyperbench/types/hdata.py b/hyperbench/types/hdata.py index 335fbe0..9d2d8b2 100644 --- a/hyperbench/types/hdata.py +++ b/hyperbench/types/hdata.py @@ -1,7 +1,7 @@ import torch from torch import Tensor -from typing import Optional, Sequence +from typing import Optional, Sequence, Dict, Any from hyperbench.utils import empty_hyperedgeindex, empty_nodefeatures from .hypergraph import HyperedgeIndex @@ -379,3 +379,92 @@ def with_y_ones(self) -> "HData": def with_y_zeros(self) -> "HData": """Return a copy of this instance with a y attribute of all zeros.""" return self.with_y_to(0.0) + + def stats(self) -> Dict[str, Any]: + """ + Compute statistics for the hypergraph data. + The fields returned in the dictionary include: + - ``shape_x``: The shape of the node feature matrix ``x``. 
+ - ``shape_hyperedge_attr``: The shape of the hyperedge attribute matrix, or ``None`` if hyperedge attributes are not present. + - ``num_nodes``: The number of nodes in the hypergraph. + - ``num_hyperedges``: The number of hyperedges in the hypergraph. + - ``avg_degree_node``: The average degree of nodes, calculated as the mean number of hyperedges each node belongs to. + - ``avg_degree_hyperedge``: The average size of hyperedges, calculated as the mean number of nodes each hyperedge contains. + - ``node_degree_max``: The maximum degree of any node in the hypergraph. + - ``hyperedge_degree_max``: The maximum size of any hyperedge in the hypergraph. + - ``node_degree_median``: The median degree of nodes in the hypergraph. + - ``hyperedge_degree_median``: The median size of hyperedges in the hypergraph. + - ``distribution_node_degree``: A list where the value at index ``i`` is the degree of node ``i`` (the number of hyperedges node ``i`` belongs to). + - ``distribution_hyperedge_size``: A list where the value at index ``i`` is the size of hyperedge ``i`` (the number of nodes it contains). + - ``distribution_node_degree_hist``: A dictionary where the keys are node degrees and the values are the count of nodes with that degree. + - ``distribution_hyperedge_size_hist``: A dictionary where the keys are hyperedge sizes and the values are the count of hyperedges with that size. + + Returns: + A dictionary containing various statistics about the hypergraph. 
+ """ + + node_ids = self.hyperedge_index[0] + hyperedge_ids = self.hyperedge_index[1] + + # Degree of each node = number of hyperedges it belongs to + # Size of each hyperedge = number of nodes it contains + if node_ids.numel() > 0: + distribution_node_degree = torch.bincount(node_ids, minlength=self.num_nodes).float() + distribution_hyperedge_size = torch.bincount( + hyperedge_ids, minlength=self.num_hyperedges + ).float() + else: + distribution_node_degree = torch.zeros(self.num_nodes, dtype=torch.float) + distribution_hyperedge_size = torch.zeros(self.num_hyperedges, dtype=torch.float) + + num_nodes = self.num_nodes + num_hyperedges = self.num_hyperedges + + if distribution_node_degree.numel() > 0: + avg_degree_node = distribution_node_degree.mean().item() + avg_degree_hyperedge = distribution_hyperedge_size.mean().item() + node_degree_max = int(distribution_node_degree.max().item()) + hyperedge_degree_max = int(distribution_hyperedge_size.max().item()) + node_degree_median = int(distribution_node_degree.median().item()) + hyperedge_degree_median = int(distribution_hyperedge_size.median().item()) + else: + avg_degree_node = 0 + avg_degree_hyperedge = 0 + node_degree_max = 0 + hyperedge_degree_max = 0 + node_degree_median = 0 + hyperedge_degree_median = 0 + + # Histograms: index i holds count of nodes/hyperedges with degree/size i + distribution_node_degree_hist = torch.bincount(distribution_node_degree.long()) + distribution_hyperedge_size_hist = torch.bincount(distribution_hyperedge_size.long()) + + distribution_node_degree_hist = { + i: int(count.item()) + for i, count in enumerate(distribution_node_degree_hist) + if count.item() > 0 + } + distribution_hyperedge_size_hist = { + i: int(count.item()) + for i, count in enumerate(distribution_hyperedge_size_hist) + if count.item() > 0 + } + + return { + "shape_x": self.x.shape, + "shape_hyperedge_attr": self.hyperedge_attr.shape + if self.hyperedge_attr is not None + else None, + "num_nodes": num_nodes, + 
"num_hyperedges": num_hyperedges, + "avg_degree_node": avg_degree_node, + "avg_degree_hyperedge": avg_degree_hyperedge, + "node_degree_max": node_degree_max, + "hyperedge_degree_max": hyperedge_degree_max, + "node_degree_median": node_degree_median, + "hyperedge_degree_median": hyperedge_degree_median, + "distribution_node_degree": distribution_node_degree.int().tolist(), + "distribution_hyperedge_size": distribution_hyperedge_size.int().tolist(), + "distribution_node_degree_hist": distribution_node_degree_hist, + "distribution_hyperedge_size_hist": distribution_hyperedge_size_hist, + } diff --git a/hyperbench/types/hypergraph.py b/hyperbench/types/hypergraph.py index ce81047..b030cb8 100644 --- a/hyperbench/types/hypergraph.py +++ b/hyperbench/types/hypergraph.py @@ -75,6 +75,101 @@ def num_edges(self) -> int: """Return the number of edges in the hypergraph.""" return len(self.edges) + def stats(self) -> Dict[str, Any]: + """ + Compute statistics for the HIFHypergraph. + The fields returned in the dictionary include: + - ``num_nodes``: The number of nodes in the hypergraph. + - ``num_hyperedges``: The number of hyperedges in the hypergraph. + - ``avg_degree_node``: The average degree of nodes, calculated as the mean number of hyperedges each node belongs to. + - ``avg_degree_hyperedge``: The average size of hyperedges, calculated as the mean number of nodes each hyperedge contains. + - ``node_degree_max``: The maximum degree of any node in the hypergraph. + - ``hyperedge_degree_max``: The maximum size of any hyperedge in the hypergraph. + - ``node_degree_median``: The median degree of nodes in the hypergraph. + - ``hyperedge_degree_median``: The median size of hyperedges in the hypergraph. + - ``distribution_node_degree``: A sorted list of the degrees of all nodes in the hypergraph. + - ``distribution_hyperedge_size``: A sorted list of the sizes of all hyperedges in the hypergraph. 
+ - ``distribution_node_degree_hist``: A dictionary where the keys are node degrees and the values are the count of nodes with that degree. + - ``distribution_hyperedge_size_hist``: A dictionary where the keys are hyperedge sizes and the values are the count of hyperedges with that size. + + Returns: + A dictionary containing various statistics about the hypergraph. + """ + + node_degree: Dict[Any, int] = {} + hyperedge_size: Dict[Any, int] = {} + + for incidence in self.incidences: + node_id = incidence.get("node") + edge_id = incidence.get("edge") + node_degree[node_id] = node_degree.get(node_id, 0) + 1 + hyperedge_size[edge_id] = hyperedge_size.get(edge_id, 0) + 1 + + num_nodes = len(self.nodes) + num_hyperedges = len(self.edges) + total_incidences = len(self.incidences) + + distribution_node_degree: List[int] = sorted(node_degree.values()) + distribution_hyperedge_size: List[int] = sorted(hyperedge_size.values()) + + avg_degree_node = total_incidences / num_nodes if num_nodes else 0 + avg_degree_hyperedge = total_incidences / num_hyperedges if num_hyperedges else 0 + + node_degree_max = max(distribution_node_degree) if distribution_node_degree else 0 + hyperedge_degree_max = ( + max(distribution_hyperedge_size) if distribution_hyperedge_size else 0 + ) + + n_n = len(distribution_node_degree) + node_degree_median = ( + ( + distribution_node_degree[n_n // 2] + if n_n % 2 + else (distribution_node_degree[n_n // 2 - 1] + distribution_node_degree[n_n // 2]) + / 2 + ) + if n_n + else 0 + ) + + n_e = len(distribution_hyperedge_size) + hyperedge_degree_median = ( + ( + distribution_hyperedge_size[n_e // 2] + if n_e % 2 + else ( + distribution_hyperedge_size[n_e // 2 - 1] + + distribution_hyperedge_size[n_e // 2] + ) + / 2 + ) + if n_e + else 0 + ) + + distribution_node_degree_hist: Dict[int, int] = {} + for d in distribution_node_degree: + distribution_node_degree_hist[d] = distribution_node_degree_hist.get(d, 0) + 1 + + distribution_hyperedge_size_hist: Dict[int, int] 
= {} + for s in distribution_hyperedge_size: + distribution_hyperedge_size_hist[s] = distribution_hyperedge_size_hist.get(s, 0) + 1 + + return { + "num_nodes": num_nodes, + "num_hyperedges": num_hyperedges, + "avg_degree_node": avg_degree_node, + "avg_degree_hyperedge": avg_degree_hyperedge, + "node_degree_max": node_degree_max, + "hyperedge_degree_max": hyperedge_degree_max, + "node_degree_median": node_degree_median, + "hyperedge_degree_median": hyperedge_degree_median, + "distribution_node_degree": distribution_node_degree, + "distribution_hyperedge_size": distribution_hyperedge_size, + "distribution_node_degree_hist": distribution_node_degree_hist, + "distribution_hyperedge_size_hist": distribution_hyperedge_size_hist, + } + class Hypergraph: """ @@ -138,6 +233,79 @@ def neighbors_of_all(self) -> Dict[int, Neighborhood]: return node_to_neighbors + def stats(self) -> Dict[str, Any]: + """Return basic statistics about the hypergraph.""" + node_degree: Dict[int, int] = {} + distribution_hyperedge_size: List[int] = [] + total_incidences = 0 + + for hyperedge in self.hyperedges: + size = len(hyperedge) + distribution_hyperedge_size.append(size) + total_incidences += size + for node in hyperedge: + node_degree[node] = node_degree.get(node, 0) + 1 + + num_nodes = len(node_degree) + num_hyperedges = len(self.hyperedges) + distribution_node_degree: List[int] = sorted(node_degree.values()) + + avg_degree_hyperedge = total_incidences / num_hyperedges if num_hyperedges else 0 + total_incidences_nodes = sum(distribution_node_degree) + avg_degree_node = total_incidences_nodes / num_nodes if num_nodes else 0 + + hyperedge_degree_max = ( + max(distribution_hyperedge_size) if distribution_hyperedge_size else 0 + ) + node_degree_max = max(distribution_node_degree) if distribution_node_degree else 0 + + sorted_hyperedge_sizes = sorted(distribution_hyperedge_size) + n_e = len(sorted_hyperedge_sizes) + hyperedge_degree_median = ( + ( + sorted_hyperedge_sizes[n_e // 2] + if n_e % 2 
+ else (sorted_hyperedge_sizes[n_e // 2 - 1] + sorted_hyperedge_sizes[n_e // 2]) / 2 + ) + if n_e + else 0 + ) + + n_n = len(distribution_node_degree) + node_degree_median = ( + ( + distribution_node_degree[n_n // 2] + if n_n % 2 + else (distribution_node_degree[n_n // 2 - 1] + distribution_node_degree[n_n // 2]) + / 2 + ) + if n_n + else 0 + ) + + distribution_hyperedge_size_hist: Dict[int, int] = {} + for s in distribution_hyperedge_size: + distribution_hyperedge_size_hist[s] = distribution_hyperedge_size_hist.get(s, 0) + 1 + + distribution_node_degree_hist: Dict[int, int] = {} + for d in distribution_node_degree: + distribution_node_degree_hist[d] = distribution_node_degree_hist.get(d, 0) + 1 + + return { + "num_nodes": num_nodes, + "num_hyperedges": num_hyperedges, + "avg_degree_node": avg_degree_node, + "avg_degree_hyperedge": avg_degree_hyperedge, + "node_degree_max": node_degree_max, + "hyperedge_degree_max": hyperedge_degree_max, + "node_degree_median": node_degree_median, + "hyperedge_degree_median": hyperedge_degree_median, + "distribution_node_degree": distribution_node_degree, + "distribution_hyperedge_size": distribution_hyperedge_size, + "distribution_node_degree_hist": distribution_node_degree_hist, + "distribution_hyperedge_size_hist": distribution_hyperedge_size_hist, + } + @classmethod def from_hyperedge_index(cls, hyperedge_index: Tensor) -> "Hypergraph": """