diff --git a/README.md b/README.md index 7e01eb151..0fd936a53 100755 --- a/README.md +++ b/README.md @@ -351,6 +351,7 @@ Specially useful in pre-processing steps, these are the general data manipulatio | InfereKNNConnectivity | Generates the k-nearest neighbor connectivity of the input point cloud. | | IdentityTransform | An identity transform that does nothing to the input data. | | EqualGausFeatures | Generates equal Gaussian features for all nodes. | +| FeatureDimensionalityReduction | Reduces feature dimensionality through SVD. | | CalculateSimplicialCurvature | Calculates the simplicial curvature of the input graph. | | LapPE | Computes Laplacian eigenvectors positional encodings. | | RWSE | Computes Random Walk structural encodings. | @@ -373,6 +374,7 @@ Specially useful in pre-processing steps, these are the general data manipulatio | IMDB-BIN | Classification | Graph-level classification. | [Source](https://dl.acm.org/doi/10.1145/2783258.2783417) | | IMDB-MUL | Classification | Graph-level classification. | [Source](https://dl.acm.org/doi/10.1145/2783258.2783417) | | REDDIT | Classification | Graph-level classification. | [Source](https://proceedings.neurips.cc/paper_files/paper/2017/file/5dd9db5e033da9c6fb5ba83c7a7ebea9-Paper.pdf) | +| GitHub (MUSAE) | Classification | Node-level classification. | [Source](https://arxiv.org/abs/1909.13021) | | Amazon | Classification | Heterophilic dataset. | [Source](https://arxiv.org/pdf/1205.6233) | | Minesweeper | Classification | Heterophilic dataset. | [Source](https://arxiv.org/pdf/2302.11640) | | Empire | Classification | Heterophilic dataset. 
| [Source](https://arxiv.org/pdf/2302.11640) | diff --git a/configs/dataset/graph/musae_github.yaml b/configs/dataset/graph/musae_github.yaml new file mode 100644 index 000000000..b703ca759 --- /dev/null +++ b/configs/dataset/graph/musae_github.yaml @@ -0,0 +1,34 @@ +# Dataset loader config +loader: + _target_: topobench.data.loaders.MusaeGitHubDatasetLoader + parameters: + data_domain: graph + data_type: MUSAE + data_name: musae_github + data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} + +# Dataset parameters +parameters: + # Dataset parameters + num_features: 128 # Default feature dimension after reduction 128 (Rozemberczki & Sarkar 2020), without reduction: 4005 + num_classes: 2 + num_nodes: 37700 + task: classification + loss_type: cross_entropy + monitor_metric: accuracy + task_level: node + +#splits +split_params: + learning_setting: transductive + data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} + data_seed: 0 + split_type: random #'k-fold' # either "k-fold" or "random" strategies + k: 10 # for "k-fold" Cross-Validation + train_prop: 0.5 # for "random" strategy splitting + +# Dataloader parameters +dataloader_params: + batch_size: 1 # Fixed + num_workers: 1 + pin_memory: False diff --git a/configs/transforms/data_manipulations/feature_dimensionality_reduction.yaml b/configs/transforms/data_manipulations/feature_dimensionality_reduction.yaml new file mode 100644 index 000000000..80d88c863 --- /dev/null +++ b/configs/transforms/data_manipulations/feature_dimensionality_reduction.yaml @@ -0,0 +1,6 @@ +transform_name: "FeatureDimensionalityReduction" +transform_type: "data manipulation" + +reduced_dim: ${dataset.parameters.num_features} +svd_iter: 20 +svd_seed: ${seed} \ No newline at end of file diff --git a/configs/transforms/dataset_defaults/musae_github.yaml b/configs/transforms/dataset_defaults/musae_github.yaml new file mode 100644 index 000000000..ec36d9759 --- 
/dev/null +++ b/configs/transforms/dataset_defaults/musae_github.yaml @@ -0,0 +1,3 @@ +# MUSAE Github dataset needs feature dimensionality reduction transform for TopoBench models +defaults: + - data_manipulations: feature_dimensionality_reduction \ No newline at end of file diff --git a/test/pipeline/test_pipeline.py b/test/pipeline/test_pipeline.py index 785987159..1a0612364 100644 --- a/test/pipeline/test_pipeline.py +++ b/test/pipeline/test_pipeline.py @@ -4,8 +4,8 @@ from test._utils.simplified_pipeline import run -DATASET = "graph/MUTAG" # ADD YOUR DATASET HERE -MODELS = ["graph/gcn", "cell/topotune", "simplicial/topotune"] # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE +DATASET = "graph/musae_github" # ADD YOUR DATASET HERE +MODELS = ["graph/gcn"] # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE class TestPipeline: diff --git a/test/transforms/data_manipulations/test_FeatureDimensionalityReduction.py b/test/transforms/data_manipulations/test_FeatureDimensionalityReduction.py new file mode 100644 index 000000000..057025035 --- /dev/null +++ b/test/transforms/data_manipulations/test_FeatureDimensionalityReduction.py @@ -0,0 +1,164 @@ +"""Test FeatureDimensionalityReduction transform.""" + +import pytest +import torch +from torch_geometric.data import Data +from topobench.transforms.data_manipulations import FeatureDimensionalityReduction + + +class TestFeatureDimensionalityReduction: + """Test FeatureDimensionalityReduction transform.""" + + def setup_method(self): + """Set up test fixtures before each test method.""" + # Using the default values from config + self.reduced_dim = 3 # example value, would be from dataset.parameters.num_features + self.svd_iter = 20 + self.svd_seed = 42 + self.transform = FeatureDimensionalityReduction( + reduced_dim=self.reduced_dim, + svd_iter=self.svd_iter, + svd_seed=self.svd_seed + ) + + @staticmethod + def _make_sparse_data(num_nodes: int=5, num_features: int=30): + """Create a simple Data object with sparse node 
features.""" + torch.manual_seed(42) + x_dense = torch.randn(num_nodes, num_features, dtype=torch.float32) + x_sparse = x_dense.to_sparse() + + if num_nodes > 1: + #num_edges = 0.5 * num_nodes * (num_nodes - 1) + #edge_index = torch.randint(0, num_nodes, (2, num_edges)) + row = torch.arange(0, num_nodes - 1) + col = torch.arange(1, num_nodes) + edge_index = torch.stack([row, col], dim=0) + else: + edge_index = torch.zeros((2, 0), dtype=torch.long) + + data = Data( + x=x_sparse, + edge_index=edge_index, + num_nodes=num_nodes, + ) + return data + + def test_initialization(self): + """Test initialization with different parameters.""" + assert self.transform.type == "feature_dim_reduction" + assert self.transform.reduced_dim == self.reduced_dim + assert self.transform.svd_iter == self.svd_iter + assert self.transform.svd_seed == self.svd_seed + + # Check that the internal TruncatedSVD is configured correctly + assert self.transform.svd.n_components == self.reduced_dim + assert self.transform.svd.n_iter == self.svd_iter + assert self.transform.svd.random_state == self.svd_seed + + def test_repr(self): + """Test string representation of the transform.""" + repr_str = repr(self.transform) + assert "FeatureDimensionalityReduction" in repr_str + assert "feature_dim_reduction" in repr_str + assert f"reduced_dim={self.reduced_dim}" in repr_str + assert f"svd_iter={self.svd_iter}" in repr_str + assert f"svd_seed={self.svd_seed}" in repr_str + # __repr__ includes the underlying SVD object as `svd_red=...` + assert "svd_red=" in repr_str + + def test_forward_basic(self): + """Test basic forward pass on sparse node features.""" + num_nodes = 5 + num_features = 30 + data = self._make_sparse_data( + num_nodes=num_nodes, + num_features=num_features + ) + + transformed = self.transform(data) + + # Feature matrix should be dense and reduced to (num_nodes, reduced_dim) + assert transformed.x.size() == (num_nodes, self.reduced_dim) + assert transformed.x.dtype == torch.float32 + + # 
Check other attributes are preserved + assert transformed.num_nodes == data.num_nodes + assert torch.equal(transformed.edge_index, data.edge_index) + + def test_forward_without_x_attribute(self): + """Test transform on a graph without node features (x is None).""" + data = Data( + edge_index=torch.tensor([[0, 1], [1, 0]]), + num_nodes=2, + ) + + transformed = self.transform(data) + + # Check that x stays None and the other attributes are unchanged + assert data.x is None + assert transformed.x is None + assert torch.equal(transformed.edge_index, data.edge_index) + assert transformed.num_nodes == data.num_nodes + + def test_deterministic_with_same_seed(self): + """Test deterministic behavior with the same seed.""" + data1 = self._make_sparse_data( + num_nodes=6, + num_features=7 + ) + data2 = data1.clone() + + t1 = FeatureDimensionalityReduction( + reduced_dim=self.reduced_dim, + svd_iter=self.svd_iter, + svd_seed=self.svd_seed, + ) + t2 = FeatureDimensionalityReduction( + reduced_dim=self.reduced_dim, + svd_iter=self.svd_iter, + svd_seed=self.svd_seed, + ) + result1 = t1(data1).x + result2 = t2(data2).x + + assert result1.shape == result2.shape + assert torch.allclose(result1, result2, atol=1e-6) + + def test_invalid_reduced_dim_raises(self): + """reduced_dim > num_features should raise a ValueError from SVD.""" + num_nodes = 5 + num_features = 3 + data = self._make_sparse_data( + num_nodes=num_nodes, + num_features=num_features + ) + + bad_transform = FeatureDimensionalityReduction( + reduced_dim=num_features + 1, + svd_iter=self.svd_iter, + svd_seed=self.svd_seed, + ) + + with pytest.raises(ValueError): + _ = bad_transform(data) + + def test_attribute_preservation(self): + """Test preservation of additional attributes besides x.""" + num_nodes = 4 + num_features = 5 + data = self._make_sparse_data( + num_nodes=num_nodes, + num_features=num_features + ) + + data.edge_attr = torch.randn(data.edge_index.size(1), 2) + data.custom_attr = "test" + + transformed = self.transform(data) + + # x is
changed, but other attributes should be preserved + assert transformed.x.size() == (num_nodes, self.reduced_dim) + assert torch.equal(transformed.edge_index, data.edge_index) + assert torch.equal(transformed.edge_attr, data.edge_attr) + assert transformed.custom_attr == data.custom_attr \ No newline at end of file diff --git a/topobench/data/datasets/musae_github_dataset.py b/topobench/data/datasets/musae_github_dataset.py new file mode 100644 index 000000000..946765512 --- /dev/null +++ b/topobench/data/datasets/musae_github_dataset.py @@ -0,0 +1,203 @@ +"""Dataset class for MUSAE GitHub dataset.""" + +import json +import os +import os.path as osp +import shutil +from typing import ClassVar + +import numpy as np +import pandas as pd +import torch +from omegaconf import DictConfig +from torch_geometric.data import Data, InMemoryDataset, extract_zip +from torch_geometric.io import fs + +from topobench.data.utils import ( + download_file_from_link, +) + + +class MusaeGitHubDataset(InMemoryDataset): + r"""Dataset class for MUSAE GitHub dataset. + + Parameters + ---------- + root : str + Root directory where the dataset will be saved. + name : str + Name of the dataset. + parameters : DictConfig + Configuration parameters for the dataset. + + Attributes + ---------- + URLS (dict): Dictionary containing the URLs for downloading the dataset. + FILE_FORMAT (dict): Dictionary containing the file formats for the dataset. + RAW_FILE_NAMES (dict): Dictionary containing the raw file names for the dataset. 
+ """ + + URLS: ClassVar = { + "musae_github": "https://snap.stanford.edu/data/git_web_ml.zip", + } + + FILE_FORMAT: ClassVar = { + "musae_github": "zip", + } + + RAW_FILE_NAMES: ClassVar = {} + + def __init__( + self, + root: str, + name: str, + parameters: DictConfig, + ) -> None: + self.name = name + self.raw_name = "git_web_ml" + self.parameters = parameters + super().__init__( + root, + ) + + out = fs.torch_load(self.processed_paths[0]) + assert len(out) == 3 or len(out) == 4 + + if len(out) == 3: # Backward compatibility. + data, self.slices, self.sizes = out + data_cls = Data + else: + data, self.slices, self.sizes, data_cls = out + + if not isinstance(data, dict): # Backward compatibility. + self.data = data + else: + self.data = data_cls.from_dict(data) + + assert isinstance(self._data, Data) + + def __repr__(self) -> str: + return f"{self.name}(self.root={self.root}, self.name={self.name}, " \ + f"self.parameters={self.parameters}, self.force_reload={self.force_reload})" + + @property + def raw_dir(self) -> str: + """Return the path to the raw directory of the dataset. + + Returns + ------- + str + Path to the raw directory. + """ + return osp.join(self.root, self.name, "raw") + + @property + def processed_dir(self) -> str: + """Return the path to the processed directory of the dataset. + + Returns + ------- + str + Path to the processed directory. + """ + + return osp.join(self.root, self.name, "processed") + + @property + def raw_file_names(self) -> list[str]: + """Return the raw file names for the dataset. + + Returns + ------- + list[str] + List of raw file names. + """ + return ["musae_git_edges.csv", "musae_git_features.json", "musae_git_target.csv"] + + @property + def processed_file_names(self) -> str: + """Return the processed file name for the dataset. + + Returns + ------- + str + Processed file name. + """ + return "data.pt" + + def download(self) -> None: + r"""Download the dataset from a URL and saves it to the raw directory. 
+ + Raises: + FileNotFoundError: If the dataset URL is not found. + """ + # Download data from the source + self.url = self.URLS[self.name] + self.file_format = self.FILE_FORMAT[self.name] + download_file_from_link( + file_link=self.url, + path_to_save=self.raw_dir, + dataset_name=self.raw_name, + file_format=self.file_format, + ) + + # Extract zip file + folder = self.raw_dir + filename = f"{self.raw_name}.{self.file_format}" + path = osp.join(folder, filename) + extract_zip(path, folder) + # Delete zip file + os.unlink(path) + + # Move files from osp.join(folder, self.raw_name) to folder + for file in os.listdir(osp.join(folder, self.raw_name)): + shutil.move(osp.join(folder, self.raw_name, file), folder) + # Delete the now-empty osp.join(folder, self.raw_name) dir + shutil.rmtree(osp.join(folder, self.raw_name)) + + def process(self) -> None: + r"""Handle the data for the dataset. + + This method loads the MUSAE GitHub data, applies any pre- + processing transformations if specified, and saves the processed data + to the appropriate location.
+ """ + # Step 1: Load raw data files + folder = self.raw_dir + # Edges: + tmp = pd.read_csv(osp.join(folder, "musae_git_edges.csv"))[["id_1","id_2"]].to_numpy() + edge_index = torch.tensor(tmp, dtype=torch.long).t().contiguous() + # Targets: + tmp = pd.read_csv(osp.join(folder,"musae_git_target.csv")).sort_values("id")["ml_target"].to_numpy() + y = torch.tensor(tmp, dtype=torch.long) + # Node features: + with open(osp.join(folder,"musae_git_features.json")) as infile: + featdict = json.load(infile) + row = [] + col = [] + values = [] + for node_id_str, feature_list in featdict.items(): + node_id = int(node_id_str) + for feature_id in feature_list: + row.append(node_id) + col.append(int(feature_id)) + values.append(1) + row = np.array(row, dtype=int) + col = np.array(col, dtype=int) + values = np.array(values, dtype=int) + node_count = row.max() + 1 + feature_count = col.max() + 1 + shape = (node_count, feature_count) + x = torch.sparse_coo_tensor(np.stack([row, col], axis=0), values, shape) + data = Data(x=x, y=y, edge_index=edge_index) + data_list = [data] + + # Step 2: collate the graphs + self.data, self.slices = self.collate(data_list) + self._data_list = None # Reset cache. + + # Step 3: save processed data + fs.torch_save( + (self._data.to_dict(), self.slices, {}, self._data.__class__), + self.processed_paths[0], + ) diff --git a/topobench/data/loaders/graph/musae_github_dataset_loader.py b/topobench/data/loaders/graph/musae_github_dataset_loader.py new file mode 100644 index 000000000..841bb0127 --- /dev/null +++ b/topobench/data/loaders/graph/musae_github_dataset_loader.py @@ -0,0 +1,43 @@ +"""Loader for MUSAE GitHub dataset.""" + + +from omegaconf import DictConfig + +from topobench.data.datasets import MusaeGitHubDataset +from topobench.data.loaders.base import AbstractLoader + + +class MusaeGitHubDatasetLoader(AbstractLoader): + """Load MUSAE GitHub dataset. 
+ + Parameters + ---------- + parameters : DictConfig + Configuration parameters containing: + - data_dir: Root directory for data + - data_name: Name of the dataset + """ + + def __init__(self, parameters: DictConfig) -> None: + super().__init__(parameters) + + def load_dataset(self) -> MusaeGitHubDataset: + """Load MUSAE GitHub dataset. + + Returns + ------- + Dataset + The loaded MUSAE GitHub dataset. + + Raises + ------ + RuntimeError + If dataset loading fails. + """ + + dataset = MusaeGitHubDataset( + root=str(self.root_data_dir), + name=self.parameters.data_name, + parameters=self.parameters, + ) + return dataset diff --git a/topobench/transforms/data_manipulations/feature_dimensionality_reduction.py b/topobench/transforms/data_manipulations/feature_dimensionality_reduction.py new file mode 100644 index 000000000..3cd894226 --- /dev/null +++ b/topobench/transforms/data_manipulations/feature_dimensionality_reduction.py @@ -0,0 +1,63 @@ +"""This module contains a transform that reduces feature dimensionality for the input graph.""" + +import numpy as np +import torch +import torch_geometric +from scipy.sparse import coo_matrix +from sklearn.decomposition import TruncatedSVD + + +class FeatureDimensionalityReduction(torch_geometric.transforms.BaseTransform): + r"""A transform that reduces dimensionality of node features. + + Parameters + ---------- + reduced_dim : int + The retained number of components after dimensionality reduction. + svd_iter : int + The number of iterations for randomized SVD. + svd_seed : int + The random seed passed to the SVD solver as ``random_state``. + **kwargs : optional + Additional arguments for the class.
+ """ + def __init__( + self, + reduced_dim: int, + svd_iter: int, + svd_seed: int, + **kwargs, + ) -> None: + super().__init__() + self.type = "feature_dim_reduction" + self.reduced_dim = reduced_dim + self.svd_iter = svd_iter + self.svd_seed = svd_seed + self.svd = TruncatedSVD( + n_components = self.reduced_dim, + n_iter = self.svd_iter, + random_state = self.svd_seed, + ) + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(type={self.type!r}, reduced_dim={self.reduced_dim}, svd_iter={self.svd_iter}, svd_seed={self.svd_seed}), svd_red={self.svd}" + + def forward(self, data: torch_geometric.data.Data): + r"""Apply the transform to the input data. + + Parameters + ---------- + data : torch_geometric.data.Data + The input data. + + Returns + ------- + torch_geometric.data.Data + The transformed data. + """ + if not hasattr(data, "x") or data.x is None: + return data + x_sparse = coo_matrix((data.x.coalesce().values().numpy(), data.x.coalesce().indices().numpy()), tuple(data.x.shape)) + x = self.svd.fit_transform(x_sparse).astype(np.float32) + data.x = torch.from_numpy(x) + return data