Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,7 @@ Specially useful in pre-processing steps, these are the general data manipulatio
| InfereKNNConnectivity | Generates the k-nearest neighbor connectivity of the input point cloud. |
| IdentityTransform | An identity transform that does nothing to the input data. |
| EqualGausFeatures | Generates equal Gaussian features for all nodes. |
| FeatureDimensionalityReduction | Reduces feature dimensionality through SVD. |
| CalculateSimplicialCurvature | Calculates the simplicial curvature of the input graph. |
| LapPE | Computes Laplacian eigenvectors positional encodings. |
| RWSE | Computes Random Walk structural encodings. |
Expand All @@ -373,6 +374,7 @@ Specially useful in pre-processing steps, these are the general data manipulatio
| IMDB-BIN | Classification | Graph-level classification. | [Source](https://dl.acm.org/doi/10.1145/2783258.2783417) |
| IMDB-MUL | Classification | Graph-level classification. | [Source](https://dl.acm.org/doi/10.1145/2783258.2783417) |
| REDDIT | Classification | Graph-level classification. | [Source](https://proceedings.neurips.cc/paper_files/paper/2017/file/5dd9db5e033da9c6fb5ba83c7a7ebea9-Paper.pdf) |
| Deezer Europe | Classification | Node-level classification. | [Source](https://arxiv.org/abs/1909.13021) |
| Amazon | Classification | Heterophilic dataset. | [Source](https://arxiv.org/pdf/1205.6233) |
| Minesweeper | Classification | Heterophilic dataset. | [Source](https://arxiv.org/pdf/2302.11640) |
| Empire | Classification | Heterophilic dataset. | [Source](https://arxiv.org/pdf/2302.11640) |
Expand Down
34 changes: 34 additions & 0 deletions configs/dataset/graph/musae_deezer_europe.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Dataset loader config
loader:
_target_: topobench.data.loaders.MusaeDeezerEuropeDatasetLoader
parameters:
data_domain: graph
data_type: MUSAE
data_name: musae_deezer_europe
data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type}

# Dataset parameters
parameters:
# Dataset parameters
num_features: 128 # Feature dimension after SVD reduction (128, as in Rozemberczki & Sarkar 2020); raw dimension without reduction: 30978
num_classes: 2
num_nodes: 28281
task: classification
loss_type: cross_entropy
monitor_metric: accuracy
task_level: node

#splits
split_params:
learning_setting: transductive
data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}
data_seed: 0
split_type: random # either "k-fold" or "random" splitting strategy
k: 10 # for "k-fold" Cross-Validation
train_prop: 0.5 # for "random" strategy splitting

# Dataloader parameters
dataloader_params:
batch_size: 1 # Fixed
num_workers: 1
pin_memory: False
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
transform_name: "FeatureDimensionalityReduction"
transform_type: "data manipulation"

reduced_dim: ${dataset.parameters.num_features}
svd_iter: 20
svd_seed: ${seed}
3 changes: 3 additions & 0 deletions configs/transforms/dataset_defaults/musae_deezer_europe.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# MUSAE Deezer Europe dataset needs feature dimensionality reduction transform for TopoBench models
defaults:
- data_manipulations: feature_dimensionality_reduction
4 changes: 2 additions & 2 deletions test/pipeline/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from test._utils.simplified_pipeline import run


DATASET = "graph/MUTAG" # ADD YOUR DATASET HERE
MODELS = ["graph/gcn", "cell/topotune", "simplicial/topotune"] # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE
DATASET = "graph/musae_deezer_europe" # ADD YOUR DATASET HERE
MODELS = ["graph/gcn"] # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE


class TestPipeline:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
"""Test FeatureDimensionalityReduction transform."""

import pytest
import torch
from torch_geometric.data import Data
from topobench.transforms.data_manipulations import FeatureDimensionalityReduction


class TestFeatureDimensionalityReduction:
    """Unit tests for the FeatureDimensionalityReduction transform.

    The transform wraps sklearn's TruncatedSVD to reduce sparse node
    features to a fixed dimensionality while leaving every other graph
    attribute untouched.
    """

    def setup_method(self):
        """Set up a fresh transform before each test method."""
        # Values mirror the defaults used in the dataset config
        # (reduced_dim comes from dataset.parameters.num_features there).
        self.reduced_dim = 3
        self.svd_iter = 20
        self.svd_seed = 42
        self.transform = FeatureDimensionalityReduction(
            reduced_dim=self.reduced_dim,
            svd_iter=self.svd_iter,
            svd_seed=self.svd_seed
        )

    @staticmethod
    def _make_sparse_data(num_nodes: int = 5, num_features: int = 30):
        """Create a simple Data object with sparse node features.

        Parameters
        ----------
        num_nodes : int
            Number of nodes in the graph.
        num_features : int
            Raw feature dimension before reduction.

        Returns
        -------
        torch_geometric.data.Data
            Graph with sparse ``x`` and a simple path-graph ``edge_index``.
        """
        torch.manual_seed(42)
        x_dense = torch.randn(num_nodes, num_features, dtype=torch.float32)
        x_sparse = x_dense.to_sparse()

        if num_nodes > 1:
            # Connect consecutive nodes into a path graph; deterministic
            # and sufficient for transforms that do not use topology.
            row = torch.arange(0, num_nodes - 1)
            col = torch.arange(1, num_nodes)
            edge_index = torch.stack([row, col], dim=0)
        else:
            edge_index = torch.zeros((2, 0), dtype=torch.long)

        data = Data(
            x=x_sparse,
            edge_index=edge_index,
            num_nodes=num_nodes,
        )
        return data

    def test_initialization(self):
        """Test initialization with different parameters."""
        assert self.transform.type == "feature_dim_reduction"
        assert self.transform.reduced_dim == self.reduced_dim
        assert self.transform.svd_iter == self.svd_iter
        assert self.transform.svd_seed == self.svd_seed

        # Check that the internal TruncatedSVD is configured correctly
        assert self.transform.svd.n_components == self.reduced_dim
        assert self.transform.svd.n_iter == self.svd_iter
        assert self.transform.svd.random_state == self.svd_seed

    def test_repr(self):
        """Test string representation of the transform."""
        repr_str = repr(self.transform)
        assert "FeatureDimensionalityReduction" in repr_str
        assert "feature_dim_reduction" in repr_str
        assert f"reduced_dim={self.reduced_dim}" in repr_str
        assert f"svd_iter={self.svd_iter}" in repr_str
        assert f"svd_seed={self.svd_seed}" in repr_str
        # __repr__ includes the underlying SVD object as `svd_red=...`
        assert "svd_red=" in repr_str

    def test_forward_basic(self):
        """Test basic forward pass on sparse node features."""
        num_nodes = 5
        num_features = 30
        data = self._make_sparse_data(
            num_nodes=num_nodes,
            num_features=num_features
        )

        transformed = self.transform(data)

        # Feature matrix should be dense and reduced to (num_nodes, reduced_dim)
        assert transformed.x.size() == (num_nodes, self.reduced_dim)
        assert transformed.x.dtype == torch.float32

        # Check other attributes are preserved
        assert transformed.num_nodes == data.num_nodes
        assert torch.equal(transformed.edge_index, data.edge_index)

    def test_forward_without_x_attribute(self):
        """Test transform on a graph without node features (x is None)."""
        data = Data(
            edge_index=torch.tensor([[0, 1], [1, 0]]),
            num_nodes=2,
        )

        transformed = self.transform(data)

        # With no features to reduce, the data must pass through unchanged.
        assert data.x is None
        assert transformed.x is None
        assert torch.equal(transformed.edge_index, data.edge_index)
        assert transformed.num_nodes == data.num_nodes

    def test_deterministic_with_same_seed(self):
        """Test deterministic behavior with the same seed."""
        data1 = self._make_sparse_data(
            num_nodes=6,
            num_features=7
        )
        data2 = data1.clone()

        t1 = FeatureDimensionalityReduction(
            reduced_dim=self.reduced_dim,
            svd_iter=self.svd_iter,
            svd_seed=self.svd_seed,
        )
        t2 = FeatureDimensionalityReduction(
            reduced_dim=self.reduced_dim,
            svd_iter=self.svd_iter,
            svd_seed=self.svd_seed,
        )
        result1 = t1(data1).x
        result2 = t2(data2).x

        assert result1.shape == result2.shape
        assert torch.allclose(result1, result2, atol=1e-6)

    def test_invalid_reduced_dim_raises(self):
        """reduced_dim > num_features should raise a ValueError from SVD."""
        num_nodes = 5
        num_features = 3
        data = self._make_sparse_data(
            num_nodes=num_nodes,
            num_features=num_features
        )

        bad_transform = FeatureDimensionalityReduction(
            reduced_dim=num_features + 1,
            svd_iter=self.svd_iter,
            svd_seed=self.svd_seed,
        )

        with pytest.raises(ValueError):
            _ = bad_transform(data)

    def test_attribute_preservation(self):
        """Test preservation of additional attributes besides x."""
        num_nodes = 4
        num_features = 5
        data = self._make_sparse_data(
            num_nodes=num_nodes,
            num_features=num_features
        )

        data.edge_attr = torch.randn(data.edge_index.size(1), 2)
        data.custom_attr = "test"

        transformed = self.transform(data)

        # x is changed, but other attributes should be preserved
        assert transformed.x.size() == (num_nodes, self.reduced_dim)
        assert torch.equal(transformed.edge_index, data.edge_index)
        assert torch.equal(transformed.edge_attr, data.edge_attr)
        assert transformed.custom_attr == data.custom_attr
Loading