Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions configs/dataset/hypergraph/chordonomicon.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Dataset loader config for the Chordonomicon hypergraph dataset.
loader:
  _target_: topobench.data.loaders.ChordonomiconDatasetLoader
  parameters:
    data_domain: hypergraph
    data_type: chords
    data_name: chordonomicon
    data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type}
    version: all_scales # options: ['single_scale', 'all_scales']

# Dataset parameters
parameters:
  num_features: 1
  num_classes: 1
  num_edge_features: 1
  num_node_features_single_scale: 12
  num_node_features_all_scales: 38
  task: regression
  loss_type: mse
  monitor_metric: mae
  task_level: edge

# Split parameters
split_params:
  learning_setting: transductive
  data_seed: 0
  split_type: random # either "k-fold" or "random" strategy
  k: 10 # number of folds for "k-fold" cross-validation
  train_prop: 0.9 # train proportion for the "random" strategy
  standardize: False
  data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}/${dataset.loader.parameters.version}

# Dataloader parameters
dataloader_params:
  batch_size: 1 # Fixed since transductive
  num_workers: 0
  pin_memory: False
150 changes: 129 additions & 21 deletions test/pipeline/test_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
"""Test pipeline for a particular dataset and model."""

import hydra
from test._utils.simplified_pipeline import run


DATASET = "graph/MUTAG" # ADD YOUR DATASET HERE
MODELS = ["graph/gcn", "cell/topotune", "simplicial/topotune"] # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE
import lightning as pl
import torch
from omegaconf import OmegaConf
from hydra.utils import instantiate
from topobench.data.preprocessor import PreProcessor
from topobench.dataloader import TBDataloader
from topobench.loss.loss import TBLoss
from topobench.model.model import TBModel
from topobench.evaluator.evaluator import TBEvaluator
from topobench.nn.readouts import identical
from topobench.optimizer import TBOptimizer


class TestPipeline:
Expand All @@ -14,22 +20,124 @@ class TestPipeline:
def setup_method(self):
"""Setup method."""
hydra.core.global_hydra.GlobalHydra.instance().clear()

def test_pipeline(self):
"""Test pipeline."""
with hydra.initialize(config_path="../../configs", job_name="job"):
for MODEL in MODELS:
cfg = hydra.compose(
config_name="run.yaml",
overrides=[
f"model={MODEL}",
f"dataset={DATASET}", # IF YOU IMPLEMENT A LARGE DATASET WITH AN OPTION TO USE A SLICE OF IT, ADD BELOW THE CORRESPONDING OPTION
"trainer.max_epochs=2",
"trainer.min_epochs=1",
"trainer.check_val_every_n_epoch=1",
"paths=test",
"callbacks=model_checkpoint",
],
return_hydra_config=True

# configs
# Load the repo's dataset config and repoint its paths at local test data.
config_dataset = OmegaConf.load("configs/dataset/hypergraph/chordonomicon.yaml")
config_dataset.split_params.data_split_dir = f"datasets/data_splits/chordonomicon/{config_dataset.loader.parameters.version}"  # pylint: disable=line-too-long
config_dataset.loader.parameters.data_dir = "datasets/hypergraph/chords"
# Regression evaluator over the hyperedge-level target.
config_evaluator = {"task": "regression",
                    "num_classes": config_dataset.parameters.num_classes,
                    "metrics": ["rmse", "mse", "mae"]}
# MSE loss matching the dataset's regression task.
config_loss = {"dataset_loss":
               {
                   "task": "regression",
                   "loss_type": "mse"
               }
               }
# Pass-through readout: the backbone already emits final logits,
# so no extra linear layer is applied (logits_linear_layer=False).
config_readout = {
    "hidden_dim": config_dataset.parameters.num_classes,
    "out_channels": config_dataset.parameters.num_classes,
    "task_level": config_dataset.parameters.task_level,
    "logits_linear_layer": False,
}
config_optimizer = {"optimizer_id": "Adam",
                    "parameters":
                        {"lr": 0.01, "weight_decay": 0.0005}
                    }

# backbone class definition
class ModelPipeLine(pl.LightningModule):
    """Custom model pipeline for testing.

    Minimal hypergraph backbone: hyperedge features are scattered onto
    their member nodes through the incidence matrix and concatenated with
    the node features; one linear+ReLU node layer is applied, node
    representations are pooled back onto hyperedges, and a final linear
    layer produces the hyperedge-level output.

    Parameters
    ----------
    dim_in_node : int
        Dimension of input node features after concatenation, i.e.
        node feature dim + hyperedge feature dim.
    dim_hidden : int
        Dimension of hidden layers.
    dim_out : int
        Dimension of output features.
    """
    def __init__(self,
                 dim_in_node,  # batch.x.shape[1] + batch.x_hyperedges.shape[1]
                 dim_hidden,
                 dim_out,
                 ):
        super().__init__()
        self.dim_hidden = dim_hidden
        # Node-level projection of the concatenated input features.
        self.linear_node_0 = torch.nn.Linear(dim_in_node, dim_hidden)
        # Hyperedge-level output head.
        self.linear_hyperedge_0 = torch.nn.Linear(dim_hidden, dim_out)

    def forward(self, batch):  # pylint: disable=arguments-differ
        """Forward pass.

        Parameters
        ----------
        batch : torch_geometric.data.Data
            Input batch of data. Must provide ``x``, ``x_hyperedges``,
            ``incidence_hyperedges`` (sparse, nodes x hyperedges) and
            ``y_hyperedges``.

        Returns
        -------
        dict
            Output dictionary containing node representation and hyperedge logits.
        """
        # Scatter hyperedge features onto member nodes and concatenate
        # with node features along the feature dimension.
        x_node = torch.concat((batch.x,
                               torch.sparse.mm(batch.incidence_hyperedges, batch.x_hyperedges)),  # pylint: disable=not-callable
                              dim=1)
        h_node = self.linear_node_0(x_node)
        h_node = torch.relu(h_node)
        # Pool node representations back onto hyperedges (sum over members).
        h_hyperedge = torch.mm(batch.incidence_hyperedges.T, h_node)
        h_hyperedge = self.linear_hyperedge_0(h_hyperedge)
        model_out = {'h_node': h_node,
                     'h_hyperedge': h_hyperedge,
                     "labels": batch.y_hyperedges}
        # The framework reads predictions from the "logits" key.
        model_out["logits"] = model_out["h_hyperedge"]
        return model_out

# dataset
# Instantiate the loader from the config and materialize the splits.
dataset_loader = instantiate(config_dataset.loader)
dataset, dataset_dir = dataset_loader.load()
preprocessor = PreProcessor(dataset, dataset_dir)
dataset_train, dataset_val, dataset_test = preprocessor.load_dataset_splits(config_dataset.split_params)  # pylint: disable=line-too-long
datamodule = TBDataloader(
    dataset_train=dataset_train,
    dataset_val=dataset_val,
    dataset_test=dataset_test,
    **config_dataset.get("dataloader_params", {}),
)
run(cfg)  # NOTE(review): leftover from the old test -- `cfg` is undefined in the new code; looks like a diff artifact that should be removed.

# model
# Input dim = hyperedge feature dim + version-dependent node feature dim
# (per the config; presumably matches the concatenation in the backbone
# forward pass -- TODO confirm against the dataset's feature shapes).
input_dim = config_dataset.parameters.num_edge_features
if config_dataset.loader.parameters.version == "single_scale":
    input_dim += config_dataset.parameters.num_node_features_single_scale
elif config_dataset.loader.parameters.version == "all_scales":
    input_dim += config_dataset.parameters.num_node_features_all_scales
backbone = ModelPipeLine(dim_in_node=input_dim,
                         dim_hidden=10,
                         dim_out=config_dataset.parameters.num_classes)
loss = TBLoss(config_loss["dataset_loss"])
optimizer = TBOptimizer(**config_optimizer)
readout = identical.NoReadOut(**config_readout)
evaluator = TBEvaluator(**config_evaluator)
optimizer = TBOptimizer(**config_optimizer)  # NOTE(review): duplicate of the assignment above -- one of the two should be removed.
model = TBModel(backbone=backbone,
                readout=readout,
                loss=loss,
                optimizer=optimizer,
                evaluator=evaluator,
                compile=False)

# train
# Short CPU-only run: just enough epochs to exercise fit and test.
trainer = pl.Trainer(max_epochs=3,
                     accelerator="cpu",
                     enable_progress_bar=False,
                     log_every_n_steps=1)
trainer.fit(model, datamodule)
trainer.test(model, datamodule)
test_metrics = trainer.callback_metrics
print(' Testing metrics\n', '-'*25)
for key in test_metrics:
    print('{:<20s} {:>5.4f}'.format(key+':', test_metrics[key].item()))
160 changes: 160 additions & 0 deletions topobench/data/datasets/chordonomicon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
"""Dataset class for Chordonomicon dataset."""

import ast
import os
import os.path as osp

import numpy as np
import pandas as pd
import requests
import torch
from torch_geometric.data import Data, InMemoryDataset, extract_zip
from torch_geometric.io import fs


class ChordonomiconDataset(InMemoryDataset):
    """Dataset class for Chordonomicon dataset.

    Builds a hypergraph from chord progressions: each hyperedge is a
    progression over a shared set of chord nodes, with a per-hyperedge
    frequency feature and a ``local_o_info`` regression target.

    Parameters
    ----------
    root : str
        Directory where the dataset will be stored, raw
        and processed will be subdirectories of it.
    name : str
        Name of the dataset (e.g., 'Chordonomicon').
    version : str
        Version of the dataset, options are 'single_scale' or 'all_scales'.

    Raises
    ------
    ValueError
        If ``version`` is not one of the supported options.
    """

    # One download URL per supported dataset version.
    _URLS = {
        "single_scale": "https://huggingface.co/datasets/PierrickLeKing/topobench-music-synergy/resolve/main/dataframe_226.zip",  # pylint: disable=line-too-long
        "all_scales": "https://huggingface.co/datasets/PierrickLeKing/topobench-music-synergy/resolve/main/dataframe_4313.zip",  # pylint: disable=line-too-long
    }

    def __init__(self, root, name, version):
        self.name = name
        self.root = root
        self.version = version
        self.folder_chordonomicon = osp.join(self.root, self.name)
        # Fail fast on an unsupported version instead of leaving
        # ``self.url`` unset, which previously surfaced much later as an
        # AttributeError inside ``download``.
        if version not in self._URLS:
            raise ValueError(f"Unknown version: {version}")
        self.url = self._URLS[version]
        super().__init__(
            root,
        )
        # Re-load the collated data tuple written by ``process``.
        out = fs.torch_load(self.processed_paths[0])
        data, self.slices, self.sizes, data_cls = out
        self.data = data_cls.from_dict(data)

    def download(self):
        """Download and extract the Chordonomicon dataset archive.

        Raises
        ------
        requests.exceptions.HTTPError
            If the download fails.
        """
        archive_path = osp.join(self.folder_chordonomicon, "dataframe.zip")
        r = requests.get(self.url, timeout=30)
        r.raise_for_status()
        with open(archive_path, "wb") as f:
            f.write(r.content)
        extract_zip(
            archive_path,
            osp.join(self.folder_chordonomicon, "raw"),
        )
        # Remove the archive once its contents are extracted.
        os.unlink(archive_path)

    def process(self):
        """Handle the Chordonomicon dataset.

        Convert the raw data into a PyTorch Geometric Data object and save it.
        """
        df = pd.read_csv(
            osp.join(self.folder_chordonomicon, "raw", self.raw_file_names[0])
        )
        # "chords" is stored as the string repr of a list; parse it back
        # into arrays of node (chord) indices.
        df["chords"] = (
            df["chords"].apply(ast.literal_eval).apply(list).apply(np.array)
        )
        # COO indices: row = chord (node) id, column = hyperedge id,
        # each hyperedge id repeated once per chord in its progression.
        t1 = torch.from_numpy(np.concatenate(df["chords"].values))
        t2 = torch.tensor(df["chords"].apply(len).values)
        indices = torch.stack(
            (t1, torch.repeat_interleave(torch.arange(len(t2)), t2))
        )
        incidence_hyperedges = torch.sparse_coo_tensor(
            indices, torch.ones(indices.shape[1])
        ).coalesce()
        x_hyperedges = torch.tensor(
            df["frequency"].values, dtype=torch.float32
        ).unsqueeze(1)
        y_hyperedges = torch.tensor(
            df["local_o_info"].values, dtype=torch.float32
        )
        data = Data(
            incidence_hyperedges=incidence_hyperedges,
            num_hyperedges=incidence_hyperedges.size(1),
            x_hyperedges=x_hyperedges,
            y_hyperedges=y_hyperedges,
            y=y_hyperedges,
            # One-hot (identity) node features.
            x=torch.eye(incidence_hyperedges.size(0)),
        )
        data, slices = self.collate([data])
        fs.torch_save(
            (
                data.to_dict(),
                slices,
                {},
                data.__class__,
            ),
            self.processed_paths[0],
        )

    @property
    def raw_file_names(self) -> list[str]:
        """Return the raw file names for the dataset.

        Returns
        -------
        list[str]
            List of raw file names.
        """
        if self.version == "single_scale":
            return ["dataframe_226.csv"]
        if self.version == "all_scales":
            return ["dataframe_4313.csv"]
        raise ValueError(f"Unknown version: {self.version}")

    @property
    def processed_file_names(self) -> str:
        """Return the processed file name for the dataset.

        Returns
        -------
        str
            Processed file name.
        """
        if self.version == "single_scale":
            return "data_226.pt"
        if self.version == "all_scales":
            return "data_4313.pt"
        raise ValueError(f"Unknown version: {self.version}")

    @property
    def raw_dir(self) -> str:
        """Return the path to the raw directory of the dataset.

        Returns
        -------
        str
            Path to the raw directory.
        """
        return osp.join(self.root, self.name, "raw")

    @property
    def processed_dir(self) -> str:
        """Return the path to the processed directory of the dataset.

        Returns
        -------
        str
            Path to the processed directory.
        """
        return osp.join(self.root, self.name, "processed")
35 changes: 35 additions & 0 deletions topobench/data/loaders/hypergraph/chordonomicon_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Loader for Chordonomicon dataset."""

from topobench.data.datasets import ChordonomiconDataset
from topobench.data.loaders.base import AbstractLoader


class ChordonomiconDatasetLoader(AbstractLoader):
    """Loader class for Chordonomicon dataset.

    Parameters
    ----------
    parameters : DictConfig
        Configuration parameters containing:
        - data_dir (str): Root directory where the dataset folder is stored.
        - data_name (str): Name of the dataset.
        - version (str): Version of the dataset, options are 'single_scale', 'all_scales'.

    Raises
    ------
    ValueError
        If ``parameters.version`` is not a supported option.
    """

    # Versions the underlying dataset class knows how to download.
    _SUPPORTED_VERSIONS = ("single_scale", "all_scales")

    def __init__(self, parameters):
        super().__init__(parameters)
        self.version = parameters.version
        # Validate early so a config typo fails at loader construction
        # rather than midway through dataset download/processing.
        if self.version not in self._SUPPORTED_VERSIONS:
            raise ValueError(
                f"Unknown version: {self.version!r}; "
                f"expected one of {self._SUPPORTED_VERSIONS}"
            )

    def load_dataset(self) -> ChordonomiconDataset:
        """Load the Chordonomicon dataset.

        Returns
        -------
        ChordonomiconDataset
            The loaded Chordonomicon dataset.
        """
        return ChordonomiconDataset(
            root=self.root_data_dir,
            name=self.parameters.data_name,
            version=self.version,
        )
Loading