diff --git a/configs/dataset/hypergraph/chordonomicon.yaml b/configs/dataset/hypergraph/chordonomicon.yaml
new file mode 100644
index 000000000..b2e2fbc4d
--- /dev/null
+++ b/configs/dataset/hypergraph/chordonomicon.yaml
@@ -0,0 +1,37 @@
+# Dataset loader config
+loader:
+  _target_: topobench.data.loaders.ChordonomiconDatasetLoader
+  parameters:
+    data_domain: hypergraph
+    data_type: chords
+    data_name: chordonomicon
+    data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type}
+    version: all_scales # options: ['single_scale', 'all_scales']
+
+# Dataset parameters
+parameters:
+  num_features: 1
+  num_classes: 1
+  num_edge_features: 1
+  num_node_features_single_scale: 12
+  num_node_features_all_scales: 38
+  task: regression
+  loss_type: mse
+  monitor_metric: mae
+  task_level: edge
+
+# Splits
+split_params:
+  learning_setting: transductive
+  data_seed: 0
+  split_type: random # either "k-fold" or "random"
+  k: 10 # used by the "k-fold" strategy
+  train_prop: 0.9 # used by the "random" strategy
+  standardize: False
+  data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}/${dataset.loader.parameters.version}
+
+# Dataloader parameters
+dataloader_params:
+  batch_size: 1 # fixed since the setting is transductive
+  num_workers: 0
+  pin_memory: False
diff --git a/test/pipeline/test_pipeline.py b/test/pipeline/test_pipeline.py
index 785987159..fdcc8aa74 100644
--- a/test/pipeline/test_pipeline.py
+++ b/test/pipeline/test_pipeline.py
@@ -1,11 +1,17 @@
 """Test pipeline for a particular dataset and model."""
 import hydra
-from test._utils.simplified_pipeline import run
-
-
-DATASET = "graph/MUTAG"  # ADD YOUR DATASET HERE
-MODELS = ["graph/gcn", "cell/topotune", "simplicial/topotune"]  # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE
+import lightning as pl
+import torch
+from omegaconf import OmegaConf
+from hydra.utils import instantiate
+from topobench.data.preprocessor import PreProcessor
+from topobench.dataloader import TBDataloader
+from topobench.loss.loss import TBLoss
+from topobench.model.model import TBModel
+from topobench.evaluator.evaluator import TBEvaluator
+from topobench.nn.readouts import identical
+from topobench.optimizer import TBOptimizer
 
 
 class TestPipeline:
     """Test pipeline."""
@@ -14,22 +20,124 @@ class TestPipeline:
     def setup_method(self):
         """Setup method."""
         hydra.core.global_hydra.GlobalHydra.instance().clear()
-
+
     def test_pipeline(self):
         """Test pipeline."""
-        with hydra.initialize(config_path="../../configs", job_name="job"):
-            for MODEL in MODELS:
-                cfg = hydra.compose(
-                    config_name="run.yaml",
-                    overrides=[
-                        f"model={MODEL}",
-                        f"dataset={DATASET}",  # IF YOU IMPLEMENT A LARGE DATASET WITH AN OPTION TO USE A SLICE OF IT, ADD BELOW THE CORRESPONDING OPTION
-                        "trainer.max_epochs=2",
-                        "trainer.min_epochs=1",
-                        "trainer.check_val_every_n_epoch=1",
-                        "paths=test",
-                        "callbacks=model_checkpoint",
-                    ],
-                    return_hydra_config=True
+
+        # configs
+        config_dataset = OmegaConf.load("configs/dataset/hypergraph/chordonomicon.yaml")
+        config_dataset.split_params.data_split_dir = f"datasets/data_splits/chordonomicon/{config_dataset.loader.parameters.version}"  # pylint: disable=line-too-long
+        config_dataset.loader.parameters.data_dir = "datasets/hypergraph/chords"
+        config_evaluator = {
+            "task": "regression",
+            "num_classes": config_dataset.parameters.num_classes,
+            "metrics": ["rmse", "mse", "mae"],
+        }
+        config_loss = {
+            "dataset_loss": {
+                "task": "regression",
+                "loss_type": "mse",
+            }
+        }
+        config_readout = {
"hidden_dim": config_dataset.parameters.num_classes, + "out_channels": config_dataset.parameters.num_classes, + "task_level": config_dataset.parameters.task_level, + "logits_linear_layer": False, + } + config_optimizer = {"optimizer_id": "Adam", + "parameters": + {"lr": 0.01,"weight_decay": 0.0005} + } + + # backbone class definition + class ModelPipeLine(pl.LightningModule): + """Custom model pipeline for testing. + + Parameters + ---------- + dim_in_node : int + Dimension of input node features. + dim_hidden : int + Dimension of hidden layers. + dim_out : int + Dimension of output features. + """ + def __init__(self, + dim_in_node, #batch.x.size(0)+batch.x_hyperedges.shape[1] + dim_hidden, + dim_out, + ): + super().__init__() + self.dim_hidden = dim_hidden + self.linear_node_0 = torch.nn.Linear(dim_in_node, dim_hidden) + self.linear_hyperedge_0 = torch.nn.Linear(dim_hidden, dim_out) + + def forward(self, batch): #pylint: disable=arguments-differ + """Forward pass. + + Parameters + ---------- + batch : torch_geometric.data.Data + Input batch of data. + + Returns + ------- + dict + Output dictionary containing node representation and hyperedge logits. + """ + x_node = torch.concat((batch.x, + torch.sparse.mm(batch.incidence_hyperedges, batch.x_hyperedges)), #pylint: disable=not-callable + dim=1) + h_node = self.linear_node_0(x_node) + h_node = torch.relu(h_node) + h_hyperedge = torch.mm(batch.incidence_hyperedges.T, h_node) + h_hyperedge = self.linear_hyperedge_0(h_hyperedge) + model_out = {'h_node': h_node, + 'h_hyperedge': h_hyperedge, + "labels": batch.y_hyperedges} + model_out["logits"] = model_out["h_hyperedge"] + return model_out + + # dataset + dataset_loader = instantiate(config_dataset.loader) + dataset, dataset_dir = dataset_loader.load() + preprocessor = PreProcessor(dataset, dataset_dir) + dataset_train, dataset_val, dataset_test = preprocessor.load_dataset_splits(config_dataset.split_params) #pylint: disable=line-too-long + datamodule = TBDataloader( + dataset_train=dataset_train, + dataset_val=dataset_val, + dataset_test=dataset_test, + **config_dataset.get("dataloader_params", {}), ) - run(cfg) \ No newline at end of file + + # model + input_dim = config_dataset.parameters.num_edge_features + if config_dataset.loader.parameters.version == "single_scale": + input_dim += config_dataset.parameters.num_node_features_single_scale + elif config_dataset.loader.parameters.version == "all_scales": + input_dim += config_dataset.parameters.num_node_features_all_scales + backbone = ModelPipeLine(dim_in_node=input_dim, + dim_hidden=10, + dim_out=config_dataset.parameters.num_classes) + loss = TBLoss(config_loss["dataset_loss"]) + optimizer = TBOptimizer(**config_optimizer) + readout = identical.NoReadOut(**config_readout) + evaluator = TBEvaluator(**config_evaluator) + optimizer = TBOptimizer(**config_optimizer) + model = TBModel(backbone=backbone, + readout=readout, + loss=loss, + optimizer=optimizer, + evaluator=evaluator, + compile=False) + + # train + trainer = pl.Trainer(max_epochs=3, + accelerator="cpu", + enable_progress_bar=False, + log_every_n_steps=1) + trainer.fit(model, datamodule) + trainer.test(model, datamodule) + test_metrics = trainer.callback_metrics + print(' Testing metrics\n', '-'*25) + for key in test_metrics: + print('{:<20s} {:>5.4f}'.format(key+':', test_metrics[key].item())) diff --git a/topobench/data/datasets/chordonomicon.py b/topobench/data/datasets/chordonomicon.py new file mode 100644 index 000000000..38531ee92 --- /dev/null +++ 
@@ -0,0 +1,160 @@
+"""Dataset class for the Chordonomicon dataset."""
+
+import ast
+import os
+import os.path as osp
+
+import numpy as np
+import pandas as pd
+import requests
+import torch
+from torch_geometric.data import Data, InMemoryDataset, extract_zip
+from torch_geometric.io import fs
+
+
+class ChordonomiconDataset(InMemoryDataset):
+    """Dataset class for the Chordonomicon dataset.
+
+    Parameters
+    ----------
+    root : str
+        Directory where the dataset is stored; the 'raw' and 'processed'
+        folders are created as subdirectories of it.
+    name : str
+        Name of the dataset (e.g., 'Chordonomicon').
+    version : str
+        Version of the dataset, either 'single_scale' or 'all_scales'.
+    """
+
+    def __init__(self, root, name, version):
+        self.name = name
+        self.root = root
+        self.version = version
+        self.folder_chordonomicon = osp.join(self.root, self.name)
+        if self.version == "single_scale":
+            self.url = "https://huggingface.co/datasets/PierrickLeKing/topobench-music-synergy/resolve/main/dataframe_226.zip"  # pylint: disable=line-too-long
+        elif self.version == "all_scales":
+            self.url = "https://huggingface.co/datasets/PierrickLeKing/topobench-music-synergy/resolve/main/dataframe_4313.zip"  # pylint: disable=line-too-long
+        super().__init__(root)
+        out = fs.torch_load(self.processed_paths[0])
+        data, self.slices, self.sizes, data_cls = out
+        self.data = data_cls.from_dict(data)
+
+    def download(self):
+        """Download the Chordonomicon dataset.
+
+        Raises
+        ------
+        requests.exceptions.HTTPError
+            If the download fails.
+        """
+        r = requests.get(self.url, timeout=30)
+        r.raise_for_status()
+        with open(
+            osp.join(self.folder_chordonomicon, "dataframe.zip"), "wb"
+        ) as f:
+            f.write(r.content)
+        extract_zip(
+            osp.join(self.folder_chordonomicon, "dataframe.zip"),
+            osp.join(self.folder_chordonomicon, "raw"),
+        )
+        os.unlink(osp.join(self.folder_chordonomicon, "dataframe.zip"))
+
+    def process(self):
+        """Process the Chordonomicon dataset.
+
+        Convert the raw data into a PyTorch Geometric Data object and save it.
+        """
+        df = pd.read_csv(
+            osp.join(self.folder_chordonomicon, "raw", self.raw_file_names[0])
+        )
+        df["chords"] = (
+            df["chords"].apply(ast.literal_eval).apply(list).apply(np.array)
+        )
+        # Each dataframe row is one hyperedge; pair every chord (node) index
+        # with the index of the hyperedge it belongs to.
+        t1 = torch.from_numpy(np.concatenate(df["chords"].values))
+        t2 = torch.tensor(df["chords"].apply(len).values)
+        indices = torch.stack(
+            (t1, torch.repeat_interleave(torch.arange(len(t2)), t2))
+        )
+        incidence_hyperedges = torch.sparse_coo_tensor(
+            indices, torch.ones(indices.shape[1])
+        ).coalesce()
+        x_hyperedges = torch.tensor(
+            df["frequency"].values, dtype=torch.float32
+        ).unsqueeze(1)
+        y_hyperedges = torch.tensor(
+            df["local_o_info"].values, dtype=torch.float32
+        )
+        data = Data(
+            incidence_hyperedges=incidence_hyperedges,
+            num_hyperedges=incidence_hyperedges.size(1),
+            x_hyperedges=x_hyperedges,
+            y_hyperedges=y_hyperedges,
+            y=y_hyperedges,
+            x=torch.eye(incidence_hyperedges.size(0)),
+        )
+        data_list = [data]
+        data, slices = self.collate(data_list)
+        fs.torch_save(
+            (
+                data.to_dict(),
+                slices,
+                {},
+                data.__class__,
+            ),
+            self.processed_paths[0],
+        )
+
+    @property
+    def raw_file_names(self) -> list[str]:
+        """Return the raw file names for the dataset.
+
+        Returns
+        -------
+        list[str]
+            List of raw file names.
+ """ + if self.version == "single_scale": + return ["dataframe_226.csv"] + elif self.version == "all_scales": + return ["dataframe_4313.csv"] + else: + raise ValueError(f"Unknown version: {self.version}") + + @property + def processed_file_names(self) -> str: + """Return the processed file name for the dataset. + + Returns + ------- + str + Processed file name. + """ + if self.version == "single_scale": + return "data_226.pt" + elif self.version == "all_scales": + return "data_4313.pt" + else: + raise ValueError(f"Unknown version: {self.version}") + + @property + def raw_dir(self) -> str: + """Return the path to the raw directory of the dataset. + + Returns + ------- + str + Path to the raw directory. + """ + return osp.join(self.root, self.name, "raw") + + @property + def processed_dir(self) -> str: + """Return the path to the processed directory of the dataset. + + Returns + ------- + str + Path to the processed directory. + """ + return osp.join(self.root, self.name, "processed") diff --git a/topobench/data/loaders/hypergraph/chordonomicon_loader.py b/topobench/data/loaders/hypergraph/chordonomicon_loader.py new file mode 100644 index 000000000..3d8bd96df --- /dev/null +++ b/topobench/data/loaders/hypergraph/chordonomicon_loader.py @@ -0,0 +1,35 @@ +"""Loader for Chordonomicon dataset.""" + +from topobench.data.datasets import ChordonomiconDataset +from topobench.data.loaders.base import AbstractLoader + + +class ChordonomiconDatasetLoader(AbstractLoader): + """Loader class for Chordonomicon dataset. + + Parameters + ---------- + parameters : DictConfig + Configuration parameters containing: + - data_dir (str): Root directory where the dataset folder is stored. + - data_name (str): Name of the dataset. + - version (str): Version of the dataset, options are 'single_scale', 'all_scales'. + """ + + def __init__(self, parameters): + super().__init__(parameters) + self.version = parameters.version + + def load_dataset(self) -> ChordonomiconDataset: + """Load the Chordonomicon dataset. + + Returns + ------- + ChordonomiconDataset + The loaded Chordonomicon dataset. + """ + return ChordonomiconDataset( + root=self.root_data_dir, + name=self.parameters.data_name, + version=self.version, + ) diff --git a/topobench/nn/readouts/base.py b/topobench/nn/readouts/base.py index 6fdd8412f..a2f20cdc7 100755 --- a/topobench/nn/readouts/base.py +++ b/topobench/nn/readouts/base.py @@ -42,7 +42,9 @@ def __init__( if hidden_dim != out_channels or logits_linear_layer else torch.nn.Identity() ) - assert task_level in ["graph", "node"], "Invalid task_level" + assert task_level in ["graph", "node"] or ( + self.name == "NoReadOut" and task_level == "edge" + ), "Invalid task_level" self.task_level = task_level self.logits_linear_layer = logits_linear_layer diff --git a/topobench/nn/readouts/identical.py b/topobench/nn/readouts/identical.py index 723eb9ca9..108773883 100644 --- a/topobench/nn/readouts/identical.py +++ b/topobench/nn/readouts/identical.py @@ -17,6 +17,7 @@ class NoReadOut(AbstractZeroCellReadOut): """ def __init__(self, **kwargs): + self.name = "NoReadOut" super().__init__(**kwargs) def forward(