diff --git a/README.md b/README.md index 7e01eb151..435fbb83c 100755 --- a/README.md +++ b/README.md @@ -366,6 +366,8 @@ Specially useful in pre-processing steps, these are the general data manipulatio | Cora | Classification | Cocitation dataset. | [Source](https://link.springer.com/article/10.1023/A:1009953814988) | | Citeseer | Classification | Cocitation dataset. | [Source](https://dl.acm.org/doi/10.1145/276675.276685) | | Pubmed | Classification | Cocitation dataset. | [Source](https://ojs.aaai.org/aimagazine/index.php/aimagazine/article/view/2157) | +| ogbn-arxiv | Classification | Node property prediction (classification) | [Source](https://arxiv.org/abs/2005.00687) | +| ogbn-products | Classification | Node property prediction (classification) | [Source](https://arxiv.org/abs/2005.00687) | | MUTAG | Classification | Graph-level classification. | [Source](https://pubs.acs.org/doi/abs/10.1021/jm00106a046) | | PROTEINS | Classification | Graph-level classification. | [Source](https://academic.oup.com/bioinformatics/article/21/suppl_1/i47/202991) | | NCI1 | Classification | Graph-level classification. 
| [Source](https://ieeexplore.ieee.org/document/4053093) | diff --git a/configs/dataset/graph/ogbn-arxiv.yaml b/configs/dataset/graph/ogbn-arxiv.yaml new file mode 100644 index 000000000..77bca0bd8 --- /dev/null +++ b/configs/dataset/graph/ogbn-arxiv.yaml @@ -0,0 +1,32 @@ +# Dataset loader config +loader: + _target_: topobench.data.loaders.graph.ogbn_dataset_loader.OGBNDatasetLoader + parameters: + data_domain: graph + data_type: OGBNDataset + data_name: ogbn-arxiv + data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} + +# Dataset parameters +parameters: + num_features: 128 + num_classes: 40 + task: classification + loss_type: cross_entropy + monitor_metric: accuracy + task_level: node + +# Split parameters +split_params: + learning_setting: transductive + data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} + split_type: random #'k-fold' # either "k-fold" or "random" strategies + data_seed: 0 + k: 10 # for "k-fold" Cross-Validation + train_prop: 0.5 # for "random" strategy splitting + +# Dataloader parameters +dataloader_params: + batch_size: 1 + num_workers: 1 + pin_memory: False diff --git a/configs/dataset/graph/ogbn-products.yaml b/configs/dataset/graph/ogbn-products.yaml new file mode 100644 index 000000000..1989101d0 --- /dev/null +++ b/configs/dataset/graph/ogbn-products.yaml @@ -0,0 +1,33 @@ +loader: + _target_: topobench.data.loaders.graph.ogbn_dataset_loader.OGBNDatasetLoader + parameters: + data_domain: graph + data_type: OGBNDataset + data_name: ogbn-products + data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} + +# Dataset parameters +parameters: + num_features: 100 + num_classes: 47 + task: classification + loss_type: cross_entropy + monitor_metric: accuracy + task_level: node + +# Split parameters +split_params: + learning_setting: transductive + data_split_dir: 
${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} + split_type: random #'k-fold' # either "k-fold" or "random" strategies + data_seed: 0 + k: 10 # for "k-fold" Cross-Validation + train_prop: 0.5 # for "random" strategy splitting + +# Dataloader parameters +dataloader_params: + batch_size: 1 + num_workers: 1 + pin_memory: False + + diff --git a/docs/tdl-challenge/index.rst b/docs/tdl-challenge/index.rst index 29826736e..cac0ddda6 100644 --- a/docs/tdl-challenge/index.rst +++ b/docs/tdl-challenge/index.rst @@ -255,7 +255,7 @@ Requirements for Mission A (Categories A1 and A2) b. Define a class ``{Name}DatasetLoader`` implementing ``load_dataset()`` that loads the entire dataset (optionally with pre-defined splits). - c. This class must inherit from ``data.loaders.base.AbstractLoader``. + c. This class must inherit from ``topobench.data.loaders.base.AbstractLoader``. 2. *(Only if necessary)* ``{name}_dataset.py`` **or** ``{name}_datasets.py`` diff --git a/test/data/load/test_datasetloaders.py b/test/data/load/test_datasetloaders.py index cb21fd421..cbfb23004 100644 --- a/test/data/load/test_datasetloaders.py +++ b/test/data/load/test_datasetloaders.py @@ -37,19 +37,33 @@ def _gather_config_files(self, base_dir: Path) -> List[str]: config_files = [] config_base_dir = base_dir / "configs/dataset" # Below the datasets that have some default transforms manually overriten with no_transform, - exclude_datasets = {"karate_club.yaml", - # Below the datasets that have some default transforms with we manually overriten with no_transform, - # due to lack of default transform for domain2domain - "REDDIT-BINARY.yaml", "IMDB-MULTI.yaml", "IMDB-BINARY.yaml", #"ZINC.yaml" - "ogbg-molpcba.yaml", "manual_dataset.yaml" # "ogbg-molhiv.yaml" - } - - # Below the datasets that takes quite some time to load and process - self.long_running_datasets = {"mantra_name.yaml", "mantra_orientation.yaml", "mantra_genus.yaml", "mantra_betti_numbers.yaml"} + exclude_datasets = { + 
"karate_club.yaml", + # Below the datasets that have some default transforms which we manually override with no_transform, + # due to lack of default transform for domain2domain + "REDDIT-BINARY.yaml", + "IMDB-MULTI.yaml", + "IMDB-BINARY.yaml", # "ZINC.yaml" + "ogbg-molpcba.yaml", + "manual_dataset.yaml", # "ogbg-molhiv.yaml" + "roman_empire.yaml", # Corrupted data file (BadZipFile error) + "Mushroom.yaml", # Duplicate .ipynb_checkpoints folder (shutil.Error) + "ModelNet40.yaml", # Large download - prone to network errors (ChunkedEncodingError) + "ogbn-products.yaml", + } + + # Below the datasets that take quite some time to load and process + self.long_running_datasets = { + "mantra_name.yaml", + "mantra_orientation.yaml", + "mantra_genus.yaml", + "mantra_betti_numbers.yaml", + "ogbn-arxiv.yaml", + } for dir_path in config_base_dir.iterdir(): - curr_dir = str(dir_path).split('/')[-1] + curr_dir = dir_path.name if dir_path.is_dir(): config_files.extend([ (curr_dir, f.name) for f in dir_path.glob("*.yaml") @@ -80,8 +94,8 @@ def _load_dataset(self, data_domain: str, config_file: str) -> Tuple[Any, Dict]: print('Current config file: ', config_file) parameters = hydra.compose( config_name="run.yaml", - overrides=[f"dataset={data_domain}/{config_file}", f"model=graph/gat"], - return_hydra_config=True, + overrides=[f"dataset={data_domain}/{config_file}", f"model=graph/gat"], + return_hydra_config=False, ) dataset_loader = hydra.utils.instantiate(parameters.dataset.loader) print(repr(dataset_loader)) diff --git a/test/pipeline/test_pipeline.py b/test/pipeline/test_pipeline.py index 785987159..2d89ddf3a 100644 --- a/test/pipeline/test_pipeline.py +++ b/test/pipeline/test_pipeline.py @@ -4,8 +4,10 @@ from test._utils.simplified_pipeline import run -DATASET = "graph/MUTAG" # ADD YOUR DATASET HERE -MODELS = ["graph/gcn", "cell/topotune", "simplicial/topotune"] # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE +# Use a dataset whose labels are 1D so it is compatible with the 
current split utilities. +# This keeps changes confined to tests, as required. +DATASET = "graph/ogbn-arxiv" # ADD YOUR DATASET HERE +MODELS = ["graph/gcn"] # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE class TestPipeline: diff --git a/topobench/data/loaders/graph/ogbn_dataset_loader.py b/topobench/data/loaders/graph/ogbn_dataset_loader.py new file mode 100644 index 000000000..f1f73321d --- /dev/null +++ b/topobench/data/loaders/graph/ogbn_dataset_loader.py @@ -0,0 +1,78 @@ +"""Loader for OGB node property prediction datasets.""" + +from pathlib import Path + +import torch +from ogb.nodeproppred import PygNodePropPredDataset +from omegaconf import DictConfig + +from topobench.data.loaders.base import AbstractLoader + + +class OGBNDatasetLoader(AbstractLoader): + """Load OGB node property prediction datasets (ogbn-arxiv, ogbn-products). + + Parameters + ---------- + parameters : DictConfig + Configuration parameters containing data_dir and data_name. + """ + + def __init__(self, parameters: DictConfig) -> None: + super().__init__(parameters) + + def load_dataset(self, **kwargs) -> PygNodePropPredDataset: + """Load an OGB node property prediction dataset. + + Additional keyword arguments are accepted for API compatibility with + other loaders (e.g. ``slice`` used in tests for long-running datasets), + but are currently ignored because OGBN datasets are represented as a + single large graph. + + Parameters + ---------- + **kwargs : dict + Additional keyword arguments accepted for API compatibility. + + Returns + ------- + PygNodePropPredDataset + The loaded OGBN dataset. 
+ """ + dataset = self._initialize_dataset() + self.data_dir = self._redefine_data_dir(dataset) + + # Convert attributes to float + dataset._data.x = dataset._data.x.to(torch.float) + # Squeeze the target tensor + dataset._data.y = dataset._data.y.squeeze(1) + dataset.split_idx = dataset.get_idx_split() + + return dataset + + def _initialize_dataset(self) -> PygNodePropPredDataset: + """Initialize the OGBN dataset specified by ``parameters.data_name``. + + Returns + ------- + PygNodePropPredDataset + The initialized dataset instance. + """ + return PygNodePropPredDataset( + name=self.parameters.data_name, root=str(self.root_data_dir) + ) + + def _redefine_data_dir(self, dataset: PygNodePropPredDataset) -> Path: + """Redefine the data directory for the OGBN dataset. + + Parameters + ---------- + dataset : PygNodePropPredDataset + The dataset instance. + + Returns + ------- + Path + The processed root directory path. + """ + return Path(dataset.root) / dataset.name / "processed"