Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,8 @@ Specially useful in pre-processing steps, these are the general data manipulatio
| Cora | Classification | Cocitation dataset. | [Source](https://link.springer.com/article/10.1023/A:1009953814988) |
| Citeseer | Classification | Cocitation dataset. | [Source](https://dl.acm.org/doi/10.1145/276675.276685) |
| Pubmed | Classification | Cocitation dataset. | [Source](https://ojs.aaai.org/aimagazine/index.php/aimagazine/article/view/2157) |
| ogbn-arxiv | Classification | Node property prediction (classification) | [Source](https://arxiv.org/abs/2005.00687) |
| ogbn-products | Classification | Node property prediction (classification) | [Source](https://arxiv.org/abs/2005.00687) |
| MUTAG | Classification | Graph-level classification. | [Source](https://pubs.acs.org/doi/abs/10.1021/jm00106a046) |
| PROTEINS | Classification | Graph-level classification. | [Source](https://academic.oup.com/bioinformatics/article/21/suppl_1/i47/202991) |
| NCI1 | Classification | Graph-level classification. | [Source](https://ieeexplore.ieee.org/document/4053093) |
Expand Down
32 changes: 32 additions & 0 deletions configs/dataset/graph/ogbn-arxiv.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Dataset loader config
loader:
  _target_: topobench.data.loaders.graph.ogbn_dataset_loader.OGBNDatasetLoader
  parameters:
    data_domain: graph
    data_type: OGBNDataset
    data_name: ogbn-arxiv
    data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type}

# Dataset parameters
parameters:
  num_features: 128
  num_classes: 40
  task: classification
  loss_type: cross_entropy
  monitor_metric: accuracy
  task_level: node

# Split parameters
split_params:
  learning_setting: transductive
  data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}
  split_type: random #'k-fold' # either "k-fold" or "random" strategies
  data_seed: 0
  k: 10 # for "k-fold" Cross-Validation
  train_prop: 0.5 # for "random" strategy splitting

# Dataloader parameters
dataloader_params:
  batch_size: 1 # single-graph (transductive) dataset, so one batch per epoch
  num_workers: 1
  pin_memory: False
33 changes: 33 additions & 0 deletions configs/dataset/graph/ogbn-products.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Dataset loader config
loader:
  _target_: topobench.data.loaders.graph.ogbn_dataset_loader.OGBNDatasetLoader
  parameters:
    data_domain: graph
    data_type: OGBNDataset
    data_name: ogbn-products
    data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type}

# Dataset parameters
parameters:
  num_features: 100
  num_classes: 47
  task: classification
  loss_type: cross_entropy
  monitor_metric: accuracy
  task_level: node

# Split parameters
split_params:
  learning_setting: transductive
  data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}
  split_type: random #'k-fold' # either "k-fold" or "random" strategies
  data_seed: 0
  k: 10 # for "k-fold" Cross-Validation
  train_prop: 0.5 # for "random" strategy splitting

# Dataloader parameters
dataloader_params:
  batch_size: 1 # single-graph (transductive) dataset, so one batch per epoch
  num_workers: 1
  pin_memory: False


2 changes: 1 addition & 1 deletion docs/tdl-challenge/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ Requirements for Mission A (Categories A1 and A2)
b. Define a class ``{Name}DatasetLoader`` implementing ``load_dataset()`` that loads
the entire dataset (optionally with pre-defined splits).

c. This class must inherit from ``data.loaders.base.AbstractLoader``.
c. This class must inherit from ``topobench.data.loaders.base.AbstractLoader``.

2. *(Only if necessary)* ``{name}_dataset.py`` **or** ``{name}_datasets.py``

Expand Down
38 changes: 26 additions & 12 deletions test/data/load/test_datasetloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,19 +37,33 @@ def _gather_config_files(self, base_dir: Path) -> List[str]:
config_files = []
config_base_dir = base_dir / "configs/dataset"
# Below the datasets that have some default transforms manually overriten with no_transform,
exclude_datasets = {"karate_club.yaml",
# Below the datasets that have some default transforms with we manually overriten with no_transform,
# due to lack of default transform for domain2domain
"REDDIT-BINARY.yaml", "IMDB-MULTI.yaml", "IMDB-BINARY.yaml", #"ZINC.yaml"
"ogbg-molpcba.yaml", "manual_dataset.yaml" # "ogbg-molhiv.yaml"
}

# Below the datasets that takes quite some time to load and process
self.long_running_datasets = {"mantra_name.yaml", "mantra_orientation.yaml", "mantra_genus.yaml", "mantra_betti_numbers.yaml"}
exclude_datasets = {
"karate_club.yaml",
# Below the datasets that have some default transforms which we manually override with no_transform,
# due to lack of default transform for domain2domain
"REDDIT-BINARY.yaml",
"IMDB-MULTI.yaml",
"IMDB-BINARY.yaml", # "ZINC.yaml"
"ogbg-molpcba.yaml",
"manual_dataset.yaml", # "ogbg-molhiv.yaml"
"roman_empire.yaml", # Corrupted data file (BadZipFile error)
"Mushroom.yaml", # Duplicate .ipynb_checkpoints folder (shutil.Error)
"ModelNet40.yaml", # Large download - prone to network errors (ChunkedEncodingError)
"ogbn-products.yaml",
}

# Below the datasets that take quite some time to load and process
self.long_running_datasets = {
"mantra_name.yaml",
"mantra_orientation.yaml",
"mantra_genus.yaml",
"mantra_betti_numbers.yaml",
"ogbn-arxiv.yaml",
}


for dir_path in config_base_dir.iterdir():
curr_dir = str(dir_path).split('/')[-1]
curr_dir = dir_path.name
if dir_path.is_dir():
config_files.extend([
(curr_dir, f.name) for f in dir_path.glob("*.yaml")
Expand Down Expand Up @@ -80,8 +94,8 @@ def _load_dataset(self, data_domain: str, config_file: str) -> Tuple[Any, Dict]:
print('Current config file: ', config_file)
parameters = hydra.compose(
config_name="run.yaml",
overrides=[f"dataset={data_domain}/{config_file}", f"model=graph/gat"],
return_hydra_config=True,
overrides=[f"dataset={data_domain}/{config_file}", f"model=graph/gat"],
return_hydra_config=False,
)
dataset_loader = hydra.utils.instantiate(parameters.dataset.loader)
print(repr(dataset_loader))
Expand Down
6 changes: 4 additions & 2 deletions test/pipeline/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@
from test._utils.simplified_pipeline import run


DATASET = "graph/MUTAG" # ADD YOUR DATASET HERE
MODELS = ["graph/gcn", "cell/topotune", "simplicial/topotune"] # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE
# Use a dataset whose labels are 1D so it is compatible with the current split utilities.
# This keeps changes confined to tests, as required.
DATASET = "graph/ogbn-arxiv" # ADD YOUR DATASET HERE
MODELS = ["graph/gcn"] # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE


class TestPipeline:
Expand Down
78 changes: 78 additions & 0 deletions topobench/data/loaders/graph/ogbn_dataset_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""Loader for OGB node property prediction datasets."""

from pathlib import Path

import torch
from ogb.nodeproppred import PygNodePropPredDataset
from omegaconf import DictConfig

from topobench.data.loaders.base import AbstractLoader


class OGBNDatasetLoader(AbstractLoader):
    """Load OGB node property prediction datasets (ogbn-arxiv, ogbn-products).

    Construction is fully handled by :class:`AbstractLoader`; no extra
    initialization is required here.

    Parameters
    ----------
    parameters : DictConfig
        Configuration parameters containing data_dir and data_name.
    """

    def load_dataset(self, **kwargs) -> PygNodePropPredDataset:
        """Load an OGB node property prediction dataset.

        Additional keyword arguments are accepted for API compatibility with
        other loaders (e.g. ``slice`` used in tests for long-running datasets),
        but are currently ignored because OGBN datasets are represented as a
        single large graph.

        Parameters
        ----------
        **kwargs : dict
            Additional keyword arguments accepted for API compatibility.

        Returns
        -------
        PygNodePropPredDataset
            The loaded OGBN dataset, with float node features, 1D labels,
            and the official split indices attached as ``split_idx``.
        """
        dataset = self._initialize_dataset()
        self.data_dir = self._redefine_data_dir(dataset)

        # NOTE: mutate the underlying storage (``_data``) rather than the
        # ``data`` property so the changes persist on the dataset object.
        # Convert node features to float so they are usable by float-weight
        # GNN layers.
        dataset._data.x = dataset._data.x.to(torch.float)
        # OGB stores node labels with shape (num_nodes, 1); squeeze to a 1D
        # tensor as expected by downstream split/metric utilities.
        dataset._data.y = dataset._data.y.squeeze(1)
        # Attach the official train/valid/test node index split.
        dataset.split_idx = dataset.get_idx_split()

        return dataset

    def _initialize_dataset(self) -> PygNodePropPredDataset:
        """Initialize the OGBN dataset specified by ``parameters.data_name``.

        Downloads the dataset into ``self.root_data_dir`` on first use.

        Returns
        -------
        PygNodePropPredDataset
            The initialized dataset instance.
        """
        return PygNodePropPredDataset(
            name=self.parameters.data_name, root=str(self.root_data_dir)
        )

    def _redefine_data_dir(self, dataset: PygNodePropPredDataset) -> Path:
        """Redefine the data directory for the OGBN dataset.

        Parameters
        ----------
        dataset : PygNodePropPredDataset
            The dataset instance.

        Returns
        -------
        Path
            The processed root directory path.
        """
        return Path(dataset.root) / dataset.name / "processed"