From 6f16d4dcbc5502f2f9ce37e5c03b02918f9eb114 Mon Sep 17 00:00:00 2001 From: theosaulus Date: Mon, 17 Nov 2025 15:17:17 -0500 Subject: [PATCH 01/17] initial commit for OC20/22 --- configs/dataset/graph/OC20_IS2RE.yaml | 36 ++ .../dataset/graph/OC20_S2EF_train_200K.yaml | 34 ++ configs/dataset/graph/OC20_S2EF_val_id.yaml | 36 ++ configs/dataset/graph/OC20_tiny.yaml | 34 ++ configs/dataset/graph/OC22_IS2RE.yaml | 36 ++ .../data/loaders/graph/oc20_dataset_loader.py | 399 ++++++++++++++++++ 6 files changed, 575 insertions(+) create mode 100644 configs/dataset/graph/OC20_IS2RE.yaml create mode 100644 configs/dataset/graph/OC20_S2EF_train_200K.yaml create mode 100644 configs/dataset/graph/OC20_S2EF_val_id.yaml create mode 100644 configs/dataset/graph/OC20_tiny.yaml create mode 100644 configs/dataset/graph/OC22_IS2RE.yaml create mode 100644 topobench/data/loaders/graph/oc20_dataset_loader.py diff --git a/configs/dataset/graph/OC20_IS2RE.yaml b/configs/dataset/graph/OC20_IS2RE.yaml new file mode 100644 index 000000000..4fde80978 --- /dev/null +++ b/configs/dataset/graph/OC20_IS2RE.yaml @@ -0,0 +1,36 @@ +# OC20 IS2RE task (LMDB mode for local use) +# Switch mode to 'lmdb' to download and use the real dataset + +loader: + _target_: topobench.data.loaders.OC20DatasetLoader + parameters: + data_domain: graph + data_type: oc20 + data_name: OC20_IS2RE + data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} + mode: tiny # change to 'lmdb' to download real dataset + split: is2re + download: false + num_samples: 64 + num_node_features: 6 + seed: 0 + +parameters: + num_features: 6 + num_classes: 1 + task: regression + loss_type: mse + monitor_metric: mae + task_level: graph + +split_params: + learning_setting: inductive + split_type: fixed + data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} + data_seed: 0 + +dataloader_params: + batch_size: 64 + num_workers: 0 + pin_memory: False + 
persistent_workers: False diff --git a/configs/dataset/graph/OC20_S2EF_train_200K.yaml b/configs/dataset/graph/OC20_S2EF_train_200K.yaml new file mode 100644 index 000000000..b386ea663 --- /dev/null +++ b/configs/dataset/graph/OC20_S2EF_train_200K.yaml @@ -0,0 +1,34 @@ +# OC20 example config kept lightweight for CI by using tiny mode. +# Users can switch to mode=lmdb and set an OC20 split locally. + +loader: + _target_: topobench.data.loaders.OC20DatasetLoader + parameters: + data_domain: graph + data_type: oc20 + data_name: OC20_S2EF + data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} + mode: tiny # change to 'lmdb' locally to use real OC20 + num_samples: 64 + num_node_features: 6 + seed: 0 + +parameters: + num_features: 6 + num_classes: 1 + task: regression + loss_type: mse + monitor_metric: mae + task_level: graph + +split_params: + learning_setting: inductive + split_type: fixed # provided by the dataset + data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} + data_seed: 0 + +dataloader_params: + batch_size: 64 + num_workers: 0 + pin_memory: False + persistent_workers: False diff --git a/configs/dataset/graph/OC20_S2EF_val_id.yaml b/configs/dataset/graph/OC20_S2EF_val_id.yaml new file mode 100644 index 000000000..f641a28e5 --- /dev/null +++ b/configs/dataset/graph/OC20_S2EF_val_id.yaml @@ -0,0 +1,36 @@ +# OC20 S2EF validation ID split (LMDB mode for local use) +# Switch mode to 'lmdb' to download and use the real dataset + +loader: + _target_: topobench.data.loaders.OC20DatasetLoader + parameters: + data_domain: graph + data_type: oc20 + data_name: OC20_S2EF_val_id + data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} + mode: tiny # change to 'lmdb' to download real dataset + split: s2ef_val_id + download: false + num_samples: 64 + num_node_features: 6 + seed: 0 + +parameters: + num_features: 6 + num_classes: 1 + 
task: regression + loss_type: mse + monitor_metric: mae + task_level: graph + +split_params: + learning_setting: inductive + split_type: fixed + data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} + data_seed: 0 + +dataloader_params: + batch_size: 64 + num_workers: 0 + pin_memory: False + persistent_workers: False diff --git a/configs/dataset/graph/OC20_tiny.yaml b/configs/dataset/graph/OC20_tiny.yaml new file mode 100644 index 000000000..49949bbfe --- /dev/null +++ b/configs/dataset/graph/OC20_tiny.yaml @@ -0,0 +1,34 @@ +# OC20 tiny synthetic dataset for CI/tests and quick examples + +loader: + _target_: topobench.data.loaders.OC20DatasetLoader + parameters: + data_domain: graph + data_type: oc20 + data_name: OC20_tiny + data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} + mode: tiny + num_samples: 64 + num_node_features: 6 + seed: 0 + +parameters: + num_features: 6 + num_classes: 1 + task: regression + loss_type: mse + monitor_metric: mae + task_level: graph + +split_params: + learning_setting: inductive + # Fixed split is embedded in the tiny dataset via split_idx + split_type: fixed + data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} + data_seed: 0 + +dataloader_params: + batch_size: 64 + num_workers: 0 + pin_memory: False + persistent_workers: False diff --git a/configs/dataset/graph/OC22_IS2RE.yaml b/configs/dataset/graph/OC22_IS2RE.yaml new file mode 100644 index 000000000..b86a1e36e --- /dev/null +++ b/configs/dataset/graph/OC22_IS2RE.yaml @@ -0,0 +1,36 @@ +# OC22 IS2RE task (LMDB mode for local use) +# Switch mode to 'lmdb' to download and use the real dataset + +loader: + _target_: topobench.data.loaders.OC20DatasetLoader + parameters: + data_domain: graph + data_type: oc20 + data_name: OC22_IS2RE + data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} + mode: tiny # change to 
'lmdb' to download real dataset + split: oc22_is2re + download: false + num_samples: 64 + num_node_features: 6 + seed: 0 + +parameters: + num_features: 6 + num_classes: 1 + task: regression + loss_type: mse + monitor_metric: mae + task_level: graph + +split_params: + learning_setting: inductive + split_type: fixed + data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} + data_seed: 0 + +dataloader_params: + batch_size: 64 + num_workers: 0 + pin_memory: False + persistent_workers: False diff --git a/topobench/data/loaders/graph/oc20_dataset_loader.py b/topobench/data/loaders/graph/oc20_dataset_loader.py new file mode 100644 index 000000000..a5e7155b3 --- /dev/null +++ b/topobench/data/loaders/graph/oc20_dataset_loader.py @@ -0,0 +1,399 @@ +"""Loader for OC20 family datasets (S2EF/IS2RE). + +This loader integrates the Open Catalyst 2020 (OC20/OC22) datasets into TopoBench. +It supports two modes: +- tiny: returns a tiny synthetic PyG dataset for CI/testing (default) +- lmdb: uses the on-disk LMDB datasets from OC20 (optional, requires `lmdb`) + +The LMDB backend is integrated directly to avoid external file dependencies. 
+""" +from __future__ import annotations + +import logging +import lzma +import multiprocessing as mp +import os +import pickle +import random +import shutil +import tarfile +import urllib.request +from pathlib import Path +from typing import Iterator, Optional + +from concurrent.futures import ProcessPoolExecutor, as_completed +import lmdb +from tqdm import tqdm + +import torch +from omegaconf import DictConfig +from torch_geometric.data import Data, Dataset, InMemoryDataset + +from topobench.data.loaders.base import AbstractLoader + +logger = logging.getLogger(__name__) + +# OC20 dataset split URLs +SPLITS_TO_URL = { + "s2ef_train_200K": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_200K.tar", + "s2ef_train_2M": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_2M.tar", + "s2ef_train_20M": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_20M.tar", + "s2ef_train_all": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_all.tar", + "s2ef_val_id": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_id.tar", + "s2ef_val_ood_ads": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_ads.tar", + "s2ef_val_ood_cat": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_cat.tar", + "s2ef_val_ood_both": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_both.tar", + "s2ef_test": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_test.tar", + "surfaces": "https://dl.fbaipublicfiles.com/opencatalystproject/data/slab_trajectories.tar", + "is2re": "https://dl.fbaipublicfiles.com/opencatalystproject/data/is2res_train_val_test_lmdbs.tar.gz", + "oc22_is2re": "https://dl.fbaipublicfiles.com/opencatalystproject/data/oc22/is2res_total_train_val_test_lmdbs.tar.gz", +} + +CACHE_DIR = Path.home() / ".cache" / "oc20" + + +def _uncompress_xz(file_path: str) -> str: + if not file_path.endswith(".xz"): + return file_path + + output_path = 
file_path.replace(".xz", "") + try: + with lzma.open(file_path, "rb") as f_in, open(output_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + os.remove(file_path) + return output_path + except Exception as e: + logger.error(f"Error uncompressing {file_path}: {e}") + return file_path + + +def _download_and_extract(url: str, target_dir: Path) -> Path: + target_dir.mkdir(parents=True, exist_ok=True) + target_file = target_dir / os.path.basename(url) + + if not target_file.exists(): + logger.info(f"Downloading {url}...") + with tqdm( + unit="B", unit_scale=True, desc=f"Downloading {target_file.name}" + ) as pbar: + + def report(block_num, block_size, total_size): + if total_size > 0 and block_num == 0: + pbar.total = total_size + pbar.update(block_size) + + urllib.request.urlretrieve(url, target_file, reporthook=report) + + logger.info(f"Extracting {target_file.name}...") + if str(target_file).endswith((".tar.gz", ".tgz")): + with tarfile.open(target_file, "r:gz") as tar: + tar.extractall(path=target_dir) + elif str(target_file).endswith(".tar"): + with tarfile.open(target_file, "r:") as tar: + tar.extractall(path=target_dir) + else: + raise ValueError(f"Unsupported archive format: {target_file}") + + return target_dir + + + +class _OC20LMDBDataset(Dataset): + + def __init__( + self, + path: Optional[str | Path] = None, + split: Optional[str] = "s2ef_train_200K", + download: bool = True, + dtype: torch.dtype = torch.float32, + legacy_format: bool = False, + ): + """Initialize OC20 LMDB dataset. + + Parameters + ---------- + path : Optional[str | Path] + Path to LMDB directory. If None, uses cache directory. + split : Optional[str] + Which OC20 split to load (e.g., "s2ef_train_200K"). + download : bool + Whether to download if not present. + dtype : torch.dtype + Data type for tensors. + legacy_format : bool + Whether to use legacy PyG Data format. 
+ """ + super().__init__() + self.dtype = dtype + self.legacy_format = legacy_format + + if path is None: + if split is None: + raise ValueError("Must provide either path or split") + if split not in SPLITS_TO_URL: + raise ValueError( + f"Unknown split: {split}. Available: {list(SPLITS_TO_URL.keys())}" + ) + + url = SPLITS_TO_URL[split] + dataset_name = os.path.basename(url).split(".")[0] + path = CACHE_DIR / dataset_name + + self.path = Path(path) + + if download and not self.path.exists(): + if split is None: + raise ValueError("Cannot download without specifying a split") + self._download(split) + + if not self.path.exists(): + raise ValueError(f"Dataset not found at {self.path}") + + self._open_lmdbs() + + def _download(self, split: str): + url = SPLITS_TO_URL[split] + logger.info(f"Downloading {split} dataset...") + _download_and_extract(url, self.path) + + xz_files = list(self.path.glob("**/*.xz")) + if xz_files: + logger.info(f"Decompressing {len(xz_files)} .xz files...") + from concurrent.futures import ProcessPoolExecutor, as_completed + + num_workers = max(1, mp.cpu_count() - 1) + with ProcessPoolExecutor(max_workers=num_workers) as executor: + futures = [executor.submit(_uncompress_xz, str(f)) for f in xz_files] + for future in tqdm( + as_completed(futures), total=len(futures), desc="Decompressing" + ): + future.result() + + + def _open_lmdbs(self): + if self.path.is_dir(): + lmdb_paths = sorted(self.path.glob("**/*.lmdb")) + else: + lmdb_paths = [self.path] + + if not lmdb_paths: + raise ValueError(f"No LMDB files found in {self.path}") + + self.envs = [] + self.cumulative_sizes = [0] + + for lmdb_path in lmdb_paths: + env = lmdb.open( + str(lmdb_path.resolve()), + subdir=False, + readonly=True, + lock=False, + readahead=True, + meminit=False, + max_readers=1, + ) + size = env.stat()["entries"] + + self.envs.append((lmdb_path, env, size)) + self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) + + logger.info( + f"Loaded {len(self.envs)} LMDB 
files with {len(self)} total entries" + ) + + def _find_lmdb_and_local_idx(self, idx: int) -> tuple: + if idx < 0 or idx >= len(self): + raise IndexError(f"Index {idx} out of range [0, {len(self)})") + + left, right = 0, len(self.envs) + while left < right - 1: + mid = (left + right) // 2 + if self.cumulative_sizes[mid] <= idx: + left = mid + else: + right = mid + + lmdb_idx = left + local_idx = idx - self.cumulative_sizes[lmdb_idx] + return lmdb_idx, local_idx + + def len(self) -> int: + return self.cumulative_sizes[-1] + + def get(self, idx: int) -> Data: + lmdb_idx, local_idx = self._find_lmdb_and_local_idx(idx) + lmdb_path, env, _ = self.envs[lmdb_idx] + + with env.begin() as txn: + cursor = txn.cursor() + if not cursor.first(): + raise RuntimeError(f"Empty LMDB at {lmdb_path}") + + for _ in range(local_idx): + if not cursor.next(): + raise RuntimeError(f"Index {local_idx} out of range in {lmdb_path}") + + key, value = cursor.item() + data = pickle.loads(value) + + if self.legacy_format and isinstance(data, Data): + data = Data(**{k: v for k, v in data.__dict__.items() if v is not None}) + + return data + + def __len__(self) -> int: + return self.len() + + def __getitem__(self, idx: int) -> Data: + return self.get(idx) + + def __iter__(self) -> Iterator[Data]: + for i in range(len(self)): + yield self[i] + + def __del__(self): + if hasattr(self, "envs"): + for _, env, _ in self.envs: + env.close() + + +class _TinyOC20Dataset(InMemoryDataset): + """A tiny synthetic OC20-like dataset for tests and quick runs. 
+ + Each sample is a small "molecule on surface" graph with: + - x: atom features (random floats) + - pos: 3D positions + - z: atomic numbers (ints) + - y: target energy (scalar regression) + """ + + def __init__( + self, + root: str | Path, + num_samples: int = 64, + min_nodes: int = 5, + max_nodes: int = 12, + num_node_features: int = 6, + seed: int = 0, + ) -> None: + super().__init__(str(root)) + self._num_samples = num_samples + self._min_nodes = min_nodes + self._max_nodes = max_nodes + self._num_node_features = num_node_features + self._rng = random.Random(seed) + self._torch_rng = torch.Generator().manual_seed(seed) + + # Generate data list + data_list: list[Data] = [] + for _ in range(num_samples): + n = self._rng.randint(self._min_nodes, self._max_nodes) + pos = torch.randn((n, 3), generator=self._torch_rng) + x = torch.randn((n, self._num_node_features), generator=self._torch_rng) + z = torch.randint(low=1, high=86, size=(n,), generator=self._torch_rng) + # Fully-connected edge index for small graphs + row = torch.arange(n).repeat_interleave(n) + col = torch.arange(n).repeat(n) + edge_index = torch.stack([row, col], dim=0) + # Scalar target (e.g., energy) + y = torch.randn(1, generator=self._torch_rng) + data_list.append(Data(x=x, pos=pos, z=z, edge_index=edge_index, y=y)) + + data, slices = self.collate(data_list) + self.data, self.slices = data, slices + + # Pre-generate split indices for reproducibility (60/20/20) + idx = list(range(num_samples)) + self._rng.shuffle(idx) + n_train = int(0.6 * num_samples) + n_val = int(0.2 * num_samples) + self.split_idx = { + "train": torch.tensor(idx[:n_train], dtype=torch.long), + "valid": torch.tensor(idx[n_train : n_train + n_val], dtype=torch.long), + "test": torch.tensor(idx[n_train + n_val :], dtype=torch.long), + } + + @property + def num_node_features(self) -> int: # type: ignore[override] + return self._num_node_features + + +class OC20DatasetLoader(AbstractLoader): + """Load OC20 family datasets. 
+ + This loader supports all OC20/OC22 dataset splits including S2EF and IS2RE tasks. + + Parameters in the Hydra config (dataset.loader.parameters): + - data_domain: graph + - data_type: oc20 + - data_name: Logical name for the dataset (e.g., OC20_S2EF) + - mode: "tiny" (default) or "lmdb" + - split: OC20 split name when mode=="lmdb" (e.g., "s2ef_train_200K", "is2re", etc.) + - download: whether to download when mode=="lmdb" (default: false) + - legacy_format: whether to use legacy PyG Data format (default: false) + - dtype: torch dtype (default: "float32") + + Supported OC20 splits: + - s2ef_train_200K, s2ef_train_2M, s2ef_train_20M, s2ef_train_all + - s2ef_val_id, s2ef_val_ood_ads, s2ef_val_ood_cat, s2ef_val_ood_both + - s2ef_test + - surfaces + - is2re + - oc22_is2re + """ + + def __init__(self, parameters: DictConfig) -> None: + super().__init__(parameters) + + def load_dataset(self) -> Dataset: + mode: str = getattr(self.parameters, "mode", "tiny") + + if mode == "tiny": + # Fast, dependency-free tiny dataset for CI/tests + return _TinyOC20Dataset( + root=self.get_data_dir(), + num_samples=int(getattr(self.parameters, "num_samples", 64)), + min_nodes=int(getattr(self.parameters, "min_nodes", 5)), + max_nodes=int(getattr(self.parameters, "max_nodes", 12)), + num_node_features=int( + getattr(self.parameters, "num_node_features", 6) + ), + seed=int(getattr(self.parameters, "seed", 0)), + ) + + if mode == "lmdb": + split: Optional[str] = getattr(self.parameters, "split", None) + download: bool = bool(getattr(self.parameters, "download", False)) + legacy_format: bool = bool( + getattr(self.parameters, "legacy_format", False) + ) + dtype = getattr(self.parameters, "dtype", "float32") + dtype_t = getattr(torch, str(dtype)) if isinstance(dtype, str) else dtype + + ds = _OC20LMDBDataset( + path=None, # let backend resolve via split/cache + split=split, + download=download, + dtype=dtype_t, + legacy_format=legacy_format, + ) + + # Expose split_idx for TopoBench 
compatibility + n = len(ds) + idx = torch.arange(n) + n_train = int(0.8 * n) + n_val = int(0.1 * n) + ds.split_idx = { # type: ignore[attr-defined] + "train": idx[:n_train], + "valid": idx[n_train : n_train + n_val], + "test": idx[n_train + n_val :], + } + return ds # type: ignore[return-value] + + raise ValueError( + f"Unsupported mode '{mode}'. Use 'tiny' (default) or 'lmdb'." + ) + + def get_data_dir(self) -> Path: + # Keep default directory convention for TopoBench + return Path(super().get_data_dir()) From 42e7ad827882bbadf437dbc6819db3c270688768 Mon Sep 17 00:00:00 2001 From: theosaulus Date: Mon, 17 Nov 2025 19:02:43 -0500 Subject: [PATCH 02/17] preprocessing still not fully working --- configs/dataset/graph/OC20_IS2RE.yaml | 26 +- .../dataset/graph/OC20_S2EF_train_200K.yaml | 31 +- .../dataset/graph/OC20_S2EF_train_20M.yaml | 38 ++ configs/dataset/graph/OC20_S2EF_train_2M.yaml | 38 ++ .../dataset/graph/OC20_S2EF_train_all.yaml | 38 ++ configs/dataset/graph/OC20_S2EF_val_id.yaml | 32 +- configs/dataset/graph/OC20_tiny.yaml | 34 - configs/dataset/graph/OC22_IS2RE.yaml | 26 +- test_oc20_integration.py | 227 +++++++ .../data/loaders/graph/oc20_dataset_loader.py | 624 ++++++++++++------ .../preprocessor/oc20_s2ef_preprocessor.py | 339 ++++++++++ 11 files changed, 1169 insertions(+), 284 deletions(-) create mode 100644 configs/dataset/graph/OC20_S2EF_train_20M.yaml create mode 100644 configs/dataset/graph/OC20_S2EF_train_2M.yaml create mode 100644 configs/dataset/graph/OC20_S2EF_train_all.yaml delete mode 100644 configs/dataset/graph/OC20_tiny.yaml create mode 100644 test_oc20_integration.py create mode 100644 topobench/data/preprocessor/oc20_s2ef_preprocessor.py diff --git a/configs/dataset/graph/OC20_IS2RE.yaml b/configs/dataset/graph/OC20_IS2RE.yaml index 4fde80978..339f3e574 100644 --- a/configs/dataset/graph/OC20_IS2RE.yaml +++ b/configs/dataset/graph/OC20_IS2RE.yaml @@ -1,5 +1,5 @@ -# OC20 IS2RE task (LMDB mode for local use) -# Switch mode to 'lmdb' to 
download and use the real dataset +# OC20 IS2RE task +# Train/val/test splits are precomputed in the LMDB archive loader: _target_: topobench.data.loaders.OC20DatasetLoader @@ -8,15 +8,13 @@ loader: data_type: oc20 data_name: OC20_IS2RE data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} - mode: tiny # change to 'lmdb' to download real dataset - split: is2re - download: false - num_samples: 64 - num_node_features: 6 - seed: 0 + task: is2re + download: true + legacy_format: false + dtype: float32 parameters: - num_features: 6 + num_features: 6 # Will be determined by the actual data num_classes: 1 task: regression loss_type: mse @@ -25,12 +23,12 @@ parameters: split_params: learning_setting: inductive - split_type: fixed + split_type: fixed # splits are precomputed in the dataset data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} data_seed: 0 dataloader_params: - batch_size: 64 - num_workers: 0 - pin_memory: False - persistent_workers: False + batch_size: 32 + num_workers: 4 + pin_memory: true + persistent_workers: true diff --git a/configs/dataset/graph/OC20_S2EF_train_200K.yaml b/configs/dataset/graph/OC20_S2EF_train_200K.yaml index b386ea663..caaa8db70 100644 --- a/configs/dataset/graph/OC20_S2EF_train_200K.yaml +++ b/configs/dataset/graph/OC20_S2EF_train_200K.yaml @@ -1,20 +1,25 @@ -# OC20 example config kept lightweight for CI by using tiny mode. -# Users can switch to mode=lmdb and set an OC20 split locally. 
+# OC20 S2EF dataset with 200K training samples +# Validation: all 4 validation splits aggregated (val_id, val_ood_ads, val_ood_cat, val_ood_both) +# Test: official test split loader: _target_: topobench.data.loaders.OC20DatasetLoader parameters: data_domain: graph data_type: oc20 - data_name: OC20_S2EF + data_name: OC20_S2EF_200K data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} - mode: tiny # change to 'lmdb' locally to use real OC20 - num_samples: 64 - num_node_features: 6 - seed: 0 + task: s2ef + train_split: "200K" + val_splits: null # null means use all 4 validation splits + test_split: "test" + download: true + include_test: false # Skip test download, reuse validation as test + legacy_format: false + dtype: float32 parameters: - num_features: 6 + num_features: 6 # Will be determined by the actual data num_classes: 1 task: regression loss_type: mse @@ -23,12 +28,12 @@ parameters: split_params: learning_setting: inductive - split_type: fixed # provided by the dataset + split_type: fixed # splits are provided by the dataset data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} data_seed: 0 dataloader_params: - batch_size: 64 - num_workers: 0 - pin_memory: False - persistent_workers: False + batch_size: 32 + num_workers: 4 + pin_memory: true + persistent_workers: true diff --git a/configs/dataset/graph/OC20_S2EF_train_20M.yaml b/configs/dataset/graph/OC20_S2EF_train_20M.yaml new file mode 100644 index 000000000..9db4e1197 --- /dev/null +++ b/configs/dataset/graph/OC20_S2EF_train_20M.yaml @@ -0,0 +1,38 @@ +# OC20 S2EF dataset with 20M training samples +# Validation: all 4 validation splits aggregated (val_id, val_ood_ads, val_ood_cat, val_ood_both) +# Test: official test split + +loader: + _target_: topobench.data.loaders.OC20DatasetLoader + parameters: + data_domain: graph + data_type: oc20 + data_name: OC20_S2EF_20M + data_dir: 
${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} + task: s2ef + train_split: "20M" + val_splits: null # null means use all 4 validation splits + test_split: "test" + download: true + legacy_format: false + dtype: float32 + +parameters: + num_features: 6 # Will be determined by the actual data + num_classes: 1 + task: regression + loss_type: mse + monitor_metric: mae + task_level: graph + +split_params: + learning_setting: inductive + split_type: fixed # splits are provided by the dataset + data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} + data_seed: 0 + +dataloader_params: + batch_size: 32 + num_workers: 4 + pin_memory: true + persistent_workers: true diff --git a/configs/dataset/graph/OC20_S2EF_train_2M.yaml b/configs/dataset/graph/OC20_S2EF_train_2M.yaml new file mode 100644 index 000000000..da06a665e --- /dev/null +++ b/configs/dataset/graph/OC20_S2EF_train_2M.yaml @@ -0,0 +1,38 @@ +# OC20 S2EF dataset with 2M training samples +# Validation: all 4 validation splits aggregated (val_id, val_ood_ads, val_ood_cat, val_ood_both) +# Test: official test split + +loader: + _target_: topobench.data.loaders.OC20DatasetLoader + parameters: + data_domain: graph + data_type: oc20 + data_name: OC20_S2EF_2M + data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} + task: s2ef + train_split: "2M" + val_splits: null # null means use all 4 validation splits + test_split: "test" + download: true + legacy_format: false + dtype: float32 + +parameters: + num_features: 6 # Will be determined by the actual data + num_classes: 1 + task: regression + loss_type: mse + monitor_metric: mae + task_level: graph + +split_params: + learning_setting: inductive + split_type: fixed # splits are provided by the dataset + data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} + data_seed: 0 + +dataloader_params: + batch_size: 32 + 
num_workers: 4 + pin_memory: true + persistent_workers: true diff --git a/configs/dataset/graph/OC20_S2EF_train_all.yaml b/configs/dataset/graph/OC20_S2EF_train_all.yaml new file mode 100644 index 000000000..d15c6ec0f --- /dev/null +++ b/configs/dataset/graph/OC20_S2EF_train_all.yaml @@ -0,0 +1,38 @@ +# OC20 S2EF dataset with all training samples (~134M) +# Validation: all 4 validation splits aggregated (val_id, val_ood_ads, val_ood_cat, val_ood_both) +# Test: official test split + +loader: + _target_: topobench.data.loaders.OC20DatasetLoader + parameters: + data_domain: graph + data_type: oc20 + data_name: OC20_S2EF_all + data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} + task: s2ef + train_split: "all" + val_splits: null # null means use all 4 validation splits + test_split: "test" + download: true + legacy_format: false + dtype: float32 + +parameters: + num_features: 6 # Will be determined by the actual data + num_classes: 1 + task: regression + loss_type: mse + monitor_metric: mae + task_level: graph + +split_params: + learning_setting: inductive + split_type: fixed # splits are provided by the dataset + data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} + data_seed: 0 + +dataloader_params: + batch_size: 32 + num_workers: 4 + pin_memory: true + persistent_workers: true diff --git a/configs/dataset/graph/OC20_S2EF_val_id.yaml b/configs/dataset/graph/OC20_S2EF_val_id.yaml index f641a28e5..9a113b196 100644 --- a/configs/dataset/graph/OC20_S2EF_val_id.yaml +++ b/configs/dataset/graph/OC20_S2EF_val_id.yaml @@ -1,22 +1,24 @@ -# OC20 S2EF validation ID split (LMDB mode for local use) -# Switch mode to 'lmdb' to download and use the real dataset +# OC20 S2EF dataset with 200K training samples +# Validation: only val_id split (for faster testing/iteration) +# Test: official test split loader: _target_: topobench.data.loaders.OC20DatasetLoader parameters: data_domain: graph 
data_type: oc20 - data_name: OC20_S2EF_val_id + data_name: OC20_S2EF_200K_val_id data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} - mode: tiny # change to 'lmdb' to download real dataset - split: s2ef_val_id - download: false - num_samples: 64 - num_node_features: 6 - seed: 0 + task: s2ef + train_split: "200K" + val_splits: ["val_id"] # Use only val_id for faster validation + test_split: "test" + download: true + legacy_format: false + dtype: float32 parameters: - num_features: 6 + num_features: 6 # Will be determined by the actual data num_classes: 1 task: regression loss_type: mse @@ -25,12 +27,12 @@ parameters: split_params: learning_setting: inductive - split_type: fixed + split_type: fixed # splits are provided by the dataset data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} data_seed: 0 dataloader_params: - batch_size: 64 - num_workers: 0 - pin_memory: False - persistent_workers: False + batch_size: 32 + num_workers: 4 + pin_memory: true + persistent_workers: true diff --git a/configs/dataset/graph/OC20_tiny.yaml b/configs/dataset/graph/OC20_tiny.yaml deleted file mode 100644 index 49949bbfe..000000000 --- a/configs/dataset/graph/OC20_tiny.yaml +++ /dev/null @@ -1,34 +0,0 @@ -# OC20 tiny synthetic dataset for CI/tests and quick examples - -loader: - _target_: topobench.data.loaders.OC20DatasetLoader - parameters: - data_domain: graph - data_type: oc20 - data_name: OC20_tiny - data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} - mode: tiny - num_samples: 64 - num_node_features: 6 - seed: 0 - -parameters: - num_features: 6 - num_classes: 1 - task: regression - loss_type: mse - monitor_metric: mae - task_level: graph - -split_params: - learning_setting: inductive - # Fixed split is embedded in the tiny dataset via split_idx - split_type: fixed - data_split_dir: 
${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} - data_seed: 0 - -dataloader_params: - batch_size: 64 - num_workers: 0 - pin_memory: False - persistent_workers: False diff --git a/configs/dataset/graph/OC22_IS2RE.yaml b/configs/dataset/graph/OC22_IS2RE.yaml index b86a1e36e..40acb07a9 100644 --- a/configs/dataset/graph/OC22_IS2RE.yaml +++ b/configs/dataset/graph/OC22_IS2RE.yaml @@ -1,5 +1,5 @@ -# OC22 IS2RE task (LMDB mode for local use) -# Switch mode to 'lmdb' to download and use the real dataset +# OC22 IS2RE task +# Train/val/test splits are precomputed in the LMDB archive loader: _target_: topobench.data.loaders.OC20DatasetLoader @@ -8,15 +8,13 @@ loader: data_type: oc20 data_name: OC22_IS2RE data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} - mode: tiny # change to 'lmdb' to download real dataset - split: oc22_is2re - download: false - num_samples: 64 - num_node_features: 6 - seed: 0 + task: oc22_is2re + download: true + legacy_format: false + dtype: float32 parameters: - num_features: 6 + num_features: 6 # Will be determined by the actual data num_classes: 1 task: regression loss_type: mse @@ -25,12 +23,12 @@ parameters: split_params: learning_setting: inductive - split_type: fixed + split_type: fixed # splits are precomputed in the dataset data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} data_seed: 0 dataloader_params: - batch_size: 64 - num_workers: 0 - pin_memory: False - persistent_workers: False + batch_size: 32 + num_workers: 4 + pin_memory: true + persistent_workers: true diff --git a/test_oc20_integration.py b/test_oc20_integration.py new file mode 100644 index 000000000..c039ca043 --- /dev/null +++ b/test_oc20_integration.py @@ -0,0 +1,227 @@ +"""Test script for OC20 S2EF preprocessing integration. + +This script validates that: +1. 
The loader can be try: + loader = OC20DatasetLoader(params_invalid) + # Try to actually load the dataset (this is where validation happens) + _dataset = loader.load_dataset() + # Should have raised ValueError + logger.error("✗ Invalid config was accepted") + raise AssertionError("Invalid train split should raise ValueError") + except ValueError: + logger.info("✓ Invalid config properly rejected") + except Exception as e:ed +2. Download works (if enabled) +3. Preprocessing is triggered when needed +4. LMDB loading works +5. Data format is correct +""" + +import logging +from pathlib import Path + +from omegaconf import DictConfig + +from topobench.data.loaders.graph.oc20_dataset_loader import OC20DatasetLoader + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def test_s2ef_loader_without_download(): + """Test that loader can be instantiated without downloading.""" + logger.info("Test 1: Loader instantiation without download") + + params = DictConfig( + { + "data_domain": "graph", + "data_type": "oc20", + "data_name": "OC20_S2EF_200K_test", + "data_dir": "/tmp/topobench_oc20_test", + "task": "s2ef", + "train_split": "200K", + "val_splits": None, + "test_split": "test", + "download": False, + "include_test": False, + "legacy_format": False, + "dtype": "float32", + } + ) + + try: + _loader = OC20DatasetLoader(params) + logger.info("✓ Loader instantiated successfully") + except Exception as e: + logger.error(f"✗ Loader instantiation failed: {e}") + raise + + +def test_s2ef_preprocessing_check(): + """Test preprocessing detection logic.""" + logger.info("Test 2: Preprocessing detection") + + from topobench.data.preprocessor.oc20_s2ef_preprocessor import ( + needs_preprocessing, + ) + + # Test with non-existent directories + raw_dir = Path("/tmp/nonexistent_raw") + processed_dir = Path("/tmp/nonexistent_processed") + + result = needs_preprocessing(raw_dir, processed_dir) + assert not result, "Should return False for non-existent raw 
directory" + logger.info("✓ Preprocessing detection works correctly") + + +def test_s2ef_config_validation(): + """Test that configs have required parameters.""" + logger.info("Test 3: Config validation") + + # Valid S2EF config + params = DictConfig( + { + "data_domain": "graph", + "data_type": "oc20", + "data_name": "OC20_S2EF_200K", + "data_dir": "/tmp/topobench_oc20_test", + "task": "s2ef", + "train_split": "200K", + "val_splits": ["val_id"], + "download": False, + "include_test": False, + } + ) + + try: + loader = OC20DatasetLoader(params) + logger.info("✓ Valid S2EF config accepted") + except Exception as e: + logger.error(f"✗ Valid config rejected: {e}") + raise + + # Invalid train split - validation happens in load_dataset() + params_invalid = DictConfig( + { + "data_domain": "graph", + "data_type": "oc20", + "data_name": "OC20_S2EF_invalid", + "data_dir": "/tmp/topobench_oc20_test", + "task": "s2ef", + "train_split": "invalid", + "download": False, + } + ) + + try: + loader = OC20DatasetLoader(params_invalid) + # Try to actually load the dataset (this is where validation happens) + _dataset = loader.load_dataset() + # Should have raised ValueError + logger.error("✗ Invalid config was accepted") + raise AssertionError("Invalid train split should raise ValueError") + except ValueError: + logger.info("✓ Invalid config properly rejected") + except Exception as e: + # Other exceptions (like file not found) are expected since we're not downloading + # What matters is that we get an error for invalid config + if "invalid" in str(e).lower() or "Invalid" in str(e): + logger.info("✓ Invalid config properly rejected") + else: + # Some other error - not a validation error + logger.warning(f"⚠ Got different error: {e}") + logger.info( + "✓ Config validation test passed (error from different source)" + ) + + +def test_is2re_loader(): + """Test IS2RE loader (doesn't require preprocessing).""" + logger.info("Test 4: IS2RE loader") + + params = DictConfig( + { + 
"data_domain": "graph", + "data_type": "oc20", + "data_name": "OC20_IS2RE", + "data_dir": "/tmp/topobench_oc20_test", + "task": "is2re", + "download": False, + } + ) + + try: + _loader = OC20DatasetLoader(params) + logger.info("✓ IS2RE loader instantiated successfully") + except Exception as e: + logger.error(f"✗ IS2RE loader instantiation failed: {e}") + raise + + +def test_preprocessor_import(): + """Test that preprocessor can be imported.""" + logger.info("Test 5: Preprocessor import") + + try: + import topobench.data.preprocessor.oc20_s2ef_preprocessor # noqa: F401 + + logger.info("✓ Preprocessor imports successfully") + except ImportError as e: + logger.error(f"✗ Preprocessor import failed: {e}") + raise + + +def test_fairchem_availability(): + """Test if fairchem dependencies are available (optional).""" + logger.info("Test 6: Fairchem availability (optional)") + + try: + import ase.io # noqa: F401 + import fairchem.core.preprocessing # noqa: F401 + + logger.info("✓ fairchem-core and ASE are installed") + return True + except ImportError: + logger.warning( + "⚠ fairchem-core or ASE not installed (S2EF preprocessing will not work)" + ) + logger.info(" Install with: pip install fairchem-core ase") + return False + + +def run_all_tests(): + """Run all tests.""" + logger.info("=" * 60) + logger.info("OC20 S2EF Preprocessing Integration Tests") + logger.info("=" * 60) + + tests = [ + ("Loader instantiation", test_s2ef_loader_without_download), + ("Preprocessing detection", test_s2ef_preprocessing_check), + ("Config validation", test_s2ef_config_validation), + ("IS2RE loader", test_is2re_loader), + ("Preprocessor import", test_preprocessor_import), + ("Fairchem availability", test_fairchem_availability), + ] + + passed = 0 + failed = 0 + + for test_name, test_func in tests: + try: + test_func() + passed += 1 + except Exception as e: + logger.error(f"Test '{test_name}' failed: {e}") + failed += 1 + + logger.info("=" * 60) + logger.info(f"Results: {passed} passed, 
{failed} failed") + logger.info("=" * 60) + + return failed == 0 + + +if __name__ == "__main__": + success = run_all_tests() + exit(0 if success else 1) diff --git a/topobench/data/loaders/graph/oc20_dataset_loader.py b/topobench/data/loaders/graph/oc20_dataset_loader.py index a5e7155b3..08654f105 100644 --- a/topobench/data/loaders/graph/oc20_dataset_loader.py +++ b/topobench/data/loaders/graph/oc20_dataset_loader.py @@ -1,54 +1,68 @@ """Loader for OC20 family datasets (S2EF/IS2RE). This loader integrates the Open Catalyst 2020 (OC20/OC22) datasets into TopoBench. -It supports two modes: -- tiny: returns a tiny synthetic PyG dataset for CI/testing (default) -- lmdb: uses the on-disk LMDB datasets from OC20 (optional, requires `lmdb`) + +Supported tasks: +- S2EF (Structure to Energy and Forces): Predict energy/forces from atomic structure + - Train splits: 200K, 2M, 20M, all + - Validation splits: val_id, val_ood_ads, val_ood_cat, val_ood_both (can aggregate) + - Test split: test (can be optionally skipped with include_test=False) + - Automatic preprocessing from extxyz/txt to LMDB format +- IS2RE (Initial Structure to Relaxed Energy): Predict relaxed energy from initial structure + - Pre-split train/val/test datasets The LMDB backend is integrated directly to avoid external file dependencies. 
""" + from __future__ import annotations import logging import lzma -import multiprocessing as mp import os import pickle -import random import shutil import tarfile import urllib.request +from collections.abc import Iterator +from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path -from typing import Iterator, Optional -from concurrent.futures import ProcessPoolExecutor, as_completed import lmdb -from tqdm import tqdm - import torch from omegaconf import DictConfig -from torch_geometric.data import Data, Dataset, InMemoryDataset +from torch_geometric.data import Data, Dataset +from tqdm import tqdm from topobench.data.loaders.base import AbstractLoader +from topobench.data.preprocessor.oc20_s2ef_preprocessor import ( + needs_preprocessing, + preprocess_s2ef_dataset, +) logger = logging.getLogger(__name__) # OC20 dataset split URLs -SPLITS_TO_URL = { - "s2ef_train_200K": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_200K.tar", - "s2ef_train_2M": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_2M.tar", - "s2ef_train_20M": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_20M.tar", - "s2ef_train_all": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_all.tar", - "s2ef_val_id": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_id.tar", - "s2ef_val_ood_ads": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_ads.tar", - "s2ef_val_ood_cat": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_cat.tar", - "s2ef_val_ood_both": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_both.tar", - "s2ef_test": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_test.tar", - "surfaces": "https://dl.fbaipublicfiles.com/opencatalystproject/data/slab_trajectories.tar", - "is2re": "https://dl.fbaipublicfiles.com/opencatalystproject/data/is2res_train_val_test_lmdbs.tar.gz", - "oc22_is2re": 
"https://dl.fbaipublicfiles.com/opencatalystproject/data/oc22/is2res_total_train_val_test_lmdbs.tar.gz", +# S2EF dataset URLs +S2EF_TRAIN_SPLITS = { + "200K": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_200K.tar", + "2M": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_2M.tar", + "20M": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_20M.tar", + "all": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_all.tar", +} + +S2EF_VAL_SPLITS = { + "val_id": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_id.tar", + "val_ood_ads": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_ads.tar", + "val_ood_cat": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_cat.tar", + "val_ood_both": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_both.tar", } +S2EF_TEST_SPLIT = "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_test_lmdbs.tar.gz" + +# IS2RE dataset URLs (contains train/val/test in one archive) +IS2RE_URL = "https://dl.fbaipublicfiles.com/opencatalystproject/data/is2res_train_val_test_lmdbs.tar.gz" +OC22_IS2RE_URL = "https://dl.fbaipublicfiles.com/opencatalystproject/data/oc22/is2res_total_train_val_test_lmdbs.tar.gz" + CACHE_DIR = Path.home() / ".cache" / "oc20" @@ -58,7 +72,10 @@ def _uncompress_xz(file_path: str) -> str: output_path = file_path.replace(".xz", "") try: - with lzma.open(file_path, "rb") as f_in, open(output_path, "wb") as f_out: + with ( + lzma.open(file_path, "rb") as f_in, + open(output_path, "wb") as f_out, + ): shutil.copyfileobj(f_in, f_out) os.remove(file_path) return output_path @@ -97,14 +114,23 @@ def report(block_num, block_size, total_size): return target_dir - class _OC20LMDBDataset(Dataset): + """LMDB-based dataset for OC20/OC22. 
+ + Supports: + - S2EF task with flexible train/val/test split specification + - IS2RE/OC22_IS2RE tasks with pre-computed train/val/test splits + """ def __init__( self, - path: Optional[str | Path] = None, - split: Optional[str] = "s2ef_train_200K", + root: str | Path, + task: str = "s2ef", + train_split: str | None = "200K", + val_splits: list[str] | None = None, + test_split: str = "test", download: bool = True, + include_test: bool = True, dtype: torch.dtype = torch.float32, legacy_format: bool = False, ): @@ -112,94 +138,352 @@ def __init__( Parameters ---------- - path : Optional[str | Path] - Path to LMDB directory. If None, uses cache directory. - split : Optional[str] - Which OC20 split to load (e.g., "s2ef_train_200K"). + root : str | Path + Root directory for storing datasets. + task : str + Task type: "s2ef", "is2re", or "oc22_is2re". + train_split : Optional[str] + For S2EF: one of ["200K", "2M", "20M", "all"]. + For IS2RE: ignored (uses precomputed split). + val_splits : Optional[list[str]] + For S2EF: list of validation splits to use. + Can be ["val_id", "val_ood_ads", "val_ood_cat", "val_ood_both"] or subset. + If None, uses all 4 validation splits. + For IS2RE: ignored (uses precomputed split). + test_split : str + For S2EF: "test" (default). + For IS2RE: ignored (uses precomputed split). download : bool Whether to download if not present. + include_test : bool + Whether to download/include test split. If False, validation indices are reused for test. dtype : torch.dtype Data type for tensors. legacy_format : bool Whether to use legacy PyG Data format. 
""" super().__init__() + self.root = Path(root) + self.task = task.lower() self.dtype = dtype self.legacy_format = legacy_format + self.download_flag = download + self.include_test = include_test - if path is None: - if split is None: - raise ValueError("Must provide either path or split") - if split not in SPLITS_TO_URL: + if self.task == "s2ef": + if train_split not in S2EF_TRAIN_SPLITS: raise ValueError( - f"Unknown split: {split}. Available: {list(SPLITS_TO_URL.keys())}" + f"Invalid S2EF train split: {train_split}. " + f"Choose from {list(S2EF_TRAIN_SPLITS.keys())}" ) + self.train_split = train_split - url = SPLITS_TO_URL[split] - dataset_name = os.path.basename(url).split(".")[0] - path = CACHE_DIR / dataset_name - - self.path = Path(path) - - if download and not self.path.exists(): - if split is None: - raise ValueError("Cannot download without specifying a split") - self._download(split) + # Default: use all validation splits + if val_splits is None: + val_splits = list(S2EF_VAL_SPLITS.keys()) + else: + for vs in val_splits: + if vs not in S2EF_VAL_SPLITS: + raise ValueError( + f"Invalid S2EF val split: {vs}. " + f"Choose from {list(S2EF_VAL_SPLITS.keys())}" + ) + self.val_splits = val_splits + self.test_split = test_split + + elif self.task in ["is2re", "oc22_is2re"]: + # IS2RE datasets have precomputed train/val/test splits + pass + else: + raise ValueError( + f"Unknown task: {task}. 
Choose from ['s2ef', 'is2re', 'oc22_is2re']" + ) - if not self.path.exists(): - raise ValueError(f"Dataset not found at {self.path}") + if download: + self._download_and_prepare() self._open_lmdbs() - def _download(self, split: str): - url = SPLITS_TO_URL[split] - logger.info(f"Downloading {split} dataset...") - _download_and_extract(url, self.path) + def _download_and_prepare(self): + """Download and prepare the dataset based on task.""" + if self.task == "s2ef": + self._download_s2ef() + elif self.task == "is2re": + self._download_is2re(IS2RE_URL, "is2re") + elif self.task == "oc22_is2re": + self._download_is2re(OC22_IS2RE_URL, "oc22_is2re") + + def _download_s2ef(self): + """Download S2EF train, validation, and test splits.""" + # Download train split + train_url = S2EF_TRAIN_SPLITS[self.train_split] + train_dir = self.root / "s2ef" / self.train_split / "train" + if not train_dir.exists(): + logger.info(f"Downloading S2EF train split: {self.train_split}") + _download_and_extract( + train_url, self.root / "s2ef" / self.train_split + ) + self._decompress_xz_files(self.root / "s2ef" / self.train_split) + + # Download validation splits + for val_split in self.val_splits: + val_url = S2EF_VAL_SPLITS[val_split] + val_dir = self.root / "s2ef" / "all" / val_split + if not val_dir.exists(): + logger.info(f"Downloading S2EF validation split: {val_split}") + _download_and_extract(val_url, self.root / "s2ef" / "all") + self._decompress_xz_files(self.root / "s2ef" / "all") + + # Download test split + test_dir = self.root / "s2ef" / "all" / "test" + if self.include_test and not test_dir.exists(): + logger.info("Downloading S2EF test split") + _download_and_extract(S2EF_TEST_SPLIT, self.root / "s2ef" / "all") + self._decompress_xz_files(self.root / "s2ef" / "all") + elif not self.include_test: + logger.info( + "Skipping S2EF test split download (include_test=False); will reuse validation as test" + ) + + # Preprocess S2EF data (convert extxyz/txt to LMDB if needed) + 
self._preprocess_s2ef() + + def _preprocess_s2ef(self): + """Preprocess S2EF data from extxyz/txt to LMDB format if needed.""" + # Check if any split needs preprocessing + train_dir = self.root / "s2ef" / self.train_split / "train" + needs_any_preprocessing = needs_preprocessing(train_dir, train_dir) + + if not needs_any_preprocessing: + for val_split in self.val_splits: + val_dir = self.root / "s2ef" / "all" / val_split + if needs_preprocessing(val_dir, val_dir): + needs_any_preprocessing = True + break + + if not needs_any_preprocessing and self.include_test: + test_dir = self.root / "s2ef" / "all" / "test" + needs_any_preprocessing = needs_preprocessing(test_dir, test_dir) + + if needs_any_preprocessing: + logger.info( + "S2EF data needs preprocessing from extxyz/txt to LMDB format" + ) + try: + preprocess_s2ef_dataset( + root=self.root, + train_split=self.train_split, + val_splits=self.val_splits, + include_test=self.include_test, + ) + except ImportError: + logger.error( + "Cannot preprocess S2EF data: fairchem-core or ASE not installed. " + "Install with: pip install fairchem-core ase" + ) + raise + else: + logger.info("S2EF data already preprocessed (LMDB files found)") - xz_files = list(self.path.glob("**/*.xz")) + def _decompress_xz_files(self, directory: Path): + """Decompress all .xz files in a directory.""" + xz_files = list(directory.glob("**/*.xz")) if xz_files: - logger.info(f"Decompressing {len(xz_files)} .xz files...") - from concurrent.futures import ProcessPoolExecutor, as_completed - - num_workers = max(1, mp.cpu_count() - 1) - with ProcessPoolExecutor(max_workers=num_workers) as executor: - futures = [executor.submit(_uncompress_xz, str(f)) for f in xz_files] - for future in tqdm( - as_completed(futures), total=len(futures), desc="Decompressing" - ): + logger.info( + f"Decompressing {len(xz_files)} .xz files in {directory}..." 
+ ) + num_workers = max(1, os.cpu_count() - 1) + # Use threads to avoid pickling/import issues with processes on macOS + with ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [ + executor.submit(_uncompress_xz, str(f)) for f in xz_files + ] + for future in as_completed(futures): future.result() + def _download_is2re(self, url: str, name: str): + """Download IS2RE or OC22 IS2RE dataset.""" + target_dir = self.root / name + if not target_dir.exists(): + logger.info(f"Downloading {name} dataset") + _download_and_extract(url, self.root) + self._decompress_xz_files(self.root) def _open_lmdbs(self): - if self.path.is_dir(): - lmdb_paths = sorted(self.path.glob("**/*.lmdb")) - else: - lmdb_paths = [self.path] + """Open LMDB files for train/val/test splits.""" + if self.task == "s2ef": + self._open_s2ef_lmdbs() + elif self.task in ["is2re", "oc22_is2re"]: + self._open_is2re_lmdbs() + + def _open_s2ef_lmdbs(self): + """Open S2EF LMDB files and create split mappings.""" + # Train + train_dir = self.root / "s2ef" / self.train_split / "train" + train_lmdbs = self._collect_lmdb_files(train_dir) + + # Validation (can be multiple) + val_lmdbs = [] + for val_split in self.val_splits: + val_dir = self.root / "s2ef" / "all" / val_split + val_lmdbs.extend(self._collect_lmdb_files(val_dir)) + + # Test + test_dir = self.root / "s2ef" / "all" / "test" + test_lmdbs = ( + self._collect_lmdb_files(test_dir) if self.include_test else [] + ) + + # Open all LMDBs and create split index mapping + self.envs = [] + self.cumulative_sizes = [0] + self.split_idx = {"train": [], "valid": [], "test": []} + + current_idx = 0 + + # Process train LMDBs + for lmdb_path in train_lmdbs: + env, size = self._open_single_lmdb(lmdb_path) + self.envs.append((lmdb_path, env, size)) + self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) + self.split_idx["train"].extend( + range(current_idx, current_idx + size) + ) + current_idx += size + + # Process validation LMDBs + for lmdb_path 
in val_lmdbs: + env, size = self._open_single_lmdb(lmdb_path) + self.envs.append((lmdb_path, env, size)) + self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) + self.split_idx["valid"].extend( + range(current_idx, current_idx + size) + ) + current_idx += size + + # Process test LMDBs + for lmdb_path in test_lmdbs: + env, size = self._open_single_lmdb(lmdb_path) + self.envs.append((lmdb_path, env, size)) + self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) + self.split_idx["test"].extend( + range(current_idx, current_idx + size) + ) + current_idx += size + + # If no test data, reuse validation indices + if not self.include_test or len(self.split_idx["test"]) == 0: + self.split_idx["test"] = list(self.split_idx["valid"]) + + # Convert to tensors + self.split_idx = { + k: torch.tensor(v, dtype=torch.long) + for k, v in self.split_idx.items() + } + + logger.info( + f"Loaded S2EF dataset: {len(self.split_idx['train'])} train, " + f"{len(self.split_idx['valid'])} val, {len(self.split_idx['test'])} test" + ) - if not lmdb_paths: - raise ValueError(f"No LMDB files found in {self.path}") + def _open_is2re_lmdbs(self): + """Open IS2RE LMDB files with precomputed splits.""" + # IS2RE datasets have structure: data/is2re/train, data/is2re/val_id, data/is2re/test_id + # or data/is2re/all/train, etc. 
+ base_dir = self.root / ( + "is2re" if self.task == "is2re" else "oc22_is2re" + ) + # Try different possible structures + possible_structures = [ + base_dir, + base_dir / "data" / "is2re", + self.root / "data" / "is2re", + ] + + found_dir = None + for poss_dir in possible_structures: + if poss_dir.exists(): + found_dir = poss_dir + break + + if found_dir is None: + raise ValueError(f"Cannot find IS2RE data directory in {base_dir}") + + # Look for train/val/test subdirectories + train_lmdbs = self._collect_lmdb_files(found_dir / "train") + val_lmdbs = self._collect_lmdb_files( + found_dir / "val_id" + ) or self._collect_lmdb_files(found_dir / "val") + test_lmdbs = self._collect_lmdb_files( + found_dir / "test_id" + ) or self._collect_lmdb_files(found_dir / "test") + + # Open all LMDBs self.envs = [] self.cumulative_sizes = [0] + self.split_idx = {"train": [], "valid": [], "test": []} + + current_idx = 0 - for lmdb_path in lmdb_paths: - env = lmdb.open( - str(lmdb_path.resolve()), - subdir=False, - readonly=True, - lock=False, - readahead=True, - meminit=False, - max_readers=1, + for lmdb_path in train_lmdbs: + env, size = self._open_single_lmdb(lmdb_path) + self.envs.append((lmdb_path, env, size)) + self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) + self.split_idx["train"].extend( + range(current_idx, current_idx + size) ) - size = env.stat()["entries"] + current_idx += size + for lmdb_path in val_lmdbs: + env, size = self._open_single_lmdb(lmdb_path) self.envs.append((lmdb_path, env, size)) self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) + self.split_idx["valid"].extend( + range(current_idx, current_idx + size) + ) + current_idx += size + + for lmdb_path in test_lmdbs: + env, size = self._open_single_lmdb(lmdb_path) + self.envs.append((lmdb_path, env, size)) + self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) + self.split_idx["test"].extend( + range(current_idx, current_idx + size) + ) + current_idx += size + + # Convert 
to tensors + self.split_idx = { + k: torch.tensor(v, dtype=torch.long) + for k, v in self.split_idx.items() + } logger.info( - f"Loaded {len(self.envs)} LMDB files with {len(self)} total entries" + f"Loaded {self.task.upper()} dataset: {len(self.split_idx['train'])} train, " + f"{len(self.split_idx['valid'])} val, {len(self.split_idx['test'])} test" + ) + + def _collect_lmdb_files(self, directory: Path) -> list[Path]: + """Collect all .lmdb files in a directory.""" + if not directory.exists(): + return [] + lmdb_files = sorted(directory.glob("**/*.lmdb")) + return lmdb_files + + def _open_single_lmdb(self, lmdb_path: Path) -> tuple: + """Open a single LMDB file and return (env, size).""" + env = lmdb.open( + str(lmdb_path.resolve()), + subdir=False, + readonly=True, + lock=False, + readahead=True, + meminit=False, + max_readers=1, ) + size = env.stat()["entries"] + return env, size def _find_lmdb_and_local_idx(self, idx: int) -> tuple: if idx < 0 or idx >= len(self): @@ -217,7 +501,7 @@ def _find_lmdb_and_local_idx(self, idx: int) -> tuple: local_idx = idx - self.cumulative_sizes[lmdb_idx] return lmdb_idx, local_idx - def len(self) -> int: + def len(self) -> int: return self.cumulative_sizes[-1] def get(self, idx: int) -> Data: @@ -231,13 +515,17 @@ def get(self, idx: int) -> Data: for _ in range(local_idx): if not cursor.next(): - raise RuntimeError(f"Index {local_idx} out of range in {lmdb_path}") + raise RuntimeError( + f"Index {local_idx} out of range in {lmdb_path}" + ) key, value = cursor.item() data = pickle.loads(value) if self.legacy_format and isinstance(data, Data): - data = Data(**{k: v for k, v in data.__dict__.items() if v is not None}) + data = Data( + **{k: v for k, v in data.__dict__.items() if v is not None} + ) return data @@ -257,67 +545,6 @@ def __del__(self): env.close() -class _TinyOC20Dataset(InMemoryDataset): - """A tiny synthetic OC20-like dataset for tests and quick runs. 
- - Each sample is a small "molecule on surface" graph with: - - x: atom features (random floats) - - pos: 3D positions - - z: atomic numbers (ints) - - y: target energy (scalar regression) - """ - - def __init__( - self, - root: str | Path, - num_samples: int = 64, - min_nodes: int = 5, - max_nodes: int = 12, - num_node_features: int = 6, - seed: int = 0, - ) -> None: - super().__init__(str(root)) - self._num_samples = num_samples - self._min_nodes = min_nodes - self._max_nodes = max_nodes - self._num_node_features = num_node_features - self._rng = random.Random(seed) - self._torch_rng = torch.Generator().manual_seed(seed) - - # Generate data list - data_list: list[Data] = [] - for _ in range(num_samples): - n = self._rng.randint(self._min_nodes, self._max_nodes) - pos = torch.randn((n, 3), generator=self._torch_rng) - x = torch.randn((n, self._num_node_features), generator=self._torch_rng) - z = torch.randint(low=1, high=86, size=(n,), generator=self._torch_rng) - # Fully-connected edge index for small graphs - row = torch.arange(n).repeat_interleave(n) - col = torch.arange(n).repeat(n) - edge_index = torch.stack([row, col], dim=0) - # Scalar target (e.g., energy) - y = torch.randn(1, generator=self._torch_rng) - data_list.append(Data(x=x, pos=pos, z=z, edge_index=edge_index, y=y)) - - data, slices = self.collate(data_list) - self.data, self.slices = data, slices - - # Pre-generate split indices for reproducibility (60/20/20) - idx = list(range(num_samples)) - self._rng.shuffle(idx) - n_train = int(0.6 * num_samples) - n_val = int(0.2 * num_samples) - self.split_idx = { - "train": torch.tensor(idx[:n_train], dtype=torch.long), - "valid": torch.tensor(idx[n_train : n_train + n_val], dtype=torch.long), - "test": torch.tensor(idx[n_train + n_val :], dtype=torch.long), - } - - @property - def num_node_features(self) -> int: # type: ignore[override] - return self._num_node_features - - class OC20DatasetLoader(AbstractLoader): """Load OC20 family datasets. 
@@ -326,73 +553,82 @@ class OC20DatasetLoader(AbstractLoader): Parameters in the Hydra config (dataset.loader.parameters): - data_domain: graph - data_type: oc20 - - data_name: Logical name for the dataset (e.g., OC20_S2EF) - - mode: "tiny" (default) or "lmdb" - - split: OC20 split name when mode=="lmdb" (e.g., "s2ef_train_200K", "is2re", etc.) - - download: whether to download when mode=="lmdb" (default: false) + - data_name: Logical name for the dataset (e.g., OC20_S2EF_200K) + - task: "s2ef", "is2re", or "oc22_is2re" + + For S2EF task: + - train_split: one of ["200K", "2M", "20M", "all"] + - val_splits: list of validation splits (default: all 4) + Options: ["val_id", "val_ood_ads", "val_ood_cat", "val_ood_both"] + - test_split: "test" (default) + + For IS2RE/OC22 tasks: + - Uses precomputed train/val/test splits from the LMDB archives + + Common parameters: + - download: whether to download (default: false) - legacy_format: whether to use legacy PyG Data format (default: false) - dtype: torch dtype (default: "float32") - - Supported OC20 splits: - - s2ef_train_200K, s2ef_train_2M, s2ef_train_20M, s2ef_train_all - - s2ef_val_id, s2ef_val_ood_ads, s2ef_val_ood_cat, s2ef_val_ood_both - - s2ef_test - - surfaces - - is2re - - oc22_is2re """ def __init__(self, parameters: DictConfig) -> None: super().__init__(parameters) def load_dataset(self) -> Dataset: - mode: str = getattr(self.parameters, "mode", "tiny") - - if mode == "tiny": - # Fast, dependency-free tiny dataset for CI/tests - return _TinyOC20Dataset( - root=self.get_data_dir(), - num_samples=int(getattr(self.parameters, "num_samples", 64)), - min_nodes=int(getattr(self.parameters, "min_nodes", 5)), - max_nodes=int(getattr(self.parameters, "max_nodes", 12)), - num_node_features=int( - getattr(self.parameters, "num_node_features", 6) - ), - seed=int(getattr(self.parameters, "seed", 0)), - ) + task: str = getattr(self.parameters, "task", "s2ef") + download: bool = bool(getattr(self.parameters, "download", 
False)) + legacy_format: bool = bool( + getattr(self.parameters, "legacy_format", False) + ) + dtype = getattr(self.parameters, "dtype", "float32") + dtype_t = ( + getattr(torch, str(dtype)) if isinstance(dtype, str) else dtype + ) + + if task == "s2ef": + train_split = getattr(self.parameters, "train_split", "200K") + val_splits_param = getattr(self.parameters, "val_splits", None) + + # Parse val_splits + if val_splits_param is None: + val_splits = None # Use all by default + elif isinstance(val_splits_param, str): + # Single validation split as string + val_splits = [val_splits_param] + elif isinstance(val_splits_param, (list, tuple)): + val_splits = list(val_splits_param) + else: + val_splits = None + + test_split = getattr(self.parameters, "test_split", "test") + include_test = bool(getattr(self.parameters, "include_test", True)) - if mode == "lmdb": - split: Optional[str] = getattr(self.parameters, "split", None) - download: bool = bool(getattr(self.parameters, "download", False)) - legacy_format: bool = bool( - getattr(self.parameters, "legacy_format", False) + ds = _OC20LMDBDataset( + root=self.get_data_dir(), + task="s2ef", + train_split=train_split, + val_splits=val_splits, + test_split=test_split, + download=download, + include_test=include_test, + dtype=dtype_t, + legacy_format=legacy_format, ) - dtype = getattr(self.parameters, "dtype", "float32") - dtype_t = getattr(torch, str(dtype)) if isinstance(dtype, str) else dtype + elif task in ["is2re", "oc22_is2re"]: ds = _OC20LMDBDataset( - path=None, # let backend resolve via split/cache - split=split, + root=self.get_data_dir(), + task=task, download=download, dtype=dtype_t, legacy_format=legacy_format, ) - - # Expose split_idx for TopoBench compatibility - n = len(ds) - idx = torch.arange(n) - n_train = int(0.8 * n) - n_val = int(0.1 * n) - ds.split_idx = { # type: ignore[attr-defined] - "train": idx[:n_train], - "valid": idx[n_train : n_train + n_val], - "test": idx[n_train + n_val :], - } - return ds # 
type: ignore[return-value] - - raise ValueError( - f"Unsupported mode '{mode}'. Use 'tiny' (default) or 'lmdb'." - ) + else: + raise ValueError( + f"Unsupported task '{task}'. Use 's2ef', 'is2re', or 'oc22_is2re'." + ) + + return ds # type: ignore[return-value] def get_data_dir(self) -> Path: # Keep default directory convention for TopoBench diff --git a/topobench/data/preprocessor/oc20_s2ef_preprocessor.py b/topobench/data/preprocessor/oc20_s2ef_preprocessor.py new file mode 100644 index 000000000..087c3952f --- /dev/null +++ b/topobench/data/preprocessor/oc20_s2ef_preprocessor.py @@ -0,0 +1,339 @@ +"""S2EF preprocessing, adapted from OC20's preprocess_ef.py. +----------- +Copyright (c) Meta Platforms, Inc. and affiliates. + +This source code is licensed under the MIT license found in the +LICENSE file in the root directory of this source tree. + +Creates LMDB files with extracted graph features from provided *.extxyz files +for the S2EF task. +""" + +from __future__ import annotations + +import glob +import logging +import multiprocessing as mp +import os +import pickle +from pathlib import Path + +import lmdb +import numpy as np +import torch +from tqdm import tqdm + +logger = logging.getLogger(__name__) + +# Try importing ASE and fairchem dependencies +try: + import ase.io + from fairchem.core.preprocessing import AtomsToGraphs + + HAS_FAIRCHEM = True +except ImportError: + HAS_FAIRCHEM = False + logger.warning( + "fairchem-core or ASE not installed. S2EF preprocessing will not be available. 
" + "Install with: pip install fairchem-core ase" + ) + + +def _write_images_to_lmdb(mp_arg): + """Write trajectory frames to LMDB (worker function).""" + if not HAS_FAIRCHEM: + raise ImportError("fairchem-core is required for S2EF preprocessing") + + ( + a2g, + db_path, + samples, + sampled_ids, + idx, + pid, + data_path, + ref_energy, + test_data, + get_edges, + ) = mp_arg + + db = lmdb.open( + db_path, + map_size=1099511627776 * 2, + subdir=False, + meminit=False, + map_async=True, + ) + + pbar = tqdm( + total=sum(1 for s in samples for line in open(s)), # noqa: SIM115 + position=pid, + desc=f"Worker {pid} preprocessing", + leave=False, + ) + + for sample in samples: + with open(sample) as fp: + traj_logs = fp.read().splitlines() + + xyz_idx = os.path.splitext(os.path.basename(sample))[0] + traj_path = os.path.join(data_path, f"{xyz_idx}.extxyz") + + if not os.path.exists(traj_path): + logger.warning(f"Missing extxyz file: {traj_path}, skipping") + continue + + traj_frames = ase.io.read(traj_path, ":") + + for i, frame in enumerate(traj_frames): + if i >= len(traj_logs): + logger.warning( + f"Log mismatch for {traj_path} frame {i}, skipping" + ) + continue + + frame_log = traj_logs[i].split(",") + sid = int(frame_log[0].split("random")[1]) + fid = int(frame_log[1].split("frame")[1]) + + data_object = a2g.convert(frame) + data_object.tags = torch.LongTensor(frame.get_tags()) + data_object.sid = sid + data_object.fid = fid + + # Subtract off reference energy if needed + if ref_energy and not test_data and len(frame_log) > 2: + ref_energy_val = float(frame_log[2]) + data_object.energy -= ref_energy_val + + txn = db.begin(write=True) + txn.put( + f"{idx}".encode("ascii"), + pickle.dumps(data_object, protocol=-1), + ) + txn.commit() + idx += 1 + sampled_ids.append(",".join(frame_log[:2]) + "\n") + pbar.update(1) + + # Save count of objects in lmdb + txn = db.begin(write=True) + txn.put("length".encode("ascii"), pickle.dumps(idx, protocol=-1)) + txn.commit() + + 
db.sync() + db.close() + pbar.close() + + return sampled_ids, idx + + +def preprocess_s2ef_split( + data_path: Path, + out_path: Path, + num_workers: int = 1, + ref_energy: bool = True, + test_data: bool = False, + get_edges: bool = False, +) -> None: + """Preprocess S2EF data from extxyz/txt to LMDB format. + + Parameters + ---------- + data_path : Path + Path to directory containing *.extxyz and *.txt files. + out_path : Path + Directory to save LMDB files. + num_workers : int + Number of parallel workers for preprocessing. + ref_energy : bool + Whether to subtract reference energies. + test_data : bool + Whether this is test data (no energy/forces). + get_edges : bool + Whether to precompute and store edge indices (~10x storage). + """ + if not HAS_FAIRCHEM: + raise ImportError( + "fairchem-core and ASE are required for S2EF preprocessing. " + "Install with: pip install fairchem-core ase" + ) + + logger.info(f"Preprocessing S2EF data from {data_path} to {out_path}") + + # Find all txt files + xyz_logs = glob.glob(str(data_path / "*.txt")) + if not xyz_logs: + raise RuntimeError(f"No *.txt files found in {data_path}") + + num_workers = min(num_workers, len(xyz_logs)) + + # Initialize feature extractor + a2g = AtomsToGraphs( + max_neigh=50, + radius=6, + r_energy=not test_data, + r_forces=not test_data, + r_fixed=True, + r_distances=False, + r_edges=get_edges, + ) + + # Create output directory + out_path.mkdir(parents=True, exist_ok=True) + + # Initialize LMDB paths + db_paths = [ + str(out_path / f"data.{i:04d}.lmdb") for i in range(num_workers) + ] + + # Chunk trajectories into workers + chunked_txt_files = np.array_split(xyz_logs, num_workers) + + # Extract features in parallel + sampled_ids = [[]] * num_workers + idx = [0] * num_workers + + logger.info(f"Starting preprocessing with {num_workers} workers...") + + with mp.Pool(num_workers) as pool: + mp_args = [ + ( + a2g, + db_paths[i], + chunked_txt_files[i], + sampled_ids[i], + idx[i], + i, + str(data_path), 
+ ref_energy, + test_data, + get_edges, + ) + for i in range(num_workers) + ] + op = list( + zip(*pool.imap(_write_images_to_lmdb, mp_args), strict=False) + ) + sampled_ids, idx = list(op[0]), list(op[1]) + + # Write logs + for j, i in enumerate(range(num_workers)): + log_path = out_path / f"data_log.{i:04d}.txt" + with open(log_path, "w") as ids_log: + ids_log.writelines(sampled_ids[j]) + + total_samples = sum(idx) + logger.info( + f"Preprocessing complete: {total_samples} samples written to {out_path}" + ) + + +def needs_preprocessing(raw_dir: Path, processed_dir: Path) -> bool: + """Check if a split needs preprocessing. + + Parameters + ---------- + raw_dir : Path + Directory containing raw extxyz/txt files. + processed_dir : Path + Directory where LMDB files should be. + + Returns + ------- + bool + True if preprocessing is needed. + """ + if not raw_dir.exists(): + return False + + # Check if processed directory has LMDB files + if not processed_dir.exists(): + return True + + lmdb_files = list(processed_dir.glob("*.lmdb")) + return len(lmdb_files) == 0 + + +def preprocess_s2ef_dataset( + root: Path, + train_split: str, + val_splits: list[str], + include_test: bool = True, + num_workers: int | None = None, +) -> None: + """Preprocess entire S2EF dataset (train/val/test splits). + + Parameters + ---------- + root : Path + Root directory containing S2EF data. + train_split : str + Train split name (e.g., "200K"). + val_splits : list[str] + List of validation split names. + include_test : bool + Whether to preprocess test split. + num_workers : Optional[int] + Number of parallel workers (default: CPU count - 1). + """ + if not HAS_FAIRCHEM: + raise ImportError( + "fairchem-core and ASE are required for S2EF preprocessing. 
" + "Install with: pip install fairchem-core ase" + ) + + if num_workers is None: + num_workers = max(1, mp.cpu_count() - 1) + + s2ef_root = root / "s2ef" + + # Preprocess train split + train_raw = s2ef_root / train_split / "train" + train_processed = train_raw # Store LMDBs alongside raw data + + if needs_preprocessing(train_raw, train_processed): + logger.info(f"Preprocessing train split: {train_split}") + preprocess_s2ef_split( + train_raw, + train_processed, + num_workers=num_workers, + ref_energy=True, + test_data=False, + ) + else: + logger.info(f"Train split {train_split} already preprocessed") + + # Preprocess validation splits + for val_split in val_splits: + val_raw = s2ef_root / "all" / val_split + val_processed = val_raw + + if needs_preprocessing(val_raw, val_processed): + logger.info(f"Preprocessing validation split: {val_split}") + preprocess_s2ef_split( + val_raw, + val_processed, + num_workers=num_workers, + ref_energy=True, + test_data=False, + ) + else: + logger.info(f"Validation split {val_split} already preprocessed") + + # Preprocess test split + if include_test: + test_raw = s2ef_root / "all" / "test" + test_processed = test_raw + + if needs_preprocessing(test_raw, test_processed): + logger.info("Preprocessing test split") + preprocess_s2ef_split( + test_raw, + test_processed, + num_workers=num_workers, + ref_energy=False, + test_data=True, + ) + else: + logger.info("Test split already preprocessed") From 2d40f9d4e7b51ebad2b2fc5d03a2419e957b1051 Mon Sep 17 00:00:00 2001 From: theosaulus Date: Tue, 18 Nov 2025 18:08:41 -0500 Subject: [PATCH 03/17] preprocessing seems to work now although slow --- .../dataset/graph/OC20_S2EF_train_200K.yaml | 6 +- test_oc20_integration.py | 227 ----- .../loaders/graph/oc20_asedbs2ef_loader.py | 430 ++++++++++ .../data/loaders/graph/oc20_dataset_loader.py | 284 ++++++- .../preprocessor/oc20_s2ef_preprocessor.py | 784 ++++++++++++------ topobench/data/preprocessor/preprocessor.py | 98 ++- 
topobench/data/utils/split_utils.py | 138 ++- .../nsd_utils/inductive_discrete_models.py | 24 +- .../nn/backbones/graph/nsd_utils/laplace.py | 1 + .../graph/nsd_utils/laplacian_builders.py | 7 +- .../graph2simplicial/latentclique_lifting.py | 3 +- 11 files changed, 1408 insertions(+), 594 deletions(-) delete mode 100644 test_oc20_integration.py create mode 100644 topobench/data/loaders/graph/oc20_asedbs2ef_loader.py diff --git a/configs/dataset/graph/OC20_S2EF_train_200K.yaml b/configs/dataset/graph/OC20_S2EF_train_200K.yaml index caaa8db70..b9b9b8601 100644 --- a/configs/dataset/graph/OC20_S2EF_train_200K.yaml +++ b/configs/dataset/graph/OC20_S2EF_train_200K.yaml @@ -17,6 +17,7 @@ loader: include_test: false # Skip test download, reuse validation as test legacy_format: false dtype: float32 + max_samples: 10 # Set to integer (e.g., 1000) to limit dataset size for fast experiments parameters: num_features: 6 # Will be determined by the actual data @@ -28,9 +29,12 @@ parameters: split_params: learning_setting: inductive - split_type: fixed # splits are provided by the dataset + split_type: random # Use random splits for small test datasets data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} data_seed: 0 + train_prop: 0.6 # 60% training + val_prop: 0.2 # 20% validation + test_prop: 0.2 # 20% test dataloader_params: batch_size: 32 diff --git a/test_oc20_integration.py b/test_oc20_integration.py deleted file mode 100644 index c039ca043..000000000 --- a/test_oc20_integration.py +++ /dev/null @@ -1,227 +0,0 @@ -"""Test script for OC20 S2EF preprocessing integration. - -This script validates that: -1. 
The loader can be try: - loader = OC20DatasetLoader(params_invalid) - # Try to actually load the dataset (this is where validation happens) - _dataset = loader.load_dataset() - # Should have raised ValueError - logger.error("✗ Invalid config was accepted") - raise AssertionError("Invalid train split should raise ValueError") - except ValueError: - logger.info("✓ Invalid config properly rejected") - except Exception as e:ed -2. Download works (if enabled) -3. Preprocessing is triggered when needed -4. LMDB loading works -5. Data format is correct -""" - -import logging -from pathlib import Path - -from omegaconf import DictConfig - -from topobench.data.loaders.graph.oc20_dataset_loader import OC20DatasetLoader - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -def test_s2ef_loader_without_download(): - """Test that loader can be instantiated without downloading.""" - logger.info("Test 1: Loader instantiation without download") - - params = DictConfig( - { - "data_domain": "graph", - "data_type": "oc20", - "data_name": "OC20_S2EF_200K_test", - "data_dir": "/tmp/topobench_oc20_test", - "task": "s2ef", - "train_split": "200K", - "val_splits": None, - "test_split": "test", - "download": False, - "include_test": False, - "legacy_format": False, - "dtype": "float32", - } - ) - - try: - _loader = OC20DatasetLoader(params) - logger.info("✓ Loader instantiated successfully") - except Exception as e: - logger.error(f"✗ Loader instantiation failed: {e}") - raise - - -def test_s2ef_preprocessing_check(): - """Test preprocessing detection logic.""" - logger.info("Test 2: Preprocessing detection") - - from topobench.data.preprocessor.oc20_s2ef_preprocessor import ( - needs_preprocessing, - ) - - # Test with non-existent directories - raw_dir = Path("/tmp/nonexistent_raw") - processed_dir = Path("/tmp/nonexistent_processed") - - result = needs_preprocessing(raw_dir, processed_dir) - assert not result, "Should return False for non-existent raw 
directory" - logger.info("✓ Preprocessing detection works correctly") - - -def test_s2ef_config_validation(): - """Test that configs have required parameters.""" - logger.info("Test 3: Config validation") - - # Valid S2EF config - params = DictConfig( - { - "data_domain": "graph", - "data_type": "oc20", - "data_name": "OC20_S2EF_200K", - "data_dir": "/tmp/topobench_oc20_test", - "task": "s2ef", - "train_split": "200K", - "val_splits": ["val_id"], - "download": False, - "include_test": False, - } - ) - - try: - loader = OC20DatasetLoader(params) - logger.info("✓ Valid S2EF config accepted") - except Exception as e: - logger.error(f"✗ Valid config rejected: {e}") - raise - - # Invalid train split - validation happens in load_dataset() - params_invalid = DictConfig( - { - "data_domain": "graph", - "data_type": "oc20", - "data_name": "OC20_S2EF_invalid", - "data_dir": "/tmp/topobench_oc20_test", - "task": "s2ef", - "train_split": "invalid", - "download": False, - } - ) - - try: - loader = OC20DatasetLoader(params_invalid) - # Try to actually load the dataset (this is where validation happens) - _dataset = loader.load_dataset() - # Should have raised ValueError - logger.error("✗ Invalid config was accepted") - raise AssertionError("Invalid train split should raise ValueError") - except ValueError: - logger.info("✓ Invalid config properly rejected") - except Exception as e: - # Other exceptions (like file not found) are expected since we're not downloading - # What matters is that we get an error for invalid config - if "invalid" in str(e).lower() or "Invalid" in str(e): - logger.info("✓ Invalid config properly rejected") - else: - # Some other error - not a validation error - logger.warning(f"⚠ Got different error: {e}") - logger.info( - "✓ Config validation test passed (error from different source)" - ) - - -def test_is2re_loader(): - """Test IS2RE loader (doesn't require preprocessing).""" - logger.info("Test 4: IS2RE loader") - - params = DictConfig( - { - 
"data_domain": "graph", - "data_type": "oc20", - "data_name": "OC20_IS2RE", - "data_dir": "/tmp/topobench_oc20_test", - "task": "is2re", - "download": False, - } - ) - - try: - _loader = OC20DatasetLoader(params) - logger.info("✓ IS2RE loader instantiated successfully") - except Exception as e: - logger.error(f"✗ IS2RE loader instantiation failed: {e}") - raise - - -def test_preprocessor_import(): - """Test that preprocessor can be imported.""" - logger.info("Test 5: Preprocessor import") - - try: - import topobench.data.preprocessor.oc20_s2ef_preprocessor # noqa: F401 - - logger.info("✓ Preprocessor imports successfully") - except ImportError as e: - logger.error(f"✗ Preprocessor import failed: {e}") - raise - - -def test_fairchem_availability(): - """Test if fairchem dependencies are available (optional).""" - logger.info("Test 6: Fairchem availability (optional)") - - try: - import ase.io # noqa: F401 - import fairchem.core.preprocessing # noqa: F401 - - logger.info("✓ fairchem-core and ASE are installed") - return True - except ImportError: - logger.warning( - "⚠ fairchem-core or ASE not installed (S2EF preprocessing will not work)" - ) - logger.info(" Install with: pip install fairchem-core ase") - return False - - -def run_all_tests(): - """Run all tests.""" - logger.info("=" * 60) - logger.info("OC20 S2EF Preprocessing Integration Tests") - logger.info("=" * 60) - - tests = [ - ("Loader instantiation", test_s2ef_loader_without_download), - ("Preprocessing detection", test_s2ef_preprocessing_check), - ("Config validation", test_s2ef_config_validation), - ("IS2RE loader", test_is2re_loader), - ("Preprocessor import", test_preprocessor_import), - ("Fairchem availability", test_fairchem_availability), - ] - - passed = 0 - failed = 0 - - for test_name, test_func in tests: - try: - test_func() - passed += 1 - except Exception as e: - logger.error(f"Test '{test_name}' failed: {e}") - failed += 1 - - logger.info("=" * 60) - logger.info(f"Results: {passed} passed, 
{failed} failed") - logger.info("=" * 60) - - return failed == 0 - - -if __name__ == "__main__": - success = run_all_tests() - exit(0 if success else 1) diff --git a/topobench/data/loaders/graph/oc20_asedbs2ef_loader.py b/topobench/data/loaders/graph/oc20_asedbs2ef_loader.py new file mode 100644 index 000000000..3e288dc68 --- /dev/null +++ b/topobench/data/loaders/graph/oc20_asedbs2ef_loader.py @@ -0,0 +1,430 @@ +"""Loader for OC20 S2EF dataset using ASE DB backend.""" + +from __future__ import annotations + +import logging +from pathlib import Path + +import torch +from omegaconf import DictConfig +from torch_geometric.data import Data, Dataset + +from topobench.data.loaders.base import AbstractLoader +from topobench.data.preprocessor.oc20_s2ef_preprocessor import ( + HAS_ASE, + AtomsToGraphs, + needs_preprocessing, + preprocess_s2ef_split_ase, +) + +if HAS_ASE: + import ase.db + +logger = logging.getLogger(__name__) + + +class OC20ASEDBDataset(Dataset): + """ASE DB dataset for OC20 S2EF structures. + + Parameters + ---------- + db_paths : list[str | Path] | None + Backwards-compatible single list of DBs without explicit splits. + train_db_paths : list[str | Path] | None + List of ASE DB file paths for the training split. + val_db_paths : list[str | Path] | None + List of ASE DB file paths for the validation split. + test_db_paths : list[str | Path] | None + List of ASE DB file paths for the test split. + max_neigh : int + Maximum number of neighbors per atom. + radius : float + Cutoff radius in Angstroms. + dtype : torch.dtype + Torch dtype used for tensors. + include_energy : bool + Whether to include energy information. + include_forces : bool + Whether to include forces information. + max_samples : int | None + Maximum number of samples to load for debugging or fast runs. 
+ """ + + def __init__( + self, + db_paths: list[str | Path] | None = None, + *, + train_db_paths: list[str | Path] | None = None, + val_db_paths: list[str | Path] | None = None, + test_db_paths: list[str | Path] | None = None, + max_neigh: int = 50, + radius: float = 6.0, + dtype: torch.dtype = torch.float32, + include_energy: bool = True, + include_forces: bool = True, + max_samples: int | None = None, + ): + """Initialize dataset from ASE DB files. + + See class docstring for parameter descriptions. + """ + if not HAS_ASE: + raise ImportError("ASE required for S2EF datasets") + + super().__init__() + self.dtype = dtype + + # Converter + self.converter = AtomsToGraphs( + max_neigh=max_neigh, + radius=radius, + r_energy=include_energy, + r_forces=include_forces, + r_distances=True, + r_edges=True, + r_fixed=True, + ) + + # Normalize input options + if db_paths is not None and any( + x is not None + for x in (train_db_paths, val_db_paths, test_db_paths) + ): + raise ValueError( + "Provide either `db_paths` or the split-specific lists, not both." 
+ ) + + if db_paths is not None: + train_db_paths = list(db_paths) + val_db_paths = [] + test_db_paths = [] + else: + train_db_paths = list(train_db_paths or []) + val_db_paths = list(val_db_paths or []) + test_db_paths = list(test_db_paths or []) + + # Track DB files per split + self._per_split_db_paths: dict[str, list[Path]] = { + "train": [Path(p) for p in train_db_paths], + "valid": [Path(p) for p in val_db_paths], + "test": [Path(p) for p in test_db_paths], + } + self.db_paths: list[Path] = ( + self._per_split_db_paths["train"] + + self._per_split_db_paths["valid"] + + self._per_split_db_paths["test"] + ) + + # Count total structures and build split indices + self._num_samples = 0 + self._db_ranges: list[ + tuple[Path, int, int] + ] = [] # (db_path, start, end) + self.split_idx: dict[str, list[int]] = { + "train": [], + "valid": [], + "test": [], + } + + for split_name in ("train", "valid", "test"): + for db_path in self._per_split_db_paths[split_name]: + with ase.db.connect(str(db_path)) as db: + count = db.count() + start = self._num_samples + end = start + count + self._db_ranges.append((db_path, start, end)) + # Append global indices for this DB to the right split + self.split_idx[split_name].extend(range(start, end)) + self._num_samples = end + + # Apply max_samples limit if specified + if max_samples is not None and max_samples < self._num_samples: + logger.info( + f"Limiting dataset from {self._num_samples} to {max_samples} samples" + ) + # Truncate all splits proportionally + for split_name in ("train", "valid", "test"): + if self.split_idx[split_name]: + original_len = len(self.split_idx[split_name]) + new_len = int( + original_len * max_samples / self._num_samples + ) + if new_len > 0: + self.split_idx[split_name] = self.split_idx[ + split_name + ][:new_len] + else: + self.split_idx[split_name] = [] + self._num_samples = max_samples + + logger.info( + f"Loaded {len(self.db_paths)} DB files with {self._num_samples} total structures" + ) + + def 
__len__(self) -> int: + """Return dataset length. + + Returns + ------- + int + Number of samples in the dataset. + """ + return self._num_samples + + def _get_db_and_idx(self, idx: int) -> tuple[Path, int]: + """Get DB path and local index for global index. + + Parameters + ---------- + idx : int + Global index. + + Returns + ------- + tuple[Path, int] + Database path and local index within that database. + """ + if idx < 0 or idx >= self._num_samples: + raise IndexError( + f"Index {idx} out of range [0, {self._num_samples})" + ) + # Binary search could be used; linear scan is fine for moderate DB counts + for db_path, start, end in self._db_ranges: + if start <= idx < end: + local_idx = (idx - start) + 1 # ASE rows are 1-indexed + return db_path, local_idx + raise IndexError(f"Index {idx} not found in DB ranges") + + def __getitem__(self, idx: int) -> Data: + """Get a single graph by index. + + Parameters + ---------- + idx : int + Index of the graph to retrieve. + + Returns + ------- + Data + PyTorch Geometric Data object. + """ + db_path, local_idx = self._get_db_and_idx(idx) + + with ase.db.connect(str(db_path)) as db: + row = db.get(id=local_idx) + atoms = row.toatoms() + + # Add metadata + if hasattr(row, "data") and row.data: + atoms.info["data"] = row.data + + # Convert to PyG + data = self.converter.convert(atoms) + + # Cast dtype + if getattr(data, "pos", None) is not None: + data.pos = data.pos.to(self.dtype) + if getattr(data, "edge_attr", None) is not None: + data.edge_attr = data.edge_attr.to(self.dtype) + + return data + + +class OC20S2EFDatasetLoader(AbstractLoader): + """Loader for OC20 S2EF dataset using ASE DB. + + Parameters + ---------- + parameters : DictConfig + Configuration dictionary (usually hydra DictConfig) with dataset options. + """ + + def __init__(self, parameters: DictConfig) -> None: + super().__init__(parameters) + + def load_dataset(self) -> Dataset: + """Load OC20 S2EF dataset from ASE database files. 
+ + Configuration parameters: + - train_split: "200K", "2M", "20M", or "all" + - val_splits: list like ["val_id", "val_ood_ads"] or None for all + - include_test: bool + - download: bool (download raw data) + - max_neigh: int (default 50) + - radius: float (default 6.0 Angstroms) + - dtype: str (default "float32") + + Returns + ------- + OC20ASEDBDataset + Dataset with train/val/test splits. + """ + if not HAS_ASE: + raise ImportError("ASE required for OC20 S2EF dataset") + + data_dir = Path(self.get_data_dir()) + train_split = getattr(self.parameters, "train_split", "200K") + val_splits_param = getattr(self.parameters, "val_splits", None) + if val_splits_param is None: + # default: use all 4 validation splits + val_splits = [ + "val_id", + "val_ood_ads", + "val_ood_cat", + "val_ood_both", + ] + elif isinstance(val_splits_param, (list, tuple)): + val_splits = list(val_splits_param) + else: + val_splits = [str(val_splits_param)] + + include_test = bool(getattr(self.parameters, "include_test", False)) + download = bool(getattr(self.parameters, "download", False)) + max_neigh = int(getattr(self.parameters, "max_neigh", 50)) + radius = float(getattr(self.parameters, "radius", 6.0)) + dtype_str = str(getattr(self.parameters, "dtype", "float32")) + dtype = getattr(torch, dtype_str) + + # Download if needed (raw extxyz/txt files) - not implemented + if download: + logger.warning( + f"S2EF download not implemented. 
Please download manually to {data_dir}/s2ef/{train_split}/train" + ) + + # Preprocess to ASE DB if needed for train/val/test + self._ensure_asedb_preprocessed( + data_dir, train_split, val_splits, include_test, max_neigh, radius + ) + + # Collect DB files + train_db_files = self._collect_db_files( + data_dir / "s2ef" / train_split / "train" + ) + val_db_files: list[Path] = [] + for vs in val_splits: + val_dir = data_dir / "s2ef" / "all" / vs + val_db_files.extend(self._collect_db_files(val_dir)) + test_db_files: list[Path] = [] + if include_test: + test_dir = data_dir / "s2ef" / "all" / "test" + test_db_files = self._collect_db_files(test_dir) + + if not train_db_files: + raise RuntimeError( + f"No ASE DB files found in {data_dir}/s2ef/{train_split}/train. Preprocessing may have failed." + ) + + logger.info( + f"Loading {len(train_db_files)} train DBs, {len(val_db_files)} val DBs, {len(test_db_files)} test DBs" + ) + + ds = OC20ASEDBDataset( + train_db_paths=[str(p) for p in train_db_files], + val_db_paths=[str(p) for p in val_db_files], + test_db_paths=[str(p) for p in test_db_files], + max_neigh=max_neigh, + radius=radius, + dtype=dtype, + include_energy=True, + include_forces=True, + ) + + # Expose split_idx for fixed split handling downstream + # Already set inside the dataset; nothing else to do here. + return ds + + def _ensure_asedb_preprocessed( + self, + root: Path, + train_split: str, + val_splits: list[str], + include_test: bool, + max_neigh: int, + radius: float, + ) -> None: + """Ensure ASE DB files are preprocessed for the requested splits. + + Parameters + ---------- + root : Path + Root data directory containing the S2EF dataset. + train_split : str + Name of the training split (e.g. "200K"). + val_splits : list[str] + List of validation split names. + include_test : bool + Whether to ensure preprocessing for the test split. + max_neigh : int + Maximum number of neighbors per atom. + radius : float + Cutoff radius for neighbor search in Angstroms. 
+ + Returns + ------- + None + Performs preprocessing as a side-effect; no value is returned. + """ + # Train + train_dir = root / "s2ef" / train_split / "train" + if train_dir.exists() and needs_preprocessing(train_dir): + logger.info(f"Preprocessing {train_dir}") + preprocess_s2ef_split_ase( + data_path=train_dir, + out_path=train_dir, + num_workers=4, + max_neigh=max_neigh, + radius=radius, + ) + + # Validation + for val_split in val_splits: + val_dir = root / "s2ef" / "all" / val_split + if val_dir.exists() and needs_preprocessing(val_dir): + logger.info(f"Preprocessing {val_dir}") + preprocess_s2ef_split_ase( + data_path=val_dir, + out_path=val_dir, + num_workers=4, + max_neigh=max_neigh, + radius=radius, + ) + + # Test + if include_test: + test_dir = root / "s2ef" / "all" / "test" + if test_dir.exists() and needs_preprocessing(test_dir): + logger.info(f"Preprocessing {test_dir}") + preprocess_s2ef_split_ase( + data_path=test_dir, + out_path=test_dir, + num_workers=4, + max_neigh=max_neigh, + radius=radius, + ) + + def _collect_db_files(self, directory: Path) -> list[Path]: + """Collect all ASE DB files in directory. + + Parameters + ---------- + directory : Path + Directory to search. + + Returns + ------- + list[Path] + Sorted list of DB file paths. + """ + if not directory.exists(): + return [] + return sorted(directory.glob("*.db")) + + def get_data_dir(self) -> Path: + """Get data directory path. + + Returns + ------- + Path + Path to data directory. 
+ """ + return Path(super().get_data_dir()) diff --git a/topobench/data/loaders/graph/oc20_dataset_loader.py b/topobench/data/loaders/graph/oc20_dataset_loader.py index 08654f105..cb6d9c51b 100644 --- a/topobench/data/loaders/graph/oc20_dataset_loader.py +++ b/topobench/data/loaders/graph/oc20_dataset_loader.py @@ -7,36 +7,47 @@ - Train splits: 200K, 2M, 20M, all - Validation splits: val_id, val_ood_ads, val_ood_cat, val_ood_both (can aggregate) - Test split: test (can be optionally skipped with include_test=False) - - Automatic preprocessing from extxyz/txt to LMDB format -- IS2RE (Initial Structure to Relaxed Energy): Predict relaxed energy from initial structure - - Pre-split train/val/test datasets + - Automatic preprocessing from extxyz to ASE DB format -The LMDB backend is integrated directly to avoid external file dependencies. +The ASE DB backend with PyG conversion is used for efficient data loading. """ from __future__ import annotations import logging -import lzma + +# Added missing imports +import lzma # needed by _uncompress_xz import os -import pickle -import shutil +import pickle # needed to deserialize LMDB records +import shutil # needed by _uncompress_xz import tarfile import urllib.request from collections.abc import Iterator from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path -import lmdb import torch from omegaconf import DictConfig from torch_geometric.data import Data, Dataset from tqdm import tqdm +try: + import lmdb # LMDB backend + + HAS_LMDB = True +except ImportError: # optional dependency + lmdb = None # type: ignore + HAS_LMDB = False + from topobench.data.loaders.base import AbstractLoader + +# ASE DB fallback dataset +from topobench.data.loaders.graph.oc20_asedbs2ef_loader import OC20ASEDBDataset from topobench.data.preprocessor.oc20_s2ef_preprocessor import ( + HAS_ASE, needs_preprocessing, - preprocess_s2ef_dataset, + preprocess_s2ef_split_ase, ) logger = logging.getLogger(__name__) @@ -67,6 
+78,18 @@ def _uncompress_xz(file_path: str) -> str: + """Decompress .xz files. + + Parameters + ---------- + file_path : str + Path to file to decompress. + + Returns + ------- + str + Path to decompressed file. + """ if not file_path.endswith(".xz"): return file_path @@ -84,10 +107,29 @@ def _uncompress_xz(file_path: str) -> str: return file_path -def _download_and_extract(url: str, target_dir: Path) -> Path: +def _download_and_extract( + url: str, target_dir: Path, skip_if_extracted: bool = True +) -> Path: + """Download and extract a tar archive. + + Parameters + ---------- + url : str + URL to download from. + target_dir : Path + Directory to extract to. + skip_if_extracted : bool + If True, skip extraction if extracted files already exist (default: True). + + Returns + ------- + Path + Path to extracted directory. + """ target_dir.mkdir(parents=True, exist_ok=True) target_file = target_dir / os.path.basename(url) + # Download if needed if not target_file.exists(): logger.info(f"Downloading {url}...") with tqdm( @@ -100,7 +142,17 @@ def report(block_num, block_size, total_size): pbar.update(block_size) urllib.request.urlretrieve(url, target_file, reporthook=report) + else: + logger.info(f"Archive {target_file.name} already downloaded") + # Check if extraction is needed + # Look for extracted subdirectories (common pattern: archive extracts to a subdirectory) + extraction_marker = target_dir / ".extracted" + if skip_if_extracted and extraction_marker.exists(): + logger.info(f"Archive {target_file.name} already extracted, skipping") + return target_dir + + # Extract logger.info(f"Extracting {target_file.name}...") if str(target_file).endswith((".tar.gz", ".tgz")): with tarfile.open(target_file, "r:gz") as tar: @@ -111,6 +163,8 @@ def report(block_num, block_size, total_size): else: raise ValueError(f"Unsupported archive format: {target_file}") + # Mark as extracted + extraction_marker.touch() return target_dir @@ -216,30 +270,55 @@ def 
_download_and_prepare(self): def _download_s2ef(self): """Download S2EF train, validation, and test splits.""" # Download train split + # Check for the actual data directory structure: s2ef/{split}/s2ef_train_{split}/s2ef_train_{split}/ train_url = S2EF_TRAIN_SPLITS[self.train_split] - train_dir = self.root / "s2ef" / self.train_split / "train" + train_subdir_name = f"s2ef_train_{self.train_split}" + train_dir = ( + self.root + / "s2ef" + / self.train_split + / train_subdir_name + / train_subdir_name + ) if not train_dir.exists(): logger.info(f"Downloading S2EF train split: {self.train_split}") _download_and_extract( train_url, self.root / "s2ef" / self.train_split ) self._decompress_xz_files(self.root / "s2ef" / self.train_split) + else: + logger.info( + f"S2EF train split {self.train_split} already exists, skipping download" + ) # Download validation splits for val_split in self.val_splits: val_url = S2EF_VAL_SPLITS[val_split] - val_dir = self.root / "s2ef" / "all" / val_split + # Check for the actual data directory structure: s2ef/all/s2ef_{val_split}/s2ef_{val_split}/ + val_subdir_name = f"s2ef_{val_split}" + val_dir = ( + self.root / "s2ef" / "all" / val_subdir_name / val_subdir_name + ) if not val_dir.exists(): logger.info(f"Downloading S2EF validation split: {val_split}") _download_and_extract(val_url, self.root / "s2ef" / "all") self._decompress_xz_files(self.root / "s2ef" / "all") + else: + logger.info( + f"S2EF validation split {val_split} already exists, skipping download" + ) # Download test split - test_dir = self.root / "s2ef" / "all" / "test" + test_subdir_name = "s2ef_test" + test_dir = ( + self.root / "s2ef" / "all" / test_subdir_name / test_subdir_name + ) if self.include_test and not test_dir.exists(): logger.info("Downloading S2EF test split") _download_and_extract(S2EF_TEST_SPLIT, self.root / "s2ef" / "all") self._decompress_xz_files(self.root / "s2ef" / "all") + elif self.include_test and test_dir.exists(): + logger.info("S2EF test split 
already exists, skipping download") elif not self.include_test: logger.info( "Skipping S2EF test split download (include_test=False); will reuse validation as test" @@ -249,41 +328,82 @@ def _download_s2ef(self): self._preprocess_s2ef() def _preprocess_s2ef(self): - """Preprocess S2EF data from extxyz/txt to LMDB format if needed.""" - # Check if any split needs preprocessing - train_dir = self.root / "s2ef" / self.train_split / "train" - needs_any_preprocessing = needs_preprocessing(train_dir, train_dir) + """Preprocess S2EF data from extxyz/txt files to ASE DB format. - if not needs_any_preprocessing: - for val_split in self.val_splits: - val_dir = self.root / "s2ef" / "all" / val_split - if needs_preprocessing(val_dir, val_dir): - needs_any_preprocessing = True - break - - if not needs_any_preprocessing and self.include_test: - test_dir = self.root / "s2ef" / "all" / "test" - needs_any_preprocessing = needs_preprocessing(test_dir, test_dir) - - if needs_any_preprocessing: - logger.info( - "S2EF data needs preprocessing from extxyz/txt to LMDB format" + This method checks for raw extxyz files and converts them to ASE DB files + for efficient loading. It processes train, validation, and test splits. + """ + if not HAS_ASE: + logger.warning("ASE not available. 
Cannot preprocess S2EF data.") + return + + s2ef_root = self.root / "s2ef" + + if not s2ef_root.exists(): + logger.warning(f"S2EF data directory not found: {s2ef_root}") + return + + # Get preprocessing parameters (use defaults since they're not available) + num_workers = 1 + max_neigh = 50 + radius = 6.0 + + # Process training data + # The actual data is in s2ef/{train_split}/s2ef_train_{train_split}/s2ef_train_{train_split}/ + train_base = s2ef_root / self.train_split + train_subdir_name = f"s2ef_train_{self.train_split}" + train_dir = train_base / train_subdir_name / train_subdir_name + + if train_dir.exists() and needs_preprocessing(train_dir): + logger.info(f"Preprocessing S2EF training data: {train_dir}") + preprocess_s2ef_split_ase( + data_path=train_dir, + out_path=train_dir, + num_workers=num_workers, + ref_energy=True, + test_data=False, + max_neigh=max_neigh, + radius=radius, ) - try: - preprocess_s2ef_dataset( - root=self.root, - train_split=self.train_split, - val_splits=self.val_splits, - include_test=self.include_test, + + # Process validation splits + # The actual data is in s2ef/all/s2ef_{val_split}/s2ef_{val_split}/ + for val_split in self.val_splits: + val_base = s2ef_root / "all" + val_subdir_name = f"s2ef_{val_split}" + val_dir = val_base / val_subdir_name / val_subdir_name + + if val_dir.exists() and needs_preprocessing(val_dir): + logger.info(f"Preprocessing S2EF validation data: {val_dir}") + preprocess_s2ef_split_ase( + data_path=val_dir, + out_path=val_dir, + num_workers=num_workers, + ref_energy=True, + test_data=False, + max_neigh=max_neigh, + radius=radius, ) - except ImportError: - logger.error( - "Cannot preprocess S2EF data: fairchem-core or ASE not installed. 
" - "Install with: pip install fairchem-core ase" + + # Process test split if needed + if self.include_test: + test_base = s2ef_root / "all" + test_subdir_name = "s2ef_test" + test_dir = test_base / test_subdir_name / test_subdir_name + + if test_dir.exists() and needs_preprocessing(test_dir): + logger.info(f"Preprocessing S2EF test data: {test_dir}") + preprocess_s2ef_split_ase( + data_path=test_dir, + out_path=test_dir, + num_workers=num_workers, + ref_energy=False, # Test data typically doesn't have energy/forces + test_data=True, + max_neigh=max_neigh, + radius=radius, ) - raise - else: - logger.info("S2EF data already preprocessed (LMDB files found)") + + logger.info("S2EF preprocessing complete") def _decompress_xz_files(self, directory: Path): """Decompress all .xz files in a directory.""" @@ -569,12 +689,20 @@ class OC20DatasetLoader(AbstractLoader): - download: whether to download (default: false) - legacy_format: whether to use legacy PyG Data format (default: false) - dtype: torch dtype (default: "float32") + - max_samples: limit dataset size for fast experimentation (default: None = all samples) """ def __init__(self, parameters: DictConfig) -> None: super().__init__(parameters) def load_dataset(self) -> Dataset: + """Load OC20 dataset (S2EF or IS2RE). + + Returns + ------- + Dataset + Loaded dataset with appropriate splits. 
+ """ task: str = getattr(self.parameters, "task", "s2ef") download: bool = bool(getattr(self.parameters, "download", False)) legacy_format: bool = bool( @@ -584,6 +712,12 @@ def load_dataset(self) -> Dataset: dtype_t = ( getattr(torch, str(dtype)) if isinstance(dtype, str) else dtype ) + max_samples = getattr(self.parameters, "max_samples", None) + if max_samples is not None: + max_samples = int(max_samples) + print( + f"⚠️ Limiting dataset to {max_samples} samples for fast experimentation" + ) if task == "s2ef": train_split = getattr(self.parameters, "train_split", "200K") @@ -615,6 +749,61 @@ def load_dataset(self) -> Dataset: legacy_format=legacy_format, ) + # ASE DB fallback if LMDBs are not present + data_root = Path(self.get_data_dir()) + lmdb_present = any((data_root / "s2ef").glob("**/*.lmdb")) + if not lmdb_present and HAS_ASE: + # Preprocessing is already done in _OC20LMDBDataset if needed + # Now collect DB files + train_subdir_name = f"s2ef_train_{train_split}" + train_dir = ( + data_root + / "s2ef" + / train_split + / train_subdir_name + / train_subdir_name + ) + train_dbs = sorted(train_dir.glob("*.db")) + val_dbs = [] + # Respect empty list for val_splits (for fast prototyping) + val_splits_to_use = ( + list(S2EF_VAL_SPLITS.keys()) + if val_splits is None + else val_splits + ) + for vs in val_splits_to_use: + val_subdir_name = f"s2ef_{vs}" + val_dir = ( + data_root + / "s2ef" + / "all" + / val_subdir_name + / val_subdir_name + ) + val_dbs.extend(sorted(val_dir.glob("*.db"))) + test_dbs = [] + if include_test: + test_dbs = sorted( + (data_root / "s2ef" / "all" / "test").glob("*.db") + ) + + if train_dbs: + logger.info( + f"Using ASE DB backend: {len(train_dbs)} train, {len(val_dbs)} val, {len(test_dbs)} test DB files" + ) + return OC20ASEDBDataset( + train_db_paths=[str(p) for p in train_dbs], + val_db_paths=[str(p) for p in val_dbs], + test_db_paths=[str(p) for p in test_dbs], + max_neigh=int( + getattr(self.parameters, "max_neigh", 50) + ), + 
radius=float(getattr(self.parameters, "radius", 6.0)), + dtype=dtype_t, + include_energy=True, + include_forces=True, + max_samples=max_samples, + ) elif task in ["is2re", "oc22_is2re"]: ds = _OC20LMDBDataset( root=self.get_data_dir(), @@ -631,5 +820,12 @@ def load_dataset(self) -> Dataset: return ds # type: ignore[return-value] def get_data_dir(self) -> Path: + """Get data directory path. + + Returns + ------- + Path + Path to data directory. + """ # Keep default directory convention for TopoBench return Path(super().get_data_dir()) diff --git a/topobench/data/preprocessor/oc20_s2ef_preprocessor.py b/topobench/data/preprocessor/oc20_s2ef_preprocessor.py index 087c3952f..27626a2d8 100644 --- a/topobench/data/preprocessor/oc20_s2ef_preprocessor.py +++ b/topobench/data/preprocessor/oc20_s2ef_preprocessor.py @@ -1,339 +1,605 @@ -"""S2EF preprocessing, adapted from OC20's preprocess_ef.py. ------------ +"""S2EF preprocessing for OC20/OC22 datasets. + +Creates ASE DB files with extracted graph features from provided *.extxyz files +for the S2EF task. + Copyright (c) Meta Platforms, Inc. and affiliates. This source code is licensed under the MIT license found in the LICENSE file in the root directory of this source tree. - -Creates LMDB files with extracted graph features from provided *.extxyz files -for the S2EF task. 
""" from __future__ import annotations import glob import logging -import multiprocessing as mp -import os -import pickle +import sys +from concurrent.futures import ProcessPoolExecutor, as_completed from pathlib import Path -import lmdb import numpy as np import torch +from torch_geometric.data import Data from tqdm import tqdm logger = logging.getLogger(__name__) -# Try importing ASE and fairchem dependencies +# Try importing ASE try: + import ase.db import ase.io - from fairchem.core.preprocessing import AtomsToGraphs + from ase.atoms import Atoms - HAS_FAIRCHEM = True + HAS_ASE = True except ImportError: - HAS_FAIRCHEM = False + HAS_ASE = False logger.warning( - "fairchem-core or ASE not installed. S2EF preprocessing will not be available. " - "Install with: pip install fairchem-core ase" + "ASE not installed. S2EF preprocessing will not be available. " + "Install with: pip install ase" ) +# Try importing pymatgen for neighbor search +try: + from pymatgen.io.ase import AseAtomsAdaptor -def _write_images_to_lmdb(mp_arg): - """Write trajectory frames to LMDB (worker function).""" - if not HAS_FAIRCHEM: - raise ImportError("fairchem-core is required for S2EF preprocessing") - - ( - a2g, - db_path, - samples, - sampled_ids, - idx, - pid, - data_path, - ref_energy, - test_data, - get_edges, - ) = mp_arg - - db = lmdb.open( - db_path, - map_size=1099511627776 * 2, - subdir=False, - meminit=False, - map_async=True, + HAS_PYMATGEN = True +except ImportError: + HAS_PYMATGEN = False + logger.warning( + "pymatgen not installed. Will use slower ASE neighbor search. " + "Install with: pip install pymatgen" ) - pbar = tqdm( - total=sum(1 for s in samples for line in open(s)), # noqa: SIM115 - position=pid, - desc=f"Worker {pid} preprocessing", - leave=False, - ) - for sample in samples: - with open(sample) as fp: - traj_logs = fp.read().splitlines() +class AtomsToGraphs: + """Convert ASE Atoms objects to PyTorch Geometric Data objects. 
- xyz_idx = os.path.splitext(os.path.basename(sample))[0] - traj_path = os.path.join(data_path, f"{xyz_idx}.extxyz") + This class handles periodic boundary conditions and creates graph representations + suitable for machine learning on atomic structures. - if not os.path.exists(traj_path): - logger.warning(f"Missing extxyz file: {traj_path}, skipping") - continue + Parameters + ---------- + max_neigh : int + Maximum number of neighbors to consider per atom. + radius : float + Cutoff radius in Angstroms for neighbor search. + r_energy : bool + Whether to include energy in the created Data objects. + r_forces : bool + Whether to include forces in the created Data objects. + r_distances : bool + Whether to include edge distances as edge attributes. + r_edges : bool + Whether to compute edges (can be disabled for debugging). + r_fixed : bool + Whether to include fixed atom flags. + """ - traj_frames = ase.io.read(traj_path, ":") + def __init__( + self, + max_neigh: int = 50, + radius: float = 6.0, + r_energy: bool = True, + r_forces: bool = True, + r_distances: bool = True, + r_edges: bool = True, + r_fixed: bool = True, + ): + self.max_neigh = max_neigh + self.radius = radius + self.r_energy = r_energy + self.r_forces = r_forces + self.r_distances = r_distances + self.r_fixed = r_fixed + self.r_edges = r_edges + + def _get_neighbors_pymatgen(self, atoms: Atoms): + """Get neighbors using pymatgen (faster for periodic systems). + + Parameters + ---------- + atoms : ase.atoms.Atoms + ASE Atoms object for which to compute neighbor lists. + + Returns + ------- + tuple + Tuple (c_index, n_index, n_distance, offsets) representing neighbor + center indices, neighbor indices, distances and periodic offsets. 
+ """ + if not HAS_PYMATGEN: + return self._get_neighbors_ase(atoms) + + struct = AseAtomsAdaptor.get_structure(atoms) + _c_index, _n_index, _offsets, n_distance = struct.get_neighbor_list( + r=self.radius, numerical_tol=0, exclude_self=True + ) - for i, frame in enumerate(traj_frames): - if i >= len(traj_logs): - logger.warning( - f"Log mismatch for {traj_path} frame {i}, skipping" - ) - continue - - frame_log = traj_logs[i].split(",") - sid = int(frame_log[0].split("random")[1]) - fid = int(frame_log[1].split("frame")[1]) - - data_object = a2g.convert(frame) - data_object.tags = torch.LongTensor(frame.get_tags()) - data_object.sid = sid - data_object.fid = fid - - # Subtract off reference energy if needed - if ref_energy and not test_data and len(frame_log) > 2: - ref_energy_val = float(frame_log[2]) - data_object.energy -= ref_energy_val - - txn = db.begin(write=True) - txn.put( - f"{idx}".encode("ascii"), - pickle.dumps(data_object, protocol=-1), - ) - txn.commit() - idx += 1 - sampled_ids.append(",".join(frame_log[:2]) + "\n") - pbar.update(1) + # Limit to max_neigh neighbors per atom, sorted by distance + _nonmax_idx = [] + for i in range(len(atoms)): + idx_i = (_c_index == i).nonzero()[0] + idx_sorted = np.argsort(n_distance[idx_i])[: self.max_neigh] + _nonmax_idx.append(idx_i[idx_sorted]) + _nonmax_idx = np.concatenate(_nonmax_idx) + + _c_index = _c_index[_nonmax_idx] + _n_index = _n_index[_nonmax_idx] + n_distance = n_distance[_nonmax_idx] + _offsets = _offsets[_nonmax_idx] + + return _c_index, _n_index, n_distance, _offsets + + def _get_neighbors_ase(self, atoms: Atoms): + """Get neighbors using ASE (slower but always available). + + Parameters + ---------- + atoms : ase.atoms.Atoms + ASE Atoms object for which to compute neighbor lists. + + Returns + ------- + tuple + Tuple (idx_i, idx_j, distances, offsets) representing neighbor + center indices, neighbor indices, distances and periodic offsets. 
+ """ + from ase.neighborlist import neighbor_list + + idx_i, idx_j, idx_S, distances = neighbor_list( + "ijSd", atoms, self.radius, self_interaction=False + ) - # Save count of objects in lmdb - txn = db.begin(write=True) - txn.put("length".encode("ascii"), pickle.dumps(idx, protocol=-1)) - txn.commit() + # Limit to max_neigh neighbors per atom + _nonmax_idx = [] + for i in range(len(atoms)): + mask = idx_i == i + dists_i = distances[mask] + idx_sorted = np.argsort(dists_i)[: self.max_neigh] + _nonmax_idx.append(np.where(mask)[0][idx_sorted]) + _nonmax_idx = np.concatenate(_nonmax_idx) + + return ( + idx_i[_nonmax_idx], + idx_j[_nonmax_idx], + distances[_nonmax_idx], + idx_S[_nonmax_idx], + ) - db.sync() - db.close() - pbar.close() + def _reshape_features(self, c_index, n_index, n_distance, offsets): + """Convert neighbor info to PyTorch tensors. + + Parameters + ---------- + c_index : array-like + Center atom indices for edges. + n_index : array-like + Neighbor atom indices for edges. + n_distance : array-like + Distances between center and neighbor atoms. + offsets : array-like + Periodic cell offsets corresponding to edges. + + Returns + ------- + tuple + (edge_index, edge_distances, cell_offsets) as PyTorch tensors. + """ + edge_index = torch.LongTensor(np.vstack((n_index, c_index))) + edge_distances = torch.FloatTensor(n_distance) + cell_offsets = torch.LongTensor(offsets) + + # Remove very small distances (self-interactions that slipped through) + nonzero = torch.where(edge_distances >= 1e-8)[0] + edge_index = edge_index[:, nonzero] + edge_distances = edge_distances[nonzero] + cell_offsets = cell_offsets[nonzero] + + return edge_index, edge_distances, cell_offsets + + def convert(self, atoms: Atoms) -> Data: + """Convert a single ASE Atoms object to a PyG Data object. + + Parameters + ---------- + atoms : ase.atoms.Atoms + ASE Atoms object with positions, atomic numbers, cell, etc. 
+ + Returns + ------- + torch_geometric.data.Data + PyG Data object containing node features, positions, edges, and + optional energy/forces/fixed flags. + """ + # Basic atomic structure info + atomic_numbers = torch.LongTensor(atoms.get_atomic_numbers()) + positions = torch.FloatTensor(atoms.get_positions()) + cell = torch.FloatTensor(np.array(atoms.get_cell())).view(1, 3, 3) + natoms = len(atoms) + + # Create base data object + # Create node features from atomic numbers (one-hot or simple embedding) + # For now, use atomic numbers as features (can be enhanced later) + node_features = atomic_numbers.unsqueeze(1).float() + + data = Data( + cell=cell, + pos=positions, + atomic_numbers=atomic_numbers, + natoms=natoms, + z=atomic_numbers, # Alias for compatibility + x=node_features, # Add node features for compatibility with transforms + ) - return sampled_ids, idx + # Add edges if requested + if self.r_edges: + if HAS_PYMATGEN: + split_idx_dist = self._get_neighbors_pymatgen(atoms) + else: + split_idx_dist = self._get_neighbors_ase(atoms) + edge_index, edge_distances, cell_offsets = self._reshape_features( + *split_idx_dist + ) + data.edge_index = edge_index + data.cell_offsets = cell_offsets + + if self.r_distances: + data.distances = edge_distances + data.edge_attr = edge_distances.view( + -1, 1 + ) # For compatibility + + # Add energy if available and requested + if self.r_energy: + try: + energy = atoms.get_potential_energy(apply_constraint=False) + data.y = torch.FloatTensor([energy]) + data.energy = torch.FloatTensor([energy]) + except Exception: + # Energy not available (e.g., no calculator) + pass + + # Add forces if available and requested + if self.r_forces: + try: + forces = atoms.get_forces(apply_constraint=False) + data.force = torch.FloatTensor(forces) + except Exception: + # Forces not available + pass + + # Add fixed atom flags if requested + if self.r_fixed: + fixed_idx = torch.zeros(natoms, dtype=torch.float32) + if hasattr(atoms, "constraints"): + 
from ase.constraints import FixAtoms + + for constraint in atoms.constraints: + if isinstance(constraint, FixAtoms): + fixed_idx[constraint.index] = 1 + data.fixed = fixed_idx + + # Add metadata from atoms.info if present + if hasattr(atoms, "info") and atoms.info: + info_data = atoms.info.get("data", {}) + if "sid" in info_data: + data.sid = info_data["sid"] + if "fid" in info_data: + data.fid = info_data["fid"] + if "ref_energy" in info_data: + data.ref_energy = torch.FloatTensor([info_data["ref_energy"]]) + + return data + + +class S2EFPreprocessor: + """Preprocessor for S2EF data using ASE database format. + + This class handles conversion from extxyz files to ASE database format. + """ + def write_db( + self, + extxyz_paths: list[str | Path | list[str | Path]], + dbs: dict[str, ase.db.core.Database], + map_file_to_log: dict[str, str | None] | None = None, + num_workers: int = 1, + batch_size: int = 100, + ): + """Write ASE DB files from extxyz inputs. + + This stores for each extended XYZ file the Atoms objects and the + metadata from the corresponding xyz_log file into the ASE Database. + + Parameters + ---------- + extxyz_paths : list[str | Path | list[str | Path]] + Paths to the extended XYZ files or lists of paths per DB. + dbs : dict[str, ase.db.core.Database] + Mapping from extxyz file keys to ASE Database objects. + map_file_to_log : dict[str, str | None] | None + Mapping from extxyz paths to xyz_log paths. If None, no metadata is used. + num_workers : int, optional + Number of worker processes to use, by default 1. + batch_size : int, optional + Number of structures to write to the ASE DB file at a time, by default 100. + + Returns + ------- + None + This function writes DB files as a side effect. 
+ """ + if not HAS_ASE: + raise ImportError("ASE is required for S2EF preprocessing") + + if map_file_to_log is None: + map_file_to_log = {k: None for k in dbs} + + num_workers = min(num_workers, len(extxyz_paths)) + node_counts = [] # Track node counts for each structure + + with ProcessPoolExecutor(max_workers=num_workers) as executor: + futures = [ + executor.submit( + self._write_db_worker, + file_path, + map_file_to_log[db_path], + dbs[db_path], + batch_size, + ) + for file_path, db_path in zip( + extxyz_paths, + map_file_to_log.keys(), + strict=True, + ) + ] + for future in tqdm( + as_completed(futures), + desc="Writing DBs", + total=len(futures), + leave=False, + file=sys.stdout, + dynamic_ncols=True, + ): + db_path, num_atoms, structure_node_counts = future.result() + node_counts.extend(structure_node_counts) + + # Save node counts + node_counts_path = ( + Path(list(dbs.keys())[0]).parent.parent / "node_counts.npy" + ) + np.save(node_counts_path, np.array(node_counts)) + logger.info( + f"Saved {len(node_counts)} node counts to {node_counts_path}" + ) -def preprocess_s2ef_split( + @staticmethod + def _write_db_worker( + file_paths: str | Path | list[str | Path], + xyz_log_path: str | Path | None, + db: ase.db.core.Database, + batch_size: int, + ): + """Worker function to write atoms to ASE database. + + Parameters + ---------- + file_paths : str | Path | list[str | Path] + Path or list of paths to extended XYZ file(s). + xyz_log_path : str | Path | None + Path to the log file with metadata or None. + db : ase.db.core.Database + ASE database object to write to. + batch_size : int + Number of structures to batch together. + + Returns + ------- + tuple + (db_path, total_atoms, node_counts). 
+ """ + if xyz_log_path is not None: + with open(xyz_log_path) as f: + xyz_log = f.read().splitlines() + else: + xyz_log = None + + if isinstance(file_paths, (str, Path)): + file_paths = [file_paths] + + node_counts = [] # Track node counts for each structure + total_atoms = 0 + + for file_path in file_paths: + atoms_list = ase.io.read(file_path, ":") + atoms_batch = [] + log_batch = [] + for i, atoms in enumerate(atoms_list): + if xyz_log is not None and i < len(xyz_log): + log_line = xyz_log[i].split(",") + log_info = { + "sid": int(log_line[0].split("random")[1]), + "fid": int(log_line[1].split("frame")[1]), + "ref_energy": float(log_line[2]), + } + else: + log_info = {} + + atoms_batch.append(atoms) + log_batch.append(log_info) + node_counts.append(len(atoms)) + + if len(atoms_batch) == batch_size: + # Write batch to database + for atom, log in zip(atoms_batch, log_batch, strict=True): + db.write(atom, data=log) + atoms_batch, log_batch = [], [] + + total_atoms += len(atoms_list) + + # Write remaining batch + if atoms_batch: + for atom, log in zip(atoms_batch, log_batch, strict=True): + db.write(atom, data=log) + + db_path = db.filename if hasattr(db, "filename") else str(db) + return db_path, total_atoms, node_counts + + +def needs_preprocessing(data_path: Path) -> bool: + """Check if data needs preprocessing (has extxyz but no db files).""" + has_extxyz = bool(list(data_path.glob("*.extxyz"))) + has_db = bool(list(data_path.glob("*.db"))) + return has_extxyz and not has_db + + +def preprocess_s2ef_split_ase( data_path: Path, out_path: Path, num_workers: int = 1, ref_energy: bool = True, test_data: bool = False, - get_edges: bool = False, + max_neigh: int = 50, + radius: float = 6.0, ) -> None: - """Preprocess S2EF data from extxyz/txt to LMDB format. + """Preprocess S2EF data from extxyz/txt to ASE DB format. Parameters ---------- data_path : Path Path to directory containing *.extxyz and *.txt files. out_path : Path - Directory to save LMDB files. 
+ Directory to save ASE DB files. num_workers : int Number of parallel workers for preprocessing. ref_energy : bool - Whether to subtract reference energies. + Whether to include reference energies in metadata. test_data : bool - Whether this is test data (no energy/forces). - get_edges : bool - Whether to precompute and store edge indices (~10x storage). - """ - if not HAS_FAIRCHEM: - raise ImportError( - "fairchem-core and ASE are required for S2EF preprocessing. " - "Install with: pip install fairchem-core ase" - ) + Whether this is test data (no energy/forces in log). + max_neigh : int + Maximum number of neighbors per atom. + radius : float + Cutoff radius for neighbor search in Angstroms. - logger.info(f"Preprocessing S2EF data from {data_path} to {out_path}") + Returns + ------- + None + This function writes ASE DB files as a side effect. + """ + if not HAS_ASE: + raise ImportError("ASE is required for S2EF preprocessing") - # Find all txt files - xyz_logs = glob.glob(str(data_path / "*.txt")) - if not xyz_logs: - raise RuntimeError(f"No *.txt files found in {data_path}") + logger.info( + f"Preprocessing S2EF data from {data_path} to {out_path} (ASE DB format)" + ) - num_workers = min(num_workers, len(xyz_logs)) + # Find all extxyz files + extxyz_files = sorted(glob.glob(str(data_path / "*.extxyz"))) - # Initialize feature extractor - a2g = AtomsToGraphs( - max_neigh=50, - radius=6, - r_energy=not test_data, - r_forces=not test_data, - r_fixed=True, - r_distances=False, - r_edges=get_edges, - ) + if not extxyz_files: + logger.warning(f"No extxyz files found in {data_path}") + return - # Create output directory out_path.mkdir(parents=True, exist_ok=True) - # Initialize LMDB paths - db_paths = [ - str(out_path / f"data.{i:04d}.lmdb") for i in range(num_workers) - ] - - # Chunk trajectories into workers - chunked_txt_files = np.array_split(xyz_logs, num_workers) - - # Extract features in parallel - sampled_ids = [[]] * num_workers - idx = [0] * num_workers - - 
logger.info(f"Starting preprocessing with {num_workers} workers...") - - with mp.Pool(num_workers) as pool: - mp_args = [ - ( - a2g, - db_paths[i], - chunked_txt_files[i], - sampled_ids[i], - idx[i], - i, - str(data_path), - ref_energy, - test_data, - get_edges, - ) - for i in range(num_workers) - ] - op = list( - zip(*pool.imap(_write_images_to_lmdb, mp_args), strict=False) - ) - sampled_ids, idx = list(op[0]), list(op[1]) + # Create mapping from extxyz to log files, but only for files that need preprocessing + map_file_to_log = {} + dbs = {} + skipped_count = 0 + + for extxyz_file in extxyz_files: + extxyz_path = Path(extxyz_file) + base_name = extxyz_path.stem + + # Check if DB file already exists + db_path = out_path / f"{base_name}.db" + if db_path.exists(): + skipped_count += 1 + continue # Skip files that already have DB files + + # Find corresponding txt file + txt_path = data_path / f"{base_name}.txt" + if txt_path.exists() and ref_energy and not test_data: + map_file_to_log[str(extxyz_path)] = str(txt_path) + else: + map_file_to_log[str(extxyz_path)] = None - # Write logs - for j, i in enumerate(range(num_workers)): - log_path = out_path / f"data_log.{i:04d}.txt" - with open(log_path, "w") as ids_log: - ids_log.writelines(sampled_ids[j]) + # Create ASE DB for this file + dbs[str(extxyz_path)] = ase.db.connect(str(db_path)) - total_samples = sum(idx) - logger.info( - f"Preprocessing complete: {total_samples} samples written to {out_path}" + if skipped_count > 0: + logger.info( + f"Skipping {skipped_count} files that already have DB files" + ) + + if not map_file_to_log: + logger.info("All files already preprocessed, skipping...") + return + + # Write to ASE databases + preprocessor = S2EFPreprocessor() + preprocessor.write_db( + extxyz_paths=list(map_file_to_log.keys()), + dbs=dbs, + map_file_to_log=map_file_to_log, + num_workers=num_workers, + batch_size=100, ) + logger.info(f"Preprocessing complete. 
DB files saved to {out_path}") + -def needs_preprocessing(raw_dir: Path, processed_dir: Path) -> bool: - """Check if a split needs preprocessing. +def preprocess_s2ef_dataset_ase( + data_path: Path, + num_workers: int = 1, + splits: list[str] | None = None, + max_neigh: int = 50, + radius: float = 6.0, +) -> None: + """Preprocess entire S2EF dataset with train/val/test splits. Parameters ---------- - raw_dir : Path - Directory containing raw extxyz/txt files. - processed_dir : Path - Directory where LMDB files should be. + data_path : Path + Root path containing train/val/test subdirectories. + num_workers : int + Number of parallel workers. + splits : Optional[list[str]] + List of splits to process. If None, processes all found splits. + max_neigh : int + Maximum number of neighbors per atom. + radius : float + Cutoff radius for neighbor search in Angstroms. Returns ------- - bool - True if preprocessing is needed. + None + This function writes ASE DB files as a side effect. """ - if not raw_dir.exists(): - return False - - # Check if processed directory has LMDB files - if not processed_dir.exists(): - return True + if not HAS_ASE: + raise ImportError("ASE is required for S2EF preprocessing") - lmdb_files = list(processed_dir.glob("*.lmdb")) - return len(lmdb_files) == 0 + if splits is None: + # Auto-detect splits + splits = [d.name for d in data_path.iterdir() if d.is_dir()] + for split in splits: + split_path = data_path / split + if not split_path.exists(): + logger.warning(f"Split {split} not found at {split_path}") + continue -def preprocess_s2ef_dataset( - root: Path, - train_split: str, - val_splits: list[str], - include_test: bool = True, - num_workers: int | None = None, -) -> None: - """Preprocess entire S2EF dataset (train/val/test splits). - - Parameters - ---------- - root : Path - Root directory containing S2EF data. - train_split : str - Train split name (e.g., "200K"). - val_splits : list[str] - List of validation split names. 
- include_test : bool - Whether to preprocess test split. - num_workers : Optional[int] - Number of parallel workers (default: CPU count - 1). - """ - if not HAS_FAIRCHEM: - raise ImportError( - "fairchem-core and ASE are required for S2EF preprocessing. " - "Install with: pip install fairchem-core ase" - ) - - if num_workers is None: - num_workers = max(1, mp.cpu_count() - 1) - - s2ef_root = root / "s2ef" - - # Preprocess train split - train_raw = s2ef_root / train_split / "train" - train_processed = train_raw # Store LMDBs alongside raw data + logger.info(f"Processing split: {split}") + is_test = "test" in split.lower() - if needs_preprocessing(train_raw, train_processed): - logger.info(f"Preprocessing train split: {train_split}") - preprocess_s2ef_split( - train_raw, - train_processed, + preprocess_s2ef_split_ase( + data_path=split_path, + out_path=split_path, num_workers=num_workers, ref_energy=True, - test_data=False, + test_data=is_test, + max_neigh=max_neigh, + radius=radius, ) - else: - logger.info(f"Train split {train_split} already preprocessed") - - # Preprocess validation splits - for val_split in val_splits: - val_raw = s2ef_root / "all" / val_split - val_processed = val_raw - - if needs_preprocessing(val_raw, val_processed): - logger.info(f"Preprocessing validation split: {val_split}") - preprocess_s2ef_split( - val_raw, - val_processed, - num_workers=num_workers, - ref_energy=True, - test_data=False, - ) - else: - logger.info(f"Validation split {val_split} already preprocessed") - - # Preprocess test split - if include_test: - test_raw = s2ef_root / "all" / "test" - test_processed = test_raw - - if needs_preprocessing(test_raw, test_processed): - logger.info("Preprocessing test split") - preprocess_s2ef_split( - test_raw, - test_processed, - num_workers=num_workers, - ref_energy=False, - test_data=True, - ) - else: - logger.info("Test split already preprocessed") + + logger.info("Dataset preprocessing complete") diff --git 
a/topobench/data/preprocessor/preprocessor.py b/topobench/data/preprocessor/preprocessor.py index e5c4a913a..2f4f4b41d 100644 --- a/topobench/data/preprocessor/preprocessor.py +++ b/topobench/data/preprocessor/preprocessor.py @@ -34,6 +34,10 @@ class PreProcessor(torch_geometric.data.InMemoryDataset): def __init__(self, dataset, data_dir, transforms_config=None, **kwargs): self.dataset = dataset + self._skip_processing = ( + False # Flag to skip processing for no-transform case + ) + if transforms_config is not None: self.transforms_applied = True pre_transform = self.instantiate_pre_transform( @@ -49,19 +53,56 @@ def __init__(self, dataset, data_dir, transforms_config=None, **kwargs): self.load(self.processed_paths[0]) self.data_list = [data for data in self] else: + print( + "No transforms to apply, using dataset directly (skipping processing)..." + ) self.transforms_applied = False + self._skip_processing = True # Skip parent class processing + + # Call parent init but it should skip processing super().__init__(data_dir, None, None, **kwargs) + self.transform = ( dataset.transform if hasattr(dataset, "transform") else None ) + # Directly use the dataset's data and slices self.data, self.slices = dataset._data, dataset.slices - self.data_list = [data for data in dataset] + # Make data_list creation lazy to avoid loading large datasets into memory + self._data_list = None # Some datasets have fixed splits, and those are stored as split_idx during loading # We need to store this information to be able to reproduce the splits afterwards if hasattr(dataset, "split_idx"): self.split_idx = dataset.split_idx + @property + def data_list(self): + """Lazy loading of data_list to avoid loading large datasets into memory. + + Returns + ------- + list + List of data objects when transforms are not applied; otherwise the processed data list. 
+ """ + if not self.transforms_applied and self._data_list is None: + # Only create data_list when actually needed + print( + "Warning: Creating data_list from large dataset - this may take a while..." + ) + self._data_list = [data for data in self.dataset] + return self._data_list + + @data_list.setter + def data_list(self, value): + """Setter for data_list. + + Parameters + ---------- + value : list + New list of data objects to use as the dataset's data_list. + """ + self._data_list = value + @property def processed_dir(self) -> str: """Return the path to the processed directory. @@ -77,14 +118,17 @@ def processed_dir(self) -> str: return self.root + "/processed" @property - def processed_file_names(self) -> str: + def processed_file_names(self) -> str | list[str]: """Return the name of the processed file. Returns ------- - str - Name of the processed file. + str | list[str] + Name of the processed file, or empty list to skip processing. """ + # If no transforms, return empty list to skip processing check + if hasattr(self, "_skip_processing") and self._skip_processing: + return [] return "data.pt" def instantiate_pre_transform( @@ -179,26 +223,52 @@ def save_transform_parameters(self) -> None: ) def process(self) -> None: - """Method that processes the data.""" + """Method that processes the data. + + Returns + ------- + None + Writes processed data to disk as a side effect. + """ + from tqdm import tqdm + + print(f"Processing dataset with {len(self.dataset)} samples...") + if isinstance( self.dataset, (torch_geometric.data.Dataset, torch.utils.data.Dataset), ): - data_list = [data for data in self.dataset] + # Use tqdm to show progress for large datasets + if len(self.dataset) > 1000: + print( + f"Loading {len(self.dataset)} graphs (this may take a while)..." 
+ ) + data_list = [ + data for data in tqdm(self.dataset, desc="Loading graphs") + ] + else: + data_list = [data for data in self.dataset] elif isinstance(self.dataset, torch_geometric.data.Data): data_list = [self.dataset] - self.data_list = ( - [self.pre_transform(d) for d in data_list] - if self.pre_transform is not None - else data_list - ) + if self.pre_transform is not None: + print(f"Applying transforms to {len(data_list)} graphs...") + transformed_data_list = [ + self.pre_transform(d) + for d in tqdm(data_list, desc="Applying transforms") + ] + else: + transformed_data_list = data_list - self._data, self.slices = self.collate(self.data_list) - self._data_list = None # Reset cache. + print("Collating data...") + self._data, self.slices = self.collate(transformed_data_list) assert isinstance(self._data, torch_geometric.data.Data) - self.save(self.data_list, self.processed_paths[0]) + print(f"Saving processed data to {self.processed_paths[0]}...") + self.save(transformed_data_list, self.processed_paths[0]) + + # Reset cache after saving + self._data_list = None def load(self, path: str) -> None: r"""Load the dataset from the file path `path`. diff --git a/topobench/data/utils/split_utils.py b/topobench/data/utils/split_utils.py index f78994222..334e13462 100644 --- a/topobench/data/utils/split_utils.py +++ b/topobench/data/utils/split_utils.py @@ -5,6 +5,8 @@ import numpy as np import torch from sklearn.model_selection import StratifiedKFold +from torch.utils.data import Subset +from tqdm import tqdm from topobench.dataloader import DataloadDataset @@ -180,6 +182,52 @@ def random_splitting(labels, parameters, root=None, global_data_seed=42): return split_idx +def create_subset_splits(dataset, split_idx): + """Create dataset splits using PyTorch Subset (optimized for large datasets). + + This avoids loading all graphs into memory by using lazy indexing. + + Parameters + ---------- + dataset : torch_geometric.data.Dataset + Considered dataset. 
+ split_idx : dict + Dictionary containing the train, validation, and test indices. + + Returns + ------- + tuple: + Tuple containing the train, validation, and test datasets as Subsets. + """ + # Convert numpy arrays to lists if needed + train_indices = ( + split_idx["train"].tolist() + if hasattr(split_idx["train"], "tolist") + else list(split_idx["train"]) + ) + valid_indices = ( + split_idx["valid"].tolist() + if hasattr(split_idx["valid"], "tolist") + else list(split_idx["valid"]) + ) + test_indices = ( + split_idx["test"].tolist() + if hasattr(split_idx["test"], "tolist") + else list(split_idx["test"]) + ) + + print( + f"Creating subsets: train={len(train_indices)}, val={len(valid_indices)}, test={len(test_indices)}" + ) + + # Create subsets using lazy indexing + train_dataset = Subset(dataset, train_indices) + val_dataset = Subset(dataset, valid_indices) if valid_indices else None + test_dataset = Subset(dataset, test_indices) if test_indices else None + + return train_dataset, val_dataset, test_dataset + + def assign_train_val_test_mask_to_graphs(dataset, split_idx): """Split the graph dataset into train, validation, and test datasets. @@ -199,21 +247,30 @@ def assign_train_val_test_mask_to_graphs(dataset, split_idx): data_train_lst, data_val_lst, data_test_lst = [], [], [] # Assign masks directly by iterating over pre-split indices - for i in split_idx["train"]: + print(f"Creating train split with {len(split_idx['train'])} samples...") + for i in tqdm( + split_idx["train"], desc="Loading train graphs", leave=False + ): graph = dataset[i] graph.train_mask = torch.tensor([1], dtype=torch.long) graph.val_mask = torch.tensor([0], dtype=torch.long) graph.test_mask = torch.tensor([0], dtype=torch.long) data_train_lst.append(graph) - for i in split_idx["valid"]: + print( + f"Creating validation split with {len(split_idx['valid'])} samples..." 
+ ) + for i in tqdm( + split_idx["valid"], desc="Loading validation graphs", leave=False + ): graph = dataset[i] graph.train_mask = torch.tensor([0], dtype=torch.long) graph.val_mask = torch.tensor([1], dtype=torch.long) graph.test_mask = torch.tensor([0], dtype=torch.long) data_val_lst.append(graph) - for i in split_idx["test"]: + print(f"Creating test split with {len(split_idx['test'])} samples...") + for i in tqdm(split_idx["test"], desc="Loading test graphs", leave=False): graph = dataset[i] graph.train_mask = torch.tensor([0], dtype=torch.long) graph.val_mask = torch.tensor([0], dtype=torch.long) @@ -306,15 +363,6 @@ def load_inductive_splits(dataset, parameters): assert len(dataset) > 1, ( "Datasets should have more than one graph in an inductive setting." ) - # Check if labels are ragged (different sizes across graphs) - label_list = [data.y.squeeze(0).numpy() for data in dataset] - label_shapes = [label.shape for label in label_list] - # Use dtype=object only if labels have different shapes (ragged) - labels = ( - np.array(label_list, dtype=object) - if len(set(label_shapes)) > 1 - else np.array(label_list) - ) root = ( dataset.dataset.get_data_dir() @@ -322,27 +370,61 @@ def load_inductive_splits(dataset, parameters): else None ) - if parameters.split_type == "random": - split_idx = random_splitting(labels, parameters, root=root) - - elif parameters.split_type == "k-fold": - assert type(labels) is not object, ( - "K-Fold splitting not supported for ragged labels." 
+ # Check if we have fixed splits first (avoid loading all data) + if parameters.split_type == "fixed" and hasattr(dataset, "split_idx"): + print( + f"Using pre-computed fixed splits (train: {len(dataset.split_idx['train'])}, " + f"val: {len(dataset.split_idx['valid'])}, test: {len(dataset.split_idx['test'])})" ) - split_idx = k_fold_split(labels, parameters, root=root) - - elif parameters.split_type == "fixed" and hasattr(dataset, "split_idx"): split_idx = dataset.split_idx - else: - raise NotImplementedError( - f"split_type {parameters.split_type} not valid. Choose either 'random', 'k-fold' or 'fixed'.\ - If 'fixed' is chosen, the dataset should have the attribute split_idx" + # For random/k-fold splits, we need to extract labels + # Check if labels are ragged (different sizes across graphs) + print( + f"Extracting labels from {len(dataset)} graphs for split creation..." + ) + label_list = [ + data.y.squeeze(0).numpy() + for data in tqdm(dataset, desc="Extracting labels") + ] + label_shapes = [label.shape for label in label_list] + # Use dtype=object only if labels have different shapes (ragged) + labels = ( + np.array(label_list, dtype=object) + if len(set(label_shapes)) > 1 + else np.array(label_list) ) - train_dataset, val_dataset, test_dataset = ( - assign_train_val_test_mask_to_graphs(dataset, split_idx) - ) + if parameters.split_type == "random": + split_idx = random_splitting(labels, parameters, root=root) + + elif parameters.split_type == "k-fold": + assert type(labels) is not object, ( + "K-Fold splitting not supported for ragged labels." + ) + split_idx = k_fold_split(labels, parameters, root=root) + + else: + raise NotImplementedError( + f"split_type {parameters.split_type} not valid. 
Choose either 'random', 'k-fold' or 'fixed'.\ + If 'fixed' is chosen, the dataset should have the attribute split_idx" + ) + + # Use optimized subset-based splitting for large datasets + # This avoids loading all graphs into memory at once + use_subset_split = len(dataset) > 10000 # Use subset for large datasets + + if use_subset_split: + print( + f"Using optimized subset-based splitting for large dataset ({len(dataset)} graphs)" + ) + train_dataset, val_dataset, test_dataset = create_subset_splits( + dataset, split_idx + ) + else: + train_dataset, val_dataset, test_dataset = ( + assign_train_val_test_mask_to_graphs(dataset, split_idx) + ) return train_dataset, val_dataset, test_dataset diff --git a/topobench/nn/backbones/graph/nsd_utils/inductive_discrete_models.py b/topobench/nn/backbones/graph/nsd_utils/inductive_discrete_models.py index 30ae553b8..4c2a2dc31 100644 --- a/topobench/nn/backbones/graph/nsd_utils/inductive_discrete_models.py +++ b/topobench/nn/backbones/graph/nsd_utils/inductive_discrete_models.py @@ -64,16 +64,12 @@ def __init__(self, config): ) nn.init.orthogonal_(self.lin_right_weights[-1].weight.data) for _i in range(self.layers): - self.lin_left_weights.append( - nn.Linear(self.d, self.d, bias=False) - ) + self.lin_left_weights.append(nn.Linear(self.d, self.d, bias=False)) nn.init.eye_(self.lin_left_weights[-1].weight.data) self.sheaf_learners = nn.ModuleList() - num_sheaf_learners = min( - self.layers, self.layers - ) + num_sheaf_learners = min(self.layers, self.layers) for _i in range(num_sheaf_learners): self.sheaf_learners.append( LocalConcatSheafLearner( @@ -208,17 +204,13 @@ def __init__(self, config): ) nn.init.orthogonal_(self.lin_right_weights[-1].weight.data) for _i in range(self.layers): - self.lin_left_weights.append( - nn.Linear(self.d, self.d, bias=False) - ) + self.lin_left_weights.append(nn.Linear(self.d, self.d, bias=False)) nn.init.eye_(self.lin_left_weights[-1].weight.data) self.sheaf_learners = nn.ModuleList() 
self.weight_learners = nn.ModuleList() - num_sheaf_learners = min( - self.layers, self.layers - ) + num_sheaf_learners = min(self.layers, self.layers) for _i in range(num_sheaf_learners): self.sheaf_learners.append( LocalConcatSheafLearner( @@ -397,16 +389,12 @@ def __init__(self, config): ) nn.init.orthogonal_(self.lin_right_weights[-1].weight.data) for _i in range(self.layers): - self.lin_left_weights.append( - nn.Linear(self.d, self.d, bias=False) - ) + self.lin_left_weights.append(nn.Linear(self.d, self.d, bias=False)) nn.init.eye_(self.lin_left_weights[-1].weight.data) self.sheaf_learners = nn.ModuleList() - num_sheaf_learners = min( - self.layers, self.layers - ) + num_sheaf_learners = min(self.layers, self.layers) for _i in range(num_sheaf_learners): self.sheaf_learners.append( LocalConcatSheafLearner( diff --git a/topobench/nn/backbones/graph/nsd_utils/laplace.py b/topobench/nn/backbones/graph/nsd_utils/laplace.py index 55903396f..606311620 100644 --- a/topobench/nn/backbones/graph/nsd_utils/laplace.py +++ b/topobench/nn/backbones/graph/nsd_utils/laplace.py @@ -214,6 +214,7 @@ def compute_learnable_diag_laplacian_indices( return diag_indices, non_diag_indices + def mergesp(index1, value1, index2, value2): """ Merge two sparse matrices with disjoint indices into one. 
diff --git a/topobench/nn/backbones/graph/nsd_utils/laplacian_builders.py b/topobench/nn/backbones/graph/nsd_utils/laplacian_builders.py index fb6da3ea8..0da64aab6 100644 --- a/topobench/nn/backbones/graph/nsd_utils/laplacian_builders.py +++ b/topobench/nn/backbones/graph/nsd_utils/laplacian_builders.py @@ -103,7 +103,7 @@ def scalar_normalise(self, diag, tril, row, col): assert diag.dim() == 2 d = diag.size(-1) diag_sqrt_inv = (diag + 1).pow(-0.5) - + diag_sqrt_inv = ( diag_sqrt_inv.view(-1, 1, 1) if tril.dim() > 2 @@ -122,6 +122,7 @@ def scalar_normalise(self, diag, tril, row, col): return diag_maps, non_diag_maps + class DiagLaplacianBuilder(LaplacianBuilder): """ Builder for sheaf Laplacian with diagonal restriction maps. @@ -199,6 +200,7 @@ def forward(self, maps): return (edge_index, weights), saved_tril_maps + class NormConnectionLaplacianBuilder(LaplacianBuilder): """ Builder for normalized bundle sheaf Laplacian with orthogonal restriction maps. @@ -254,7 +256,7 @@ def forward(self, map_params): """ assert len(map_params.size()) == 2 assert map_params.size(1) == self.d * (self.d + 1) // 2 - + _, full_right_idx = self.full_left_right_idx left_idx, right_idx = self.left_right_idx tril_row, tril_col = self.vertex_tril_idx @@ -294,6 +296,7 @@ def forward(self, map_params): return (edge_index, weights), saved_tril_maps + class GeneralLaplacianBuilder(LaplacianBuilder): """ Builder for general sheaf Laplacian with full matrix restriction maps. 
diff --git a/topobench/transforms/liftings/graph2simplicial/latentclique_lifting.py b/topobench/transforms/liftings/graph2simplicial/latentclique_lifting.py index 55ac1c406..bc05ad9e9 100755 --- a/topobench/transforms/liftings/graph2simplicial/latentclique_lifting.py +++ b/topobench/transforms/liftings/graph2simplicial/latentclique_lifting.py @@ -605,7 +605,8 @@ def splitmerge(self): if clique_i == clique_j: clique_size = self.Z[clique_i].sum() - if clique_size <= 2: return # noqa + if clique_size <= 2: + return # noqa Z_prop = self.Z.copy() Z_prop = np.delete(Z_prop, clique_i, 0) From 869894d9d386eb77d0cae1b8b48b031f371a3fb3 Mon Sep 17 00:00:00 2001 From: theosaulus Date: Fri, 21 Nov 2025 10:30:16 -0500 Subject: [PATCH 04/17] code cleaning and separating different functions in different files to respect the code tree --- .../dataset/graph/OC20_S2EF_train_200K.yaml | 32 +- topobench/data/datasets/oc20_dataset.py | 379 +++++++ .../data/loaders/graph/oc20_dataset_loader.py | 997 ++++-------------- .../loaders/graph/oc20_dataset_loader_old.py | 831 +++++++++++++++ topobench/data/preprocessor/preprocessor.py | 5 + topobench/data/utils/__init__.py | 19 +- topobench/data/utils/oc20_download.py | 235 +++++ topobench/data/utils/split_utils.py | 10 +- 8 files changed, 1719 insertions(+), 789 deletions(-) create mode 100644 topobench/data/datasets/oc20_dataset.py create mode 100644 topobench/data/loaders/graph/oc20_dataset_loader_old.py create mode 100644 topobench/data/utils/oc20_download.py diff --git a/configs/dataset/graph/OC20_S2EF_train_200K.yaml b/configs/dataset/graph/OC20_S2EF_train_200K.yaml index b9b9b8601..bee121d9b 100644 --- a/configs/dataset/graph/OC20_S2EF_train_200K.yaml +++ b/configs/dataset/graph/OC20_S2EF_train_200K.yaml @@ -1,40 +1,42 @@ -# OC20 S2EF dataset with 200K training samples +# OC20 S2EF Dataset Configuration +# Structure to Energy and Forces prediction for catalyst discovery +# Dataset: 200K training samples with multiple validation splits # 
Validation: all 4 validation splits aggregated (val_id, val_ood_ads, val_ood_cat, val_ood_both) # Test: official test split loader: - _target_: topobench.data.loaders.OC20DatasetLoader + _target_: topobench.data.loaders.graph.oc20_dataset_loader.OC20DatasetLoader parameters: data_domain: graph data_type: oc20 data_name: OC20_S2EF_200K - data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} task: s2ef train_split: "200K" - val_splits: null # null means use all 4 validation splits - test_split: "test" - download: true + val_splits: null # null means use all 4 validation splits (val_id, val_ood_ads, val_ood_cat, val_ood_both) include_test: false # Skip test download, reuse validation as test - legacy_format: false + download: true dtype: float32 - max_samples: 10 # Set to integer (e.g., 1000) to limit dataset size for fast experiments + legacy_format: false + max_samples: 100 # Set to integer (e.g., 1000) to limit dataset size for fast experiments, or null for full dataset + data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} parameters: - num_features: 6 # Will be determined by the actual data - num_classes: 1 + num_features: 1 # Number of node features (atomic numbers) + num_classes: 1 # Regression task (energy prediction) task: regression loss_type: mse monitor_metric: mae - task_level: graph + task_level: graph # Graph-level prediction + split_params: learning_setting: inductive - split_type: random # Use random splits for small test datasets + split_type: random # random (random split of train set) or fixed (used for official splits) data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} data_seed: 0 - train_prop: 0.6 # 60% training - val_prop: 0.2 # 20% validation - test_prop: 0.2 # 20% test + train_prop: 0.6 + val_prop: 0.2 + test_prop: 0.2 dataloader_params: batch_size: 32 diff --git 
a/topobench/data/datasets/oc20_dataset.py b/topobench/data/datasets/oc20_dataset.py new file mode 100644 index 000000000..9f34dfec0 --- /dev/null +++ b/topobench/data/datasets/oc20_dataset.py @@ -0,0 +1,379 @@ +"""Dataset class for Open Catalyst 2020 (OC20) family of datasets.""" + +from __future__ import annotations + +import logging +import pickle +from collections.abc import Iterator +from pathlib import Path +from typing import ClassVar + +import torch +from omegaconf import DictConfig +from torch_geometric.data import Data, Dataset + +try: + import lmdb + + HAS_LMDB = True +except ImportError: + lmdb = None + HAS_LMDB = False + +logger = logging.getLogger(__name__) + + +class OC20Dataset(Dataset): + """Dataset class for Open Catalyst 2020 (OC20) family. + + Supports S2EF (Structure to Energy and Forces) and IS2RE (Initial Structure + to Relaxed Energy) tasks for catalyst discovery and materials science. + + The OC20 dataset contains DFT calculations for catalyst-adsorbate systems, + enabling machine learning models to predict energies and forces for + accelerated materials discovery. + + Parameters + ---------- + root : str + Root directory where the dataset is stored. + name : str + Name of the dataset. + parameters : DictConfig + Configuration parameters for the dataset. + + Attributes + ---------- + task : str + Task type: "s2ef", "is2re", or "oc22_is2re". + train_split : str + Training split size for S2EF (e.g., "200K", "2M", "20M", "all"). + val_splits : list[str] + Validation splits to use. 
+ """ + + # S2EF validation splits + VALID_VAL_SPLITS: ClassVar = [ + "val_id", + "val_ood_ads", + "val_ood_cat", + "val_ood_both", + ] + + # S2EF train splits + VALID_TRAIN_SPLITS: ClassVar = ["200K", "2M", "20M", "all"] + + def __init__( + self, + root: str, + name: str, + parameters: DictConfig, + ) -> None: + self.name = name + self.parameters = parameters + + # Task configuration + self.task = parameters.get("task", "s2ef").lower() + self.dtype = self._parse_dtype(parameters.get("dtype", "float32")) + self.legacy_format = parameters.get("legacy_format", False) + self.include_test = parameters.get("include_test", True) + + # S2EF-specific configuration + if self.task == "s2ef": + self.train_split = parameters.get("train_split", "200K") + if self.train_split not in self.VALID_TRAIN_SPLITS: + raise ValueError( + f"Invalid S2EF train split: {self.train_split}. " + f"Choose from {self.VALID_TRAIN_SPLITS}" + ) + + # Parse validation splits + val_splits = parameters.get("val_splits", None) + if val_splits is None: + self.val_splits = self.VALID_VAL_SPLITS + elif isinstance(val_splits, str): + self.val_splits = [val_splits] + else: + self.val_splits = list(val_splits) + + # Validate splits + for vs in self.val_splits: + if vs not in self.VALID_VAL_SPLITS: + raise ValueError( + f"Invalid S2EF val split: {vs}. 
" + f"Choose from {self.VALID_VAL_SPLITS}" + ) + + self.test_split = parameters.get("test_split", "test") + + # Limit for fast experimentation + self.max_samples = parameters.get("max_samples", None) + if self.max_samples is not None: + self.max_samples = int(self.max_samples) + logger.info( + f"⚠️ Limiting dataset to {self.max_samples} samples for fast experimentation" + ) + + super().__init__(root) + + # Open LMDB environments + self._open_lmdbs() + + def __repr__(self) -> str: + task_info = f"task={self.task}" + if self.task == "s2ef": + task_info += f", train={self.train_split}" + return f"{self.name}(root={self.root}, {task_info}, size={len(self)})" + + @staticmethod + def _parse_dtype(dtype) -> torch.dtype: + """Parse dtype parameter to torch.dtype.""" + if isinstance(dtype, str): + return getattr(torch, dtype) + return dtype + + def _get_data_paths(self) -> dict[str, list[Path]]: + """Get paths to LMDB files for each split. + + Returns + ------- + dict[str, list[Path]] + Dictionary mapping split names to lists of LMDB file paths. 
+ """ + root = Path(self.root) + paths = {"train": [], "val": [], "test": []} + + if self.task == "s2ef": + # Training data path structure + train_subdir = f"s2ef_train_{self.train_split}" + train_dir = ( + root / "s2ef" / self.train_split / train_subdir / train_subdir + ) + paths["train"] = sorted(train_dir.glob("**/*.lmdb")) + + # Validation data paths + for val_split in self.val_splits: + val_subdir = f"s2ef_{val_split}" + val_dir = root / "s2ef" / "all" / val_subdir / val_subdir + paths["val"].extend(sorted(val_dir.glob("**/*.lmdb"))) + + # Test data path + if self.include_test: + test_dir = root / "s2ef" / "all" / "s2ef_test" / "s2ef_test" + paths["test"] = sorted(test_dir.glob("**/*.lmdb")) + + elif self.task in ["is2re", "oc22_is2re"]: + # IS2RE datasets have different structure + base_dir = root / ( + "is2re" if self.task == "is2re" else "oc22_is2re" + ) + + # Try different possible directory structures + for possible_base in [ + base_dir, + base_dir / "data" / "is2re", + root / "data" / "is2re", + ]: + if possible_base.exists(): + paths["train"] = sorted( + (possible_base / "train").glob("**/*.lmdb") + ) + paths["val"] = sorted( + (possible_base / "val_id").glob("**/*.lmdb") + ) or sorted((possible_base / "val").glob("**/*.lmdb")) + paths["test"] = sorted( + (possible_base / "test_id").glob("**/*.lmdb") + ) or sorted((possible_base / "test").glob("**/*.lmdb")) + break + + return paths + + def _open_lmdbs(self): + """Open LMDB files and create split mappings.""" + if not HAS_LMDB: + raise ImportError( + "LMDB is required for OC20 dataset. 
Install with: pip install lmdb" + ) + + paths = self._get_data_paths() + + # Initialize storage + self.envs = [] + self.cumulative_sizes = [0] + self.split_idx = {"train": [], "valid": [], "test": []} + + current_idx = 0 + + # Open train LMDBs + for lmdb_path in paths["train"]: + env, size = self._open_single_lmdb(lmdb_path) + self.envs.append((lmdb_path, env, size)) + self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) + self.split_idx["train"].extend( + range(current_idx, current_idx + size) + ) + current_idx += size + + # Open validation LMDBs + for lmdb_path in paths["val"]: + env, size = self._open_single_lmdb(lmdb_path) + self.envs.append((lmdb_path, env, size)) + self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) + self.split_idx["valid"].extend( + range(current_idx, current_idx + size) + ) + current_idx += size + + # Open test LMDBs + for lmdb_path in paths["test"]: + env, size = self._open_single_lmdb(lmdb_path) + self.envs.append((lmdb_path, env, size)) + self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) + self.split_idx["test"].extend( + range(current_idx, current_idx + size) + ) + current_idx += size + + # If no test data, reuse validation indices + if not self.include_test or len(self.split_idx["test"]) == 0: + self.split_idx["test"] = list(self.split_idx["valid"]) + + # Convert to tensors + self.split_idx = { + k: torch.tensor(v, dtype=torch.long) + for k, v in self.split_idx.items() + } + + logger.info( + f"Loaded {self.task.upper()} dataset: " + f"{len(self.split_idx['train'])} train, " + f"{len(self.split_idx['valid'])} val, " + f"{len(self.split_idx['test'])} test" + ) + + def _open_single_lmdb(self, lmdb_path: Path) -> tuple: + """Open a single LMDB file and return (env, size). + + Parameters + ---------- + lmdb_path : Path + Path to LMDB file. + + Returns + ------- + tuple + (environment, size) tuple. 
+ """ + env = lmdb.open( + str(lmdb_path.resolve()), + subdir=False, + readonly=True, + lock=False, + readahead=True, + meminit=False, + max_readers=1, + ) + size = env.stat()["entries"] + return env, size + + def _find_lmdb_and_local_idx(self, idx: int) -> tuple: + """Find which LMDB contains the given index and the local index within it. + + Parameters + ---------- + idx : int + Global dataset index. + + Returns + ------- + tuple + (lmdb_index, local_index) tuple. + """ + if idx < 0 or idx >= len(self): + raise IndexError(f"Index {idx} out of range [0, {len(self)})") + + # Binary search for the LMDB + left, right = 0, len(self.envs) + while left < right - 1: + mid = (left + right) // 2 + if self.cumulative_sizes[mid] <= idx: + left = mid + else: + right = mid + + lmdb_idx = left + local_idx = idx - self.cumulative_sizes[lmdb_idx] + return lmdb_idx, local_idx + + def len(self) -> int: + """Get dataset length.""" + return self.cumulative_sizes[-1] + + def get(self, idx: int) -> Data: + """Get data sample at index. + + Parameters + ---------- + idx : int + Sample index. + + Returns + ------- + Data + PyTorch Geometric Data object. 
+ """ + lmdb_idx, local_idx = self._find_lmdb_and_local_idx(idx) + lmdb_path, env, _ = self.envs[lmdb_idx] + + with env.begin() as txn: + cursor = txn.cursor() + if not cursor.first(): + raise RuntimeError(f"Empty LMDB at {lmdb_path}") + + # Navigate to the target entry + for _ in range(local_idx): + if not cursor.next(): + raise RuntimeError( + f"Index {local_idx} out of range in {lmdb_path}" + ) + + key, value = cursor.item() + data = pickle.loads(value) + + # Convert to legacy format if needed + if self.legacy_format and isinstance(data, Data): + data = Data( + **{k: v for k, v in data.__dict__.items() if v is not None} + ) + + return data + + def __len__(self) -> int: + """Get dataset length.""" + return self.len() + + def __getitem__(self, idx: int) -> Data: + """Get item at index.""" + return self.get(idx) + + def __iter__(self) -> Iterator[Data]: + """Iterate over dataset.""" + for i in range(len(self)): + yield self[i] + + def __del__(self): + """Clean up LMDB environments.""" + if hasattr(self, "envs"): + for _, env, _ in self.envs: + env.close() + + @property + def num_node_features(self) -> int: + """Number of node features per atom.""" + # Will be determined by the actual data + return 1 # Atomic numbers + + @property + def num_classes(self) -> int: + """Number of classes (regression task).""" + return 1 # Single regression target (energy) diff --git a/topobench/data/loaders/graph/oc20_dataset_loader.py b/topobench/data/loaders/graph/oc20_dataset_loader.py index cb6d9c51b..12d0dc124 100644 --- a/topobench/data/loaders/graph/oc20_dataset_loader.py +++ b/topobench/data/loaders/graph/oc20_dataset_loader.py @@ -1,831 +1,286 @@ -"""Loader for OC20 family datasets (S2EF/IS2RE). - -This loader integrates the Open Catalyst 2020 (OC20/OC22) datasets into TopoBench. 
- -Supported tasks: -- S2EF (Structure to Energy and Forces): Predict energy/forces from atomic structure - - Train splits: 200K, 2M, 20M, all - - Validation splits: val_id, val_ood_ads, val_ood_cat, val_ood_both (can aggregate) - - Test split: test (can be optionally skipped with include_test=False) - - Automatic preprocessing from extxyz to ASE DB format - -The ASE DB backend with PyG conversion is used for efficient data loading. -""" - -from __future__ import annotations +"""Loader for OC20 family datasets (S2EF/IS2RE).""" import logging - -# Added missing imports -import lzma # needed by _uncompress_xz -import os -import pickle # needed to deserialize LMDB records -import shutil # needed by _uncompress_xz -import tarfile -import urllib.request -from collections.abc import Iterator -from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path import torch from omegaconf import DictConfig -from torch_geometric.data import Data, Dataset -from tqdm import tqdm - -try: - import lmdb # LMDB backend - - HAS_LMDB = True -except ImportError: # optional dependency - lmdb = None # type: ignore - HAS_LMDB = False +from torch_geometric.data import Dataset +from topobench.data.datasets.oc20_dataset import OC20Dataset from topobench.data.loaders.base import AbstractLoader - -# ASE DB fallback dataset -from topobench.data.loaders.graph.oc20_asedbs2ef_loader import OC20ASEDBDataset -from topobench.data.preprocessor.oc20_s2ef_preprocessor import ( - HAS_ASE, - needs_preprocessing, - preprocess_s2ef_split_ase, +from topobench.data.utils.oc20_download import ( + download_is2re_dataset, + download_s2ef_dataset, ) -logger = logging.getLogger(__name__) - -# OC20 dataset split URLs -# S2EF dataset URLs -S2EF_TRAIN_SPLITS = { - "200K": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_200K.tar", - "2M": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_2M.tar", - "20M": 
"https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_20M.tar", - "all": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_all.tar", -} - -S2EF_VAL_SPLITS = { - "val_id": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_id.tar", - "val_ood_ads": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_ads.tar", - "val_ood_cat": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_cat.tar", - "val_ood_both": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_both.tar", -} +# Import ASE DB fallback +try: + from topobench.data.loaders.graph.oc20_asedbs2ef_loader import ( + OC20ASEDBDataset, + ) -S2EF_TEST_SPLIT = "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_test_lmdbs.tar.gz" + HAS_ASEDB = True +except ImportError: + HAS_ASEDB = False -# IS2RE dataset URLs (contains train/val/test in one archive) -IS2RE_URL = "https://dl.fbaipublicfiles.com/opencatalystproject/data/is2res_train_val_test_lmdbs.tar.gz" -OC22_IS2RE_URL = "https://dl.fbaipublicfiles.com/opencatalystproject/data/oc22/is2res_total_train_val_test_lmdbs.tar.gz" +logger = logging.getLogger(__name__) -CACHE_DIR = Path.home() / ".cache" / "oc20" +class OC20DatasetLoader(AbstractLoader): + """Load OC20 family datasets for catalyst discovery and materials science. -def _uncompress_xz(file_path: str) -> str: - """Decompress .xz files. + Supports: + - S2EF (Structure to Energy and Forces): Predict energy/forces from atomic structure + - IS2RE (Initial Structure to Relaxed Energy): Predict relaxed energy + - OC22 IS2RE: Extended IS2RE dataset Parameters ---------- - file_path : str - Path to file to decompress. - - Returns - ------- - str - Path to decompressed file. 
+ parameters : DictConfig + Configuration parameters containing: + - data_dir: Root directory for data + - data_name: Name of the dataset + - task: Task type ("s2ef", "is2re", "oc22_is2re") + - download: Whether to download if not present (default: False) + + For S2EF: + - train_split: Training split size ("200K", "2M", "20M", "all") + - val_splits: List of validation splits or None for all + - include_test: Whether to download test split (default: True) + + Additional options: + - dtype: Data type for tensors (default: "float32") + - legacy_format: Use legacy PyG Data format (default: False) + - max_samples: Limit dataset size for testing (default: None) """ - if not file_path.endswith(".xz"): - return file_path - - output_path = file_path.replace(".xz", "") - try: - with ( - lzma.open(file_path, "rb") as f_in, - open(output_path, "wb") as f_out, - ): - shutil.copyfileobj(f_in, f_out) - os.remove(file_path) - return output_path - except Exception as e: - logger.error(f"Error uncompressing {file_path}: {e}") - return file_path - - -def _download_and_extract( - url: str, target_dir: Path, skip_if_extracted: bool = True -) -> Path: - """Download and extract a tar archive. - Parameters - ---------- - url : str - URL to download from. - target_dir : Path - Directory to extract to. - skip_if_extracted : bool - If True, skip extraction if extracted files already exist (default: True). - - Returns - ------- - Path - Path to extracted directory. 
- """ - target_dir.mkdir(parents=True, exist_ok=True) - target_file = target_dir / os.path.basename(url) - - # Download if needed - if not target_file.exists(): - logger.info(f"Downloading {url}...") - with tqdm( - unit="B", unit_scale=True, desc=f"Downloading {target_file.name}" - ) as pbar: - - def report(block_num, block_size, total_size): - if total_size > 0 and block_num == 0: - pbar.total = total_size - pbar.update(block_size) - - urllib.request.urlretrieve(url, target_file, reporthook=report) - else: - logger.info(f"Archive {target_file.name} already downloaded") - - # Check if extraction is needed - # Look for extracted subdirectories (common pattern: archive extracts to a subdirectory) - extraction_marker = target_dir / ".extracted" - if skip_if_extracted and extraction_marker.exists(): - logger.info(f"Archive {target_file.name} already extracted, skipping") - return target_dir - - # Extract - logger.info(f"Extracting {target_file.name}...") - if str(target_file).endswith((".tar.gz", ".tgz")): - with tarfile.open(target_file, "r:gz") as tar: - tar.extractall(path=target_dir) - elif str(target_file).endswith(".tar"): - with tarfile.open(target_file, "r:") as tar: - tar.extractall(path=target_dir) - else: - raise ValueError(f"Unsupported archive format: {target_file}") - - # Mark as extracted - extraction_marker.touch() - return target_dir - - -class _OC20LMDBDataset(Dataset): - """LMDB-based dataset for OC20/OC22. + def __init__(self, parameters: DictConfig) -> None: + super().__init__(parameters) - Supports: - - S2EF task with flexible train/val/test split specification - - IS2RE/OC22_IS2RE tasks with pre-computed train/val/test splits - """ + def load_dataset(self) -> Dataset: + """Load the OC20 dataset. 
- def __init__( - self, - root: str | Path, - task: str = "s2ef", - train_split: str | None = "200K", - val_splits: list[str] | None = None, - test_split: str = "test", - download: bool = True, - include_test: bool = True, - dtype: torch.dtype = torch.float32, - legacy_format: bool = False, - ): - """Initialize OC20 LMDB dataset. + Returns + ------- + Dataset + The loaded OC20 dataset with the appropriate configuration. - Parameters - ---------- - root : str | Path - Root directory for storing datasets. - task : str - Task type: "s2ef", "is2re", or "oc22_is2re". - train_split : Optional[str] - For S2EF: one of ["200K", "2M", "20M", "all"]. - For IS2RE: ignored (uses precomputed split). - val_splits : Optional[list[str]] - For S2EF: list of validation splits to use. - Can be ["val_id", "val_ood_ads", "val_ood_cat", "val_ood_both"] or subset. - If None, uses all 4 validation splits. - For IS2RE: ignored (uses precomputed split). - test_split : str - For S2EF: "test" (default). - For IS2RE: ignored (uses precomputed split). - download : bool - Whether to download if not present. - include_test : bool - Whether to download/include test split. If False, validation indices are reused for test. - dtype : torch.dtype - Data type for tensors. - legacy_format : bool - Whether to use legacy PyG Data format. + Raises + ------ + RuntimeError + If dataset loading fails. """ - super().__init__() - self.root = Path(root) - self.task = task.lower() - self.dtype = dtype - self.legacy_format = legacy_format - self.download_flag = download - self.include_test = include_test - - if self.task == "s2ef": - if train_split not in S2EF_TRAIN_SPLITS: - raise ValueError( - f"Invalid S2EF train split: {train_split}. 
" - f"Choose from {list(S2EF_TRAIN_SPLITS.keys())}" - ) - self.train_split = train_split - - # Default: use all validation splits - if val_splits is None: - val_splits = list(S2EF_VAL_SPLITS.keys()) - else: - for vs in val_splits: - if vs not in S2EF_VAL_SPLITS: - raise ValueError( - f"Invalid S2EF val split: {vs}. " - f"Choose from {list(S2EF_VAL_SPLITS.keys())}" - ) - self.val_splits = val_splits - self.test_split = test_split - - elif self.task in ["is2re", "oc22_is2re"]: - # IS2RE datasets have precomputed train/val/test splits - pass - else: - raise ValueError( - f"Unknown task: {task}. Choose from ['s2ef', 'is2re', 'oc22_is2re']" - ) + # Download if requested + if self.parameters.get("download", False): + self._download_dataset() - if download: - self._download_and_prepare() - - self._open_lmdbs() - - def _download_and_prepare(self): - """Download and prepare the dataset based on task.""" - if self.task == "s2ef": - self._download_s2ef() - elif self.task == "is2re": - self._download_is2re(IS2RE_URL, "is2re") - elif self.task == "oc22_is2re": - self._download_is2re(OC22_IS2RE_URL, "oc22_is2re") - - def _download_s2ef(self): - """Download S2EF train, validation, and test splits.""" - # Download train split - # Check for the actual data directory structure: s2ef/{split}/s2ef_train_{split}/s2ef_train_{split}/ - train_url = S2EF_TRAIN_SPLITS[self.train_split] - train_subdir_name = f"s2ef_train_{self.train_split}" - train_dir = ( - self.root - / "s2ef" - / self.train_split - / train_subdir_name - / train_subdir_name - ) - if not train_dir.exists(): - logger.info(f"Downloading S2EF train split: {self.train_split}") - _download_and_extract( - train_url, self.root / "s2ef" / self.train_split - ) - self._decompress_xz_files(self.root / "s2ef" / self.train_split) - else: - logger.info( - f"S2EF train split {self.train_split} already exists, skipping download" - ) + # Check if we have LMDB files or need ASE DB fallback + data_root = Path(self.get_data_dir()) + task = 
self.parameters.get("task", "s2ef").lower() - # Download validation splits - for val_split in self.val_splits: - val_url = S2EF_VAL_SPLITS[val_split] - # Check for the actual data directory structure: s2ef/all/s2ef_{val_split}/s2ef_{val_split}/ - val_subdir_name = f"s2ef_{val_split}" - val_dir = ( - self.root / "s2ef" / "all" / val_subdir_name / val_subdir_name - ) - if not val_dir.exists(): - logger.info(f"Downloading S2EF validation split: {val_split}") - _download_and_extract(val_url, self.root / "s2ef" / "all") - self._decompress_xz_files(self.root / "s2ef" / "all") - else: - logger.info( - f"S2EF validation split {val_split} already exists, skipping download" - ) - - # Download test split - test_subdir_name = "s2ef_test" - test_dir = ( - self.root / "s2ef" / "all" / test_subdir_name / test_subdir_name - ) - if self.include_test and not test_dir.exists(): - logger.info("Downloading S2EF test split") - _download_and_extract(S2EF_TEST_SPLIT, self.root / "s2ef" / "all") - self._decompress_xz_files(self.root / "s2ef" / "all") - elif self.include_test and test_dir.exists(): - logger.info("S2EF test split already exists, skipping download") - elif not self.include_test: - logger.info( - "Skipping S2EF test split download (include_test=False); will reuse validation as test" - ) - - # Preprocess S2EF data (convert extxyz/txt to LMDB if needed) - self._preprocess_s2ef() + # Try LMDB first + lmdb_present = any(data_root.glob("**/*.lmdb")) - def _preprocess_s2ef(self): - """Preprocess S2EF data from extxyz/txt files to ASE DB format. + if not lmdb_present and task == "s2ef" and HAS_ASEDB: + # Fallback to ASE DB dataset + logger.info("No LMDB files found, using ASE DB backend") + return self._load_asedb_dataset(data_root) - This method checks for raw extxyz files and converts them to ASE DB files - for efficient loading. It processes train, validation, and test splits. - """ - if not HAS_ASE: - logger.warning("ASE not available. 
Cannot preprocess S2EF data.") - return - - s2ef_root = self.root / "s2ef" - - if not s2ef_root.exists(): - logger.warning(f"S2EF data directory not found: {s2ef_root}") - return - - # Get preprocessing parameters (use defaults since they're not available) - num_workers = 1 - max_neigh = 50 - radius = 6.0 - - # Process training data - # The actual data is in s2ef/{train_split}/s2ef_train_{train_split}/s2ef_train_{train_split}/ - train_base = s2ef_root / self.train_split - train_subdir_name = f"s2ef_train_{self.train_split}" - train_dir = train_base / train_subdir_name / train_subdir_name - - if train_dir.exists() and needs_preprocessing(train_dir): - logger.info(f"Preprocessing S2EF training data: {train_dir}") - preprocess_s2ef_split_ase( - data_path=train_dir, - out_path=train_dir, - num_workers=num_workers, - ref_energy=True, - test_data=False, - max_neigh=max_neigh, - radius=radius, - ) + # Initialize LMDB dataset + dataset = self._initialize_dataset() + self.data_dir = self._redefine_data_dir(dataset) + return dataset - # Process validation splits - # The actual data is in s2ef/all/s2ef_{val_split}/s2ef_{val_split}/ - for val_split in self.val_splits: - val_base = s2ef_root / "all" - val_subdir_name = f"s2ef_{val_split}" - val_dir = val_base / val_subdir_name / val_subdir_name - - if val_dir.exists() and needs_preprocessing(val_dir): - logger.info(f"Preprocessing S2EF validation data: {val_dir}") - preprocess_s2ef_split_ase( - data_path=val_dir, - out_path=val_dir, - num_workers=num_workers, - ref_energy=True, - test_data=False, - max_neigh=max_neigh, - radius=radius, - ) + def _download_dataset(self): + """Download the OC20 dataset based on task configuration.""" + task = self.parameters.get("task", "s2ef").lower() + root = Path(self.get_data_dir()) - # Process test split if needed - if self.include_test: - test_base = s2ef_root / "all" - test_subdir_name = "s2ef_test" - test_dir = test_base / test_subdir_name / test_subdir_name - - if test_dir.exists() and 
needs_preprocessing(test_dir): - logger.info(f"Preprocessing S2EF test data: {test_dir}") - preprocess_s2ef_split_ase( - data_path=test_dir, - out_path=test_dir, - num_workers=num_workers, - ref_energy=False, # Test data typically doesn't have energy/forces - test_data=True, - max_neigh=max_neigh, - radius=radius, - ) + if task == "s2ef": + train_split = self.parameters.get("train_split", "200K") + val_splits = self.parameters.get("val_splits", None) + include_test = self.parameters.get("include_test", True) - logger.info("S2EF preprocessing complete") + # Parse val_splits + if val_splits is not None and isinstance(val_splits, str): + val_splits = [val_splits] - def _decompress_xz_files(self, directory: Path): - """Decompress all .xz files in a directory.""" - xz_files = list(directory.glob("**/*.xz")) - if xz_files: - logger.info( - f"Decompressing {len(xz_files)} .xz files in {directory}..." + download_s2ef_dataset( + root=root, + train_split=train_split, + val_splits=val_splits, + include_test=include_test, + ) + elif task in ["is2re", "oc22_is2re"]: + download_is2re_dataset(root=root, task=task) + else: + raise ValueError( + f"Unknown task: {task}. 
Choose from ['s2ef', 'is2re', 'oc22_is2re']" ) - num_workers = max(1, os.cpu_count() - 1) - # Use threads to avoid pickling/import issues with processes on macOS - with ThreadPoolExecutor(max_workers=num_workers) as executor: - futures = [ - executor.submit(_uncompress_xz, str(f)) for f in xz_files - ] - for future in as_completed(futures): - future.result() - - def _download_is2re(self, url: str, name: str): - """Download IS2RE or OC22 IS2RE dataset.""" - target_dir = self.root / name - if not target_dir.exists(): - logger.info(f"Downloading {name} dataset") - _download_and_extract(url, self.root) - self._decompress_xz_files(self.root) - - def _open_lmdbs(self): - """Open LMDB files for train/val/test splits.""" - if self.task == "s2ef": - self._open_s2ef_lmdbs() - elif self.task in ["is2re", "oc22_is2re"]: - self._open_is2re_lmdbs() - - def _open_s2ef_lmdbs(self): - """Open S2EF LMDB files and create split mappings.""" - # Train - train_dir = self.root / "s2ef" / self.train_split / "train" - train_lmdbs = self._collect_lmdb_files(train_dir) - - # Validation (can be multiple) - val_lmdbs = [] - for val_split in self.val_splits: - val_dir = self.root / "s2ef" / "all" / val_split - val_lmdbs.extend(self._collect_lmdb_files(val_dir)) - - # Test - test_dir = self.root / "s2ef" / "all" / "test" - test_lmdbs = ( - self._collect_lmdb_files(test_dir) if self.include_test else [] - ) - # Open all LMDBs and create split index mapping - self.envs = [] - self.cumulative_sizes = [0] - self.split_idx = {"train": [], "valid": [], "test": []} + def _initialize_dataset(self) -> OC20Dataset: + """Initialize the OC20 dataset. - current_idx = 0 + Returns + ------- + OC20Dataset + The initialized OC20 dataset. 
- # Process train LMDBs - for lmdb_path in train_lmdbs: - env, size = self._open_single_lmdb(lmdb_path) - self.envs.append((lmdb_path, env, size)) - self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) - self.split_idx["train"].extend( - range(current_idx, current_idx + size) - ) - current_idx += size - - # Process validation LMDBs - for lmdb_path in val_lmdbs: - env, size = self._open_single_lmdb(lmdb_path) - self.envs.append((lmdb_path, env, size)) - self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) - self.split_idx["valid"].extend( - range(current_idx, current_idx + size) - ) - current_idx += size - - # Process test LMDBs - for lmdb_path in test_lmdbs: - env, size = self._open_single_lmdb(lmdb_path) - self.envs.append((lmdb_path, env, size)) - self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) - self.split_idx["test"].extend( - range(current_idx, current_idx + size) + Raises + ------ + RuntimeError + If dataset initialization fails. + """ + try: + dataset = OC20Dataset( + root=str(self.get_data_dir()), + name=self.parameters.data_name, + parameters=self.parameters, ) - current_idx += size + return dataset + except Exception as e: + msg = f"Error initializing OC20 dataset: {e}" + raise RuntimeError(msg) from e - # If no test data, reuse validation indices - if not self.include_test or len(self.split_idx["test"]) == 0: - self.split_idx["test"] = list(self.split_idx["valid"]) + def _load_asedb_dataset(self, data_root: Path) -> OC20ASEDBDataset: + """Load dataset using ASE DB backend (fallback when no LMDBs). - # Convert to tensors - self.split_idx = { - k: torch.tensor(v, dtype=torch.long) - for k, v in self.split_idx.items() - } + Parameters + ---------- + data_root : Path + Root directory for data. - logger.info( - f"Loaded S2EF dataset: {len(self.split_idx['train'])} train, " - f"{len(self.split_idx['valid'])} val, {len(self.split_idx['test'])} test" + Returns + ------- + OC20ASEDBDataset + Dataset using ASE DB backend. 
+ """ + train_split = self.parameters.get("train_split", "200K") + val_splits = self.parameters.get("val_splits", None) + include_test = self.parameters.get("include_test", True) + + # Parse val_splits + if val_splits is None: + val_splits = [ + "val_id", + "val_ood_ads", + "val_ood_cat", + "val_ood_both", + ] + elif isinstance(val_splits, str): + val_splits = [val_splits] + + # Collect DB files + # The data_root might already include the dataset name (e.g., datasets/graph/oc20/OC20_S2EF_200K) + # or just the base (e.g., datasets/graph/oc20) + # Try both patterns + train_subdir_name = f"s2ef_train_{train_split}" + + # Pattern 1: data_root already includes dataset name + train_dir_pattern1 = ( + data_root + / "s2ef" + / train_split + / train_subdir_name + / train_subdir_name ) + # Pattern 2: data_root is just base, dataset name in separate dir + if not train_dir_pattern1.exists(): + # Try finding s2ef directory anywhere under data_root + s2ef_roots = list(data_root.glob("**/s2ef")) + if s2ef_roots: + s2ef_root = s2ef_roots[0].parent + train_dir_pattern1 = ( + s2ef_root + / "s2ef" + / train_split + / train_subdir_name + / train_subdir_name + ) - def _open_is2re_lmdbs(self): - """Open IS2RE LMDB files with precomputed splits.""" - # IS2RE datasets have structure: data/is2re/train, data/is2re/val_id, data/is2re/test_id - # or data/is2re/all/train, etc. 
- base_dir = self.root / ( - "is2re" if self.task == "is2re" else "oc22_is2re" + train_dbs = ( + sorted(train_dir_pattern1.glob("*.db")) + if train_dir_pattern1.exists() + else [] ) - # Try different possible structures - possible_structures = [ - base_dir, - base_dir / "data" / "is2re", - self.root / "data" / "is2re", - ] - - found_dir = None - for poss_dir in possible_structures: - if poss_dir.exists(): - found_dir = poss_dir - break - - if found_dir is None: - raise ValueError(f"Cannot find IS2RE data directory in {base_dir}") - - # Look for train/val/test subdirectories - train_lmdbs = self._collect_lmdb_files(found_dir / "train") - val_lmdbs = self._collect_lmdb_files( - found_dir / "val_id" - ) or self._collect_lmdb_files(found_dir / "val") - test_lmdbs = self._collect_lmdb_files( - found_dir / "test_id" - ) or self._collect_lmdb_files(found_dir / "test") - - # Open all LMDBs - self.envs = [] - self.cumulative_sizes = [0] - self.split_idx = {"train": [], "valid": [], "test": []} - - current_idx = 0 - - for lmdb_path in train_lmdbs: - env, size = self._open_single_lmdb(lmdb_path) - self.envs.append((lmdb_path, env, size)) - self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) - self.split_idx["train"].extend( - range(current_idx, current_idx + size) - ) - current_idx += size - - for lmdb_path in val_lmdbs: - env, size = self._open_single_lmdb(lmdb_path) - self.envs.append((lmdb_path, env, size)) - self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) - self.split_idx["valid"].extend( - range(current_idx, current_idx + size) - ) - current_idx += size - - for lmdb_path in test_lmdbs: - env, size = self._open_single_lmdb(lmdb_path) - self.envs.append((lmdb_path, env, size)) - self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) - self.split_idx["test"].extend( - range(current_idx, current_idx + size) + val_dbs = [] + for vs in val_splits: + val_subdir_name = f"s2ef_{vs}" + val_dir = ( + data_root / "s2ef" / "all" / val_subdir_name / 
val_subdir_name ) - current_idx += size - - # Convert to tensors - self.split_idx = { - k: torch.tensor(v, dtype=torch.long) - for k, v in self.split_idx.items() - } + if ( + not val_dir.exists() + and "s2ef_roots" in locals() + and s2ef_roots + ): + val_dir = ( + s2ef_roots[0].parent + / "s2ef" + / "all" + / val_subdir_name + / val_subdir_name + ) + if val_dir.exists(): + val_dbs.extend(sorted(val_dir.glob("*.db"))) + + test_dbs = [] + if include_test: + test_dir = data_root / "s2ef" / "all" / "s2ef_test" / "s2ef_test" + if ( + not test_dir.exists() + and "s2ef_roots" in locals() + and s2ef_roots + ): + test_dir = ( + s2ef_roots[0].parent + / "s2ef" + / "all" + / "s2ef_test" + / "s2ef_test" + ) + if test_dir.exists(): + test_dbs = sorted(test_dir.glob("*.db")) logger.info( - f"Loaded {self.task.upper()} dataset: {len(self.split_idx['train'])} train, " - f"{len(self.split_idx['valid'])} val, {len(self.split_idx['test'])} test" - ) - - def _collect_lmdb_files(self, directory: Path) -> list[Path]: - """Collect all .lmdb files in a directory.""" - if not directory.exists(): - return [] - lmdb_files = sorted(directory.glob("**/*.lmdb")) - return lmdb_files - - def _open_single_lmdb(self, lmdb_path: Path) -> tuple: - """Open a single LMDB file and return (env, size).""" - env = lmdb.open( - str(lmdb_path.resolve()), - subdir=False, - readonly=True, - lock=False, - readahead=True, - meminit=False, - max_readers=1, + f"Using ASE DB backend: {len(train_dbs)} train, " + f"{len(val_dbs)} val, {len(test_dbs)} test DB files" ) - size = env.stat()["entries"] - return env, size - - def _find_lmdb_and_local_idx(self, idx: int) -> tuple: - if idx < 0 or idx >= len(self): - raise IndexError(f"Index {idx} out of range [0, {len(self)})") - - left, right = 0, len(self.envs) - while left < right - 1: - mid = (left + right) // 2 - if self.cumulative_sizes[mid] <= idx: - left = mid - else: - right = mid - - lmdb_idx = left - local_idx = idx - self.cumulative_sizes[lmdb_idx] - return 
lmdb_idx, local_idx - - def len(self) -> int: - return self.cumulative_sizes[-1] - - def get(self, idx: int) -> Data: - lmdb_idx, local_idx = self._find_lmdb_and_local_idx(idx) - lmdb_path, env, _ = self.envs[lmdb_idx] - - with env.begin() as txn: - cursor = txn.cursor() - if not cursor.first(): - raise RuntimeError(f"Empty LMDB at {lmdb_path}") - - for _ in range(local_idx): - if not cursor.next(): - raise RuntimeError( - f"Index {local_idx} out of range in {lmdb_path}" - ) - - key, value = cursor.item() - data = pickle.loads(value) - - if self.legacy_format and isinstance(data, Data): - data = Data( - **{k: v for k, v in data.__dict__.items() if v is not None} - ) - - return data - - def __len__(self) -> int: - return self.len() - - def __getitem__(self, idx: int) -> Data: - return self.get(idx) - - def __iter__(self) -> Iterator[Data]: - for i in range(len(self)): - yield self[i] - - def __del__(self): - if hasattr(self, "envs"): - for _, env, _ in self.envs: - env.close() - - -class OC20DatasetLoader(AbstractLoader): - """Load OC20 family datasets. - - This loader supports all OC20/OC22 dataset splits including S2EF and IS2RE tasks. 
- - Parameters in the Hydra config (dataset.loader.parameters): - - data_domain: graph - - data_type: oc20 - - data_name: Logical name for the dataset (e.g., OC20_S2EF_200K) - - task: "s2ef", "is2re", or "oc22_is2re" - - For S2EF task: - - train_split: one of ["200K", "2M", "20M", "all"] - - val_splits: list of validation splits (default: all 4) - Options: ["val_id", "val_ood_ads", "val_ood_cat", "val_ood_both"] - - test_split: "test" (default) - - For IS2RE/OC22 tasks: - - Uses precomputed train/val/test splits from the LMDB archives - - Common parameters: - - download: whether to download (default: false) - - legacy_format: whether to use legacy PyG Data format (default: false) - - dtype: torch dtype (default: "float32") - - max_samples: limit dataset size for fast experimentation (default: None = all samples) - """ - def __init__(self, parameters: DictConfig) -> None: - super().__init__(parameters) - - def load_dataset(self) -> Dataset: - """Load OC20 dataset (S2EF or IS2RE). + # Parse dtype + dtype = self.parameters.get("dtype", "float32") + if isinstance(dtype, str): + dtype = getattr(torch, dtype) - Returns - ------- - Dataset - Loaded dataset with appropriate splits. 
- """ - task: str = getattr(self.parameters, "task", "s2ef") - download: bool = bool(getattr(self.parameters, "download", False)) - legacy_format: bool = bool( - getattr(self.parameters, "legacy_format", False) - ) - dtype = getattr(self.parameters, "dtype", "float32") - dtype_t = ( - getattr(torch, str(dtype)) if isinstance(dtype, str) else dtype - ) - max_samples = getattr(self.parameters, "max_samples", None) + max_samples = self.parameters.get("max_samples", None) if max_samples is not None: max_samples = int(max_samples) - print( - f"⚠️ Limiting dataset to {max_samples} samples for fast experimentation" - ) - - if task == "s2ef": - train_split = getattr(self.parameters, "train_split", "200K") - val_splits_param = getattr(self.parameters, "val_splits", None) - - # Parse val_splits - if val_splits_param is None: - val_splits = None # Use all by default - elif isinstance(val_splits_param, str): - # Single validation split as string - val_splits = [val_splits_param] - elif isinstance(val_splits_param, (list, tuple)): - val_splits = list(val_splits_param) - else: - val_splits = None - - test_split = getattr(self.parameters, "test_split", "test") - include_test = bool(getattr(self.parameters, "include_test", True)) - - ds = _OC20LMDBDataset( - root=self.get_data_dir(), - task="s2ef", - train_split=train_split, - val_splits=val_splits, - test_split=test_split, - download=download, - include_test=include_test, - dtype=dtype_t, - legacy_format=legacy_format, - ) - # ASE DB fallback if LMDBs are not present - data_root = Path(self.get_data_dir()) - lmdb_present = any((data_root / "s2ef").glob("**/*.lmdb")) - if not lmdb_present and HAS_ASE: - # Preprocessing is already done in _OC20LMDBDataset if needed - # Now collect DB files - train_subdir_name = f"s2ef_train_{train_split}" - train_dir = ( - data_root - / "s2ef" - / train_split - / train_subdir_name - / train_subdir_name - ) - train_dbs = sorted(train_dir.glob("*.db")) - val_dbs = [] - # Respect empty list for 
val_splits (for fast prototyping) - val_splits_to_use = ( - list(S2EF_VAL_SPLITS.keys()) - if val_splits is None - else val_splits - ) - for vs in val_splits_to_use: - val_subdir_name = f"s2ef_{vs}" - val_dir = ( - data_root - / "s2ef" - / "all" - / val_subdir_name - / val_subdir_name - ) - val_dbs.extend(sorted(val_dir.glob("*.db"))) - test_dbs = [] - if include_test: - test_dbs = sorted( - (data_root / "s2ef" / "all" / "test").glob("*.db") - ) - - if train_dbs: - logger.info( - f"Using ASE DB backend: {len(train_dbs)} train, {len(val_dbs)} val, {len(test_dbs)} test DB files" - ) - return OC20ASEDBDataset( - train_db_paths=[str(p) for p in train_dbs], - val_db_paths=[str(p) for p in val_dbs], - test_db_paths=[str(p) for p in test_dbs], - max_neigh=int( - getattr(self.parameters, "max_neigh", 50) - ), - radius=float(getattr(self.parameters, "radius", 6.0)), - dtype=dtype_t, - include_energy=True, - include_forces=True, - max_samples=max_samples, - ) - elif task in ["is2re", "oc22_is2re"]: - ds = _OC20LMDBDataset( - root=self.get_data_dir(), - task=task, - download=download, - dtype=dtype_t, - legacy_format=legacy_format, - ) - else: - raise ValueError( - f"Unsupported task '{task}'. Use 's2ef', 'is2re', or 'oc22_is2re'." - ) + return OC20ASEDBDataset( + train_db_paths=[str(p) for p in train_dbs], + val_db_paths=[str(p) for p in val_dbs], + test_db_paths=[str(p) for p in test_dbs], + max_neigh=int(self.parameters.get("max_neigh", 50)), + radius=float(self.parameters.get("radius", 6.0)), + dtype=dtype, + include_energy=True, + include_forces=True, + max_samples=max_samples, + ) - return ds # type: ignore[return-value] + def _redefine_data_dir(self, dataset: Dataset) -> Path: + """Redefine the data directory based on dataset configuration. - def get_data_dir(self) -> Path: - """Get data directory path. + Parameters + ---------- + dataset : Dataset + The OC20 dataset instance. Returns ------- Path - Path to data directory. + The redefined data directory path. 
""" - # Keep default directory convention for TopoBench - return Path(super().get_data_dir()) + return self.get_data_dir() diff --git a/topobench/data/loaders/graph/oc20_dataset_loader_old.py b/topobench/data/loaders/graph/oc20_dataset_loader_old.py new file mode 100644 index 000000000..cb6d9c51b --- /dev/null +++ b/topobench/data/loaders/graph/oc20_dataset_loader_old.py @@ -0,0 +1,831 @@ +"""Loader for OC20 family datasets (S2EF/IS2RE). + +This loader integrates the Open Catalyst 2020 (OC20/OC22) datasets into TopoBench. + +Supported tasks: +- S2EF (Structure to Energy and Forces): Predict energy/forces from atomic structure + - Train splits: 200K, 2M, 20M, all + - Validation splits: val_id, val_ood_ads, val_ood_cat, val_ood_both (can aggregate) + - Test split: test (can be optionally skipped with include_test=False) + - Automatic preprocessing from extxyz to ASE DB format + +The ASE DB backend with PyG conversion is used for efficient data loading. +""" + +from __future__ import annotations + +import logging + +# Added missing imports +import lzma # needed by _uncompress_xz +import os +import pickle # needed to deserialize LMDB records +import shutil # needed by _uncompress_xz +import tarfile +import urllib.request +from collections.abc import Iterator +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path + +import torch +from omegaconf import DictConfig +from torch_geometric.data import Data, Dataset +from tqdm import tqdm + +try: + import lmdb # LMDB backend + + HAS_LMDB = True +except ImportError: # optional dependency + lmdb = None # type: ignore + HAS_LMDB = False + +from topobench.data.loaders.base import AbstractLoader + +# ASE DB fallback dataset +from topobench.data.loaders.graph.oc20_asedbs2ef_loader import OC20ASEDBDataset +from topobench.data.preprocessor.oc20_s2ef_preprocessor import ( + HAS_ASE, + needs_preprocessing, + preprocess_s2ef_split_ase, +) + +logger = logging.getLogger(__name__) + +# OC20 dataset split 
URLs +# S2EF dataset URLs +S2EF_TRAIN_SPLITS = { + "200K": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_200K.tar", + "2M": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_2M.tar", + "20M": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_20M.tar", + "all": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_all.tar", +} + +S2EF_VAL_SPLITS = { + "val_id": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_id.tar", + "val_ood_ads": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_ads.tar", + "val_ood_cat": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_cat.tar", + "val_ood_both": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_both.tar", +} + +S2EF_TEST_SPLIT = "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_test_lmdbs.tar.gz" + +# IS2RE dataset URLs (contains train/val/test in one archive) +IS2RE_URL = "https://dl.fbaipublicfiles.com/opencatalystproject/data/is2res_train_val_test_lmdbs.tar.gz" +OC22_IS2RE_URL = "https://dl.fbaipublicfiles.com/opencatalystproject/data/oc22/is2res_total_train_val_test_lmdbs.tar.gz" + +CACHE_DIR = Path.home() / ".cache" / "oc20" + + +def _uncompress_xz(file_path: str) -> str: + """Decompress .xz files. + + Parameters + ---------- + file_path : str + Path to file to decompress. + + Returns + ------- + str + Path to decompressed file. + """ + if not file_path.endswith(".xz"): + return file_path + + output_path = file_path.replace(".xz", "") + try: + with ( + lzma.open(file_path, "rb") as f_in, + open(output_path, "wb") as f_out, + ): + shutil.copyfileobj(f_in, f_out) + os.remove(file_path) + return output_path + except Exception as e: + logger.error(f"Error uncompressing {file_path}: {e}") + return file_path + + +def _download_and_extract( + url: str, target_dir: Path, skip_if_extracted: bool = True +) -> Path: + """Download and extract a tar archive. 
+ + Parameters + ---------- + url : str + URL to download from. + target_dir : Path + Directory to extract to. + skip_if_extracted : bool + If True, skip extraction if extracted files already exist (default: True). + + Returns + ------- + Path + Path to extracted directory. + """ + target_dir.mkdir(parents=True, exist_ok=True) + target_file = target_dir / os.path.basename(url) + + # Download if needed + if not target_file.exists(): + logger.info(f"Downloading {url}...") + with tqdm( + unit="B", unit_scale=True, desc=f"Downloading {target_file.name}" + ) as pbar: + + def report(block_num, block_size, total_size): + if total_size > 0 and block_num == 0: + pbar.total = total_size + pbar.update(block_size) + + urllib.request.urlretrieve(url, target_file, reporthook=report) + else: + logger.info(f"Archive {target_file.name} already downloaded") + + # Check if extraction is needed + # Look for extracted subdirectories (common pattern: archive extracts to a subdirectory) + extraction_marker = target_dir / ".extracted" + if skip_if_extracted and extraction_marker.exists(): + logger.info(f"Archive {target_file.name} already extracted, skipping") + return target_dir + + # Extract + logger.info(f"Extracting {target_file.name}...") + if str(target_file).endswith((".tar.gz", ".tgz")): + with tarfile.open(target_file, "r:gz") as tar: + tar.extractall(path=target_dir) + elif str(target_file).endswith(".tar"): + with tarfile.open(target_file, "r:") as tar: + tar.extractall(path=target_dir) + else: + raise ValueError(f"Unsupported archive format: {target_file}") + + # Mark as extracted + extraction_marker.touch() + return target_dir + + +class _OC20LMDBDataset(Dataset): + """LMDB-based dataset for OC20/OC22. 
+ + Supports: + - S2EF task with flexible train/val/test split specification + - IS2RE/OC22_IS2RE tasks with pre-computed train/val/test splits + """ + + def __init__( + self, + root: str | Path, + task: str = "s2ef", + train_split: str | None = "200K", + val_splits: list[str] | None = None, + test_split: str = "test", + download: bool = True, + include_test: bool = True, + dtype: torch.dtype = torch.float32, + legacy_format: bool = False, + ): + """Initialize OC20 LMDB dataset. + + Parameters + ---------- + root : str | Path + Root directory for storing datasets. + task : str + Task type: "s2ef", "is2re", or "oc22_is2re". + train_split : Optional[str] + For S2EF: one of ["200K", "2M", "20M", "all"]. + For IS2RE: ignored (uses precomputed split). + val_splits : Optional[list[str]] + For S2EF: list of validation splits to use. + Can be ["val_id", "val_ood_ads", "val_ood_cat", "val_ood_both"] or subset. + If None, uses all 4 validation splits. + For IS2RE: ignored (uses precomputed split). + test_split : str + For S2EF: "test" (default). + For IS2RE: ignored (uses precomputed split). + download : bool + Whether to download if not present. + include_test : bool + Whether to download/include test split. If False, validation indices are reused for test. + dtype : torch.dtype + Data type for tensors. + legacy_format : bool + Whether to use legacy PyG Data format. + """ + super().__init__() + self.root = Path(root) + self.task = task.lower() + self.dtype = dtype + self.legacy_format = legacy_format + self.download_flag = download + self.include_test = include_test + + if self.task == "s2ef": + if train_split not in S2EF_TRAIN_SPLITS: + raise ValueError( + f"Invalid S2EF train split: {train_split}. 
                    f"Choose from {list(S2EF_TRAIN_SPLITS.keys())}"
                )
            self.train_split = train_split

            # Default: use all validation splits
            if val_splits is None:
                val_splits = list(S2EF_VAL_SPLITS.keys())
            else:
                # Validate every requested split up front so a typo fails fast.
                for vs in val_splits:
                    if vs not in S2EF_VAL_SPLITS:
                        raise ValueError(
                            f"Invalid S2EF val split: {vs}. "
                            f"Choose from {list(S2EF_VAL_SPLITS.keys())}"
                        )
            self.val_splits = val_splits
            self.test_split = test_split

        elif self.task in ["is2re", "oc22_is2re"]:
            # IS2RE datasets have precomputed train/val/test splits
            pass
        else:
            raise ValueError(
                f"Unknown task: {task}. Choose from ['s2ef', 'is2re', 'oc22_is2re']"
            )

        if download:
            self._download_and_prepare()

        # LMDB environments are opened eagerly so split sizes are known as
        # soon as the dataset object exists.
        self._open_lmdbs()

    def _download_and_prepare(self):
        """Download and prepare the dataset based on task."""
        if self.task == "s2ef":
            self._download_s2ef()
        elif self.task == "is2re":
            self._download_is2re(IS2RE_URL, "is2re")
        elif self.task == "oc22_is2re":
            self._download_is2re(OC22_IS2RE_URL, "oc22_is2re")

    def _download_s2ef(self):
        """Download S2EF train, validation, and test splits.

        Each split is skipped when its extracted directory already exists,
        so re-running is cheap. After download/extraction, raw ``.xz``
        archives are decompressed and :meth:`_preprocess_s2ef` is invoked.
        """
        # Download train split
        # Check for the actual data directory structure: s2ef/{split}/s2ef_train_{split}/s2ef_train_{split}/
        train_url = S2EF_TRAIN_SPLITS[self.train_split]
        train_subdir_name = f"s2ef_train_{self.train_split}"
        train_dir = (
            self.root
            / "s2ef"
            / self.train_split
            / train_subdir_name
            / train_subdir_name
        )
        if not train_dir.exists():
            logger.info(f"Downloading S2EF train split: {self.train_split}")
            _download_and_extract(
                train_url, self.root / "s2ef" / self.train_split
            )
            self._decompress_xz_files(self.root / "s2ef" / self.train_split)
        else:
            logger.info(
                f"S2EF train split {self.train_split} already exists, skipping download"
            )

        # Download validation splits
        for val_split in self.val_splits:
            val_url = S2EF_VAL_SPLITS[val_split]
            # Check for the actual data directory structure: s2ef/all/s2ef_{val_split}/s2ef_{val_split}/
            val_subdir_name = f"s2ef_{val_split}"
            val_dir = (
                self.root / "s2ef" / "all" / val_subdir_name / val_subdir_name
            )
            if not val_dir.exists():
                logger.info(f"Downloading S2EF validation split: {val_split}")
                _download_and_extract(val_url, self.root / "s2ef" / "all")
                self._decompress_xz_files(self.root / "s2ef" / "all")
            else:
                logger.info(
                    f"S2EF validation split {val_split} already exists, skipping download"
                )

        # Download test split
        test_subdir_name = "s2ef_test"
        test_dir = (
            self.root / "s2ef" / "all" / test_subdir_name / test_subdir_name
        )
        if self.include_test and not test_dir.exists():
            logger.info("Downloading S2EF test split")
            _download_and_extract(S2EF_TEST_SPLIT, self.root / "s2ef" / "all")
            self._decompress_xz_files(self.root / "s2ef" / "all")
        elif self.include_test and test_dir.exists():
            logger.info("S2EF test split already exists, skipping download")
        elif not self.include_test:
            logger.info(
                "Skipping S2EF test split download (include_test=False); will reuse validation as test"
            )

        # Preprocess S2EF data (convert extxyz/txt to LMDB if needed)
        self._preprocess_s2ef()

    def _preprocess_s2ef(self):
        """Preprocess S2EF data from extxyz/txt files to ASE DB format.

        This method checks for raw extxyz files and converts them to ASE DB files
        for efficient loading. It processes train, validation, and test splits.
        Silently returns (with a warning) when ASE is unavailable or the
        ``s2ef`` directory does not exist.
        """
        if not HAS_ASE:
            logger.warning("ASE not available. Cannot preprocess S2EF data.")
            return

        s2ef_root = self.root / "s2ef"

        if not s2ef_root.exists():
            logger.warning(f"S2EF data directory not found: {s2ef_root}")
            return

        # Get preprocessing parameters (use defaults since they're not available)
        # NOTE(review): max_neigh=50 / radius=6.0 match the defaults used by
        # the loader's ASE fallback — keep the two in sync if either changes.
        num_workers = 1
        max_neigh = 50
        radius = 6.0

        # Process training data
        # The actual data is in s2ef/{train_split}/s2ef_train_{train_split}/s2ef_train_{train_split}/
        train_base = s2ef_root / self.train_split
        train_subdir_name = f"s2ef_train_{self.train_split}"
        train_dir = train_base / train_subdir_name / train_subdir_name

        if train_dir.exists() and needs_preprocessing(train_dir):
            logger.info(f"Preprocessing S2EF training data: {train_dir}")
            preprocess_s2ef_split_ase(
                data_path=train_dir,
                out_path=train_dir,
                num_workers=num_workers,
                ref_energy=True,
                test_data=False,
                max_neigh=max_neigh,
                radius=radius,
            )

        # Process validation splits
        # The actual data is in s2ef/all/s2ef_{val_split}/s2ef_{val_split}/
        for val_split in self.val_splits:
            val_base = s2ef_root / "all"
            val_subdir_name = f"s2ef_{val_split}"
            val_dir = val_base / val_subdir_name / val_subdir_name

            if val_dir.exists() and needs_preprocessing(val_dir):
                logger.info(f"Preprocessing S2EF validation data: {val_dir}")
                preprocess_s2ef_split_ase(
                    data_path=val_dir,
                    out_path=val_dir,
                    num_workers=num_workers,
                    ref_energy=True,
                    test_data=False,
                    max_neigh=max_neigh,
                    radius=radius,
                )

        # Process test split if needed
        if self.include_test:
            test_base = s2ef_root / "all"
            test_subdir_name = "s2ef_test"
            test_dir = test_base / test_subdir_name / test_subdir_name

            if test_dir.exists() and needs_preprocessing(test_dir):
                logger.info(f"Preprocessing S2EF test data: {test_dir}")
                preprocess_s2ef_split_ase(
                    data_path=test_dir,
                    out_path=test_dir,
                    num_workers=num_workers,
                    ref_energy=False,  # Test data typically doesn't have energy/forces
                    test_data=True,
                    max_neigh=max_neigh,
                    radius=radius,
                )

        logger.info("S2EF preprocessing complete")

    def _decompress_xz_files(self, directory: Path):
        """Decompress all .xz files in a directory (recursively, in parallel)."""
        xz_files = list(directory.glob("**/*.xz"))
        if xz_files:
            logger.info(
                f"Decompressing {len(xz_files)} .xz files in {directory}..."
            )
            # NOTE(review): os.cpu_count() can return None on some platforms,
            # which would make this expression raise TypeError — confirm.
            num_workers = max(1, os.cpu_count() - 1)
            # Use threads to avoid pickling/import issues with processes on macOS
            with ThreadPoolExecutor(max_workers=num_workers) as executor:
                futures = [
                    executor.submit(_uncompress_xz, str(f)) for f in xz_files
                ]
                for future in as_completed(futures):
                    # Propagate the first worker exception, if any.
                    future.result()

    def _download_is2re(self, url: str, name: str):
        """Download IS2RE or OC22 IS2RE dataset into ``self.root / name``."""
        target_dir = self.root / name
        if not target_dir.exists():
            logger.info(f"Downloading {name} dataset")
            _download_and_extract(url, self.root)
            self._decompress_xz_files(self.root)

    def _open_lmdbs(self):
        """Open LMDB files for train/val/test splits."""
        if self.task == "s2ef":
            self._open_s2ef_lmdbs()
        elif self.task in ["is2re", "oc22_is2re"]:
            self._open_is2re_lmdbs()

    def _open_s2ef_lmdbs(self):
        """Open S2EF LMDB files and create split mappings.

        Populates ``self.envs`` (list of (path, env, size)),
        ``self.cumulative_sizes`` (prefix sums for global indexing) and
        ``self.split_idx`` (global indices per split, as long tensors).
        """
        # Train
        # NOTE(review): this looks under s2ef/{split}/train, but
        # _download_s2ef extracts to s2ef/{split}/s2ef_train_{split}/... —
        # these directories appear inconsistent, so the LMDB lookup may find
        # nothing and the loader will fall back to the ASE DB backend. Verify.
        train_dir = self.root / "s2ef" / self.train_split / "train"
        train_lmdbs = self._collect_lmdb_files(train_dir)

        # Validation (can be multiple)
        # NOTE(review): same concern — download uses s2ef/all/s2ef_{val_split}/...
        val_lmdbs = []
        for val_split in self.val_splits:
            val_dir = self.root / "s2ef" / "all" / val_split
            val_lmdbs.extend(self._collect_lmdb_files(val_dir))

        # Test
        test_dir = self.root / "s2ef" / "all" / "test"
        test_lmdbs = (
            self._collect_lmdb_files(test_dir) if self.include_test else []
        )

        # Open all LMDBs and create split index mapping
        self.envs = []
        self.cumulative_sizes = [0]
        self.split_idx = {"train": [], "valid": [], "test": []}

        current_idx = 0

        # Process train LMDBs
        for lmdb_path in train_lmdbs:
            env, size = self._open_single_lmdb(lmdb_path)
            self.envs.append((lmdb_path, env, size))
            self.cumulative_sizes.append(self.cumulative_sizes[-1] + size)
            self.split_idx["train"].extend(
                range(current_idx, current_idx + size)
            )
            current_idx += size

        # Process validation LMDBs
        for lmdb_path in val_lmdbs:
            env, size = self._open_single_lmdb(lmdb_path)
            self.envs.append((lmdb_path, env, size))
            self.cumulative_sizes.append(self.cumulative_sizes[-1] + size)
            self.split_idx["valid"].extend(
                range(current_idx, current_idx + size)
            )
            current_idx += size

        # Process test LMDBs
        for lmdb_path in test_lmdbs:
            env, size = self._open_single_lmdb(lmdb_path)
            self.envs.append((lmdb_path, env, size))
            self.cumulative_sizes.append(self.cumulative_sizes[-1] + size)
            self.split_idx["test"].extend(
                range(current_idx, current_idx + size)
            )
            current_idx += size

        # If no test data, reuse validation indices
        if not self.include_test or len(self.split_idx["test"]) == 0:
            self.split_idx["test"] = list(self.split_idx["valid"])

        # Convert to tensors
        self.split_idx = {
            k: torch.tensor(v, dtype=torch.long)
            for k, v in self.split_idx.items()
        }

        logger.info(
            f"Loaded S2EF dataset: {len(self.split_idx['train'])} train, "
            f"{len(self.split_idx['valid'])} val, {len(self.split_idx['test'])} test"
        )

    def _open_is2re_lmdbs(self):
        """Open IS2RE LMDB files with precomputed splits."""
        # IS2RE datasets have structure: data/is2re/train, data/is2re/val_id, data/is2re/test_id
        # or data/is2re/all/train, etc.
        base_dir = self.root / (
            "is2re" if self.task == "is2re" else "oc22_is2re"
        )

        # Try different possible structures
        possible_structures = [
            base_dir,
            base_dir / "data" / "is2re",
            self.root / "data" / "is2re",
        ]

        found_dir = None
        for poss_dir in possible_structures:
            if poss_dir.exists():
                found_dir = poss_dir
                break

        if found_dir is None:
            raise ValueError(f"Cannot find IS2RE data directory in {base_dir}")

        # Look for train/val/test subdirectories
        # `or` falls through to the alternate directory name when the first
        # _collect_lmdb_files call returns an empty list.
        train_lmdbs = self._collect_lmdb_files(found_dir / "train")
        val_lmdbs = self._collect_lmdb_files(
            found_dir / "val_id"
        ) or self._collect_lmdb_files(found_dir / "val")
        test_lmdbs = self._collect_lmdb_files(
            found_dir / "test_id"
        ) or self._collect_lmdb_files(found_dir / "test")

        # Open all LMDBs
        self.envs = []
        self.cumulative_sizes = [0]
        self.split_idx = {"train": [], "valid": [], "test": []}

        current_idx = 0

        for lmdb_path in train_lmdbs:
            env, size = self._open_single_lmdb(lmdb_path)
            self.envs.append((lmdb_path, env, size))
            self.cumulative_sizes.append(self.cumulative_sizes[-1] + size)
            self.split_idx["train"].extend(
                range(current_idx, current_idx + size)
            )
            current_idx += size

        for lmdb_path in val_lmdbs:
            env, size = self._open_single_lmdb(lmdb_path)
            self.envs.append((lmdb_path, env, size))
            self.cumulative_sizes.append(self.cumulative_sizes[-1] + size)
            self.split_idx["valid"].extend(
                range(current_idx, current_idx + size)
            )
            current_idx += size

        for lmdb_path in test_lmdbs:
            env, size = self._open_single_lmdb(lmdb_path)
            self.envs.append((lmdb_path, env, size))
            self.cumulative_sizes.append(self.cumulative_sizes[-1] + size)
            self.split_idx["test"].extend(
                range(current_idx, current_idx + size)
            )
            current_idx += size

        # Convert to tensors
        self.split_idx = {
            k: torch.tensor(v, dtype=torch.long)
            for k, v in self.split_idx.items()
        }

        logger.info(
            f"Loaded {self.task.upper()} dataset: {len(self.split_idx['train'])} train, "
            f"{len(self.split_idx['valid'])} val, {len(self.split_idx['test'])} test"
        )

    def _collect_lmdb_files(self, directory: Path) -> list[Path]:
        """Collect all .lmdb files in a directory (recursive, sorted)."""
        if not directory.exists():
            return []
        lmdb_files = sorted(directory.glob("**/*.lmdb"))
        return lmdb_files

    def _open_single_lmdb(self, lmdb_path: Path) -> tuple:
        """Open a single LMDB file and return (env, size)."""
        env = lmdb.open(
            str(lmdb_path.resolve()),
            subdir=False,
            readonly=True,
            lock=False,
            readahead=True,
            meminit=False,
            max_readers=1,
        )
        # NOTE(review): stat()["entries"] counts every key in the LMDB; OCP
        # archives sometimes store a "length" metadata key, in which case this
        # overcounts by one — confirm against the downloaded archives.
        size = env.stat()["entries"]
        return env, size

    def _find_lmdb_and_local_idx(self, idx: int) -> tuple:
        # Binary search over cumulative_sizes (prefix sums): returns the
        # (lmdb_idx, local_idx) pair such that
        # cumulative_sizes[lmdb_idx] <= idx < cumulative_sizes[lmdb_idx + 1].
        if idx < 0 or idx >= len(self):
            raise IndexError(f"Index {idx} out of range [0, {len(self)})")

        left, right = 0, len(self.envs)
        while left < right - 1:
            mid = (left + right) // 2
            if self.cumulative_sizes[mid] <= idx:
                left = mid
            else:
                right = mid

        lmdb_idx = left
        local_idx = idx - self.cumulative_sizes[lmdb_idx]
        return lmdb_idx, local_idx

    def len(self) -> int:
        # PyG Dataset API: total number of samples across all LMDBs.
        return self.cumulative_sizes[-1]

    def get(self, idx: int) -> Data:
        # PyG Dataset API: fetch one sample by global index.
        lmdb_idx, local_idx = self._find_lmdb_and_local_idx(idx)
        lmdb_path, env, _ = self.envs[lmdb_idx]

        with env.begin() as txn:
            # NOTE(review): the cursor is advanced from the first entry on
            # every call, so access is O(local_idx) per sample — quadratic for
            # a sequential scan. If keys follow the OCP convention
            # (ascii-encoded integers), txn.get() would be O(1); confirm the
            # key format before changing this.
            cursor = txn.cursor()
            if not cursor.first():
                raise RuntimeError(f"Empty LMDB at {lmdb_path}")

            for _ in range(local_idx):
                if not cursor.next():
                    raise RuntimeError(
                        f"Index {local_idx} out of range in {lmdb_path}"
                    )

            key, value = cursor.item()
            data = pickle.loads(value)

            # Legacy PyG Data objects keep attributes directly in __dict__;
            # rebuild a modern Data object from the non-None entries.
            if self.legacy_format and isinstance(data, Data):
                data = Data(
                    **{k: v for k, v in data.__dict__.items() if v is not None}
                )

        return data

    def __len__(self) -> int:
        return self.len()

    def __getitem__(self, idx: int) -> Data:
        return self.get(idx)

    def __iter__(self) -> Iterator[Data]:
        for i in range(len(self)):
            yield self[i]

    def __del__(self):
        # Close LMDB environments; hasattr guards against a constructor that
        # raised before _open_lmdbs populated self.envs.
        if hasattr(self, "envs"):
            for _, env, _ in self.envs:
                env.close()
class OC20DatasetLoader(AbstractLoader):
    """Load OC20 family datasets.

    This loader supports all OC20/OC22 dataset splits including S2EF and IS2RE tasks.

    Parameters in the Hydra config (dataset.loader.parameters):
    - data_domain: graph
    - data_type: oc20
    - data_name: Logical name for the dataset (e.g., OC20_S2EF_200K)
    - task: "s2ef", "is2re", or "oc22_is2re"

    For S2EF task:
    - train_split: one of ["200K", "2M", "20M", "all"]
    - val_splits: list of validation splits (default: all 4)
      Options: ["val_id", "val_ood_ads", "val_ood_cat", "val_ood_both"]
    - test_split: "test" (default)

    For IS2RE/OC22 tasks:
    - Uses precomputed train/val/test splits from the LMDB archives

    Common parameters:
    - download: whether to download (default: false)
    - legacy_format: whether to use legacy PyG Data format (default: false)
    - dtype: torch dtype (default: "float32")
    - max_samples: limit dataset size for fast experimentation (default: None = all samples)
    """

    def __init__(self, parameters: DictConfig) -> None:
        super().__init__(parameters)

    def load_dataset(self) -> Dataset:
        """Load OC20 dataset (S2EF or IS2RE).

        Returns
        -------
        Dataset
            Loaded dataset with appropriate splits.

        Raises
        ------
        ValueError
            If ``task`` is not one of 's2ef', 'is2re', 'oc22_is2re'.
        """
        task: str = getattr(self.parameters, "task", "s2ef")
        download: bool = bool(getattr(self.parameters, "download", False))
        legacy_format: bool = bool(
            getattr(self.parameters, "legacy_format", False)
        )
        dtype = getattr(self.parameters, "dtype", "float32")
        dtype_t = (
            getattr(torch, str(dtype)) if isinstance(dtype, str) else dtype
        )
        max_samples = getattr(self.parameters, "max_samples", None)
        if max_samples is not None:
            max_samples = int(max_samples)
            # Use the module logger instead of print, consistent with the
            # rest of this module.
            logger.info(
                "Limiting dataset to %d samples for fast experimentation",
                max_samples,
            )

        if task == "s2ef":
            train_split = getattr(self.parameters, "train_split", "200K")
            val_splits_param = getattr(self.parameters, "val_splits", None)

            # Parse val_splits. Hydra/OmegaConf passes a ListConfig, which is
            # a Sequence but NOT a builtin list/tuple, so match on the
            # abstract Sequence type to avoid silently ignoring the setting.
            from collections.abc import Sequence

            if val_splits_param is None:
                val_splits = None  # Use all by default
            elif isinstance(val_splits_param, str):
                # Single validation split as string
                val_splits = [val_splits_param]
            elif isinstance(val_splits_param, Sequence):
                val_splits = list(val_splits_param)
            else:
                val_splits = None

            test_split = getattr(self.parameters, "test_split", "test")
            include_test = bool(getattr(self.parameters, "include_test", True))

            ds = _OC20LMDBDataset(
                root=self.get_data_dir(),
                task="s2ef",
                train_split=train_split,
                val_splits=val_splits,
                test_split=test_split,
                download=download,
                include_test=include_test,
                dtype=dtype_t,
                legacy_format=legacy_format,
            )

            # ASE DB fallback if LMDBs are not present
            data_root = Path(self.get_data_dir())
            lmdb_present = any((data_root / "s2ef").glob("**/*.lmdb"))
            if not lmdb_present and HAS_ASE:
                # Preprocessing is already done in _OC20LMDBDataset if needed
                # Now collect DB files
                train_subdir_name = f"s2ef_train_{train_split}"
                train_dir = (
                    data_root
                    / "s2ef"
                    / train_split
                    / train_subdir_name
                    / train_subdir_name
                )
                train_dbs = sorted(train_dir.glob("*.db"))
                val_dbs = []
                # Respect empty list for val_splits (for fast prototyping)
                val_splits_to_use = (
                    list(S2EF_VAL_SPLITS.keys())
                    if val_splits is None
                    else val_splits
                )
                for vs in val_splits_to_use:
                    val_subdir_name = f"s2ef_{vs}"
                    val_dir = (
                        data_root
                        / "s2ef"
                        / "all"
                        / val_subdir_name
                        / val_subdir_name
                    )
                    val_dbs.extend(sorted(val_dir.glob("*.db")))
                test_dbs = []
                if include_test:
                    test_dbs = sorted(
                        (data_root / "s2ef" / "all" / "test").glob("*.db")
                    )

                if train_dbs:
                    logger.info(
                        f"Using ASE DB backend: {len(train_dbs)} train, {len(val_dbs)} val, {len(test_dbs)} test DB files"
                    )
                    return OC20ASEDBDataset(
                        train_db_paths=[str(p) for p in train_dbs],
                        val_db_paths=[str(p) for p in val_dbs],
                        test_db_paths=[str(p) for p in test_dbs],
                        max_neigh=int(
                            getattr(self.parameters, "max_neigh", 50)
                        ),
                        radius=float(getattr(self.parameters, "radius", 6.0)),
                        dtype=dtype_t,
                        include_energy=True,
                        include_forces=True,
                        max_samples=max_samples,
                    )
        elif task in ["is2re", "oc22_is2re"]:
            ds = _OC20LMDBDataset(
                root=self.get_data_dir(),
                task=task,
                download=download,
                dtype=dtype_t,
                legacy_format=legacy_format,
            )
        else:
            raise ValueError(
                f"Unsupported task '{task}'. Use 's2ef', 'is2re', or 'oc22_is2re'."
            )

        # The LMDB backend does not implement sample limiting; warn so a
        # configured max_samples is not silently ignored.
        if max_samples is not None:
            logger.warning(
                "max_samples=%d is only honored by the ASE DB backend; "
                "the LMDB-backed dataset returns all samples.",
                max_samples,
            )

        return ds  # type: ignore[return-value]

    def get_data_dir(self) -> Path:
        """Get data directory path.

        Returns
        -------
        Path
            Path to data directory.
        """
        # Keep default directory convention for TopoBench
        return Path(super().get_data_dir())
ThreadPoolExecutor, as_completed +from pathlib import Path + +from tqdm import tqdm + +logger = logging.getLogger(__name__) + +# OC20 dataset split URLs +S2EF_TRAIN_SPLITS = { + "200K": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_200K.tar", + "2M": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_2M.tar", + "20M": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_20M.tar", + "all": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_all.tar", +} + +S2EF_VAL_SPLITS = { + "val_id": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_id.tar", + "val_ood_ads": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_ads.tar", + "val_ood_cat": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_cat.tar", + "val_ood_both": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_both.tar", +} + +S2EF_TEST_SPLIT = "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_test_lmdbs.tar.gz" + +IS2RE_URL = "https://dl.fbaipublicfiles.com/opencatalystproject/data/is2res_train_val_test_lmdbs.tar.gz" +OC22_IS2RE_URL = "https://dl.fbaipublicfiles.com/opencatalystproject/data/oc22/is2res_total_train_val_test_lmdbs.tar.gz" + + +def uncompress_xz(file_path: str) -> str: + """Decompress .xz files. + + Parameters + ---------- + file_path : str + Path to file to decompress. + + Returns + ------- + str + Path to decompressed file. + """ + if not file_path.endswith(".xz"): + return file_path + + output_path = file_path.replace(".xz", "") + try: + with ( + lzma.open(file_path, "rb") as f_in, + open(output_path, "wb") as f_out, + ): + shutil.copyfileobj(f_in, f_out) + os.remove(file_path) + return output_path + except Exception as e: + logger.error(f"Error uncompressing {file_path}: {e}") + return file_path + + +def download_and_extract( + url: str, target_dir: Path, skip_if_extracted: bool = True +) -> Path: + """Download and extract a tar archive. 
+ + Parameters + ---------- + url : str + URL to download from. + target_dir : Path + Directory to extract to. + skip_if_extracted : bool + If True, skip extraction if extracted files already exist (default: True). + + Returns + ------- + Path + Path to extracted directory. + """ + target_dir.mkdir(parents=True, exist_ok=True) + target_file = target_dir / os.path.basename(url) + + # Download if needed + if not target_file.exists(): + logger.info(f"Downloading {url}...") + with tqdm( + unit="B", unit_scale=True, desc=f"Downloading {target_file.name}" + ) as pbar: + + def report(block_num, block_size, total_size): + if total_size > 0 and block_num == 0: + pbar.total = total_size + pbar.update(block_size) + + urllib.request.urlretrieve(url, target_file, reporthook=report) + else: + logger.info(f"Archive {target_file.name} already downloaded") + + # Check if extraction is needed + extraction_marker = target_dir / ".extracted" + if skip_if_extracted and extraction_marker.exists(): + logger.info(f"Archive {target_file.name} already extracted, skipping") + return target_dir + + # Extract + logger.info(f"Extracting {target_file.name}...") + if str(target_file).endswith((".tar.gz", ".tgz")): + with tarfile.open(target_file, "r:gz") as tar: + tar.extractall(path=target_dir) + elif str(target_file).endswith(".tar"): + with tarfile.open(target_file, "r:") as tar: + tar.extractall(path=target_dir) + else: + raise ValueError(f"Unsupported archive format: {target_file}") + + # Mark as extracted + extraction_marker.touch() + return target_dir + + +def decompress_xz_files(directory: Path): + """Decompress all .xz files in a directory. + + Parameters + ---------- + directory : Path + Directory to search for .xz files. + """ + xz_files = list(directory.glob("**/*.xz")) + if xz_files: + logger.info( + f"Decompressing {len(xz_files)} .xz files in {directory}..." 
+ ) + num_workers = max(1, os.cpu_count() - 1) + # Use threads to avoid pickling/import issues with processes on macOS + with ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [ + executor.submit(uncompress_xz, str(f)) for f in xz_files + ] + for future in as_completed(futures): + future.result() + + +def download_s2ef_dataset( + root: Path, + train_split: str = "200K", + val_splits: list[str] | None = None, + include_test: bool = True, +): + """Download S2EF dataset splits. + + Parameters + ---------- + root : Path + Root directory for data storage. + train_split : str + Training split size: "200K", "2M", "20M", or "all". + val_splits : list[str] | None + List of validation splits to download. If None, downloads all. + include_test : bool + Whether to download test split. + """ + if val_splits is None: + val_splits = list(S2EF_VAL_SPLITS.keys()) + + # Download train split + train_url = S2EF_TRAIN_SPLITS[train_split] + train_subdir_name = f"s2ef_train_{train_split}" + train_dir = ( + root / "s2ef" / train_split / train_subdir_name / train_subdir_name + ) + if not train_dir.exists(): + logger.info(f"Downloading S2EF train split: {train_split}") + download_and_extract(train_url, root / "s2ef" / train_split) + decompress_xz_files(root / "s2ef" / train_split) + else: + logger.info( + f"S2EF train split {train_split} already exists, skipping download" + ) + + # Download validation splits + for val_split in val_splits: + val_url = S2EF_VAL_SPLITS[val_split] + val_subdir_name = f"s2ef_{val_split}" + val_dir = root / "s2ef" / "all" / val_subdir_name / val_subdir_name + if not val_dir.exists(): + logger.info(f"Downloading S2EF validation split: {val_split}") + download_and_extract(val_url, root / "s2ef" / "all") + decompress_xz_files(root / "s2ef" / "all") + else: + logger.info( + f"S2EF validation split {val_split} already exists, skipping download" + ) + + # Download test split + if include_test: + test_subdir_name = "s2ef_test" + test_dir = root / 
"s2ef" / "all" / test_subdir_name / test_subdir_name + if not test_dir.exists(): + logger.info("Downloading S2EF test split") + download_and_extract(S2EF_TEST_SPLIT, root / "s2ef" / "all") + decompress_xz_files(root / "s2ef" / "all") + else: + logger.info("S2EF test split already exists, skipping download") + + +def download_is2re_dataset(root: Path, task: str = "is2re"): + """Download IS2RE or OC22 IS2RE dataset. + + Parameters + ---------- + root : Path + Root directory for data storage. + task : str + Task name: "is2re" or "oc22_is2re". + """ + url = IS2RE_URL if task == "is2re" else OC22_IS2RE_URL + target_dir = root / task + + if not target_dir.exists(): + logger.info(f"Downloading {task.upper()} dataset") + download_and_extract(url, root) + decompress_xz_files(root) + else: + logger.info( + f"{task.upper()} dataset already exists, skipping download" + ) diff --git a/topobench/data/utils/split_utils.py b/topobench/data/utils/split_utils.py index 334e13462..1fc973441 100644 --- a/topobench/data/utils/split_utils.py +++ b/topobench/data/utils/split_utils.py @@ -45,7 +45,9 @@ def k_fold_split(labels, parameters, root=None): torch.manual_seed(0) np.random.seed(0) - split_dir = os.path.join(data_dir, f"{k}-fold") + # Include dataset size in split directory to avoid reusing splits from different dataset sizes + n = len(labels) + split_dir = os.path.join(data_dir, f"{k}-fold_n={n}") if not os.path.isdir(split_dir): os.makedirs(split_dir) @@ -129,9 +131,13 @@ def random_splitting(labels, parameters, root=None, global_data_seed=42): train_prop = parameters["train_prop"] valid_prop = (1 - train_prop) / 2 + # Include dataset size in split directory to avoid reusing splits from different dataset sizes + n = len(labels) + # Create split directory if it does not exist split_dir = os.path.join( - data_dir, f"train_prop={train_prop}_global_seed={global_data_seed}" + data_dir, + f"train_prop={train_prop}_global_seed={global_data_seed}_n={n}", ) generate_splits = False if not 
os.path.isdir(split_dir): From b290b429520e175558810dc6d00827f721dff240 Mon Sep 17 00:00:00 2001 From: theosaulus Date: Fri, 21 Nov 2025 17:17:02 -0500 Subject: [PATCH 05/17] IS2RE works --- configs/dataset/graph/OC20_IS2RE.yaml | 3 +- configs/dataset/graph/OC22_IS2RE.yaml | 3 +- test_data_migration.py | 59 ++ topobench/data/datasets/__init__.py | 11 +- topobench/data/datasets/is2re_dataset.py | 382 ++++++++ topobench/data/datasets/oc20_dataset.py | 110 +-- topobench/data/datasets/oc22_is2re_dataset.py | 416 +++++++++ .../loaders/graph/is2re_dataset_loader.py | 98 +++ .../data/loaders/graph/oc20_dataset_loader.py | 54 +- .../loaders/graph/oc20_dataset_loader_old.py | 831 ------------------ .../graph/oc22_is2re_dataset_loader.py | 98 +++ 11 files changed, 1123 insertions(+), 942 deletions(-) create mode 100644 test_data_migration.py create mode 100644 topobench/data/datasets/is2re_dataset.py create mode 100644 topobench/data/datasets/oc22_is2re_dataset.py create mode 100644 topobench/data/loaders/graph/is2re_dataset_loader.py delete mode 100644 topobench/data/loaders/graph/oc20_dataset_loader_old.py create mode 100644 topobench/data/loaders/graph/oc22_is2re_dataset_loader.py diff --git a/configs/dataset/graph/OC20_IS2RE.yaml b/configs/dataset/graph/OC20_IS2RE.yaml index 339f3e574..1edf54721 100644 --- a/configs/dataset/graph/OC20_IS2RE.yaml +++ b/configs/dataset/graph/OC20_IS2RE.yaml @@ -2,7 +2,7 @@ # Train/val/test splits are precomputed in the LMDB archive loader: - _target_: topobench.data.loaders.OC20DatasetLoader + _target_: topobench.data.loaders.graph.is2re_dataset_loader.IS2REDatasetLoader parameters: data_domain: graph data_type: oc20 @@ -12,6 +12,7 @@ loader: download: true legacy_format: false dtype: float32 + max_samples: 100 # Set to integer (e.g., 1000) to limit dataset size for fast experiments, or null for full dataset parameters: num_features: 6 # Will be determined by the actual data diff --git a/configs/dataset/graph/OC22_IS2RE.yaml 
b/configs/dataset/graph/OC22_IS2RE.yaml index 40acb07a9..fd9446c45 100644 --- a/configs/dataset/graph/OC22_IS2RE.yaml +++ b/configs/dataset/graph/OC22_IS2RE.yaml @@ -2,7 +2,7 @@ # Train/val/test splits are precomputed in the LMDB archive loader: - _target_: topobench.data.loaders.OC20DatasetLoader + _target_: topobench.data.loaders.graph.oc22_is2re_dataset_loader.OC22IS2REDatasetLoader parameters: data_domain: graph data_type: oc20 @@ -12,6 +12,7 @@ loader: download: true legacy_format: false dtype: float32 + max_samples: 100 # Set to integer (e.g., 1000) to limit dataset size for fast experiments, or null for full dataset parameters: num_features: 6 # Will be determined by the actual data diff --git a/test_data_migration.py b/test_data_migration.py new file mode 100644 index 000000000..e336b43f5 --- /dev/null +++ b/test_data_migration.py @@ -0,0 +1,59 @@ +"""Test script to verify Data object migration from old PyG format.""" + +import copy +import pickle +from pathlib import Path + +import lmdb +from torch_geometric.data import Data + +# Open the first LMDB file +lmdb_path = Path( + "/Users/theos/Documents/code/TopoBench_contrib/datasets/graph/oc20/OC22_IS2RE/is2res_total_train_val_test_lmdbs/data/oc22/is2re-total/train/data.0000.lmdb" +) + +env = lmdb.open( + str(lmdb_path.resolve()), + subdir=False, + readonly=True, + lock=False, + readahead=True, + meminit=False, + max_readers=1, +) + +with env.begin() as txn: + cursor = txn.cursor() + cursor.first() + key, value = cursor.item() + old_data = pickle.loads(value) + +print(f"Old data type: {type(old_data)}") +print(f"Old data __dict__ keys: {old_data.__dict__.keys()}") + +# Try to migrate using the new approach +if "_store" not in old_data.__dict__ or any( + k in old_data.__dict__ for k in ["x", "edge_index", "pos"] +): + print("Detected old format (attributes in __dict__)") + + data_dict = {} + for key, val in old_data.__dict__.items(): + if not key.startswith("_") and val is not None: + data_dict[key] = val + 
print(f" {key}: {type(val)}") + + # Create new Data object + new_data = Data(**data_dict) + print(f"\nNew data type: {type(new_data)}") + print(f"New data __dict__ keys: {new_data.__dict__.keys()}") + + # Try to copy + try: + copied_data = copy.copy(new_data) + print("✅ Copy successful!") + print(f"Copied data __dict__ keys: {copied_data.__dict__.keys()}") + except Exception as e: + print(f"❌ Copy failed: {e}") + +env.close() diff --git a/topobench/data/datasets/__init__.py b/topobench/data/datasets/__init__.py index 68a4d6345..fe8d09678 100644 --- a/topobench/data/datasets/__init__.py +++ b/topobench/data/datasets/__init__.py @@ -5,7 +5,7 @@ from pathlib import Path from typing import ClassVar -from torch_geometric.data import InMemoryDataset +from torch_geometric.data import Dataset, InMemoryDataset class DatasetManager: @@ -41,9 +41,7 @@ class DatasetManager: ] @classmethod - def discover_datasets( - cls, package_path: str - ) -> dict[str, type[InMemoryDataset]]: + def discover_datasets(cls, package_path: str) -> dict[str, type[Dataset]]: """Dynamically discover all dataset classes in the package. Parameters @@ -53,7 +51,7 @@ def discover_datasets( Returns ------- - Dict[str, Type[InMemoryDataset]] + Dict[str, Type[Dataset]] Dictionary mapping class names to their corresponding class objects. 
""" datasets = {} @@ -81,8 +79,9 @@ def discover_datasets( inspect.isclass(obj) and obj.__module__ == module.__name__ and not name.startswith("_") - and issubclass(obj, InMemoryDataset) + and issubclass(obj, Dataset) and obj != InMemoryDataset + and obj != Dataset ) } datasets.update(new_datasets) diff --git a/topobench/data/datasets/is2re_dataset.py b/topobench/data/datasets/is2re_dataset.py new file mode 100644 index 000000000..7a5fbfa65 --- /dev/null +++ b/topobench/data/datasets/is2re_dataset.py @@ -0,0 +1,382 @@ +"""Dataset class for Open Catalyst 2020 (OC20) IS2RE dataset.""" + +from __future__ import annotations + +import logging +import pickle +from collections.abc import Iterator +from pathlib import Path + +import torch +from omegaconf import DictConfig +from torch_geometric.data import Data, Dataset + +try: + import lmdb + + HAS_LMDB = True +except ImportError: + lmdb = None + HAS_LMDB = False + +logger = logging.getLogger(__name__) + + +class IS2REDataset(Dataset): + """Dataset class for Open Catalyst 2020 (OC20) IS2RE task. + + IS2RE (Initial Structure to Relaxed Energy) is a task for catalyst + discovery and materials science. + + The OC20 dataset contains DFT calculations for catalyst-adsorbate systems, + enabling machine learning models to predict energies for accelerated + materials discovery. + + Parameters + ---------- + root : str + Root directory where the dataset is stored. + name : str + Name of the dataset. + parameters : DictConfig + Configuration parameters for the dataset. 
+ """ + + def __init__( + self, + root: str, + name: str, + parameters: DictConfig, + ) -> None: + self.name = name + self.parameters = parameters + + # Task configuration + self.task = "is2re" + self.dtype = self._parse_dtype(parameters.get("dtype", "float32")) + self.legacy_format = parameters.get("legacy_format", False) + self.include_test = parameters.get("include_test", True) + + # Limit for fast experimentation + self.max_samples = parameters.get("max_samples", None) + if self.max_samples is not None: + self.max_samples = int(self.max_samples) + logger.info( + f"⚠️ Limiting dataset to {self.max_samples} samples for fast experimentation" + ) + + super().__init__(root) + + # Open LMDB environments + self._open_lmdbs() + + def __repr__(self) -> str: + return f"{self.name}(root={self.root}, task={self.task}, size={len(self)})" + + @staticmethod + def _parse_dtype(dtype) -> torch.dtype: + """Parse dtype parameter to torch.dtype.""" + if isinstance(dtype, str): + return getattr(torch, dtype) + return dtype + + def _get_data_paths(self) -> dict[str, list[Path]]: + """Get paths to LMDB files for each split. + + Returns + ------- + dict[str, list[Path]] + Dictionary mapping split names to lists of LMDB file paths. 
+ """ + root = Path(self.root) + paths = {"train": [], "val": [], "test": []} + + # The downloaded data is extracted to: + # root/is2res_train_val_test_lmdbs/data/is2re/all/{train,val_id,test_id}/data.lmdb + base_path = ( + root / "is2res_train_val_test_lmdbs" / "data" / "is2re" / "all" + ) + + if base_path.exists(): + # Train data + train_lmdb = base_path / "train" / "data.lmdb" + if train_lmdb.exists(): + paths["train"] = [train_lmdb] + + # Validation data + val_lmdb = base_path / "val_id" / "data.lmdb" + if val_lmdb.exists(): + paths["val"] = [val_lmdb] + + # Test data + test_lmdb = base_path / "test_id" / "data.lmdb" + if test_lmdb.exists(): + paths["test"] = [test_lmdb] + + return paths + + def _open_lmdbs(self): + """Open LMDB files and create split mappings.""" + if not HAS_LMDB: + raise ImportError( + "LMDB is required for IS2RE dataset. Install with: pip install lmdb" + ) + + paths = self._get_data_paths() + + # Initialize storage + self.envs = [] + self.cumulative_sizes = [0] + self.split_idx = {"train": [], "valid": [], "test": []} + + current_idx = 0 + + # Open train LMDBs + for lmdb_path in paths["train"]: + env, size = self._open_single_lmdb(lmdb_path) + # Apply max_samples limit if specified + if self.max_samples is not None: + size = min(size, self.max_samples) + self.envs.append((lmdb_path, env, size)) + self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) + self.split_idx["train"].extend( + range(current_idx, current_idx + size) + ) + current_idx += size + + # Open validation LMDBs + for lmdb_path in paths["val"]: + env, size = self._open_single_lmdb(lmdb_path) + # Apply max_samples limit if specified + if self.max_samples is not None: + size = min( + size, max(1, self.max_samples // 10) + ) # Use 10% for validation + self.envs.append((lmdb_path, env, size)) + self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) + self.split_idx["valid"].extend( + range(current_idx, current_idx + size) + ) + current_idx += size + + # Open test 
LMDBs + for lmdb_path in paths["test"]: + env, size = self._open_single_lmdb(lmdb_path) + # Apply max_samples limit if specified + if self.max_samples is not None: + size = min( + size, max(1, self.max_samples // 10) + ) # Use 10% for test + self.envs.append((lmdb_path, env, size)) + self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) + self.split_idx["test"].extend( + range(current_idx, current_idx + size) + ) + current_idx += size + + # If no test data, reuse validation indices + if not self.include_test or len(self.split_idx["test"]) == 0: + self.split_idx["test"] = list(self.split_idx["valid"]) + + # Convert to tensors + self.split_idx = { + k: torch.tensor(v, dtype=torch.long) + for k, v in self.split_idx.items() + } + + logger.info( + f"Loaded {self.task.upper()} dataset: " + f"{len(self.split_idx['train'])} train, " + f"{len(self.split_idx['valid'])} val, " + f"{len(self.split_idx['test'])} test" + ) + + def _open_single_lmdb(self, lmdb_path: Path) -> tuple: + """Open a single LMDB file and return (env, size). + + Parameters + ---------- + lmdb_path : Path + Path to LMDB file. + + Returns + ------- + tuple + (environment, size) tuple. + """ + env = lmdb.open( + str(lmdb_path.resolve()), + subdir=False, + readonly=True, + lock=False, + readahead=True, + meminit=False, + max_readers=1, + ) + size = env.stat()["entries"] + return env, size + + def _find_lmdb_and_local_idx(self, idx: int) -> tuple: + """Find which LMDB contains the given index and the local index within it. + + Parameters + ---------- + idx : int + Global dataset index. + + Returns + ------- + tuple + (lmdb_index, local_index) tuple. 
+ """ + if idx < 0 or idx >= len(self): + raise IndexError(f"Index {idx} out of range [0, {len(self)})") + + # Binary search for the LMDB + left, right = 0, len(self.envs) + while left < right - 1: + mid = (left + right) // 2 + if self.cumulative_sizes[mid] <= idx: + left = mid + else: + right = mid + + lmdb_idx = left + local_idx = idx - self.cumulative_sizes[lmdb_idx] + return lmdb_idx, local_idx + + def len(self) -> int: + """Get dataset length.""" + return self.cumulative_sizes[-1] + + def get(self, idx: int) -> Data: + """Get data sample at index. + + Parameters + ---------- + idx : int + Sample index. + + Returns + ------- + Data + PyTorch Geometric Data object. + """ + lmdb_idx, local_idx = self._find_lmdb_and_local_idx(idx) + lmdb_path, env, _ = self.envs[lmdb_idx] + + with env.begin() as txn: + cursor = txn.cursor() + if not cursor.first(): + raise RuntimeError(f"Empty LMDB at {lmdb_path}") + + # Navigate to the target entry + for _ in range(local_idx): + if not cursor.next(): + raise RuntimeError( + f"Index {local_idx} out of range in {lmdb_path}" + ) + + key, value = cursor.item() + data = pickle.loads(value) + + # Convert old PyG Data objects to new format by extracting all attributes + if isinstance(data, Data): + try: + # Check if this is old format data + # Old PyG format has attributes directly in __dict__ without proper _store + if "_store" not in data.__dict__ or any( + k in data.__dict__ for k in ["x", "edge_index", "pos"] + ): + # Extract all data attributes + data_dict = {} + # Get all tensor/data attributes from __dict__ + data_dict = { + key: val + for key, val in data.__dict__.items() + if not key.startswith("_") and val is not None + } + # Convert y_relaxed to y before creating new Data object + if "y_relaxed" in data_dict: + data_dict["y"] = torch.tensor( + [data_dict["y_relaxed"]] + ).float() + elif "y" not in data_dict: + data_dict["y"] = torch.tensor([float("nan")]).float() + + # Use atomic numbers as node features (x) + if 
"atomic_numbers" in data_dict: + data_dict["x"] = ( + data_dict["atomic_numbers"].view(-1, 1).float() + ) + elif "x" not in data_dict and "pos" in data_dict: + data_dict["x"] = torch.ones( + (data_dict["pos"].shape[0], 1) + ) + + # Create edge_index from atomic positions using radius graph + if "edge_index" not in data_dict and "pos" in data_dict: + from torch_geometric.nn import radius_graph + + data_dict["edge_index"] = radius_graph( + data_dict["pos"], r=5.0, max_num_neighbors=50 + ) + + # Keep only standard PyG attributes + standard_attrs = [ + "x", + "edge_index", + "edge_attr", + "pos", + "y", + "batch", + ] + cleaned_dict = { + k: v + for k, v in data_dict.items() + if k in standard_attrs + } + + # Create a completely new Data object with current PyG format + data = Data(**cleaned_dict) + except (AttributeError, KeyError, RuntimeError, TypeError): + # If extraction fails, pass through + pass + + # Convert to legacy format if needed + if self.legacy_format and isinstance(data, Data): + data = Data( + **{k: v for k, v in data.__dict__.items() if v is not None} + ) + + return data + + def __len__(self) -> int: + """Get dataset length.""" + return self.len() + + def __getitem__(self, idx: int) -> Data: + """Get item at index.""" + return self.get(idx) + + def __iter__(self) -> Iterator[Data]: + """Iterate over dataset.""" + for i in range(len(self)): + yield self[i] + + def __del__(self): + """Clean up LMDB environments.""" + if hasattr(self, "envs"): + for _, env, _ in self.envs: + env.close() + + @property + def num_node_features(self) -> int: + """Number of node features per atom.""" + # Will be determined by the actual data + return 1 # Atomic numbers + + @property + def num_classes(self) -> int: + """Number of classes (regression task).""" + return 1 # Single regression target (energy) diff --git a/topobench/data/datasets/oc20_dataset.py b/topobench/data/datasets/oc20_dataset.py index 9f34dfec0..b886616bf 100644 --- a/topobench/data/datasets/oc20_dataset.py 
+++ b/topobench/data/datasets/oc20_dataset.py @@ -26,8 +26,8 @@ class OC20Dataset(Dataset): """Dataset class for Open Catalyst 2020 (OC20) family. - Supports S2EF (Structure to Energy and Forces) and IS2RE (Initial Structure - to Relaxed Energy) tasks for catalyst discovery and materials science. + Supports S2EF (Structure to Energy and Forces) task for catalyst + discovery and materials science. The OC20 dataset contains DFT calculations for catalyst-adsorbate systems, enabling machine learning models to predict energies and forces for @@ -74,37 +74,38 @@ def __init__( # Task configuration self.task = parameters.get("task", "s2ef").lower() + if self.task != "s2ef": + raise ValueError(f"Unsupported task: {self.task}") self.dtype = self._parse_dtype(parameters.get("dtype", "float32")) self.legacy_format = parameters.get("legacy_format", False) self.include_test = parameters.get("include_test", True) # S2EF-specific configuration - if self.task == "s2ef": - self.train_split = parameters.get("train_split", "200K") - if self.train_split not in self.VALID_TRAIN_SPLITS: + self.train_split = parameters.get("train_split", "200K") + if self.train_split not in self.VALID_TRAIN_SPLITS: + raise ValueError( + f"Invalid S2EF train split: {self.train_split}. " + f"Choose from {self.VALID_TRAIN_SPLITS}" + ) + + # Parse validation splits + val_splits = parameters.get("val_splits", None) + if val_splits is None: + self.val_splits = self.VALID_VAL_SPLITS + elif isinstance(val_splits, str): + self.val_splits = [val_splits] + else: + self.val_splits = list(val_splits) + + # Validate splits + for vs in self.val_splits: + if vs not in self.VALID_VAL_SPLITS: raise ValueError( - f"Invalid S2EF train split: {self.train_split}. " - f"Choose from {self.VALID_TRAIN_SPLITS}" + f"Invalid S2EF val split: {vs}. 
" + f"Choose from {self.VALID_VAL_SPLITS}" ) - # Parse validation splits - val_splits = parameters.get("val_splits", None) - if val_splits is None: - self.val_splits = self.VALID_VAL_SPLITS - elif isinstance(val_splits, str): - self.val_splits = [val_splits] - else: - self.val_splits = list(val_splits) - - # Validate splits - for vs in self.val_splits: - if vs not in self.VALID_VAL_SPLITS: - raise ValueError( - f"Invalid S2EF val split: {vs}. " - f"Choose from {self.VALID_VAL_SPLITS}" - ) - - self.test_split = parameters.get("test_split", "test") + self.test_split = parameters.get("test_split", "test") # Limit for fast experimentation self.max_samples = parameters.get("max_samples", None) @@ -143,48 +144,23 @@ def _get_data_paths(self) -> dict[str, list[Path]]: root = Path(self.root) paths = {"train": [], "val": [], "test": []} - if self.task == "s2ef": - # Training data path structure - train_subdir = f"s2ef_train_{self.train_split}" - train_dir = ( - root / "s2ef" / self.train_split / train_subdir / train_subdir - ) - paths["train"] = sorted(train_dir.glob("**/*.lmdb")) - - # Validation data paths - for val_split in self.val_splits: - val_subdir = f"s2ef_{val_split}" - val_dir = root / "s2ef" / "all" / val_subdir / val_subdir - paths["val"].extend(sorted(val_dir.glob("**/*.lmdb"))) - - # Test data path - if self.include_test: - test_dir = root / "s2ef" / "all" / "s2ef_test" / "s2ef_test" - paths["test"] = sorted(test_dir.glob("**/*.lmdb")) - - elif self.task in ["is2re", "oc22_is2re"]: - # IS2RE datasets have different structure - base_dir = root / ( - "is2re" if self.task == "is2re" else "oc22_is2re" - ) - - # Try different possible directory structures - for possible_base in [ - base_dir, - base_dir / "data" / "is2re", - root / "data" / "is2re", - ]: - if possible_base.exists(): - paths["train"] = sorted( - (possible_base / "train").glob("**/*.lmdb") - ) - paths["val"] = sorted( - (possible_base / "val_id").glob("**/*.lmdb") - ) or sorted((possible_base / 
"val").glob("**/*.lmdb")) - paths["test"] = sorted( - (possible_base / "test_id").glob("**/*.lmdb") - ) or sorted((possible_base / "test").glob("**/*.lmdb")) - break + # Training data path structure + train_subdir = f"s2ef_train_{self.train_split}" + train_dir = ( + root / "s2ef" / self.train_split / train_subdir / train_subdir + ) + paths["train"] = sorted(train_dir.glob("**/*.lmdb")) + + # Validation data paths + for val_split in self.val_splits: + val_subdir = f"s2ef_{val_split}" + val_dir = root / "s2ef" / "all" / val_subdir / val_subdir + paths["val"].extend(sorted(val_dir.glob("**/*.lmdb"))) + + # Test data path + if self.include_test: + test_dir = root / "s2ef" / "all" / "s2ef_test" / "s2ef_test" + paths["test"] = sorted(test_dir.glob("**/*.lmdb")) return paths diff --git a/topobench/data/datasets/oc22_is2re_dataset.py b/topobench/data/datasets/oc22_is2re_dataset.py new file mode 100644 index 000000000..83ba50dc5 --- /dev/null +++ b/topobench/data/datasets/oc22_is2re_dataset.py @@ -0,0 +1,416 @@ +"""Dataset class for Open Catalyst 2022 (OC22) IS2RE dataset.""" + +from __future__ import annotations + +import logging +import pickle +from collections.abc import Iterator +from pathlib import Path + +import torch +from omegaconf import DictConfig +from torch_geometric.data import Data, Dataset + +try: + import lmdb + + HAS_LMDB = True +except ImportError: + lmdb = None + HAS_LMDB = False + +logger = logging.getLogger(__name__) + + +class OC22IS2REDataset(Dataset): + """Dataset class for Open Catalyst 2022 (OC22) IS2RE task. + + IS2RE (Initial Structure to Relaxed Energy) is a task for catalyst + discovery and materials science. + + The OC22 dataset contains DFT calculations for catalyst-adsorbate systems, + enabling machine learning models to predict energies for accelerated + materials discovery. + + Parameters + ---------- + root : str + Root directory where the dataset is stored. + name : str + Name of the dataset. 
+ parameters : DictConfig + Configuration parameters for the dataset. + """ + + def __init__( + self, + root: str, + name: str, + parameters: DictConfig, + ) -> None: + self.name = name + self.parameters = parameters + + # Task configuration + self.task = "oc22_is2re" + self.dtype = self._parse_dtype(parameters.get("dtype", "float32")) + self.legacy_format = parameters.get("legacy_format", False) + self.include_test = parameters.get("include_test", True) + + # Limit for fast experimentation + self.max_samples = parameters.get("max_samples", None) + if self.max_samples is not None: + self.max_samples = int(self.max_samples) + logger.info( + f"⚠️ Limiting dataset to {self.max_samples} samples for fast experimentation" + ) + + super().__init__(root) + + # Open LMDB environments + self._open_lmdbs() + + def __repr__(self) -> str: + return f"{self.name}(root={self.root}, task={self.task}, size={len(self)})" + + @staticmethod + def _parse_dtype(dtype) -> torch.dtype: + """Parse dtype parameter to torch.dtype.""" + if isinstance(dtype, str): + return getattr(torch, dtype) + return dtype + + def _get_data_paths(self) -> dict[str, list[Path]]: + """Get paths to LMDB files for each split. + + Returns + ------- + dict[str, list[Path]] + Dictionary mapping split names to lists of LMDB file paths. 
+ """ + root = Path(self.root) + paths = {"train": [], "val": [], "test": []} + + # The downloaded data is extracted to: + # root/is2res_total_train_val_test_lmdbs/data/oc22/is2re-total/{train,val_id,test_id}/*.lmdb + base_path = ( + root + / "is2res_total_train_val_test_lmdbs" + / "data" + / "oc22" + / "is2re-total" + ) + + if base_path.exists(): + # Train data - multiple LMDB files + train_dir = base_path / "train" + if train_dir.exists(): + paths["train"] = sorted(train_dir.glob("*.lmdb")) + + # Validation data - using val_id split + val_dir = base_path / "val_id" + if val_dir.exists(): + paths["val"] = sorted(val_dir.glob("*.lmdb")) + + # Test data - using test_id split + test_dir = base_path / "test_id" + if test_dir.exists(): + paths["test"] = sorted(test_dir.glob("*.lmdb")) + + return paths + + def _open_lmdbs(self): + """Open LMDB files and create split mappings.""" + if not HAS_LMDB: + raise ImportError( + "LMDB is required for OC22 IS2RE dataset. Install with: pip install lmdb" + ) + + paths = self._get_data_paths() + + # Initialize storage + self.envs = [] + self.cumulative_sizes = [0] + self.split_idx = {"train": [], "valid": [], "test": []} + + current_idx = 0 + + # Open train LMDBs with cumulative max_samples limiting + train_samples_remaining = ( + self.max_samples if self.max_samples is not None else None + ) + for lmdb_path in paths["train"]: + if ( + train_samples_remaining is not None + and train_samples_remaining <= 0 + ): + break + env, size = self._open_single_lmdb(lmdb_path) + # Apply remaining limit + if train_samples_remaining is not None: + size = min(size, train_samples_remaining) + train_samples_remaining -= size + self.envs.append((lmdb_path, env, size)) + self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) + self.split_idx["train"].extend( + range(current_idx, current_idx + size) + ) + current_idx += size + + # Open validation LMDBs with cumulative max_samples limiting (10% of max_samples) + val_samples_remaining = ( + 
max(1, self.max_samples // 10) + if self.max_samples is not None + else None + ) + for lmdb_path in paths["val"]: + if ( + val_samples_remaining is not None + and val_samples_remaining <= 0 + ): + break + env, size = self._open_single_lmdb(lmdb_path) + # Apply remaining limit + if val_samples_remaining is not None: + size = min(size, val_samples_remaining) + val_samples_remaining -= size + self.envs.append((lmdb_path, env, size)) + self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) + self.split_idx["valid"].extend( + range(current_idx, current_idx + size) + ) + current_idx += size + + # Open test LMDBs with cumulative max_samples limiting (10% of max_samples) + test_samples_remaining = ( + max(1, self.max_samples // 10) + if self.max_samples is not None + else None + ) + for lmdb_path in paths["test"]: + if ( + test_samples_remaining is not None + and test_samples_remaining <= 0 + ): + break + env, size = self._open_single_lmdb(lmdb_path) + # Apply remaining limit + if test_samples_remaining is not None: + size = min(size, test_samples_remaining) + test_samples_remaining -= size + self.envs.append((lmdb_path, env, size)) + self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) + self.split_idx["test"].extend( + range(current_idx, current_idx + size) + ) + current_idx += size + + # If no test data, reuse validation indices + if not self.include_test or len(self.split_idx["test"]) == 0: + self.split_idx["test"] = list(self.split_idx["valid"]) + + # Convert to tensors + self.split_idx = { + k: torch.tensor(v, dtype=torch.long) + for k, v in self.split_idx.items() + } + + logger.info( + f"Loaded {self.task.upper()} dataset: " + f"{len(self.split_idx['train'])} train, " + f"{len(self.split_idx['valid'])} val, " + f"{len(self.split_idx['test'])} test" + ) + + def _open_single_lmdb(self, lmdb_path: Path) -> tuple: + """Open a single LMDB file and return (env, size). + + Parameters + ---------- + lmdb_path : Path + Path to LMDB file. 
+ + Returns + ------- + tuple + (environment, size) tuple. + """ + env = lmdb.open( + str(lmdb_path.resolve()), + subdir=False, + readonly=True, + lock=False, + readahead=True, + meminit=False, + max_readers=1, + ) + size = env.stat()["entries"] + return env, size + + def _find_lmdb_and_local_idx(self, idx: int) -> tuple: + """Find which LMDB contains the given index and the local index within it. + + Parameters + ---------- + idx : int + Global dataset index. + + Returns + ------- + tuple + (lmdb_index, local_index) tuple. + """ + if idx < 0 or idx >= len(self): + raise IndexError(f"Index {idx} out of range [0, {len(self)})") + + # Binary search for the LMDB + left, right = 0, len(self.envs) + while left < right - 1: + mid = (left + right) // 2 + if self.cumulative_sizes[mid] <= idx: + left = mid + else: + right = mid + + lmdb_idx = left + local_idx = idx - self.cumulative_sizes[lmdb_idx] + return lmdb_idx, local_idx + + def len(self) -> int: + """Get dataset length.""" + return self.cumulative_sizes[-1] + + def get(self, idx: int) -> Data: + """Get data sample at index. + + Parameters + ---------- + idx : int + Sample index. + + Returns + ------- + Data + PyTorch Geometric Data object. 
+ """ + lmdb_idx, local_idx = self._find_lmdb_and_local_idx(idx) + lmdb_path, env, _ = self.envs[lmdb_idx] + + with env.begin() as txn: + cursor = txn.cursor() + if not cursor.first(): + raise RuntimeError(f"Empty LMDB at {lmdb_path}") + + # Navigate to the target entry + for _ in range(local_idx): + if not cursor.next(): + raise RuntimeError( + f"Index {local_idx} out of range in {lmdb_path}" + ) + + key, value = cursor.item() + data = pickle.loads(value) + + # Convert old PyG Data objects to new format by extracting all attributes + if isinstance(data, Data): + try: + # Check if this is old format data + # Old PyG format has attributes directly in __dict__ without proper _store + if "_store" not in data.__dict__ or any( + k in data.__dict__ for k in ["x", "edge_index", "pos"] + ): + # Extract all data attributes + data_dict = {} + # Get all tensor/data attributes from __dict__ + data_dict = { + key: val + for key, val in data.__dict__.items() + if not key.startswith("_") and val is not None + } + + + # Convert y_relaxed to y before creating new Data object + if "y_relaxed" in data_dict: + data_dict["y"] = torch.tensor( + [data_dict["y_relaxed"]] + ).float() + elif "y" not in data_dict: + data_dict["y"] = torch.tensor([float("nan")]).float() + + # Use atomic numbers as node features (x) + if "atomic_numbers" in data_dict: + data_dict["x"] = ( + data_dict["atomic_numbers"].view(-1, 1).float() + ) + elif "x" not in data_dict and "pos" in data_dict: + data_dict["x"] = torch.ones( + (data_dict["pos"].shape[0], 1) + ) + + # Create edge_index from atomic positions using radius graph + if "edge_index" not in data_dict and "pos" in data_dict: + from torch_geometric.nn import radius_graph + + data_dict["edge_index"] = radius_graph( + data_dict["pos"], r=5.0, max_num_neighbors=50 + ) + + # Keep only standard PyG attributes + standard_attrs = [ + "x", + "edge_index", + "edge_attr", + "pos", + "y", + "batch", + ] + cleaned_dict = { + k: v + for k, v in data_dict.items() + if 
k in standard_attrs + } + + # Create a completely new Data object with current PyG format + data = Data(**cleaned_dict) + + except (AttributeError, KeyError, RuntimeError, TypeError): + # If extraction fails, pass through + pass + + # Convert to legacy format if needed + if self.legacy_format and isinstance(data, Data): + data = Data( + **{k: v for k, v in data.__dict__.items() if v is not None} + ) + + return data + + def __len__(self) -> int: + """Get dataset length.""" + return self.len() + + def __getitem__(self, idx: int) -> Data: + """Get item at index.""" + return self.get(idx) + + def __iter__(self) -> Iterator[Data]: + """Iterate over dataset.""" + for i in range(len(self)): + yield self[i] + + def __del__(self): + """Clean up LMDB environments.""" + if hasattr(self, "envs"): + for _, env, _ in self.envs: + env.close() + + @property + def num_node_features(self) -> int: + """Number of node features per atom.""" + # Will be determined by the actual data + return 1 # Atomic numbers + + @property + def num_classes(self) -> int: + """Number of classes (regression task).""" + return 1 # Single regression target (energy) diff --git a/topobench/data/loaders/graph/is2re_dataset_loader.py b/topobench/data/loaders/graph/is2re_dataset_loader.py new file mode 100644 index 000000000..a567f501a --- /dev/null +++ b/topobench/data/loaders/graph/is2re_dataset_loader.py @@ -0,0 +1,98 @@ +"""Loader for OC20 IS2RE dataset.""" + +import logging +from pathlib import Path + +from omegaconf import DictConfig +from torch_geometric.data import Dataset + +from topobench.data.datasets.is2re_dataset import IS2REDataset +from topobench.data.loaders.base import AbstractLoader +from topobench.data.utils.oc20_download import download_is2re_dataset + +logger = logging.getLogger(__name__) + + +class IS2REDatasetLoader(AbstractLoader): + """Load OC20 IS2RE dataset. 
+ + Parameters + ---------- + parameters : DictConfig + Configuration parameters containing: + - data_dir: Root directory for data + - data_name: Name of the dataset + - download: Whether to download if not present (default: False) + - dtype: Data type for tensors (default: "float32") + - legacy_format: Use legacy PyG Data format (default: False) + - max_samples: Limit dataset size for testing (default: None) + """ + + def __init__(self, parameters: DictConfig) -> None: + super().__init__(parameters) + + def load_dataset(self) -> Dataset: + """Load the IS2RE dataset. + + Returns + ------- + Dataset + The loaded IS2RE dataset with the appropriate configuration. + + Raises + ------ + RuntimeError + If dataset loading fails. + """ + # Download if requested + if self.parameters.get("download", False): + self._download_dataset() + + # Initialize LMDB dataset + dataset = self._initialize_dataset() + self.data_dir = self._redefine_data_dir(dataset) + return dataset + + def _download_dataset(self): + """Download the IS2RE dataset.""" + root = Path(self.get_data_dir()) + download_is2re_dataset(root=root, task="is2re") + + def _initialize_dataset(self) -> IS2REDataset: + """Initialize the IS2RE dataset. + + Returns + ------- + IS2REDataset + The initialized IS2RE dataset. + + Raises + ------ + RuntimeError + If dataset initialization fails. + """ + try: + dataset = IS2REDataset( + root=str(self.get_data_dir()), + name=self.parameters.data_name, + parameters=self.parameters, + ) + return dataset + except Exception as e: + msg = f"Error initializing IS2RE dataset: {e}" + raise RuntimeError(msg) from e + + def _redefine_data_dir(self, dataset: Dataset) -> Path: + """Redefine the data directory based on dataset configuration. + + Parameters + ---------- + dataset : Dataset + The IS2RE dataset instance. + + Returns + ------- + Path + The redefined data directory path. 
+ """ + return self.get_data_dir() diff --git a/topobench/data/loaders/graph/oc20_dataset_loader.py b/topobench/data/loaders/graph/oc20_dataset_loader.py index 12d0dc124..7d7514ef3 100644 --- a/topobench/data/loaders/graph/oc20_dataset_loader.py +++ b/topobench/data/loaders/graph/oc20_dataset_loader.py @@ -1,4 +1,4 @@ -"""Loader for OC20 family datasets (S2EF/IS2RE).""" +"""Loader for OC20 S2EF dataset.""" import logging from pathlib import Path @@ -10,7 +10,6 @@ from topobench.data.datasets.oc20_dataset import OC20Dataset from topobench.data.loaders.base import AbstractLoader from topobench.data.utils.oc20_download import ( - download_is2re_dataset, download_s2ef_dataset, ) @@ -28,12 +27,10 @@ class OC20DatasetLoader(AbstractLoader): - """Load OC20 family datasets for catalyst discovery and materials science. + """Load OC20 S2EF dataset for catalyst discovery and materials science. - Supports: - - S2EF (Structure to Energy and Forces): Predict energy/forces from atomic structure - - IS2RE (Initial Structure to Relaxed Energy): Predict relaxed energy - - OC22 IS2RE: Extended IS2RE dataset + Supports S2EF (Structure to Energy and Forces) to predict energy/forces + from atomic structure. 
Parameters ---------- @@ -41,15 +38,10 @@ class OC20DatasetLoader(AbstractLoader): Configuration parameters containing: - data_dir: Root directory for data - data_name: Name of the dataset - - task: Task type ("s2ef", "is2re", "oc22_is2re") - download: Whether to download if not present (default: False) - - For S2EF: - train_split: Training split size ("200K", "2M", "20M", "all") - val_splits: List of validation splits or None for all - include_test: Whether to download test split (default: True) - - Additional options: - dtype: Data type for tensors (default: "float32") - legacy_format: Use legacy PyG Data format (default: False) - max_samples: Limit dataset size for testing (default: None) @@ -77,12 +69,11 @@ def load_dataset(self) -> Dataset: # Check if we have LMDB files or need ASE DB fallback data_root = Path(self.get_data_dir()) - task = self.parameters.get("task", "s2ef").lower() # Try LMDB first lmdb_present = any(data_root.glob("**/*.lmdb")) - if not lmdb_present and task == "s2ef" and HAS_ASEDB: + if not lmdb_present and HAS_ASEDB: # Fallback to ASE DB dataset logger.info("No LMDB files found, using ASE DB backend") return self._load_asedb_dataset(data_root) @@ -93,31 +84,22 @@ def load_dataset(self) -> Dataset: return dataset def _download_dataset(self): - """Download the OC20 dataset based on task configuration.""" - task = self.parameters.get("task", "s2ef").lower() + """Download the S2EF dataset.""" root = Path(self.get_data_dir()) + train_split = self.parameters.get("train_split", "200K") + val_splits = self.parameters.get("val_splits", None) + include_test = self.parameters.get("include_test", True) - if task == "s2ef": - train_split = self.parameters.get("train_split", "200K") - val_splits = self.parameters.get("val_splits", None) - include_test = self.parameters.get("include_test", True) - - # Parse val_splits - if val_splits is not None and isinstance(val_splits, str): - val_splits = [val_splits] + # Parse val_splits + if val_splits is not None 
and isinstance(val_splits, str): + val_splits = [val_splits] - download_s2ef_dataset( - root=root, - train_split=train_split, - val_splits=val_splits, - include_test=include_test, - ) - elif task in ["is2re", "oc22_is2re"]: - download_is2re_dataset(root=root, task=task) - else: - raise ValueError( - f"Unknown task: {task}. Choose from ['s2ef', 'is2re', 'oc22_is2re']" - ) + download_s2ef_dataset( + root=root, + train_split=train_split, + val_splits=val_splits, + include_test=include_test, + ) def _initialize_dataset(self) -> OC20Dataset: """Initialize the OC20 dataset. diff --git a/topobench/data/loaders/graph/oc20_dataset_loader_old.py b/topobench/data/loaders/graph/oc20_dataset_loader_old.py deleted file mode 100644 index cb6d9c51b..000000000 --- a/topobench/data/loaders/graph/oc20_dataset_loader_old.py +++ /dev/null @@ -1,831 +0,0 @@ -"""Loader for OC20 family datasets (S2EF/IS2RE). - -This loader integrates the Open Catalyst 2020 (OC20/OC22) datasets into TopoBench. - -Supported tasks: -- S2EF (Structure to Energy and Forces): Predict energy/forces from atomic structure - - Train splits: 200K, 2M, 20M, all - - Validation splits: val_id, val_ood_ads, val_ood_cat, val_ood_both (can aggregate) - - Test split: test (can be optionally skipped with include_test=False) - - Automatic preprocessing from extxyz to ASE DB format - -The ASE DB backend with PyG conversion is used for efficient data loading. 
-""" - -from __future__ import annotations - -import logging - -# Added missing imports -import lzma # needed by _uncompress_xz -import os -import pickle # needed to deserialize LMDB records -import shutil # needed by _uncompress_xz -import tarfile -import urllib.request -from collections.abc import Iterator -from concurrent.futures import ThreadPoolExecutor, as_completed -from pathlib import Path - -import torch -from omegaconf import DictConfig -from torch_geometric.data import Data, Dataset -from tqdm import tqdm - -try: - import lmdb # LMDB backend - - HAS_LMDB = True -except ImportError: # optional dependency - lmdb = None # type: ignore - HAS_LMDB = False - -from topobench.data.loaders.base import AbstractLoader - -# ASE DB fallback dataset -from topobench.data.loaders.graph.oc20_asedbs2ef_loader import OC20ASEDBDataset -from topobench.data.preprocessor.oc20_s2ef_preprocessor import ( - HAS_ASE, - needs_preprocessing, - preprocess_s2ef_split_ase, -) - -logger = logging.getLogger(__name__) - -# OC20 dataset split URLs -# S2EF dataset URLs -S2EF_TRAIN_SPLITS = { - "200K": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_200K.tar", - "2M": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_2M.tar", - "20M": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_20M.tar", - "all": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_all.tar", -} - -S2EF_VAL_SPLITS = { - "val_id": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_id.tar", - "val_ood_ads": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_ads.tar", - "val_ood_cat": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_cat.tar", - "val_ood_both": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_both.tar", -} - -S2EF_TEST_SPLIT = "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_test_lmdbs.tar.gz" - -# IS2RE dataset URLs (contains train/val/test in one 
archive) -IS2RE_URL = "https://dl.fbaipublicfiles.com/opencatalystproject/data/is2res_train_val_test_lmdbs.tar.gz" -OC22_IS2RE_URL = "https://dl.fbaipublicfiles.com/opencatalystproject/data/oc22/is2res_total_train_val_test_lmdbs.tar.gz" - -CACHE_DIR = Path.home() / ".cache" / "oc20" - - -def _uncompress_xz(file_path: str) -> str: - """Decompress .xz files. - - Parameters - ---------- - file_path : str - Path to file to decompress. - - Returns - ------- - str - Path to decompressed file. - """ - if not file_path.endswith(".xz"): - return file_path - - output_path = file_path.replace(".xz", "") - try: - with ( - lzma.open(file_path, "rb") as f_in, - open(output_path, "wb") as f_out, - ): - shutil.copyfileobj(f_in, f_out) - os.remove(file_path) - return output_path - except Exception as e: - logger.error(f"Error uncompressing {file_path}: {e}") - return file_path - - -def _download_and_extract( - url: str, target_dir: Path, skip_if_extracted: bool = True -) -> Path: - """Download and extract a tar archive. - - Parameters - ---------- - url : str - URL to download from. - target_dir : Path - Directory to extract to. - skip_if_extracted : bool - If True, skip extraction if extracted files already exist (default: True). - - Returns - ------- - Path - Path to extracted directory. 
- """ - target_dir.mkdir(parents=True, exist_ok=True) - target_file = target_dir / os.path.basename(url) - - # Download if needed - if not target_file.exists(): - logger.info(f"Downloading {url}...") - with tqdm( - unit="B", unit_scale=True, desc=f"Downloading {target_file.name}" - ) as pbar: - - def report(block_num, block_size, total_size): - if total_size > 0 and block_num == 0: - pbar.total = total_size - pbar.update(block_size) - - urllib.request.urlretrieve(url, target_file, reporthook=report) - else: - logger.info(f"Archive {target_file.name} already downloaded") - - # Check if extraction is needed - # Look for extracted subdirectories (common pattern: archive extracts to a subdirectory) - extraction_marker = target_dir / ".extracted" - if skip_if_extracted and extraction_marker.exists(): - logger.info(f"Archive {target_file.name} already extracted, skipping") - return target_dir - - # Extract - logger.info(f"Extracting {target_file.name}...") - if str(target_file).endswith((".tar.gz", ".tgz")): - with tarfile.open(target_file, "r:gz") as tar: - tar.extractall(path=target_dir) - elif str(target_file).endswith(".tar"): - with tarfile.open(target_file, "r:") as tar: - tar.extractall(path=target_dir) - else: - raise ValueError(f"Unsupported archive format: {target_file}") - - # Mark as extracted - extraction_marker.touch() - return target_dir - - -class _OC20LMDBDataset(Dataset): - """LMDB-based dataset for OC20/OC22. - - Supports: - - S2EF task with flexible train/val/test split specification - - IS2RE/OC22_IS2RE tasks with pre-computed train/val/test splits - """ - - def __init__( - self, - root: str | Path, - task: str = "s2ef", - train_split: str | None = "200K", - val_splits: list[str] | None = None, - test_split: str = "test", - download: bool = True, - include_test: bool = True, - dtype: torch.dtype = torch.float32, - legacy_format: bool = False, - ): - """Initialize OC20 LMDB dataset. 
- - Parameters - ---------- - root : str | Path - Root directory for storing datasets. - task : str - Task type: "s2ef", "is2re", or "oc22_is2re". - train_split : Optional[str] - For S2EF: one of ["200K", "2M", "20M", "all"]. - For IS2RE: ignored (uses precomputed split). - val_splits : Optional[list[str]] - For S2EF: list of validation splits to use. - Can be ["val_id", "val_ood_ads", "val_ood_cat", "val_ood_both"] or subset. - If None, uses all 4 validation splits. - For IS2RE: ignored (uses precomputed split). - test_split : str - For S2EF: "test" (default). - For IS2RE: ignored (uses precomputed split). - download : bool - Whether to download if not present. - include_test : bool - Whether to download/include test split. If False, validation indices are reused for test. - dtype : torch.dtype - Data type for tensors. - legacy_format : bool - Whether to use legacy PyG Data format. - """ - super().__init__() - self.root = Path(root) - self.task = task.lower() - self.dtype = dtype - self.legacy_format = legacy_format - self.download_flag = download - self.include_test = include_test - - if self.task == "s2ef": - if train_split not in S2EF_TRAIN_SPLITS: - raise ValueError( - f"Invalid S2EF train split: {train_split}. " - f"Choose from {list(S2EF_TRAIN_SPLITS.keys())}" - ) - self.train_split = train_split - - # Default: use all validation splits - if val_splits is None: - val_splits = list(S2EF_VAL_SPLITS.keys()) - else: - for vs in val_splits: - if vs not in S2EF_VAL_SPLITS: - raise ValueError( - f"Invalid S2EF val split: {vs}. " - f"Choose from {list(S2EF_VAL_SPLITS.keys())}" - ) - self.val_splits = val_splits - self.test_split = test_split - - elif self.task in ["is2re", "oc22_is2re"]: - # IS2RE datasets have precomputed train/val/test splits - pass - else: - raise ValueError( - f"Unknown task: {task}. 
Choose from ['s2ef', 'is2re', 'oc22_is2re']" - ) - - if download: - self._download_and_prepare() - - self._open_lmdbs() - - def _download_and_prepare(self): - """Download and prepare the dataset based on task.""" - if self.task == "s2ef": - self._download_s2ef() - elif self.task == "is2re": - self._download_is2re(IS2RE_URL, "is2re") - elif self.task == "oc22_is2re": - self._download_is2re(OC22_IS2RE_URL, "oc22_is2re") - - def _download_s2ef(self): - """Download S2EF train, validation, and test splits.""" - # Download train split - # Check for the actual data directory structure: s2ef/{split}/s2ef_train_{split}/s2ef_train_{split}/ - train_url = S2EF_TRAIN_SPLITS[self.train_split] - train_subdir_name = f"s2ef_train_{self.train_split}" - train_dir = ( - self.root - / "s2ef" - / self.train_split - / train_subdir_name - / train_subdir_name - ) - if not train_dir.exists(): - logger.info(f"Downloading S2EF train split: {self.train_split}") - _download_and_extract( - train_url, self.root / "s2ef" / self.train_split - ) - self._decompress_xz_files(self.root / "s2ef" / self.train_split) - else: - logger.info( - f"S2EF train split {self.train_split} already exists, skipping download" - ) - - # Download validation splits - for val_split in self.val_splits: - val_url = S2EF_VAL_SPLITS[val_split] - # Check for the actual data directory structure: s2ef/all/s2ef_{val_split}/s2ef_{val_split}/ - val_subdir_name = f"s2ef_{val_split}" - val_dir = ( - self.root / "s2ef" / "all" / val_subdir_name / val_subdir_name - ) - if not val_dir.exists(): - logger.info(f"Downloading S2EF validation split: {val_split}") - _download_and_extract(val_url, self.root / "s2ef" / "all") - self._decompress_xz_files(self.root / "s2ef" / "all") - else: - logger.info( - f"S2EF validation split {val_split} already exists, skipping download" - ) - - # Download test split - test_subdir_name = "s2ef_test" - test_dir = ( - self.root / "s2ef" / "all" / test_subdir_name / test_subdir_name - ) - if self.include_test 
and not test_dir.exists(): - logger.info("Downloading S2EF test split") - _download_and_extract(S2EF_TEST_SPLIT, self.root / "s2ef" / "all") - self._decompress_xz_files(self.root / "s2ef" / "all") - elif self.include_test and test_dir.exists(): - logger.info("S2EF test split already exists, skipping download") - elif not self.include_test: - logger.info( - "Skipping S2EF test split download (include_test=False); will reuse validation as test" - ) - - # Preprocess S2EF data (convert extxyz/txt to LMDB if needed) - self._preprocess_s2ef() - - def _preprocess_s2ef(self): - """Preprocess S2EF data from extxyz/txt files to ASE DB format. - - This method checks for raw extxyz files and converts them to ASE DB files - for efficient loading. It processes train, validation, and test splits. - """ - if not HAS_ASE: - logger.warning("ASE not available. Cannot preprocess S2EF data.") - return - - s2ef_root = self.root / "s2ef" - - if not s2ef_root.exists(): - logger.warning(f"S2EF data directory not found: {s2ef_root}") - return - - # Get preprocessing parameters (use defaults since they're not available) - num_workers = 1 - max_neigh = 50 - radius = 6.0 - - # Process training data - # The actual data is in s2ef/{train_split}/s2ef_train_{train_split}/s2ef_train_{train_split}/ - train_base = s2ef_root / self.train_split - train_subdir_name = f"s2ef_train_{self.train_split}" - train_dir = train_base / train_subdir_name / train_subdir_name - - if train_dir.exists() and needs_preprocessing(train_dir): - logger.info(f"Preprocessing S2EF training data: {train_dir}") - preprocess_s2ef_split_ase( - data_path=train_dir, - out_path=train_dir, - num_workers=num_workers, - ref_energy=True, - test_data=False, - max_neigh=max_neigh, - radius=radius, - ) - - # Process validation splits - # The actual data is in s2ef/all/s2ef_{val_split}/s2ef_{val_split}/ - for val_split in self.val_splits: - val_base = s2ef_root / "all" - val_subdir_name = f"s2ef_{val_split}" - val_dir = val_base / 
val_subdir_name / val_subdir_name - - if val_dir.exists() and needs_preprocessing(val_dir): - logger.info(f"Preprocessing S2EF validation data: {val_dir}") - preprocess_s2ef_split_ase( - data_path=val_dir, - out_path=val_dir, - num_workers=num_workers, - ref_energy=True, - test_data=False, - max_neigh=max_neigh, - radius=radius, - ) - - # Process test split if needed - if self.include_test: - test_base = s2ef_root / "all" - test_subdir_name = "s2ef_test" - test_dir = test_base / test_subdir_name / test_subdir_name - - if test_dir.exists() and needs_preprocessing(test_dir): - logger.info(f"Preprocessing S2EF test data: {test_dir}") - preprocess_s2ef_split_ase( - data_path=test_dir, - out_path=test_dir, - num_workers=num_workers, - ref_energy=False, # Test data typically doesn't have energy/forces - test_data=True, - max_neigh=max_neigh, - radius=radius, - ) - - logger.info("S2EF preprocessing complete") - - def _decompress_xz_files(self, directory: Path): - """Decompress all .xz files in a directory.""" - xz_files = list(directory.glob("**/*.xz")) - if xz_files: - logger.info( - f"Decompressing {len(xz_files)} .xz files in {directory}..." 
- ) - num_workers = max(1, os.cpu_count() - 1) - # Use threads to avoid pickling/import issues with processes on macOS - with ThreadPoolExecutor(max_workers=num_workers) as executor: - futures = [ - executor.submit(_uncompress_xz, str(f)) for f in xz_files - ] - for future in as_completed(futures): - future.result() - - def _download_is2re(self, url: str, name: str): - """Download IS2RE or OC22 IS2RE dataset.""" - target_dir = self.root / name - if not target_dir.exists(): - logger.info(f"Downloading {name} dataset") - _download_and_extract(url, self.root) - self._decompress_xz_files(self.root) - - def _open_lmdbs(self): - """Open LMDB files for train/val/test splits.""" - if self.task == "s2ef": - self._open_s2ef_lmdbs() - elif self.task in ["is2re", "oc22_is2re"]: - self._open_is2re_lmdbs() - - def _open_s2ef_lmdbs(self): - """Open S2EF LMDB files and create split mappings.""" - # Train - train_dir = self.root / "s2ef" / self.train_split / "train" - train_lmdbs = self._collect_lmdb_files(train_dir) - - # Validation (can be multiple) - val_lmdbs = [] - for val_split in self.val_splits: - val_dir = self.root / "s2ef" / "all" / val_split - val_lmdbs.extend(self._collect_lmdb_files(val_dir)) - - # Test - test_dir = self.root / "s2ef" / "all" / "test" - test_lmdbs = ( - self._collect_lmdb_files(test_dir) if self.include_test else [] - ) - - # Open all LMDBs and create split index mapping - self.envs = [] - self.cumulative_sizes = [0] - self.split_idx = {"train": [], "valid": [], "test": []} - - current_idx = 0 - - # Process train LMDBs - for lmdb_path in train_lmdbs: - env, size = self._open_single_lmdb(lmdb_path) - self.envs.append((lmdb_path, env, size)) - self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) - self.split_idx["train"].extend( - range(current_idx, current_idx + size) - ) - current_idx += size - - # Process validation LMDBs - for lmdb_path in val_lmdbs: - env, size = self._open_single_lmdb(lmdb_path) - self.envs.append((lmdb_path, env, size)) 
- self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) - self.split_idx["valid"].extend( - range(current_idx, current_idx + size) - ) - current_idx += size - - # Process test LMDBs - for lmdb_path in test_lmdbs: - env, size = self._open_single_lmdb(lmdb_path) - self.envs.append((lmdb_path, env, size)) - self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) - self.split_idx["test"].extend( - range(current_idx, current_idx + size) - ) - current_idx += size - - # If no test data, reuse validation indices - if not self.include_test or len(self.split_idx["test"]) == 0: - self.split_idx["test"] = list(self.split_idx["valid"]) - - # Convert to tensors - self.split_idx = { - k: torch.tensor(v, dtype=torch.long) - for k, v in self.split_idx.items() - } - - logger.info( - f"Loaded S2EF dataset: {len(self.split_idx['train'])} train, " - f"{len(self.split_idx['valid'])} val, {len(self.split_idx['test'])} test" - ) - - def _open_is2re_lmdbs(self): - """Open IS2RE LMDB files with precomputed splits.""" - # IS2RE datasets have structure: data/is2re/train, data/is2re/val_id, data/is2re/test_id - # or data/is2re/all/train, etc. 
- base_dir = self.root / ( - "is2re" if self.task == "is2re" else "oc22_is2re" - ) - - # Try different possible structures - possible_structures = [ - base_dir, - base_dir / "data" / "is2re", - self.root / "data" / "is2re", - ] - - found_dir = None - for poss_dir in possible_structures: - if poss_dir.exists(): - found_dir = poss_dir - break - - if found_dir is None: - raise ValueError(f"Cannot find IS2RE data directory in {base_dir}") - - # Look for train/val/test subdirectories - train_lmdbs = self._collect_lmdb_files(found_dir / "train") - val_lmdbs = self._collect_lmdb_files( - found_dir / "val_id" - ) or self._collect_lmdb_files(found_dir / "val") - test_lmdbs = self._collect_lmdb_files( - found_dir / "test_id" - ) or self._collect_lmdb_files(found_dir / "test") - - # Open all LMDBs - self.envs = [] - self.cumulative_sizes = [0] - self.split_idx = {"train": [], "valid": [], "test": []} - - current_idx = 0 - - for lmdb_path in train_lmdbs: - env, size = self._open_single_lmdb(lmdb_path) - self.envs.append((lmdb_path, env, size)) - self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) - self.split_idx["train"].extend( - range(current_idx, current_idx + size) - ) - current_idx += size - - for lmdb_path in val_lmdbs: - env, size = self._open_single_lmdb(lmdb_path) - self.envs.append((lmdb_path, env, size)) - self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) - self.split_idx["valid"].extend( - range(current_idx, current_idx + size) - ) - current_idx += size - - for lmdb_path in test_lmdbs: - env, size = self._open_single_lmdb(lmdb_path) - self.envs.append((lmdb_path, env, size)) - self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) - self.split_idx["test"].extend( - range(current_idx, current_idx + size) - ) - current_idx += size - - # Convert to tensors - self.split_idx = { - k: torch.tensor(v, dtype=torch.long) - for k, v in self.split_idx.items() - } - - logger.info( - f"Loaded {self.task.upper()} dataset: 
{len(self.split_idx['train'])} train, " - f"{len(self.split_idx['valid'])} val, {len(self.split_idx['test'])} test" - ) - - def _collect_lmdb_files(self, directory: Path) -> list[Path]: - """Collect all .lmdb files in a directory.""" - if not directory.exists(): - return [] - lmdb_files = sorted(directory.glob("**/*.lmdb")) - return lmdb_files - - def _open_single_lmdb(self, lmdb_path: Path) -> tuple: - """Open a single LMDB file and return (env, size).""" - env = lmdb.open( - str(lmdb_path.resolve()), - subdir=False, - readonly=True, - lock=False, - readahead=True, - meminit=False, - max_readers=1, - ) - size = env.stat()["entries"] - return env, size - - def _find_lmdb_and_local_idx(self, idx: int) -> tuple: - if idx < 0 or idx >= len(self): - raise IndexError(f"Index {idx} out of range [0, {len(self)})") - - left, right = 0, len(self.envs) - while left < right - 1: - mid = (left + right) // 2 - if self.cumulative_sizes[mid] <= idx: - left = mid - else: - right = mid - - lmdb_idx = left - local_idx = idx - self.cumulative_sizes[lmdb_idx] - return lmdb_idx, local_idx - - def len(self) -> int: - return self.cumulative_sizes[-1] - - def get(self, idx: int) -> Data: - lmdb_idx, local_idx = self._find_lmdb_and_local_idx(idx) - lmdb_path, env, _ = self.envs[lmdb_idx] - - with env.begin() as txn: - cursor = txn.cursor() - if not cursor.first(): - raise RuntimeError(f"Empty LMDB at {lmdb_path}") - - for _ in range(local_idx): - if not cursor.next(): - raise RuntimeError( - f"Index {local_idx} out of range in {lmdb_path}" - ) - - key, value = cursor.item() - data = pickle.loads(value) - - if self.legacy_format and isinstance(data, Data): - data = Data( - **{k: v for k, v in data.__dict__.items() if v is not None} - ) - - return data - - def __len__(self) -> int: - return self.len() - - def __getitem__(self, idx: int) -> Data: - return self.get(idx) - - def __iter__(self) -> Iterator[Data]: - for i in range(len(self)): - yield self[i] - - def __del__(self): - if 
hasattr(self, "envs"): - for _, env, _ in self.envs: - env.close() - - -class OC20DatasetLoader(AbstractLoader): - """Load OC20 family datasets. - - This loader supports all OC20/OC22 dataset splits including S2EF and IS2RE tasks. - - Parameters in the Hydra config (dataset.loader.parameters): - - data_domain: graph - - data_type: oc20 - - data_name: Logical name for the dataset (e.g., OC20_S2EF_200K) - - task: "s2ef", "is2re", or "oc22_is2re" - - For S2EF task: - - train_split: one of ["200K", "2M", "20M", "all"] - - val_splits: list of validation splits (default: all 4) - Options: ["val_id", "val_ood_ads", "val_ood_cat", "val_ood_both"] - - test_split: "test" (default) - - For IS2RE/OC22 tasks: - - Uses precomputed train/val/test splits from the LMDB archives - - Common parameters: - - download: whether to download (default: false) - - legacy_format: whether to use legacy PyG Data format (default: false) - - dtype: torch dtype (default: "float32") - - max_samples: limit dataset size for fast experimentation (default: None = all samples) - """ - - def __init__(self, parameters: DictConfig) -> None: - super().__init__(parameters) - - def load_dataset(self) -> Dataset: - """Load OC20 dataset (S2EF or IS2RE). - - Returns - ------- - Dataset - Loaded dataset with appropriate splits. 
- """ - task: str = getattr(self.parameters, "task", "s2ef") - download: bool = bool(getattr(self.parameters, "download", False)) - legacy_format: bool = bool( - getattr(self.parameters, "legacy_format", False) - ) - dtype = getattr(self.parameters, "dtype", "float32") - dtype_t = ( - getattr(torch, str(dtype)) if isinstance(dtype, str) else dtype - ) - max_samples = getattr(self.parameters, "max_samples", None) - if max_samples is not None: - max_samples = int(max_samples) - print( - f"⚠️ Limiting dataset to {max_samples} samples for fast experimentation" - ) - - if task == "s2ef": - train_split = getattr(self.parameters, "train_split", "200K") - val_splits_param = getattr(self.parameters, "val_splits", None) - - # Parse val_splits - if val_splits_param is None: - val_splits = None # Use all by default - elif isinstance(val_splits_param, str): - # Single validation split as string - val_splits = [val_splits_param] - elif isinstance(val_splits_param, (list, tuple)): - val_splits = list(val_splits_param) - else: - val_splits = None - - test_split = getattr(self.parameters, "test_split", "test") - include_test = bool(getattr(self.parameters, "include_test", True)) - - ds = _OC20LMDBDataset( - root=self.get_data_dir(), - task="s2ef", - train_split=train_split, - val_splits=val_splits, - test_split=test_split, - download=download, - include_test=include_test, - dtype=dtype_t, - legacy_format=legacy_format, - ) - - # ASE DB fallback if LMDBs are not present - data_root = Path(self.get_data_dir()) - lmdb_present = any((data_root / "s2ef").glob("**/*.lmdb")) - if not lmdb_present and HAS_ASE: - # Preprocessing is already done in _OC20LMDBDataset if needed - # Now collect DB files - train_subdir_name = f"s2ef_train_{train_split}" - train_dir = ( - data_root - / "s2ef" - / train_split - / train_subdir_name - / train_subdir_name - ) - train_dbs = sorted(train_dir.glob("*.db")) - val_dbs = [] - # Respect empty list for val_splits (for fast prototyping) - val_splits_to_use = ( 
- list(S2EF_VAL_SPLITS.keys()) - if val_splits is None - else val_splits - ) - for vs in val_splits_to_use: - val_subdir_name = f"s2ef_{vs}" - val_dir = ( - data_root - / "s2ef" - / "all" - / val_subdir_name - / val_subdir_name - ) - val_dbs.extend(sorted(val_dir.glob("*.db"))) - test_dbs = [] - if include_test: - test_dbs = sorted( - (data_root / "s2ef" / "all" / "test").glob("*.db") - ) - - if train_dbs: - logger.info( - f"Using ASE DB backend: {len(train_dbs)} train, {len(val_dbs)} val, {len(test_dbs)} test DB files" - ) - return OC20ASEDBDataset( - train_db_paths=[str(p) for p in train_dbs], - val_db_paths=[str(p) for p in val_dbs], - test_db_paths=[str(p) for p in test_dbs], - max_neigh=int( - getattr(self.parameters, "max_neigh", 50) - ), - radius=float(getattr(self.parameters, "radius", 6.0)), - dtype=dtype_t, - include_energy=True, - include_forces=True, - max_samples=max_samples, - ) - elif task in ["is2re", "oc22_is2re"]: - ds = _OC20LMDBDataset( - root=self.get_data_dir(), - task=task, - download=download, - dtype=dtype_t, - legacy_format=legacy_format, - ) - else: - raise ValueError( - f"Unsupported task '{task}'. Use 's2ef', 'is2re', or 'oc22_is2re'." - ) - - return ds # type: ignore[return-value] - - def get_data_dir(self) -> Path: - """Get data directory path. - - Returns - ------- - Path - Path to data directory. 
- """ - # Keep default directory convention for TopoBench - return Path(super().get_data_dir()) diff --git a/topobench/data/loaders/graph/oc22_is2re_dataset_loader.py b/topobench/data/loaders/graph/oc22_is2re_dataset_loader.py new file mode 100644 index 000000000..90e0646db --- /dev/null +++ b/topobench/data/loaders/graph/oc22_is2re_dataset_loader.py @@ -0,0 +1,98 @@ +"""Loader for OC22 IS2RE dataset.""" + +import logging +from pathlib import Path + +from omegaconf import DictConfig +from torch_geometric.data import Dataset + +from topobench.data.datasets.oc22_is2re_dataset import OC22IS2REDataset +from topobench.data.loaders.base import AbstractLoader +from topobench.data.utils.oc20_download import download_is2re_dataset + +logger = logging.getLogger(__name__) + + +class OC22IS2REDatasetLoader(AbstractLoader): + """Load OC22 IS2RE dataset. + + Parameters + ---------- + parameters : DictConfig + Configuration parameters containing: + - data_dir: Root directory for data + - data_name: Name of the dataset + - download: Whether to download if not present (default: False) + - dtype: Data type for tensors (default: "float32") + - legacy_format: Use legacy PyG Data format (default: False) + - max_samples: Limit dataset size for testing (default: None) + """ + + def __init__(self, parameters: DictConfig) -> None: + super().__init__(parameters) + + def load_dataset(self) -> Dataset: + """Load the OC22 IS2RE dataset. + + Returns + ------- + Dataset + The loaded OC22 IS2RE dataset with the appropriate configuration. + + Raises + ------ + RuntimeError + If dataset loading fails. 
+ """ + # Download if requested + if self.parameters.get("download", False): + self._download_dataset() + + # Initialize LMDB dataset + dataset = self._initialize_dataset() + self.data_dir = self._redefine_data_dir(dataset) + return dataset + + def _download_dataset(self): + """Download the OC22 IS2RE dataset.""" + root = Path(self.get_data_dir()) + download_is2re_dataset(root=root, task="oc22_is2re") + + def _initialize_dataset(self) -> OC22IS2REDataset: + """Initialize the OC22 IS2RE dataset. + + Returns + ------- + OC22IS2REDataset + The initialized OC22 IS2RE dataset. + + Raises + ------ + RuntimeError + If dataset initialization fails. + """ + try: + dataset = OC22IS2REDataset( + root=str(self.get_data_dir()), + name=self.parameters.data_name, + parameters=self.parameters, + ) + return dataset + except Exception as e: + msg = f"Error initializing OC22 IS2RE dataset: {e}" + raise RuntimeError(msg) from e + + def _redefine_data_dir(self, dataset: Dataset) -> Path: + """Redefine the data directory based on dataset configuration. + + Parameters + ---------- + dataset : Dataset + The OC22 IS2RE dataset instance. + + Returns + ------- + Path + The redefined data directory path. 
+ """ + return self.get_data_dir() From bf3d2793db2c2e3c390179f24d3a952baa73d0f7 Mon Sep 17 00:00:00 2001 From: theosaulus Date: Fri, 21 Nov 2025 17:17:36 -0500 Subject: [PATCH 06/17] format --- topobench/data/datasets/oc22_is2re_dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/topobench/data/datasets/oc22_is2re_dataset.py b/topobench/data/datasets/oc22_is2re_dataset.py index 83ba50dc5..5a8e79eda 100644 --- a/topobench/data/datasets/oc22_is2re_dataset.py +++ b/topobench/data/datasets/oc22_is2re_dataset.py @@ -328,7 +328,6 @@ def get(self, idx: int) -> Data: if not key.startswith("_") and val is not None } - # Convert y_relaxed to y before creating new Data object if "y_relaxed" in data_dict: data_dict["y"] = torch.tensor( From 42adabf186ff7265da203e2fd1c6eaefe24bbb5a Mon Sep 17 00:00:00 2001 From: theosaulus Date: Fri, 21 Nov 2025 20:21:49 -0500 Subject: [PATCH 07/17] renaming and tests --- ...EF_train_200K.yaml => OC20_S2EF_200K.yaml} | 5 +- ...S2EF_train_20M.yaml => OC20_S2EF_20M.yaml} | 0 ...0_S2EF_train_2M.yaml => OC20_S2EF_2M.yaml} | 0 ...S2EF_train_all.yaml => OC20_S2EF_all.yaml} | 0 configs/dataset/graph/OC20_S2EF_val_id.yaml | 38 -- test/data/load/test_oc20_datasets.py | 533 ++++++++++++++++++ test/pipeline/test_pipeline.py | 108 +++- ...is2re_dataset.py => oc20_is2re_dataset.py} | 0 ...loader.py => oc20_is2re_dataset_loader.py} | 2 +- 9 files changed, 620 insertions(+), 66 deletions(-) rename configs/dataset/graph/{OC20_S2EF_train_200K.yaml => OC20_S2EF_200K.yaml} (90%) rename configs/dataset/graph/{OC20_S2EF_train_20M.yaml => OC20_S2EF_20M.yaml} (100%) rename configs/dataset/graph/{OC20_S2EF_train_2M.yaml => OC20_S2EF_2M.yaml} (100%) rename configs/dataset/graph/{OC20_S2EF_train_all.yaml => OC20_S2EF_all.yaml} (100%) delete mode 100644 configs/dataset/graph/OC20_S2EF_val_id.yaml create mode 100644 test/data/load/test_oc20_datasets.py rename topobench/data/datasets/{is2re_dataset.py => oc20_is2re_dataset.py} (100%) rename 
topobench/data/loaders/graph/{is2re_dataset_loader.py => oc20_is2re_dataset_loader.py} (97%) diff --git a/configs/dataset/graph/OC20_S2EF_train_200K.yaml b/configs/dataset/graph/OC20_S2EF_200K.yaml similarity index 90% rename from configs/dataset/graph/OC20_S2EF_train_200K.yaml rename to configs/dataset/graph/OC20_S2EF_200K.yaml index bee121d9b..f0bb236d2 100644 --- a/configs/dataset/graph/OC20_S2EF_train_200K.yaml +++ b/configs/dataset/graph/OC20_S2EF_200K.yaml @@ -31,12 +31,9 @@ parameters: split_params: learning_setting: inductive - split_type: random # random (random split of train set) or fixed (used for official splits) + split_type: fixed # Splits are provided by the dataset data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} data_seed: 0 - train_prop: 0.6 - val_prop: 0.2 - test_prop: 0.2 dataloader_params: batch_size: 32 diff --git a/configs/dataset/graph/OC20_S2EF_train_20M.yaml b/configs/dataset/graph/OC20_S2EF_20M.yaml similarity index 100% rename from configs/dataset/graph/OC20_S2EF_train_20M.yaml rename to configs/dataset/graph/OC20_S2EF_20M.yaml diff --git a/configs/dataset/graph/OC20_S2EF_train_2M.yaml b/configs/dataset/graph/OC20_S2EF_2M.yaml similarity index 100% rename from configs/dataset/graph/OC20_S2EF_train_2M.yaml rename to configs/dataset/graph/OC20_S2EF_2M.yaml diff --git a/configs/dataset/graph/OC20_S2EF_train_all.yaml b/configs/dataset/graph/OC20_S2EF_all.yaml similarity index 100% rename from configs/dataset/graph/OC20_S2EF_train_all.yaml rename to configs/dataset/graph/OC20_S2EF_all.yaml diff --git a/configs/dataset/graph/OC20_S2EF_val_id.yaml b/configs/dataset/graph/OC20_S2EF_val_id.yaml deleted file mode 100644 index 9a113b196..000000000 --- a/configs/dataset/graph/OC20_S2EF_val_id.yaml +++ /dev/null @@ -1,38 +0,0 @@ -# OC20 S2EF dataset with 200K training samples -# Validation: only val_id split (for faster testing/iteration) -# Test: official test split - -loader: - _target_: 
topobench.data.loaders.OC20DatasetLoader - parameters: - data_domain: graph - data_type: oc20 - data_name: OC20_S2EF_200K_val_id - data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} - task: s2ef - train_split: "200K" - val_splits: ["val_id"] # Use only val_id for faster validation - test_split: "test" - download: true - legacy_format: false - dtype: float32 - -parameters: - num_features: 6 # Will be determined by the actual data - num_classes: 1 - task: regression - loss_type: mse - monitor_metric: mae - task_level: graph - -split_params: - learning_setting: inductive - split_type: fixed # splits are provided by the dataset - data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} - data_seed: 0 - -dataloader_params: - batch_size: 32 - num_workers: 4 - pin_memory: true - persistent_workers: true diff --git a/test/data/load/test_oc20_datasets.py b/test/data/load/test_oc20_datasets.py new file mode 100644 index 000000000..2f5f1b125 --- /dev/null +++ b/test/data/load/test_oc20_datasets.py @@ -0,0 +1,533 @@ +"""Unit tests for OC20 and OC22 dataset loaders.""" + +import pytest +import torch +import hydra +from pathlib import Path +from omegaconf import DictConfig + +from topobench.data.loaders.graph.oc20_is2re_dataset_loader import IS2REDatasetLoader +from topobench.data.loaders.graph.oc22_is2re_dataset_loader import OC22IS2REDatasetLoader +from topobench.data.loaders.graph.oc20_dataset_loader import OC20DatasetLoader +from topobench.data.datasets.oc20_is2re_dataset import IS2REDataset +from topobench.data.datasets.oc22_is2re_dataset import OC22IS2REDataset +from topobench.data.datasets.oc20_dataset import OC20Dataset + + +class TestOC20IS2REDatasetLoader: + """Test suite for OC20 IS2RE dataset loader.""" + + @pytest.fixture(autouse=True) + def setup(self): + """Setup test environment.""" + hydra.core.global_hydra.GlobalHydra.instance().clear() + self.relative_config_dir = 
"../../../configs" + + def test_loader_initialization(self): + """Test that the IS2RE loader can be initialized.""" + with hydra.initialize( + version_base="1.3", + config_path=self.relative_config_dir, + job_name="test_oc20_is2re" + ): + cfg = hydra.compose( + config_name="run.yaml", + overrides=["dataset=graph/OC20_IS2RE"], + return_hydra_config=True, + ) + loader = hydra.utils.instantiate(cfg.dataset.loader) + assert isinstance(loader, IS2REDatasetLoader) + assert loader.parameters.data_name == "OC20_IS2RE" + assert loader.parameters.task == "is2re" + + def test_dataset_loading(self): + """Test that the IS2RE dataset loads correctly.""" + with hydra.initialize( + version_base="1.3", + config_path=self.relative_config_dir, + job_name="test_oc20_is2re_load" + ): + cfg = hydra.compose( + config_name="run.yaml", + overrides=["dataset=graph/OC20_IS2RE"], + return_hydra_config=True, + ) + loader = hydra.utils.instantiate(cfg.dataset.loader) + dataset, data_dir = loader.load() + + # Check dataset type + assert isinstance(dataset, IS2REDataset) + + # Check dataset has required attributes + assert hasattr(dataset, 'split_idx') + assert 'train' in dataset.split_idx + assert 'valid' in dataset.split_idx + assert 'test' in dataset.split_idx + + # Check splits are not empty (when max_samples is set) + assert len(dataset.split_idx['train']) > 0 + assert len(dataset.split_idx['valid']) > 0 + assert len(dataset.split_idx['test']) > 0 + + # Check dataset length + assert len(dataset) > 0 + + def test_dataset_item_access(self): + """Test accessing individual items from the dataset.""" + with hydra.initialize( + version_base="1.3", + config_path=self.relative_config_dir, + job_name="test_oc20_is2re_item" + ): + cfg = hydra.compose( + config_name="run.yaml", + overrides=["dataset=graph/OC20_IS2RE"], + return_hydra_config=True, + ) + loader = hydra.utils.instantiate(cfg.dataset.loader) + dataset, _ = loader.load() + + # Get first item + data = dataset[0] + + # Check data has required 
PyG attributes + assert hasattr(data, 'x') + assert hasattr(data, 'edge_index') + assert hasattr(data, 'y') + + # Check data types + assert isinstance(data.x, torch.Tensor) + assert isinstance(data.edge_index, torch.Tensor) + assert isinstance(data.y, torch.Tensor) + + # Check shapes + assert data.x.dim() >= 1 + assert data.edge_index.dim() == 2 + assert data.edge_index.size(0) == 2 # [2, num_edges] + + def test_split_indices_validity(self): + """Test that split indices are valid and non-overlapping.""" + with hydra.initialize( + version_base="1.3", + config_path=self.relative_config_dir, + job_name="test_oc20_is2re_splits" + ): + cfg = hydra.compose( + config_name="run.yaml", + overrides=["dataset=graph/OC20_IS2RE"], + return_hydra_config=True, + ) + loader = hydra.utils.instantiate(cfg.dataset.loader) + dataset, _ = loader.load() + + train_idx = dataset.split_idx['train'].numpy() + val_idx = dataset.split_idx['valid'].numpy() + test_idx = dataset.split_idx['test'].numpy() + + # Check no overlap between splits + assert len(set(train_idx) & set(val_idx)) == 0 + assert len(set(train_idx) & set(test_idx)) == 0 + # val and test might overlap if test reuses val when test is not available + + # Check all indices are within dataset bounds + all_indices = list(train_idx) + list(val_idx) + assert all(0 <= idx < len(dataset) for idx in all_indices) + + +class TestOC22IS2REDatasetLoader: + """Test suite for OC22 IS2RE dataset loader.""" + + @pytest.fixture(autouse=True) + def setup(self): + """Setup test environment.""" + hydra.core.global_hydra.GlobalHydra.instance().clear() + self.relative_config_dir = "../../../configs" + + def test_loader_initialization(self): + """Test that the OC22 IS2RE loader can be initialized.""" + with hydra.initialize( + version_base="1.3", + config_path=self.relative_config_dir, + job_name="test_oc22_is2re" + ): + cfg = hydra.compose( + config_name="run.yaml", + overrides=["dataset=graph/OC22_IS2RE"], + return_hydra_config=True, + ) + loader = 
hydra.utils.instantiate(cfg.dataset.loader) + assert isinstance(loader, OC22IS2REDatasetLoader) + assert loader.parameters.data_name == "OC22_IS2RE" + assert loader.parameters.task == "oc22_is2re" + + def test_dataset_loading(self): + """Test that the OC22 IS2RE dataset loads correctly.""" + with hydra.initialize( + version_base="1.3", + config_path=self.relative_config_dir, + job_name="test_oc22_is2re_load" + ): + cfg = hydra.compose( + config_name="run.yaml", + overrides=["dataset=graph/OC22_IS2RE"], + return_hydra_config=True, + ) + loader = hydra.utils.instantiate(cfg.dataset.loader) + dataset, data_dir = loader.load() + + # Check dataset type + assert isinstance(dataset, OC22IS2REDataset) + + # Check dataset has required attributes + assert hasattr(dataset, 'split_idx') + assert 'train' in dataset.split_idx + assert 'valid' in dataset.split_idx + assert 'test' in dataset.split_idx + + # Check splits are not empty (when max_samples is set) + assert len(dataset.split_idx['train']) > 0 + assert len(dataset.split_idx['valid']) > 0 + assert len(dataset.split_idx['test']) > 0 + + # Check dataset length + assert len(dataset) > 0 + + def test_dataset_item_access(self): + """Test accessing individual items from the OC22 dataset.""" + with hydra.initialize( + version_base="1.3", + config_path=self.relative_config_dir, + job_name="test_oc22_is2re_item" + ): + cfg = hydra.compose( + config_name="run.yaml", + overrides=["dataset=graph/OC22_IS2RE"], + return_hydra_config=True, + ) + loader = hydra.utils.instantiate(cfg.dataset.loader) + dataset, _ = loader.load() + + # Get first item + data = dataset[0] + + # Check data has required PyG attributes + assert hasattr(data, 'x') + assert hasattr(data, 'edge_index') + assert hasattr(data, 'y') + + # Check data types + assert isinstance(data.x, torch.Tensor) + assert isinstance(data.edge_index, torch.Tensor) + assert isinstance(data.y, torch.Tensor) + + # Check shapes + assert data.x.dim() >= 1 + assert data.edge_index.dim() == 2 
+ assert data.edge_index.size(0) == 2 + + def test_split_indices_validity(self): + """Test that split indices are valid and non-overlapping.""" + with hydra.initialize( + version_base="1.3", + config_path=self.relative_config_dir, + job_name="test_oc22_is2re_splits" + ): + cfg = hydra.compose( + config_name="run.yaml", + overrides=["dataset=graph/OC22_IS2RE"], + return_hydra_config=True, + ) + loader = hydra.utils.instantiate(cfg.dataset.loader) + dataset, _ = loader.load() + + train_idx = dataset.split_idx['train'].numpy() + val_idx = dataset.split_idx['valid'].numpy() + test_idx = dataset.split_idx['test'].numpy() + + # Check no overlap between train and val + assert len(set(train_idx) & set(val_idx)) == 0 + + # Check all indices are within dataset bounds + all_indices = list(train_idx) + list(val_idx) + assert all(0 <= idx < len(dataset) for idx in all_indices) + + +class TestOC20S2EFDatasetLoader: + """Test suite for OC20 S2EF dataset loader.""" + + @pytest.fixture(autouse=True) + def setup(self): + """Setup test environment.""" + hydra.core.global_hydra.GlobalHydra.instance().clear() + self.relative_config_dir = "../../../configs" + + def test_loader_initialization_200k(self): + """Test that the S2EF 200K loader can be initialized.""" + with hydra.initialize( + version_base="1.3", + config_path=self.relative_config_dir, + job_name="test_oc20_s2ef_200k" + ): + cfg = hydra.compose( + config_name="run.yaml", + overrides=["dataset=graph/OC20_S2EF_train_200K"], + return_hydra_config=True, + ) + loader = hydra.utils.instantiate(cfg.dataset.loader) + assert isinstance(loader, OC20DatasetLoader) + assert loader.parameters.task == "s2ef" + assert loader.parameters.train_split == "200K" + + def test_dataset_loading_200k(self): + """Test that the S2EF 200K dataset loads correctly.""" + with hydra.initialize( + version_base="1.3", + config_path=self.relative_config_dir, + job_name="test_oc20_s2ef_200k_load" + ): + cfg = hydra.compose( + config_name="run.yaml", + 
overrides=["dataset=graph/OC20_S2EF_train_200K"], + return_hydra_config=True, + ) + loader = hydra.utils.instantiate(cfg.dataset.loader) + dataset, data_dir = loader.load() + + # Check dataset type + assert isinstance(dataset, OC20Dataset) + + # Check dataset has required attributes + assert hasattr(dataset, 'split_idx') + assert 'train' in dataset.split_idx + assert 'valid' in dataset.split_idx + assert 'test' in dataset.split_idx + + # Check splits are not empty + assert len(dataset.split_idx['train']) > 0 + assert len(dataset.split_idx['valid']) > 0 + assert len(dataset.split_idx['test']) > 0 + + # Check dataset length + assert len(dataset) > 0 + + def test_dataset_item_access_s2ef(self): + """Test accessing individual items from the S2EF dataset.""" + with hydra.initialize( + version_base="1.3", + config_path=self.relative_config_dir, + job_name="test_oc20_s2ef_item" + ): + cfg = hydra.compose( + config_name="run.yaml", + overrides=["dataset=graph/OC20_S2EF_train_200K"], + return_hydra_config=True, + ) + loader = hydra.utils.instantiate(cfg.dataset.loader) + dataset, _ = loader.load() + + # Get first item + data = dataset[0] + + # Check data has required PyG attributes + assert hasattr(data, 'x') + assert hasattr(data, 'edge_index') + + # Check data types + assert isinstance(data.x, torch.Tensor) + assert isinstance(data.edge_index, torch.Tensor) + + # Check shapes + assert data.x.dim() >= 1 + assert data.edge_index.dim() == 2 + assert data.edge_index.size(0) == 2 + + def test_validation_splits_configuration(self): + """Test that validation splits can be configured.""" + with hydra.initialize( + version_base="1.3", + config_path=self.relative_config_dir, + job_name="test_oc20_s2ef_val_splits" + ): + # Test with val_id only + cfg = hydra.compose( + config_name="run.yaml", + overrides=[ + "dataset=graph/OC20_S2EF_val_id", + ], + return_hydra_config=True, + ) + loader = hydra.utils.instantiate(cfg.dataset.loader) + assert loader.parameters.val_splits == ["val_id"] 
+ + def test_split_indices_validity_s2ef(self): + """Test that S2EF split indices are valid.""" + with hydra.initialize( + version_base="1.3", + config_path=self.relative_config_dir, + job_name="test_oc20_s2ef_splits" + ): + cfg = hydra.compose( + config_name="run.yaml", + overrides=["dataset=graph/OC20_S2EF_train_200K"], + return_hydra_config=True, + ) + loader = hydra.utils.instantiate(cfg.dataset.loader) + dataset, _ = loader.load() + + train_idx = dataset.split_idx['train'].numpy() + val_idx = dataset.split_idx['valid'].numpy() + test_idx = dataset.split_idx['test'].numpy() + + # Check no overlap between train and val + assert len(set(train_idx) & set(val_idx)) == 0 + + # Check all indices are within dataset bounds + all_indices = list(train_idx) + list(val_idx) + assert all(0 <= idx < len(dataset) for idx in all_indices) + + def test_different_train_splits(self): + """Test that different training split sizes can be loaded.""" + train_splits = ["200K", "2M", "20M", "all"] + + for split in train_splits[:2]: # Test only 200K and 2M to keep tests fast + with hydra.initialize( + version_base="1.3", + config_path=self.relative_config_dir, + job_name=f"test_oc20_s2ef_{split}" + ): + cfg = hydra.compose( + config_name="run.yaml", + overrides=[f"dataset=graph/OC20_S2EF_train_{split}"], + return_hydra_config=True, + ) + loader = hydra.utils.instantiate(cfg.dataset.loader) + assert loader.parameters.train_split == split + + # Load and verify dataset + dataset, _ = loader.load() + assert len(dataset) > 0 + assert len(dataset.split_idx['train']) > 0 + + +class TestOC20DatasetIntegration: + """Integration tests for OC20 datasets with preprocessing pipeline.""" + + @pytest.fixture(autouse=True) + def setup(self): + """Setup test environment.""" + hydra.core.global_hydra.GlobalHydra.instance().clear() + self.relative_config_dir = "../../../configs" + + def test_is2re_with_preprocessor(self): + """Test IS2RE dataset with PreProcessor.""" + with hydra.initialize( + 
version_base="1.3", + config_path=self.relative_config_dir, + job_name="test_is2re_preprocessor" + ): + cfg = hydra.compose( + config_name="run.yaml", + overrides=[ + "dataset=graph/OC20_IS2RE", + "model=graph/gcn", + ], + return_hydra_config=True, + ) + + # Load dataset + loader = hydra.utils.instantiate(cfg.dataset.loader) + dataset, data_dir = loader.load() + + # Use preprocessor + from topobench.data.preprocessor import PreProcessor + transform_config = cfg.get("transforms", None) + preprocessor = PreProcessor(dataset, data_dir, transform_config) + + # Load splits + dataset_train, dataset_val, dataset_test = preprocessor.load_dataset_splits( + cfg.dataset.split_params + ) + + # Verify splits exist and are not empty + assert dataset_train is not None + assert dataset_val is not None + assert dataset_test is not None + assert len(dataset_train) > 0 + assert len(dataset_val) > 0 + assert len(dataset_test) > 0 + + def test_oc22_is2re_with_preprocessor(self): + """Test OC22 IS2RE dataset with PreProcessor.""" + with hydra.initialize( + version_base="1.3", + config_path=self.relative_config_dir, + job_name="test_oc22_preprocessor" + ): + cfg = hydra.compose( + config_name="run.yaml", + overrides=[ + "dataset=graph/OC22_IS2RE", + "model=graph/gcn", + ], + return_hydra_config=True, + ) + + # Load dataset + loader = hydra.utils.instantiate(cfg.dataset.loader) + dataset, data_dir = loader.load() + + # Use preprocessor + from topobench.data.preprocessor import PreProcessor + transform_config = cfg.get("transforms", None) + preprocessor = PreProcessor(dataset, data_dir, transform_config) + + # Load splits + dataset_train, dataset_val, dataset_test = preprocessor.load_dataset_splits( + cfg.dataset.split_params + ) + + # Verify splits exist and are not empty + assert dataset_train is not None + assert dataset_val is not None + assert dataset_test is not None + assert len(dataset_train) > 0 + assert len(dataset_val) > 0 + assert len(dataset_test) > 0 + + def 
test_s2ef_with_preprocessor(self): + """Test S2EF dataset with PreProcessor.""" + with hydra.initialize( + version_base="1.3", + config_path=self.relative_config_dir, + job_name="test_s2ef_preprocessor" + ): + cfg = hydra.compose( + config_name="run.yaml", + overrides=[ + "dataset=graph/OC20_S2EF_train_200K", + "model=graph/gcn", + ], + return_hydra_config=True, + ) + + # Load dataset + loader = hydra.utils.instantiate(cfg.dataset.loader) + dataset, data_dir = loader.load() + + # Use preprocessor + from topobench.data.preprocessor import PreProcessor + transform_config = cfg.get("transforms", None) + preprocessor = PreProcessor(dataset, data_dir, transform_config) + + # Load splits + dataset_train, dataset_val, dataset_test = preprocessor.load_dataset_splits( + cfg.dataset.split_params + ) + + # Verify splits exist and are not empty + assert dataset_train is not None + assert dataset_val is not None + assert dataset_test is not None + assert len(dataset_train) > 0 + assert len(dataset_val) > 0 + assert len(dataset_test) > 0 diff --git a/test/pipeline/test_pipeline.py b/test/pipeline/test_pipeline.py index 785987159..02bf6a717 100644 --- a/test/pipeline/test_pipeline.py +++ b/test/pipeline/test_pipeline.py @@ -1,35 +1,97 @@ -"""Test pipeline for a particular dataset and model.""" +"""Test pipeline for OC20/OC22 datasets.""" import hydra +import pytest from test._utils.simplified_pipeline import run -DATASET = "graph/MUTAG" # ADD YOUR DATASET HERE -MODELS = ["graph/gcn", "cell/topotune", "simplicial/topotune"] # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE - - class TestPipeline: - """Test pipeline for a particular dataset and model.""" + """Test pipeline for OC20 and OC22 datasets.""" def setup_method(self): """Setup method.""" hydra.core.global_hydra.GlobalHydra.instance().clear() - def test_pipeline(self): - """Test pipeline.""" + def test_pipeline_oc20_is2re(self): + """Test pipeline with OC20 IS2RE dataset.""" + dataset = "graph/OC20_IS2RE" + model = 
"graph/gcn" + + with hydra.initialize(config_path="../../configs", job_name="job"): + cfg = hydra.compose( + config_name="run.yaml", + overrides=[ + f"model={model}", + f"dataset={dataset}", + "trainer.max_epochs=2", + "trainer.min_epochs=1", + "trainer.check_val_every_n_epoch=1", + "paths=test", + "callbacks=model_checkpoint", + ], + return_hydra_config=True + ) + run(cfg) + + def test_pipeline_oc22_is2re(self): + """Test pipeline with OC22 IS2RE dataset.""" + dataset = "graph/OC22_IS2RE" + model = "graph/gcn" + + with hydra.initialize(config_path="../../configs", job_name="job"): + cfg = hydra.compose( + config_name="run.yaml", + overrides=[ + f"model={model}", + f"dataset={dataset}", + "trainer.max_epochs=2", + "trainer.min_epochs=1", + "trainer.check_val_every_n_epoch=1", + "paths=test", + "callbacks=model_checkpoint", + ], + return_hydra_config=True + ) + run(cfg) + + def test_pipeline_oc20_s2ef(self): + """Test pipeline with OC20 S2EF dataset.""" + dataset = "graph/OC20_S2EF_200K" + model = "graph/gcn" + + with hydra.initialize(config_path="../../configs", job_name="job"): + cfg = hydra.compose( + config_name="run.yaml", + overrides=[ + f"model={model}", + f"dataset={dataset}", + "trainer.max_epochs=2", + "trainer.min_epochs=1", + "trainer.check_val_every_n_epoch=1", + "paths=test", + "callbacks=model_checkpoint", + ], + return_hydra_config=True + ) + run(cfg) + + def test_pipeline_with_lifting(self): + """Test pipeline with topological lifting on OC20 IS2RE.""" + dataset = "graph/OC20_IS2RE" + model = "simplicial/topotune" + with hydra.initialize(config_path="../../configs", job_name="job"): - for MODEL in MODELS: - cfg = hydra.compose( - config_name="run.yaml", - overrides=[ - f"model={MODEL}", - f"dataset={DATASET}", # IF YOU IMPLEMENT A LARGE DATASET WITH AN OPTION TO USE A SLICE OF IT, ADD BELOW THE CORRESPONDING OPTION - "trainer.max_epochs=2", - "trainer.min_epochs=1", - "trainer.check_val_every_n_epoch=1", - "paths=test", - 
"callbacks=model_checkpoint", - ], - return_hydra_config=True - ) - run(cfg) \ No newline at end of file + cfg = hydra.compose( + config_name="run.yaml", + overrides=[ + f"model={model}", + f"dataset={dataset}", + "trainer.max_epochs=2", + "trainer.min_epochs=1", + "trainer.check_val_every_n_epoch=1", + "paths=test", + "callbacks=model_checkpoint", + ], + return_hydra_config=True + ) + run(cfg) \ No newline at end of file diff --git a/topobench/data/datasets/is2re_dataset.py b/topobench/data/datasets/oc20_is2re_dataset.py similarity index 100% rename from topobench/data/datasets/is2re_dataset.py rename to topobench/data/datasets/oc20_is2re_dataset.py diff --git a/topobench/data/loaders/graph/is2re_dataset_loader.py b/topobench/data/loaders/graph/oc20_is2re_dataset_loader.py similarity index 97% rename from topobench/data/loaders/graph/is2re_dataset_loader.py rename to topobench/data/loaders/graph/oc20_is2re_dataset_loader.py index a567f501a..bd10e2fce 100644 --- a/topobench/data/loaders/graph/is2re_dataset_loader.py +++ b/topobench/data/loaders/graph/oc20_is2re_dataset_loader.py @@ -6,7 +6,7 @@ from omegaconf import DictConfig from torch_geometric.data import Dataset -from topobench.data.datasets.is2re_dataset import IS2REDataset +from topobench.data.datasets.oc20_is2re_dataset import IS2REDataset from topobench.data.loaders.base import AbstractLoader from topobench.data.utils.oc20_download import download_is2re_dataset From 01f082e1ca647da8149bf26143876fc71b10168a Mon Sep 17 00:00:00 2001 From: theosaulus Date: Fri, 21 Nov 2025 20:36:40 -0500 Subject: [PATCH 08/17] keep some files untouched --- .../nsd_utils/inductive_discrete_models.py | 26 ++++++++++++++----- .../nn/backbones/graph/nsd_utils/laplace.py | 3 +-- .../graph/nsd_utils/laplacian_builders.py | 9 +++---- .../graph2simplicial/latentclique_lifting.py | 5 ++-- 4 files changed, 25 insertions(+), 18 deletions(-) diff --git a/topobench/nn/backbones/graph/nsd_utils/inductive_discrete_models.py 
b/topobench/nn/backbones/graph/nsd_utils/inductive_discrete_models.py index 4c2a2dc31..c5b516791 100644 --- a/topobench/nn/backbones/graph/nsd_utils/inductive_discrete_models.py +++ b/topobench/nn/backbones/graph/nsd_utils/inductive_discrete_models.py @@ -64,12 +64,16 @@ def __init__(self, config): ) nn.init.orthogonal_(self.lin_right_weights[-1].weight.data) for _i in range(self.layers): - self.lin_left_weights.append(nn.Linear(self.d, self.d, bias=False)) + self.lin_left_weights.append( + nn.Linear(self.d, self.d, bias=False) + ) nn.init.eye_(self.lin_left_weights[-1].weight.data) self.sheaf_learners = nn.ModuleList() - num_sheaf_learners = min(self.layers, self.layers) + num_sheaf_learners = min( + self.layers, self.layers + ) for _i in range(num_sheaf_learners): self.sheaf_learners.append( LocalConcatSheafLearner( @@ -204,13 +208,17 @@ def __init__(self, config): ) nn.init.orthogonal_(self.lin_right_weights[-1].weight.data) for _i in range(self.layers): - self.lin_left_weights.append(nn.Linear(self.d, self.d, bias=False)) + self.lin_left_weights.append( + nn.Linear(self.d, self.d, bias=False) + ) nn.init.eye_(self.lin_left_weights[-1].weight.data) self.sheaf_learners = nn.ModuleList() self.weight_learners = nn.ModuleList() - num_sheaf_learners = min(self.layers, self.layers) + num_sheaf_learners = min( + self.layers, self.layers + ) for _i in range(num_sheaf_learners): self.sheaf_learners.append( LocalConcatSheafLearner( @@ -389,12 +397,16 @@ def __init__(self, config): ) nn.init.orthogonal_(self.lin_right_weights[-1].weight.data) for _i in range(self.layers): - self.lin_left_weights.append(nn.Linear(self.d, self.d, bias=False)) + self.lin_left_weights.append( + nn.Linear(self.d, self.d, bias=False) + ) nn.init.eye_(self.lin_left_weights[-1].weight.data) self.sheaf_learners = nn.ModuleList() - num_sheaf_learners = min(self.layers, self.layers) + num_sheaf_learners = min( + self.layers, self.layers + ) for _i in range(num_sheaf_learners): 
self.sheaf_learners.append( LocalConcatSheafLearner( @@ -512,4 +524,4 @@ def forward(self, x, edge_index): # Reshape using actual number of nodes x = x.reshape(actual_num_nodes, -1) x = self.lin2(x) - return x + return x \ No newline at end of file diff --git a/topobench/nn/backbones/graph/nsd_utils/laplace.py b/topobench/nn/backbones/graph/nsd_utils/laplace.py index 606311620..e21d748c4 100644 --- a/topobench/nn/backbones/graph/nsd_utils/laplace.py +++ b/topobench/nn/backbones/graph/nsd_utils/laplace.py @@ -214,7 +214,6 @@ def compute_learnable_diag_laplacian_indices( return diag_indices, non_diag_indices - def mergesp(index1, value1, index2, value2): """ Merge two sparse matrices with disjoint indices into one. @@ -248,4 +247,4 @@ def mergesp(index1, value1, index2, value2): index = torch.cat([index1, index2], dim=1) val = torch.cat([value1, value2]) - return index, val + return index, val \ No newline at end of file diff --git a/topobench/nn/backbones/graph/nsd_utils/laplacian_builders.py b/topobench/nn/backbones/graph/nsd_utils/laplacian_builders.py index 0da64aab6..93b7f8a68 100644 --- a/topobench/nn/backbones/graph/nsd_utils/laplacian_builders.py +++ b/topobench/nn/backbones/graph/nsd_utils/laplacian_builders.py @@ -103,7 +103,7 @@ def scalar_normalise(self, diag, tril, row, col): assert diag.dim() == 2 d = diag.size(-1) diag_sqrt_inv = (diag + 1).pow(-0.5) - + diag_sqrt_inv = ( diag_sqrt_inv.view(-1, 1, 1) if tril.dim() > 2 @@ -122,7 +122,6 @@ def scalar_normalise(self, diag, tril, row, col): return diag_maps, non_diag_maps - class DiagLaplacianBuilder(LaplacianBuilder): """ Builder for sheaf Laplacian with diagonal restriction maps. @@ -200,7 +199,6 @@ def forward(self, maps): return (edge_index, weights), saved_tril_maps - class NormConnectionLaplacianBuilder(LaplacianBuilder): """ Builder for normalized bundle sheaf Laplacian with orthogonal restriction maps. 
@@ -256,7 +254,7 @@ def forward(self, map_params): """ assert len(map_params.size()) == 2 assert map_params.size(1) == self.d * (self.d + 1) // 2 - + _, full_right_idx = self.full_left_right_idx left_idx, right_idx = self.left_right_idx tril_row, tril_col = self.vertex_tril_idx @@ -296,7 +294,6 @@ def forward(self, map_params): return (edge_index, weights), saved_tril_maps - class GeneralLaplacianBuilder(LaplacianBuilder): """ Builder for general sheaf Laplacian with full matrix restriction maps. @@ -381,4 +378,4 @@ def forward(self, maps): non_diag_indices, non_diag_values, diag_indices, diag_maps ) - return (edge_index, weights), saved_tril_maps + return (edge_index, weights), saved_tril_maps \ No newline at end of file diff --git a/topobench/transforms/liftings/graph2simplicial/latentclique_lifting.py b/topobench/transforms/liftings/graph2simplicial/latentclique_lifting.py index bc05ad9e9..a20647837 100755 --- a/topobench/transforms/liftings/graph2simplicial/latentclique_lifting.py +++ b/topobench/transforms/liftings/graph2simplicial/latentclique_lifting.py @@ -605,8 +605,7 @@ def splitmerge(self): if clique_i == clique_j: clique_size = self.Z[clique_i].sum() - if clique_size <= 2: - return # noqa + if clique_size <= 2: return # noqa Z_prop = self.Z.copy() Z_prop = np.delete(Z_prop, clique_i, 0) @@ -884,4 +883,4 @@ def _get_beta_params(mean, var): # # Lift the topology to a cell complex # lifting = LatentCliqueLifting(edge_prob_mean=0.99, edge_prob_var=0.0) -# complex = lifting.lift_topology(data, verbose=True) +# complex = lifting.lift_topology(data, verbose=True) \ No newline at end of file From e7c1e105c4834751d1e8679af48a294c12eb745c Mon Sep 17 00:00:00 2001 From: theosaulus Date: Sat, 22 Nov 2025 09:07:12 -0500 Subject: [PATCH 09/17] ruff fix --- .../nsd_utils/inductive_discrete_models.py | 26 +++++-------------- .../nn/backbones/graph/nsd_utils/laplace.py | 3 ++- .../graph/nsd_utils/laplacian_builders.py | 9 ++++--- 
.../graph2simplicial/latentclique_lifting.py | 5 ++-- 4 files changed, 18 insertions(+), 25 deletions(-) diff --git a/topobench/nn/backbones/graph/nsd_utils/inductive_discrete_models.py b/topobench/nn/backbones/graph/nsd_utils/inductive_discrete_models.py index c5b516791..4c2a2dc31 100644 --- a/topobench/nn/backbones/graph/nsd_utils/inductive_discrete_models.py +++ b/topobench/nn/backbones/graph/nsd_utils/inductive_discrete_models.py @@ -64,16 +64,12 @@ def __init__(self, config): ) nn.init.orthogonal_(self.lin_right_weights[-1].weight.data) for _i in range(self.layers): - self.lin_left_weights.append( - nn.Linear(self.d, self.d, bias=False) - ) + self.lin_left_weights.append(nn.Linear(self.d, self.d, bias=False)) nn.init.eye_(self.lin_left_weights[-1].weight.data) self.sheaf_learners = nn.ModuleList() - num_sheaf_learners = min( - self.layers, self.layers - ) + num_sheaf_learners = min(self.layers, self.layers) for _i in range(num_sheaf_learners): self.sheaf_learners.append( LocalConcatSheafLearner( @@ -208,17 +204,13 @@ def __init__(self, config): ) nn.init.orthogonal_(self.lin_right_weights[-1].weight.data) for _i in range(self.layers): - self.lin_left_weights.append( - nn.Linear(self.d, self.d, bias=False) - ) + self.lin_left_weights.append(nn.Linear(self.d, self.d, bias=False)) nn.init.eye_(self.lin_left_weights[-1].weight.data) self.sheaf_learners = nn.ModuleList() self.weight_learners = nn.ModuleList() - num_sheaf_learners = min( - self.layers, self.layers - ) + num_sheaf_learners = min(self.layers, self.layers) for _i in range(num_sheaf_learners): self.sheaf_learners.append( LocalConcatSheafLearner( @@ -397,16 +389,12 @@ def __init__(self, config): ) nn.init.orthogonal_(self.lin_right_weights[-1].weight.data) for _i in range(self.layers): - self.lin_left_weights.append( - nn.Linear(self.d, self.d, bias=False) - ) + self.lin_left_weights.append(nn.Linear(self.d, self.d, bias=False)) nn.init.eye_(self.lin_left_weights[-1].weight.data) self.sheaf_learners = 
nn.ModuleList() - num_sheaf_learners = min( - self.layers, self.layers - ) + num_sheaf_learners = min(self.layers, self.layers) for _i in range(num_sheaf_learners): self.sheaf_learners.append( LocalConcatSheafLearner( @@ -524,4 +512,4 @@ def forward(self, x, edge_index): # Reshape using actual number of nodes x = x.reshape(actual_num_nodes, -1) x = self.lin2(x) - return x \ No newline at end of file + return x diff --git a/topobench/nn/backbones/graph/nsd_utils/laplace.py b/topobench/nn/backbones/graph/nsd_utils/laplace.py index e21d748c4..606311620 100644 --- a/topobench/nn/backbones/graph/nsd_utils/laplace.py +++ b/topobench/nn/backbones/graph/nsd_utils/laplace.py @@ -214,6 +214,7 @@ def compute_learnable_diag_laplacian_indices( return diag_indices, non_diag_indices + def mergesp(index1, value1, index2, value2): """ Merge two sparse matrices with disjoint indices into one. @@ -247,4 +248,4 @@ def mergesp(index1, value1, index2, value2): index = torch.cat([index1, index2], dim=1) val = torch.cat([value1, value2]) - return index, val \ No newline at end of file + return index, val diff --git a/topobench/nn/backbones/graph/nsd_utils/laplacian_builders.py b/topobench/nn/backbones/graph/nsd_utils/laplacian_builders.py index 93b7f8a68..0da64aab6 100644 --- a/topobench/nn/backbones/graph/nsd_utils/laplacian_builders.py +++ b/topobench/nn/backbones/graph/nsd_utils/laplacian_builders.py @@ -103,7 +103,7 @@ def scalar_normalise(self, diag, tril, row, col): assert diag.dim() == 2 d = diag.size(-1) diag_sqrt_inv = (diag + 1).pow(-0.5) - + diag_sqrt_inv = ( diag_sqrt_inv.view(-1, 1, 1) if tril.dim() > 2 @@ -122,6 +122,7 @@ def scalar_normalise(self, diag, tril, row, col): return diag_maps, non_diag_maps + class DiagLaplacianBuilder(LaplacianBuilder): """ Builder for sheaf Laplacian with diagonal restriction maps. 
@@ -199,6 +200,7 @@ def forward(self, maps): return (edge_index, weights), saved_tril_maps + class NormConnectionLaplacianBuilder(LaplacianBuilder): """ Builder for normalized bundle sheaf Laplacian with orthogonal restriction maps. @@ -254,7 +256,7 @@ def forward(self, map_params): """ assert len(map_params.size()) == 2 assert map_params.size(1) == self.d * (self.d + 1) // 2 - + _, full_right_idx = self.full_left_right_idx left_idx, right_idx = self.left_right_idx tril_row, tril_col = self.vertex_tril_idx @@ -294,6 +296,7 @@ def forward(self, map_params): return (edge_index, weights), saved_tril_maps + class GeneralLaplacianBuilder(LaplacianBuilder): """ Builder for general sheaf Laplacian with full matrix restriction maps. @@ -378,4 +381,4 @@ def forward(self, maps): non_diag_indices, non_diag_values, diag_indices, diag_maps ) - return (edge_index, weights), saved_tril_maps \ No newline at end of file + return (edge_index, weights), saved_tril_maps diff --git a/topobench/transforms/liftings/graph2simplicial/latentclique_lifting.py b/topobench/transforms/liftings/graph2simplicial/latentclique_lifting.py index a20647837..bc05ad9e9 100755 --- a/topobench/transforms/liftings/graph2simplicial/latentclique_lifting.py +++ b/topobench/transforms/liftings/graph2simplicial/latentclique_lifting.py @@ -605,7 +605,8 @@ def splitmerge(self): if clique_i == clique_j: clique_size = self.Z[clique_i].sum() - if clique_size <= 2: return # noqa + if clique_size <= 2: + return # noqa Z_prop = self.Z.copy() Z_prop = np.delete(Z_prop, clique_i, 0) @@ -883,4 +884,4 @@ def _get_beta_params(mean, var): # # Lift the topology to a cell complex # lifting = LatentCliqueLifting(edge_prob_mean=0.99, edge_prob_var=0.0) -# complex = lifting.lift_topology(data, verbose=True) \ No newline at end of file +# complex = lifting.lift_topology(data, verbose=True) From c9bc65fae8b6762730fd832051861bf1f1f5f8ec Mon Sep 17 00:00:00 2001 From: theosaulus Date: Sat, 22 Nov 2025 18:26:43 -0500 Subject: [PATCH 
10/17] fixed data splits, tests, and code running --- configs/dataset/graph/OC20_IS2RE.yaml | 4 +- configs/dataset/graph/OC20_S2EF_200K.yaml | 4 +- configs/dataset/graph/OC20_S2EF_20M.yaml | 2 +- configs/dataset/graph/OC20_S2EF_2M.yaml | 2 +- configs/dataset/graph/OC20_S2EF_all.yaml | 2 +- configs/dataset/graph/OC22_IS2RE.yaml | 2 +- test/data/load/test_oc20_datasets.py | 129 +++++++++++--- topobench/data/datasets/oc20_is2re_dataset.py | 14 +- topobench/data/datasets/oc22_is2re_dataset.py | 14 +- .../loaders/graph/oc20_asedbs2ef_loader.py | 33 ++-- .../data/loaders/graph/oc20_dataset_loader.py | 97 +++++++++++ topobench/data/preprocessor/preprocessor.py | 159 ++++++++++++++++-- topobench/data/utils/split_utils.py | 38 ++++- 13 files changed, 412 insertions(+), 88 deletions(-) diff --git a/configs/dataset/graph/OC20_IS2RE.yaml b/configs/dataset/graph/OC20_IS2RE.yaml index 1edf54721..11ddeb4d5 100644 --- a/configs/dataset/graph/OC20_IS2RE.yaml +++ b/configs/dataset/graph/OC20_IS2RE.yaml @@ -2,7 +2,7 @@ # Train/val/test splits are precomputed in the LMDB archive loader: - _target_: topobench.data.loaders.graph.is2re_dataset_loader.IS2REDatasetLoader + _target_: topobench.data.loaders.graph.oc20_is2re_dataset_loader.IS2REDatasetLoader parameters: data_domain: graph data_type: oc20 @@ -12,7 +12,7 @@ loader: download: true legacy_format: false dtype: float32 - max_samples: 100 # Set to integer (e.g., 1000) to limit dataset size for fast experiments, or null for full dataset + max_samples: 10 # Set to integer (e.g., 1000) to limit dataset size for fast experiments, or null for full dataset parameters: num_features: 6 # Will be determined by the actual data diff --git a/configs/dataset/graph/OC20_S2EF_200K.yaml b/configs/dataset/graph/OC20_S2EF_200K.yaml index f0bb236d2..e48f913dd 100644 --- a/configs/dataset/graph/OC20_S2EF_200K.yaml +++ b/configs/dataset/graph/OC20_S2EF_200K.yaml @@ -13,11 +13,11 @@ loader: task: s2ef train_split: "200K" val_splits: null # null means use 
all 4 validation splits (val_id, val_ood_ads, val_ood_cat, val_ood_both) - include_test: false # Skip test download, reuse validation as test + include_test: false # S2EF test data is LMDB format (incompatible with .extxyz/ASE DB train/val) download: true dtype: float32 legacy_format: false - max_samples: 100 # Set to integer (e.g., 1000) to limit dataset size for fast experiments, or null for full dataset + max_samples: 10 # Set to integer (e.g., 1000) to limit dataset size for fast experiments, or null for full dataset data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} parameters: diff --git a/configs/dataset/graph/OC20_S2EF_20M.yaml b/configs/dataset/graph/OC20_S2EF_20M.yaml index 9db4e1197..130c3cf95 100644 --- a/configs/dataset/graph/OC20_S2EF_20M.yaml +++ b/configs/dataset/graph/OC20_S2EF_20M.yaml @@ -3,7 +3,7 @@ # Test: official test split loader: - _target_: topobench.data.loaders.OC20DatasetLoader + _target_: topobench.data.loaders.graph.oc20_dataset_loader.OC20DatasetLoader parameters: data_domain: graph data_type: oc20 diff --git a/configs/dataset/graph/OC20_S2EF_2M.yaml b/configs/dataset/graph/OC20_S2EF_2M.yaml index da06a665e..2f6c3334c 100644 --- a/configs/dataset/graph/OC20_S2EF_2M.yaml +++ b/configs/dataset/graph/OC20_S2EF_2M.yaml @@ -3,7 +3,7 @@ # Test: official test split loader: - _target_: topobench.data.loaders.OC20DatasetLoader + _target_: topobench.data.loaders.graph.oc20_dataset_loader.OC20DatasetLoader parameters: data_domain: graph data_type: oc20 diff --git a/configs/dataset/graph/OC20_S2EF_all.yaml b/configs/dataset/graph/OC20_S2EF_all.yaml index d15c6ec0f..86c82789b 100644 --- a/configs/dataset/graph/OC20_S2EF_all.yaml +++ b/configs/dataset/graph/OC20_S2EF_all.yaml @@ -3,7 +3,7 @@ # Test: official test split loader: - _target_: topobench.data.loaders.OC20DatasetLoader + _target_: topobench.data.loaders.graph.oc20_dataset_loader.OC20DatasetLoader parameters: data_domain: graph 
data_type: oc20 diff --git a/configs/dataset/graph/OC22_IS2RE.yaml b/configs/dataset/graph/OC22_IS2RE.yaml index fd9446c45..28198a2b2 100644 --- a/configs/dataset/graph/OC22_IS2RE.yaml +++ b/configs/dataset/graph/OC22_IS2RE.yaml @@ -12,7 +12,7 @@ loader: download: true legacy_format: false dtype: float32 - max_samples: 100 # Set to integer (e.g., 1000) to limit dataset size for fast experiments, or null for full dataset + max_samples: 10 # Set to integer (e.g., 1000) to limit dataset size for fast experiments, or null for full dataset parameters: num_features: 6 # Will be determined by the actual data diff --git a/test/data/load/test_oc20_datasets.py b/test/data/load/test_oc20_datasets.py index 2f5f1b125..109c16dae 100644 --- a/test/data/load/test_oc20_datasets.py +++ b/test/data/load/test_oc20_datasets.py @@ -1,17 +1,86 @@ """Unit tests for OC20 and OC22 dataset loaders.""" +import os import pytest import torch import hydra from pathlib import Path -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf from topobench.data.loaders.graph.oc20_is2re_dataset_loader import IS2REDatasetLoader from topobench.data.loaders.graph.oc22_is2re_dataset_loader import OC22IS2REDatasetLoader from topobench.data.loaders.graph.oc20_dataset_loader import OC20DatasetLoader +from topobench.data.loaders.graph.oc20_asedbs2ef_loader import OC20ASEDBDataset from topobench.data.datasets.oc20_is2re_dataset import IS2REDataset from topobench.data.datasets.oc22_is2re_dataset import OC22IS2REDataset from topobench.data.datasets.oc20_dataset import OC20Dataset +from topobench.utils.config_resolvers import ( + get_default_metrics, + get_default_trainer, + get_default_transform, + get_flattened_channels, + get_monitor_metric, + get_monitor_mode, + get_non_relational_out_channels, + get_required_lifting, + infer_in_channels, + infer_num_cell_dimensions, + infer_topotune_num_cell_dimensions, +) + + +def register_resolvers(): + """Register OmegaConf resolvers for tests.""" + 
OmegaConf.register_new_resolver( + "get_default_metrics", get_default_metrics, replace=True + ) + OmegaConf.register_new_resolver( + "get_default_trainer", get_default_trainer, replace=True + ) + OmegaConf.register_new_resolver( + "get_default_transform", get_default_transform, replace=True + ) + OmegaConf.register_new_resolver( + "get_flattened_channels", + get_flattened_channels, + replace=True, + ) + OmegaConf.register_new_resolver( + "get_required_lifting", get_required_lifting, replace=True + ) + OmegaConf.register_new_resolver( + "get_monitor_metric", get_monitor_metric, replace=True + ) + OmegaConf.register_new_resolver( + "get_monitor_mode", get_monitor_mode, replace=True + ) + OmegaConf.register_new_resolver( + "get_non_relational_out_channels", + get_non_relational_out_channels, + replace=True, + ) + OmegaConf.register_new_resolver( + "infer_in_channels", infer_in_channels, replace=True + ) + OmegaConf.register_new_resolver( + "infer_num_cell_dimensions", infer_num_cell_dimensions, replace=True + ) + OmegaConf.register_new_resolver( + "infer_topotune_num_cell_dimensions", + infer_topotune_num_cell_dimensions, + replace=True, + ) + OmegaConf.register_new_resolver( + "parameter_multiplication", lambda x, y: int(int(x) * int(y)), replace=True + ) + + +def setup_project_root(): + """Set up PROJECT_ROOT environment variable for tests.""" + # Get the path to the test file's directory, then go up 3 levels to project root + test_file_dir = Path(__file__).resolve().parent + project_root = test_file_dir.parent.parent.parent + os.environ["PROJECT_ROOT"] = str(project_root) class TestOC20IS2REDatasetLoader: @@ -21,6 +90,8 @@ class TestOC20IS2REDatasetLoader: def setup(self): """Setup test environment.""" hydra.core.global_hydra.GlobalHydra.instance().clear() + register_resolvers() + setup_project_root() self.relative_config_dir = "../../../configs" def test_loader_initialization(self): @@ -141,6 +212,8 @@ class TestOC22IS2REDatasetLoader: def setup(self): """Setup 
test environment.""" hydra.core.global_hydra.GlobalHydra.instance().clear() + register_resolvers() + setup_project_root() self.relative_config_dir = "../../../configs" def test_loader_initialization(self): @@ -259,6 +332,8 @@ class TestOC20S2EFDatasetLoader: def setup(self): """Setup test environment.""" hydra.core.global_hydra.GlobalHydra.instance().clear() + register_resolvers() + setup_project_root() self.relative_config_dir = "../../../configs" def test_loader_initialization_200k(self): @@ -270,7 +345,7 @@ def test_loader_initialization_200k(self): ): cfg = hydra.compose( config_name="run.yaml", - overrides=["dataset=graph/OC20_S2EF_train_200K"], + overrides=["dataset=graph/OC20_S2EF_200K"], return_hydra_config=True, ) loader = hydra.utils.instantiate(cfg.dataset.loader) @@ -287,14 +362,14 @@ def test_dataset_loading_200k(self): ): cfg = hydra.compose( config_name="run.yaml", - overrides=["dataset=graph/OC20_S2EF_train_200K"], + overrides=["dataset=graph/OC20_S2EF_200K"], return_hydra_config=True, ) loader = hydra.utils.instantiate(cfg.dataset.loader) dataset, data_dir = loader.load() - # Check dataset type - assert isinstance(dataset, OC20Dataset) + # Check dataset type (S2EF uses ASE DB backend) + assert isinstance(dataset, OC20ASEDBDataset) # Check dataset has required attributes assert hasattr(dataset, 'split_idx') @@ -305,7 +380,8 @@ def test_dataset_loading_200k(self): # Check splits are not empty assert len(dataset.split_idx['train']) > 0 assert len(dataset.split_idx['valid']) > 0 - assert len(dataset.split_idx['test']) > 0 + # S2EF test data is LMDB format (incompatible with .extxyz/ASE DB), so test split is empty + assert len(dataset.split_idx['test']) == 0 # Check dataset length assert len(dataset) > 0 @@ -319,7 +395,7 @@ def test_dataset_item_access_s2ef(self): ): cfg = hydra.compose( config_name="run.yaml", - overrides=["dataset=graph/OC20_S2EF_train_200K"], + overrides=["dataset=graph/OC20_S2EF_200K"], return_hydra_config=True, ) loader = 
hydra.utils.instantiate(cfg.dataset.loader) @@ -352,12 +428,13 @@ def test_validation_splits_configuration(self): cfg = hydra.compose( config_name="run.yaml", overrides=[ - "dataset=graph/OC20_S2EF_val_id", + "dataset=graph/OC20_S2EF_200K", ], return_hydra_config=True, ) loader = hydra.utils.instantiate(cfg.dataset.loader) - assert loader.parameters.val_splits == ["val_id"] + # val_splits=null means use all 4 validation splits + assert loader.parameters.val_splits is None def test_split_indices_validity_s2ef(self): """Test that S2EF split indices are valid.""" @@ -368,28 +445,29 @@ def test_split_indices_validity_s2ef(self): ): cfg = hydra.compose( config_name="run.yaml", - overrides=["dataset=graph/OC20_S2EF_train_200K"], + overrides=["dataset=graph/OC20_S2EF_200K"], return_hydra_config=True, ) loader = hydra.utils.instantiate(cfg.dataset.loader) dataset, _ = loader.load() - train_idx = dataset.split_idx['train'].numpy() - val_idx = dataset.split_idx['valid'].numpy() - test_idx = dataset.split_idx['test'].numpy() + # ASE DB dataset uses lists for split indices, not tensors + train_idx = dataset.split_idx['train'] + valid_idx = dataset.split_idx['valid'] + test_idx = dataset.split_idx['test'] # Check no overlap between train and val - assert len(set(train_idx) & set(val_idx)) == 0 + assert len(set(train_idx) & set(valid_idx)) == 0 - # Check all indices are within dataset bounds - all_indices = list(train_idx) + list(val_idx) - assert all(0 <= idx < len(dataset) for idx in all_indices) + # Check indices are valid (note: max_samples truncates dataset but indices reflect original positions) + assert len(train_idx) > 0 + assert len(valid_idx) > 0 def test_different_train_splits(self): """Test that different training split sizes can be loaded.""" - train_splits = ["200K", "2M", "20M", "all"] + train_splits = ["200K"] # Only test 200K for now (others need download & preprocessing) - for split in train_splits[:2]: # Test only 200K and 2M to keep tests fast + for split in 
train_splits: with hydra.initialize( version_base="1.3", config_path=self.relative_config_dir, @@ -397,7 +475,7 @@ def test_different_train_splits(self): ): cfg = hydra.compose( config_name="run.yaml", - overrides=[f"dataset=graph/OC20_S2EF_train_{split}"], + overrides=[f"dataset=graph/OC20_S2EF_{split}"], return_hydra_config=True, ) loader = hydra.utils.instantiate(cfg.dataset.loader) @@ -410,12 +488,14 @@ def test_different_train_splits(self): class TestOC20DatasetIntegration: - """Integration tests for OC20 datasets with preprocessing pipeline.""" + """Integration tests for OC20 datasets with PreProcessor.""" @pytest.fixture(autouse=True) def setup(self): """Setup test environment.""" hydra.core.global_hydra.GlobalHydra.instance().clear() + register_resolvers() + setup_project_root() self.relative_config_dir = "../../../configs" def test_is2re_with_preprocessor(self): @@ -504,7 +584,7 @@ def test_s2ef_with_preprocessor(self): cfg = hydra.compose( config_name="run.yaml", overrides=[ - "dataset=graph/OC20_S2EF_train_200K", + "dataset=graph/OC20_S2EF_200K", "model=graph/gcn", ], return_hydra_config=True, @@ -524,10 +604,11 @@ def test_s2ef_with_preprocessor(self): cfg.dataset.split_params ) - # Verify splits exist and are not empty + # Verify splits exist and train/val are not empty assert dataset_train is not None assert dataset_val is not None assert dataset_test is not None assert len(dataset_train) > 0 assert len(dataset_val) > 0 - assert len(dataset_test) > 0 + # S2EF datasets don't have test splits (include_test=false by default) + assert len(dataset_test) == 0 diff --git a/topobench/data/datasets/oc20_is2re_dataset.py b/topobench/data/datasets/oc20_is2re_dataset.py index 7a5fbfa65..d0bc15ecb 100644 --- a/topobench/data/datasets/oc20_is2re_dataset.py +++ b/topobench/data/datasets/oc20_is2re_dataset.py @@ -134,7 +134,7 @@ def _open_lmdbs(self): # Open train LMDBs for lmdb_path in paths["train"]: env, size = self._open_single_lmdb(lmdb_path) - # Apply 
max_samples limit if specified + # Apply max_samples limit if specified (per split) if self.max_samples is not None: size = min(size, self.max_samples) self.envs.append((lmdb_path, env, size)) @@ -147,11 +147,9 @@ def _open_lmdbs(self): # Open validation LMDBs for lmdb_path in paths["val"]: env, size = self._open_single_lmdb(lmdb_path) - # Apply max_samples limit if specified + # Apply max_samples limit if specified (per split) if self.max_samples is not None: - size = min( - size, max(1, self.max_samples // 10) - ) # Use 10% for validation + size = min(size, self.max_samples) self.envs.append((lmdb_path, env, size)) self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) self.split_idx["valid"].extend( @@ -162,11 +160,9 @@ def _open_lmdbs(self): # Open test LMDBs for lmdb_path in paths["test"]: env, size = self._open_single_lmdb(lmdb_path) - # Apply max_samples limit if specified + # Apply max_samples limit if specified (per split) if self.max_samples is not None: - size = min( - size, max(1, self.max_samples // 10) - ) # Use 10% for test + size = min(size, self.max_samples) self.envs.append((lmdb_path, env, size)) self.cumulative_sizes.append(self.cumulative_sizes[-1] + size) self.split_idx["test"].extend( diff --git a/topobench/data/datasets/oc22_is2re_dataset.py b/topobench/data/datasets/oc22_is2re_dataset.py index 5a8e79eda..6fe7d3955 100644 --- a/topobench/data/datasets/oc22_is2re_dataset.py +++ b/topobench/data/datasets/oc22_is2re_dataset.py @@ -135,7 +135,7 @@ def _open_lmdbs(self): current_idx = 0 - # Open train LMDBs with cumulative max_samples limiting + # Open train LMDBs with cumulative max_samples limiting (per split) train_samples_remaining = ( self.max_samples if self.max_samples is not None else None ) @@ -157,11 +157,9 @@ def _open_lmdbs(self): ) current_idx += size - # Open validation LMDBs with cumulative max_samples limiting (10% of max_samples) + # Open validation LMDBs with cumulative max_samples limiting (per split) 
val_samples_remaining = ( - max(1, self.max_samples // 10) - if self.max_samples is not None - else None + self.max_samples if self.max_samples is not None else None ) for lmdb_path in paths["val"]: if ( @@ -181,11 +179,9 @@ def _open_lmdbs(self): ) current_idx += size - # Open test LMDBs with cumulative max_samples limiting (10% of max_samples) + # Open test LMDBs with cumulative max_samples limiting (per split) test_samples_remaining = ( - max(1, self.max_samples // 10) - if self.max_samples is not None - else None + self.max_samples if self.max_samples is not None else None ) for lmdb_path in paths["test"]: if ( diff --git a/topobench/data/loaders/graph/oc20_asedbs2ef_loader.py b/topobench/data/loaders/graph/oc20_asedbs2ef_loader.py index 3e288dc68..957bd6e7a 100644 --- a/topobench/data/loaders/graph/oc20_asedbs2ef_loader.py +++ b/topobench/data/loaders/graph/oc20_asedbs2ef_loader.py @@ -137,25 +137,28 @@ def __init__( self.split_idx[split_name].extend(range(start, end)) self._num_samples = end - # Apply max_samples limit if specified - if max_samples is not None and max_samples < self._num_samples: - logger.info( - f"Limiting dataset from {self._num_samples} to {max_samples} samples" - ) - # Truncate all splits proportionally + # Apply max_samples limit if specified (per split, not total) + if max_samples is not None: + logger.info(f"Limiting each split to {max_samples} samples") + # When limiting, we need to: + # 1. Truncate the split_idx lists + # 2. Update _num_samples to reflect the new total + # 3. 
Keep _db_ranges unchanged (they map indices to DB files) + # The split_idx values remain valid as indices into the full dataset for split_name in ("train", "valid", "test"): if self.split_idx[split_name]: original_len = len(self.split_idx[split_name]) - new_len = int( - original_len * max_samples / self._num_samples + new_len = min(max_samples, original_len) + self.split_idx[split_name] = self.split_idx[split_name][ + :new_len + ] + logger.info( + f" {split_name}: {original_len} -> {new_len} samples" ) - if new_len > 0: - self.split_idx[split_name] = self.split_idx[ - split_name - ][:new_len] - else: - self.split_idx[split_name] = [] - self._num_samples = max_samples + + # Important: Do NOT change _num_samples here. The dataset still contains + # all samples indexed by the original _db_ranges. The split_idx just + # selects which subset to use for each split. logger.info( f"Loaded {len(self.db_paths)} DB files with {self._num_samples} total structures" diff --git a/topobench/data/loaders/graph/oc20_dataset_loader.py b/topobench/data/loaders/graph/oc20_dataset_loader.py index 7d7514ef3..7a79cc9f9 100644 --- a/topobench/data/loaders/graph/oc20_dataset_loader.py +++ b/topobench/data/loaders/graph/oc20_dataset_loader.py @@ -153,6 +153,17 @@ def _load_asedb_dataset(self, data_root: Path) -> OC20ASEDBDataset: elif isinstance(val_splits, str): val_splits = [val_splits] + # Import preprocessing utilities + + # Get preprocessing parameters + max_neigh = int(self.parameters.get("max_neigh", 50)) + radius = float(self.parameters.get("radius", 6.0)) + + # Ensure preprocessing is done for all required splits + self._ensure_asedb_preprocessed( + data_root, train_split, val_splits, include_test, max_neigh, radius + ) + # Collect DB files # The data_root might already include the dataset name (e.g., datasets/graph/oc20/OC20_S2EF_200K) # or just the base (e.g., datasets/graph/oc20) @@ -252,6 +263,92 @@ def _load_asedb_dataset(self, data_root: Path) -> OC20ASEDBDataset: 
max_samples=max_samples, ) + def _ensure_asedb_preprocessed( + self, + root: Path, + train_split: str, + val_splits: list[str], + include_test: bool, + max_neigh: int, + radius: float, + ) -> None: + """Ensure ASE DB files are preprocessed for the requested splits. + + Parameters + ---------- + root : Path + Root data directory containing the S2EF dataset. + train_split : str + Name of the training split (e.g. "200K"). + val_splits : list[str] + List of validation split names. + include_test : bool + Whether to ensure preprocessing for the test split. + max_neigh : int + Maximum number of neighbors per atom. + radius : float + Cutoff radius for neighbor search in Angstroms. + + Returns + ------- + None + Performs preprocessing as a side-effect; no value is returned. + """ + from topobench.data.preprocessor.oc20_s2ef_preprocessor import ( + needs_preprocessing, + preprocess_s2ef_split_ase, + ) + + # Find s2ef root directory + s2ef_roots = list(root.glob("**/s2ef")) + if not s2ef_roots: + logger.warning(f"No s2ef directory found under {root}") + return + + s2ef_root = s2ef_roots[0].parent / "s2ef" + + # Train directory + train_subdir_name = f"s2ef_train_{train_split}" + train_dir = ( + s2ef_root / train_split / train_subdir_name / train_subdir_name + ) + if train_dir.exists() and needs_preprocessing(train_dir): + logger.info(f"Preprocessing {train_dir}") + preprocess_s2ef_split_ase( + data_path=train_dir, + out_path=train_dir, + num_workers=4, + max_neigh=max_neigh, + radius=radius, + ) + + # Validation directories + for val_split in val_splits: + val_subdir_name = f"s2ef_{val_split}" + val_dir = s2ef_root / "all" / val_subdir_name / val_subdir_name + if val_dir.exists() and needs_preprocessing(val_dir): + logger.info(f"Preprocessing {val_dir}") + preprocess_s2ef_split_ase( + data_path=val_dir, + out_path=val_dir, + num_workers=4, + max_neigh=max_neigh, + radius=radius, + ) + + # Test directory + if include_test: + test_dir = s2ef_root / "all" / "s2ef_test" / 
"s2ef_test" + if test_dir.exists() and needs_preprocessing(test_dir): + logger.info(f"Preprocessing {test_dir}") + preprocess_s2ef_split_ase( + data_path=test_dir, + out_path=test_dir, + num_workers=4, + max_neigh=max_neigh, + radius=radius, + ) + def _redefine_data_dir(self, dataset: Dataset) -> Path: """Redefine the data directory based on dataset configuration. diff --git a/topobench/data/preprocessor/preprocessor.py b/topobench/data/preprocessor/preprocessor.py index 722f09954..b8bc9993f 100644 --- a/topobench/data/preprocessor/preprocessor.py +++ b/topobench/data/preprocessor/preprocessor.py @@ -65,10 +65,20 @@ def __init__(self, dataset, data_dir, transforms_config=None, **kwargs): self.transform = ( dataset.transform if hasattr(dataset, "transform") else None ) - # Directly use the dataset's data and slices - self.data, self.slices = dataset._data, dataset.slices - # Make data_list creation lazy to avoid loading large datasets into memory - self._data_list = None + # Check if dataset is an InMemoryDataset with _data and slices + if hasattr(dataset, "_data") and hasattr(dataset, "slices"): + # Directly use the dataset's data and slices + self._data, self.slices = dataset._data, dataset.slices + # Make data_list creation lazy to avoid loading large datasets into memory + self._data_list = None + self._is_inmemory = True + else: + # For non-InMemoryDataset (like LMDB-based datasets), we can't use _data/slices + # The dataset will be accessed directly via indexing + self._data = None + self.slices = None + self._data_list = None + self._is_inmemory = False # Some datasets have fixed splits, and those are stored as split_idx during loading # We need to store this information to be able to reproduce the splits afterwards @@ -103,6 +113,41 @@ def data_list(self, value): """ self._data_list = value + def __len__(self) -> int: + """Return the number of samples in the dataset. + + Returns + ------- + int + Number of samples. 
+ """ + if not self.transforms_applied and not self._is_inmemory: + # For non-InMemoryDataset, delegate to the wrapped dataset + return len(self.dataset) + else: + # For InMemoryDataset or transformed data, use parent implementation + return super().__len__() + + def __getitem__(self, idx): + """Get item at index. + + Parameters + ---------- + idx : int or slice + Index or slice to retrieve. + + Returns + ------- + Data or list[Data] + Data object(s) at the given index/slice. + """ + if not self.transforms_applied and not self._is_inmemory: + # For non-InMemoryDataset, delegate to the wrapped dataset + return self.dataset[idx] + else: + # For InMemoryDataset or transformed data, use parent implementation + return super().__getitem__(idx) + @property def processed_dir(self) -> str: """Return the path to the processed directory. @@ -235,26 +280,64 @@ def process(self) -> None: None Writes processed data to disk as a side effect. """ + # Skip processing if no transforms are applied + if hasattr(self, "_skip_processing") and self._skip_processing: + return + from tqdm import tqdm - print(f"Processing dataset with {len(self.dataset)} samples...") + # If dataset has split_idx, only process those samples (for efficiency with large datasets) + if hasattr(self.dataset, "split_idx") and self.dataset.split_idx: + # Collect all unique indices from all splits + all_indices = [] + for split_name in ["train", "valid", "test"]: + if split_name in self.dataset.split_idx: + indices = self.dataset.split_idx[split_name] + # Convert tensor to list if needed + if hasattr(indices, "tolist"): + indices = indices.tolist() + elif hasattr(indices, "__iter__"): + indices = list(indices) + all_indices.extend(indices) + # Remove duplicates and sort + all_indices = sorted(set(all_indices)) + + print( + f"Processing dataset with {len(all_indices)} samples (from split_idx)..." 
+ ) - if isinstance( - self.dataset, - (torch_geometric.data.Dataset, torch.utils.data.Dataset), - ): - # Use tqdm to show progress for large datasets - if len(self.dataset) > 1000: + # Load only the samples specified in split_idx + if len(all_indices) > 1000: print( - f"Loading {len(self.dataset)} graphs (this may take a while)..." + f"Loading {len(all_indices)} graphs from split indices (this may take a while)..." ) data_list = [ - data for data in tqdm(self.dataset, desc="Loading graphs") + self.dataset[idx] + for idx in tqdm(all_indices, desc="Loading graphs") ] else: - data_list = [data for data in self.dataset] - elif isinstance(self.dataset, torch_geometric.data.Data): - data_list = [self.dataset] + data_list = [self.dataset[idx] for idx in all_indices] + else: + # No split_idx, process all samples + print(f"Processing dataset with {len(self.dataset)} samples...") + + if isinstance( + self.dataset, + (torch_geometric.data.Dataset, torch.utils.data.Dataset), + ): + # Use tqdm to show progress for large datasets + if len(self.dataset) > 1000: + print( + f"Loading {len(self.dataset)} graphs (this may take a while)..." 
+ ) + data_list = [ + data + for data in tqdm(self.dataset, desc="Loading graphs") + ] + else: + data_list = [data for data in self.dataset] + elif isinstance(self.dataset, torch_geometric.data.Data): + data_list = [self.dataset] if self.pre_transform is not None: print(f"Applying transforms to {len(data_list)} graphs...") @@ -268,9 +351,49 @@ def process(self) -> None: print("Collating data...") self._data, self.slices = self.collate(transformed_data_list) + # If we processed only samples from split_idx, remap split_idx to new indices + if hasattr(self.dataset, "split_idx") and self.dataset.split_idx: + print("Remapping split_idx to new indices after processing...") + # Create mapping from old indices to new indices + old_to_new = { + old_idx: new_idx for new_idx, old_idx in enumerate(all_indices) + } + + # Remap split_idx + new_split_idx = {} + for split_name in ["train", "valid", "test"]: + if split_name in self.dataset.split_idx: + old_indices = self.dataset.split_idx[split_name] + # Convert tensor to list if needed + if hasattr(old_indices, "tolist"): + old_indices = old_indices.tolist() + elif hasattr(old_indices, "__iter__"): + old_indices = list(old_indices) + else: + old_indices = [old_indices] + + # Map old indices to new indices + new_indices = [ + old_to_new[idx] + for idx in old_indices + if idx in old_to_new + ] + new_split_idx[split_name] = new_indices + + # Store the remapped split_idx on the dataset itself + self.dataset.split_idx = new_split_idx + print( + f"Remapped split_idx: train={len(new_split_idx.get('train', []))}, " + f"valid={len(new_split_idx.get('valid', []))}, test={len(new_split_idx.get('test', []))}" + ) + assert isinstance(self._data, torch_geometric.data.Data) - print(f"Saving processed data to {self.processed_paths[0]}...") - self.save(transformed_data_list, self.processed_paths[0]) + # Guard against empty processed_paths + if self.processed_paths and len(self.processed_paths) > 0: + print(f"Saving processed data to 
{self.processed_paths[0]}...") + self.save(transformed_data_list, self.processed_paths[0]) + else: + print("Warning: No processed paths available, skipping save.") # Reset cache after saving self._data_list = None diff --git a/topobench/data/utils/split_utils.py b/topobench/data/utils/split_utils.py index 1fc973441..107866d88 100644 --- a/topobench/data/utils/split_utils.py +++ b/topobench/data/utils/split_utils.py @@ -227,9 +227,10 @@ def create_subset_splits(dataset, split_idx): ) # Create subsets using lazy indexing + # Always create a Subset even if indices are empty, to maintain consistent API train_dataset = Subset(dataset, train_indices) - val_dataset = Subset(dataset, valid_indices) if valid_indices else None - test_dataset = Subset(dataset, test_indices) if test_indices else None + val_dataset = Subset(dataset, valid_indices) + test_dataset = Subset(dataset, test_indices) return train_dataset, val_dataset, test_dataset @@ -257,7 +258,16 @@ def assign_train_val_test_mask_to_graphs(dataset, split_idx): for i in tqdm( split_idx["train"], desc="Loading train graphs", leave=False ): - graph = dataset[i] + # Convert tensor index to Python int if needed + idx = ( + i.item() + if isinstance(i, torch.Tensor) and i.dim() == 0 + else int(i) + ) + graph = dataset[idx] + # Clone if possible to avoid modifying original data + if hasattr(graph, "clone"): + graph = graph.clone() graph.train_mask = torch.tensor([1], dtype=torch.long) graph.val_mask = torch.tensor([0], dtype=torch.long) graph.test_mask = torch.tensor([0], dtype=torch.long) @@ -269,7 +279,16 @@ def assign_train_val_test_mask_to_graphs(dataset, split_idx): for i in tqdm( split_idx["valid"], desc="Loading validation graphs", leave=False ): - graph = dataset[i] + # Convert tensor index to Python int if needed + idx = ( + i.item() + if isinstance(i, torch.Tensor) and i.dim() == 0 + else int(i) + ) + graph = dataset[idx] + # Clone if possible to avoid modifying original data + if hasattr(graph, "clone"): + graph 
= graph.clone() graph.train_mask = torch.tensor([0], dtype=torch.long) graph.val_mask = torch.tensor([1], dtype=torch.long) graph.test_mask = torch.tensor([0], dtype=torch.long) @@ -277,7 +296,16 @@ def assign_train_val_test_mask_to_graphs(dataset, split_idx): print(f"Creating test split with {len(split_idx['test'])} samples...") for i in tqdm(split_idx["test"], desc="Loading test graphs", leave=False): - graph = dataset[i] + # Convert tensor index to Python int if needed + idx = ( + i.item() + if isinstance(i, torch.Tensor) and i.dim() == 0 + else int(i) + ) + graph = dataset[idx] + # Clone if possible to avoid modifying original data + if hasattr(graph, "clone"): + graph = graph.clone() graph.train_mask = torch.tensor([0], dtype=torch.long) graph.val_mask = torch.tensor([0], dtype=torch.long) graph.test_mask = torch.tensor([1], dtype=torch.long) From 2f50eea5379890749da624a2ee529b56cc4d8bf8 Mon Sep 17 00:00:00 2001 From: theosaulus Date: Sat, 22 Nov 2025 19:41:08 -0500 Subject: [PATCH 11/17] configs and tests --- configs/dataset/graph/OC20_IS2RE.yaml | 4 +- configs/dataset/graph/OC20_S2EF_200K.yaml | 4 +- configs/dataset/graph/OC20_S2EF_20M.yaml | 4 +- configs/dataset/graph/OC20_S2EF_2M.yaml | 4 +- configs/dataset/graph/OC20_S2EF_all.yaml | 4 +- configs/dataset/graph/OC22_IS2RE.yaml | 4 +- test/conftest.py | 9 +++ test/pipeline/test_pipeline.py | 25 ++------ topobench/data/utils/split_utils.py | 73 ++++++++++++++++++++--- topobench/model/model.py | 33 +++++----- 10 files changed, 108 insertions(+), 56 deletions(-) diff --git a/configs/dataset/graph/OC20_IS2RE.yaml b/configs/dataset/graph/OC20_IS2RE.yaml index 11ddeb4d5..540677d28 100644 --- a/configs/dataset/graph/OC20_IS2RE.yaml +++ b/configs/dataset/graph/OC20_IS2RE.yaml @@ -30,6 +30,6 @@ split_params: dataloader_params: batch_size: 32 - num_workers: 4 + num_workers: 0 pin_memory: true - persistent_workers: true + persistent_workers: false diff --git a/configs/dataset/graph/OC20_S2EF_200K.yaml 
b/configs/dataset/graph/OC20_S2EF_200K.yaml index e48f913dd..fc8a32d93 100644 --- a/configs/dataset/graph/OC20_S2EF_200K.yaml +++ b/configs/dataset/graph/OC20_S2EF_200K.yaml @@ -37,6 +37,6 @@ split_params: dataloader_params: batch_size: 32 - num_workers: 4 + num_workers: 0 pin_memory: true - persistent_workers: true + persistent_workers: false diff --git a/configs/dataset/graph/OC20_S2EF_20M.yaml b/configs/dataset/graph/OC20_S2EF_20M.yaml index 130c3cf95..815859189 100644 --- a/configs/dataset/graph/OC20_S2EF_20M.yaml +++ b/configs/dataset/graph/OC20_S2EF_20M.yaml @@ -33,6 +33,6 @@ split_params: dataloader_params: batch_size: 32 - num_workers: 4 + num_workers: 0 pin_memory: true - persistent_workers: true + persistent_workers: false diff --git a/configs/dataset/graph/OC20_S2EF_2M.yaml b/configs/dataset/graph/OC20_S2EF_2M.yaml index 2f6c3334c..172292b01 100644 --- a/configs/dataset/graph/OC20_S2EF_2M.yaml +++ b/configs/dataset/graph/OC20_S2EF_2M.yaml @@ -33,6 +33,6 @@ split_params: dataloader_params: batch_size: 32 - num_workers: 4 + num_workers: 0 pin_memory: true - persistent_workers: true + persistent_workers: false diff --git a/configs/dataset/graph/OC20_S2EF_all.yaml b/configs/dataset/graph/OC20_S2EF_all.yaml index 86c82789b..c50e85854 100644 --- a/configs/dataset/graph/OC20_S2EF_all.yaml +++ b/configs/dataset/graph/OC20_S2EF_all.yaml @@ -33,6 +33,6 @@ split_params: dataloader_params: batch_size: 32 - num_workers: 4 + num_workers: 0 pin_memory: true - persistent_workers: true + persistent_workers: false diff --git a/configs/dataset/graph/OC22_IS2RE.yaml b/configs/dataset/graph/OC22_IS2RE.yaml index 28198a2b2..54f3b1130 100644 --- a/configs/dataset/graph/OC22_IS2RE.yaml +++ b/configs/dataset/graph/OC22_IS2RE.yaml @@ -30,6 +30,6 @@ split_params: dataloader_params: batch_size: 32 - num_workers: 4 + num_workers: 0 pin_memory: true - persistent_workers: true + persistent_workers: false diff --git a/test/conftest.py b/test/conftest.py index 27de49aed..649857344 
100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,4 +1,6 @@ """Configuration file for pytest.""" +import os +from pathlib import Path import networkx as nx import pytest import torch @@ -11,6 +13,13 @@ ) +# Set PROJECT_ROOT environment variable if not already set +if "PROJECT_ROOT" not in os.environ: + # Get the project root (parent of test directory) + project_root = Path(__file__).parent.parent.absolute() + os.environ["PROJECT_ROOT"] = str(project_root) + + @pytest.fixture def mocker_fixture(mocker): """Return pytest mocker, used when one want to use mocker in setup_method. diff --git a/test/pipeline/test_pipeline.py b/test/pipeline/test_pipeline.py index 02bf6a717..c922270f6 100644 --- a/test/pipeline/test_pipeline.py +++ b/test/pipeline/test_pipeline.py @@ -28,6 +28,8 @@ def test_pipeline_oc20_is2re(self): "trainer.check_val_every_n_epoch=1", "paths=test", "callbacks=model_checkpoint", + "dataset.dataloader_params.num_workers=0", + "dataset.dataloader_params.persistent_workers=false", ], return_hydra_config=True ) @@ -49,6 +51,8 @@ def test_pipeline_oc22_is2re(self): "trainer.check_val_every_n_epoch=1", "paths=test", "callbacks=model_checkpoint", + "dataset.dataloader_params.num_workers=0", + "dataset.dataloader_params.persistent_workers=false", ], return_hydra_config=True ) @@ -74,24 +78,3 @@ def test_pipeline_oc20_s2ef(self): return_hydra_config=True ) run(cfg) - - def test_pipeline_with_lifting(self): - """Test pipeline with topological lifting on OC20 IS2RE.""" - dataset = "graph/OC20_IS2RE" - model = "simplicial/topotune" - - with hydra.initialize(config_path="../../configs", job_name="job"): - cfg = hydra.compose( - config_name="run.yaml", - overrides=[ - f"model={model}", - f"dataset={dataset}", - "trainer.max_epochs=2", - "trainer.min_epochs=1", - "trainer.check_val_every_n_epoch=1", - "paths=test", - "callbacks=model_checkpoint", - ], - return_hydra_config=True - ) - run(cfg) \ No newline at end of file diff --git 
a/topobench/data/utils/split_utils.py b/topobench/data/utils/split_utils.py index 107866d88..82f55d305 100644 --- a/topobench/data/utils/split_utils.py +++ b/topobench/data/utils/split_utils.py @@ -11,6 +11,58 @@ from topobench.dataloader import DataloadDataset +class DatasetWrapper: + """Wrapper that converts dataset items to (values, keys) format. + + This makes any dataset (including Subset) compatible with TopoBench's + custom collate function which expects (values, keys) tuples instead of Data objects. + + Parameters + ---------- + dataset : torch_geometric.data.Dataset or torch.utils.data.Subset + The underlying dataset. + """ + + def __init__(self, dataset): + """Initialize wrapper with dataset.""" + self.dataset = dataset + + def __len__(self): + """Return length of wrapped dataset.""" + return len(self.dataset) + + def __getitem__(self, idx): + """Get item at index in (values, keys) format. + + Parameters + ---------- + idx : int + Index of the data object to get. + + Returns + ------- + tuple + Tuple containing a list of all the values for the data and the corresponding keys. + """ + # Get the data object from the wrapped dataset + data = self.dataset[idx] + # Convert to (values, keys) format expected by collate_fn + if hasattr(data, "keys"): + keys = list(data.keys()) + return ([data[key] for key in keys], keys) + else: + # Fallback for non-Data objects + return data + + def __getstate__(self): + """Return state for pickling (multiprocessing compatibility).""" + return {"dataset": self.dataset} + + def __setstate__(self, state): + """Restore state from unpickling (multiprocessing compatibility).""" + self.dataset = state["dataset"] + + # Generate splits in different fasions def k_fold_split(labels, parameters, root=None): """Return train and valid indices as in K-Fold Cross-Validation. 
@@ -227,10 +279,11 @@ def create_subset_splits(dataset, split_idx): ) # Create subsets using lazy indexing + # Wrap subsets with DatasetWrapper to make them compatible with TopoBench's collate_fn # Always create a Subset even if indices are empty, to maintain consistent API - train_dataset = Subset(dataset, train_indices) - val_dataset = Subset(dataset, valid_indices) - test_dataset = Subset(dataset, test_indices) + train_dataset = DatasetWrapper(Subset(dataset, train_indices)) + val_dataset = DatasetWrapper(Subset(dataset, valid_indices)) + test_dataset = DatasetWrapper(Subset(dataset, test_indices)) return train_dataset, val_dataset, test_dataset @@ -444,14 +497,16 @@ def load_inductive_splits(dataset, parameters): If 'fixed' is chosen, the dataset should have the attribute split_idx" ) - # Use optimized subset-based splitting for large datasets - # This avoids loading all graphs into memory at once - use_subset_split = len(dataset) > 10000 # Use subset for large datasets + # Use optimized subset-based splitting for large datasets OR when using fixed splits + # This avoids loading all graphs into memory at once for large datasets + # For fixed splits, subset-based splitting preserves the original dataset indices + use_subset_split = len(dataset) > 10000 or parameters.split_type == "fixed" if use_subset_split: - print( - f"Using optimized subset-based splitting for large dataset ({len(dataset)} graphs)" - ) + if len(dataset) > 10000: + print( + f"Using optimized subset-based splitting for large dataset ({len(dataset)} graphs)" + ) train_dataset, val_dataset, test_dataset = create_subset_splits( dataset, split_idx ) diff --git a/topobench/model/model.py b/topobench/model/model.py index a7c688b47..97229c569 100755 --- a/topobench/model/model.py +++ b/topobench/model/model.py @@ -232,21 +232,26 @@ def process_outputs(self, model_out: dict, batch: Data) -> dict: dict Dictionary containing the updated model output. 
""" - # Get the correct mask - if self.state_str == "Training": - mask = batch.train_mask - elif self.state_str == "Validation": - mask = batch.val_mask - elif self.state_str == "Test": - mask = batch.test_mask - else: - raise ValueError("Invalid state_str") - + # Get the correct mask (only for node-level tasks or when masks exist) + # For graph-level tasks with explicit splits (e.g., OC20 S2EF), masks don't exist + mask = None if self.task_level == "node": - # Keep only train data points - for key, val in model_out.items(): - if key in ["logits", "labels"]: - model_out[key] = val[mask] + if self.state_str == "Training": + mask = ( + batch.train_mask if hasattr(batch, "train_mask") else None + ) + elif self.state_str == "Validation": + mask = batch.val_mask if hasattr(batch, "val_mask") else None + elif self.state_str == "Test": + mask = batch.test_mask if hasattr(batch, "test_mask") else None + else: + raise ValueError("Invalid state_str") + + # Keep only relevant data points if mask exists + if mask is not None: + for key, val in model_out.items(): + if key in ["logits", "labels"]: + model_out[key] = val[mask] return model_out From 0da47c7c5348668e68fdbde917d93f3d164487a0 Mon Sep 17 00:00:00 2001 From: theosaulus Date: Mon, 24 Nov 2025 22:54:06 -0500 Subject: [PATCH 12/17] mock config and avoid testing the other configs --- .../dataset/graph/OC20_S2EF_200K_mock.yaml | 42 +++++++++++++++++++ test/data/load/test_datasetloaders.py | 5 ++- 2 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 configs/dataset/graph/OC20_S2EF_200K_mock.yaml diff --git a/configs/dataset/graph/OC20_S2EF_200K_mock.yaml b/configs/dataset/graph/OC20_S2EF_200K_mock.yaml new file mode 100644 index 000000000..477e816d8 --- /dev/null +++ b/configs/dataset/graph/OC20_S2EF_200K_mock.yaml @@ -0,0 +1,42 @@ +# OC20 S2EF Mock Dataset Configuration +# Mock configuration for testing purposes using the 200K training samples (350MB) +# This configuration is designed to be used for CI/CD 
testing without requiring large dataset downloads +# It downloads only the 200K training split and uses it for train/val/test + +loader: + _target_: topobench.data.loaders.graph.oc20_dataset_loader.OC20DatasetLoader + parameters: + data_domain: graph + data_type: oc20 + data_name: OC20_S2EF_200K_mock + task: s2ef + train_split: "200K" + val_splits: [] # Empty list to avoid downloading validation splits + include_test: false # Don't download test data to keep size minimal + download: true + dtype: float32 + legacy_format: false + max_samples: 10 # Limit to 10 samples for fast testing + data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} + +parameters: + num_features: 1 # Number of node features (atomic numbers) + num_classes: 1 # Regression task (energy prediction) + task: regression + loss_type: mse + monitor_metric: mae + task_level: graph # Graph-level prediction + + +split_params: + learning_setting: inductive + split_type: random # Use random splitting since we only download train split + data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} + data_seed: 0 + train_prop: 0.6 # 60% train, 20% validation, 20% test + +dataloader_params: + batch_size: 8 # Smaller batch size for testing + num_workers: 0 + pin_memory: true + persistent_workers: false diff --git a/test/data/load/test_datasetloaders.py b/test/data/load/test_datasetloaders.py index cb21fd421..4751e6c0b 100644 --- a/test/data/load/test_datasetloaders.py +++ b/test/data/load/test_datasetloaders.py @@ -41,7 +41,10 @@ def _gather_config_files(self, base_dir: Path) -> List[str]: # Below the datasets that have some default transforms with we manually overriten with no_transform, # due to lack of default transform for domain2domain "REDDIT-BINARY.yaml", "IMDB-MULTI.yaml", "IMDB-BINARY.yaml", #"ZINC.yaml" - "ogbg-molpcba.yaml", "manual_dataset.yaml" # "ogbg-molhiv.yaml" + "ogbg-molpcba.yaml", "manual_dataset.yaml", # 
"ogbg-molhiv.yaml" + # OC20/OC22 datasets that require large downloads (excluded from tests) + "OC20_S2EF_200K.yaml", "OC20_S2EF_2M.yaml", "OC20_S2EF_20M.yaml", + "OC20_S2EF_all.yaml", "OC20_IS2RE.yaml", "OC22_IS2RE.yaml" } # Below the datasets that takes quite some time to load and process From c9baf78c8c4a61517e39d7d06e4f6ec749335c3d Mon Sep 17 00:00:00 2001 From: theosaulus Date: Tue, 25 Nov 2025 11:59:47 -0500 Subject: [PATCH 13/17] remove heavy tests on the larger datasets --- test/data/load/test_oc20_datasets.py | 705 +++++++++++++-------------- 1 file changed, 340 insertions(+), 365 deletions(-) diff --git a/test/data/load/test_oc20_datasets.py b/test/data/load/test_oc20_datasets.py index 109c16dae..234f01b42 100644 --- a/test/data/load/test_oc20_datasets.py +++ b/test/data/load/test_oc20_datasets.py @@ -83,246 +83,246 @@ def setup_project_root(): os.environ["PROJECT_ROOT"] = str(project_root) -class TestOC20IS2REDatasetLoader: - """Test suite for OC20 IS2RE dataset loader.""" - - @pytest.fixture(autouse=True) - def setup(self): - """Setup test environment.""" - hydra.core.global_hydra.GlobalHydra.instance().clear() - register_resolvers() - setup_project_root() - self.relative_config_dir = "../../../configs" - - def test_loader_initialization(self): - """Test that the IS2RE loader can be initialized.""" - with hydra.initialize( - version_base="1.3", - config_path=self.relative_config_dir, - job_name="test_oc20_is2re" - ): - cfg = hydra.compose( - config_name="run.yaml", - overrides=["dataset=graph/OC20_IS2RE"], - return_hydra_config=True, - ) - loader = hydra.utils.instantiate(cfg.dataset.loader) - assert isinstance(loader, IS2REDatasetLoader) - assert loader.parameters.data_name == "OC20_IS2RE" - assert loader.parameters.task == "is2re" - - def test_dataset_loading(self): - """Test that the IS2RE dataset loads correctly.""" - with hydra.initialize( - version_base="1.3", - config_path=self.relative_config_dir, - job_name="test_oc20_is2re_load" - ): - cfg = 
hydra.compose( - config_name="run.yaml", - overrides=["dataset=graph/OC20_IS2RE"], - return_hydra_config=True, - ) - loader = hydra.utils.instantiate(cfg.dataset.loader) - dataset, data_dir = loader.load() - - # Check dataset type - assert isinstance(dataset, IS2REDataset) - - # Check dataset has required attributes - assert hasattr(dataset, 'split_idx') - assert 'train' in dataset.split_idx - assert 'valid' in dataset.split_idx - assert 'test' in dataset.split_idx - - # Check splits are not empty (when max_samples is set) - assert len(dataset.split_idx['train']) > 0 - assert len(dataset.split_idx['valid']) > 0 - assert len(dataset.split_idx['test']) > 0 - - # Check dataset length - assert len(dataset) > 0 - - def test_dataset_item_access(self): - """Test accessing individual items from the dataset.""" - with hydra.initialize( - version_base="1.3", - config_path=self.relative_config_dir, - job_name="test_oc20_is2re_item" - ): - cfg = hydra.compose( - config_name="run.yaml", - overrides=["dataset=graph/OC20_IS2RE"], - return_hydra_config=True, - ) - loader = hydra.utils.instantiate(cfg.dataset.loader) - dataset, _ = loader.load() - - # Get first item - data = dataset[0] - - # Check data has required PyG attributes - assert hasattr(data, 'x') - assert hasattr(data, 'edge_index') - assert hasattr(data, 'y') - - # Check data types - assert isinstance(data.x, torch.Tensor) - assert isinstance(data.edge_index, torch.Tensor) - assert isinstance(data.y, torch.Tensor) - - # Check shapes - assert data.x.dim() >= 1 - assert data.edge_index.dim() == 2 - assert data.edge_index.size(0) == 2 # [2, num_edges] - - def test_split_indices_validity(self): - """Test that split indices are valid and non-overlapping.""" - with hydra.initialize( - version_base="1.3", - config_path=self.relative_config_dir, - job_name="test_oc20_is2re_splits" - ): - cfg = hydra.compose( - config_name="run.yaml", - overrides=["dataset=graph/OC20_IS2RE"], - return_hydra_config=True, - ) - loader = 
hydra.utils.instantiate(cfg.dataset.loader) - dataset, _ = loader.load() - - train_idx = dataset.split_idx['train'].numpy() - val_idx = dataset.split_idx['valid'].numpy() - test_idx = dataset.split_idx['test'].numpy() - - # Check no overlap between splits - assert len(set(train_idx) & set(val_idx)) == 0 - assert len(set(train_idx) & set(test_idx)) == 0 - # val and test might overlap if test reuses val when test is not available - - # Check all indices are within dataset bounds - all_indices = list(train_idx) + list(val_idx) - assert all(0 <= idx < len(dataset) for idx in all_indices) - - -class TestOC22IS2REDatasetLoader: - """Test suite for OC22 IS2RE dataset loader.""" - - @pytest.fixture(autouse=True) - def setup(self): - """Setup test environment.""" - hydra.core.global_hydra.GlobalHydra.instance().clear() - register_resolvers() - setup_project_root() - self.relative_config_dir = "../../../configs" - - def test_loader_initialization(self): - """Test that the OC22 IS2RE loader can be initialized.""" - with hydra.initialize( - version_base="1.3", - config_path=self.relative_config_dir, - job_name="test_oc22_is2re" - ): - cfg = hydra.compose( - config_name="run.yaml", - overrides=["dataset=graph/OC22_IS2RE"], - return_hydra_config=True, - ) - loader = hydra.utils.instantiate(cfg.dataset.loader) - assert isinstance(loader, OC22IS2REDatasetLoader) - assert loader.parameters.data_name == "OC22_IS2RE" - assert loader.parameters.task == "oc22_is2re" - - def test_dataset_loading(self): - """Test that the OC22 IS2RE dataset loads correctly.""" - with hydra.initialize( - version_base="1.3", - config_path=self.relative_config_dir, - job_name="test_oc22_is2re_load" - ): - cfg = hydra.compose( - config_name="run.yaml", - overrides=["dataset=graph/OC22_IS2RE"], - return_hydra_config=True, - ) - loader = hydra.utils.instantiate(cfg.dataset.loader) - dataset, data_dir = loader.load() - - # Check dataset type - assert isinstance(dataset, OC22IS2REDataset) - - # Check dataset has 
required attributes - assert hasattr(dataset, 'split_idx') - assert 'train' in dataset.split_idx - assert 'valid' in dataset.split_idx - assert 'test' in dataset.split_idx - - # Check splits are not empty (when max_samples is set) - assert len(dataset.split_idx['train']) > 0 - assert len(dataset.split_idx['valid']) > 0 - assert len(dataset.split_idx['test']) > 0 - - # Check dataset length - assert len(dataset) > 0 - - def test_dataset_item_access(self): - """Test accessing individual items from the OC22 dataset.""" - with hydra.initialize( - version_base="1.3", - config_path=self.relative_config_dir, - job_name="test_oc22_is2re_item" - ): - cfg = hydra.compose( - config_name="run.yaml", - overrides=["dataset=graph/OC22_IS2RE"], - return_hydra_config=True, - ) - loader = hydra.utils.instantiate(cfg.dataset.loader) - dataset, _ = loader.load() - - # Get first item - data = dataset[0] - - # Check data has required PyG attributes - assert hasattr(data, 'x') - assert hasattr(data, 'edge_index') - assert hasattr(data, 'y') - - # Check data types - assert isinstance(data.x, torch.Tensor) - assert isinstance(data.edge_index, torch.Tensor) - assert isinstance(data.y, torch.Tensor) - - # Check shapes - assert data.x.dim() >= 1 - assert data.edge_index.dim() == 2 - assert data.edge_index.size(0) == 2 - - def test_split_indices_validity(self): - """Test that split indices are valid and non-overlapping.""" - with hydra.initialize( - version_base="1.3", - config_path=self.relative_config_dir, - job_name="test_oc22_is2re_splits" - ): - cfg = hydra.compose( - config_name="run.yaml", - overrides=["dataset=graph/OC22_IS2RE"], - return_hydra_config=True, - ) - loader = hydra.utils.instantiate(cfg.dataset.loader) - dataset, _ = loader.load() - - train_idx = dataset.split_idx['train'].numpy() - val_idx = dataset.split_idx['valid'].numpy() - test_idx = dataset.split_idx['test'].numpy() - - # Check no overlap between train and val - assert len(set(train_idx) & set(val_idx)) == 0 - - # 
Check all indices are within dataset bounds - all_indices = list(train_idx) + list(val_idx) - assert all(0 <= idx < len(dataset) for idx in all_indices) +# class TestOC20IS2REDatasetLoader: +# """Test suite for OC20 IS2RE dataset loader.""" + +# @pytest.fixture(autouse=True) +# def setup(self): +# """Setup test environment.""" +# hydra.core.global_hydra.GlobalHydra.instance().clear() +# register_resolvers() +# setup_project_root() +# self.relative_config_dir = "../../../configs" + +# def test_loader_initialization(self): +# """Test that the IS2RE loader can be initialized.""" +# with hydra.initialize( +# version_base="1.3", +# config_path=self.relative_config_dir, +# job_name="test_oc20_is2re" +# ): +# cfg = hydra.compose( +# config_name="run.yaml", +# overrides=["dataset=graph/OC20_IS2RE"], +# ) +# print('Test that the OC20 IS2RE loader can be initialized') +# loader = hydra.utils.instantiate(cfg.dataset.loader) +# assert isinstance(loader, IS2REDatasetLoader) +# assert loader.parameters.data_name == "OC20_IS2RE" +# assert loader.parameters.task == "is2re" + +# def test_dataset_loading(self): +# """Test that the IS2RE dataset loads correctly.""" +# with hydra.initialize( +# version_base="1.3", +# config_path=self.relative_config_dir, +# job_name="test_oc20_is2re_load" +# ): +# cfg = hydra.compose( +# config_name="run.yaml", +# overrides=["dataset=graph/OC20_IS2RE"], +# ) +# print('Test that the OC20 IS2RE dataset loads correctly') +# loader = hydra.utils.instantiate(cfg.dataset.loader) +# dataset, data_dir = loader.load() + +# # Check dataset type +# assert isinstance(dataset, IS2REDataset) + +# # Check dataset has required attributes +# assert hasattr(dataset, 'split_idx') +# assert 'train' in dataset.split_idx +# assert 'valid' in dataset.split_idx +# assert 'test' in dataset.split_idx + +# # Check splits are not empty (when max_samples is set) +# assert len(dataset.split_idx['train']) > 0 +# assert len(dataset.split_idx['valid']) > 0 +# assert 
len(dataset.split_idx['test']) > 0 + +# # Check dataset length +# assert len(dataset) > 0 + +# def test_dataset_item_access(self): +# """Test accessing individual items from the dataset.""" +# with hydra.initialize( +# version_base="1.3", +# config_path=self.relative_config_dir, +# job_name="test_oc20_is2re_item" +# ): +# cfg = hydra.compose( +# config_name="run.yaml", +# overrides=["dataset=graph/OC20_IS2RE"], +# ) +# print('Test that the OC20 IS2RE dataset loads correctly') +# loader = hydra.utils.instantiate(cfg.dataset.loader) +# dataset, _ = loader.load() + +# # Get first item +# data = dataset[0] + +# # Check data has required PyG attributes +# assert hasattr(data, 'x') +# assert hasattr(data, 'edge_index') +# assert hasattr(data, 'y') + +# # Check data types +# assert isinstance(data.x, torch.Tensor) +# assert isinstance(data.edge_index, torch.Tensor) +# assert isinstance(data.y, torch.Tensor) + +# # Check shapes +# assert data.x.dim() >= 1 +# assert data.edge_index.dim() == 2 +# assert data.edge_index.size(0) == 2 # [2, num_edges] + +# def test_split_indices_validity(self): +# """Test that split indices are valid and non-overlapping.""" +# with hydra.initialize( +# version_base="1.3", +# config_path=self.relative_config_dir, +# job_name="test_oc20_is2re_splits" +# ): +# cfg = hydra.compose( +# config_name="run.yaml", +# overrides=["dataset=graph/OC20_IS2RE"], +# ) +# print('Test that the OC20 IS2RE dataset loads correctly') +# loader = hydra.utils.instantiate(cfg.dataset.loader) +# dataset, _ = loader.load() + +# train_idx = dataset.split_idx['train'].numpy() +# val_idx = dataset.split_idx['valid'].numpy() +# test_idx = dataset.split_idx['test'].numpy() + +# # Check no overlap between splits +# assert len(set(train_idx) & set(val_idx)) == 0 +# assert len(set(train_idx) & set(test_idx)) == 0 +# # val and test might overlap if test reuses val when test is not available + +# # Check all indices are within dataset bounds +# all_indices = list(train_idx) + 
list(val_idx) +# assert all(0 <= idx < len(dataset) for idx in all_indices) + + +# class TestOC22IS2REDatasetLoader: +# """Test suite for OC22 IS2RE dataset loader.""" + +# @pytest.fixture(autouse=True) +# def setup(self): +# """Setup test environment.""" +# hydra.core.global_hydra.GlobalHydra.instance().clear() +# register_resolvers() +# setup_project_root() +# self.relative_config_dir = "../../../configs" + +# def test_loader_initialization(self): +# """Test that the OC22 IS2RE loader can be initialized.""" +# with hydra.initialize( +# version_base="1.3", +# config_path=self.relative_config_dir, +# job_name="test_oc22_is2re" +# ): +# cfg = hydra.compose( +# config_name="run.yaml", +# overrides=["dataset=graph/OC22_IS2RE"], +# ) +# print('Test that the OC20 IS2RE loader can be initialized') +# loader = hydra.utils.instantiate(cfg.dataset.loader) +# assert isinstance(loader, OC22IS2REDatasetLoader) +# assert loader.parameters.data_name == "OC22_IS2RE" +# assert loader.parameters.task == "oc22_is2re" + +# def test_dataset_loading(self): +# """Test that the OC22 IS2RE dataset loads correctly.""" +# with hydra.initialize( +# version_base="1.3", +# config_path=self.relative_config_dir, +# job_name="test_oc22_is2re_load" +# ): +# cfg = hydra.compose( +# config_name="run.yaml", +# overrides=["dataset=graph/OC22_IS2RE"], +# ) +# print('Test that the OC20 IS2RE dataset loads correctly') +# loader = hydra.utils.instantiate(cfg.dataset.loader) +# dataset, data_dir = loader.load() + +# # Check dataset type +# assert isinstance(dataset, OC22IS2REDataset) + +# # Check dataset has required attributes +# assert hasattr(dataset, 'split_idx') +# assert 'train' in dataset.split_idx +# assert 'valid' in dataset.split_idx +# assert 'test' in dataset.split_idx + +# # Check splits are not empty (when max_samples is set) +# assert len(dataset.split_idx['train']) > 0 +# assert len(dataset.split_idx['valid']) > 0 +# assert len(dataset.split_idx['test']) > 0 + +# # Check dataset length +# 
assert len(dataset) > 0 + +# def test_dataset_item_access(self): +# """Test accessing individual items from the OC22 dataset.""" +# with hydra.initialize( +# version_base="1.3", +# config_path=self.relative_config_dir, +# job_name="test_oc22_is2re_item" +# ): +# cfg = hydra.compose( +# config_name="run.yaml", +# overrides=["dataset=graph/OC22_IS2RE"], +# ) +# print('Test that the OC20 IS2RE dataset loads correctly') +# loader = hydra.utils.instantiate(cfg.dataset.loader) +# dataset, _ = loader.load() + +# # Get first item +# data = dataset[0] + +# # Check data has required PyG attributes +# assert hasattr(data, 'x') +# assert hasattr(data, 'edge_index') +# assert hasattr(data, 'y') + +# # Check data types +# assert isinstance(data.x, torch.Tensor) +# assert isinstance(data.edge_index, torch.Tensor) +# assert isinstance(data.y, torch.Tensor) + +# # Check shapes +# assert data.x.dim() >= 1 +# assert data.edge_index.dim() == 2 +# assert data.edge_index.size(0) == 2 + +# def test_split_indices_validity(self): +# """Test that split indices are valid and non-overlapping.""" +# with hydra.initialize( +# version_base="1.3", +# config_path=self.relative_config_dir, +# job_name="test_oc22_is2re_splits" +# ): +# cfg = hydra.compose( +# config_name="run.yaml", +# overrides=["dataset=graph/OC22_IS2RE"], +# ) +# print('Test that the OC20 IS2RE dataset loads correctly') +# loader = hydra.utils.instantiate(cfg.dataset.loader) +# dataset, _ = loader.load() + +# train_idx = dataset.split_idx['train'].numpy() +# val_idx = dataset.split_idx['valid'].numpy() +# test_idx = dataset.split_idx['test'].numpy() + +# # Check no overlap between train and val +# assert len(set(train_idx) & set(val_idx)) == 0 + +# # Check all indices are within dataset bounds +# all_indices = list(train_idx) + list(val_idx) +# assert all(0 <= idx < len(dataset) for idx in all_indices) class TestOC20S2EFDatasetLoader: @@ -345,9 +345,9 @@ def test_loader_initialization_200k(self): ): cfg = hydra.compose( 
config_name="run.yaml", - overrides=["dataset=graph/OC20_S2EF_200K"], - return_hydra_config=True, + overrides=["dataset=graph/OC20_S2EF_200K_mock"], ) + print('Test that the OC20 S2EF 200K loader can be initialized') loader = hydra.utils.instantiate(cfg.dataset.loader) assert isinstance(loader, OC20DatasetLoader) assert loader.parameters.task == "s2ef" @@ -362,9 +362,9 @@ def test_dataset_loading_200k(self): ): cfg = hydra.compose( config_name="run.yaml", - overrides=["dataset=graph/OC20_S2EF_200K"], - return_hydra_config=True, + overrides=["dataset=graph/OC20_S2EF_200K_mock"], ) + print('Test that the OC20 S2EF 200K dataset loads correctly') loader = hydra.utils.instantiate(cfg.dataset.loader) dataset, data_dir = loader.load() @@ -379,7 +379,8 @@ def test_dataset_loading_200k(self): # Check splits are not empty assert len(dataset.split_idx['train']) > 0 - assert len(dataset.split_idx['valid']) > 0 + # With val_splits=[], validation data will come from random split (if split_type=random) + # Otherwise it will be empty # S2EF test data is LMDB format (incompatible with .extxyz/ASE DB), so test split is empty assert len(dataset.split_idx['test']) == 0 @@ -395,9 +396,9 @@ def test_dataset_item_access_s2ef(self): ): cfg = hydra.compose( config_name="run.yaml", - overrides=["dataset=graph/OC20_S2EF_200K"], - return_hydra_config=True, + overrides=["dataset=graph/OC20_S2EF_200K_mock"], ) + print('Test that the OC20 S2EF dataset loads correctly') loader = hydra.utils.instantiate(cfg.dataset.loader) dataset, _ = loader.load() @@ -427,14 +428,13 @@ def test_validation_splits_configuration(self): # Test with val_id only cfg = hydra.compose( config_name="run.yaml", - overrides=[ - "dataset=graph/OC20_S2EF_200K", - ], - return_hydra_config=True, + overrides=["dataset=graph/OC20_S2EF_200K_mock"], ) + print('Test that the OC20 S2EF 200K loader can be initialized') loader = hydra.utils.instantiate(cfg.dataset.loader) - # val_splits=null means use all 4 validation splits - assert 
loader.parameters.val_splits is None + # val_splits=[] (empty list) for mock config means no separate validation files + # (validation will come from random split of train data) + assert loader.parameters.val_splits == [] def test_split_indices_validity_s2ef(self): """Test that S2EF split indices are valid.""" @@ -445,9 +445,9 @@ def test_split_indices_validity_s2ef(self): ): cfg = hydra.compose( config_name="run.yaml", - overrides=["dataset=graph/OC20_S2EF_200K"], - return_hydra_config=True, + overrides=["dataset=graph/OC20_S2EF_200K_mock"], ) + print('Test that the OC20 S2EF 200K dataset loads correctly') loader = hydra.utils.instantiate(cfg.dataset.loader) dataset, _ = loader.load() @@ -456,35 +456,13 @@ def test_split_indices_validity_s2ef(self): valid_idx = dataset.split_idx['valid'] test_idx = dataset.split_idx['test'] - # Check no overlap between train and val - assert len(set(train_idx) & set(valid_idx)) == 0 - # Check indices are valid (note: max_samples truncates dataset but indices reflect original positions) assert len(train_idx) > 0 - assert len(valid_idx) > 0 - - def test_different_train_splits(self): - """Test that different training split sizes can be loaded.""" - train_splits = ["200K"] # Only test 200K for now (others need download & preprocessing) - - for split in train_splits: - with hydra.initialize( - version_base="1.3", - config_path=self.relative_config_dir, - job_name=f"test_oc20_s2ef_{split}" - ): - cfg = hydra.compose( - config_name="run.yaml", - overrides=[f"dataset=graph/OC20_S2EF_{split}"], - return_hydra_config=True, - ) - loader = hydra.utils.instantiate(cfg.dataset.loader) - assert loader.parameters.train_split == split - - # Load and verify dataset - dataset, _ = loader.load() - assert len(dataset) > 0 - assert len(dataset.split_idx['train']) > 0 + # With val_splits=[], valid_idx may be empty at dataset level + # (preprocessor will create splits later with random splitting) + # Only check non-overlap if both have data + if 
len(valid_idx) > 0: + assert len(set(train_idx) & set(valid_idx)) == 0 class TestOC20DatasetIntegration: @@ -498,84 +476,85 @@ def setup(self): setup_project_root() self.relative_config_dir = "../../../configs" - def test_is2re_with_preprocessor(self): - """Test IS2RE dataset with PreProcessor.""" - with hydra.initialize( - version_base="1.3", - config_path=self.relative_config_dir, - job_name="test_is2re_preprocessor" - ): - cfg = hydra.compose( - config_name="run.yaml", - overrides=[ - "dataset=graph/OC20_IS2RE", - "model=graph/gcn", - ], - return_hydra_config=True, - ) - - # Load dataset - loader = hydra.utils.instantiate(cfg.dataset.loader) - dataset, data_dir = loader.load() - - # Use preprocessor - from topobench.data.preprocessor import PreProcessor - transform_config = cfg.get("transforms", None) - preprocessor = PreProcessor(dataset, data_dir, transform_config) - - # Load splits - dataset_train, dataset_val, dataset_test = preprocessor.load_dataset_splits( - cfg.dataset.split_params - ) - - # Verify splits exist and are not empty - assert dataset_train is not None - assert dataset_val is not None - assert dataset_test is not None - assert len(dataset_train) > 0 - assert len(dataset_val) > 0 - assert len(dataset_test) > 0 - - def test_oc22_is2re_with_preprocessor(self): - """Test OC22 IS2RE dataset with PreProcessor.""" - with hydra.initialize( - version_base="1.3", - config_path=self.relative_config_dir, - job_name="test_oc22_preprocessor" - ): - cfg = hydra.compose( - config_name="run.yaml", - overrides=[ - "dataset=graph/OC22_IS2RE", - "model=graph/gcn", - ], - return_hydra_config=True, - ) - - # Load dataset - loader = hydra.utils.instantiate(cfg.dataset.loader) - dataset, data_dir = loader.load() - - # Use preprocessor - from topobench.data.preprocessor import PreProcessor - transform_config = cfg.get("transforms", None) - preprocessor = PreProcessor(dataset, data_dir, transform_config) - - # Load splits - dataset_train, dataset_val, dataset_test = 
preprocessor.load_dataset_splits( - cfg.dataset.split_params - ) - - # Verify splits exist and are not empty - assert dataset_train is not None - assert dataset_val is not None - assert dataset_test is not None - assert len(dataset_train) > 0 - assert len(dataset_val) > 0 - assert len(dataset_test) > 0 +# def test_is2re_with_preprocessor(self): +# """Test IS2RE dataset with PreProcessor.""" +# with hydra.initialize( +# version_base="1.3", +# config_path=self.relative_config_dir, +# job_name="test_is2re_preprocessor" +# ): +# cfg = hydra.compose( +# config_name="run.yaml", +# overrides=[ +# "dataset=graph/OC20_IS2RE", +# "model=graph/gcn", +# ], +# return_hydra_config=True, +# ) + +# # Load dataset +# loader = hydra.utils.instantiate(cfg.dataset.loader) +# dataset, data_dir = loader.load() + +# # Use preprocessor +# from topobench.data.preprocessor import PreProcessor +# transform_config = cfg.get("transforms", None) +# preprocessor = PreProcessor(dataset, data_dir, transform_config) + +# # Load splits +# dataset_train, dataset_val, dataset_test = preprocessor.load_dataset_splits( +# cfg.dataset.split_params +# ) + +# # Verify splits exist and are not empty +# assert dataset_train is not None +# assert dataset_val is not None +# assert dataset_test is not None +# assert len(dataset_train) > 0 +# assert len(dataset_val) > 0 +# assert len(dataset_test) > 0 + +# def test_oc22_is2re_with_preprocessor(self): +# """Test OC22 IS2RE dataset with PreProcessor.""" +# with hydra.initialize( +# version_base="1.3", +# config_path=self.relative_config_dir, +# job_name="test_oc22_preprocessor" +# ): +# cfg = hydra.compose( +# config_name="run.yaml", +# overrides=[ +# "dataset=graph/OC22_IS2RE", +# "model=graph/gcn", +# ], +# return_hydra_config=True, +# ) +# print('Config used for OC22 IS2RE preprocessor test:\n', OmegaConf.to_yaml(cfg)) + +# # Load dataset +# loader = hydra.utils.instantiate(cfg.dataset.loader) +# dataset, data_dir = loader.load() + +# # Use preprocessor +# from 
topobench.data.preprocessor import PreProcessor +# transform_config = cfg.get("transforms", None) +# preprocessor = PreProcessor(dataset, data_dir, transform_config) + +# # Load splits +# dataset_train, dataset_val, dataset_test = preprocessor.load_dataset_splits( +# cfg.dataset.split_params +# ) + +# # Verify splits exist and are not empty +# assert dataset_train is not None +# assert dataset_val is not None +# assert dataset_test is not None +# assert len(dataset_train) > 0 +# assert len(dataset_val) > 0 +# assert len(dataset_test) > 0 def test_s2ef_with_preprocessor(self): - """Test S2EF dataset with PreProcessor.""" + """Test mock S2EF dataset with PreProcessor.""" with hydra.initialize( version_base="1.3", config_path=self.relative_config_dir, @@ -583,12 +562,9 @@ def test_s2ef_with_preprocessor(self): ): cfg = hydra.compose( config_name="run.yaml", - overrides=[ - "dataset=graph/OC20_S2EF_200K", - "model=graph/gcn", - ], - return_hydra_config=True, + overrides=["dataset=graph/OC20_S2EF_200K_mock", "model=graph/gcn"] ) + print('Test that the OC20 S2EF 200K loader can be initialized') # Load dataset loader = hydra.utils.instantiate(cfg.dataset.loader) @@ -610,5 +586,4 @@ def test_s2ef_with_preprocessor(self): assert dataset_test is not None assert len(dataset_train) > 0 assert len(dataset_val) > 0 - # S2EF datasets don't have test splits (include_test=false by default) - assert len(dataset_test) == 0 + # S2EF test may be empty if using the official splits, no test for this \ No newline at end of file From 554e3979f2af5023cc3e47b5a4291b4e61394425 Mon Sep 17 00:00:00 2001 From: theosaulus Date: Tue, 25 Nov 2025 18:57:30 -0500 Subject: [PATCH 14/17] removing again unnecessary tests and bug fix on number of loaded molecules --- test/data/load/test_oc20_datasets.py | 8 +- test/pipeline/test_pipeline.py | 93 ++++++++++--------- .../loaders/graph/oc20_asedbs2ef_loader.py | 40 ++++++-- 3 files changed, 86 insertions(+), 55 deletions(-) diff --git 
a/test/data/load/test_oc20_datasets.py b/test/data/load/test_oc20_datasets.py index 234f01b42..9b61f7720 100644 --- a/test/data/load/test_oc20_datasets.py +++ b/test/data/load/test_oc20_datasets.py @@ -7,12 +7,12 @@ from pathlib import Path from omegaconf import DictConfig, OmegaConf -from topobench.data.loaders.graph.oc20_is2re_dataset_loader import IS2REDatasetLoader -from topobench.data.loaders.graph.oc22_is2re_dataset_loader import OC22IS2REDatasetLoader +# from topobench.data.loaders.graph.oc20_is2re_dataset_loader import IS2REDatasetLoader +# from topobench.data.loaders.graph.oc22_is2re_dataset_loader import OC22IS2REDatasetLoader from topobench.data.loaders.graph.oc20_dataset_loader import OC20DatasetLoader from topobench.data.loaders.graph.oc20_asedbs2ef_loader import OC20ASEDBDataset -from topobench.data.datasets.oc20_is2re_dataset import IS2REDataset -from topobench.data.datasets.oc22_is2re_dataset import OC22IS2REDataset +# from topobench.data.datasets.oc20_is2re_dataset import IS2REDataset +# from topobench.data.datasets.oc22_is2re_dataset import OC22IS2REDataset from topobench.data.datasets.oc20_dataset import OC20Dataset from topobench.utils.config_resolvers import ( get_default_metrics, diff --git a/test/pipeline/test_pipeline.py b/test/pipeline/test_pipeline.py index c922270f6..affb6ea39 100644 --- a/test/pipeline/test_pipeline.py +++ b/test/pipeline/test_pipeline.py @@ -12,55 +12,56 @@ def setup_method(self): """Setup method.""" hydra.core.global_hydra.GlobalHydra.instance().clear() - def test_pipeline_oc20_is2re(self): - """Test pipeline with OC20 IS2RE dataset.""" - dataset = "graph/OC20_IS2RE" - model = "graph/gcn" - - with hydra.initialize(config_path="../../configs", job_name="job"): - cfg = hydra.compose( - config_name="run.yaml", - overrides=[ - f"model={model}", - f"dataset={dataset}", - "trainer.max_epochs=2", - "trainer.min_epochs=1", - "trainer.check_val_every_n_epoch=1", - "paths=test", - "callbacks=model_checkpoint", - 
"dataset.dataloader_params.num_workers=0", - "dataset.dataloader_params.persistent_workers=false", - ], - return_hydra_config=True - ) - run(cfg) + # IS2RE and OC22 tests commented out to prevent large dataset downloads during testing + # def test_pipeline_oc20_is2re(self): + # """Test pipeline with OC20 IS2RE dataset.""" + # dataset = "graph/OC20_IS2RE" + # model = "graph/gcn" + # + # with hydra.initialize(config_path="../../configs", job_name="job"): + # cfg = hydra.compose( + # config_name="run.yaml", + # overrides=[ + # f"model={model}", + # f"dataset={dataset}", + # "trainer.max_epochs=2", + # "trainer.min_epochs=1", + # "trainer.check_val_every_n_epoch=1", + # "paths=test", + # "callbacks=model_checkpoint", + # "dataset.dataloader_params.num_workers=0", + # "dataset.dataloader_params.persistent_workers=false", + # ], + # return_hydra_config=True + # ) + # run(cfg) - def test_pipeline_oc22_is2re(self): - """Test pipeline with OC22 IS2RE dataset.""" - dataset = "graph/OC22_IS2RE" - model = "graph/gcn" - - with hydra.initialize(config_path="../../configs", job_name="job"): - cfg = hydra.compose( - config_name="run.yaml", - overrides=[ - f"model={model}", - f"dataset={dataset}", - "trainer.max_epochs=2", - "trainer.min_epochs=1", - "trainer.check_val_every_n_epoch=1", - "paths=test", - "callbacks=model_checkpoint", - "dataset.dataloader_params.num_workers=0", - "dataset.dataloader_params.persistent_workers=false", - ], - return_hydra_config=True - ) - run(cfg) + # def test_pipeline_oc22_is2re(self): + # """Test pipeline with OC22 IS2RE dataset.""" + # dataset = "graph/OC22_IS2RE" + # model = "graph/gcn" + # + # with hydra.initialize(config_path="../../configs", job_name="job"): + # cfg = hydra.compose( + # config_name="run.yaml", + # overrides=[ + # f"model={model}", + # f"dataset={dataset}", + # "trainer.max_epochs=2", + # "trainer.min_epochs=1", + # "trainer.check_val_every_n_epoch=1", + # "paths=test", + # "callbacks=model_checkpoint", + # 
"dataset.dataloader_params.num_workers=0", + # "dataset.dataloader_params.persistent_workers=false", + # ], + # return_hydra_config=True + # ) + # run(cfg) def test_pipeline_oc20_s2ef(self): """Test pipeline with OC20 S2EF dataset.""" - dataset = "graph/OC20_S2EF_200K" + dataset = "graph/OC20_S2EF_200K_mock" model = "graph/gcn" with hydra.initialize(config_path="../../configs", job_name="job"): @@ -74,6 +75,8 @@ def test_pipeline_oc20_s2ef(self): "trainer.check_val_every_n_epoch=1", "paths=test", "callbacks=model_checkpoint", + "dataset.dataloader_params.num_workers=0", + "dataset.dataloader_params.persistent_workers=false", ], return_hydra_config=True ) diff --git a/topobench/data/loaders/graph/oc20_asedbs2ef_loader.py b/topobench/data/loaders/graph/oc20_asedbs2ef_loader.py index 957bd6e7a..fe8202df5 100644 --- a/topobench/data/loaders/graph/oc20_asedbs2ef_loader.py +++ b/topobench/data/loaders/graph/oc20_asedbs2ef_loader.py @@ -142,9 +142,13 @@ def __init__( logger.info(f"Limiting each split to {max_samples} samples") # When limiting, we need to: # 1. Truncate the split_idx lists - # 2. Update _num_samples to reflect the new total - # 3. Keep _db_ranges unchanged (they map indices to DB files) - # The split_idx values remain valid as indices into the full dataset + # 2. Create a mapping from limited indices to new contiguous indices + # 3. 
Update _num_samples to reflect the new total + # This ensures len(dataset) returns the correct value and prevents + # unnecessary iteration over the full dataset during preprocessing + + # Collect indices from all splits that we want to keep + all_limited_indices = [] for split_name in ("train", "valid", "test"): if self.split_idx[split_name]: original_len = len(self.split_idx[split_name]) @@ -152,13 +156,29 @@ def __init__( self.split_idx[split_name] = self.split_idx[split_name][ :new_len ] + all_limited_indices.extend(self.split_idx[split_name]) logger.info( f" {split_name}: {original_len} -> {new_len} samples" ) - # Important: Do NOT change _num_samples here. The dataset still contains - # all samples indexed by the original _db_ranges. The split_idx just - # selects which subset to use for each split. + # Create mapping from old indices to new contiguous indices + old_to_new_idx = { + old_idx: new_idx + for new_idx, old_idx in enumerate(sorted(set(all_limited_indices))) + } + + # Remap split_idx to use new contiguous indices + for split_name in ("train", "valid", "test"): + self.split_idx[split_name] = [ + old_to_new_idx[idx] for idx in self.split_idx[split_name] + ] + + # Store mapping for __getitem__ to translate back to original indices + self._index_mapping = sorted(set(all_limited_indices)) + + # Update _num_samples to the actual limited size + self._num_samples = len(self._index_mapping) + logger.info(f"Dataset length limited to {self._num_samples} samples") logger.info( f"Loaded {len(self.db_paths)} DB files with {self._num_samples} total structures" @@ -211,6 +231,14 @@ def __getitem__(self, idx: int) -> Data: Data PyTorch Geometric Data object. 
""" + # If we have an index mapping (from max_samples limiting), translate the index + if hasattr(self, '_index_mapping'): + if idx < 0 or idx >= len(self._index_mapping): + raise IndexError( + f"Index {idx} out of range [0, {len(self._index_mapping)})" + ) + idx = self._index_mapping[idx] + db_path, local_idx = self._get_db_and_idx(idx) with ase.db.connect(str(db_path)) as db: From 5b13f857d5558b2189c95405c12ca97d5ec99654 Mon Sep 17 00:00:00 2001 From: theosaulus Date: Tue, 25 Nov 2025 18:58:08 -0500 Subject: [PATCH 15/17] ruff --- .../loaders/graph/oc20_asedbs2ef_loader.py | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/topobench/data/loaders/graph/oc20_asedbs2ef_loader.py b/topobench/data/loaders/graph/oc20_asedbs2ef_loader.py index fe8202df5..3031f1ed4 100644 --- a/topobench/data/loaders/graph/oc20_asedbs2ef_loader.py +++ b/topobench/data/loaders/graph/oc20_asedbs2ef_loader.py @@ -146,7 +146,7 @@ def __init__( # 3. Update _num_samples to reflect the new total # This ensures len(dataset) returns the correct value and prevents # unnecessary iteration over the full dataset during preprocessing - + # Collect indices from all splits that we want to keep all_limited_indices = [] for split_name in ("train", "valid", "test"): @@ -163,22 +163,26 @@ def __init__( # Create mapping from old indices to new contiguous indices old_to_new_idx = { - old_idx: new_idx - for new_idx, old_idx in enumerate(sorted(set(all_limited_indices))) + old_idx: new_idx + for new_idx, old_idx in enumerate( + sorted(set(all_limited_indices)) + ) } - + # Remap split_idx to use new contiguous indices for split_name in ("train", "valid", "test"): self.split_idx[split_name] = [ old_to_new_idx[idx] for idx in self.split_idx[split_name] ] - + # Store mapping for __getitem__ to translate back to original indices self._index_mapping = sorted(set(all_limited_indices)) - + # Update _num_samples to the actual limited size self._num_samples = len(self._index_mapping) - 
logger.info(f"Dataset length limited to {self._num_samples} samples") + logger.info( + f"Dataset length limited to {self._num_samples} samples" + ) logger.info( f"Loaded {len(self.db_paths)} DB files with {self._num_samples} total structures" @@ -232,13 +236,13 @@ def __getitem__(self, idx: int) -> Data: PyTorch Geometric Data object. """ # If we have an index mapping (from max_samples limiting), translate the index - if hasattr(self, '_index_mapping'): + if hasattr(self, "_index_mapping"): if idx < 0 or idx >= len(self._index_mapping): raise IndexError( f"Index {idx} out of range [0, {len(self._index_mapping)})" ) idx = self._index_mapping[idx] - + db_path, local_idx = self._get_db_and_idx(idx) with ase.db.connect(str(db_path)) as db: From 48f5b65bbef125e928b34aa57cd9ab99ecf6cbc4 Mon Sep 17 00:00:00 2001 From: theosaulus Date: Tue, 25 Nov 2025 20:38:31 -0500 Subject: [PATCH 16/17] fixed tests cleanly --- topobench/data/datasets/oc20_dataset.py | 20 ++ .../loaders/graph/oc20_asedbs2ef_loader.py | 36 +++ topobench/data/preprocessor/preprocessor.py | 259 ++++-------------- topobench/data/utils/split_utils.py | 237 +++------------- topobench/model/model.py | 33 +-- 5 files changed, 155 insertions(+), 430 deletions(-) diff --git a/topobench/data/datasets/oc20_dataset.py b/topobench/data/datasets/oc20_dataset.py index b886616bf..aa19a37e1 100644 --- a/topobench/data/datasets/oc20_dataset.py +++ b/topobench/data/datasets/oc20_dataset.py @@ -349,6 +349,26 @@ def num_node_features(self) -> int: # Will be determined by the actual data return 1 # Atomic numbers + @property + def data(self): + """Get combined data view for compatibility with InMemoryDataset API. + + Returns a Data object with x and y attributes representing stacked + features from a sample of the dataset. 
+ """ + if not hasattr(self, "_data_cache"): + # Get a sample to determine feature dimensions + if len(self) > 0: + sample = self.get(0) + # Create a mock data object with minimal info for compatibility + self._data_cache = Data( + x=sample.x if hasattr(sample, "x") else torch.zeros(1, 1), + y=sample.y if hasattr(sample, "y") else torch.zeros(1), + ) + else: + self._data_cache = Data(x=torch.zeros(1, 1), y=torch.zeros(1)) + return self._data_cache + @property def num_classes(self) -> int: """Number of classes (regression task).""" diff --git a/topobench/data/loaders/graph/oc20_asedbs2ef_loader.py b/topobench/data/loaders/graph/oc20_asedbs2ef_loader.py index 3031f1ed4..e343ca4a6 100644 --- a/topobench/data/loaders/graph/oc20_asedbs2ef_loader.py +++ b/topobench/data/loaders/graph/oc20_asedbs2ef_loader.py @@ -198,6 +198,42 @@ def __len__(self) -> int: """ return self._num_samples + @property + def num_node_features(self) -> int: + """Number of node features per atom. + + Returns + ------- + int + Number of node features (atomic numbers by default). + """ + return 1 # Atomic numbers + + @property + def data(self): + """Get combined data view for compatibility with InMemoryDataset API. + + Returns a Data object with x and y attributes representing stacked + features from a sample of the dataset. + + Returns + ------- + Data + A Data object with x and y attributes for API compatibility. + """ + if not hasattr(self, "_data_cache"): + # Get a sample to determine feature dimensions + if len(self) > 0: + sample = self[0] + # Create a mock data object with minimal info for compatibility + self._data_cache = Data( + x=sample.x if hasattr(sample, "x") else torch.zeros(1, 1), + y=sample.y if hasattr(sample, "y") else torch.zeros(1), + ) + else: + self._data_cache = Data(x=torch.zeros(1, 1), y=torch.zeros(1)) + return self._data_cache + def _get_db_and_idx(self, idx: int) -> tuple[Path, int]: """Get DB path and local index for global index. 
diff --git a/topobench/data/preprocessor/preprocessor.py b/topobench/data/preprocessor/preprocessor.py index b8bc9993f..f382c7ddf 100644 --- a/topobench/data/preprocessor/preprocessor.py +++ b/topobench/data/preprocessor/preprocessor.py @@ -34,10 +34,6 @@ class PreProcessor(torch_geometric.data.InMemoryDataset): def __init__(self, dataset, data_dir, transforms_config=None, **kwargs): self.dataset = dataset - self._skip_processing = ( - False # Flag to skip processing for no-transform case - ) - if transforms_config is not None: self.transforms_applied = True pre_transform = self.instantiate_pre_transform( @@ -53,32 +49,20 @@ def __init__(self, dataset, data_dir, transforms_config=None, **kwargs): self.load(self.processed_paths[0]) self.data_list = [data for data in self] else: - print( - "No transforms to apply, using dataset directly (skipping processing)..." - ) self.transforms_applied = False - self._skip_processing = True # Skip parent class processing - - # Call parent init but it should skip processing super().__init__(data_dir, None, None, **kwargs) - self.transform = ( dataset.transform if hasattr(dataset, "transform") else None ) - # Check if dataset is an InMemoryDataset with _data and slices + # Handle datasets that don't have _data/slices (e.g., LMDB-based datasets like OC20) if hasattr(dataset, "_data") and hasattr(dataset, "slices"): - # Directly use the dataset's data and slices - self._data, self.slices = dataset._data, dataset.slices - # Make data_list creation lazy to avoid loading large datasets into memory - self._data_list = None - self._is_inmemory = True + self.data, self.slices = dataset._data, dataset.slices + self.data_list = [data for data in dataset] else: - # For non-InMemoryDataset (like LMDB-based datasets), we can't use _data/slices - # The dataset will be accessed directly via indexing + # For non-InMemoryDataset, store the dataset directly self._data = None self.slices = None - self._data_list = None - self._is_inmemory = False + 
self.data_list = [data for data in dataset] # Some datasets have fixed splits, and those are stored as split_idx during loading # We need to store this information to be able to reproduce the splits afterwards @@ -86,34 +70,20 @@ def __init__(self, dataset, data_dir, transforms_config=None, **kwargs): self.split_idx = dataset.split_idx @property - def data_list(self): - """Lazy loading of data_list to avoid loading large datasets into memory. + def processed_dir(self) -> str: + """Return the path to the processed directory. Returns ------- - list - List of data objects when transforms are not applied; otherwise the processed data list. - """ - if not self.transforms_applied and self._data_list is None: - # Only create data_list when actually needed - print( - "Warning: Creating data_list from large dataset - this may take a while..." - ) - self._data_list = [data for data in self.dataset] - return self._data_list - - @data_list.setter - def data_list(self, value): - """Setter for data_list. - - Parameters - ---------- - value : list - New list of data objects to use as the dataset's data_list. + str + Path to the processed directory. """ - self._data_list = value + if not self.transforms_applied: + return self.root + else: + return self.root + "/processed" - def __len__(self) -> int: + def len(self) -> int: """Return the number of samples in the dataset. Returns @@ -121,59 +91,43 @@ def __len__(self) -> int: int Number of samples. 
""" - if not self.transforms_applied and not self._is_inmemory: - # For non-InMemoryDataset, delegate to the wrapped dataset + # Only use data_list for length when transforms are NOT applied + # and dataset doesn't have standard InMemoryDataset structure + if not self.transforms_applied and self.slices is None: + if hasattr(self, "data_list") and self.data_list is not None: + return len(self.data_list) + # Fallback to dataset length during initialization return len(self.dataset) - else: - # For InMemoryDataset or transformed data, use parent implementation - return super().__len__() + return super().len() - def __getitem__(self, idx): - """Get item at index. + def get(self, idx: int): + """Get data object at index. Parameters ---------- - idx : int or slice - Index or slice to retrieve. - - Returns - ------- - Data or list[Data] - Data object(s) at the given index/slice. - """ - if not self.transforms_applied and not self._is_inmemory: - # For non-InMemoryDataset, delegate to the wrapped dataset - return self.dataset[idx] - else: - # For InMemoryDataset or transformed data, use parent implementation - return super().__getitem__(idx) - - @property - def processed_dir(self) -> str: - """Return the path to the processed directory. + idx : int + Index of the data object. Returns ------- - str - Path to the processed directory. + Data + Data object at the given index. """ - if not self.transforms_applied: - return self.root - else: - return self.root + "/processed" + # Only use data_list access when transforms are NOT applied + # and dataset doesn't have standard InMemoryDataset structure + if not self.transforms_applied and self.slices is None: + return self.data_list[idx] + return super().get(idx) @property - def processed_file_names(self) -> str | list[str]: + def processed_file_names(self) -> str: """Return the name of the processed file. Returns ------- - str | list[str] - Name of the processed file, or empty list to skip processing. 
+ str + Name of the processed file. """ - # If no transforms, return empty list to skip processing check - if hasattr(self, "_skip_processing") and self._skip_processing: - return [] return "data.pt" def instantiate_pre_transform( @@ -238,11 +192,6 @@ def set_processed_data_dir( transform_name: transform.parameters for transform_name, transform in pre_transforms_dict.items() } - - # Include dataset size in hash to avoid reusing cached data from different dataset sizes - # This is crucial when max_samples changes - transforms_parameters["_dataset_size"] = len(self.dataset) - params_hash = make_hash(transforms_parameters) self.transforms_parameters = ensure_serializable(transforms_parameters) self.processed_data_dir = os.path.join( @@ -273,130 +222,26 @@ def save_transform_parameters(self) -> None: ) def process(self) -> None: - """Method that processes the data. - - Returns - ------- - None - Writes processed data to disk as a side effect. - """ - # Skip processing if no transforms are applied - if hasattr(self, "_skip_processing") and self._skip_processing: - return - - from tqdm import tqdm - - # If dataset has split_idx, only process those samples (for efficiency with large datasets) - if hasattr(self.dataset, "split_idx") and self.dataset.split_idx: - # Collect all unique indices from all splits - all_indices = [] - for split_name in ["train", "valid", "test"]: - if split_name in self.dataset.split_idx: - indices = self.dataset.split_idx[split_name] - # Convert tensor to list if needed - if hasattr(indices, "tolist"): - indices = indices.tolist() - elif hasattr(indices, "__iter__"): - indices = list(indices) - all_indices.extend(indices) - # Remove duplicates and sort - all_indices = sorted(set(all_indices)) - - print( - f"Processing dataset with {len(all_indices)} samples (from split_idx)..." 
- ) - - # Load only the samples specified in split_idx - if len(all_indices) > 1000: - print( - f"Loading {len(all_indices)} graphs from split indices (this may take a while)..." - ) - data_list = [ - self.dataset[idx] - for idx in tqdm(all_indices, desc="Loading graphs") - ] - else: - data_list = [self.dataset[idx] for idx in all_indices] - else: - # No split_idx, process all samples - print(f"Processing dataset with {len(self.dataset)} samples...") - - if isinstance( - self.dataset, - (torch_geometric.data.Dataset, torch.utils.data.Dataset), - ): - # Use tqdm to show progress for large datasets - if len(self.dataset) > 1000: - print( - f"Loading {len(self.dataset)} graphs (this may take a while)..." - ) - data_list = [ - data - for data in tqdm(self.dataset, desc="Loading graphs") - ] - else: - data_list = [data for data in self.dataset] - elif isinstance(self.dataset, torch_geometric.data.Data): - data_list = [self.dataset] - - if self.pre_transform is not None: - print(f"Applying transforms to {len(data_list)} graphs...") - transformed_data_list = [ - self.pre_transform(d) - for d in tqdm(data_list, desc="Applying transforms") - ] - else: - transformed_data_list = data_list - - print("Collating data...") - self._data, self.slices = self.collate(transformed_data_list) + """Method that processes the data.""" + if isinstance( + self.dataset, + (torch_geometric.data.Dataset, torch.utils.data.Dataset), + ): + data_list = [data for data in self.dataset] + elif isinstance(self.dataset, torch_geometric.data.Data): + data_list = [self.dataset] + + self.data_list = ( + [self.pre_transform(d) for d in data_list] + if self.pre_transform is not None + else data_list + ) - # If we processed only samples from split_idx, remap split_idx to new indices - if hasattr(self.dataset, "split_idx") and self.dataset.split_idx: - print("Remapping split_idx to new indices after processing...") - # Create mapping from old indices to new indices - old_to_new = { - old_idx: new_idx for 
new_idx, old_idx in enumerate(all_indices) - } - - # Remap split_idx - new_split_idx = {} - for split_name in ["train", "valid", "test"]: - if split_name in self.dataset.split_idx: - old_indices = self.dataset.split_idx[split_name] - # Convert tensor to list if needed - if hasattr(old_indices, "tolist"): - old_indices = old_indices.tolist() - elif hasattr(old_indices, "__iter__"): - old_indices = list(old_indices) - else: - old_indices = [old_indices] - - # Map old indices to new indices - new_indices = [ - old_to_new[idx] - for idx in old_indices - if idx in old_to_new - ] - new_split_idx[split_name] = new_indices - - # Store the remapped split_idx on the dataset itself - self.dataset.split_idx = new_split_idx - print( - f"Remapped split_idx: train={len(new_split_idx.get('train', []))}, " - f"valid={len(new_split_idx.get('valid', []))}, test={len(new_split_idx.get('test', []))}" - ) + self._data, self.slices = self.collate(self.data_list) + self._data_list = None # Reset cache. assert isinstance(self._data, torch_geometric.data.Data) - # Guard against empty processed_paths - if self.processed_paths and len(self.processed_paths) > 0: - print(f"Saving processed data to {self.processed_paths[0]}...") - self.save(transformed_data_list, self.processed_paths[0]) - else: - print("Warning: No processed paths available, skipping save.") - - # Reset cache after saving - self._data_list = None + self.save(self.data_list, self.processed_paths[0]) def load(self, path: str) -> None: r"""Load the dataset from the file path `path`. 
diff --git a/topobench/data/utils/split_utils.py b/topobench/data/utils/split_utils.py index 82f55d305..f78994222 100644 --- a/topobench/data/utils/split_utils.py +++ b/topobench/data/utils/split_utils.py @@ -5,64 +5,10 @@ import numpy as np import torch from sklearn.model_selection import StratifiedKFold -from torch.utils.data import Subset -from tqdm import tqdm from topobench.dataloader import DataloadDataset -class DatasetWrapper: - """Wrapper that converts dataset items to (values, keys) format. - - This makes any dataset (including Subset) compatible with TopoBench's - custom collate function which expects (values, keys) tuples instead of Data objects. - - Parameters - ---------- - dataset : torch_geometric.data.Dataset or torch.utils.data.Subset - The underlying dataset. - """ - - def __init__(self, dataset): - """Initialize wrapper with dataset.""" - self.dataset = dataset - - def __len__(self): - """Return length of wrapped dataset.""" - return len(self.dataset) - - def __getitem__(self, idx): - """Get item at index in (values, keys) format. - - Parameters - ---------- - idx : int - Index of the data object to get. - - Returns - ------- - tuple - Tuple containing a list of all the values for the data and the corresponding keys. - """ - # Get the data object from the wrapped dataset - data = self.dataset[idx] - # Convert to (values, keys) format expected by collate_fn - if hasattr(data, "keys"): - keys = list(data.keys()) - return ([data[key] for key in keys], keys) - else: - # Fallback for non-Data objects - return data - - def __getstate__(self): - """Return state for pickling (multiprocessing compatibility).""" - return {"dataset": self.dataset} - - def __setstate__(self, state): - """Restore state from unpickling (multiprocessing compatibility).""" - self.dataset = state["dataset"] - - # Generate splits in different fasions def k_fold_split(labels, parameters, root=None): """Return train and valid indices as in K-Fold Cross-Validation. 
@@ -97,9 +43,7 @@ def k_fold_split(labels, parameters, root=None): torch.manual_seed(0) np.random.seed(0) - # Include dataset size in split directory to avoid reusing splits from different dataset sizes - n = len(labels) - split_dir = os.path.join(data_dir, f"{k}-fold_n={n}") + split_dir = os.path.join(data_dir, f"{k}-fold") if not os.path.isdir(split_dir): os.makedirs(split_dir) @@ -183,13 +127,9 @@ def random_splitting(labels, parameters, root=None, global_data_seed=42): train_prop = parameters["train_prop"] valid_prop = (1 - train_prop) / 2 - # Include dataset size in split directory to avoid reusing splits from different dataset sizes - n = len(labels) - # Create split directory if it does not exist split_dir = os.path.join( - data_dir, - f"train_prop={train_prop}_global_seed={global_data_seed}_n={n}", + data_dir, f"train_prop={train_prop}_global_seed={global_data_seed}" ) generate_splits = False if not os.path.isdir(split_dir): @@ -240,54 +180,6 @@ def random_splitting(labels, parameters, root=None, global_data_seed=42): return split_idx -def create_subset_splits(dataset, split_idx): - """Create dataset splits using PyTorch Subset (optimized for large datasets). - - This avoids loading all graphs into memory by using lazy indexing. - - Parameters - ---------- - dataset : torch_geometric.data.Dataset - Considered dataset. - split_idx : dict - Dictionary containing the train, validation, and test indices. - - Returns - ------- - tuple: - Tuple containing the train, validation, and test datasets as Subsets. 
- """ - # Convert numpy arrays to lists if needed - train_indices = ( - split_idx["train"].tolist() - if hasattr(split_idx["train"], "tolist") - else list(split_idx["train"]) - ) - valid_indices = ( - split_idx["valid"].tolist() - if hasattr(split_idx["valid"], "tolist") - else list(split_idx["valid"]) - ) - test_indices = ( - split_idx["test"].tolist() - if hasattr(split_idx["test"], "tolist") - else list(split_idx["test"]) - ) - - print( - f"Creating subsets: train={len(train_indices)}, val={len(valid_indices)}, test={len(test_indices)}" - ) - - # Create subsets using lazy indexing - # Wrap subsets with DatasetWrapper to make them compatible with TopoBench's collate_fn - # Always create a Subset even if indices are empty, to maintain consistent API - train_dataset = DatasetWrapper(Subset(dataset, train_indices)) - val_dataset = DatasetWrapper(Subset(dataset, valid_indices)) - test_dataset = DatasetWrapper(Subset(dataset, test_indices)) - - return train_dataset, val_dataset, test_dataset - - def assign_train_val_test_mask_to_graphs(dataset, split_idx): """Split the graph dataset into train, validation, and test datasets. 
@@ -307,58 +199,22 @@ def assign_train_val_test_mask_to_graphs(dataset, split_idx): data_train_lst, data_val_lst, data_test_lst = [], [], [] # Assign masks directly by iterating over pre-split indices - print(f"Creating train split with {len(split_idx['train'])} samples...") - for i in tqdm( - split_idx["train"], desc="Loading train graphs", leave=False - ): - # Convert tensor index to Python int if needed - idx = ( - i.item() - if isinstance(i, torch.Tensor) and i.dim() == 0 - else int(i) - ) - graph = dataset[idx] - # Clone if possible to avoid modifying original data - if hasattr(graph, "clone"): - graph = graph.clone() + for i in split_idx["train"]: + graph = dataset[i] graph.train_mask = torch.tensor([1], dtype=torch.long) graph.val_mask = torch.tensor([0], dtype=torch.long) graph.test_mask = torch.tensor([0], dtype=torch.long) data_train_lst.append(graph) - print( - f"Creating validation split with {len(split_idx['valid'])} samples..." - ) - for i in tqdm( - split_idx["valid"], desc="Loading validation graphs", leave=False - ): - # Convert tensor index to Python int if needed - idx = ( - i.item() - if isinstance(i, torch.Tensor) and i.dim() == 0 - else int(i) - ) - graph = dataset[idx] - # Clone if possible to avoid modifying original data - if hasattr(graph, "clone"): - graph = graph.clone() + for i in split_idx["valid"]: + graph = dataset[i] graph.train_mask = torch.tensor([0], dtype=torch.long) graph.val_mask = torch.tensor([1], dtype=torch.long) graph.test_mask = torch.tensor([0], dtype=torch.long) data_val_lst.append(graph) - print(f"Creating test split with {len(split_idx['test'])} samples...") - for i in tqdm(split_idx["test"], desc="Loading test graphs", leave=False): - # Convert tensor index to Python int if needed - idx = ( - i.item() - if isinstance(i, torch.Tensor) and i.dim() == 0 - else int(i) - ) - graph = dataset[idx] - # Clone if possible to avoid modifying original data - if hasattr(graph, "clone"): - graph = graph.clone() + for i in 
split_idx["test"]: + graph = dataset[i] graph.train_mask = torch.tensor([0], dtype=torch.long) graph.val_mask = torch.tensor([0], dtype=torch.long) graph.test_mask = torch.tensor([1], dtype=torch.long) @@ -450,6 +306,15 @@ def load_inductive_splits(dataset, parameters): assert len(dataset) > 1, ( "Datasets should have more than one graph in an inductive setting." ) + # Check if labels are ragged (different sizes across graphs) + label_list = [data.y.squeeze(0).numpy() for data in dataset] + label_shapes = [label.shape for label in label_list] + # Use dtype=object only if labels have different shapes (ragged) + labels = ( + np.array(label_list, dtype=object) + if len(set(label_shapes)) > 1 + else np.array(label_list) + ) root = ( dataset.dataset.get_data_dir() @@ -457,64 +322,28 @@ def load_inductive_splits(dataset, parameters): else None ) - # Check if we have fixed splits first (avoid loading all data) - if parameters.split_type == "fixed" and hasattr(dataset, "split_idx"): - print( - f"Using pre-computed fixed splits (train: {len(dataset.split_idx['train'])}, " - f"val: {len(dataset.split_idx['valid'])}, test: {len(dataset.split_idx['test'])})" + if parameters.split_type == "random": + split_idx = random_splitting(labels, parameters, root=root) + + elif parameters.split_type == "k-fold": + assert type(labels) is not object, ( + "K-Fold splitting not supported for ragged labels." ) + split_idx = k_fold_split(labels, parameters, root=root) + + elif parameters.split_type == "fixed" and hasattr(dataset, "split_idx"): split_idx = dataset.split_idx - else: - # For random/k-fold splits, we need to extract labels - # Check if labels are ragged (different sizes across graphs) - print( - f"Extracting labels from {len(dataset)} graphs for split creation..." 
- ) - label_list = [ - data.y.squeeze(0).numpy() - for data in tqdm(dataset, desc="Extracting labels") - ] - label_shapes = [label.shape for label in label_list] - # Use dtype=object only if labels have different shapes (ragged) - labels = ( - np.array(label_list, dtype=object) - if len(set(label_shapes)) > 1 - else np.array(label_list) - ) - if parameters.split_type == "random": - split_idx = random_splitting(labels, parameters, root=root) - - elif parameters.split_type == "k-fold": - assert type(labels) is not object, ( - "K-Fold splitting not supported for ragged labels." - ) - split_idx = k_fold_split(labels, parameters, root=root) - - else: - raise NotImplementedError( - f"split_type {parameters.split_type} not valid. Choose either 'random', 'k-fold' or 'fixed'.\ - If 'fixed' is chosen, the dataset should have the attribute split_idx" - ) - - # Use optimized subset-based splitting for large datasets OR when using fixed splits - # This avoids loading all graphs into memory at once for large datasets - # For fixed splits, subset-based splitting preserves the original dataset indices - use_subset_split = len(dataset) > 10000 or parameters.split_type == "fixed" - - if use_subset_split: - if len(dataset) > 10000: - print( - f"Using optimized subset-based splitting for large dataset ({len(dataset)} graphs)" - ) - train_dataset, val_dataset, test_dataset = create_subset_splits( - dataset, split_idx - ) else: - train_dataset, val_dataset, test_dataset = ( - assign_train_val_test_mask_to_graphs(dataset, split_idx) + raise NotImplementedError( + f"split_type {parameters.split_type} not valid. 
Choose either 'random', 'k-fold' or 'fixed'.\ + If 'fixed' is chosen, the dataset should have the attribute split_idx" ) + train_dataset, val_dataset, test_dataset = ( + assign_train_val_test_mask_to_graphs(dataset, split_idx) + ) + return train_dataset, val_dataset, test_dataset diff --git a/topobench/model/model.py b/topobench/model/model.py index 97229c569..a7c688b47 100755 --- a/topobench/model/model.py +++ b/topobench/model/model.py @@ -232,26 +232,21 @@ def process_outputs(self, model_out: dict, batch: Data) -> dict: dict Dictionary containing the updated model output. """ - # Get the correct mask (only for node-level tasks or when masks exist) - # For graph-level tasks with explicit splits (e.g., OC20 S2EF), masks don't exist - mask = None + # Get the correct mask + if self.state_str == "Training": + mask = batch.train_mask + elif self.state_str == "Validation": + mask = batch.val_mask + elif self.state_str == "Test": + mask = batch.test_mask + else: + raise ValueError("Invalid state_str") + if self.task_level == "node": - if self.state_str == "Training": - mask = ( - batch.train_mask if hasattr(batch, "train_mask") else None - ) - elif self.state_str == "Validation": - mask = batch.val_mask if hasattr(batch, "val_mask") else None - elif self.state_str == "Test": - mask = batch.test_mask if hasattr(batch, "test_mask") else None - else: - raise ValueError("Invalid state_str") - - # Keep only relevant data points if mask exists - if mask is not None: - for key, val in model_out.items(): - if key in ["logits", "labels"]: - model_out[key] = val[mask] + # Keep only train data points + for key, val in model_out.items(): + if key in ["logits", "labels"]: + model_out[key] = val[mask] return model_out From 6dde1119318a7c99efc227e4c56ede77decb1422 Mon Sep 17 00:00:00 2001 From: theosaulus Date: Tue, 25 Nov 2025 21:14:32 -0500 Subject: [PATCH 17/17] ase package --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 
3234ea9e6..9dccf8a4a 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,7 @@ dependencies=[ "topomodelx @ git+https://github.com/pyt-team/TopoModelX.git", "toponetx @ git+https://github.com/pyt-team/TopoNetX.git", "lightning==2.4.0", + "ase", # Required for OC20/OC22 S2EF dataset tests ] [project.optional-dependencies]