geometric-intelligence · grapentt · Nov 18, 2025 · Nov 18, 2025 · Nov 18, 2025 · Nov 18, 2025
diff --git a/configs/dataset/simplicial/ppi_highppi.yaml b/configs/dataset/simplicial/ppi_highppi.yaml
@@ -0,0 +1,98 @@
+################################################################################
+# HIGH-PPI + CORUM: Protein Interaction Prediction via Simplicial Complexes
+################################################################################
+#
+# Data Structure:
+#  - Proteins (rank 0): ~1,553 proteins
+#  - Edges (rank 1): ~6,660 HIGH-PPI edges with:
+#    * Features: 8-dim (7 interaction types + 1 STRING confidence score)
+#      - Interaction types: reaction, binding, ptmod, activation, inhibition, catalysis, expression
+#      - Confidence score [0, 1] measuring interaction probability (mapped to [-1, 1])
+#  - Higher-order cells: CORUM protein complexes (2+ proteins)
+#    * Features: 1-dim (binary existence: 1=real, -1=fake)
+#
+# Note: Features at any rank can also serve as prediction targets (labels).
+#       Models should mask features of the rank being predicted to avoid data leakage.
+#
+# Prediction Tasks:
+#  - Edge (rank 1): Regression (confidence scores) or multi-label (interaction types)
+#  - Cell (ranks 2+): Binary classification (complex existence)
+#
+################################################################################
+
+# Data loading configuration
+loader:
+  _target_: topobench.data.loaders.PPIHighPPIDatasetLoader
+  parameters:
+    data_domain: simplicial
+    model_domain: simplicial        
+    data_name: ppi_highppi
+    data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}
+
+    # CORUM Complex Configuration
+    min_complex_size: 2             # Minimum proteins per CORUM complex (2+ allowed)
+                                    # Features assigned to edges that appear in CORUM:
+                                    #   - Edge also in HIGH-PPI: HIGH-PPI interaction types, with confidence boosted to 1.0
+                                    #   - Edge not in HIGH-PPI: [0,0,0,0,0,0,0, 1.0] (unknown types, high confidence)
+    max_complex_size: 6             # Maximum proteins per CORUM complex
+
+    # Negative Sampling (for classification tasks)
+    neg_ratio: 1.0                  # Ratio of negative to positive samples (1.0 = balanced)
+
+    # Multi-Rank Prediction
+    target_ranks: [2, 3, 4, 5]      # Which ranks to predict (train/test on)
+                                    # Max target rank must be <= max_complex_size - 1 (k proteins form a rank k-1 simplex)
+
+    # Edge Task Type (only applied when rank 1 in target_ranks)
+    edge_task: score                # "score": Regression - predict the interaction confidence score in [0, 1]
+                                    # "interaction_type": Multi-label - predict 7 interaction types
+
+# Model training configuration
+parameters:
+  _num_proteins: 1553  # HIGH-PPI has 1,553 proteins
+
+  num_features: ${infer_ppi_num_features:${dataset.parameters._num_proteins},${dataset.loader.parameters.edge_task},${dataset.loader.parameters.max_complex_size}}
+
+  num_classes: 2                    # Depends on task:
+                                    # - Higher-order (ranks 2+): 2 (exists/doesn't exist)
+                                    # - Edge regression (rank 1, score): 1 (continuous output)
+                                    # - Edge multi-label (rank 1, interaction_type): 7 (7 types)
+  task: classification              # Depends on target_ranks and edge_task:
+                                    # - Higher-order (ranks 2+): classification
+                                    # - Edge regression (rank 1, score): regression
+                                    # - Edge multi-label (rank 1, interaction_type): classification
+  task_level: cell                  # Predict on cells (edges/triangles/etc), not nodes or graphs
+
+  # Multi-Rank Prediction 
+  target_ranks: ${dataset.loader.parameters.target_ranks}
+
+  loss_type: cross_entropy          # Depends on task:
+                                    # - Higher-order binary: cross_entropy
+                                    # - Edge regression: mse or mae
+                                    # - Edge multi-label: bce_with_logits
+  monitor_metric: auroc             # Depends on task:
+                                    # - Higher-order binary: auroc, f1, accuracy
+                                    # - Edge regression: mae, rmse
+                                    # - Edge multi-label: f1, auroc
+
+# Splits Configuration
+split_params:
+  learning_setting: transductive    # One simplicial complex; its labeled cells are split into train/val/test
+  data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}
+  data_seed: 42                     # Random seed for reproducible splits
+
+  # Split Type Options:
+  # - "random": Random splitting with train_prop ratio
+  # - "k-fold": K-fold cross-validation
+  # - "fixed": Use HIGH-PPI's official train/val split (if available in raw data)
+  split_type: random
+
+  train_prop: 0.8                   # For random/k-fold: 80% train; the remaining 20% is split evenly (10% val, 10% test)
+                                    # Ignored when split_type: fixed
+
+# Dataloader
+dataloader_params:
+  batch_size: 1
+  num_workers: 0
+  pin_memory: False
+  persistent_workers: False
diff --git a/configs/model/simplicial/sccnn_cell.yaml b/configs/model/simplicial/sccnn_cell.yaml
@@ -0,0 +1,45 @@
+_target_: topobench.model.TBModel
+
+model_name: sccnn_cell
+model_domain: simplicial
+
+_hidden_dim: 32  # Hidden dimension for all ranks
+
+_in_channels: ${infer_in_channels:${dataset},${oc.select:transforms,null}}
+_num_ranks: ${infer_num_cell_dimensions:null,${model._in_channels}}
+_channel_list: ${infer_channel_list:${model._hidden_dim},${model._num_ranks}}
+
+feature_encoder:
+  _target_: topobench.nn.encoders.AllCellFeatureEncoder
+  encoder_name: AllCellFeatureEncoder
+  in_channels: ${model._in_channels}
+  out_channels: ${model._hidden_dim}
+  proj_dropout: 0.0
+
+backbone:
+  _target_: topobench.nn.backbones.simplicial.sccnn.SCCNNCustom
+  in_channels_all: ${model._channel_list}
+  hidden_channels_all: ${model._channel_list}
+  conv_order: 1
+  sc_order: ${model._num_ranks}
+  aggr_norm: false
+  update_func: sigmoid
+  n_layers: 2
+
+backbone_wrapper:
+  _target_: topobench.nn.wrappers.SCCNNCellWrapper
+  _partial_: true
+  wrapper_name: SCCNNCellWrapper
+  num_cell_dimensions: ${model._num_ranks}
+  target_ranks: ${dataset.parameters.target_ranks}
+  out_channels: ${model._hidden_dim}
+
+readout:
+  _target_: topobench.nn.readouts.LinearCellLevelReadout
+  hidden_dim: ${model._hidden_dim}
+  out_channels: ${dataset.parameters.num_classes}
+  num_cell_dimensions: ${model._num_ranks}
+  target_ranks: ${dataset.parameters.target_ranks}
+
+# Compile model for faster training (requires PyTorch 2.0+)
+compile: false
diff --git a/pyproject.toml b/pyproject.toml
@@ -54,6 +54,8 @@ dependencies=[
     "topomodelx @ git+https://github.com/pyt-team/TopoModelX.git",
     "toponetx @ git+https://github.com/pyt-team/TopoNetX.git",
     "lightning==2.4.0",
+    "gdown",
+    "pybiomart",
 ]
 
 [project.optional-dependencies]

diff --git a/test/data/load/test_datasetloaders.py b/test/data/load/test_datasetloaders.py
@@ -1,12 +1,9 @@
 """Comprehensive test suite for all dataset loaders."""
-import os
 import pytest
-import torch
 import hydra
 from pathlib import Path
 from typing import List, Tuple, Dict, Any
-from omegaconf import DictConfig
-from topobench.data.preprocessor.preprocessor import PreProcessor
+
 class TestLoaders:
     """Comprehensive test suite for all dataset loaders."""
 

diff --git a/test/nn/backbones/simplicial/test_sccnn.py b/test/nn/backbones/simplicial/test_sccnn.py
@@ -94,9 +94,8 @@ def test_sccnn_basic_initialization():
 
     # Verify layer structure
     assert len(model.layers) == 2  # Default n_layers is 2
-    assert hasattr(model, 'in_linear_0')
-    assert hasattr(model, 'in_linear_1')
-    assert hasattr(model, 'in_linear_2')
+    assert hasattr(model, 'in_linears')
+    assert len(model.in_linears) == 3  # Should have 3 input linear layers for ranks 0, 1, 2
 
 def test_update_functions():
     """Test different update functions in the SCCNN."""

diff --git a/test/pipeline/test_pipeline.py b/test/pipeline/test_pipeline.py
@@ -7,6 +7,10 @@
 DATASET = "graph/MUTAG"                                                 # ADD YOUR DATASET HERE
 MODELS   = ["graph/gcn", "cell/topotune", "simplicial/topotune"]        # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE
 
+# HIGH-PPI B2 integration (optional - uncomment to test)
+# DATASET = "simplicial/ppi_highppi"
+# MODELS = ["simplicial/sccnn_cell"]
+
 
 class TestPipeline:
     """Test pipeline for a particular dataset and model."""

diff --git a/test/test_tutorials.py b/test/test_tutorials.py
@@ -1,6 +1,7 @@
 """Unit tests for the tutorials."""
 
 import glob
+import os
 import subprocess
 import tempfile
 
@@ -28,7 +29,16 @@ def _exec_tutorial(path):
         file_name,
         path,
     ]
-    subprocess.check_call(args)
+
+    # Set PYTHONPATH to include the project root so notebooks can import topobench
+    env = os.environ.copy()
+    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    if 'PYTHONPATH' in env:
+        env['PYTHONPATH'] = f"{project_root}:{env['PYTHONPATH']}"
+    else:
+        env['PYTHONPATH'] = project_root
+
+    subprocess.check_call(args, env=env)
 
 
 paths = sorted(glob.glob("tutorials/*.ipynb"))