Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions configs/dataset/simplicial/ppi_highppi.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
################################################################################
# HIGH-PPI + CORUM: Protein Interaction Prediction via Simplicial Complexes
################################################################################
#
# Data Structure:
# - Proteins (rank 0): ~1,553 proteins
# - Edges (rank 1): ~6,660 HIGH-PPI edges with:
# * Features: 8-dim (7 interaction types + 1 STRING confidence score)
# - Interaction types: reaction, binding, ptmod, activation, inhibition, catalysis, expression
# - Confidence score [0, 1] measuring interaction probability (mapped to [-1, 1])
# - Higher-order cells: CORUM protein complexes (2+ proteins)
# * Features: 1-dim (binary existence: 1=real, -1=fake)
#
# Note: Features at any rank can also serve as prediction targets (labels).
# Models should mask features of the rank being predicted to avoid data leakage.
#
# Prediction Tasks:
# - Edge (rank 1): Regression (confidence scores) or multi-label (interaction types)
# - Cell (ranks 2+): Binary classification (complex existence)
#
################################################################################

# Data loading configuration
loader:
_target_: topobench.data.loaders.PPIHighPPIDatasetLoader
parameters:
data_domain: simplicial
model_domain: simplicial
data_name: ppi_highppi
data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}

# CORUM Complex Configuration
min_complex_size: 2 # Minimum proteins per CORUM complex (2+ allowed)
# Edge features for edges that appear in CORUM:
# - Also in HIGH-PPI: keep the HIGH-PPI interaction types, with confidence boosted to 1.0
# - Not in HIGH-PPI: [0,0,0,0,0,0,0, 1.0] (unknown types, high confidence)
max_complex_size: 6 # Maximum proteins per CORUM complex

# Negative Sampling (for classification tasks)
neg_ratio: 1.0 # Ratio of negative to positive samples (1.0 = balanced)

# Multi-Rank Prediction
target_ranks: [2, 3, 4, 5] # Which ranks to predict (train/test on)
# Max target rank must be <= max_complex_size - 1, since a complex of k proteins
# is a cell of rank k - 1 (here: 6 - 1 = 5)

# Edge Task Type (only applied when rank 1 in target_ranks)
edge_task: score # "score": Regression - predict the interaction confidence score in [0, 1]
# "interaction_type": Multi-label - predict 7 interaction types

# Model training configuration
parameters:
_num_proteins: 1553 # HIGH-PPI has 1,553 proteins

num_features: ${infer_ppi_num_features:${dataset.parameters._num_proteins},${dataset.loader.parameters.edge_task},${dataset.loader.parameters.max_complex_size}}

num_classes: 2 # Depends on task:
# - Higher-order (ranks 2+): 2 (exists/doesn't exist)
# - Edge regression (rank 1, score): 1 (continuous output)
# - Edge multi-label (rank 1, interaction_type): 7 (7 types)
task: classification # Depends on target_ranks and edge_task:
# - Higher-order (ranks 2+): classification
# - Edge regression (rank 1, score): regression
# - Edge multi-label (rank 1, interaction_type): classification
task_level: cell # Predict on cells (edges/triangles/etc), not nodes or graphs

# Multi-Rank Prediction
target_ranks: ${dataset.loader.parameters.target_ranks}

loss_type: cross_entropy # Depends on task:
# - Higher-order binary: cross_entropy
# - Edge regression: mse or mae
# - Edge multi-label: bce_with_logits
monitor_metric: auroc # Depends on task:
# - Higher-order binary: auroc, f1, accuracy
# - Edge regression: mae, rmse
# - Edge multi-label: f1, auroc

# Splits Configuration
split_params:
learning_setting: transductive # One simplicial complex; its labeled cells are split into train/val/test
data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}
data_seed: 42 # Random seed for reproducible splits

# Split Type Options:
# - "random": Random splitting with train_prop ratio
# - "k-fold": K-fold cross-validation
# - "fixed": Use HIGH-PPI's official train/val split (if available in raw data)
split_type: random

train_prop: 0.8 # For random/k-fold: 80% train; the remaining 20% is split evenly (10% val, 10% test)
# Ignored when split_type: fixed

# Dataloader
dataloader_params:
batch_size: 1
num_workers: 0
pin_memory: False
persistent_workers: False
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ dependencies=[
"topomodelx @ git+https://github.com/pyt-team/TopoModelX.git",
"toponetx @ git+https://github.com/pyt-team/TopoNetX.git",
"lightning==2.4.0",
"gdown",
"pybiomart",
]

[project.optional-dependencies]
Expand Down
5 changes: 1 addition & 4 deletions test/data/load/test_datasetloaders.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
"""Comprehensive test suite for all dataset loaders."""
import os
import pytest
import torch
import hydra
from pathlib import Path
from typing import List, Tuple, Dict, Any
from omegaconf import DictConfig
from topobench.data.preprocessor.preprocessor import PreProcessor

class TestLoaders:
"""Comprehensive test suite for all dataset loaders."""

Expand Down
Loading
Loading