BorgwardtLab · claying · May 13, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,73 @@
+name: CI
+
+on:
+  push:
+    branches: ["main"]
+  pull_request:
+    branches: ["main"]
+
+# Least privilege: the job below only needs to read the repository contents.
+permissions:
+  contents: read
+
+jobs:
+  test:
+    name: Import checks
+    runs-on: ubuntu-latest
+    strategy:
+      # Let every Python version report its own result instead of cancelling
+      # the remaining matrix jobs as soon as one of them fails.
+      fail-fast: false
+      matrix:
+        # Covers every version advertised in pyproject.toml
+        # (requires-python >= 3.9; classifiers for 3.9 / 3.10 / 3.11).
+        python-version: ["3.9", "3.10", "3.11"]
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # needed for setuptools-scm
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      # --no-deps: install only the package itself, not its declared dependencies.
+      - name: Install package (core only)
+        run: pip install -e . --no-deps
+
+      - name: Verify top-level API
+        run: |
+          python -c "
+          import tedbench
+          assert tedbench.__version__ != '', 'version is empty'
+          models = tedbench.list_models()
+          assert len(models) == 12, f'expected 12 models, got {len(models)}'
+          print('tedbench OK — version:', tedbench.__version__)
+          "
+
+      - name: Verify all subpackages are discoverable
+        run: |
+          python -c "
+          import importlib.util
+
+          packages = [
+              'tedbench',
+              'tedbench.model',
+              'tedbench.data',
+              'tedbench.layer',
+              'tedbench.utils',
+              'minesm',
+              'minesm.layers',
+              'minesm.models',
+              'minesm.tokenization',
+              'minesm.utils',
+              'minesm.utils.constants',
+              'minesm.utils.structure',
+          ]
+          for pkg in packages:
+              spec = importlib.util.find_spec(pkg)
+              assert spec is not None, f'package not found: {pkg}'
+              print(f'  OK  {pkg}')
+          print('all subpackages discoverable')
+          "
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,29 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and
+this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+---
+
+## [0.1.0] — 2026-05-13
+
+### Added
+
+- Initial public release accompanying the ICML 2026 paper
+  *Protein Fold Classification at Scale: Benchmarking and Pretraining*.
+- **TEDBench dataset**: 462,175 AlphaFold structures across 965 CATH topology
+  classes, with train / val / test splits and an external CATH 4.4 experimental
+  test set (27,638 structures). Available on HuggingFace Hub
+  (`TEDBench/ted`) and via auto-download from MPCDF.
+- **MiAE** (Masked Invariant Autoencoders): SE(3)-invariant masked autoencoder
+  for protein backbone frames in three sizes (S / B / L).  Pretrained
+  checkpoints, fine-tuned fold classifiers, and from-scratch baselines are all
+  published on HuggingFace Hub.
+- Top-level convenience API: `tedbench.load_model(name)` and
+  `tedbench.list_models()`.
+- `LightningStructureDataset` data module supporting both HuggingFace Hub and
+  local auto-downloading backends.
+- Baselines: ESM2, SaProt, ProteinMPNN (scripts in `baselines/`; ESM2/SaProt
+  require `pip install "tedbench[baselines]"`).
diff --git a/CITATION.cff b/CITATION.cff
@@ -0,0 +1,27 @@
+cff-version: 1.2.0  # Citation File Format schema version this file is written against
+message: "If you use TEDBench or MiAE, please cite the following paper."
+title: "Protein Fold Classification at Scale: Benchmarking and Pretraining"
+authors:  # top-level author list; same four names as under preferred-citation below
+  - family-names: Chen
+    given-names: Dexiong
+    email: dchen@biochem.mpg.de  # the only author entry that carries a contact address
+  - family-names: Manolache
+    given-names: Andrei
+  - family-names: Niepert
+    given-names: Mathias
+  - family-names: Borgwardt
+    given-names: Karsten
+preferred-citation:  # the paper that the message above asks users to cite
+  type: conference-paper
+  title: "Protein Fold Classification at Scale: Benchmarking and Pretraining"
+  authors:
+    - family-names: Chen
+      given-names: Dexiong
+    - family-names: Manolache
+      given-names: Andrei
+    - family-names: Niepert
+      given-names: Mathias
+    - family-names: Borgwardt
+      given-names: Karsten
+  year: 2026
+  collection-title: "Proceedings of the 43rd International Conference on Machine Learning (ICML)"  # proceedings volume
diff --git a/README.md b/README.md
@@ -37,6 +37,12 @@ reconstructs the full backbone structure with a lightweight decoder.
 pip install tedbench
 ```
 
+To run the ESM2 / SaProt baselines, install the `baselines` extra:
+
+```bash
+pip install "tedbench[baselines]"
+```
+
 **From source** (for training, baselines, or development):
 
 ```bash
@@ -132,10 +138,14 @@ auto-downloading local variant.
 All models are available on HuggingFace and can be loaded with a single call:
 
 ```python
-from tedbench.utils.io import load_from_hf
+import tedbench
+
+model = tedbench.load_model("miae-b")     # pretrained MiAE-B (short name)
+model = tedbench.load_model("miae-b-ft")  # fine-tuned on TEDBench
 
-model = load_from_hf("TEDBench/miae-b")  # pretrained MiAE-B
-model.eval()
+# List all available models
+for m in tedbench.list_models():
+    print(m["name"], m["type"], m["params"])
 ```
 
 ### Pretrained MiAE (feature extractor / fine-tuning starting point)

diff --git a/pyproject.toml b/pyproject.toml
@@ -14,7 +14,16 @@ requires-python = ">=3.9"
 keywords = ["protein fold classification", "masked autoencoder", "protein structure", "geometric deep learning", "benchmark"]
 license = {text = "BSD-3-Clause"}
 classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: BSD License",  # must match a registered Trove classifier exactly; PyPI rejects unknown ones
+    "Operating System :: OS Independent",
     "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Scientific/Engineering :: Bio-Informatics",
 ]
 dynamic = ["version"]
 dependencies = [
@@ -25,7 +34,6 @@ dependencies = [
     "torchmetrics",
     "lightning",
     "hydra-core",
-    "fair-esm",
     "biotite",
     "biopython",
     "transformers",
@@ -41,10 +49,16 @@ dependencies = [
     "tqdm",
 ]
 
+[project.optional-dependencies]
+baselines = ["fair-esm"]  # fair-esm is no longer a core dependency; installed only with the "baselines" extra
+
 [tool.setuptools.packages.find]
 where = ["."]
 include = ["tedbench*", "minesm*"]
 exclude = ["baselines*", "configs*", "datasets*", "tmp*"]
 
+[tool.setuptools.package-data]
+tedbench = ["py.typed"]  # include the tedbench/py.typed marker file (PEP 561) as package data
+
 [tool.setuptools_scm]
 write_to = "tedbench/_version.py"
diff --git a/tedbench/__init__.py b/tedbench/__init__.py
@@ -25,82 +25,81 @@
 # Model registry
 # ---------------------------------------------------------------------------
 
-_HF_ORG = "dexiongc"
-_PREFIX = "tedbench"
+_HF_ORG = "TEDBench"
 
 _MODEL_REGISTRY: dict[str, dict] = {
     # ---- Small (29M) ----
     "miae-s": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-s",
+        "repo_id": f"{_HF_ORG}/miae-s",
         "params": "29M",
         "type": "pretrained",
         "description": "MiAE-Small pretrained encoder",
     },
     "miae-s-ft": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-s-ft",
+        "repo_id": f"{_HF_ORG}/miae-s-ft",
         "params": "29M",
         "type": "fine-tuned",
         "description": "MiAE-Small fine-tuned on TEDBench fold classification",
     },
     "miae-s-sc": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-s-sc",
+        "repo_id": f"{_HF_ORG}/miae-s-sc",
         "params": "29M",
         "type": "from-scratch",
         "description": "MiAE-Small trained from scratch on TEDBench",
     },
     # ---- Base (102M) ----
     "miae-b": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-b",
+        "repo_id": f"{_HF_ORG}/miae-b",
         "params": "102M",
         "type": "pretrained",
         "description": "MiAE-Base pretrained encoder",
     },
     "miae-b-ft": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-b-ft",
+        "repo_id": f"{_HF_ORG}/miae-b-ft",
         "params": "102M",
         "type": "fine-tuned",
         "description": "MiAE-Base fine-tuned on TEDBench fold classification",
     },
     "miae-b-sc": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-b-sc",
+        "repo_id": f"{_HF_ORG}/miae-b-sc",
         "params": "102M",
         "type": "from-scratch",
         "description": "MiAE-Base trained from scratch on TEDBench",
     },
     # ---- Base + sequence input (102M) ----
     "miae-b-seq": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-b-seq",
+        "repo_id": f"{_HF_ORG}/miae-b-seq",
         "params": "102M",
         "type": "pretrained",
         "description": "MiAE-Base+seq pretrained encoder (structure + sequence tokens)",
     },
     "miae-b-seq-ft": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-b-seq-ft",
+        "repo_id": f"{_HF_ORG}/miae-b-seq-ft",
         "params": "102M",
         "type": "fine-tuned",
         "description": "MiAE-Base+seq fine-tuned on TEDBench fold classification",
     },
     "miae-b-seq-sc": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-b-seq-sc",
+        "repo_id": f"{_HF_ORG}/miae-b-seq-sc",
         "params": "102M",
         "type": "from-scratch",
         "description": "MiAE-Base+seq trained from scratch on TEDBench",
     },
     # ---- Large (339M) ----
     "miae-l": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-l",
+        "repo_id": f"{_HF_ORG}/miae-l",
         "params": "339M",
         "type": "pretrained",
         "description": "MiAE-Large pretrained encoder",
     },
     "miae-l-ft": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-l-ft",
+        "repo_id": f"{_HF_ORG}/miae-l-ft",
         "params": "339M",
         "type": "fine-tuned",
         "description": "MiAE-Large fine-tuned on TEDBench fold classification",
     },
     "miae-l-sc": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-l-sc",
+        "repo_id": f"{_HF_ORG}/miae-l-sc",
         "params": "339M",
         "type": "from-scratch",
         "description": "MiAE-Large trained from scratch on TEDBench",

diff --git a/tedbench/py.typed b/tedbench/py.typed