From a9c57386b876d73ac0cbc1089d77f232a6fe0387 Mon Sep 17 00:00:00 2001
From: Dexiong Chen <dexiongc@gmail.com>
Date: Wed, 13 May 2026 20:39:40 +0200
Subject: [PATCH] Prepare package for PyPI release
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Packaging
- pyproject.toml: switch to find-based package discovery (tedbench*,
  minesm*) so all subpackages are correctly installed; pin
  setuptools>=64 / setuptools-scm>=8; populate dependencies from
  requirements.txt; move fair-esm to [baselines] optional extra;
  add py.typed to package-data; richer classifiers (Development Status,
  Intended Audience, License, OS, Python 3.9–3.11, two Topic entries)
- tedbench/py.typed: PEP 561 marker for type checkers

Top-level API
- tedbench/__init__.py: __version__ via importlib.metadata,
  list_models() returning metadata for all 12 TEDBench/* HF models,
  load_model(name) resolving short names via registry or passing
  through full HF repo IDs / local paths

Documentation
- README: pip install TEDBench as recommended option, baselines extra,
  tedbench.load_model / tedbench.list_models usage example
- CITATION.cff: machine-readable citation for GitHub's Cite button
- CHANGELOG.md: initial v0.1.0 entry

CI / CD
- .github/workflows/ci.yml: import-check CI on push/PR to main;
  uses --no-deps + find_spec to verify package structure without
  pulling in torch or other heavy runtime deps
- .github/workflows/publish.yml: publish to PyPI on v* tag push
  via OIDC trusted publishing (already committed on main)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/ci.yml | 63 ++++++++++++++++++++++++++++++++++++++++
 CHANGELOG.md             | 29 ++++++++++++++++++
 CITATION.cff             | 27 +++++++++++++++++
 README.md                | 16 ++++++++--
 pyproject.toml           | 16 +++++++++-
 tedbench/__init__.py     | 27 +++++++++--------
 tedbench/py.typed        |  0
 7 files changed, 160 insertions(+), 18 deletions(-)
 create mode 100644 .github/workflows/ci.yml
 create mode 100644 CHANGELOG.md
 create mode 100644 CITATION.cff
 create mode 100644 tedbench/py.typed

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..99fb30f
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,63 @@
+name: CI
+
+on:
+  push:
+    branches: ["main"]
+  pull_request:
+    branches: ["main"]
+
+jobs:
+  test:
+    name: Import checks
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11"]
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # needed for setuptools-scm
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install package (core only)
+        run: pip install -e . --no-deps
+
+      - name: Verify top-level API
+        run: |
+          python -c "
+          import tedbench
+          assert tedbench.__version__ != '', 'version is empty'
+          models = tedbench.list_models()
+          assert len(models) == 12, f'expected 12 models, got {len(models)}'
+          print('tedbench OK — version:', tedbench.__version__)
+          "
+
+      - name: Verify all subpackages are discoverable
+        run: |
+          python -c "
+          import importlib.util
+
+          packages = [
+              'tedbench',
+              'tedbench.model',
+              'tedbench.data',
+              'tedbench.layer',
+              'tedbench.utils',
+              'minesm',
+              'minesm.layers',
+              'minesm.models',
+              'minesm.tokenization',
+              'minesm.utils',
+              'minesm.utils.constants',
+              'minesm.utils.structure',
+          ]
+          for pkg in packages:
+              spec = importlib.util.find_spec(pkg)
+              assert spec is not None, f'package not found: {pkg}'
+              print(f'  OK  {pkg}')
+          print('all subpackages discoverable')
+          "
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..0951ecb
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,29 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and
+this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+---
+
+## [0.1.0] — 2026-05-13
+
+### Added
+
+- Initial public release accompanying the ICML 2026 paper
+  *Protein Fold Classification at Scale: Benchmarking and Pretraining*.
+- **TEDBench dataset**: 462,175 AlphaFold structures across 965 CATH topology
+  classes, with train / val / test splits and an external CATH 4.4 experimental
+  test set (27,638 structures). Available on HuggingFace Hub
+  (`TEDBench/ted`) and via auto-download from MPCDF.
+- **MiAE** (Masked Invariant Autoencoders): SE(3)-invariant masked autoencoder
+  for protein backbone frames in three sizes (S / B / L).  Pretrained
+  checkpoints, fine-tuned fold classifiers, and from-scratch baselines are all
+  published on HuggingFace Hub.
+- Top-level convenience API: `tedbench.load_model(name)` and
+  `tedbench.list_models()`.
+- `LightningStructureDataset` data module supporting both HuggingFace Hub and
+  local auto-downloading backends.
+- Baselines: ESM2, SaProt, ProteinMPNN (scripts in `baselines/`; requires
+  `pip install TEDBench[baselines]` for ESM2/SaProt).
diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 0000000..1b3a4e8
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,27 @@
+cff-version: 1.2.0
+message: "If you use TEDBench or MiAE, please cite the following paper."
+title: "Protein Fold Classification at Scale: Benchmarking and Pretraining"
+authors:
+  - family-names: Chen
+    given-names: Dexiong
+    email: dchen@biochem.mpg.de
+  - family-names: Manolache
+    given-names: Andrei
+  - family-names: Niepert
+    given-names: Mathias
+  - family-names: Borgwardt
+    given-names: Karsten
+preferred-citation:
+  type: conference-paper
+  title: "Protein Fold Classification at Scale: Benchmarking and Pretraining"
+  authors:
+    - family-names: Chen
+      given-names: Dexiong
+    - family-names: Manolache
+      given-names: Andrei
+    - family-names: Niepert
+      given-names: Mathias
+    - family-names: Borgwardt
+      given-names: Karsten
+  year: 2026
+  collection-title: "Proceedings of the 43rd International Conference on Machine Learning (ICML)"
diff --git a/README.md b/README.md
index 4b5bd81..011d58f 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,12 @@ reconstructs the full backbone structure with a lightweight decoder.
 pip install tedbench
 ```
 
+For running ESM2 / SaProt baselines, add the `baselines` extra:
+
+```bash
+pip install "tedbench[baselines]"
+```
+
 **From source** (for training, baselines, or development):
 
 ```bash
@@ -132,10 +138,14 @@ auto-downloading local variant.
 All models are available on HuggingFace and can be loaded with a single call:
 
 ```python
-from tedbench.utils.io import load_from_hf
+import tedbench
+
+model = tedbench.load_model("miae-b")     # pretrained MiAE-B (short name)
+model = tedbench.load_model("miae-b-ft")  # fine-tuned on TEDBench
 
-model = load_from_hf("TEDBench/miae-b")  # pretrained MiAE-B
-model.eval()
+# List all available models
+for m in tedbench.list_models():
+    print(m["name"], m["type"], m["params"])
 ```
 
 ### Pretrained MiAE (feature extractor / fine-tuning starting point)
diff --git a/pyproject.toml b/pyproject.toml
index 9215598..9beee13 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,16 @@ requires-python = ">=3.9"
 keywords = ["protein fold classification", "masked autoencoder", "protein structure", "geometric deep learning", "benchmark"]
 license = {text = "BSD-3-Clause"}
 classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: BSD Software License",
+    "Operating System :: OS Independent",
     "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Scientific/Engineering :: Bio-Informatics",
 ]
 dynamic = ["version"]
 dependencies = [
@@ -25,7 +34,6 @@ dependencies = [
     "torchmetrics",
     "lightning",
     "hydra-core",
-    "fair-esm",
     "biotite",
     "biopython",
     "transformers",
@@ -41,10 +49,16 @@ dependencies = [
     "tqdm",
 ]
 
+[project.optional-dependencies]
+baselines = ["fair-esm"]
+
 [tool.setuptools.packages.find]
 where = ["."]
 include = ["tedbench*", "minesm*"]
 exclude = ["baselines*", "configs*", "datasets*", "tmp*"]
 
+[tool.setuptools.package-data]
+tedbench = ["py.typed"]
+
 [tool.setuptools_scm]
 write_to = "tedbench/_version.py"
diff --git a/tedbench/__init__.py b/tedbench/__init__.py
index 8066465..5cf6deb 100644
--- a/tedbench/__init__.py
+++ b/tedbench/__init__.py
@@ -25,82 +25,81 @@
 # Model registry
 # ---------------------------------------------------------------------------
 
-_HF_ORG = "dexiongc"
-_PREFIX = "tedbench"
+_HF_ORG = "TEDBench"
 
 _MODEL_REGISTRY: dict[str, dict] = {
     # ---- Small (29M) ----
     "miae-s": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-s",
+        "repo_id": f"{_HF_ORG}/miae-s",
         "params": "29M",
         "type": "pretrained",
         "description": "MiAE-Small pretrained encoder",
     },
     "miae-s-ft": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-s-ft",
+        "repo_id": f"{_HF_ORG}/miae-s-ft",
         "params": "29M",
         "type": "fine-tuned",
         "description": "MiAE-Small fine-tuned on TEDBench fold classification",
     },
     "miae-s-sc": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-s-sc",
+        "repo_id": f"{_HF_ORG}/miae-s-sc",
         "params": "29M",
         "type": "from-scratch",
         "description": "MiAE-Small trained from scratch on TEDBench",
     },
     # ---- Base (102M) ----
     "miae-b": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-b",
+        "repo_id": f"{_HF_ORG}/miae-b",
         "params": "102M",
         "type": "pretrained",
         "description": "MiAE-Base pretrained encoder",
     },
     "miae-b-ft": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-b-ft",
+        "repo_id": f"{_HF_ORG}/miae-b-ft",
         "params": "102M",
         "type": "fine-tuned",
         "description": "MiAE-Base fine-tuned on TEDBench fold classification",
     },
     "miae-b-sc": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-b-sc",
+        "repo_id": f"{_HF_ORG}/miae-b-sc",
         "params": "102M",
         "type": "from-scratch",
         "description": "MiAE-Base trained from scratch on TEDBench",
     },
     # ---- Base + sequence input (102M) ----
     "miae-b-seq": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-b-seq",
+        "repo_id": f"{_HF_ORG}/miae-b-seq",
         "params": "102M",
         "type": "pretrained",
         "description": "MiAE-Base+seq pretrained encoder (structure + sequence tokens)",
     },
     "miae-b-seq-ft": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-b-seq-ft",
+        "repo_id": f"{_HF_ORG}/miae-b-seq-ft",
         "params": "102M",
         "type": "fine-tuned",
         "description": "MiAE-Base+seq fine-tuned on TEDBench fold classification",
     },
     "miae-b-seq-sc": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-b-seq-sc",
+        "repo_id": f"{_HF_ORG}/miae-b-seq-sc",
         "params": "102M",
         "type": "from-scratch",
         "description": "MiAE-Base+seq trained from scratch on TEDBench",
     },
     # ---- Large (339M) ----
     "miae-l": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-l",
+        "repo_id": f"{_HF_ORG}/miae-l",
         "params": "339M",
         "type": "pretrained",
         "description": "MiAE-Large pretrained encoder",
     },
     "miae-l-ft": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-l-ft",
+        "repo_id": f"{_HF_ORG}/miae-l-ft",
         "params": "339M",
         "type": "fine-tuned",
         "description": "MiAE-Large fine-tuned on TEDBench fold classification",
     },
     "miae-l-sc": {
-        "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-l-sc",
+        "repo_id": f"{_HF_ORG}/miae-l-sc",
         "params": "339M",
         "type": "from-scratch",
         "description": "MiAE-Large trained from scratch on TEDBench",
diff --git a/tedbench/py.typed b/tedbench/py.typed
new file mode 100644
index 0000000..e69de29