From a9c57386b876d73ac0cbc1089d77f232a6fe0387 Mon Sep 17 00:00:00 2001 From: Dexiong Chen Date: Wed, 13 May 2026 20:39:40 +0200 Subject: [PATCH] Prepare package for PyPI release MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Packaging - pyproject.toml: switch to find-based package discovery (tedbench*, minesm*) so all subpackages are correctly installed; pin setuptools>=64 / setuptools-scm>=8; populate dependencies from requirements.txt; move fair-esm to [baselines] optional extra; add py.typed to package-data; richer classifiers (Development Status, Intended Audience, License, OS, Python 3.9–3.11, two Topic entries) - tedbench/py.typed: PEP 561 marker for type checkers Top-level API - tedbench/__init__.py: __version__ via importlib.metadata, list_models() returning metadata for all 12 TEDBench/* HF models, load_model(name) resolving short names via registry or passing through full HF repo IDs / local paths Documentation - README: pip install TEDBench as recommended option, baselines extra, tedbench.load_model / tedbench.list_models usage example - CITATION.cff: machine-readable citation for GitHub's Cite button - CHANGELOG.md: initial v0.1.0 entry CI / CD - .github/workflows/ci.yml: import-check CI on push/PR to main; uses --no-deps + find_spec to verify package structure without pulling in torch or other heavy runtime deps - .github/workflows/publish.yml: publish to PyPI on v* tag push via OIDC trusted publishing (already committed on main) Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/ci.yml | 63 ++++++++++++++++++++++++++++++++++++++++ CHANGELOG.md | 29 ++++++++++++++++++ CITATION.cff | 27 +++++++++++++++++ README.md | 16 ++++++++-- pyproject.toml | 16 +++++++++- tedbench/__init__.py | 27 +++++++++-------- tedbench/py.typed | 0 7 files changed, 160 insertions(+), 18 deletions(-) create mode 100644 .github/workflows/ci.yml create mode 100644 CHANGELOG.md create mode 100644 CITATION.cff create mode 100644 
tedbench/py.typed diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..99fb30f --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,63 @@ +name: CI + +on: + push: + branches: ["main"] + pull_request: + branches: ["main"] + +jobs: + test: + name: Import checks + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11"] + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 # needed for setuptools-scm + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install package (core only) + run: pip install -e . --no-deps + + - name: Verify top-level API + run: | + python -c " + import tedbench + assert tedbench.__version__ != '', 'version is empty' + models = tedbench.list_models() + assert len(models) == 12, f'expected 12 models, got {len(models)}' + print('tedbench OK — version:', tedbench.__version__) + " + + - name: Verify all subpackages are discoverable + run: | + python -c " + import importlib.util + + packages = [ + 'tedbench', + 'tedbench.model', + 'tedbench.data', + 'tedbench.layer', + 'tedbench.utils', + 'minesm', + 'minesm.layers', + 'minesm.models', + 'minesm.tokenization', + 'minesm.utils', + 'minesm.utils.constants', + 'minesm.utils.structure', + ] + for pkg in packages: + spec = importlib.util.find_spec(pkg) + assert spec is not None, f'package not found: {pkg}' + print(f' OK {pkg}') + print('all subpackages discoverable') + " diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..0951ecb --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,29 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and +this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ +--- + +## [0.1.0] — 2026-05-13 + +### Added + +- Initial public release accompanying the ICML 2026 paper + *Protein Fold Classification at Scale: Benchmarking and Pretraining*. +- **TEDBench dataset**: 462,175 AlphaFold structures across 965 CATH topology + classes, with train / val / test splits and an external CATH 4.4 experimental + test set (27,638 structures). Available on HuggingFace Hub + (`TEDBench/ted`) and via auto-download from MPCDF. +- **MiAE** (Masked Invariant Autoencoders): SE(3)-invariant masked autoencoder + for protein backbone frames in three sizes (S / B / L). Pretrained + checkpoints, fine-tuned fold classifiers, and from-scratch baselines are all + published on HuggingFace Hub. +- Top-level convenience API: `tedbench.load_model(name)` and + `tedbench.list_models()`. +- `LightningStructureDataset` data module supporting both HuggingFace Hub and + local auto-downloading backends. +- Baselines: ESM2, SaProt, ProteinMPNN (scripts in `baselines/`; requires + `pip install "tedbench[baselines]"` for ESM2/SaProt). diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000..1b3a4e8 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,27 @@ +cff-version: 1.2.0 +message: "If you use TEDBench or MiAE, please cite the following paper."
+title: "Protein Fold Classification at Scale: Benchmarking and Pretraining" +authors: + - family-names: Chen + given-names: Dexiong + email: dchen@biochem.mpg.de + - family-names: Manolache + given-names: Andrei + - family-names: Niepert + given-names: Mathias + - family-names: Borgwardt + given-names: Karsten +preferred-citation: + type: conference-paper + title: "Protein Fold Classification at Scale: Benchmarking and Pretraining" + authors: + - family-names: Chen + given-names: Dexiong + - family-names: Manolache + given-names: Andrei + - family-names: Niepert + given-names: Mathias + - family-names: Borgwardt + given-names: Karsten + year: 2026 + collection-title: "Proceedings of the 43rd International Conference on Machine Learning (ICML)" diff --git a/README.md b/README.md index 4b5bd81..011d58f 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,12 @@ reconstructs the full backbone structure with a lightweight decoder. pip install tedbench ``` +For running ESM2 / SaProt baselines, add the `baselines` extra: + +```bash +pip install "tedbench[baselines]" +``` + **From source** (for training, baselines, or development): ```bash @@ -132,10 +138,14 @@ auto-downloading local variant. 
All models are available on HuggingFace and can be loaded with a single call: ```python -from tedbench.utils.io import load_from_hf +import tedbench + +model = tedbench.load_model("miae-b") # pretrained MiAE-B (short name) +model = tedbench.load_model("miae-b-ft") # fine-tuned on TEDBench -model = load_from_hf("TEDBench/miae-b") # pretrained MiAE-B -model.eval() +# List all available models +for m in tedbench.list_models(): + print(m["name"], m["type"], m["params"]) ``` ### Pretrained MiAE (feature extractor / fine-tuning starting point) diff --git a/pyproject.toml b/pyproject.toml index 9215598..9beee13 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,16 @@ requires-python = ">=3.9" keywords = ["protein fold classification", "masked autoencoder", "protein structure", "geometric deep learning", "benchmark"] license = {text = "BSD-3-Clause"} classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Scientific/Engineering :: Bio-Informatics", ] dynamic = ["version"] dependencies = [ @@ -25,7 +34,6 @@ dependencies = [ "torchmetrics", "lightning", "hydra-core", - "fair-esm", "biotite", "biopython", "transformers", @@ -41,10 +49,16 @@ dependencies = [ "tqdm", ] +[project.optional-dependencies] +baselines = ["fair-esm"] + [tool.setuptools.packages.find] where = ["."] include = ["tedbench*", "minesm*"] exclude = ["baselines*", "configs*", "datasets*", "tmp*"] +[tool.setuptools.package-data] +tedbench = ["py.typed"] + [tool.setuptools_scm] write_to = "tedbench/_version.py" diff --git a/tedbench/__init__.py b/tedbench/__init__.py index 8066465..5cf6deb 100644 --- a/tedbench/__init__.py +++
b/tedbench/__init__.py @@ -25,82 +25,81 @@ # Model registry # --------------------------------------------------------------------------- -_HF_ORG = "dexiongc" -_PREFIX = "tedbench" +_HF_ORG = "TEDBench" _MODEL_REGISTRY: dict[str, dict] = { # ---- Small (29M) ---- "miae-s": { - "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-s", + "repo_id": f"{_HF_ORG}/miae-s", "params": "29M", "type": "pretrained", "description": "MiAE-Small pretrained encoder", }, "miae-s-ft": { - "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-s-ft", + "repo_id": f"{_HF_ORG}/miae-s-ft", "params": "29M", "type": "fine-tuned", "description": "MiAE-Small fine-tuned on TEDBench fold classification", }, "miae-s-sc": { - "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-s-sc", + "repo_id": f"{_HF_ORG}/miae-s-sc", "params": "29M", "type": "from-scratch", "description": "MiAE-Small trained from scratch on TEDBench", }, # ---- Base (102M) ---- "miae-b": { - "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-b", + "repo_id": f"{_HF_ORG}/miae-b", "params": "102M", "type": "pretrained", "description": "MiAE-Base pretrained encoder", }, "miae-b-ft": { - "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-b-ft", + "repo_id": f"{_HF_ORG}/miae-b-ft", "params": "102M", "type": "fine-tuned", "description": "MiAE-Base fine-tuned on TEDBench fold classification", }, "miae-b-sc": { - "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-b-sc", + "repo_id": f"{_HF_ORG}/miae-b-sc", "params": "102M", "type": "from-scratch", "description": "MiAE-Base trained from scratch on TEDBench", }, # ---- Base + sequence input (102M) ---- "miae-b-seq": { - "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-b-seq", + "repo_id": f"{_HF_ORG}/miae-b-seq", "params": "102M", "type": "pretrained", "description": "MiAE-Base+seq pretrained encoder (structure + sequence tokens)", }, "miae-b-seq-ft": { - "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-b-seq-ft", + "repo_id": f"{_HF_ORG}/miae-b-seq-ft", "params": "102M", "type": "fine-tuned", "description": "MiAE-Base+seq fine-tuned on TEDBench fold classification", }, "miae-b-seq-sc": { 
- "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-b-seq-sc", + "repo_id": f"{_HF_ORG}/miae-b-seq-sc", "params": "102M", "type": "from-scratch", "description": "MiAE-Base+seq trained from scratch on TEDBench", }, # ---- Large (339M) ---- "miae-l": { - "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-l", + "repo_id": f"{_HF_ORG}/miae-l", "params": "339M", "type": "pretrained", "description": "MiAE-Large pretrained encoder", }, "miae-l-ft": { - "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-l-ft", + "repo_id": f"{_HF_ORG}/miae-l-ft", "params": "339M", "type": "fine-tuned", "description": "MiAE-Large fine-tuned on TEDBench fold classification", }, "miae-l-sc": { - "repo_id": f"{_HF_ORG}/{_PREFIX}-miae-l-sc", + "repo_id": f"{_HF_ORG}/miae-l-sc", "params": "339M", "type": "from-scratch", "description": "MiAE-Large trained from scratch on TEDBench", diff --git a/tedbench/py.typed b/tedbench/py.typed new file mode 100644 index 0000000..e69de29