cbib · majdabd · Jun 28, 2026 · Jun 19, 2026 · Jun 21, 2026 · Jun 21, 2026
diff --git a/.env.example b/.env.example
@@ -0,0 +1,76 @@
+# Copy this file to .env for local TrialMatchAI runtime configuration.
+# Do not commit .env.
+
+TRIALMATCHAI_OUTPUT_DIR=results
+TRIALMATCHAI_TRIALS_JSON_FOLDER=data/trials_jsons
+TRIALMATCHAI_PATIENT_RAW_DIR=data/patients/raw
+TRIALMATCHAI_PATIENT_PROFILE_DIR=data/patients/profiles
+TRIALMATCHAI_PATIENT_SUMMARY_DIR=data/patients/summaries
+TRIALMATCHAI_PATIENT_INPUT_FORMAT=auto
+TRIALMATCHAI_PATIENT_STRICT_VALIDATION=false
+TRIALMATCHAI_PATIENT_COPY_RAW=true
+
+TRIALMATCHAI_SEARCH_BACKEND=lancedb
+TRIALMATCHAI_SEARCH_DB_PATH=data/search
+TRIALMATCHAI_SEARCH_TRIALS_TABLE=trials
+TRIALMATCHAI_SEARCH_CRITERIA_TABLE=criteria
+TRIALMATCHAI_SEARCH_CANDIDATE_LIMIT=1000
+TRIALMATCHAI_SEARCH_MODE=hybrid
+TRIALMATCHAI_FIRST_LEVEL_ENABLED=true
+TRIALMATCHAI_FIRST_LEVEL_MAX_TRIALS=1000
+TRIALMATCHAI_FIRST_LEVEL_PER_CHANNEL_SIZE=300
+TRIALMATCHAI_FIRST_LEVEL_VECTOR_SCORE_THRESHOLD=0.0
+TRIALMATCHAI_FIRST_LEVEL_LLM_EXPANSION_ENABLED=false
+TRIALMATCHAI_FIRST_LEVEL_WRITE_REPORTS=true
+
+TRIALMATCHAI_MODEL_BASE_MODEL=microsoft/phi-4
+TRIALMATCHAI_MODEL_COT_ADAPTER_PATH=models/finetuned_phi_reasoning
+TRIALMATCHAI_MODEL_RERANKER_MODEL_PATH=google/gemma-2-2b-it
+TRIALMATCHAI_MODEL_RERANKER_ADAPTER_PATH=models/finetuned_gemma2
+TRIALMATCHAI_MODEL_TRUST_REMOTE_CODE=false
+TRIALMATCHAI_COT_BACKEND=vllm
+
+# Runtime CoT query expansion (legacy keywords.json behaviour). Off by default;
+# the TREC preset enables it. Model/adapter default to the CoT reasoning model.
+TRIALMATCHAI_QUERY_EXPANSION_ENABLED=false
+TRIALMATCHAI_QUERY_EXPANSION_BACKEND=
+TRIALMATCHAI_QUERY_EXPANSION_MODEL=
+TRIALMATCHAI_QUERY_EXPANSION_ADAPTER=
+
+TRIALMATCHAI_ENTITY_BACKEND=gliner2
+TRIALMATCHAI_ENTITY_MODEL_NAME=fastino/gliner2-base-v1
+TRIALMATCHAI_ENTITY_SCHEMA_PATH=entity_schemas/trialmatchai.yaml
+TRIALMATCHAI_ENTITY_TRUST_REMOTE_CODE=false
+TRIALMATCHAI_CONCEPT_DB_PATH=data/concepts
+TRIALMATCHAI_CONCEPT_TABLE=concepts
+TRIALMATCHAI_LINK_ACCEPT=0.80
+TRIALMATCHAI_LINK_REJECT=0.30
+
+TRIALMATCHAI_CONSTRAINTS_ENABLED=true
+TRIALMATCHAI_CONSTRAINTS_SCORE_WEIGHT=0.25
+TRIALMATCHAI_CONSTRAINTS_LLM_EXTRACTION_ENABLED=false
+TRIALMATCHAI_CONSTRAINTS_WRITE_REPORTS=true
+
+TRIALMATCHAI_REGISTRY_SOURCE=clinicaltrials.gov
+TRIALMATCHAI_REGISTRY_API_BASE_URL=https://clinicaltrials.gov/api/v2/studies
+TRIALMATCHAI_REGISTRY_KEYWORDS_FILE=
+TRIALMATCHAI_REGISTRY_SINCE_DAYS=7
+TRIALMATCHAI_REGISTRY_MAX_STUDIES=
+TRIALMATCHAI_REGISTRY_REQUEST_TIMEOUT=30
+TRIALMATCHAI_REGISTRY_RATE_LIMIT_PER_SECOND=2
+TRIALMATCHAI_REGISTRY_RAW_DIR=data/registry/raw
+TRIALMATCHAI_REGISTRY_MANIFEST_PATH=data/registry/manifest.jsonl
+TRIALMATCHAI_REGISTRY_REPORTS_DIR=data/registry/runs
+TRIALMATCHAI_REGISTRY_FAILURE_THRESHOLD=0.25
+
+TRIALMATCHAI_PROCESSED_TRIALS_SHA256=
+TRIALMATCHAI_MODELS_SHA256=
+TRIALMATCHAI_CRITERIA_PART_0_SHA256=
+TRIALMATCHAI_CRITERIA_PART_1_SHA256=
+TRIALMATCHAI_CRITERIA_PART_2_SHA256=
+TRIALMATCHAI_CRITERIA_PART_3_SHA256=
+TRIALMATCHAI_CRITERIA_PART_4_SHA256=
+TRIALMATCHAI_CRITERIA_PART_5_SHA256=
+
+TRIALMATCHAI_LOG_LEVEL=INFO
+TRIALMATCHAI_LOG_JSON=1
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -0,0 +1,27 @@
+version: 2
+updates:
+  - package-ecosystem: "uv"  # project is locked with uv.lock (CI: uv lock --check); "pip" would not update the lockfile
+    directory: "/"
+    schedule:
+      interval: "weekly"
+    open-pull-requests-limit: 5
+    groups:
+      # The heavy ML stack lives in optional extras and is intentionally pinned to
+      # versions compatible with vLLM (see the CI pip-audit policy). Collapse its
+      # frequent advisories into one low-noise PR instead of dozens.
+      ml-stack:
+        patterns:
+          - "torch*"
+          - "vllm*"
+          - "transformers*"
+          - "accelerate*"
+          - "bitsandbytes*"
+          - "nvidia-*"
+          - "xformers*"
+      dev-dependencies:
+        dependency-type: "development"
+
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -2,18 +2,13 @@ name: CI
 
 on:
   push:
-    branches: [main, dev]
+    branches: [main, dev, deployment-readiness-audit]
   pull_request:
     branches: [main, dev]
   workflow_dispatch:
 
 jobs:
-  test:
-    strategy:
-      matrix:
-        python-version: ["3.11", "3.12"]
-      fail-fast: false
-
+  deployment-readiness:
     runs-on: ubuntu-latest
 
     steps:
@@ -26,10 +21,101 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
-          python-version: ${{ matrix.python-version }}
+          python-version: "3.11"
 
       - name: Install dependencies
         run: uv sync --frozen
 
+      - name: Validate lockfile
+        run: uv lock --check
+
+      - name: Lint
+        run: uv run ruff check .
+
       - name: Run tests
-        run: uv run pytest -v
+        run: uv run pytest -v
+
+      - name: Build package
+        run: uv build
+
+      - name: Installed package smoke
+        run: |
+          WHEEL="$(ls "$PWD"/dist/trialmatchai-*.whl | head -n 1)"
+          mkdir -p /tmp/trialmatchai-installed-smoke
+          cd /tmp/trialmatchai-installed-smoke
+          uv run --python 3.11 --isolated --with "$WHEEL" python - <<'PY'
+          from pathlib import Path
+          from trialmatchai.config.config_loader import load_config
+
+          cfg = load_config()
+          assert Path(cfg["entity_extraction"]["schema_path"]).exists()
+          assert cfg["paths"]["output_dir"].endswith("results")
+          PY
+
+      - name: CLI help smoke
+        run: |
+          WHEEL="$(ls dist/trialmatchai-*.whl | head -n 1)"
+          uv run --python 3.11 --isolated --with "$WHEEL" trialmatchai --help
+          for cmd in pipeline healthcheck bootstrap-data index build-concepts update-registry \
+                     import-patient build run e2e trec finetune; do
+            uv run --python 3.11 --isolated --with "$WHEEL" trialmatchai "$cmd" --help
+          done
+
+      - name: Secret scan
+        run: uv run pre-commit run gitleaks --all-files
+
+      - name: Dependency audit
+        # vLLM 0.23 pins torch 2.11.0; CVE-2025-3000 has no fixed version listed.
+        run: uv run pip-audit --progress-spinner off --ignore-vuln CVE-2025-3000
+
+  ml-extras-smoke:
+    # Installs the optional `entity` extra and imports the local-model surface
+    # that the default job never touches, so a broken import or wrong
+    # transformers/gliner API call is caught instead of shipping silently.
+    # Non-blocking: it exercises heavy third-party deps (torch/gliner) whose
+    # availability we don't gate releases on; failures show up as a signal
+    # without failing the workflow (see continue-on-error below).
+    continue-on-error: true
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install with entity extra (CPU)
+        run: uv sync --frozen --extra entity  # install exactly what uv.lock pins, matching the main job's `uv sync --frozen`
+
+      - name: Import ML surface
+        run: |
+          uv run python - <<'PY'
+          import importlib
+
+          # The entity extra's heavy libs must import cleanly in the resolved
+          # environment (catches a bad transformers API or a torch pin conflict).
+          for lib in ("torch", "transformers", "gliner", "gliner2"):
+              importlib.import_module(lib)
+              print(f"imported {lib}")
+
+          # The package's local-model modules must import with extras present.
+          modules = [
+              "trialmatchai.entities.recognizers",
+              "trialmatchai.models.embedding.text_embedder",
+              "trialmatchai.models.llm.llm_reranker",
+              "trialmatchai.models.llm.vllm_loader",
+              "trialmatchai.matching.eligibility_reasoning_vllm",
+              "trialmatchai.finetuning.cot",
+              "trialmatchai.finetuning.reranker",
+              "trialmatchai.finetuning.ner",
+          ]
+          for name in modules:
+              importlib.import_module(name)
+              print(f"imported {name}")
+          PY
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -0,0 +1,28 @@
+name: Docs
+
+# Strict MkDocs build on the pushes/PRs listed below; deploys to GitHub Pages from main only.
+on:
+  push:
+    branches: [main, deployment-readiness-audit]
+  pull_request:
+    branches: [main]
+  workflow_dispatch: {}
+
+permissions:
+  contents: write # mkdocs gh-deploy pushes to the gh-pages branch. NOTE(review): workflow-wide, so build-only runs request it too
+
+jobs:
+  docs:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v5
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Build docs (strict)
+        run: uv run --group docs mkdocs build --strict
+      - name: Deploy to GitHub Pages (main only)
+        if: github.ref == 'refs/heads/main'
+        run: uv run --group docs mkdocs gh-deploy --force
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -0,0 +1,44 @@
+name: Release
+
+# Publishes trialmatchai to PyPI when a GitHub Release is published or the workflow is dispatched manually.
+# Uses PyPI Trusted Publishing (OIDC) — no API token is stored in the repo.
+# One-time setup on PyPI: add a Trusted Publisher for this repo + workflow +
+# the `pypi` environment (https://docs.pypi.org/trusted-publishers/).
+
+on:
+  release:
+    types: [published]
+  workflow_dispatch: {}
+
+permissions:
+  contents: read # workflow default; the pypi-publish job declares its own permissions below
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v5
+      - name: Build sdist + wheel
+        run: uv build
+      - name: Check the built metadata
+        run: uvx twine check dist/*
+      - uses: actions/upload-artifact@v4
+        with:
+          name: dist
+          path: dist/
+
+  pypi-publish:
+    needs: build # publishes the `dist` artifact uploaded by the build job
+    runs-on: ubuntu-latest
+    environment: pypi # the environment named in the Trusted Publisher setup note in the header
+    permissions:
+      id-token: write # required for PyPI Trusted Publishing (OIDC)
+    steps:
+      - uses: actions/download-artifact@v4
+        with:
+          name: dist
+          path: dist/
+      - uses: astral-sh/setup-uv@v5
+      - name: Publish to PyPI
+        run: uv publish --trusted-publishing always
diff --git a/.gitignore b/.gitignore
@@ -3,53 +3,40 @@
 # ============================
 data/
 results/
-src/
 ablation/
 logs/
 *.log
+.env
+.env.*
+!.env.example
 
-# ============================
-# Finetuning ignore rules
-# ============================
-
-# Ignore ANY directory under finetuning/ starting with checkpoint
-**/finetuning/**/checkpoint*/
-
-# Ignore ANY directory under finetuning/ starting with finetuned
-**/finetuning/**/finetuned*/
-
-# Ignore ANY directory under finetuning/ ending with "data"
-**/finetuning/**/*data/
-
-# Ignore this explicit model folder
-utils/finetuning/finetune_ner/RoBERTa-large-PM-M3-Voc/
-utils/finetuning/finetune_ner/output_eval
-utils/finetuning/finetune_ner
+/models/
+/source/
+/Parser/
 
 # ============================
-# ElasticSearch artifacts
+# Local working dirs (large data stores / uncommitted code)
 # ============================
-elasticsearch/sif/
-elasticsearch/data1/
-elasticsearch/data/
-elasticsearch/logs/
-elasticsearch/sif/*.sif
+/elasticsearch/
+/utils/
 
 # ============================
-# Parser artifacts
+# Generated indexing state
 # ============================
-models/
-source/Parser/resources/
-source/Parser/models/
-source/Parser/input/
-source/Parser/output/
-Parser/logs/
+*.lance
+*.lancedb
 
 # ============================
 # Python cache
 # ============================
 __pycache__/
 *.py[cod]
+.pytest_cache/
+.ruff_cache/
+.venv/
+build/
+dist/
+*.egg-info/
 
 # ============================
 # Temporary / cache files
@@ -58,6 +45,8 @@ __pycache__/
 *.bak
 *.swp
 *.slurm
+scripts/*.sh
 **/tmp/
 .DS_Store
 Thumbs.db
+site/