From 61a7abf9e4e1b4843e92215a5e646298ab31577a Mon Sep 17 00:00:00 2001 From: Daniel Ecer Date: Thu, 26 Mar 2026 17:40:41 +0000 Subject: [PATCH 01/10] Added mac-os-test GitHub Actions job --- .github/workflows/github-actions-ci.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/.github/workflows/github-actions-ci.yml b/.github/workflows/github-actions-ci.yml index a7bda7c7..a6621c81 100644 --- a/.github/workflows/github-actions-ci.yml +++ b/.github/workflows/github-actions-ci.yml @@ -111,6 +111,27 @@ jobs: path: ./dist-export/dist/ + macos-test: + runs-on: macos-latest + steps: + - name: Check out repository code + uses: actions/checkout@v5 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + python-version: '3.11' + + - name: Install poppler (required by pdf2image) + run: brew install poppler + + - name: Install dependencies + run: uv sync --frozen --group dev --extra delft + + - name: Run pytest + run: uv run python -m pytest -p no:cacheprovider + + testpypi-publish: if: github.ref == 'refs/heads/main' needs: ["build-and-test"] From 36caee95145656a7ca7058f68619818f6d76e418 Mon Sep 17 00:00:00 2001 From: Daniel Ecer Date: Thu, 26 Mar 2026 17:49:25 +0000 Subject: [PATCH 02/10] ignore cv-model tests for now when running macos test --- .github/workflows/github-actions-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/github-actions-ci.yml b/.github/workflows/github-actions-ci.yml index a6621c81..69475db2 100644 --- a/.github/workflows/github-actions-ci.yml +++ b/.github/workflows/github-actions-ci.yml @@ -129,7 +129,7 @@ jobs: run: uv sync --frozen --group dev --extra delft - name: Run pytest - run: uv run python -m pytest -p no:cacheprovider + run: uv run python -m pytest -p no:cacheprovider --ignore=tests/cv_models testpypi-publish: From 9c8610c9c7d23d21be24cf634f5cc87871393db6 Mon Sep 17 00:00:00 2001 From: Daniel Ecer Date: Thu, 26 Mar 2026 18:01:27 +0000 Subject: [PATCH 03/10] Made 
pdfalto path optional. inferred based on the platform by default --- sciencebeam_parser/app/parser.py | 4 ++-- .../external/pdfalto/wrapper.py | 22 +++++++++++++++++++ .../resources/default_config/config.yml | 5 ++++- tests/external/pdfalto/wrapper_test.py | 5 +++-- 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/sciencebeam_parser/app/parser.py b/sciencebeam_parser/app/parser.py index cfb610e9..9e2ec43a 100644 --- a/sciencebeam_parser/app/parser.py +++ b/sciencebeam_parser/app/parser.py @@ -14,7 +14,7 @@ from sciencebeam_parser.app.context import AppContext from sciencebeam_parser.config.config import AppConfig, get_download_dir -from sciencebeam_parser.external.pdfalto.wrapper import PdfAltoWrapper +from sciencebeam_parser.external.pdfalto.wrapper import PdfAltoWrapper, get_default_pdfalto_url from sciencebeam_parser.external.pdfalto.parser import parse_alto_root from sciencebeam_parser.external.wapiti.wrapper import LazyWapitiBinaryWrapper from sciencebeam_parser.lookup.loader import load_lookup_from_config @@ -177,7 +177,7 @@ def __init__(self, config: AppConfig): self.pdfalto_wrapper = PdfAltoWrapper( download_with_zip_path_support( self.download_manager, - config['pdfalto']['path'] + config['pdfalto'].get('path') or get_default_pdfalto_url() ) ) self.pdfalto_wrapper.ensure_executable() diff --git a/sciencebeam_parser/external/pdfalto/wrapper.py b/sciencebeam_parser/external/pdfalto/wrapper.py index 42bb8c52..6283915c 100644 --- a/sciencebeam_parser/external/pdfalto/wrapper.py +++ b/sciencebeam_parser/external/pdfalto/wrapper.py @@ -1,6 +1,8 @@ import logging import os +import platform import stat +import sys from typing import Optional from sciencebeam_parser.utils.background_process import exec_with_logging @@ -9,6 +11,26 @@ LOGGER = logging.getLogger(__name__) +PDFALTO_VERSION = 'v0.6.0' +_PDFALTO_BASE_URL = ( + f'https://github.com/kermitt2/pdfalto/releases/download/{PDFALTO_VERSION}' +) + + +def get_default_pdfalto_url() -> str: + machine = 
platform.machine().lower() + if sys.platform == 'darwin': + os_name = 'mac' + elif sys.platform.startswith('linux'): + os_name = 'linux' + else: + raise RuntimeError(f'Unsupported platform: {sys.platform!r}') + arch = 'arm64' if machine in ('arm64', 'aarch64') else '64' + zip_name = f'pdfalto-bin-{os_name}-{arch}.zip' + internal_path = f'pdfalto/{os_name}/{arch}/pdfalto' + return f'{_PDFALTO_BASE_URL}/{zip_name}!/{internal_path}' + + class PdfAltoWrapper: def __init__(self, binary_path: str): self.binary_path = binary_path diff --git a/sciencebeam_parser/resources/default_config/config.yml b/sciencebeam_parser/resources/default_config/config.yml index a96a90c9..c787cc90 100644 --- a/sciencebeam_parser/resources/default_config/config.yml +++ b/sciencebeam_parser/resources/default_config/config.yml @@ -43,7 +43,10 @@ logging: download_dir: '~/.cache/sciencebeam-parser/downloads' pdfalto: - path: https://github.com/kermitt2/pdfalto/releases/download/v0.6.0/pdfalto-bin-linux-64.zip!/pdfalto/linux/64/pdfalto + # path is auto-detected based on the current platform if not specified + # (override with an explicit URL if needed, e.g.:) + # path: https://github.com/kermitt2/pdfalto/releases/download/v0.6.0/pdfalto-bin-linux-64.zip!/pdfalto/linux/64/pdfalto + path: wapiti: install_source: 'https://github.com/kermitt2/Wapiti/archive/a9c25d2bcccd60f1a54a7019689bd5229e866f00.tar.gz' xslt: diff --git a/tests/external/pdfalto/wrapper_test.py b/tests/external/pdfalto/wrapper_test.py index a39eccae..b546c32f 100644 --- a/tests/external/pdfalto/wrapper_test.py +++ b/tests/external/pdfalto/wrapper_test.py @@ -6,7 +6,8 @@ from sciencebeam_parser.config.config import get_download_dir from sciencebeam_parser.external.pdfalto.wrapper import ( - PdfAltoWrapper + PdfAltoWrapper, + get_default_pdfalto_url ) from sciencebeam_parser.utils.download import download_with_zip_path_support @@ -22,7 +23,7 @@ def _pdfalto_wrapper(sciencebeam_parser_config: dict) -> PdfAltoWrapper: pdfalto_wrapper = 
PdfAltoWrapper( download_with_zip_path_support( download_manager, - sciencebeam_parser_config['pdfalto']['path'] + sciencebeam_parser_config['pdfalto'].get('path') or get_default_pdfalto_url() ) ) pdfalto_wrapper.ensure_executable() From 44f04b1c58c66eccbbb425643c574b1130ec4f94 Mon Sep 17 00:00:00 2001 From: Daniel Ecer Date: Thu, 26 Mar 2026 18:08:59 +0000 Subject: [PATCH 04/10] run on multiple mac platforms --- .github/workflows/github-actions-ci.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/github-actions-ci.yml b/.github/workflows/github-actions-ci.yml index 69475db2..fd768d99 100644 --- a/.github/workflows/github-actions-ci.yml +++ b/.github/workflows/github-actions-ci.yml @@ -112,7 +112,12 @@ jobs: macos-test: - runs-on: macos-latest + strategy: + matrix: + os: + - macos-13 # Intel x86_64 + - macos-latest # Apple Silicon arm64 + runs-on: ${{ matrix.os }} steps: - name: Check out repository code uses: actions/checkout@v5 From d1eeae124e233e7ad0d1c3aba81e93bcf9c80a09 Mon Sep 17 00:00:00 2001 From: Daniel Ecer Date: Thu, 26 Mar 2026 18:14:38 +0000 Subject: [PATCH 05/10] mac-13 no longer supported, test linux on arm instead --- .github/workflows/github-actions-ci.yml | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/github-actions-ci.yml b/.github/workflows/github-actions-ci.yml index fd768d99..9caeefc0 100644 --- a/.github/workflows/github-actions-ci.yml +++ b/.github/workflows/github-actions-ci.yml @@ -111,13 +111,15 @@ jobs: path: ./dist-export/dist/ - macos-test: + native-test: strategy: matrix: - os: - - macos-13 # Intel x86_64 - - macos-latest # Apple Silicon arm64 - runs-on: ${{ matrix.os }} + include: + - runs-on: macos-latest + install-poppler: brew install poppler + - runs-on: ubuntu-24.04-arm + install-poppler: sudo apt-get install -y poppler-utils + runs-on: ${{ matrix.runs-on }} steps: - name: Check out repository code uses: actions/checkout@v5 @@ -128,7
+130,7 @@ jobs: python-version: '3.11' - name: Install poppler (required by pdf2image) - run: brew install poppler + run: ${{ matrix.install-poppler }} - name: Install dependencies run: uv sync --frozen --group dev --extra delft From 0122d6f8179d8bae4280d3a9a2ee97ad66acc6d3 Mon Sep 17 00:00:00 2001 From: Daniel Ecer Date: Thu, 26 Mar 2026 18:20:33 +0000 Subject: [PATCH 06/10] skip additional delft dependencies without arm package for native test --- .github/workflows/github-actions-ci.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/github-actions-ci.yml b/.github/workflows/github-actions-ci.yml index 9caeefc0..2b7c92d9 100644 --- a/.github/workflows/github-actions-ci.yml +++ b/.github/workflows/github-actions-ci.yml @@ -134,6 +134,13 @@ jobs: - name: Install dependencies run: uv sync --frozen --group dev --extra delft + if: runner.os != 'Linux' || runner.arch != 'ARM64' + + - name: Install dependencies (Linux arm64 - skip delft extras with no arm64 wheels) + run: | + uv sync --frozen --group dev + uv pip install "sciencebeam-trainer-delft>=0.0.36" + if: runner.os == 'Linux' && runner.arch == 'ARM64' - name: Run pytest run: uv run python -m pytest -p no:cacheprovider --ignore=tests/cv_models From 23fe61cd5d856f447ebdd7728b3f75726ea3a723 Mon Sep 17 00:00:00 2001 From: Daniel Ecer Date: Thu, 26 Mar 2026 18:22:35 +0000 Subject: [PATCH 07/10] Also install setuptools for now --- .github/workflows/github-actions-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/github-actions-ci.yml b/.github/workflows/github-actions-ci.yml index 2b7c92d9..0ba28b83 100644 --- a/.github/workflows/github-actions-ci.yml +++ b/.github/workflows/github-actions-ci.yml @@ -139,7 +139,7 @@ jobs: - name: Install dependencies (Linux arm64 - skip delft extras with no arm64 wheels) run: | uv sync --frozen --group dev - uv pip install "sciencebeam-trainer-delft>=0.0.36" + uv pip install "sciencebeam-trainer-delft>=0.0.36" 
setuptools if: runner.os == 'Linux' && runner.arch == 'ARM64' - name: Run pytest From 9d83ba9ace933cd693ad9c0d33894b7a3aea65fb Mon Sep 17 00:00:00 2001 From: Daniel Ecer Date: Thu, 26 Mar 2026 18:35:28 +0000 Subject: [PATCH 08/10] Also ignore transformers tests --- .github/workflows/github-actions-ci.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/github-actions-ci.yml b/.github/workflows/github-actions-ci.yml index 0ba28b83..79f594b7 100644 --- a/.github/workflows/github-actions-ci.yml +++ b/.github/workflows/github-actions-ci.yml @@ -117,8 +117,10 @@ jobs: include: - runs-on: macos-latest install-poppler: brew install poppler + pytest-ignore-args: --ignore=tests/cv_models - runs-on: ubuntu-24.04-arm install-poppler: sudo apt-get install -y poppler-utils + pytest-ignore-args: --ignore=tests/cv_models --ignore=tests/transformers runs-on: ${{ matrix.runs-on }} steps: - name: Check out repository code @@ -139,11 +141,11 @@ jobs: - name: Install dependencies (Linux arm64 - skip delft extras with no arm64 wheels) run: | uv sync --frozen --group dev - uv pip install "sciencebeam-trainer-delft>=0.0.36" setuptools + uv pip install "sciencebeam-trainer-delft>=0.0.36" if: runner.os == 'Linux' && runner.arch == 'ARM64' - name: Run pytest - run: uv run python -m pytest -p no:cacheprovider --ignore=tests/cv_models + run: uv run python -m pytest -p no:cacheprovider ${{ matrix.pytest-ignore-args }} testpypi-publish: From 9f104ce7728ac59b105f53652d0763468d66147d Mon Sep 17 00:00:00 2001 From: Daniel Ecer Date: Thu, 26 Mar 2026 18:53:56 +0000 Subject: [PATCH 09/10] Run less tests --- .github/workflows/github-actions-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/github-actions-ci.yml b/.github/workflows/github-actions-ci.yml index 79f594b7..fa5838f5 100644 --- a/.github/workflows/github-actions-ci.yml +++ b/.github/workflows/github-actions-ci.yml @@ -120,7 +120,7 @@ jobs: pytest-ignore-args: 
--ignore=tests/cv_models - runs-on: ubuntu-24.04-arm install-poppler: sudo apt-get install -y poppler-utils - pytest-ignore-args: --ignore=tests/cv_models --ignore=tests/transformers + pytest-ignore-args: tests/external/pdfalto tests/document tests/utils tests/config tests/lookup runs-on: ${{ matrix.runs-on }} steps: - name: Check out repository code From 0d2da9fa4ceb7fcd04183fc19f6dab404a6ed66c Mon Sep 17 00:00:00 2001 From: Daniel Ecer Date: Fri, 27 Mar 2026 16:01:16 +0000 Subject: [PATCH 10/10] also avoid pdfalto test in ubuntu arm due to TF dependency --- .github/workflows/github-actions-ci.yml | 2 +- tests/external/pdfalto/url_test.py | 44 +++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 tests/external/pdfalto/url_test.py diff --git a/.github/workflows/github-actions-ci.yml b/.github/workflows/github-actions-ci.yml index fa5838f5..2c851668 100644 --- a/.github/workflows/github-actions-ci.yml +++ b/.github/workflows/github-actions-ci.yml @@ -120,7 +120,7 @@ jobs: pytest-ignore-args: --ignore=tests/cv_models - runs-on: ubuntu-24.04-arm install-poppler: sudo apt-get install -y poppler-utils - pytest-ignore-args: tests/external/pdfalto tests/document tests/utils tests/config tests/lookup + pytest-ignore-args: tests/external/pdfalto/url_test.py tests/external/pdfalto/parser_test.py tests/document tests/utils tests/config tests/lookup runs-on: ${{ matrix.runs-on }} steps: - name: Check out repository code diff --git a/tests/external/pdfalto/url_test.py b/tests/external/pdfalto/url_test.py new file mode 100644 index 00000000..9c925df7 --- /dev/null +++ b/tests/external/pdfalto/url_test.py @@ -0,0 +1,44 @@ +from unittest.mock import patch + +from sciencebeam_parser.external.pdfalto.wrapper import ( + PDFALTO_VERSION, + get_default_pdfalto_url +) + + +class TestGetDefaultPdfaltoUrl: + def test_linux_x86_64(self): + with patch('sys.platform', 'linux'), \ + patch('platform.machine', return_value='x86_64'): + url = 
get_default_pdfalto_url() + assert url == ( + f'https://github.com/kermitt2/pdfalto/releases/download/{PDFALTO_VERSION}' + f'/pdfalto-bin-linux-64.zip!/pdfalto/linux/64/pdfalto' + ) + + def test_linux_aarch64(self): + with patch('sys.platform', 'linux'), \ + patch('platform.machine', return_value='aarch64'): + url = get_default_pdfalto_url() + assert url == ( + f'https://github.com/kermitt2/pdfalto/releases/download/{PDFALTO_VERSION}' + f'/pdfalto-bin-linux-arm64.zip!/pdfalto/linux/arm64/pdfalto' + ) + + def test_macos_x86_64(self): + with patch('sys.platform', 'darwin'), \ + patch('platform.machine', return_value='x86_64'): + url = get_default_pdfalto_url() + assert url == ( + f'https://github.com/kermitt2/pdfalto/releases/download/{PDFALTO_VERSION}' + f'/pdfalto-bin-mac-64.zip!/pdfalto/mac/64/pdfalto' + ) + + def test_macos_arm64(self): + with patch('sys.platform', 'darwin'), \ + patch('platform.machine', return_value='arm64'): + url = get_default_pdfalto_url() + assert url == ( + f'https://github.com/kermitt2/pdfalto/releases/download/{PDFALTO_VERSION}' + f'/pdfalto-bin-mac-arm64.zip!/pdfalto/mac/arm64/pdfalto' + )