From e3ff262a7bfab1a6fcdca3b1d8aa1e8e9ca8fe68 Mon Sep 17 00:00:00 2001 From: Tom Johnell Date: Thu, 12 Mar 2026 22:15:36 -0500 Subject: [PATCH 1/7] Add GitHub Actions release build workflow --- .github/workflows/release-build.yml | 157 +++++++++++++++ README.md | 16 ++ bin/lib.sh | 30 +-- docs/release_build.md | 25 ++- tooling/ci/build_release_host.sh | 276 +++++++++++++++++++++++++++ tooling/ci/prefilter_stream.sh | 92 +++++++++ tooling/ci/publish_release.sh | 96 ++++++++++ tooling/ci/stream_source_pbf.py | 78 ++++++++ tooling/export_release.py | 68 ++++++- tooling/tests/test_export_release.py | 78 ++++++++ tooling/validate_repo.sh | 15 +- 11 files changed, 907 insertions(+), 24 deletions(-) create mode 100644 .github/workflows/release-build.yml create mode 100755 tooling/ci/build_release_host.sh create mode 100755 tooling/ci/prefilter_stream.sh create mode 100755 tooling/ci/publish_release.sh create mode 100755 tooling/ci/stream_source_pbf.py diff --git a/.github/workflows/release-build.yml b/.github/workflows/release-build.yml new file mode 100644 index 0000000..73cffd5 --- /dev/null +++ b/.github/workflows/release-build.yml @@ -0,0 +1,157 @@ +name: Release Build + +on: + workflow_dispatch: + inputs: + release_id: + description: "Optional release tag. Defaults to a dated GitHub Actions tag." + required: false + type: string + source_url: + description: "Source U.S. PBF URL" + required: false + default: "https://download.geofabrik.de/north-america/us-latest.osm.pbf" + type: string + publish: + description: "Publish the built assets as a GitHub release" + required: false + default: false + type: boolean + +permissions: + contents: write + +concurrency: + group: release-build-${{ github.ref }} + cancel-in-progress: false + +jobs: + prefilter: + runs-on: ubuntu-latest + timeout-minutes: 30 + outputs: + release_id: ${{ steps.meta.outputs.release_id }} + source_url: ${{ steps.meta.outputs.source_url }} + filtered_filename: ${{ steps.meta.outputs.filtered_filename }} + steps: + - uses: actions/checkout@v4 + + - name: Resolve release metadata + id: meta + shell: bash + run: | + release_input='${{ inputs.release_id }}' + if [[ -n "$release_input" ]]; then + release_id="$release_input" + else + release_id="release-$(date -u +%F)-gha-${GITHUB_RUN_NUMBER}" + fi + echo "release_id=$release_id" >> "$GITHUB_OUTPUT" + echo "source_url=${{ inputs.source_url }}" >> "$GITHUB_OUTPUT" + echo "filtered_filename=us-latest.canonical-filtered.osm.pbf" >> "$GITHUB_OUTPUT" + + - name: Show disk budget + run: df -h . + + - name: Install prefilter dependencies + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends osmium-tool + + - name: Stream and filter the source PBF + run: | + mkdir -p ci-inputs + tooling/ci/prefilter_stream.sh \ + --source-url "${{ steps.meta.outputs.source_url }}" \ + --output-pbf "$PWD/ci-inputs/${{ steps.meta.outputs.filtered_filename }}" \ + --source-metadata-file "$PWD/ci-inputs/source-pbf-metadata.json" + + - name: Upload filtered build inputs + uses: actions/upload-artifact@v4 + with: + name: prefiltered-${{ steps.meta.outputs.release_id }} + path: | + ci-inputs/${{ steps.meta.outputs.filtered_filename }} + ci-inputs/source-pbf-metadata.json + retention-days: 1 + compression-level: 0 + + build-release: + needs: prefilter + runs-on: ubuntu-latest + timeout-minutes: 45 + steps: + - uses: actions/checkout@v4 + + - uses: dtolnay/rust-toolchain@stable + + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Show disk budget + run: df -h . + + - name: Install build dependencies + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends osm2pgsql postgresql-client + + - name: Install tooling Python dependencies + run: | + python -m venv "$RUNNER_TEMP/openinterstate-venv" + "$RUNNER_TEMP/openinterstate-venv/bin/pip" install --upgrade pip + "$RUNNER_TEMP/openinterstate-venv/bin/pip" install -r tooling/requirements.txt + echo "$RUNNER_TEMP/openinterstate-venv/bin" >> "$GITHUB_PATH" + + - name: Download filtered build inputs + uses: actions/download-artifact@v4 + with: + name: prefiltered-${{ needs.prefilter.outputs.release_id }} + path: ci-inputs + + - name: Build release from filtered artifact + env: + CARGO_TARGET_DIR: ${{ runner.temp }}/cargo-target + run: | + mkdir -p ci-artifacts/releases + tooling/ci/build_release_host.sh \ + --release-id "${{ needs.prefilter.outputs.release_id }}" \ + --filtered-pbf-file "$PWD/ci-inputs/${{ needs.prefilter.outputs.filtered_filename }}" \ + --source-pbf-metadata-file "$PWD/ci-inputs/source-pbf-metadata.json" \ + --source-url "${{ needs.prefilter.outputs.source_url }}" \ + --output-root "$PWD/ci-artifacts/releases" \ + --work-dir "$RUNNER_TEMP/openinterstate-release-build" + + - name: Upload release bundle + uses: actions/upload-artifact@v4 + with: + name: release-${{ needs.prefilter.outputs.release_id }} + path: | + ci-artifacts/releases/${{ needs.prefilter.outputs.release_id }} + ci-artifacts/releases/openinterstate-${{ needs.prefilter.outputs.release_id }}.tar.gz + retention-days: 7 + compression-level: 0 + + publish: + if: ${{ inputs.publish }} + needs: [prefilter, build-release] + runs-on: ubuntu-latest + timeout-minutes: 15 + steps: + - uses: actions/checkout@v4 + + - name: Download release bundle + uses: actions/download-artifact@v4 + with: + name: release-${{ needs.prefilter.outputs.release_id }} + path: release-output + + - name: Publish GitHub release + env: + GH_TOKEN: ${{ github.token }} + run: | + tooling/ci/publish_release.sh \ + --release-id "${{ needs.prefilter.outputs.release_id }}" \ + --release-dir "$PWD/release-output/${{ needs.prefilter.outputs.release_id }}" \ + --archive-file "$PWD/release-output/openinterstate-${{ needs.prefilter.outputs.release_id }}.tar.gz" diff --git a/README.md b/README.md index f051a82..6bce6e8 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,22 @@ motorway/trunk road context and POI data needed for Interstate derivation, and the downstream Rust graph builders stay focused on Interstate-labeled corridors instead of constructing a much broader national highway graph. +## GitHub Actions Release Build + +The repo now carries a manual GitHub Actions release workflow at +`.github/workflows/release-build.yml`. + +That workflow is shaped to fit standard public GitHub-hosted runners: + +1. stream the raw `us-latest.osm.pbf` directly into the canonical filter +2. upload only the filtered `~160 MB` import PBF plus source metadata +3. rebuild PostGIS, derive tables, and export the release from that artifact +4. optionally publish the archive, manifest, checksums, and source lineage to GitHub + +The key constraint is that the raw source PBF is never persisted on the runner, +which keeps the downstream build small enough to stay inside the standard +runner disk budget. + ## Repo Map - `bin/`: the local command-line entrypoint diff --git a/bin/lib.sh b/bin/lib.sh index 32433f8..5a08463 100755 --- a/bin/lib.sh +++ b/bin/lib.sh @@ -405,6 +405,22 @@ oi_download_pbf() { printf '%s\n' "$resolved_output" } +oi_canonical_filter_args() { + cat <<'EOF' +n/highway=motorway_junction +n/amenity=fuel,restaurant,fast_food,cafe,toilets,charging_station +n/tourism=hotel,motel,guest_house +n/shop=gas +n/cuisine +n/highway=rest_area,services +w/highway=motorway,motorway_link,trunk,trunk_link,rest_area,services +w/amenity=fuel,restaurant,fast_food,cafe,toilets,charging_station +w/tourism=hotel,motel,guest_house +w/shop=gas +w/cuisine +EOF +} + oi_filter_pbf() { local input_pbf="$1" local output_pbf="$2" @@ -421,19 +437,7 @@ oi_filter_pbf() { fi mkdir -p "$(dirname "$output_pbf")" - filter_args=( - n/highway=motorway_junction - n/amenity=fuel,restaurant,fast_food,cafe,toilets,charging_station - n/tourism=hotel,motel,guest_house - n/shop=gas - n/cuisine - n/highway=rest_area,services - w/highway=motorway,motorway_link,trunk,trunk_link,rest_area,services - w/amenity=fuel,restaurant,fast_food,cafe,toilets,charging_station - w/tourism=hotel,motel,guest_house - w/shop=gas - w/cuisine - ) + mapfile -t filter_args < <(oi_canonical_filter_args) state_file="$(oi_state_file filter "$output_pbf")" expected_signature="$( { diff --git a/docs/release_build.md b/docs/release_build.md index de80b9a..648cda4 100644 --- a/docs/release_build.md +++ b/docs/release_build.md @@ -46,6 +46,22 @@ Cargo and runner caches now default under the managed data root as well, so a goose-drive workspace keeps both data artifacts and Rust build cache off the main disk. +## GitHub Actions Workflow + +The repo also includes a manual GitHub Actions workflow at +`.github/workflows/release-build.yml`. + +That workflow is designed for standard public GitHub-hosted runners: + +1. stream the raw U.S. PBF directly into `osmium tags-filter` +2. persist only the filtered canonical import PBF plus source metadata as an artifact +3. rebuild PostGIS, derive product tables, and export the release from that filtered artifact +4. optionally publish the archive and companion metadata files to GitHub Releases + +This avoids storing the full raw `us-latest.osm.pbf` on the runner, which is +the difference between fitting and not fitting inside the default runner disk +budget. + ## Environment Setup The default local workflow works without any env file and stores working data in @@ -132,7 +148,7 @@ GeoParquet packaging can follow later. Every release now records: -1. the raw source PBF path, size, modified time, and SHA-256 +1. the raw source PBF path or streamed source URL, plus size, modified time, and SHA-256 2. the imported canonical filtered PBF path, size, modified time, and SHA-256 3. the source download URL when provided 4. the derivation chain used to produce the release @@ -140,10 +156,13 @@ Every release now records: This lineage is published both inside `manifest.json` and as the standalone asset `source_lineage.json`. +When the GitHub Actions workflow streams the raw source instead of storing it on +disk, `source_pbf.path` is recorded as the downloaded source URL. + ## Published Standalone Release The current standalone release is published as: -1. GitHub release tag: `release-2026-03-12-goose-rerun-branchfix` -2. archive: `openinterstate-release-2026-03-12-goose-rerun-branchfix.tar.gz` +1. GitHub release tag: `release-2026-03-12-coldpath` +2. archive: `openinterstate-release-2026-03-12-coldpath.tar.gz` 3. companion files: `manifest.json`, `source_lineage.json`, and `checksums.txt` diff --git a/tooling/ci/build_release_host.sh b/tooling/ci/build_release_host.sh new file mode 100755 index 0000000..698d929 --- /dev/null +++ b/tooling/ci/build_release_host.sh @@ -0,0 +1,276 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" + +usage() { + cat <<'EOF' +Usage: + tooling/ci/build_release_host.sh \ + --release-id release-YYYY-MM-DD \ + --filtered-pbf-file /abs/path/us-latest.canonical-filtered.osm.pbf \ + --source-pbf-metadata-file /abs/path/source-pbf-metadata.json \ + --output-root /abs/path/release-output \ + --work-dir /abs/path/workdir \ + [--source-url URL] +EOF +} + +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >&2 +} + +die() { + echo "ERROR: $*" >&2 + exit 1 +} + +require_cmd() { + local cmd="$1" + command -v "$cmd" >/dev/null 2>&1 || die "missing required command: $cmd" +} + +free_space_gb() { + local path="$1" + df -Pk "$path" | awk 'NR == 2 { print int($4 / 1024 / 1024) }' +} + +require_free_space_gb() { + local path="$1" + local min_gb="$2" + local available_gb + available_gb="$(free_space_gb "$path")" + if (( available_gb < min_gb )); then + die "need at least ${min_gb}GB free at $path, found ${available_gb}GB" + fi +} + +wait_for_postgres() { + local attempts="${1:-90}" + local sleep_seconds="${2:-2}" + local attempt + + for (( attempt=1; attempt<=attempts; attempt+=1 )); do + if PGPASSWORD="$DB_PASSWORD" pg_isready \ + -h "$DB_HOST" \ + -p "$DB_PORT" \ + -U "$DB_USER" \ + -d "$DB_NAME" >/dev/null 2>&1; then + return 0 + fi + sleep "$sleep_seconds" + done + + return 1 +} + +RELEASE_ID="" +FILTERED_PBF_FILE="" +SOURCE_PBF_METADATA_FILE="" +SOURCE_URL="" +OUTPUT_ROOT="" +WORK_DIR="" +STATE_DIR="" + +DB_HOST="${OI_CI_DB_HOST:-127.0.0.1}" +DB_PORT="${OI_CI_DB_PORT:-55432}" +DB_NAME="${OI_CI_DB_NAME:-osm}" +DB_USER="${OI_CI_DB_USER:-osm}" +DB_PASSWORD="${OI_CI_DB_PASSWORD:-osm_dev}" +MIN_FREE_GB="${OI_CI_MIN_FREE_GB:-7}" +IMPORT_CACHE_MB="${OI_IMPORT_CACHE_MB:-2048}" +DB_CONTAINER_NAME="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --release-id) + RELEASE_ID="$2" + shift 2 + ;; + --filtered-pbf-file) + FILTERED_PBF_FILE="$2" + shift 2 + ;; + --source-pbf-metadata-file) + SOURCE_PBF_METADATA_FILE="$2" + shift 2 + ;; + --source-url) + SOURCE_URL="$2" + shift 2 + ;; + --output-root) + OUTPUT_ROOT="$2" + shift 2 + ;; + --work-dir) + WORK_DIR="$2" + shift 2 + ;; + --state-dir) + STATE_DIR="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + usage + die "unknown argument: $1" + ;; + esac +done + +[[ -n "$RELEASE_ID" ]] || die "--release-id is required" +[[ -n "$FILTERED_PBF_FILE" ]] || die "--filtered-pbf-file is required" +[[ -n "$SOURCE_PBF_METADATA_FILE" ]] || die "--source-pbf-metadata-file is required" +[[ -n "$OUTPUT_ROOT" ]] || die "--output-root is required" +[[ -n "$WORK_DIR" ]] || die "--work-dir is required" + +[[ -f "$FILTERED_PBF_FILE" ]] || die "filtered PBF not found: $FILTERED_PBF_FILE" +[[ -f "$SOURCE_PBF_METADATA_FILE" ]] || die "source metadata not found: $SOURCE_PBF_METADATA_FILE" + +STATE_DIR="${STATE_DIR:-$WORK_DIR/state}" +DATABASE_URL="postgres://${DB_USER}:${DB_PASSWORD}@${DB_HOST}:${DB_PORT}/${DB_NAME}" +POSTGRES_DATA_DIR="$WORK_DIR/postgres" +RELEASE_DIR="$OUTPUT_ROOT/$RELEASE_ID" +ARCHIVE_PATH="$OUTPUT_ROOT/openinterstate-$RELEASE_ID.tar.gz" + +require_cmd cargo +require_cmd docker +require_cmd osm2pgsql +require_cmd pg_isready +require_cmd psql +require_cmd python3 +require_cmd tar + +mkdir -p "$OUTPUT_ROOT" "$POSTGRES_DATA_DIR" "$STATE_DIR" +rm -rf "$RELEASE_DIR" "$ARCHIVE_PATH" + +cleanup() { + local exit_code=$? + if [[ -n "$DB_CONTAINER_NAME" ]]; then + if (( exit_code != 0 )); then + docker logs "$DB_CONTAINER_NAME" >&2 || true + fi + docker rm -f "$DB_CONTAINER_NAME" >/dev/null 2>&1 || true + fi +} +trap cleanup EXIT + +log "Free space before release build" +df -h "$WORK_DIR" >&2 +require_free_space_gb "$WORK_DIR" "$MIN_FREE_GB" + +DB_CONTAINER_NAME="openinterstate-ci-db-${RANDOM}-$$" +log "Starting PostGIS container $DB_CONTAINER_NAME" +docker run \ + --detach \ + --rm \ + --name "$DB_CONTAINER_NAME" \ + --shm-size 2g \ + -e POSTGRES_DB="$DB_NAME" \ + -e POSTGRES_USER="$DB_USER" \ + -e POSTGRES_PASSWORD="$DB_PASSWORD" \ + -p "${DB_PORT}:5432" \ + -v "$POSTGRES_DATA_DIR:/var/lib/postgresql/data" \ + postgis/postgis:16-3.4 \ + postgres \ + -c shared_buffers=512MB \ + -c effective_cache_size=2GB \ + -c maintenance_work_mem=512MB \ + -c work_mem=32MB \ + -c max_wal_size=32GB \ + -c min_wal_size=8GB \ + -c checkpoint_timeout=60min \ + -c checkpoint_completion_target=0.9 \ + -c wal_compression=on \ + -c wal_level=minimal \ + -c max_wal_senders=0 \ + -c archive_mode=off \ + -c synchronous_commit=off \ + -c fsync=off \ + -c full_page_writes=off \ + -c autovacuum=off \ + -c effective_io_concurrency=200 \ + -c random_page_cost=1.1 \ + >/dev/null + +wait_for_postgres || die "PostGIS container did not become ready" + +log "Bootstrapping database schema" +PGPASSWORD="$DB_PASSWORD" psql \ + -h "$DB_HOST" \ + -p "$DB_PORT" \ + -U "$DB_USER" \ + -d "$DB_NAME" \ + -v ON_ERROR_STOP=1 \ + -c "CREATE EXTENSION IF NOT EXISTS postgis;" +PGPASSWORD="$DB_PASSWORD" psql \ + -h "$DB_HOST" \ + -p "$DB_PORT" \ + -U "$DB_USER" \ + -d "$DB_NAME" \ + -v ON_ERROR_STOP=1 \ + -f "$REPO_ROOT/schema/bootstrap.sql" + +log "Importing canonical filtered PBF" +PGPASSWORD="$DB_PASSWORD" osm2pgsql \ + --slim \ + --create \ + --output=flex \ + --style="$REPO_ROOT/schema/osm2pgsql/openinterstate.lua" \ + --database="$DB_NAME" \ + --host="$DB_HOST" \ + --port="$DB_PORT" \ + --username="$DB_USER" \ + --cache="$IMPORT_CACHE_MB" \ + "$FILTERED_PBF_FILE" + +log "Applying deterministic SQL projection" +PGPASSWORD="$DB_PASSWORD" psql \ + -h "$DB_HOST" \ + -p "$DB_PORT" \ + -U "$DB_USER" \ + -d "$DB_NAME" \ + -v ON_ERROR_STOP=1 \ + -f "$REPO_ROOT/schema/derive.sql" + +log "Building graph, corridors, and reference routes" +cargo run --locked --release -p openinterstate-derive -- \ + --database-url "$DATABASE_URL" \ + all + +log "Exporting release artifacts" +EXPORT_ARGS=( + python3 + "$REPO_ROOT/tooling/export_release.py" + --database-url "$DATABASE_URL" + --release-id "$RELEASE_ID" + --output-dir "$RELEASE_DIR" + --state-dir "$STATE_DIR" + --source-pbf-metadata-file "$SOURCE_PBF_METADATA_FILE" + --import-pbf-file "$FILTERED_PBF_FILE" +) +if [[ -n "$SOURCE_URL" ]]; then + EXPORT_ARGS+=(--source-url "$SOURCE_URL") +fi +"${EXPORT_ARGS[@]}" + +find "$RELEASE_DIR" \ + \( -name '.DS_Store' -o -name '._*' \) \ + -type f \ + -delete + +log "Packaging release archive" +tar \ + --exclude='.DS_Store' \ + --exclude='._*' \ + -C "$OUTPUT_ROOT" \ + -czf "$ARCHIVE_PATH" \ + "$RELEASE_ID" + +log "Release build complete" +du -sh "$FILTERED_PBF_FILE" "$POSTGRES_DATA_DIR" "$RELEASE_DIR" "$ARCHIVE_PATH" >&2 diff --git a/tooling/ci/prefilter_stream.sh b/tooling/ci/prefilter_stream.sh new file mode 100755 index 0000000..24a8d85 --- /dev/null +++ b/tooling/ci/prefilter_stream.sh @@ -0,0 +1,92 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" + +usage() { + cat <<'EOF' +Usage: + tooling/ci/prefilter_stream.sh \ + --source-url URL \ + --output-pbf /abs/path/us-latest.canonical-filtered.osm.pbf \ + --source-metadata-file /abs/path/source-pbf-metadata.json +EOF +} + +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >&2 +} + +die() { + echo "ERROR: $*" >&2 + exit 1 +} + +require_cmd() { + local cmd="$1" + command -v "$cmd" >/dev/null 2>&1 || die "missing required command: $cmd" +} + +SOURCE_URL="" +OUTPUT_PBF="" +SOURCE_METADATA_FILE="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --source-url) + SOURCE_URL="$2" + shift 2 + ;; + --output-pbf) + OUTPUT_PBF="$2" + shift 2 + ;; + --source-metadata-file) + SOURCE_METADATA_FILE="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + usage + die "unknown argument: $1" + ;; + esac +done + +[[ -n "$SOURCE_URL" ]] || die "--source-url is required" +[[ -n "$OUTPUT_PBF" ]] || die "--output-pbf is required" +[[ -n "$SOURCE_METADATA_FILE" ]] || die "--source-metadata-file is required" + +require_cmd osmium +require_cmd python3 + +mkdir -p "$(dirname "$OUTPUT_PBF")" "$(dirname "$SOURCE_METADATA_FILE")" + +# shellcheck disable=SC1091 +source "$REPO_ROOT/bin/lib.sh" + +log "Free space before streamed prefilter" +df -h "$(dirname "$OUTPUT_PBF")" >&2 + +mapfile -t FILTER_ARGS < <(oi_canonical_filter_args) + +log "Streaming raw source PBF into canonical filter" +python3 "$REPO_ROOT/tooling/ci/stream_source_pbf.py" \ + --url "$SOURCE_URL" \ + --metadata-file "$SOURCE_METADATA_FILE" \ + | osmium tags-filter \ + -F pbf \ + - \ + "${FILTER_ARGS[@]}" \ + --overwrite \ + -o "$OUTPUT_PBF" + +[[ -s "$OUTPUT_PBF" ]] || die "filtered PBF is empty: $OUTPUT_PBF" +osmium fileinfo "$OUTPUT_PBF" >/dev/null + +log "Prefilter complete" +du -sh "$OUTPUT_PBF" "$SOURCE_METADATA_FILE" >&2 diff --git a/tooling/ci/publish_release.sh b/tooling/ci/publish_release.sh new file mode 100755 index 0000000..a956b06 --- /dev/null +++ b/tooling/ci/publish_release.sh @@ -0,0 +1,96 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'EOF' +Usage: + tooling/ci/publish_release.sh \ + --release-id release-YYYY-MM-DD \ + --release-dir /abs/path/release-YYYY-MM-DD \ + --archive-file /abs/path/openinterstate-release-YYYY-MM-DD.tar.gz \ + [--repo owner/name] +EOF +} + +die() { + echo "ERROR: $*" >&2 + exit 1 +} + +require_cmd() { + local cmd="$1" + command -v "$cmd" >/dev/null 2>&1 || die "missing required command: $cmd" +} + +RELEASE_ID="" +RELEASE_DIR="" +ARCHIVE_FILE="" +REPO="tldev/openinterstate" + +while [[ $# -gt 0 ]]; do + case "$1" in + --release-id) + RELEASE_ID="$2" + shift 2 + ;; + --release-dir) + RELEASE_DIR="$2" + shift 2 + ;; + --archive-file) + ARCHIVE_FILE="$2" + shift 2 + ;; + --repo) + REPO="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + usage + die "unknown argument: $1" + ;; + esac +done + +[[ -n "$RELEASE_ID" ]] || die "--release-id is required" +[[ -n "$RELEASE_DIR" ]] || die "--release-dir is required" +[[ -n "$ARCHIVE_FILE" ]] || die "--archive-file is required" + +require_cmd gh + +MANIFEST_PATH="$RELEASE_DIR/manifest.json" +CHECKSUMS_PATH="$RELEASE_DIR/checksums.txt" +SOURCE_LINEAGE_PATH="$RELEASE_DIR/source_lineage.json" + +for path in "$RELEASE_DIR" "$ARCHIVE_FILE" "$MANIFEST_PATH" "$CHECKSUMS_PATH" "$SOURCE_LINEAGE_PATH"; do + [[ -e "$path" ]] || die "missing publish artifact: $path" +done + +RELEASE_NOTES="Rebuilt from the raw U.S. OSM PBF using the standalone OpenInterstate pipeline. See manifest.json and source_lineage.json for raw-source and imported-filter lineage, including SHA-256 hashes." + +if gh release view "$RELEASE_ID" --repo "$REPO" >/dev/null 2>&1; then + gh release upload "$RELEASE_ID" \ + "$ARCHIVE_FILE" \ + "$MANIFEST_PATH" \ + "$CHECKSUMS_PATH" \ + "$SOURCE_LINEAGE_PATH" \ + --repo "$REPO" \ + --clobber + gh release edit "$RELEASE_ID" \ + --repo "$REPO" \ + --title "$RELEASE_ID" \ + --notes "$RELEASE_NOTES" +else + gh release create "$RELEASE_ID" \ + "$ARCHIVE_FILE" \ + "$MANIFEST_PATH" \ + "$CHECKSUMS_PATH" \ + "$SOURCE_LINEAGE_PATH" \ + --repo "$REPO" \ + --title "$RELEASE_ID" \ + --notes "$RELEASE_NOTES" +fi diff --git a/tooling/ci/stream_source_pbf.py b/tooling/ci/stream_source_pbf.py new file mode 100755 index 0000000..79d5a5b --- /dev/null +++ b/tooling/ci/stream_source_pbf.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import hashlib +import json +import sys +from datetime import datetime, timezone +from email.utils import parsedate_to_datetime +from pathlib import Path, PurePosixPath +from urllib.parse import urlparse +from urllib.request import Request, urlopen + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Stream a source PBF to stdout while recording source metadata." + ) + parser.add_argument("--url", required=True) + parser.add_argument("--metadata-file", required=True) + parser.add_argument("--chunk-size", type=int, default=1024 * 1024) + return parser.parse_args() + + +def source_filename(url: str) -> str: + candidate = PurePosixPath(urlparse(url).path).name + return candidate or "source.osm.pbf" + + +def isoformat_http_date(value: str | None) -> str: + if not value: + return datetime.now(timezone.utc).isoformat() + try: + parsed = parsedate_to_datetime(value) + except (TypeError, ValueError, IndexError, OverflowError): + return datetime.now(timezone.utc).isoformat() + if parsed.tzinfo is None: + parsed = parsed.replace(tzinfo=timezone.utc) + return parsed.astimezone(timezone.utc).isoformat() + + +def main() -> int: + args = parse_args() + request = Request(args.url, headers={"User-Agent": "openinterstate-ci/1"}) + digest = hashlib.sha256() + size_bytes = 0 + + try: + with urlopen(request) as response: + final_url = response.geturl() + modified_at = isoformat_http_date(response.headers.get("Last-Modified")) + while True: + chunk = response.read(args.chunk_size) + if not chunk: + break + digest.update(chunk) + size_bytes += len(chunk) + sys.stdout.buffer.write(chunk) + sys.stdout.buffer.flush() + except BrokenPipeError: + print("downstream consumer closed while streaming source PBF", file=sys.stderr) + return 1 + + metadata = { + "path": final_url, + "filename": source_filename(final_url), + "size_bytes": size_bytes, + "modified_at": modified_at, + "sha256": digest.hexdigest(), + } + metadata_path = Path(args.metadata_file) + metadata_path.parent.mkdir(parents=True, exist_ok=True) + metadata_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tooling/export_release.py b/tooling/export_release.py index ab320cd..b920c96 100755 --- a/tooling/export_release.py +++ b/tooling/export_release.py @@ -20,6 +20,7 @@ # letter-suffixed branches that are part of the Interstate system. INTERSTATE_FILTER = r"^(?:I-?[0-9]+|I-?35[EW]|I-?69[CEW])$" INTERSTATE_NAME_RE = re.compile(INTERSTATE_FILTER) +SHA256_RE = re.compile(r"^[0-9a-f]{64}$") @dataclass(frozen=True) @@ -34,16 +35,24 @@ def is_release_interstate_name(highway: str) -> bool: return bool(INTERSTATE_NAME_RE.fullmatch(highway.strip().upper())) -def parse_args() -> argparse.Namespace: +def parse_args(argv: list[str] | None = None) -> argparse.Namespace: parser = argparse.ArgumentParser(description="Export OpenInterstate v1 release artifacts.") parser.add_argument("--database-url", required=True) parser.add_argument("--release-id", required=True) parser.add_argument("--output-dir", required=True) parser.add_argument("--state-dir") - parser.add_argument("--source-pbf-file", required=True) + parser.add_argument("--source-pbf-file") + parser.add_argument("--source-pbf-metadata-file") parser.add_argument("--import-pbf-file") parser.add_argument("--source-url") - return parser.parse_args() + args = parser.parse_args(argv) + has_source_file = bool(args.source_pbf_file) + has_source_metadata = bool(args.source_pbf_metadata_file) + if has_source_file == has_source_metadata: + parser.error("exactly one of --source-pbf-file or --source-pbf-metadata-file is required") + if has_source_metadata and not args.import_pbf_file: + parser.error("--import-pbf-file is required when --source-pbf-metadata-file is used") + return args def ensure_dirs(output_dir: Path) -> tuple[Path, Path, Path]: @@ -139,6 +148,46 @@ def build_source_file_metadata( return metadata +def validate_source_file_metadata(raw: Any, label: str) -> dict[str, Any]: + if not isinstance(raw, dict): + raise ValueError(f"{label} metadata must be a JSON object") + + path = raw.get("path") + filename = raw.get("filename") + size_bytes = raw.get("size_bytes") + modified_at = raw.get("modified_at") + sha256 = raw.get("sha256") + + if not isinstance(path, str) or not path.strip(): + raise ValueError(f"{label} metadata must include a non-empty path") + if not isinstance(filename, str) or not filename.strip(): + raise ValueError(f"{label} metadata must include a non-empty filename") + if not isinstance(size_bytes, int) or size_bytes < 0: + raise ValueError(f"{label} metadata must include a non-negative integer size_bytes") + if not isinstance(modified_at, str) or not modified_at.strip(): + raise ValueError(f"{label} metadata must include a non-empty modified_at") + if not isinstance(sha256, str) or not SHA256_RE.fullmatch(sha256): + raise ValueError(f"{label} metadata must include a lowercase 64-character sha256") + + return { + "path": path, + "filename": filename, + "size_bytes": size_bytes, + "modified_at": modified_at, + "sha256": sha256, + } + + +def load_source_file_metadata(path: Path, label: str) -> dict[str, Any]: + try: + raw = json.loads(path.read_text(encoding="utf-8")) + except FileNotFoundError as exc: + raise ValueError(f"{label} metadata file not found: {path}") from exc + except json.JSONDecodeError as exc: + raise ValueError(f"{label} metadata file is not valid JSON: {path}") from exc + return validate_source_file_metadata(raw, label) + + def write_checksums( files: list[Path], output_path: Path, @@ -231,14 +280,21 @@ def main() -> None: output_dir = Path(args.output_dir).resolve() state_dir = Path(args.state_dir).resolve() if args.state_dir else None csv_dir, gpx_dir, examples_dir = ensure_dirs(output_dir) - source_pbf_path = Path(args.source_pbf_file).resolve() + source_pbf_path = Path(args.source_pbf_file).resolve() if args.source_pbf_file else None import_pbf_path = Path(args.import_pbf_file).resolve() if args.import_pbf_file else source_pbf_path hash_cache: dict[tuple[str, int, int], str] = {} + if source_pbf_path is not None: + source_pbf_metadata = build_source_file_metadata(source_pbf_path, state_dir, hash_cache) + else: + source_pbf_metadata = load_source_file_metadata(Path(args.source_pbf_metadata_file).resolve(), "source_pbf") + assert import_pbf_path is not None + import_pbf_metadata = build_source_file_metadata(import_pbf_path, state_dir, hash_cache) + source_lineage = { "source_url": args.source_url, - "source_pbf": build_source_file_metadata(source_pbf_path, state_dir, hash_cache), - "import_pbf": build_source_file_metadata(import_pbf_path, state_dir, hash_cache), + "source_pbf": source_pbf_metadata, + "import_pbf": import_pbf_metadata, "derivation": [ "osm2pgsql flex import via schema/osm2pgsql/openinterstate.lua", "schema/derive.sql", diff --git a/tooling/tests/test_export_release.py b/tooling/tests/test_export_release.py index e17e350..e62446c 100644 --- a/tooling/tests/test_export_release.py +++ b/tooling/tests/test_export_release.py @@ -1,5 +1,7 @@ import importlib.util +import json import sys +import tempfile import types import unittest from pathlib import Path @@ -32,5 +34,81 @@ def test_non_route_labels_are_excluded(self) -> None: self.assertFalse(MODULE.is_release_interstate_name(highway)) +class SourceFileMetadataTests(unittest.TestCase): + def test_parse_args_accepts_source_metadata_with_import_file(self) -> None: + args = MODULE.parse_args( + [ + "--database-url", + "postgres://db", + "--release-id", + "release-2026-03-12", + "--output-dir", + "/tmp/release", + "--source-pbf-metadata-file", + "/tmp/source.json", + "--import-pbf-file", + "/tmp/import.osm.pbf", + ] + ) + self.assertEqual(args.source_pbf_metadata_file, "/tmp/source.json") + self.assertEqual(args.import_pbf_file, "/tmp/import.osm.pbf") + + def test_parse_args_rejects_missing_source_locator(self) -> None: + with self.assertRaises(SystemExit): + MODULE.parse_args( + [ + "--database-url", + "postgres://db", + "--release-id", + "release-2026-03-12", + "--output-dir", + "/tmp/release", + ] + ) + + def test_parse_args_rejects_metadata_without_import_file(self) -> None: + with self.assertRaises(SystemExit): + MODULE.parse_args( + [ + "--database-url", + "postgres://db", + "--release-id", + "release-2026-03-12", + "--output-dir", + "/tmp/release", + "--source-pbf-metadata-file", + "/tmp/source.json", + ] + ) + + def test_load_source_file_metadata_accepts_streamed_source_locator(self) -> None: + metadata = { + "path": "https://download.geofabrik.de/north-america/us-latest.osm.pbf", + "filename": "us-latest.osm.pbf", + "size_bytes": 123, + "modified_at": "2026-03-12T00:00:00+00:00", + "sha256": "a" * 64, + } + with tempfile.TemporaryDirectory() as tmpdir: + metadata_path = Path(tmpdir) / "source.json" + metadata_path.write_text(json.dumps(metadata), encoding="utf-8") + loaded = MODULE.load_source_file_metadata(metadata_path, "source_pbf") + self.assertEqual(loaded, metadata) + + def test_load_source_file_metadata_rejects_bad_sha256(self) -> None: + metadata = { + "path": "streamed://us-latest.osm.pbf", + "filename": "us-latest.osm.pbf", + "size_bytes": 123, + "modified_at": "2026-03-12T00:00:00+00:00", + "sha256": "xyz", + } + with tempfile.TemporaryDirectory() as tmpdir: + metadata_path = Path(tmpdir) / "source.json" + metadata_path.write_text(json.dumps(metadata), encoding="utf-8") + with self.assertRaisesRegex(ValueError, "sha256"): + MODULE.load_source_file_metadata(metadata_path, "source_pbf") + + if __name__ == "__main__": unittest.main() diff --git a/tooling/validate_repo.sh b/tooling/validate_repo.sh index 99cbd03..1aa0927 100755 --- a/tooling/validate_repo.sh +++ b/tooling/validate_repo.sh @@ -21,12 +21,17 @@ required_files=( "examples/duckdb/example_queries.sql" ".env.example" "compose.yaml" + ".github/workflows/release-build.yml" "bin/openinterstate" "bin/lib.sh" "docker/runner/Dockerfile" "schema/bootstrap.sql" "schema/derive.sql" "schema/osm2pgsql/openinterstate.lua" + "tooling/ci/stream_source_pbf.py" + "tooling/ci/prefilter_stream.sh" + "tooling/ci/build_release_host.sh" + "tooling/ci/publish_release.sh" "tooling/export_release.py" "tooling/requirements.txt" "tooling/validate_repo.sh" @@ -38,7 +43,13 @@ for file in "${required_files[@]}"; do test -f "$file" done -bash -n bin/openinterstate bin/lib.sh tooling/validate_repo.sh -python3 -m py_compile tooling/export_release.py +bash -n \ + bin/openinterstate \ + bin/lib.sh \ + tooling/validate_repo.sh \ + tooling/ci/prefilter_stream.sh \ + tooling/ci/build_release_host.sh \ + tooling/ci/publish_release.sh +python3 -m py_compile tooling/export_release.py tooling/ci/stream_source_pbf.py cargo fmt --all --check cargo test --workspace --all-targets From 5586c7aa7bf938a7cd4ced9ffe54be0c5195ab5a Mon Sep 17 00:00:00 2001 From: Tom Johnell Date: Thu, 12 Mar 2026 22:17:03 -0500 Subject: [PATCH 2/7] Run release build workflow on pull requests --- .github/workflows/release-build.yml | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release-build.yml b/.github/workflows/release-build.yml index 73cffd5..4c63895 100644 --- a/.github/workflows/release-build.yml +++ b/.github/workflows/release-build.yml @@ -17,6 +17,15 @@ on: required: false default: false type: boolean + pull_request: + branches: ["main"] + paths: + - ".github/workflows/release-build.yml" + - "bin/lib.sh" + - "schema/bootstrap.sql" + - "schema/derive.sql" + - "schema/osm2pgsql/openinterstate.lua" + - "tooling/**" permissions: contents: write @@ -39,15 +48,18 @@ jobs: - name: Resolve release metadata id: meta shell: bash + env: + RELEASE_INPUT: ${{ inputs.release_id || '' }} + SOURCE_URL_INPUT: ${{ inputs.source_url || 'https://download.geofabrik.de/north-america/us-latest.osm.pbf' }} run: | - release_input='${{ inputs.release_id }}' + release_input="$RELEASE_INPUT" if [[ -n "$release_input" ]]; then release_id="$release_input" else release_id="release-$(date -u +%F)-gha-${GITHUB_RUN_NUMBER}" fi echo "release_id=$release_id" >> "$GITHUB_OUTPUT" - echo "source_url=${{ inputs.source_url }}" >> "$GITHUB_OUTPUT" + echo "source_url=$SOURCE_URL_INPUT" >> "$GITHUB_OUTPUT" echo "filtered_filename=us-latest.canonical-filtered.osm.pbf" >> "$GITHUB_OUTPUT" - name: Show disk budget @@ -134,7 +146,7 @@ jobs: compression-level: 0 publish: - if: ${{ inputs.publish }} + if: ${{ github.event_name == 'workflow_dispatch' && inputs.publish }} needs: [prefilter, build-release] runs-on: ubuntu-latest timeout-minutes: 15 From 282f510a4d5b221b530b073effd7652d89c3f811 Mon Sep 17 00:00:00 2001 From: Tom Johnell Date: Thu, 12 Mar 2026 22:19:31 -0500 Subject: [PATCH 3/7] Use temporary source file for CI prefilter --- README.md | 8 ++++---- docs/release_build.md | 8 ++++---- tooling/ci/prefilter_stream.sh | 25 ++++++++++++++++++------- tooling/ci/stream_source_pbf.py | 29 +++++++++++++++++++++-------- 4 files changed, 47 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 6bce6e8..bdf2c09 100644 --- a/README.md +++ b/README.md @@ -61,14 +61,14 @@ The repo now carries a manual GitHub Actions release workflow at That workflow is shaped to fit standard public GitHub-hosted runners: -1. stream the raw `us-latest.osm.pbf` directly into the canonical filter +1. download the raw `us-latest.osm.pbf` into short-lived runner storage 2. upload only the filtered `~160 MB` import PBF plus source metadata 3. rebuild PostGIS, derive tables, and export the release from that artifact 4. optionally publish the archive, manifest, checksums, and source lineage to GitHub -The key constraint is that the raw source PBF is never persisted on the runner, -which keeps the downstream build small enough to stay inside the standard -runner disk budget. +The raw source PBF is deleted after filtering and is never published as an +artifact, so the persisted handoff between jobs stays small even though the +prefilter job uses temporary local disk. ## Repo Map diff --git a/docs/release_build.md b/docs/release_build.md index 648cda4..9de0f8c 100644 --- a/docs/release_build.md +++ b/docs/release_build.md @@ -53,14 +53,14 @@ The repo also includes a manual GitHub Actions workflow at That workflow is designed for standard public GitHub-hosted runners: -1. stream the raw U.S. PBF directly into `osmium tags-filter` +1. download the raw U.S. PBF into temporary runner storage 2. persist only the filtered canonical import PBF plus source metadata as an artifact 3. rebuild PostGIS, derive product tables, and export the release from that filtered artifact 4. optionally publish the archive and companion metadata files to GitHub Releases -This avoids storing the full raw `us-latest.osm.pbf` on the runner, which is -the difference between fitting and not fitting inside the default runner disk -budget. +The raw source file is deleted after filtering and is never passed between jobs +as an artifact. The only persisted handoff is the filtered canonical import PBF +plus its source metadata. ## Environment Setup diff --git a/tooling/ci/prefilter_stream.sh b/tooling/ci/prefilter_stream.sh index 24a8d85..dce96b1 100755 --- a/tooling/ci/prefilter_stream.sh +++ b/tooling/ci/prefilter_stream.sh @@ -31,6 +31,7 @@ require_cmd() { SOURCE_URL="" OUTPUT_PBF="" SOURCE_METADATA_FILE="" +RAW_PBF="" while [[ $# -gt 0 ]]; do case "$1" in @@ -66,6 +67,13 @@ require_cmd python3 mkdir -p "$(dirname "$OUTPUT_PBF")" "$(dirname "$SOURCE_METADATA_FILE")" +cleanup() { + if [[ -n "$RAW_PBF" && -f "$RAW_PBF" ]]; then + rm -f "$RAW_PBF" + fi +} +trap cleanup EXIT + # shellcheck disable=SC1091 source "$REPO_ROOT/bin/lib.sh" @@ -73,17 +81,20 @@ log "Free space before streamed prefilter" df -h "$(dirname "$OUTPUT_PBF")" >&2 mapfile -t FILTER_ARGS < <(oi_canonical_filter_args) +RAW_PBF="$(mktemp "${TMPDIR:-/tmp}/openinterstate-source-XXXXXX.osm.pbf")" -log "Streaming raw source PBF into canonical filter" +log "Downloading raw source PBF to ephemeral runner storage" python3 "$REPO_ROOT/tooling/ci/stream_source_pbf.py" \ --url "$SOURCE_URL" \ --metadata-file "$SOURCE_METADATA_FILE" \ - | osmium tags-filter \ - -F pbf \ - - \ - "${FILTER_ARGS[@]}" \ - --overwrite \ - -o "$OUTPUT_PBF" + --output-file "$RAW_PBF" + +log "Filtering canonical import PBF" +osmium tags-filter \ + "$RAW_PBF" \ + "${FILTER_ARGS[@]}" \ + --overwrite \ + -o "$OUTPUT_PBF" [[ -s "$OUTPUT_PBF" ]] || die "filtered PBF is empty: $OUTPUT_PBF" osmium fileinfo "$OUTPUT_PBF" >/dev/null diff --git a/tooling/ci/stream_source_pbf.py b/tooling/ci/stream_source_pbf.py index 79d5a5b..fc52d18 100755 --- a/tooling/ci/stream_source_pbf.py +++ b/tooling/ci/stream_source_pbf.py @@ -18,6 +18,7 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument("--url", required=True) parser.add_argument("--metadata-file", required=True) + parser.add_argument("--output-file") parser.add_argument("--chunk-size", type=int, default=1024 * 1024) return parser.parse_args() @@ -44,19 +45,31 @@ def main() -> int: request = Request(args.url, headers={"User-Agent": "openinterstate-ci/1"}) digest = hashlib.sha256() size_bytes = 0 + output_path = Path(args.output_file).resolve() if args.output_file else None try: with urlopen(request) as response: final_url = response.geturl() modified_at = isoformat_http_date(response.headers.get("Last-Modified")) - while True: - chunk = response.read(args.chunk_size) - if not chunk: - break - digest.update(chunk) - size_bytes += len(chunk) - sys.stdout.buffer.write(chunk) - sys.stdout.buffer.flush() + if output_path is not None: + output_path.parent.mkdir(parents=True, exist_ok=True) + with output_path.open("wb") as output_fh: + while True: + chunk = response.read(args.chunk_size) + if not chunk: + break + digest.update(chunk) + size_bytes += len(chunk) + output_fh.write(chunk) + else: + while True: + chunk = response.read(args.chunk_size) + if not chunk: + break + digest.update(chunk) + size_bytes += len(chunk) + sys.stdout.buffer.write(chunk) + sys.stdout.buffer.flush() except BrokenPipeError: print("downstream consumer closed while streaming source PBF", file=sys.stderr) return 1 From d4663065c409d6fcd8724d38432430f52f9bd1d1 Mon Sep 17 00:00:00 2001 From: Tom Johnell Date: Thu, 12 Mar 2026 22:34:27 -0500 Subject: [PATCH 4/7] Use smoke-test source for pull request workflow --- .github/workflows/release-build.yml | 11 +++++++++-- README.md | 5 +++++ docs/release_build.md | 4 ++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release-build.yml b/.github/workflows/release-build.yml index 4c63895..e32ccda 100644 --- a/.github/workflows/release-build.yml +++ b/.github/workflows/release-build.yml @@ -58,9 +58,16 @@ jobs: else release_id="release-$(date -u +%F)-gha-${GITHUB_RUN_NUMBER}" fi + if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]]; then + source_url="https://download.geofabrik.de/north-america/us/rhode-island-latest.osm.pbf" + else + source_url="$SOURCE_URL_INPUT" + fi + source_basename="$(basename "$source_url")" + filtered_filename="${source_basename%.osm.pbf}.canonical-filtered.osm.pbf" echo "release_id=$release_id" >> "$GITHUB_OUTPUT" - echo "source_url=$SOURCE_URL_INPUT" >> "$GITHUB_OUTPUT" - echo "filtered_filename=us-latest.canonical-filtered.osm.pbf" >> "$GITHUB_OUTPUT" + echo "source_url=$source_url" >> "$GITHUB_OUTPUT" + echo "filtered_filename=$filtered_filename" >> "$GITHUB_OUTPUT" - name: Show disk budget run: df -h . diff --git a/README.md b/README.md index bdf2c09..fb19968 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,11 @@ The raw source PBF is deleted after filtering and is never published as an artifact, so the persisted handoff between jobs stays small even though the prefilter job uses temporary local disk. +The manual `workflow_dispatch` path targets the full U.S. source file. The +`pull_request` path uses a smaller Rhode Island smoke-test extract so PR checks +validate the workflow mechanics without paying the full release-build cost on +every iteration. + ## Repo Map - `bin/`: the local command-line entrypoint diff --git a/docs/release_build.md b/docs/release_build.md index 9de0f8c..2a2890b 100644 --- a/docs/release_build.md +++ b/docs/release_build.md @@ -62,6 +62,10 @@ The raw source file is deleted after filtering and is never passed between jobs as an artifact. The only persisted handoff is the filtered canonical import PBF plus its source metadata. +The manual `workflow_dispatch` run uses the full U.S. source by default. The +`pull_request` trigger is intentionally lighter and uses a Rhode Island smoke +test extract so release-workflow changes can be validated quickly in PRs. + ## Environment Setup The default local workflow works without any env file and stores working data in From a844ccb135676cac7d3f910194173660df105360 Mon Sep 17 00:00:00 2001 From: Tom Johnell Date: Thu, 12 Mar 2026 22:36:51 -0500 Subject: [PATCH 5/7] Use portable osm2pgsql username flag in CI --- tooling/ci/build_release_host.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tooling/ci/build_release_host.sh b/tooling/ci/build_release_host.sh index 698d929..70c79e4 100755 --- a/tooling/ci/build_release_host.sh +++ b/tooling/ci/build_release_host.sh @@ -225,7 +225,7 @@ PGPASSWORD="$DB_PASSWORD" osm2pgsql \ --database="$DB_NAME" \ --host="$DB_HOST" \ --port="$DB_PORT" \ - --username="$DB_USER" \ + -U "$DB_USER" \ --cache="$IMPORT_CACHE_MB" \ "$FILTERED_PBF_FILE" From 3e6870269480bfa4973e787d4d5a75da42325c01 Mon Sep 17 00:00:00 2001 From: Tom Johnell Date: Thu, 12 Mar 2026 22:40:17 -0500 Subject: [PATCH 6/7] Ignore postgres permission error in CI summary --- tooling/ci/build_release_host.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tooling/ci/build_release_host.sh b/tooling/ci/build_release_host.sh index 70c79e4..626dc36 100755 --- a/tooling/ci/build_release_host.sh +++ b/tooling/ci/build_release_host.sh @@ -273,4 +273,7 @@ tar \ "$RELEASE_ID" log "Release build complete" -du -sh "$FILTERED_PBF_FILE" "$POSTGRES_DATA_DIR" "$RELEASE_DIR" "$ARCHIVE_PATH" >&2 +du -sh "$FILTERED_PBF_FILE" "$RELEASE_DIR" "$ARCHIVE_PATH" >&2 +if ! du -sh "$POSTGRES_DATA_DIR" >&2 2>/dev/null; then + log "Skipping postgres size summary; directory is owned by the container user" +fi From 47a3bf48d07bb137a9f9352b53d443b1a96ad005 Mon Sep 17 00:00:00 2001 From: Tom Johnell Date: Thu, 12 Mar 2026 22:44:53 -0500 Subject: [PATCH 7/7] Log CI download progress --- tooling/ci/stream_source_pbf.py | 53 +++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/tooling/ci/stream_source_pbf.py b/tooling/ci/stream_source_pbf.py index fc52d18..c016140 100755 --- a/tooling/ci/stream_source_pbf.py +++ b/tooling/ci/stream_source_pbf.py @@ -5,6 +5,7 @@ import hashlib import json import sys +import time from datetime import datetime, timezone from email.utils import parsedate_to_datetime from pathlib import Path, PurePosixPath @@ -40,17 +41,61 @@ def isoformat_http_date(value: str | None) -> str: return parsed.astimezone(timezone.utc).isoformat() +def format_bytes(size_bytes: int) -> str: + units = ["B", "KiB", "MiB", "GiB", "TiB"] + size = float(size_bytes) + unit = units[0] + for candidate in units: + unit = candidate + if size < 1024.0 or candidate == units[-1]: + break + size /= 1024.0 + if unit == "B": + return f"{int(size)} {unit}" + return f"{size:.1f} {unit}" + + +def log_progress(downloaded_bytes: int, total_bytes: int | None, started_at: float) -> None: + elapsed = max(time.monotonic() - started_at, 0.001) + rate = downloaded_bytes / elapsed + if total_bytes and total_bytes > 0: + percent = downloaded_bytes / total_bytes * 100.0 + print( + ( + f"downloaded {format_bytes(downloaded_bytes)} / {format_bytes(total_bytes)} " + f"({percent:.1f}%) at {format_bytes(int(rate))}/s" + ), + file=sys.stderr, + ) + return + print( + f"downloaded {format_bytes(downloaded_bytes)} at {format_bytes(int(rate))}/s", + file=sys.stderr, + ) + + def main() -> int: args = parse_args() request = Request(args.url, headers={"User-Agent": "openinterstate-ci/1"}) digest = hashlib.sha256() size_bytes = 0 output_path = Path(args.output_file).resolve() if args.output_file else None + started_at = time.monotonic() + next_progress_at = started_at + 10.0 try: with urlopen(request) as response: final_url = response.geturl() modified_at = isoformat_http_date(response.headers.get("Last-Modified")) + content_length_header = response.headers.get("Content-Length") + total_bytes = int(content_length_header) if content_length_header and content_length_header.isdigit() else None + if total_bytes is not None: + print( + f"starting download of {source_filename(final_url)} ({format_bytes(total_bytes)})", + file=sys.stderr, + ) + else: + print(f"starting download of {source_filename(final_url)}", file=sys.stderr) if output_path is not None: output_path.parent.mkdir(parents=True, exist_ok=True) with output_path.open("wb") as output_fh: @@ -61,6 +106,9 @@ def main() -> int: digest.update(chunk) size_bytes += len(chunk) output_fh.write(chunk) + if time.monotonic() >= next_progress_at: + log_progress(size_bytes, total_bytes, started_at) + next_progress_at = time.monotonic() + 10.0 else: while True: chunk = response.read(args.chunk_size) @@ -69,11 +117,16 @@ def main() -> int: digest.update(chunk) size_bytes += len(chunk) sys.stdout.buffer.write(chunk) + if time.monotonic() >= next_progress_at: + log_progress(size_bytes, total_bytes, started_at) + next_progress_at = time.monotonic() + 10.0 sys.stdout.buffer.flush() except BrokenPipeError: print("downstream consumer closed while streaming source PBF", file=sys.stderr) return 1 + log_progress(size_bytes, total_bytes, started_at) + metadata = { "path": final_url, "filename": source_filename(final_url),