diff --git a/.github/workflows/release-build.yml b/.github/workflows/release-build.yml new file mode 100644 index 0000000..e32ccda --- /dev/null +++ b/.github/workflows/release-build.yml @@ -0,0 +1,176 @@ +name: Release Build + +on: + workflow_dispatch: + inputs: + release_id: + description: "Optional release tag. Defaults to a dated GitHub Actions tag." + required: false + type: string + source_url: + description: "Source U.S. PBF URL" + required: false + default: "https://download.geofabrik.de/north-america/us-latest.osm.pbf" + type: string + publish: + description: "Publish the built assets as a GitHub release" + required: false + default: false + type: boolean + pull_request: + branches: ["main"] + paths: + - ".github/workflows/release-build.yml" + - "bin/lib.sh" + - "schema/bootstrap.sql" + - "schema/derive.sql" + - "schema/osm2pgsql/openinterstate.lua" + - "tooling/**" + +permissions: + contents: write + +concurrency: + group: release-build-${{ github.ref }} + cancel-in-progress: false + +jobs: + prefilter: + runs-on: ubuntu-latest + timeout-minutes: 30 + outputs: + release_id: ${{ steps.meta.outputs.release_id }} + source_url: ${{ steps.meta.outputs.source_url }} + filtered_filename: ${{ steps.meta.outputs.filtered_filename }} + steps: + - uses: actions/checkout@v4 + + - name: Resolve release metadata + id: meta + shell: bash + env: + RELEASE_INPUT: ${{ inputs.release_id || '' }} + SOURCE_URL_INPUT: ${{ inputs.source_url || 'https://download.geofabrik.de/north-america/us-latest.osm.pbf' }} + run: | + release_input="$RELEASE_INPUT" + if [[ -n "$release_input" ]]; then + release_id="$release_input" + else + release_id="release-$(date -u +%F)-gha-${GITHUB_RUN_NUMBER}" + fi + if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]]; then + source_url="https://download.geofabrik.de/north-america/us/rhode-island-latest.osm.pbf" + else + source_url="$SOURCE_URL_INPUT" + fi + source_basename="$(basename "$source_url")" + filtered_filename="${source_basename%.osm.pbf}.canonical-filtered.osm.pbf" + echo "release_id=$release_id" >> "$GITHUB_OUTPUT" + echo "source_url=$source_url" >> "$GITHUB_OUTPUT" + echo "filtered_filename=$filtered_filename" >> "$GITHUB_OUTPUT" + + - name: Show disk budget + run: df -h . + + - name: Install prefilter dependencies + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends osmium-tool + + - name: Stream and filter the source PBF + run: | + mkdir -p ci-inputs + tooling/ci/prefilter_stream.sh \ + --source-url "${{ steps.meta.outputs.source_url }}" \ + --output-pbf "$PWD/ci-inputs/${{ steps.meta.outputs.filtered_filename }}" \ + --source-metadata-file "$PWD/ci-inputs/source-pbf-metadata.json" + + - name: Upload filtered build inputs + uses: actions/upload-artifact@v4 + with: + name: prefiltered-${{ steps.meta.outputs.release_id }} + path: | + ci-inputs/${{ steps.meta.outputs.filtered_filename }} + ci-inputs/source-pbf-metadata.json + retention-days: 1 + compression-level: 0 + + build-release: + needs: prefilter + runs-on: ubuntu-latest + timeout-minutes: 45 + steps: + - uses: actions/checkout@v4 + + - uses: dtolnay/rust-toolchain@stable + + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Show disk budget + run: df -h . 
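+      # Hosted runners expose only a limited work volume, so the snapshot above
+      # is the disk budget that the dependency installs, PostGIS import, and
+      # release export below all have to share.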
+ + - name: Install build dependencies + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends osm2pgsql postgresql-client + + - name: Install tooling Python dependencies + run: | + python -m venv "$RUNNER_TEMP/openinterstate-venv" + "$RUNNER_TEMP/openinterstate-venv/bin/pip" install --upgrade pip + "$RUNNER_TEMP/openinterstate-venv/bin/pip" install -r tooling/requirements.txt + echo "$RUNNER_TEMP/openinterstate-venv/bin" >> "$GITHUB_PATH" + + - name: Download filtered build inputs + uses: actions/download-artifact@v4 + with: + name: prefiltered-${{ needs.prefilter.outputs.release_id }} + path: ci-inputs + + - name: Build release from filtered artifact + env: + CARGO_TARGET_DIR: ${{ runner.temp }}/cargo-target + run: | + mkdir -p ci-artifacts/releases + tooling/ci/build_release_host.sh \ + --release-id "${{ needs.prefilter.outputs.release_id }}" \ + --filtered-pbf-file "$PWD/ci-inputs/${{ needs.prefilter.outputs.filtered_filename }}" \ + --source-pbf-metadata-file "$PWD/ci-inputs/source-pbf-metadata.json" \ + --source-url "${{ needs.prefilter.outputs.source_url }}" \ + --output-root "$PWD/ci-artifacts/releases" \ + --work-dir "$RUNNER_TEMP/openinterstate-release-build" + + - name: Upload release bundle + uses: actions/upload-artifact@v4 + with: + name: release-${{ needs.prefilter.outputs.release_id }} + path: | + ci-artifacts/releases/${{ needs.prefilter.outputs.release_id }} + ci-artifacts/releases/openinterstate-${{ needs.prefilter.outputs.release_id }}.tar.gz + retention-days: 7 + compression-level: 0 + + publish: + if: ${{ github.event_name == 'workflow_dispatch' && inputs.publish }} + needs: [prefilter, build-release] + runs-on: ubuntu-latest + timeout-minutes: 15 + steps: + - uses: actions/checkout@v4 + + - name: Download release bundle + uses: actions/download-artifact@v4 + with: + name: release-${{ needs.prefilter.outputs.release_id }} + path: release-output + + - name: Publish GitHub release + env: + GH_TOKEN: ${{ github.token }} + run: | + tooling/ci/publish_release.sh \ + --release-id "${{ needs.prefilter.outputs.release_id }}" \ + --release-dir "$PWD/release-output/${{ needs.prefilter.outputs.release_id }}" \ + --archive-file "$PWD/release-output/openinterstate-${{ needs.prefilter.outputs.release_id }}.tar.gz" diff --git a/README.md b/README.md index f051a82..fb19968 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,27 @@ motorway/trunk road context and POI data needed for Interstate derivation, and the downstream Rust graph builders stay focused on Interstate-labeled corridors instead of constructing a much broader national highway graph. +## GitHub Actions Release Build + +The repo now carries a manual GitHub Actions release workflow at +`.github/workflows/release-build.yml`. + +That workflow is shaped to fit standard public GitHub-hosted runners: + +1. download the raw `us-latest.osm.pbf` into short-lived runner storage +2. upload only the filtered `~160 MB` import PBF plus source metadata +3. rebuild PostGIS, derive tables, and export the release from that artifact +4. optionally publish the archive, manifest, checksums, and source lineage to GitHub + +The raw source PBF is deleted after filtering and is never published as an +artifact, so the persisted handoff between jobs stays small even though the +prefilter job uses temporary local disk. + +The manual `workflow_dispatch` path targets the full U.S. source file. 
The +`pull_request` path uses a smaller Rhode Island smoke-test extract so PR checks +validate the workflow mechanics without paying the full release-build cost on +every iteration. + ## Repo Map - `bin/`: the local command-line entrypoint diff --git a/bin/lib.sh b/bin/lib.sh index 32433f8..5a08463 100755 --- a/bin/lib.sh +++ b/bin/lib.sh @@ -405,6 +405,22 @@ oi_download_pbf() { printf '%s\n' "$resolved_output" } +oi_canonical_filter_args() { + cat <<'EOF' +n/highway=motorway_junction +n/amenity=fuel,restaurant,fast_food,cafe,toilets,charging_station +n/tourism=hotel,motel,guest_house +n/shop=gas +n/cuisine +n/highway=rest_area,services +w/highway=motorway,motorway_link,trunk,trunk_link,rest_area,services +w/amenity=fuel,restaurant,fast_food,cafe,toilets,charging_station +w/tourism=hotel,motel,guest_house +w/shop=gas +w/cuisine +EOF +} + oi_filter_pbf() { local input_pbf="$1" local output_pbf="$2" @@ -421,19 +437,7 @@ oi_filter_pbf() { fi mkdir -p "$(dirname "$output_pbf")" - filter_args=( - n/highway=motorway_junction - n/amenity=fuel,restaurant,fast_food,cafe,toilets,charging_station - n/tourism=hotel,motel,guest_house - n/shop=gas - n/cuisine - n/highway=rest_area,services - w/highway=motorway,motorway_link,trunk,trunk_link,rest_area,services - w/amenity=fuel,restaurant,fast_food,cafe,toilets,charging_station - w/tourism=hotel,motel,guest_house - w/shop=gas - w/cuisine - ) + mapfile -t filter_args < <(oi_canonical_filter_args) state_file="$(oi_state_file filter "$output_pbf")" expected_signature="$( { diff --git a/docs/release_build.md b/docs/release_build.md index de80b9a..2a2890b 100644 --- a/docs/release_build.md +++ b/docs/release_build.md @@ -46,6 +46,26 @@ Cargo and runner caches now default under the managed data root as well, so a goose-drive workspace keeps both data artifacts and Rust build cache off the main disk. +## GitHub Actions Workflow + +The repo also includes a manual GitHub Actions workflow at +`.github/workflows/release-build.yml`. + +That workflow is designed for standard public GitHub-hosted runners: + +1. download the raw U.S. PBF into temporary runner storage +2. persist only the filtered canonical import PBF plus source metadata as an artifact +3. rebuild PostGIS, derive product tables, and export the release from that filtered artifact +4. optionally publish the archive and companion metadata files to GitHub Releases + +The raw source file is deleted after filtering and is never passed between jobs +as an artifact. The only persisted handoff is the filtered canonical import PBF +plus its source metadata. + +The manual `workflow_dispatch` run uses the full U.S. source by default. The +`pull_request` trigger is intentionally lighter and uses a Rhode Island smoke +test extract so release-workflow changes can be validated quickly in PRs. + ## Environment Setup The default local workflow works without any env file and stores working data in @@ -132,7 +152,7 @@ GeoParquet packaging can follow later. Every release now records: -1. the raw source PBF path, size, modified time, and SHA-256 +1. the raw source PBF path or streamed source URL, plus size, modified time, and SHA-256 2. the imported canonical filtered PBF path, size, modified time, and SHA-256 3. the source download URL when provided 4. the derivation chain used to produce the release @@ -140,10 +160,13 @@ Every release now records: This lineage is published both inside `manifest.json` and as the standalone asset `source_lineage.json`. 
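+As a sketch, the lineage payload has this shape (field values are
+illustrative, borrowed from the unit-test fixture rather than from a real
+release):
+
+```json
+{
+  "source_url": "https://download.geofabrik.de/north-america/us-latest.osm.pbf",
+  "source_pbf": {
+    "path": "https://download.geofabrik.de/north-america/us-latest.osm.pbf",
+    "filename": "us-latest.osm.pbf",
+    "size_bytes": 123,
+    "modified_at": "2026-03-12T00:00:00+00:00",
+    "sha256": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
+  },
+  "import_pbf": { "…": "the same five fields, describing the filtered import PBF" },
+  "derivation": [
+    "osm2pgsql flex import via schema/osm2pgsql/openinterstate.lua",
+    "schema/derive.sql",
+    "…"
+  ]
+}
+```
+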
+When the GitHub Actions workflow streams the raw source instead of storing it on +disk, `source_pbf.path` is recorded as the downloaded source URL. + ## Published Standalone Release The current standalone release is published as: -1. GitHub release tag: `release-2026-03-12-goose-rerun-branchfix` -2. archive: `openinterstate-release-2026-03-12-goose-rerun-branchfix.tar.gz` +1. GitHub release tag: `release-2026-03-12-coldpath` +2. archive: `openinterstate-release-2026-03-12-coldpath.tar.gz` 3. companion files: `manifest.json`, `source_lineage.json`, and `checksums.txt` diff --git a/tooling/ci/build_release_host.sh b/tooling/ci/build_release_host.sh new file mode 100755 index 0000000..626dc36 --- /dev/null +++ b/tooling/ci/build_release_host.sh @@ -0,0 +1,279 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" + +usage() { + cat <<'EOF' +Usage: + tooling/ci/build_release_host.sh \ + --release-id release-YYYY-MM-DD \ + --filtered-pbf-file /abs/path/us-latest.canonical-filtered.osm.pbf \ + --source-pbf-metadata-file /abs/path/source-pbf-metadata.json \ + --output-root /abs/path/release-output \ + --work-dir /abs/path/workdir \ + [--source-url URL] +EOF +} + +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >&2 +} + +die() { + echo "ERROR: $*" >&2 + exit 1 +} + +require_cmd() { + local cmd="$1" + command -v "$cmd" >/dev/null 2>&1 || die "missing required command: $cmd" +} + +free_space_gb() { + local path="$1" + df -Pk "$path" | awk 'NR == 2 { print int($4 / 1024 / 1024) }' +} + +require_free_space_gb() { + local path="$1" + local min_gb="$2" + local available_gb + available_gb="$(free_space_gb "$path")" + if (( available_gb < min_gb )); then + die "need at least ${min_gb}GB free at $path, found ${available_gb}GB" + fi +} + +wait_for_postgres() { + local attempts="${1:-90}" + local sleep_seconds="${2:-2}" + local attempt + + for (( attempt=1; attempt<=attempts; attempt+=1 )); do + if PGPASSWORD="$DB_PASSWORD" pg_isready \ + -h "$DB_HOST" \ + -p "$DB_PORT" \ + -U "$DB_USER" \ + -d "$DB_NAME" >/dev/null 2>&1; then + return 0 + fi + sleep "$sleep_seconds" + done + + return 1 +} + +RELEASE_ID="" +FILTERED_PBF_FILE="" +SOURCE_PBF_METADATA_FILE="" +SOURCE_URL="" +OUTPUT_ROOT="" +WORK_DIR="" +STATE_DIR="" + +DB_HOST="${OI_CI_DB_HOST:-127.0.0.1}" +DB_PORT="${OI_CI_DB_PORT:-55432}" +DB_NAME="${OI_CI_DB_NAME:-osm}" +DB_USER="${OI_CI_DB_USER:-osm}" +DB_PASSWORD="${OI_CI_DB_PASSWORD:-osm_dev}" +MIN_FREE_GB="${OI_CI_MIN_FREE_GB:-7}" +IMPORT_CACHE_MB="${OI_IMPORT_CACHE_MB:-2048}" +DB_CONTAINER_NAME="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --release-id) + RELEASE_ID="$2" + shift 2 + ;; + --filtered-pbf-file) + FILTERED_PBF_FILE="$2" + shift 2 + ;; + --source-pbf-metadata-file) + SOURCE_PBF_METADATA_FILE="$2" + shift 2 + ;; + --source-url) + SOURCE_URL="$2" + shift 2 + ;; + --output-root) + OUTPUT_ROOT="$2" + shift 2 + ;; + --work-dir) + WORK_DIR="$2" + shift 2 + ;; + --state-dir) + STATE_DIR="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + usage + die "unknown argument: $1" + ;; + esac +done + +[[ -n "$RELEASE_ID" ]] || die "--release-id is required" +[[ -n "$FILTERED_PBF_FILE" ]] || die "--filtered-pbf-file is required" +[[ -n "$SOURCE_PBF_METADATA_FILE" ]] || die "--source-pbf-metadata-file is required" +[[ -n "$OUTPUT_ROOT" ]] || die "--output-root is required" +[[ -n "$WORK_DIR" ]] || die "--work-dir is required" + +[[ -f "$FILTERED_PBF_FILE" ]] || die "filtered PBF not found: 
$FILTERED_PBF_FILE" +[[ -f "$SOURCE_PBF_METADATA_FILE" ]] || die "source metadata not found: $SOURCE_PBF_METADATA_FILE" + +STATE_DIR="${STATE_DIR:-$WORK_DIR/state}" +DATABASE_URL="postgres://${DB_USER}:${DB_PASSWORD}@${DB_HOST}:${DB_PORT}/${DB_NAME}" +POSTGRES_DATA_DIR="$WORK_DIR/postgres" +RELEASE_DIR="$OUTPUT_ROOT/$RELEASE_ID" +ARCHIVE_PATH="$OUTPUT_ROOT/openinterstate-$RELEASE_ID.tar.gz" + +require_cmd cargo +require_cmd docker +require_cmd osm2pgsql +require_cmd pg_isready +require_cmd psql +require_cmd python3 +require_cmd tar + +mkdir -p "$OUTPUT_ROOT" "$POSTGRES_DATA_DIR" "$STATE_DIR" +rm -rf "$RELEASE_DIR" "$ARCHIVE_PATH" + +cleanup() { + local exit_code=$? + if [[ -n "$DB_CONTAINER_NAME" ]]; then + if (( exit_code != 0 )); then + docker logs "$DB_CONTAINER_NAME" >&2 || true + fi + docker rm -f "$DB_CONTAINER_NAME" >/dev/null 2>&1 || true + fi +} +trap cleanup EXIT + +log "Free space before release build" +df -h "$WORK_DIR" >&2 +require_free_space_gb "$WORK_DIR" "$MIN_FREE_GB" + +DB_CONTAINER_NAME="openinterstate-ci-db-${RANDOM}-$$" +log "Starting PostGIS container $DB_CONTAINER_NAME" +docker run \ + --detach \ + --rm \ + --name "$DB_CONTAINER_NAME" \ + --shm-size 2g \ + -e POSTGRES_DB="$DB_NAME" \ + -e POSTGRES_USER="$DB_USER" \ + -e POSTGRES_PASSWORD="$DB_PASSWORD" \ + -p "${DB_PORT}:5432" \ + -v "$POSTGRES_DATA_DIR:/var/lib/postgresql/data" \ + postgis/postgis:16-3.4 \ + postgres \ + -c shared_buffers=512MB \ + -c effective_cache_size=2GB \ + -c maintenance_work_mem=512MB \ + -c work_mem=32MB \ + -c max_wal_size=32GB \ + -c min_wal_size=8GB \ + -c checkpoint_timeout=60min \ + -c checkpoint_completion_target=0.9 \ + -c wal_compression=on \ + -c wal_level=minimal \ + -c max_wal_senders=0 \ + -c archive_mode=off \ + -c synchronous_commit=off \ + -c fsync=off \ + -c full_page_writes=off \ + -c autovacuum=off \ + -c effective_io_concurrency=200 \ + -c random_page_cost=1.1 \ + >/dev/null + +wait_for_postgres || die "PostGIS container did not become ready" + +log "Bootstrapping database schema" +PGPASSWORD="$DB_PASSWORD" psql \ + -h "$DB_HOST" \ + -p "$DB_PORT" \ + -U "$DB_USER" \ + -d "$DB_NAME" \ + -v ON_ERROR_STOP=1 \ + -c "CREATE EXTENSION IF NOT EXISTS postgis;" +PGPASSWORD="$DB_PASSWORD" psql \ + -h "$DB_HOST" \ + -p "$DB_PORT" \ + -U "$DB_USER" \ + -d "$DB_NAME" \ + -v ON_ERROR_STOP=1 \ + -f "$REPO_ROOT/schema/bootstrap.sql" + +log "Importing canonical filtered PBF" +PGPASSWORD="$DB_PASSWORD" osm2pgsql \ + --slim \ + --create \ + --output=flex \ + --style="$REPO_ROOT/schema/osm2pgsql/openinterstate.lua" \ + --database="$DB_NAME" \ + --host="$DB_HOST" \ + --port="$DB_PORT" \ + -U "$DB_USER" \ + --cache="$IMPORT_CACHE_MB" \ + "$FILTERED_PBF_FILE" + +log "Applying deterministic SQL projection" +PGPASSWORD="$DB_PASSWORD" psql \ + -h "$DB_HOST" \ + -p "$DB_PORT" \ + -U "$DB_USER" \ + -d "$DB_NAME" \ + -v ON_ERROR_STOP=1 \ + -f "$REPO_ROOT/schema/derive.sql" + +log "Building graph, corridors, and reference routes" +cargo run --locked --release -p openinterstate-derive -- \ + --database-url "$DATABASE_URL" \ + all + +log "Exporting release artifacts" +EXPORT_ARGS=( + python3 + "$REPO_ROOT/tooling/export_release.py" + --database-url "$DATABASE_URL" + --release-id "$RELEASE_ID" + --output-dir "$RELEASE_DIR" + --state-dir "$STATE_DIR" + --source-pbf-metadata-file "$SOURCE_PBF_METADATA_FILE" + --import-pbf-file "$FILTERED_PBF_FILE" +) +if [[ -n "$SOURCE_URL" ]]; then + EXPORT_ARGS+=(--source-url "$SOURCE_URL") +fi +"${EXPORT_ARGS[@]}" + +find "$RELEASE_DIR" \ + \( -name '.DS_Store' -o 
-name '._*' \) \ + -type f \ + -delete + +log "Packaging release archive" +tar \ + --exclude='.DS_Store' \ + --exclude='._*' \ + -C "$OUTPUT_ROOT" \ + -czf "$ARCHIVE_PATH" \ + "$RELEASE_ID" + +log "Release build complete" +du -sh "$FILTERED_PBF_FILE" "$RELEASE_DIR" "$ARCHIVE_PATH" >&2 +if ! du -sh "$POSTGRES_DATA_DIR" >&2 2>/dev/null; then + log "Skipping postgres size summary; directory is owned by the container user" +fi diff --git a/tooling/ci/prefilter_stream.sh b/tooling/ci/prefilter_stream.sh new file mode 100755 index 0000000..dce96b1 --- /dev/null +++ b/tooling/ci/prefilter_stream.sh @@ -0,0 +1,103 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" + +usage() { + cat <<'EOF' +Usage: + tooling/ci/prefilter_stream.sh \ + --source-url URL \ + --output-pbf /abs/path/us-latest.canonical-filtered.osm.pbf \ + --source-metadata-file /abs/path/source-pbf-metadata.json +EOF +} + +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >&2 +} + +die() { + echo "ERROR: $*" >&2 + exit 1 +} + +require_cmd() { + local cmd="$1" + command -v "$cmd" >/dev/null 2>&1 || die "missing required command: $cmd" +} + +SOURCE_URL="" +OUTPUT_PBF="" +SOURCE_METADATA_FILE="" +RAW_PBF="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --source-url) + SOURCE_URL="$2" + shift 2 + ;; + --output-pbf) + OUTPUT_PBF="$2" + shift 2 + ;; + --source-metadata-file) + SOURCE_METADATA_FILE="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + usage + die "unknown argument: $1" + ;; + esac +done + +[[ -n "$SOURCE_URL" ]] || die "--source-url is required" +[[ -n "$OUTPUT_PBF" ]] || die "--output-pbf is required" +[[ -n "$SOURCE_METADATA_FILE" ]] || die "--source-metadata-file is required" + +require_cmd osmium +require_cmd python3 + +mkdir -p "$(dirname "$OUTPUT_PBF")" "$(dirname "$SOURCE_METADATA_FILE")" + +cleanup() { + if [[ -n "$RAW_PBF" && -f "$RAW_PBF" ]]; then + rm -f "$RAW_PBF" + fi +} +trap cleanup EXIT + +# shellcheck disable=SC1091 +source "$REPO_ROOT/bin/lib.sh" + +log "Free space before streamed prefilter" +df -h "$(dirname "$OUTPUT_PBF")" >&2 + +mapfile -t FILTER_ARGS < <(oi_canonical_filter_args) +RAW_PBF="$(mktemp "${TMPDIR:-/tmp}/openinterstate-source-XXXXXX.osm.pbf")" + +log "Downloading raw source PBF to ephemeral runner storage" +python3 "$REPO_ROOT/tooling/ci/stream_source_pbf.py" \ + --url "$SOURCE_URL" \ + --metadata-file "$SOURCE_METADATA_FILE" \ + --output-file "$RAW_PBF" + +log "Filtering canonical import PBF" +osmium tags-filter \ + "$RAW_PBF" \ + "${FILTER_ARGS[@]}" \ + --overwrite \ + -o "$OUTPUT_PBF" + +[[ -s "$OUTPUT_PBF" ]] || die "filtered PBF is empty: $OUTPUT_PBF" +osmium fileinfo "$OUTPUT_PBF" >/dev/null + +log "Prefilter complete" +du -sh "$OUTPUT_PBF" "$SOURCE_METADATA_FILE" >&2 diff --git a/tooling/ci/publish_release.sh b/tooling/ci/publish_release.sh new file mode 100755 index 0000000..a956b06 --- /dev/null +++ b/tooling/ci/publish_release.sh @@ -0,0 +1,96 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'EOF' +Usage: + tooling/ci/publish_release.sh \ + --release-id release-YYYY-MM-DD \ + --release-dir /abs/path/release-YYYY-MM-DD \ + --archive-file /abs/path/openinterstate-release-YYYY-MM-DD.tar.gz \ + [--repo owner/name] +EOF +} + +die() { + echo "ERROR: $*" >&2 + exit 1 +} + +require_cmd() { + local cmd="$1" + command -v "$cmd" >/dev/null 2>&1 || die "missing required command: $cmd" +} + +RELEASE_ID="" +RELEASE_DIR="" +ARCHIVE_FILE="" +REPO="tldev/openinterstate" + +while 
[[ $# -gt 0 ]]; do + case "$1" in + --release-id) + RELEASE_ID="$2" + shift 2 + ;; + --release-dir) + RELEASE_DIR="$2" + shift 2 + ;; + --archive-file) + ARCHIVE_FILE="$2" + shift 2 + ;; + --repo) + REPO="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + usage + die "unknown argument: $1" + ;; + esac +done + +[[ -n "$RELEASE_ID" ]] || die "--release-id is required" +[[ -n "$RELEASE_DIR" ]] || die "--release-dir is required" +[[ -n "$ARCHIVE_FILE" ]] || die "--archive-file is required" + +require_cmd gh + +MANIFEST_PATH="$RELEASE_DIR/manifest.json" +CHECKSUMS_PATH="$RELEASE_DIR/checksums.txt" +SOURCE_LINEAGE_PATH="$RELEASE_DIR/source_lineage.json" + +for path in "$RELEASE_DIR" "$ARCHIVE_FILE" "$MANIFEST_PATH" "$CHECKSUMS_PATH" "$SOURCE_LINEAGE_PATH"; do + [[ -e "$path" ]] || die "missing publish artifact: $path" +done + +RELEASE_NOTES="Rebuilt from the raw U.S. OSM PBF using the standalone OpenInterstate pipeline. See manifest.json and source_lineage.json for raw-source and imported-filter lineage, including SHA-256 hashes." + +if gh release view "$RELEASE_ID" --repo "$REPO" >/dev/null 2>&1; then + gh release upload "$RELEASE_ID" \ + "$ARCHIVE_FILE" \ + "$MANIFEST_PATH" \ + "$CHECKSUMS_PATH" \ + "$SOURCE_LINEAGE_PATH" \ + --repo "$REPO" \ + --clobber + gh release edit "$RELEASE_ID" \ + --repo "$REPO" \ + --title "$RELEASE_ID" \ + --notes "$RELEASE_NOTES" +else + gh release create "$RELEASE_ID" \ + "$ARCHIVE_FILE" \ + "$MANIFEST_PATH" \ + "$CHECKSUMS_PATH" \ + "$SOURCE_LINEAGE_PATH" \ + --repo "$REPO" \ + --title "$RELEASE_ID" \ + --notes "$RELEASE_NOTES" +fi diff --git a/tooling/ci/stream_source_pbf.py b/tooling/ci/stream_source_pbf.py new file mode 100755 index 0000000..c016140 --- /dev/null +++ b/tooling/ci/stream_source_pbf.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import hashlib +import json +import sys +import time +from datetime import datetime, timezone +from email.utils import parsedate_to_datetime +from pathlib import Path, PurePosixPath +from urllib.parse import urlparse +from urllib.request import Request, urlopen + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Stream a source PBF to stdout while recording source metadata." 
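+        # Note: when --output-file is given (the mode the CI prefilter uses),
+        # the download is written to that file instead of stdout.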
+ ) + parser.add_argument("--url", required=True) + parser.add_argument("--metadata-file", required=True) + parser.add_argument("--output-file") + parser.add_argument("--chunk-size", type=int, default=1024 * 1024) + return parser.parse_args() + + +def source_filename(url: str) -> str: + candidate = PurePosixPath(urlparse(url).path).name + return candidate or "source.osm.pbf" + + +def isoformat_http_date(value: str | None) -> str: + if not value: + return datetime.now(timezone.utc).isoformat() + try: + parsed = parsedate_to_datetime(value) + except (TypeError, ValueError, IndexError, OverflowError): + return datetime.now(timezone.utc).isoformat() + if parsed.tzinfo is None: + parsed = parsed.replace(tzinfo=timezone.utc) + return parsed.astimezone(timezone.utc).isoformat() + + +def format_bytes(size_bytes: int) -> str: + units = ["B", "KiB", "MiB", "GiB", "TiB"] + size = float(size_bytes) + unit = units[0] + for candidate in units: + unit = candidate + if size < 1024.0 or candidate == units[-1]: + break + size /= 1024.0 + if unit == "B": + return f"{int(size)} {unit}" + return f"{size:.1f} {unit}" + + +def log_progress(downloaded_bytes: int, total_bytes: int | None, started_at: float) -> None: + elapsed = max(time.monotonic() - started_at, 0.001) + rate = downloaded_bytes / elapsed + if total_bytes and total_bytes > 0: + percent = downloaded_bytes / total_bytes * 100.0 + print( + ( + f"downloaded {format_bytes(downloaded_bytes)} / {format_bytes(total_bytes)} " + f"({percent:.1f}%) at {format_bytes(int(rate))}/s" + ), + file=sys.stderr, + ) + return + print( + f"downloaded {format_bytes(downloaded_bytes)} at {format_bytes(int(rate))}/s", + file=sys.stderr, + ) + + +def main() -> int: + args = parse_args() + request = Request(args.url, headers={"User-Agent": "openinterstate-ci/1"}) + digest = hashlib.sha256() + size_bytes = 0 + output_path = Path(args.output_file).resolve() if args.output_file else None + started_at = time.monotonic() + next_progress_at = started_at + 10.0 + + try: + with urlopen(request) as response: + final_url = response.geturl() + modified_at = isoformat_http_date(response.headers.get("Last-Modified")) + content_length_header = response.headers.get("Content-Length") + total_bytes = int(content_length_header) if content_length_header and content_length_header.isdigit() else None + if total_bytes is not None: + print( + f"starting download of {source_filename(final_url)} ({format_bytes(total_bytes)})", + file=sys.stderr, + ) + else: + print(f"starting download of {source_filename(final_url)}", file=sys.stderr) + if output_path is not None: + output_path.parent.mkdir(parents=True, exist_ok=True) + with output_path.open("wb") as output_fh: + while True: + chunk = response.read(args.chunk_size) + if not chunk: + break + digest.update(chunk) + size_bytes += len(chunk) + output_fh.write(chunk) + if time.monotonic() >= next_progress_at: + log_progress(size_bytes, total_bytes, started_at) + next_progress_at = time.monotonic() + 10.0 + else: + while True: + chunk = response.read(args.chunk_size) + if not chunk: + break + digest.update(chunk) + size_bytes += len(chunk) + sys.stdout.buffer.write(chunk) + if time.monotonic() >= next_progress_at: + log_progress(size_bytes, total_bytes, started_at) + next_progress_at = time.monotonic() + 10.0 + sys.stdout.buffer.flush() + except BrokenPipeError: + print("downstream consumer closed while streaming source PBF", file=sys.stderr) + return 1 + + log_progress(size_bytes, total_bytes, started_at) + + metadata = { + "path": final_url, + 
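+        # For a streamed download, "path" records the final post-redirect URL
+        # rather than a local file path (see docs/release_build.md).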
"filename": source_filename(final_url), + "size_bytes": size_bytes, + "modified_at": modified_at, + "sha256": digest.hexdigest(), + } + metadata_path = Path(args.metadata_file) + metadata_path.parent.mkdir(parents=True, exist_ok=True) + metadata_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tooling/export_release.py b/tooling/export_release.py index ab320cd..b920c96 100755 --- a/tooling/export_release.py +++ b/tooling/export_release.py @@ -20,6 +20,7 @@ # letter-suffixed branches that are part of the Interstate system. INTERSTATE_FILTER = r"^(?:I-?[0-9]+|I-?35[EW]|I-?69[CEW])$" INTERSTATE_NAME_RE = re.compile(INTERSTATE_FILTER) +SHA256_RE = re.compile(r"^[0-9a-f]{64}$") @dataclass(frozen=True) @@ -34,16 +35,24 @@ def is_release_interstate_name(highway: str) -> bool: return bool(INTERSTATE_NAME_RE.fullmatch(highway.strip().upper())) -def parse_args() -> argparse.Namespace: +def parse_args(argv: list[str] | None = None) -> argparse.Namespace: parser = argparse.ArgumentParser(description="Export OpenInterstate v1 release artifacts.") parser.add_argument("--database-url", required=True) parser.add_argument("--release-id", required=True) parser.add_argument("--output-dir", required=True) parser.add_argument("--state-dir") - parser.add_argument("--source-pbf-file", required=True) + parser.add_argument("--source-pbf-file") + parser.add_argument("--source-pbf-metadata-file") parser.add_argument("--import-pbf-file") parser.add_argument("--source-url") - return parser.parse_args() + args = parser.parse_args(argv) + has_source_file = bool(args.source_pbf_file) + has_source_metadata = bool(args.source_pbf_metadata_file) + if has_source_file == has_source_metadata: + parser.error("exactly one of --source-pbf-file or --source-pbf-metadata-file is required") + if has_source_metadata and not args.import_pbf_file: + parser.error("--import-pbf-file is required when --source-pbf-metadata-file is used") + return args def ensure_dirs(output_dir: Path) -> tuple[Path, Path, Path]: @@ -139,6 +148,46 @@ def build_source_file_metadata( return metadata +def validate_source_file_metadata(raw: Any, label: str) -> dict[str, Any]: + if not isinstance(raw, dict): + raise ValueError(f"{label} metadata must be a JSON object") + + path = raw.get("path") + filename = raw.get("filename") + size_bytes = raw.get("size_bytes") + modified_at = raw.get("modified_at") + sha256 = raw.get("sha256") + + if not isinstance(path, str) or not path.strip(): + raise ValueError(f"{label} metadata must include a non-empty path") + if not isinstance(filename, str) or not filename.strip(): + raise ValueError(f"{label} metadata must include a non-empty filename") + if not isinstance(size_bytes, int) or size_bytes < 0: + raise ValueError(f"{label} metadata must include a non-negative integer size_bytes") + if not isinstance(modified_at, str) or not modified_at.strip(): + raise ValueError(f"{label} metadata must include a non-empty modified_at") + if not isinstance(sha256, str) or not SHA256_RE.fullmatch(sha256): + raise ValueError(f"{label} metadata must include a lowercase 64-character sha256") + + return { + "path": path, + "filename": filename, + "size_bytes": size_bytes, + "modified_at": modified_at, + "sha256": sha256, + } + + +def load_source_file_metadata(path: Path, label: str) -> dict[str, Any]: + try: + raw = json.loads(path.read_text(encoding="utf-8")) + except FileNotFoundError as exc: + raise ValueError(f"{label} metadata file not 
found: {path}") from exc + except json.JSONDecodeError as exc: + raise ValueError(f"{label} metadata file is not valid JSON: {path}") from exc + return validate_source_file_metadata(raw, label) + + def write_checksums( files: list[Path], output_path: Path, @@ -231,14 +280,21 @@ def main() -> None: output_dir = Path(args.output_dir).resolve() state_dir = Path(args.state_dir).resolve() if args.state_dir else None csv_dir, gpx_dir, examples_dir = ensure_dirs(output_dir) - source_pbf_path = Path(args.source_pbf_file).resolve() + source_pbf_path = Path(args.source_pbf_file).resolve() if args.source_pbf_file else None import_pbf_path = Path(args.import_pbf_file).resolve() if args.import_pbf_file else source_pbf_path hash_cache: dict[tuple[str, int, int], str] = {} + if source_pbf_path is not None: + source_pbf_metadata = build_source_file_metadata(source_pbf_path, state_dir, hash_cache) + else: + source_pbf_metadata = load_source_file_metadata(Path(args.source_pbf_metadata_file).resolve(), "source_pbf") + assert import_pbf_path is not None + import_pbf_metadata = build_source_file_metadata(import_pbf_path, state_dir, hash_cache) + source_lineage = { "source_url": args.source_url, - "source_pbf": build_source_file_metadata(source_pbf_path, state_dir, hash_cache), - "import_pbf": build_source_file_metadata(import_pbf_path, state_dir, hash_cache), + "source_pbf": source_pbf_metadata, + "import_pbf": import_pbf_metadata, "derivation": [ "osm2pgsql flex import via schema/osm2pgsql/openinterstate.lua", "schema/derive.sql", diff --git a/tooling/tests/test_export_release.py b/tooling/tests/test_export_release.py index e17e350..e62446c 100644 --- a/tooling/tests/test_export_release.py +++ b/tooling/tests/test_export_release.py @@ -1,5 +1,7 @@ import importlib.util +import json import sys +import tempfile import types import unittest from pathlib import Path @@ -32,5 +34,81 @@ def test_non_route_labels_are_excluded(self) -> None: self.assertFalse(MODULE.is_release_interstate_name(highway)) +class SourceFileMetadataTests(unittest.TestCase): + def test_parse_args_accepts_source_metadata_with_import_file(self) -> None: + args = MODULE.parse_args( + [ + "--database-url", + "postgres://db", + "--release-id", + "release-2026-03-12", + "--output-dir", + "/tmp/release", + "--source-pbf-metadata-file", + "/tmp/source.json", + "--import-pbf-file", + "/tmp/import.osm.pbf", + ] + ) + self.assertEqual(args.source_pbf_metadata_file, "/tmp/source.json") + self.assertEqual(args.import_pbf_file, "/tmp/import.osm.pbf") + + def test_parse_args_rejects_missing_source_locator(self) -> None: + with self.assertRaises(SystemExit): + MODULE.parse_args( + [ + "--database-url", + "postgres://db", + "--release-id", + "release-2026-03-12", + "--output-dir", + "/tmp/release", + ] + ) + + def test_parse_args_rejects_metadata_without_import_file(self) -> None: + with self.assertRaises(SystemExit): + MODULE.parse_args( + [ + "--database-url", + "postgres://db", + "--release-id", + "release-2026-03-12", + "--output-dir", + "/tmp/release", + "--source-pbf-metadata-file", + "/tmp/source.json", + ] + ) + + def test_load_source_file_metadata_accepts_streamed_source_locator(self) -> None: + metadata = { + "path": "https://download.geofabrik.de/north-america/us-latest.osm.pbf", + "filename": "us-latest.osm.pbf", + "size_bytes": 123, + "modified_at": "2026-03-12T00:00:00+00:00", + "sha256": "a" * 64, + } + with tempfile.TemporaryDirectory() as tmpdir: + metadata_path = Path(tmpdir) / "source.json" + metadata_path.write_text(json.dumps(metadata), 
encoding="utf-8") + loaded = MODULE.load_source_file_metadata(metadata_path, "source_pbf") + self.assertEqual(loaded, metadata) + + def test_load_source_file_metadata_rejects_bad_sha256(self) -> None: + metadata = { + "path": "streamed://us-latest.osm.pbf", + "filename": "us-latest.osm.pbf", + "size_bytes": 123, + "modified_at": "2026-03-12T00:00:00+00:00", + "sha256": "xyz", + } + with tempfile.TemporaryDirectory() as tmpdir: + metadata_path = Path(tmpdir) / "source.json" + metadata_path.write_text(json.dumps(metadata), encoding="utf-8") + with self.assertRaisesRegex(ValueError, "sha256"): + MODULE.load_source_file_metadata(metadata_path, "source_pbf") + + if __name__ == "__main__": unittest.main() diff --git a/tooling/validate_repo.sh b/tooling/validate_repo.sh index 99cbd03..1aa0927 100755 --- a/tooling/validate_repo.sh +++ b/tooling/validate_repo.sh @@ -21,12 +21,17 @@ required_files=( "examples/duckdb/example_queries.sql" ".env.example" "compose.yaml" + ".github/workflows/release-build.yml" "bin/openinterstate" "bin/lib.sh" "docker/runner/Dockerfile" "schema/bootstrap.sql" "schema/derive.sql" "schema/osm2pgsql/openinterstate.lua" + "tooling/ci/stream_source_pbf.py" + "tooling/ci/prefilter_stream.sh" + "tooling/ci/build_release_host.sh" + "tooling/ci/publish_release.sh" "tooling/export_release.py" "tooling/requirements.txt" "tooling/validate_repo.sh" @@ -38,7 +43,13 @@ for file in "${required_files[@]}"; do test -f "$file" done -bash -n bin/openinterstate bin/lib.sh tooling/validate_repo.sh -python3 -m py_compile tooling/export_release.py +bash -n \ + bin/openinterstate \ + bin/lib.sh \ + tooling/validate_repo.sh \ + tooling/ci/prefilter_stream.sh \ + tooling/ci/build_release_host.sh \ + tooling/ci/publish_release.sh +python3 -m py_compile tooling/export_release.py tooling/ci/stream_source_pbf.py cargo fmt --all --check cargo test --workspace --all-targets
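
Pieced together from the usage blocks above, a local smoke run of the two new
CI stages against the small Rhode Island extract might look like the sketch
below. Everything here is illustrative: it assumes `docker`, `osmium`,
`osm2pgsql`, `psql`, and the repo's Rust and Python toolchains are installed,
and that the work dir has at least the script's default 7 GB of free disk.

```bash
# Stage 1: stream the source PBF, record its metadata, and keep only the
# filtered canonical import PBF (mirrors the prefilter job).
tooling/ci/prefilter_stream.sh \
  --source-url "https://download.geofabrik.de/north-america/us/rhode-island-latest.osm.pbf" \
  --output-pbf "$PWD/ci-inputs/rhode-island-latest.canonical-filtered.osm.pbf" \
  --source-metadata-file "$PWD/ci-inputs/source-pbf-metadata.json"

# Stage 2: rebuild PostGIS, derive product tables, and export a release from
# the filtered artifact (mirrors the build-release job).
tooling/ci/build_release_host.sh \
  --release-id "release-$(date -u +%F)-local" \
  --filtered-pbf-file "$PWD/ci-inputs/rhode-island-latest.canonical-filtered.osm.pbf" \
  --source-pbf-metadata-file "$PWD/ci-inputs/source-pbf-metadata.json" \
  --source-url "https://download.geofabrik.de/north-america/us/rhode-island-latest.osm.pbf" \
  --output-root "$PWD/ci-artifacts/releases" \
  --work-dir "$PWD/tmp/openinterstate-release-build"
```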