diff --git a/.env.example b/.env.example index 0b299f3..6d48ee4 100644 --- a/.env.example +++ b/.env.example @@ -5,18 +5,30 @@ # Host port exposed by compose.yaml for the local PostGIS service. OI_DB_PORT=5434 -# Optional external data workspace. Useful when `/` is space-constrained. +# Parent directory for hash-addressed PBF workspaces. +# Default: +# OI_DATA_PARENT=/Volumes/goose-drive/openinterstate +# +# With the default layout, a source PBF with SHA-256 <sha256> uses: +# $OI_DATA_PARENT/workspaces/pbf-sha256/<sha256> +# Raw source downloads are cached under: +# $OI_DATA_PARENT/source-cache +# Cargo build cache is shared under: +# $OI_DATA_PARENT/cache/cargo + +# Optional explicit workspace override. Leave this unset unless you need to pin +# the build to a specific directory instead of using the PBF SHA-derived path. # Example: -# OI_DATA_ROOT=/Volumes/goose-drive/openinterstate-data +# OI_DATA_ROOT=/Volumes/goose-drive/openinterstate/workspaces/pbf-sha256/<sha256> # Optional release output root. Defaults to $OI_DATA_ROOT/releases. # Example: -# OI_RELEASE_DIR=/Volumes/goose-drive/openinterstate-releases +# OI_RELEASE_DIR=/Volumes/goose-drive/openinterstate/releases -# Optional cache roots. By default these now live under $OI_DATA_ROOT/cache so -# Docker + Cargo state can stay on an external volume too. +# Optional cache roots. Cargo cache now defaults under $OI_DATA_PARENT/cache so +# Rust build artifacts are reused across PBF workspaces. # Example: -# OI_CARGO_TARGET_DIR=/Volumes/goose-drive/openinterstate-data/cache/cargo/target +# OI_CARGO_TARGET_DIR=/Volumes/goose-drive/openinterstate/cache/cargo/target # Default source file used by `./bin/openinterstate build`. 
OI_DEFAULT_US_PBF_URL=https://download.geofabrik.de/north-america/us-latest.osm.pbf diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..cec232d --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,54 @@ +# OpenInterstate Agent Notes + +## Default Workspace + +- Default the managed parent root to `/Volumes/goose-drive/openinterstate`. +- In normal operation, do not choose an explicit `OI_DATA_ROOT` up front. Instead, resolve the source PBF first, hash it with SHA-256, and use the workspace path `/Volumes/goose-drive/openinterstate/workspaces/pbf-sha256/<sha256>/`. +- Treat `/Volumes/goose-drive/openinterstate/source-cache` as the shared raw-source download cache. +- Treat `/Volumes/goose-drive/openinterstate/cache/cargo` as the shared Rust build cache across all PBF workspaces. +- Let release artifacts default under the selected PBF workspace, which means `<workspace-root>/releases`, unless the user explicitly overrides the release root. +- Only use `--data-dir` or `OI_DATA_ROOT` when the user explicitly asks to pin an exact workspace path and bypass the SHA-derived default. + +## Import And Derive Workflow + +- Treat canonical PBF import and derive as separate stages when planning work. +- Prefer derive-only iteration against the existing canonical database whenever the current import is still usable. +- Before proposing or running any fresh PBF import, first compute or read the PBF SHA and check the matching workspace under `workspaces/pbf-sha256/<sha256>/`. +- Reuse an existing downloaded source PBF, filtered canonical PBF, and canonical osm2pgsql import whenever their inputs and mappings are still valid. +- Avoid re-importing PBF data unless it is clearly necessary because the source changed, the import mapping changed, the canonical database is missing or invalid, or the user explicitly requests a re-import. 
+ +## Operational Bias + +- When discussing or running local build commands, default to `--data-parent /Volumes/goose-drive/openinterstate` rather than a repo-local `.data/` directory or a hard-coded workspace path. +- If derive-stage work is requested, start from the assumption that the existing canonical import should be preserved and reused. +- If a re-import is required, explain why it is necessary before doing it. + +## Pike SQLite Follow-On + +- After exporting an OpenInterstate release locally, also build Pike's SQLite pack from that exact release archive unless the user explicitly says not to. +- Use Pike's supported release-driven pipeline entrypoint from `/Users/tjohnell/projects/pike/server`: `./pike-pipeline.sh build --release-file /abs/path/openinterstate-release-<release-id>.tar.gz --reachability-snapshot /Volumes/goose-drive/pike-osrm/reachability/pike.osrm-reachability.snapshot.pgdump`. +- Let Pike keep its own default output locations unless the user asks otherwise. The current default host pack output is `/Users/tjohnell/projects/pike/server/.data/packs/pike.sqlite` and the staged build file is `/Users/tjohnell/projects/pike/server/.data/packs/pike.sqlite.new`. +- After the Pike build finishes, validate the pack with `sqlite3` by checking `PRAGMA integrity_check;` and confirming the `meta` table reports the matching `openinterstate_release_id`. + +## Named Comparison: Pike Interstate Exit Coverage Diff + +- If the user asks to rerun the comparison, refer to it as `Pike Interstate Exit Coverage Diff`. +- Purpose: compare the latest OpenInterstate-derived Pike pack against the latest published Pike release pack, limited to Interstate corridor and exit coverage. 
+- Inputs: + - OpenInterstate-derived Pike pack: `/Users/tjohnell/projects/pike/server/.data/packs/pike.sqlite` + - Latest published Pike release pack on NFS: newest `/Volumes/goose-plex-media/pike/releases/*/pike.sqlite` +- Before comparing, stage the latest published Pike release pack off NFS into `/Users/tjohnell/projects/pike/server/.data/compare/`. If a same-size, same-mtime local staged copy already exists, reuse it instead of copying again. +- Compare by `highway + canonical_direction`, starting from the OpenInterstate-derived pack's Interstate routes. +- Union exits across duplicate corridor rows for the same `highway + canonical_direction` key before counting or diffing. +- For route-level exit comparison, use distinct exit `ref` values when present. If a route has no usable `ref` values, fall back to a stable label such as `name` or `exit_id`. +- Separate findings into at least three buckets: + - likely real gaps where the published Pike release is a near-superset of the OpenInterstate-derived route + - likely real gaps where the OpenInterstate-derived route is a near-superset of the published Pike release + - likely key pollution or route conflation where one side has far more exits and low overlap +- Always report: + - route-level counts for both packs + - shared exit count + - exits only in OpenInterstate-derived pack + - exits only in published Pike release + - a short list of representative exit refs from each side for the biggest differences +- Write a durable comparison CSV into `/Users/tjohnell/projects/pike/server/.data/compare/` named like `openinterstate-<openinterstate-release>-vs-pike-<pike-release>-route-exit-compare.csv`. 
diff --git a/README.md b/README.md index cdfc45b..8840f73 100644 --- a/README.md +++ b/README.md @@ -15,36 +15,35 @@ The repo is organized around one job: If Docker is installed, this works from a fresh clone: ```bash -./bin/openinterstate build +./bin/openinterstate --data-parent /Volumes/goose-drive/openinterstate build ``` That command downloads `us-latest.osm.pbf`, starts PostGIS, imports canonical -OSM, derives product tables, and writes a release into `.data/releases/` by -default. +OSM, derives product tables, and writes a release under a workspace chosen from +the source PBF SHA-256: -If your main disk is tight, move the managed data workspace onto another volume: - -```bash -./bin/openinterstate --data-dir /Volumes/goose-drive/openinterstate-data build +```text +/Volumes/goose-drive/openinterstate/workspaces/pbf-sha256/<sha256>/ ``` -With that command, working data and release artifacts both land under -`/Volumes/goose-drive/openinterstate-data/`. - -Runner caches now follow the managed data root too, so Cargo registry/git -state and the Rust target directory stay on the external volume instead of -quietly growing inside Docker-managed local storage. +Raw source downloads are shared under +`/Volumes/goose-drive/openinterstate/source-cache/`, and Cargo cache is shared +under `/Volumes/goose-drive/openinterstate/cache/cargo/` so Rust builds are +reused across PBF workspaces. If you want release artifacts in a separate folder, set an explicit release root: ```bash ./bin/openinterstate \ - --data-dir /Volumes/goose-drive/openinterstate-data \ - --release-dir /Volumes/goose-drive/openinterstate-releases \ + --data-parent /Volumes/goose-drive/openinterstate \ + --release-dir /Volumes/goose-drive/openinterstate/releases \ build ``` +If you need to pin an exact workspace path and bypass the SHA-derived layout, +use `--data-dir` as an explicit override. 
+ When the source PBF, import mapping, derive inputs, and release exporter are unchanged, repeated builds now skip the already-current stages instead of re-downloading or rebuilding them. diff --git a/bin/lib.sh b/bin/lib.sh index bc40e16..4304ace 100755 --- a/bin/lib.sh +++ b/bin/lib.sh @@ -46,6 +46,15 @@ oi_hash_files() { fi } +oi_hash_file_sha256() { + local path="$1" + if command -v shasum >/dev/null 2>&1; then + shasum -a 256 "$path" | awk '{print $1}' + else + sha256sum "$path" | awk '{print $1}' + fi +} + oi_file_signature() { local path="$1" if stat -c '%n|%s|%Y' "$path" >/dev/null 2>&1; then @@ -152,7 +161,53 @@ oi_load_env() { done } +oi_export_path_vars() { + export OI_DATA_PARENT OI_SOURCE_CACHE_DIR OI_INDEX_DIR OI_PARENT_CACHE_DIR OI_WORKSPACES_DIR + export OI_DATA_ROOT OI_POSTGRES_DIR OI_FLATNODES_DIR OI_DOWNLOAD_DIR OI_FILTERED_DIR + export OI_STATE_DIR OI_CACHE_DIR OI_CARGO_REGISTRY_DIR OI_CARGO_GIT_DIR OI_CARGO_TARGET_DIR + export OI_RELEASE_DIR OI_BUILD_DIR OI_PBF_SHA256 +} + +oi_configure_data_root() { + local data_root="$1" + local release_root + + OI_DATA_ROOT="$(oi_abs_path "$data_root")" + OI_POSTGRES_DIR="$(oi_abs_path "$OI_DATA_ROOT/postgres/db")" + OI_FLATNODES_DIR="$(oi_abs_path "$OI_DATA_ROOT/flatnodes")" + OI_DOWNLOAD_DIR="$(oi_abs_path "$OI_DATA_ROOT/downloads")" + OI_FILTERED_DIR="$(oi_abs_path "$OI_DATA_ROOT/filtered")" + OI_STATE_DIR="$(oi_abs_path "$OI_DATA_ROOT/state")" + OI_CACHE_DIR="$(oi_abs_path "$OI_DATA_ROOT/cache")" + + if [[ "$OI_RELEASE_DIR_IS_EXPLICIT" == true ]]; then + release_root="$OI_RELEASE_DIR" + elif [[ "$OI_BUILD_DIR_IS_EXPLICIT" == true ]]; then + release_root="$OI_BUILD_DIR" + else + release_root="$OI_DATA_ROOT/releases" + fi + OI_RELEASE_DIR="$(oi_abs_path "$release_root")" + OI_BUILD_DIR="$OI_RELEASE_DIR" + + oi_export_path_vars +} + oi_set_defaults() { + local data_root_was_set=false + local release_dir_was_set=false + local build_dir_was_set=false + + if [[ -n "${OI_DATA_ROOT+x}" ]]; then + 
data_root_was_set=true + fi + if [[ -n "${OI_RELEASE_DIR+x}" ]]; then + release_dir_was_set=true + fi + if [[ -n "${OI_BUILD_DIR+x}" ]]; then + build_dir_was_set=true + fi + OI_DB_PORT="${OI_DB_PORT:-5434}" OI_DEFAULT_US_PBF_URL="${OI_DEFAULT_US_PBF_URL:-https://download.geofabrik.de/north-america/us-latest.osm.pbf}" OSM2PGSQL_MODE="${OSM2PGSQL_MODE:-auto}" @@ -170,26 +225,52 @@ oi_set_defaults() { OI_DB_CONTAINER_PORT="${OI_DB_CONTAINER_PORT:-5432}" PRODUCT_DB_URL="${PRODUCT_DB_URL:-postgres://${OI_DB_USER}:${OI_DB_PASSWORD}@${OI_DB_HOST}:${OI_DB_CONTAINER_PORT}/${OI_DB_NAME}}" - OI_DATA_ROOT="$(oi_abs_path "${OI_DATA_ROOT:-$REPO_ROOT/.data}")" - OI_POSTGRES_DIR="$(oi_abs_path "${OI_POSTGRES_DIR:-$OI_DATA_ROOT/postgres/db}")" - OI_FLATNODES_DIR="$(oi_abs_path "${OI_FLATNODES_DIR:-$OI_DATA_ROOT/flatnodes}")" - OI_DOWNLOAD_DIR="$(oi_abs_path "${OI_DOWNLOAD_DIR:-$OI_DATA_ROOT/downloads}")" - OI_FILTERED_DIR="$(oi_abs_path "${OI_FILTERED_DIR:-$OI_DATA_ROOT/filtered}")" - OI_STATE_DIR="$(oi_abs_path "${OI_STATE_DIR:-$OI_DATA_ROOT/state}")" - OI_CACHE_DIR="$(oi_abs_path "${OI_CACHE_DIR:-$OI_DATA_ROOT/cache}")" - OI_CARGO_REGISTRY_DIR="$(oi_abs_path "${OI_CARGO_REGISTRY_DIR:-$OI_CACHE_DIR/cargo/registry}")" - OI_CARGO_GIT_DIR="$(oi_abs_path "${OI_CARGO_GIT_DIR:-$OI_CACHE_DIR/cargo/git}")" - OI_CARGO_TARGET_DIR="$(oi_abs_path "${OI_CARGO_TARGET_DIR:-$OI_CACHE_DIR/cargo/target}")" - OI_RELEASE_DIR="$(oi_abs_path "${OI_RELEASE_DIR:-${OI_BUILD_DIR:-$OI_DATA_ROOT/releases}}")" - OI_BUILD_DIR="$OI_RELEASE_DIR" + OI_DATA_ROOT_IS_EXPLICIT="$data_root_was_set" + OI_RELEASE_DIR_IS_EXPLICIT="$release_dir_was_set" + OI_BUILD_DIR_IS_EXPLICIT="$build_dir_was_set" + if [[ "$OI_DATA_ROOT_IS_EXPLICIT" == true ]]; then + OI_DATA_PARENT="$(oi_abs_path "${OI_DATA_PARENT:-$OI_DATA_ROOT}")" + else + OI_DATA_PARENT="$(oi_abs_path "${OI_DATA_PARENT:-/Volumes/goose-drive/openinterstate}")" + fi + OI_SOURCE_CACHE_DIR="$(oi_abs_path "${OI_SOURCE_CACHE_DIR:-$OI_DATA_PARENT/source-cache}")" + 
OI_INDEX_DIR="$(oi_abs_path "${OI_INDEX_DIR:-$OI_DATA_PARENT/index}")" + OI_PARENT_CACHE_DIR="$(oi_abs_path "${OI_PARENT_CACHE_DIR:-$OI_DATA_PARENT/cache}")" + OI_WORKSPACES_DIR="$(oi_abs_path "${OI_WORKSPACES_DIR:-$OI_DATA_PARENT/workspaces/pbf-sha256}")" + OI_PBF_SHA256="${OI_PBF_SHA256:-}" + OI_CARGO_REGISTRY_DIR="$(oi_abs_path "${OI_CARGO_REGISTRY_DIR:-$OI_PARENT_CACHE_DIR/cargo/registry}")" + OI_CARGO_GIT_DIR="$(oi_abs_path "${OI_CARGO_GIT_DIR:-$OI_PARENT_CACHE_DIR/cargo/git}")" + OI_CARGO_TARGET_DIR="$(oi_abs_path "${OI_CARGO_TARGET_DIR:-$OI_PARENT_CACHE_DIR/cargo/target}")" + + if [[ "$OI_DATA_ROOT_IS_EXPLICIT" == true ]]; then + oi_configure_data_root "$OI_DATA_ROOT" + else + oi_configure_data_root "$OI_DATA_PARENT" + fi export OI_DB_PORT - export OI_DATA_ROOT OI_POSTGRES_DIR OI_FLATNODES_DIR OI_DOWNLOAD_DIR OI_FILTERED_DIR - export OI_STATE_DIR OI_CACHE_DIR OI_CARGO_REGISTRY_DIR OI_CARGO_GIT_DIR OI_CARGO_TARGET_DIR - export OI_RELEASE_DIR OI_BUILD_DIR OI_FLATNODES_MODE OI_FLATNODES_AUTO_MAX_PBF_MB OI_IMPORT_CACHE_MB + export OI_DATA_ROOT_IS_EXPLICIT OI_RELEASE_DIR_IS_EXPLICIT OI_BUILD_DIR_IS_EXPLICIT + export OI_FLATNODES_MODE OI_FLATNODES_AUTO_MAX_PBF_MB OI_IMPORT_CACHE_MB +} + +oi_prepare_parent_dirs() { + mkdir -p \ + "$OI_DATA_PARENT" \ + "$OI_SOURCE_CACHE_DIR" \ + "$OI_INDEX_DIR" \ + "$OI_PARENT_CACHE_DIR" \ + "$OI_WORKSPACES_DIR" \ + "$OI_CARGO_REGISTRY_DIR" \ + "$OI_CARGO_GIT_DIR" \ + "$OI_CARGO_TARGET_DIR" + + if [[ "$OI_RELEASE_DIR_IS_EXPLICIT" == true ]]; then + mkdir -p "$OI_RELEASE_DIR" + fi } oi_prepare_dirs() { + oi_prepare_parent_dirs mkdir -p \ "$OI_DATA_ROOT" \ "$OI_POSTGRES_DIR" \ @@ -278,24 +359,117 @@ oi_path_is_in_data_root() { oi_path_is_under "$1" "$OI_DATA_ROOT" } +oi_path_is_in_data_parent() { + oi_path_is_under "$1" "$OI_DATA_PARENT" +} + oi_path_is_in_release_root() { oi_path_is_under "$1" "$OI_RELEASE_DIR" } oi_path_is_managed() { local path="$1" - oi_path_is_in_repo "$path" || oi_path_is_in_data_root "$path" || 
oi_path_is_in_release_root "$path" + oi_path_is_in_repo "$path" || oi_path_is_in_data_root "$path" || oi_path_is_in_data_parent "$path" || oi_path_is_in_release_root "$path" } oi_managed_path() { local path path="$(oi_abs_path "$1")" if ! oi_path_is_managed "$path"; then - oi_die "path must live inside the repository or data directory: $path" + oi_die "path must live inside the repository or managed data parent: $path" fi printf '%s\n' "$path" } +oi_parent_state_file() { + local scope="$1" + local key="$2" + printf '%s/%s-%s.state\n' "$OI_INDEX_DIR" "$scope" "$(oi_hash_text "$key")" +} + +oi_source_pbf_sha256() { + local source_pbf="$1" + local abs_source signature state_file cached_signature cached_sha256 + + abs_source="$(oi_abs_path "$source_pbf")" + [[ -f "$abs_source" ]] || oi_die "source PBF not found: $abs_source" + + signature="$(oi_file_signature "$abs_source")" + state_file="$(oi_parent_state_file pbf-sha256 "$signature")" + cached_signature="$(oi_state_read "$state_file" signature 2>/dev/null || true)" + cached_sha256="$(oi_state_read "$state_file" sha256 2>/dev/null || true)" + if [[ "$cached_signature" == "$signature" && ${#cached_sha256} -eq 64 ]]; then + printf '%s\n' "$cached_sha256" + return 0 + fi + + oi_log "Hashing source PBF to select workspace" + echo " source: $abs_source" >&2 + cached_sha256="$(oi_hash_file_sha256 "$abs_source")" + oi_state_write "$state_file" \ + signature "$signature" \ + source_pbf "$abs_source" \ + sha256 "$cached_sha256" \ + completed_at "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" + printf '%s\n' "$cached_sha256" +} + +oi_workspace_root_for_sha256() { + local sha256="$1" + printf '%s/%s\n' "$OI_WORKSPACES_DIR" "$sha256" +} + +oi_json_escape() { + local value="$1" + value="${value//\\/\\\\}" + value="${value//\"/\\\"}" + value="${value//$'\n'/\\n}" + printf '%s' "$value" +} + +oi_write_workspace_metadata() { + local source_pbf="$1" + local source_url="${2:-}" + local metadata_file="$OI_DATA_ROOT/workspace.json" + local 
metadata_tmp="${metadata_file}.tmp.$$" + + mkdir -p "$OI_DATA_ROOT" + { + printf '{\n' + printf ' "layout": "pbf-sha256",\n' + printf ' "pbf_sha256": "%s",\n' "$(oi_json_escape "$OI_PBF_SHA256")" + printf ' "data_parent": "%s",\n' "$(oi_json_escape "$OI_DATA_PARENT")" + printf ' "workspace_root": "%s",\n' "$(oi_json_escape "$OI_DATA_ROOT")" + printf ' "release_root": "%s",\n' "$(oi_json_escape "$OI_RELEASE_DIR")" + printf ' "source_pbf": "%s"' "$(oi_json_escape "$source_pbf")" + if [[ -n "$source_url" ]]; then + printf ',\n "source_url": "%s"\n' "$(oi_json_escape "$source_url")" + else + printf '\n' + fi + printf '}\n' + } > "$metadata_tmp" + mv "$metadata_tmp" "$metadata_file" +} + +oi_activate_workspace_for_source_pbf() { + local source_pbf="$1" + local source_url="${2:-}" + local abs_source source_sha workspace_root + + abs_source="$(oi_abs_path "$source_pbf")" + [[ -f "$abs_source" ]] || oi_die "source PBF not found: $abs_source" + + # A given source PBF always resolves to the same workspace path. 
+ source_sha="$(oi_source_pbf_sha256 "$abs_source")" + workspace_root="$(oi_workspace_root_for_sha256 "$source_sha")" + + OI_PBF_SHA256="$source_sha" + oi_configure_data_root "$workspace_root" + oi_prepare_dirs + oi_write_workspace_metadata "$abs_source" "$source_url" +} + oi_stage_input_file() { local source="$1" local abs_source staged_path @@ -349,6 +523,23 @@ oi_container_path() { return 0 fi + if oi_path_is_in_data_root "$host_path"; then + rel_path="${host_path#$OI_DATA_ROOT/}" + printf '/data/%s\n' "$rel_path" + return 0 + fi + + if [[ "$host_path" == "$OI_DATA_PARENT" ]]; then + printf '/managed\n' + return 0 + fi + + if oi_path_is_in_data_parent "$host_path"; then + rel_path="${host_path#$OI_DATA_PARENT/}" + printf '/managed/%s\n' "$rel_path" + return 0 + fi + rel_path="${host_path#$OI_DATA_ROOT/}" printf '/data/%s\n' "$rel_path" } @@ -375,7 +566,7 @@ oi_download_pbf() { local -a curl_args=() if [[ -z "$output_path" ]]; then - resolved_output="$OI_DOWNLOAD_DIR/$(basename "$source_url")" + resolved_output="$OI_SOURCE_CACHE_DIR/$(basename "$source_url")" else resolved_output="$(oi_managed_path "$output_path")" fi @@ -396,7 +587,7 @@ oi_download_pbf() { curl_args+=(-z "$(oi_container_path "$resolved_output")") fi - oi_runner curl "${curl_args[@]}" "$source_url" -o "$(oi_container_path "$resolved_output")" || return $? + oi_runner curl "${curl_args[@]}" "$source_url" -o "$(oi_container_path "$resolved_output")" >&2 || return $? printf '%s\n' "$resolved_output" } @@ -472,7 +663,7 @@ oi_filter_pbf() { "$(oi_container_path "$input_pbf")" \ "${filter_args[@]}" \ --overwrite \ - -o "$(oi_container_path "$output_tmp")" || return $? + -o "$(oi_container_path "$output_tmp")" >&2 || return $? 
mv "$output_tmp" "$output_pbf" oi_state_write "$state_file" \ @@ -578,9 +769,10 @@ oi_import_canonical() { local source_pbf="$1" local prefilter="${2:-true}" local force_prefilter="${3:-false}" - local import_pbf pbf_basename pbf_stem filtered_output import_mode + local import_pbf pbf_basename pbf_stem filtered_output import_mode requested_mode signature_mode local flatnodes_path drop_middle mapping_file - local import_state_file import_signature import_size_bytes + local import_state_file import_signature import_size_bytes stored_signature + local legacy_create_signature="" legacy_append_signature="" local use_flatnodes=false flatnodes_mode threshold_bytes local cache_mb local -a osm2pgsql_args=() @@ -601,6 +793,7 @@ oi_import_canonical() { import_pbf="$(oi_filter_pbf "$source_pbf" "$filtered_output" "$force_prefilter")" fi + requested_mode="${OSM2PGSQL_MODE:-auto}" import_mode="$(oi_resolve_import_mode)" if [[ "$import_mode" == "append" ]]; then if oi_canonical_db_updatable; then @@ -630,16 +823,43 @@ oi_import_canonical() { import_size_bytes="$(oi_file_size_bytes "$import_pbf")" threshold_bytes=$(( ${OI_FLATNODES_AUTO_MAX_PBF_MB:-1024} * 1024 * 1024 )) import_state_file="$(oi_state_file import "$OI_DATA_ROOT|$OI_DB_NAME")" + signature_mode="$import_mode" + if [[ "$requested_mode" == "auto" ]]; then + signature_mode="auto" + fi import_signature="$( { oi_file_signature "$import_pbf" printf 'mapping=%s\n' "$(oi_hash_files "$mapping_file")" - printf 'mode=%s\n' "$import_mode" + printf 'mode=%s\n' "$signature_mode" printf 'drop_middle=%s\n' "$drop_middle" printf 'flatnodes_mode=%s\n' "$flatnodes_mode" printf 'cache_mb=%s\n' "$cache_mb" } | oi_hash_stdin )" + stored_signature="$(oi_state_read "$import_state_file" signature 2>/dev/null || true)" + if [[ "$requested_mode" == "auto" ]]; then + legacy_create_signature="$( + { + oi_file_signature "$import_pbf" + printf 'mapping=%s\n' "$(oi_hash_files "$mapping_file")" + printf 'mode=create\n' + printf 'drop_middle=%s\n' 
"$drop_middle" + printf 'flatnodes_mode=%s\n' "$flatnodes_mode" + printf 'cache_mb=%s\n' "$cache_mb" + } | oi_hash_stdin + )" + legacy_append_signature="$( + { + oi_file_signature "$import_pbf" + printf 'mapping=%s\n' "$(oi_hash_files "$mapping_file")" + printf 'mode=append\n' + printf 'drop_middle=%s\n' "$drop_middle" + printf 'flatnodes_mode=%s\n' "$flatnodes_mode" + printf 'cache_mb=%s\n' "$cache_mb" + } | oi_hash_stdin + )" + fi case "$flatnodes_mode" in always) @@ -662,10 +882,24 @@ oi_import_canonical() { oi_cleanup_unused_flatnodes "$flatnodes_path" fi - if oi_canonical_tables_exist && [[ "$(oi_state_read "$import_state_file" signature 2>/dev/null || true)" == "$import_signature" ]]; then + if oi_canonical_tables_exist && { + [[ "$stored_signature" == "$import_signature" ]] || { + [[ "$requested_mode" == "auto" ]] && { + [[ "$stored_signature" == "$legacy_create_signature" ]] || [[ "$stored_signature" == "$legacy_append_signature" ]] + } + } + }; then oi_assert_canonical_import_ready oi_log "Skipping canonical osm2pgsql import; input and mapping are unchanged" echo " input: $import_pbf" >&2 + oi_state_write "$import_state_file" \ + signature "$import_signature" \ + source_pbf "$source_pbf" \ + import_pbf "$import_pbf" \ + mode "$import_mode" \ + requested_mode "$requested_mode" \ + use_flatnodes "$use_flatnodes" \ + completed_at "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" printf '%s\n' "$import_pbf" return 0 fi @@ -709,13 +943,14 @@ oi_import_canonical() { fi oi_runner env PGPASSWORD="$OI_DB_PASSWORD" \ - osm2pgsql "${osm2pgsql_args[@]}" "$(oi_container_path "$import_pbf")" || return $? + osm2pgsql "${osm2pgsql_args[@]}" "$(oi_container_path "$import_pbf")" >&2 || return $? 
oi_assert_canonical_import_ready oi_state_write "$import_state_file" \ signature "$import_signature" \ source_pbf "$source_pbf" \ import_pbf "$import_pbf" \ mode "$import_mode" \ + requested_mode "$requested_mode" \ use_flatnodes "$use_flatnodes" \ completed_at "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" @@ -729,7 +964,9 @@ oi_extract_interstate_relation_cache() { import_state_file="$(oi_state_file import "$OI_DATA_ROOT|$OI_DB_NAME")" source_pbf="$(oi_state_read "$import_state_file" source_pbf 2>/dev/null || true)" if [[ -z "$source_pbf" || ! -f "$source_pbf" ]]; then - if [[ -f "$OI_DOWNLOAD_DIR/us-latest.osm.pbf" ]]; then + if [[ -f "$OI_SOURCE_CACHE_DIR/us-latest.osm.pbf" ]]; then + source_pbf="$OI_SOURCE_CACHE_DIR/us-latest.osm.pbf" + elif [[ -f "$OI_DOWNLOAD_DIR/us-latest.osm.pbf" ]]; then source_pbf="$OI_DOWNLOAD_DIR/us-latest.osm.pbf" else oi_die "cannot extract Interstate relation cache: source PBF is unavailable" diff --git a/bin/openinterstate b/bin/openinterstate index 86bcfc8..991ae1a 100755 --- a/bin/openinterstate +++ b/bin/openinterstate @@ -10,23 +10,23 @@ ENV_FILE="$DEFAULT_ENV_FILE" usage() { cat <<'USAGE' Usage: - bin/openinterstate [--env-file /abs/path/.env] [--data-dir /abs/path] [--release-dir /abs/path] [args] + bin/openinterstate [--env-file /abs/path/.env] [--data-parent /abs/path] [--data-dir /abs/workspace] [--release-dir /abs/path] [args] Commands: build Download or use a source PBF, import it, derive data, and export a release publish Publish a built release to GitHub -One-command local run: - ./bin/openinterstate build +Default hash-addressed workspace: + ./bin/openinterstate --data-parent /Volumes/goose-drive/openinterstate build -External data volume: - ./bin/openinterstate --data-dir /Volumes/goose-drive/openinterstate-data build - -External data volume with explicit release output: +Explicit release output: ./bin/openinterstate \ - --data-dir /Volumes/goose-drive/openinterstate-data \ - --release-dir 
/Volumes/goose-drive/openinterstate-releases \ + --data-parent /Volumes/goose-drive/openinterstate \ + --release-dir /Volumes/goose-drive/openinterstate/releases \ build + +Explicit workspace override: + ./bin/openinterstate --data-dir /Volumes/goose-drive/openinterstate/workspaces/pbf-sha256/ build USAGE } @@ -42,6 +42,11 @@ while [[ $# -gt 0 ]]; do export OI_DATA_ROOT="$2" shift 2 ;; + --data-parent) + [[ $# -ge 3 ]] || { usage; exit 1; } + export OI_DATA_PARENT="$2" + shift 2 + ;; --release-dir) [[ $# -ge 3 ]] || { usage; exit 1; } export OI_RELEASE_DIR="$2" @@ -64,7 +69,7 @@ shift || true oi_require_cmd docker oi_load_env "$ENV_FILE" oi_set_defaults -oi_prepare_dirs +oi_prepare_parent_dirs oi_compose_cmd "$ENV_FILE" case "$COMMAND" in @@ -73,7 +78,8 @@ case "$COMMAND" in PBF_URL="" SOURCE_URL="" RELEASE_ID="release-$(date +%F)" - OUTPUT_ROOT="$OI_RELEASE_DIR" + OUTPUT_ROOT="" + OUTPUT_ROOT_IS_EXPLICIT=false PREFILTER=true FORCE_PREFILTER=false @@ -97,10 +103,12 @@ case "$COMMAND" in ;; --output-root) OUTPUT_ROOT="$2" + OUTPUT_ROOT_IS_EXPLICIT=true shift 2 ;; --release-dir) OUTPUT_ROOT="$2" + OUTPUT_ROOT_IS_EXPLICIT=true shift 2 ;; --skip-prefilter) @@ -136,6 +144,19 @@ case "$COMMAND" in fi fi + if [[ "$OI_DATA_ROOT_IS_EXPLICIT" == true ]]; then + oi_prepare_dirs + else + PBF_FILE="$(oi_abs_path "$PBF_FILE")" + oi_activate_workspace_for_source_pbf "$PBF_FILE" "$SOURCE_URL" || exit $? 
+ fi + + if [[ "$OUTPUT_ROOT_IS_EXPLICIT" == true ]]; then + OUTPUT_ROOT="$(oi_managed_path "$OUTPUT_ROOT")" + else + OUTPUT_ROOT="$OI_RELEASE_DIR" + fi + oi_db_up oi_wait_for_db @@ -149,19 +170,32 @@ case "$COMMAND" in ;; publish) RELEASE_ID="" - OUTPUT_ROOT="$OI_RELEASE_DIR" + OUTPUT_ROOT="" + OUTPUT_ROOT_IS_EXPLICIT=false + PBF_FILE="" + PBF_URL="" while [[ $# -gt 0 ]]; do case "$1" in --release-id) RELEASE_ID="$2" shift 2 ;; + --pbf-file) + PBF_FILE="$2" + shift 2 + ;; + --pbf-url) + PBF_URL="$2" + shift 2 + ;; --output-root) OUTPUT_ROOT="$2" + OUTPUT_ROOT_IS_EXPLICIT=true shift 2 ;; --release-dir) OUTPUT_ROOT="$2" + OUTPUT_ROOT_IS_EXPLICIT=true shift 2 ;; -h|--help) @@ -173,7 +207,30 @@ case "$COMMAND" in ;; esac done + + if [[ -n "$PBF_FILE" && -n "$PBF_URL" ]]; then + oi_die "choose either --pbf-file or --pbf-url" + fi [[ -n "$RELEASE_ID" ]] || oi_die "--release-id is required" + + if [[ -n "$PBF_URL" ]]; then + PBF_FILE="$(oi_download_pbf "$PBF_URL")" || exit $? + fi + + if [[ "$OI_DATA_ROOT_IS_EXPLICIT" == true ]]; then + oi_prepare_dirs + elif [[ "$OUTPUT_ROOT_IS_EXPLICIT" != true ]]; then + [[ -n "$PBF_FILE" ]] || oi_die "publish needs --pbf-file/--pbf-url, --data-dir, or --release-dir when using hash-addressed workspaces" + PBF_FILE="$(oi_abs_path "$PBF_FILE")" + oi_activate_workspace_for_source_pbf "$PBF_FILE" || exit $? 
+ fi + + if [[ "$OUTPUT_ROOT_IS_EXPLICIT" == true ]]; then + OUTPUT_ROOT="$(oi_managed_path "$OUTPUT_ROOT")" + else + OUTPUT_ROOT="$OI_RELEASE_DIR" + fi + oi_publish_release "$RELEASE_ID" "$OUTPUT_ROOT" ;; -h|--help|help) diff --git a/compose.yaml b/compose.yaml index 627b318..444dcbd 100644 --- a/compose.yaml +++ b/compose.yaml @@ -62,6 +62,7 @@ services: working_dir: /workspace volumes: - ./:/workspace + - ${OI_DATA_PARENT:-./.data}:/managed - ${OI_DATA_ROOT}:/data - ${OI_RELEASE_DIR:-./.data/releases}:/releases - ${OI_CARGO_REGISTRY_DIR:-./.data/cache/cargo/registry}:/usr/local/cargo/registry diff --git a/crates/derive/src/graph/compress.rs b/crates/derive/src/graph/compress.rs index 6ed1012..c065aad 100644 --- a/crates/derive/src/graph/compress.rs +++ b/crates/derive/src/graph/compress.rs @@ -5,6 +5,7 @@ use openinterstate_core::geo::haversine_distance; use openinterstate_core::highway_ref::is_interstate_highway_ref; use crate::canonical_types::{ParsedExit, ParsedHighway}; +use crate::interstate_relations::InterstateRouteSignature; use super::directions::compute_component_directions; @@ -44,6 +45,8 @@ struct HighwayGraph { arc_way_ids: HashMap<(i64, i64), BTreeSet>, } +type RouteSignaturesByWay = HashMap>; + struct ConnectorGraph { adjacency: HashMap>, } @@ -80,6 +83,7 @@ impl PartialOrd for SearchState { pub(super) fn compress_highway_graph( highways: &[ParsedHighway], exits: &[ParsedExit], + route_signatures_by_highway_and_way: &HashMap, ) -> (Vec, Vec) { let (all_exit_node_ids, exit_id_by_node) = build_exit_node_index(exits); let ways_by_highway = group_ways_by_highway(highways); @@ -97,14 +101,21 @@ pub(super) fn compress_highway_graph( let Some(graph) = build_highway_graph(highway_ways) else { continue; }; + let route_signatures_by_way = route_signatures_by_highway_and_way.get(&highway); let component_by_node = compute_components(&graph.neighbors_undirected); let stop_nodes = identify_stop_nodes( - &graph.neighbors_undirected, - &graph.neighbors_directed, 
+ &graph, &all_exit_node_ids, + route_signatures_by_way, + ); + let mut edges = walk_compressed_edges( + &highway, + &graph, + &component_by_node, + &stop_nodes, + route_signatures_by_way, ); - let mut edges = walk_compressed_edges(&highway, &graph, &component_by_node, &stop_nodes); let component_directions = compute_component_directions(&edges, &highway); apply_component_directions(&mut edges, &component_directions); @@ -565,31 +576,56 @@ fn compute_components(neighbors_undirected: &HashMap>) -> Has } fn identify_stop_nodes( - neighbors_undirected: &HashMap>, - neighbors_directed: &HashMap>, + graph: &HighwayGraph, all_exit_node_ids: &HashSet, + route_signatures_by_way: Option<&RouteSignaturesByWay>, ) -> HashSet { let mut stop_nodes: HashSet = HashSet::new(); let mut in_degree: HashMap = HashMap::new(); let mut out_degree: HashMap = HashMap::new(); + let mut incoming_neighbors: HashMap> = HashMap::new(); - for (&node, targets) in neighbors_directed { + for (&node, targets) in &graph.neighbors_directed { *out_degree.entry(node).or_default() += targets.len(); for &target in targets { *in_degree.entry(target).or_default() += 1; + incoming_neighbors.entry(target).or_default().insert(node); } } - for &node_id in neighbors_undirected.keys() { + for &node_id in graph.neighbors_undirected.keys() { let incoming = in_degree.get(&node_id).copied().unwrap_or(0); let outgoing = out_degree.get(&node_id).copied().unwrap_or(0); if !(incoming == 1 && outgoing == 1) { stop_nodes.insert(node_id); + continue; + } + + let Some(route_signatures_by_way) = route_signatures_by_way else { + continue; + }; + + let incoming_signature = incoming_neighbors + .get(&node_id) + .and_then(|neighbors| neighbors.iter().next().copied()) + .map(|prev_node| { + arc_route_signature(graph, (prev_node, node_id), route_signatures_by_way) + }); + let outgoing_signature = graph + .neighbors_directed + .get(&node_id) + .and_then(|neighbors| neighbors.iter().next().copied()) + .map(|next_node| { + 
arc_route_signature(graph, (node_id, next_node), route_signatures_by_way) + }); + + if incoming_signature != outgoing_signature { + stop_nodes.insert(node_id); } } for &node_id in all_exit_node_ids { - if neighbors_undirected.contains_key(&node_id) { + if graph.neighbors_undirected.contains_key(&node_id) { stop_nodes.insert(node_id); } } @@ -602,6 +638,7 @@ fn walk_compressed_edges( graph: &HighwayGraph, component_by_node: &HashMap, stop_nodes: &HashSet, + route_signatures_by_way: Option<&RouteSignaturesByWay>, ) -> Vec { let mut edges = Vec::new(); let mut visited_directed: HashSet<(i64, i64)> = HashSet::new(); @@ -635,6 +672,9 @@ fn walk_compressed_edges( .get(&first_edge) .cloned() .unwrap_or_default(); + let route_signature = route_signatures_by_way.map(|memberships| { + arc_route_signature(graph, first_edge, memberships) + }); visited_directed.insert(first_edge); let mut prev = start_node; @@ -665,6 +705,16 @@ fn walk_compressed_edges( break; } + if let (Some(route_signatures_by_way), Some(route_signature)) = + (route_signatures_by_way, route_signature.as_ref()) + { + let next_signature = + arc_route_signature(graph, next_edge, route_signatures_by_way); + if next_signature != *route_signature { + break; + } + } + polyline.push(next_coord); length_m += haversine_distance(cur_coord.0, cur_coord.1, next_coord.0, next_coord.1); @@ -725,6 +775,22 @@ fn walk_compressed_edges( edges } +fn arc_route_signature( + graph: &HighwayGraph, + arc: (i64, i64), + route_signatures_by_way: &RouteSignaturesByWay, +) -> Vec { + let mut signature: BTreeSet = BTreeSet::new(); + + for way_id in graph.arc_way_ids.get(&arc).into_iter().flatten() { + if let Some(route_signatures) = route_signatures_by_way.get(way_id) { + signature.extend(route_signatures.iter().cloned()); + } + } + + signature.into_iter().collect() +} + fn bounds_for_polyline(polyline: &[(f64, f64)]) -> (f64, f64, f64, f64) { let mut min_lat = f64::INFINITY; let mut max_lat = f64::NEG_INFINITY; @@ -804,6 +870,7 @@ fn 
build_corridor_entries( #[cfg(test)] mod tests { use super::*; + use crate::interstate_relations::InterstateRouteSignature; fn sample_exit(id: &str, osm_id: i64) -> ParsedExit { ParsedExit { @@ -843,6 +910,16 @@ mod tests { } } + fn sample_route_signature( + root_relation_id: i64, + direction: Option<&str>, + ) -> InterstateRouteSignature { + InterstateRouteSignature { + root_relation_id, + direction: direction.map(|value| value.to_string()), + } + } + #[test] fn compresses_highway_and_assigns_corridor_entry() { let highways = vec![sample_highway( @@ -853,7 +930,8 @@ mod tests { )]; let exits = vec![sample_exit("node/2", 2)]; - let (edges, corridor_entries) = compress_highway_graph(&highways, &exits); + let (edges, corridor_entries) = + compress_highway_graph(&highways, &exits, &HashMap::new()); assert_eq!( edges.len(), @@ -885,7 +963,7 @@ mod tests { ), ]; - let (edges, _) = compress_highway_graph(&highways, &[]); + let (edges, _) = compress_highway_graph(&highways, &[], &HashMap::new()); let edge_low = edges .iter() .find(|edge| edge.start_node == 1 && edge.end_node == 2) @@ -916,7 +994,8 @@ mod tests { ), ]; - let (edges, corridor_entries) = compress_highway_graph(&highways, &[]); + let (edges, corridor_entries) = + compress_highway_graph(&highways, &[], &HashMap::new()); assert_eq!(edges.len(), 1); assert_eq!(edges[0].highway, "I-280"); @@ -948,7 +1027,7 @@ mod tests { ), ]; - let (edges, _) = compress_highway_graph(&highways, &[]); + let (edges, _) = compress_highway_graph(&highways, &[], &HashMap::new()); let i95_edges: Vec<&CompressedEdge> = edges.iter().filter(|edge| edge.highway == "I-95").collect(); @@ -962,4 +1041,55 @@ mod tests { .iter() .any(|edge| edge.start_node == 1 && edge.end_node == 4)); } + + #[test] + fn splits_edges_when_route_membership_signature_changes_mid_chain() { + let highways = vec![ + sample_highway( + "way/1", + &["I-19"], + &[1, 2], + &[(31.0, -110.0), (31.001, -110.0)], + ), + sample_highway( + "way/2", + &["I-19"], + &[2, 3], + 
&[(31.001, -110.0), (31.002, -110.0)], + ), + sample_highway( + "way/3", + &["I-19"], + &[3, 4], + &[(31.002, -110.0), (31.003, -110.0)], + ), + ]; + let route_signatures_by_highway_and_way = HashMap::from([( + "I-19".to_string(), + HashMap::from([ + ( + 1, + vec![sample_route_signature(2369468, Some("north"))], + ), + ( + 2, + vec![sample_route_signature(2369468, Some("north"))], + ), + ( + 3, + vec![sample_route_signature(2369468, Some("south"))], + ), + ]), + )]); + + let (edges, _) = compress_highway_graph( + &highways, + &[], + &route_signatures_by_highway_and_way, + ); + + assert_eq!(edges.len(), 2); + assert!(edges.iter().any(|edge| edge.start_node == 1 && edge.end_node == 3)); + assert!(edges.iter().any(|edge| edge.start_node == 3 && edge.end_node == 4)); + } } diff --git a/crates/derive/src/graph/mod.rs b/crates/derive/src/graph/mod.rs index 31e250e..6eb54ac 100644 --- a/crates/derive/src/graph/mod.rs +++ b/crates/derive/src/graph/mod.rs @@ -10,7 +10,9 @@ use openinterstate_core::highway_ref::{is_interstate_highway_ref, normalize_high use sqlx::PgPool; use crate::canonical_types::{ParsedExit, ParsedHighway}; -use crate::interstate_relations::load_relation_refs_by_way; +use crate::interstate_relations::{ + load_interstate_relation_members, relation_refs_by_way, route_signatures_by_highway_and_way, +}; use component_ids::stabilize_component_ids; use compress::compress_highway_graph; @@ -29,7 +31,10 @@ pub async fn build_graph( pool: &PgPool, interstate_relation_cache: &Path, ) -> Result { - let relation_refs_by_way = load_relation_refs_by_way(interstate_relation_cache)?; + let relation_members = load_interstate_relation_members(interstate_relation_cache)?; + let relation_refs_by_way = relation_refs_by_way(&relation_members); + let route_signatures_by_highway_and_way = + route_signatures_by_highway_and_way(&relation_members); tracing::info!( "Loaded Interstate relation memberships for {} way ids", relation_refs_by_way.len() @@ -48,7 +53,11 @@ pub async fn 
build_graph( .execute(pool) .await?; - let (mut edges, mut corridor_entries) = compress_highway_graph(&highways, &exits); + let (mut edges, mut corridor_entries) = compress_highway_graph( + &highways, + &exits, + &route_signatures_by_highway_and_way, + ); stabilize_component_ids(pool, &mut edges, &mut corridor_entries).await?; tracing::info!( diff --git a/crates/derive/src/graph/relation_corridors.rs b/crates/derive/src/graph/relation_corridors.rs index b829e0d..7a2881c 100644 --- a/crates/derive/src/graph/relation_corridors.rs +++ b/crates/derive/src/graph/relation_corridors.rs @@ -102,6 +102,7 @@ struct ExitRow { struct HighwayEdgeRow { edge_id: String, highway: String, + #[allow(dead_code)] direction: Option, source_way_ids: Vec, } @@ -201,6 +202,34 @@ fn filter_route_groups(groups: Vec) -> Vec, usize)> = HashMap::new(); + for group in &groups { + if group.direction.is_some() + || !directional_roots.contains(&(group.highway.clone(), group.root_relation_id)) + { + continue; + } + + let entry = unresolved_by_root + .entry((group.highway.clone(), group.root_relation_id)) + .or_insert_with(|| (BTreeSet::new(), 0)); + entry.1 += group.members.len(); + for member in &group.members { + entry.0.insert(member.leaf_relation_id); + } + } + + for ((highway, root_relation_id), (leaf_relation_ids, blank_member_count)) in unresolved_by_root + { + tracing::warn!( + highway = %highway, + root_relation_id, + ?leaf_relation_ids, + blank_member_count, + "dropping unresolved blank relation members for directional Interstate root" + ); + } + groups .into_iter() .filter(|group| { @@ -358,17 +387,21 @@ fn build_corridor_draft( return Ok(None); } + let allow_unassigned_interstate_connectors = group.direction.is_none(); + adopt_relation_connector_paths( &ordered_members, &mut assigned_way_ids, ways_by_id, connector_graph, + allow_unassigned_interstate_connectors, ); adopt_connector_paths( &mut assigned_way_ids, ways_by_id, connector_graph, &group.highway, + 
allow_unassigned_interstate_connectors, ); let assigned_ways: Vec<&RouteWay> = assigned_way_ids @@ -399,7 +432,7 @@ fn build_corridor_draft( .collect(), &route_segments, ); - let edge_ids = matched_edge_ids(edge_rows, &assigned_way_ids, Some(&canonical_direction)); + let edge_ids = matched_edge_ids(edge_rows, &group.highway, &assigned_way_ids); let source_way_ids: Vec = assigned_way_ids.into_iter().collect(); if route_segments.len() > 1 { @@ -521,6 +554,7 @@ fn adopt_relation_connector_paths( assigned_way_ids: &mut BTreeSet, ways_by_id: &HashMap, connector_graph: &ConnectorGraph, + allow_unassigned_interstate_connectors: bool, ) { loop { let assigned_ways: Vec<&RouteWay> = assigned_way_ids @@ -581,6 +615,8 @@ fn adopt_relation_connector_paths( connector_graph, ways_by_id, &allowed_refs, + assigned_way_ids, + allow_unassigned_interstate_connectors, false, None, ) @@ -591,6 +627,8 @@ fn adopt_relation_connector_paths( connector_graph, ways_by_id, &allowed_refs, + assigned_way_ids, + allow_unassigned_interstate_connectors, true, Some(SHORT_FALLBACK_CONNECTOR_MAX_COST_M), ) @@ -614,6 +652,7 @@ fn adopt_connector_paths( ways_by_id: &HashMap, connector_graph: &ConnectorGraph, route_highway: &str, + allow_unassigned_interstate_connectors: bool, ) { let allowed_refs = allowed_refs_for_route(route_highway, assigned_way_ids, ways_by_id); @@ -645,6 +684,8 @@ fn adopt_connector_paths( connector_graph, ways_by_id, &allowed_refs, + assigned_way_ids, + allow_unassigned_interstate_connectors, false, None, ) @@ -655,6 +696,8 @@ fn adopt_connector_paths( connector_graph, ways_by_id, &allowed_refs, + assigned_way_ids, + allow_unassigned_interstate_connectors, true, Some(SHORT_FALLBACK_CONNECTOR_MAX_COST_M), ) @@ -689,6 +732,8 @@ fn shortest_connector_path_to_any( connector_graph: &ConnectorGraph, ways_by_id: &HashMap, allowed_interstate_refs: &HashSet, + assigned_way_ids: &BTreeSet, + allow_unassigned_interstate_connectors: bool, allow_short_high_class_fallback: bool, max_cost_m: 
Option, ) -> Option<(u64, i32, Vec)> { @@ -737,6 +782,8 @@ fn shortest_connector_path_to_any( if !connector_way_allowed_for_refs( way, allowed_interstate_refs, + assigned_way_ids, + allow_unassigned_interstate_connectors, allow_short_high_class_fallback, ) { continue; @@ -1146,16 +1193,13 @@ fn closest_distance_along_route(route_points: &[([f64; 2], f64)], target: [f64; fn matched_edge_ids( edge_rows: &[HighwayEdgeRow], + route_highway: &str, assigned_way_ids: &BTreeSet, - canonical_direction: Option<&str>, ) -> Vec { - let wanted_direction = canonical_direction.and_then(normalize_direction); let mut edge_ids = Vec::new(); for edge in edge_rows { - if let Some(direction) = &wanted_direction { - if edge.direction.as_deref() != Some(direction.as_str()) { - continue; - } + if edge.highway != route_highway { + continue; } if edge .source_way_ids @@ -1170,10 +1214,52 @@ fn matched_edge_ids( edge_ids } +fn validate_edge_claims(drafts: &[CorridorDraft]) -> Result<(), anyhow::Error> { + let mut claims: HashMap = HashMap::new(); + for draft in drafts { + for edge_id in &draft.edge_ids { + match claims.entry(edge_id.clone()) { + std::collections::hash_map::Entry::Vacant(entry) => { + entry.insert(( + draft.highway.clone(), + draft.corridor_id, + draft.canonical_direction.clone(), + draft.root_relation_id, + )); + } + std::collections::hash_map::Entry::Occupied(entry) => { + let (existing_highway, existing_corridor_id, existing_direction, existing_root) = + entry.get(); + if *existing_corridor_id == draft.corridor_id { + continue; + } + if *existing_root == draft.root_relation_id { + continue; + } + anyhow::bail!( + "edge claim conflict for highway {} edge {} between corridor {} (relation {}, {}) and corridor {} (relation {}, {})", + existing_highway, + edge_id, + existing_corridor_id, + existing_root, + existing_direction, + draft.corridor_id, + draft.root_relation_id, + draft.canonical_direction, + ); + } + } + } + } + Ok(()) +} + async fn write_corridors( pool: &PgPool, 
drafts: &[CorridorDraft], ) -> Result { + validate_edge_claims(drafts)?; + let mut tx = pool.begin().await?; sqlx::query("DELETE FROM corridor_exits") .execute(&mut *tx) @@ -1383,8 +1469,22 @@ fn interstate_ref_allowed_for_route(reference: &str, route_family: Option<&str>) fn connector_way_allowed_for_refs( way: &RouteWay, allowed_refs: &HashSet, + assigned_way_ids: &BTreeSet, + allow_unassigned_interstate_connectors: bool, allow_short_high_class_fallback: bool, ) -> bool { + let has_interstate_ref = way + .refs + .iter() + .any(|reference| is_interstate_highway_ref(reference)); + + if has_interstate_ref + && !allow_unassigned_interstate_connectors + && !assigned_way_ids.contains(&way.way_id) + { + return false; + } + way.refs.is_empty() || way .refs @@ -1403,9 +1503,11 @@ mod tests { use super::{ allowed_refs_for_pair, allowed_refs_for_route, build_connector_graph, - connector_way_allowed_for_refs, prune_micro_route_segments, shortest_connector_path_to_any, - RouteWay, SHORT_FALLBACK_CONNECTOR_MAX_COST_M, + connector_way_allowed_for_refs, filter_route_groups, prune_micro_route_segments, + shortest_connector_path_to_any, validate_edge_claims, HighwayEdgeRow, RouteWay, + SHORT_FALLBACK_CONNECTOR_MAX_COST_M, }; + use crate::interstate_relations::{InterstateRelationMember, InterstateRouteGroup}; fn route_way( way_id: i64, @@ -1425,6 +1527,126 @@ mod tests { } } + fn route_group( + highway: &str, + root_relation_id: i64, + direction: Option<&str>, + members: &[(i64, i64)], + ) -> InterstateRouteGroup { + InterstateRouteGroup { + highway: highway.to_string(), + root_relation_id, + direction: direction.map(ToString::to_string), + members: members + .iter() + .enumerate() + .map( + |(sequence_index, (way_id, leaf_relation_id))| InterstateRelationMember { + way_id: *way_id, + highway: highway.to_string(), + root_relation_id, + leaf_relation_id: *leaf_relation_id, + direction: direction.map(ToString::to_string), + role: None, + sequence_index, + }, + ) + .collect(), + } + } 
+ + #[test] + fn filter_route_groups_drops_blank_group_when_directional_siblings_exist() { + let filtered = filter_route_groups(vec![ + route_group("I-30", 100, Some("east"), &[(1, 101)]), + route_group("I-30", 100, None, &[(2, 102), (3, 102)]), + route_group("I-30", 100, Some("west"), &[(4, 103)]), + route_group("I-41", 200, None, &[(5, 201)]), + ]); + + assert_eq!(filtered.len(), 3); + assert!(filtered + .iter() + .any(|group| group.highway == "I-30" && group.direction.as_deref() == Some("east"))); + assert!(filtered + .iter() + .any(|group| group.highway == "I-30" && group.direction.as_deref() == Some("west"))); + assert!(!filtered + .iter() + .any(|group| group.highway == "I-30" && group.direction.is_none())); + assert!(filtered + .iter() + .any(|group| group.highway == "I-41" && group.direction.is_none())); + } + + #[test] + fn filter_route_groups_keeps_blank_groups_for_undirected_roots() { + let filtered = filter_route_groups(vec![ + route_group("I-84", 300, None, &[(1, 301), (2, 301)]), + route_group("I-84", 301, Some("east"), &[(3, 302)]), + ]); + + assert_eq!(filtered.len(), 2); + assert!(filtered + .iter() + .any(|group| group.root_relation_id == 300 && group.direction.is_none())); + assert!(filtered.iter().any( + |group| group.root_relation_id == 301 && group.direction.as_deref() == Some("east") + )); + } + + #[test] + fn matched_edge_ids_ignore_edge_direction_when_way_membership_matches() { + let westbound_corridor = "west"; + let edge_rows = vec![HighwayEdgeRow { + edge_id: "edge/I-10/1/2".to_string(), + highway: "I-10".to_string(), + direction: Some("east".to_string()), + source_way_ids: vec![1001, 1002], + }]; + let assigned_way_ids = BTreeSet::from([1002_i64]); + + let edge_ids = super::matched_edge_ids(&edge_rows, "I-10", &assigned_way_ids); + + assert_eq!(westbound_corridor, "west"); + assert_eq!(edge_ids, vec!["edge/I-10/1/2".to_string()]); + } + + #[test] + fn validate_edge_claims_rejects_overlapping_corridor_assignments() { + let shared_edge_id 
= "edge/I-10/1/2".to_string(); + let drafts = vec![ + super::CorridorDraft { + corridor_id: 10, + highway: "I-10".to_string(), + canonical_direction: "west".to_string(), + root_relation_id: 1000, + geometry_json: "{\"type\":\"LineString\",\"coordinates\":[]}".to_string(), + source_way_ids: vec![1, 2], + edge_ids: vec![shared_edge_id.clone()], + exits: vec![], + }, + super::CorridorDraft { + corridor_id: 11, + highway: "I-10".to_string(), + canonical_direction: "east".to_string(), + root_relation_id: 1001, + geometry_json: "{\"type\":\"LineString\",\"coordinates\":[]}".to_string(), + source_way_ids: vec![3, 4], + edge_ids: vec![shared_edge_id.clone()], + exits: vec![], + }, + ]; + + let err = validate_edge_claims(&drafts).expect_err("overlap should fail loudly"); + let message = err.to_string(); + assert!(message.contains("edge/I-10/1/2")); + assert!(message.contains("corridor 10")); + assert!(message.contains("corridor 11")); + assert!(message.contains("relation 1000")); + assert!(message.contains("relation 1001")); + } + #[test] fn connector_policy_allows_same_highway_interstate_ways() { let way = route_way( @@ -1439,6 +1661,8 @@ mod tests { assert!(connector_way_allowed_for_refs( &way, &HashSet::from(["I-96".to_string()]), + &BTreeSet::new(), + true, false, )); } @@ -1457,6 +1681,28 @@ mod tests { assert!(!connector_way_allowed_for_refs( &way, &HashSet::from(["I-96".to_string()]), + &BTreeSet::new(), + true, + false, + )); + } + + #[test] + fn directional_connector_policy_rejects_unassigned_interstate_way_even_when_ref_matches() { + let way = route_way( + 12, + &["I-69C", "US-281"], + &[1, 2], + &[(26.2, -98.2), (26.21, -98.2)], + "motorway", + true, + ); + + assert!(!connector_way_allowed_for_refs( + &way, + &HashSet::from(["I-69C".to_string(), "US-281".to_string()]), + &BTreeSet::new(), + false, false, )); } @@ -1495,6 +1741,8 @@ mod tests { assert!(connector_way_allowed_for_refs( &bridge_way, &allowed_refs, + &BTreeSet::new(), + true, false, )); } @@ -1533,6 
+1781,8 @@ mod tests { assert!(connector_way_allowed_for_refs( &bridge_way, &allowed_refs, + &BTreeSet::new(), + true, false, )); } @@ -1623,6 +1873,7 @@ mod tests { let graph = build_connector_graph(&[ways_by_id.get(&bridge_way.way_id).unwrap()]); let sources = HashSet::from([1_i64]); let targets = HashMap::from([(3_i64, 1_i32)]); + let assigned_way_ids = BTreeSet::new(); let result = shortest_connector_path_to_any( &sources, @@ -1630,6 +1881,8 @@ mod tests { &graph, &ways_by_id, &HashSet::from(["I-96".to_string()]), + &assigned_way_ids, + true, false, None, ); @@ -1637,6 +1890,65 @@ mod tests { assert_eq!(result.map(|(_, _, way_ids)| way_ids), Some(vec![12])); } + #[test] + fn directional_gap_fill_rejects_same_highway_interstate_connector() { + let source_way = route_way( + 1, + &["I-69C"], + &[1, 2], + &[(26.18, -98.23), (26.19, -98.23)], + "motorway", + true, + ); + let target_way = route_way( + 2, + &["I-69C"], + &[5, 6], + &[(26.22, -98.23), (26.23, -98.23)], + "motorway", + true, + ); + let connector_way = route_way( + 3, + &["I-69C", "US-281"], + &[2, 3, 4, 5], + &[ + (26.19, -98.23), + (26.20, -98.23), + (26.21, -98.23), + (26.22, -98.23), + ], + "motorway", + true, + ); + let ways_by_id = HashMap::from([ + (source_way.way_id, source_way.clone()), + (target_way.way_id, target_way.clone()), + (connector_way.way_id, connector_way.clone()), + ]); + let graph = build_connector_graph(&[ + ways_by_id.get(&source_way.way_id).unwrap(), + ways_by_id.get(&target_way.way_id).unwrap(), + ways_by_id.get(&connector_way.way_id).unwrap(), + ]); + let sources = HashSet::from([2_i64]); + let targets = HashMap::from([(5_i64, 0_i32)]); + let assigned_way_ids = BTreeSet::from([source_way.way_id, target_way.way_id]); + + assert!(shortest_connector_path_to_any( + &sources, + &targets, + &graph, + &ways_by_id, + &HashSet::from(["I-69C".to_string(), "US-281".to_string()]), + &assigned_way_ids, + false, + false, + None, + ) + .is_none()); + } + #[test] fn 
short_fallback_allows_short_high_class_connector_with_other_ref() { let source_way = route_way( @@ -1681,6 +1993,7 @@ mod tests { let sources = HashSet::from([2_i64]); let targets = HashMap::from([(5_i64, 0_i32)]); let allowed_refs = HashSet::from(["I-12".to_string()]); + let assigned_way_ids = BTreeSet::from([source_way.way_id, target_way.way_id]); assert!(shortest_connector_path_to_any( &sources, @@ -1688,6 +2001,8 @@ mod tests { &graph, &ways_by_id, &allowed_refs, + &assigned_way_ids, + false, false, None, ) @@ -1699,6 +2014,8 @@ mod tests { &graph, &ways_by_id, &allowed_refs, + &assigned_way_ids, + true, true, Some(SHORT_FALLBACK_CONNECTOR_MAX_COST_M), ) @@ -1750,6 +2067,7 @@ mod tests { let sources = HashSet::from([2_i64]); let targets = HashMap::from([(5_i64, 0_i32)]); let allowed_refs = HashSet::from(["I-12".to_string()]); + let assigned_way_ids = BTreeSet::from([source_way.way_id, target_way.way_id]); assert!(shortest_connector_path_to_any( &sources, @@ -1757,6 +2075,8 @@ mod tests { &graph, &ways_by_id, &allowed_refs, + &assigned_way_ids, + true, true, Some(SHORT_FALLBACK_CONNECTOR_MAX_COST_M), ) diff --git a/crates/derive/src/interstate_relations.rs b/crates/derive/src/interstate_relations.rs index 8b8927c..71d3577 100644 --- a/crates/derive/src/interstate_relations.rs +++ b/crates/derive/src/interstate_relations.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::{BTreeSet, HashMap}; use std::fs; use std::path::Path; @@ -24,6 +24,12 @@ pub struct InterstateRouteGroup { pub members: Vec, } +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct InterstateRouteSignature { + pub root_relation_id: i64, + pub direction: Option, +} + pub fn load_interstate_relation_members( path: &Path, ) -> Result, anyhow::Error> { @@ -84,11 +90,6 @@ pub fn load_interstate_relation_members( Ok(members) } -pub fn load_relation_refs_by_way(path: &Path) -> Result>, anyhow::Error> { - let members = load_interstate_relation_members(path)?; 
- Ok(relation_refs_by_way(&members)) -} - pub fn relation_refs_by_way(members: &[InterstateRelationMember]) -> HashMap> { let mut refs_by_way: HashMap> = HashMap::new(); for member in members { @@ -106,6 +107,38 @@ pub fn relation_refs_by_way(members: &[InterstateRelationMember]) -> HashMap HashMap>> { + let mut signatures_by_highway_and_way: HashMap< + String, + HashMap>, + > = HashMap::new(); + + for member in members { + signatures_by_highway_and_way + .entry(member.highway.clone()) + .or_default() + .entry(member.way_id) + .or_default() + .insert(InterstateRouteSignature { + root_relation_id: member.root_relation_id, + direction: member.direction.clone(), + }); + } + + signatures_by_highway_and_way + .into_iter() + .map(|(highway, signatures_by_way)| { + let normalized_signatures_by_way = signatures_by_way + .into_iter() + .map(|(way_id, signatures)| (way_id, signatures.into_iter().collect())) + .collect(); + (highway, normalized_signatures_by_way) + }) + .collect() +} + pub fn group_relation_members(members: &[InterstateRelationMember]) -> Vec { let mut members_by_group: HashMap< (String, i64, Option), diff --git a/docs/data_contract_draft.md b/docs/data_contract_draft.md index d47c77e..0835ab2 100644 --- a/docs/data_contract_draft.md +++ b/docs/data_contract_draft.md @@ -31,6 +31,9 @@ Draft key fields: Represents graph edges that belong to corridors. +`direction_code` is the corridor's canonical direction. It is not the raw +travel direction computed for an individual internal `highway_edges` row. + Draft key fields: - `edge_id` @@ -40,6 +43,10 @@ Draft key fields: - `length_m` - `geometry_geojson` +Internal note: `highway_edges.direction` remains derive-stage metadata, but it +is not part of corridor membership semantics or the public corridor edge +contract. + ### corridor_exits Represents normalized exits attached to a corridor. 
diff --git a/docs/release_build.md b/docs/release_build.md index e6cd795..d2e8e88 100644 --- a/docs/release_build.md +++ b/docs/release_build.md @@ -26,25 +26,34 @@ Optional on the host: 1. `gh` if you want to publish a release to GitHub from your machine -If local disk is constrained, put the managed data workspace on another volume: +The default local layout uses a parent data root and derives a per-PBF +workspace from the source file SHA-256: ```bash -./bin/openinterstate --data-dir /Volumes/goose-drive/openinterstate-data build +./bin/openinterstate --data-parent /Volumes/goose-drive/openinterstate build ``` -If you want release artifacts in a separate directory, set an explicit release -root: +That resolves the working workspace to: + +```text +/Volumes/goose-drive/openinterstate/workspaces/pbf-sha256/ +``` + +Shared raw source downloads live under +`/Volumes/goose-drive/openinterstate/source-cache/`, and shared Cargo cache +lives under `/Volumes/goose-drive/openinterstate/cache/cargo/`. + +If you want release artifacts in a separate directory, set an explicit release root: ```bash ./bin/openinterstate \ - --data-dir /Volumes/goose-drive/openinterstate-data \ - --release-dir /Volumes/goose-drive/openinterstate-releases \ + --data-parent /Volumes/goose-drive/openinterstate \ + --release-dir /Volumes/goose-drive/openinterstate/releases \ build ``` -Cargo and runner caches now default under the managed data root as well, so a -goose-drive workspace keeps both data artifacts and Rust build cache off the -main disk. +If you need to bypass the SHA-derived layout, use `--data-dir` to pin an exact +workspace path explicitly. ## GitHub Actions Workflow @@ -68,21 +77,22 @@ test extract so release-workflow changes can be validated quickly in PRs. ## Environment Setup -The default local workflow works without any env file and stores working data in -repo-local `.data/`, with release artifacts written to `.data/releases/`. 
+The default local workflow works without any env file and uses the goose-drive +parent root `/Volumes/goose-drive/openinterstate`. Each source PBF resolves to +its own workspace under `workspaces/pbf-sha256/`. If you want to override the defaults, copy `.env.example` to `.env` and update: 1. the exposed Postgres host port -2. the managed data workspace root -3. the release output root +2. the managed parent data root +3. the optional explicit workspace or release output root 4. the default Geofabrik source URL 5. canonical import safety flags ## One-Command Build ```bash -./bin/openinterstate build +./bin/openinterstate --data-parent /Volumes/goose-drive/openinterstate build ``` ## Publish Step @@ -91,6 +101,7 @@ After a successful build, publish the generated release to GitHub: ```bash ./bin/openinterstate publish \ + --pbf-url https://download.geofabrik.de/north-america/us-latest.osm.pbf \ --release-id release-$(date +%F) ``` diff --git a/tooling/export_release.py b/tooling/export_release.py index 0adbf25..431e490 100755 --- a/tooling/export_release.py +++ b/tooling/export_release.py @@ -366,34 +366,8 @@ def build_manifest( } -def main() -> None: - args = parse_args() - output_dir = Path(args.output_dir).resolve() - state_dir = Path(args.state_dir).resolve() if args.state_dir else None - csv_dir, gpx_dir, examples_dir = ensure_dirs(output_dir) - source_pbf_path = Path(args.source_pbf_file).resolve() if args.source_pbf_file else None - import_pbf_path = Path(args.import_pbf_file).resolve() if args.import_pbf_file else source_pbf_path - hash_cache: dict[tuple[str, int, int], str] = {} - - if source_pbf_path is not None: - source_pbf_metadata = build_source_file_metadata(source_pbf_path, state_dir, hash_cache) - else: - source_pbf_metadata = load_source_file_metadata(Path(args.source_pbf_metadata_file).resolve(), "source_pbf") - assert import_pbf_path is not None - import_pbf_metadata = build_source_file_metadata(import_pbf_path, state_dir, hash_cache) - - 
source_lineage = { - "source_url": args.source_url, - "source_pbf": source_pbf_metadata, - "import_pbf": import_pbf_metadata, - "derivation": [ - "osm2pgsql flex import via schema/osm2pgsql/openinterstate.lua", - "schema/derive.sql", - "openinterstate-derive graph, corridor, and reference-route builders", - ], - } - - specs = [ +def build_export_specs(interstate_filter: str = INTERSTATE_FILTER) -> list[ExportSpec]: + return [ ExportSpec( name="corridors", filename="corridors.csv", @@ -407,7 +381,7 @@ def main() -> None: COUNT(he.id) AS edge_count FROM corridors c LEFT JOIN highway_edges he ON he.corridor_id = c.corridor_id - WHERE c.highway ~ '{INTERSTATE_FILTER}' + WHERE c.highway ~ '{interstate_filter}' GROUP BY c.corridor_id, c.highway, c.canonical_direction ORDER BY c.highway, c.canonical_direction, c.corridor_id """, @@ -420,12 +394,12 @@ def main() -> None: he.id AS edge_id, he.corridor_id, c.highway AS interstate_name, - he.direction AS direction_code, + c.canonical_direction AS direction_code, he.length_m, ST_AsGeoJSON(he.geom) AS geometry_geojson FROM highway_edges he JOIN corridors c ON c.corridor_id = he.corridor_id - WHERE c.highway ~ '{INTERSTATE_FILTER}' + WHERE c.highway ~ '{interstate_filter}' ORDER BY he.corridor_id, he.id """, ), @@ -446,7 +420,7 @@ def main() -> None: json_build_object('type', 'Point', 'coordinates', json_build_array(ce.lon, ce.lat))::text AS geometry_geojson FROM corridor_exits ce JOIN corridors c USING (corridor_id) - WHERE c.highway ~ '{INTERSTATE_FILTER}' + WHERE c.highway ~ '{interstate_filter}' ORDER BY c.highway, c.canonical_direction, ce.corridor_index """, ), @@ -477,7 +451,7 @@ def main() -> None: JOIN exit_poi_candidates epc ON epc.poi_id = p.id JOIN corridor_exits ce ON ce.exit_id = epc.exit_id JOIN corridors c USING (corridor_id) - WHERE c.highway ~ '{INTERSTATE_FILTER}' + WHERE c.highway ~ '{interstate_filter}' ORDER BY p.id """, ), @@ -494,7 +468,7 @@ def main() -> None: FROM exit_poi_candidates epc JOIN 
corridor_exits ce ON ce.exit_id = epc.exit_id JOIN corridors c USING (corridor_id) - WHERE c.highway ~ '{INTERSTATE_FILTER}' + WHERE c.highway ~ '{interstate_filter}' ORDER BY epc.exit_id, epc.poi_id """, ), @@ -516,7 +490,7 @@ def main() -> None: FROM exit_poi_reachability epr JOIN corridor_exits ce ON ce.exit_id = epr.exit_id JOIN corridors c USING (corridor_id) - WHERE c.highway ~ '{INTERSTATE_FILTER}' + WHERE c.highway ~ '{interstate_filter}' ORDER BY epr.exit_id, epr.poi_id """, ), @@ -535,12 +509,42 @@ def main() -> None: point_count, waypoints_json FROM reference_routes - WHERE highway ~ '{INTERSTATE_FILTER}' + WHERE highway ~ '{interstate_filter}' ORDER BY highway, direction_code, display_name """, ), ] + +def main() -> None: + args = parse_args() + output_dir = Path(args.output_dir).resolve() + state_dir = Path(args.state_dir).resolve() if args.state_dir else None + csv_dir, gpx_dir, examples_dir = ensure_dirs(output_dir) + source_pbf_path = Path(args.source_pbf_file).resolve() if args.source_pbf_file else None + import_pbf_path = Path(args.import_pbf_file).resolve() if args.import_pbf_file else source_pbf_path + hash_cache: dict[tuple[str, int, int], str] = {} + + if source_pbf_path is not None: + source_pbf_metadata = build_source_file_metadata(source_pbf_path, state_dir, hash_cache) + else: + source_pbf_metadata = load_source_file_metadata(Path(args.source_pbf_metadata_file).resolve(), "source_pbf") + assert import_pbf_path is not None + import_pbf_metadata = build_source_file_metadata(import_pbf_path, state_dir, hash_cache) + + source_lineage = { + "source_url": args.source_url, + "source_pbf": source_pbf_metadata, + "import_pbf": import_pbf_metadata, + "derivation": [ + "osm2pgsql flex import via schema/osm2pgsql/openinterstate.lua", + "schema/derive.sql", + "openinterstate-derive graph, corridor, and reference-route builders", + ], + } + + specs = build_export_specs() + row_counts: dict[str, int] = {} written_files: list[Path] = [] diff --git 
a/tooling/extract_interstate_relations.py b/tooling/extract_interstate_relations.py index 0826026..50fc9e2 100644 --- a/tooling/extract_interstate_relations.py +++ b/tooling/extract_interstate_relations.py @@ -6,6 +6,7 @@ from collections import defaultdict import re import subprocess +import sys import urllib.parse from dataclasses import dataclass from pathlib import Path @@ -14,6 +15,14 @@ INTERSTATE_NETWORK = "US:I" INTERSTATE_REF_RE = re.compile(r"^I?[\s-]*(\d+[A-Z]?)$") CARDINAL_DIRECTIONS = {"north", "south", "east", "west"} +BOUND_CARDINAL_RE = re.compile( + r"^(north|south|east|west)(?:bound)?(?:\b|[^a-z])", + re.IGNORECASE, +) +FLOW_ROLE_RE = re.compile(r"^(forward|backward)(?:\b|[^a-z])", re.IGNORECASE) +TITLE_CARDINAL_RE = re.compile(r"\b(north|south|east|west)\b", re.IGNORECASE) +NORTH_SOUTH_AXIS = frozenset({"north", "south"}) +EAST_WEST_AXIS = frozenset({"east", "west"}) @dataclass(frozen=True) @@ -28,9 +37,18 @@ class InterstateRelation: relation_id: int ref: str direction: str | None + name: str | None + description: str | None members: list[RelationMember] +@dataclass(frozen=True) +class RootMembership: + relation_id: int + way_id: int + direction: str + + def parse_args(argv: list[str] | None = None) -> argparse.Namespace: parser = argparse.ArgumentParser( description="Extract cached Interstate route relation memberships from a source PBF." 
@@ -53,8 +71,39 @@ def normalize_direction(raw: str | None) -> str | None: if raw is None: return None value = urllib.parse.unquote(raw).strip().lower() - if value in CARDINAL_DIRECTIONS: - return value + if not value: + return None + aliases = { + "north": "north", + "northbound": "north", + "n": "north", + "south": "south", + "southbound": "south", + "s": "south", + "east": "east", + "eastbound": "east", + "e": "east", + "west": "west", + "westbound": "west", + "w": "west", + } + if value in aliases: + return aliases[value] + match = BOUND_CARDINAL_RE.match(value) + if match: + return match.group(1).lower() + return None + + +def normalize_flow_role(raw: str | None) -> str | None: + if raw is None: + return None + value = urllib.parse.unquote(raw).strip().lower() + if not value: + return None + match = FLOW_ROLE_RE.match(value) + if match: + return match.group(1).lower() return None @@ -125,6 +174,8 @@ def parse_relation_line(line: str) -> InterstateRelation | None: relation_id=relation_id, ref=ref, direction=normalize_direction(tags.get("direction")), + name=tags.get("name"), + description=tags.get("description"), members=parse_members(members_raw), ) @@ -171,24 +222,311 @@ def referenced_relation_ids(relations: dict[int, InterstateRelation]) -> set[int def effective_direction( relation_direction: str | None, + leaf_default_direction: str | None, member_role: str, inherited_direction: str | None, ) -> str | None: + if relation_direction is not None: + return relation_direction role_direction = normalize_direction(member_role) if role_direction is not None: return role_direction - if relation_direction is not None: - return relation_direction + flow_role = normalize_flow_role(member_role) + role_value = urllib.parse.unquote(member_role).strip() + if leaf_default_direction is not None and ( + not role_value or flow_role == "forward" + ): + return leaf_default_direction return inherited_direction +def matching_axis(directions: set[str]) -> frozenset[str] | None: + 
if NORTH_SOUTH_AXIS.issubset(directions) and directions.issubset(NORTH_SOUTH_AXIS): + return NORTH_SOUTH_AXIS + if EAST_WEST_AXIS.issubset(directions) and directions.issubset(EAST_WEST_AXIS): + return EAST_WEST_AXIS + return None + + +def root_relation_ids(relations: dict[int, InterstateRelation]) -> list[int]: + return sorted(set(relations) - referenced_relation_ids(relations)) + + +def same_ref_membership_signature( + relations: dict[int, InterstateRelation], + root_relation_id: int, +) -> frozenset[tuple[int, str]]: + subtree_relation_ids: set[int] = set() + + def collect_subtree(relation_id: int, stack: set[int]) -> None: + if relation_id in stack or relation_id in subtree_relation_ids: + return + + subtree_relation_ids.add(relation_id) + relation = relations[relation_id] + next_stack = set(stack) + next_stack.add(relation_id) + for member in relation.members: + if member.member_type != "r" or member.member_id not in relations: + continue + child = relations[member.member_id] + if child.ref != relation.ref: + continue + collect_subtree(child.relation_id, next_stack) + + collect_subtree(root_relation_id, set()) + subtree_relations = { + relation_id: relations[relation_id] for relation_id in subtree_relation_ids + } + relation_to_root = { + relation_id: root_relation_id for relation_id in subtree_relation_ids + } + root_axes = infer_root_axes(subtree_relations, relation_to_root) + leaf_default_directions = infer_leaf_default_directions( + subtree_relations, relation_to_root, root_axes + ) + memberships: set[tuple[int, str]] = set() + + def visit( + relation_id: int, + inherited_direction: str | None, + stack: set[int], + ) -> None: + if relation_id in stack: + return + + relation = subtree_relations[relation_id] + next_stack = set(stack) + next_stack.add(relation_id) + for member in relation.members: + direction = effective_direction( + relation.direction, + leaf_default_directions.get(relation_id), + member.role, + inherited_direction, + ) + if 
member.member_type == "w": + memberships.add((member.member_id, direction or "")) + continue + + if member.member_type == "r" and member.member_id in subtree_relations: + child = subtree_relations[member.member_id] + if child.ref != relation.ref: + continue + visit(member.member_id, direction, next_stack) + + visit(root_relation_id, relations[root_relation_id].direction, set()) + return frozenset(memberships) + + +def collapse_subsumed_root_aliases( + relations: dict[int, InterstateRelation], + roots: list[int], +) -> dict[int, int]: + aliases = {root_relation_id: root_relation_id for root_relation_id in roots} + signatures = { + root_relation_id: same_ref_membership_signature(relations, root_relation_id) + for root_relation_id in roots + } + roots_by_ref: dict[str, list[int]] = defaultdict(list) + for root_relation_id in roots: + roots_by_ref[relations[root_relation_id].ref].append(root_relation_id) + + def directed_only(sig: frozenset[tuple[int, str]]) -> frozenset[tuple[int, str]]: + has_directed = any(d for _, d in sig) + if not has_directed: + return sig + return frozenset((w, d) for w, d in sig if d) + + for root_ids in roots_by_ref.values(): + for root_relation_id in sorted( + root_ids, + key=lambda candidate: (len(signatures[candidate]), candidate), + ): + signature = directed_only(signatures[root_relation_id]) + candidates: list[int] = [] + for candidate_root_id in root_ids: + if candidate_root_id == root_relation_id: + continue + + candidate_signature = directed_only(signatures[candidate_root_id]) + if not signature.issubset(candidate_signature): + continue + + if ( + signature == candidate_signature + and candidate_root_id > root_relation_id + ): + continue + + candidates.append(candidate_root_id) + + if candidates: + aliases[root_relation_id] = min( + candidates, + key=lambda candidate: (len(signatures[candidate]), candidate), + ) + + def canonical_root(root_relation_id: int) -> int: + alias = aliases[root_relation_id] + if alias == root_relation_id: + 
return alias + resolved = canonical_root(alias) + aliases[root_relation_id] = resolved + return resolved + + for root_relation_id in roots: + aliases[root_relation_id] = canonical_root(root_relation_id) + + return aliases + + +def assign_roots(relations: dict[int, InterstateRelation]) -> dict[int, int]: + relation_to_root: dict[int, int] = {} + roots = root_relation_ids(relations) + root_aliases = collapse_subsumed_root_aliases(relations, roots) + + def visit(relation_id: int, root_relation_id: int, stack: set[int]) -> None: + if relation_id in stack: + return + relation_to_root.setdefault(relation_id, root_relation_id) + relation = relations[relation_id] + next_stack = set(stack) + next_stack.add(relation_id) + for member in relation.members: + if member.member_type != "r" or member.member_id not in relations: + continue + child = relations[member.member_id] + if child.ref != relation.ref: + continue + visit(child.relation_id, root_relation_id, next_stack) + + for root_relation_id in sorted( + roots, + key=lambda candidate: (root_aliases[candidate] != candidate, candidate), + ): + visit(root_relation_id, root_aliases[root_relation_id], set()) + + for relation_id in sorted(relations): + if relation_id not in relation_to_root: + visit(relation_id, relation_id, set()) + + return relation_to_root + + +def infer_root_axes( + relations: dict[int, InterstateRelation], + relation_to_root: dict[int, int], +) -> dict[int, frozenset[str]]: + directions_by_root: dict[int, set[str]] = defaultdict(set) + for relation_id, relation in relations.items(): + root_relation_id = relation_to_root.get(relation_id, relation_id) + if relation.direction is not None: + directions_by_root[root_relation_id].add(relation.direction) + for member in relation.members: + if member.member_type != "w": + continue + direction = normalize_direction(member.role) + if direction is not None: + directions_by_root[root_relation_id].add(direction) + + axes: dict[int, frozenset[str]] = {} + for 
root_relation_id, directions in directions_by_root.items(): + axis = matching_axis(directions) + if axis is not None: + axes[root_relation_id] = axis + return axes + + +def extract_title_direction( + relation: InterstateRelation, + root_axis: frozenset[str] | None, +) -> str | None: + if root_axis is None: + return None + text = " ".join( + value for value in (relation.name, relation.description) if value is not None + ) + if not text: + return None + matches = {match.group(1).lower() for match in TITLE_CARDINAL_RE.finditer(text)} + if len(matches) != 1: + return None + direction = next(iter(matches)) + if direction in root_axis: + return direction + return None + + +def infer_leaf_default_directions( + relations: dict[int, InterstateRelation], + relation_to_root: dict[int, int], + root_axes: dict[int, frozenset[str]], +) -> dict[int, str]: + inferred: dict[int, str] = {} + for relation_id, relation in relations.items(): + if relation.direction is not None: + continue + + root_relation_id = relation_to_root.get(relation_id, relation_id) + root_axis = root_axes.get(root_relation_id) + if root_axis is None: + continue + + title_direction = extract_title_direction(relation, root_axis) + if title_direction is not None: + inferred[relation_id] = title_direction + continue + + seed_directions: set[str] = set() + invalid_member_roles = False + for member in relation.members: + if member.member_type != "w": + continue + + member_direction = normalize_direction(member.role) + if member_direction is not None: + if member_direction not in root_axis: + invalid_member_roles = True + break + seed_directions.add(member_direction) + continue + + flow_role = normalize_flow_role(member.role) + role_value = urllib.parse.unquote(member.role).strip() + if role_value and flow_role not in {None, "forward"}: + invalid_member_roles = True + break + + if invalid_member_roles or len(seed_directions) != 1: + continue + + inferred[relation_id] = next(iter(seed_directions)) + + return inferred 
+ + def flatten_relation_memberships( relations: dict[int, InterstateRelation], ) -> list[tuple[int, str, int, int, str, str, int]]: rows: list[tuple[int, str, int, int, str, str, int]] = [] - roots = sorted(set(relations) - referenced_relation_ids(relations)) + relation_to_root = assign_roots(relations) + root_axes = infer_root_axes(relations, relation_to_root) + leaf_default_directions = infer_leaf_default_directions( + relations, relation_to_root, root_axes + ) + roots = root_relation_ids(relations) + root_aliases = collapse_subsumed_root_aliases(relations, roots) + canonical_roots = sorted( + root_relation_id + for root_relation_id in roots + if root_aliases[root_relation_id] == root_relation_id + ) visited_relations: set[int] = set() sequence_by_group: dict[tuple[int, str], int] = defaultdict(int) + unresolved_by_root: dict[tuple[str, int], dict[str, object]] = defaultdict( + lambda: {"leaf_relation_ids": set(), "blank_members": 0} + ) def visit( relation_id: int, @@ -205,11 +543,24 @@ def visit( stack.add(relation_id) for member in relation.members: - direction = effective_direction(relation.direction, member.role, inherited_direction) + direction = effective_direction( + relation.direction, + leaf_default_directions.get(relation_id), + member.role, + inherited_direction, + ) if member.member_type == "w": direction_key = direction or "" sequence_index = sequence_by_group[(root_relation_id, direction_key)] sequence_by_group[(root_relation_id, direction_key)] += 1 + if not direction_key and root_relation_id in root_axes: + unresolved = unresolved_by_root[(relation.ref, root_relation_id)] + leaf_relation_ids = unresolved["leaf_relation_ids"] + assert isinstance(leaf_relation_ids, set) + leaf_relation_ids.add(relation_id) + blank_members = unresolved["blank_members"] + assert isinstance(blank_members, int) + unresolved["blank_members"] = blank_members + 1 rows.append( ( member.member_id, @@ -229,13 +580,29 @@ def visit( continue visit(member.member_id, 
root_relation_id, direction, stack) - for root_relation_id in roots: + for root_relation_id in canonical_roots: visit(root_relation_id, root_relation_id, relations[root_relation_id].direction, set()) for relation_id in sorted(relations): if relation_id not in visited_relations: + root_relation_id = relation_to_root.get(relation_id, relation_id) + if root_relation_id != relation_id: + continue visit(relation_id, relation_id, relations[relation_id].direction, set()) + for (highway, root_relation_id), unresolved in sorted(unresolved_by_root.items()): + leaf_relation_ids = unresolved["leaf_relation_ids"] + blank_members = unresolved["blank_members"] + assert isinstance(leaf_relation_ids, set) + assert isinstance(blank_members, int) + leafs_text = ",".join(str(leaf_id) for leaf_id in sorted(leaf_relation_ids)) + print( + "warning: unresolved directional Interstate relation root " + f"{highway} root={root_relation_id} " + f"leafs={leafs_text or '(none)'} blank_members={blank_members}", + file=sys.stderr, + ) + return rows diff --git a/tooling/tests/test_export_release.py b/tooling/tests/test_export_release.py new file mode 100644 index 0000000..4488730 --- /dev/null +++ b/tooling/tests/test_export_release.py @@ -0,0 +1,27 @@ +import sys +import types +import unittest +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[2] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +sys.modules.setdefault("psycopg", types.SimpleNamespace(Connection=object)) + +from tooling.export_release import build_export_specs # noqa: E402 + + +class ExportReleaseTests(unittest.TestCase): + def test_corridor_edges_direction_code_uses_corridor_canonical_direction(self) -> None: + specs = {spec.name: spec for spec in build_export_specs()} + + corridor_edges_query = specs["corridor_edges"].query + + self.assertIn("c.canonical_direction AS direction_code", corridor_edges_query) + self.assertNotIn("he.direction AS direction_code", corridor_edges_query) 
+ + +if __name__ == "__main__": + unittest.main() diff --git a/tooling/tests/test_extract_interstate_relations.py b/tooling/tests/test_extract_interstate_relations.py new file mode 100644 index 0000000..e092484 --- /dev/null +++ b/tooling/tests/test_extract_interstate_relations.py @@ -0,0 +1,289 @@ +import io +import sys +import unittest +from contextlib import redirect_stderr +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[2] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from tooling.extract_interstate_relations import ( # noqa: E402 + InterstateRelation, + RelationMember, + flatten_relation_memberships, + normalize_direction, +) + + +def way_member(way_id: int, role: str = "") -> RelationMember: + return RelationMember(member_type="w", member_id=way_id, role=role) + + +def relation_member(relation_id: int, role: str = "") -> RelationMember: + return RelationMember(member_type="r", member_id=relation_id, role=role) + + +def relation( + relation_id: int, + ref: str, + *, + direction: str | None = None, + name: str | None = None, + description: str | None = None, + members: list[RelationMember] | None = None, +) -> InterstateRelation: + return InterstateRelation( + relation_id=relation_id, + ref=ref, + direction=direction, + name=name, + description=description, + members=members or [], + ) + + +def flatten_rows( + relations: dict[int, InterstateRelation], +) -> tuple[dict[int, tuple[int, str, int, int, str, str, int]], str]: + stderr = io.StringIO() + with redirect_stderr(stderr): + rows = flatten_relation_memberships(relations) + return {row[0]: row for row in rows}, stderr.getvalue() + + +class ExtractInterstateRelationsTests(unittest.TestCase): + def test_normalize_direction_accepts_rich_cardinal_variants(self) -> None: + self.assertEqual(normalize_direction("south (local)"), "south") + self.assertEqual(normalize_direction("north (thru)"), "north") + self.assertEqual(normalize_direction("South"), "south") 
+ self.assertEqual(normalize_direction("eastbound"), "east") + self.assertEqual(normalize_direction("westbound"), "west") + self.assertIsNone(normalize_direction("forward")) + + def test_rule1_infers_direction_from_title_when_root_axis_matches(self) -> None: + rows_by_way, _ = flatten_rows( + { + 100: relation( + 100, + "I-30", + members=[ + relation_member(101), + relation_member(102), + relation_member(103), + ], + ), + 101: relation( + 101, + "I-30", + name="I 30 (AR) (East)", + members=[way_member(1), way_member(2, "forward")], + ), + 102: relation(102, "I-30", direction="east", members=[way_member(3)]), + 103: relation(103, "I-30", direction="west", members=[way_member(4)]), + } + ) + + self.assertEqual(rows_by_way[1][4], "east") + self.assertEqual(rows_by_way[2][4], "east") + + def test_rule1_does_not_infer_wrong_axis_title_direction(self) -> None: + rows_by_way, warnings = flatten_rows( + { + 100: relation( + 100, + "I-30", + members=[ + relation_member(101), + relation_member(102), + relation_member(103), + ], + ), + 101: relation( + 101, + "I-30", + name="I 30 Spur (North)", + members=[way_member(1)], + ), + 102: relation(102, "I-30", direction="east", members=[way_member(3)]), + 103: relation(103, "I-30", direction="west", members=[way_member(4)]), + } + ) + + self.assertEqual(rows_by_way[1][4], "") + self.assertIn("warning: unresolved directional Interstate relation root I-30", warnings) + + def test_rule2_propagates_seeded_direction_to_forward_and_blank_members(self) -> None: + rows_by_way, _ = flatten_rows( + { + 200: relation( + 200, + "I-435", + members=[ + relation_member(201), + relation_member(202), + relation_member(203), + ], + ), + 201: relation( + 201, + "I-435", + description="I 435 (KS/MO) (clockwise)", + members=[ + way_member(10, "south"), + way_member(11, "forward"), + way_member(12), + ], + ), + 202: relation(202, "I-435", direction="north", members=[way_member(20)]), + 203: relation(203, "I-435", direction="south", 
members=[way_member(21)]), + } + ) + + self.assertEqual(rows_by_way[10][4], "south") + self.assertEqual(rows_by_way[11][4], "south") + self.assertEqual(rows_by_way[12][4], "south") + + def test_rule2_leaves_all_forward_leaf_unresolved_without_seed(self) -> None: + rows_by_way, warnings = flatten_rows( + { + 300: relation( + 300, + "I-41", + members=[ + relation_member(301), + relation_member(302), + relation_member(303), + ], + ), + 301: relation( + 301, + "I-41", + description="I 41 (WI)", + members=[way_member(30, "forward"), way_member(31, "forward")], + ), + 302: relation(302, "I-41", direction="north", members=[way_member(32)]), + 303: relation(303, "I-41", direction="south", members=[way_member(33)]), + } + ) + + self.assertEqual(rows_by_way[30][4], "") + self.assertEqual(rows_by_way[31][4], "") + self.assertIn("root=300", warnings) + self.assertIn("leafs=301", warnings) + self.assertIn("blank_members=2", warnings) + + def test_rule2_does_not_propagate_when_leaf_has_conflicting_seeds(self) -> None: + rows_by_way, warnings = flatten_rows( + { + 400: relation( + 400, + "I-95", + members=[ + relation_member(401), + relation_member(402), + relation_member(403), + ], + ), + 401: relation( + 401, + "I-95", + members=[ + way_member(40, "north"), + way_member(41, "south"), + way_member(42, "forward"), + ], + ), + 402: relation(402, "I-95", direction="north", members=[way_member(43)]), + 403: relation(403, "I-95", direction="south", members=[way_member(44)]), + } + ) + + self.assertEqual(rows_by_way[40][4], "north") + self.assertEqual(rows_by_way[41][4], "south") + self.assertEqual(rows_by_way[42][4], "") + self.assertIn("root=400", warnings) + + def test_relation_direction_overrides_conflicting_member_roles(self) -> None: + rows_by_way, _ = flatten_rows( + { + 500: relation( + 500, + "I-12", + direction="east", + members=[ + way_member(50, "west"), + way_member(51, "east"), + way_member(52, "forward"), + way_member(53), + ], + ), + } + ) + + 
self.assertEqual(rows_by_way[50][4], "east") + self.assertEqual(rows_by_way[51][4], "east") + self.assertEqual(rows_by_way[52][4], "east") + self.assertEqual(rows_by_way[53][4], "east") + + def test_collapses_subsumed_same_ref_root_memberships(self) -> None: + relations = { + 100: relation( + 100, + "I-10", + members=[relation_member(101), relation_member(102)], + ), + 200: relation( + 200, + "I-10", + members=[ + relation_member(101), + relation_member(102), + relation_member(103), + ], + ), + 101: relation(101, "I-10", direction="east", members=[way_member(1)]), + 102: relation(102, "I-10", direction="west", members=[way_member(2)]), + 103: relation(103, "I-10", direction="east", members=[way_member(3)]), + } + + stderr = io.StringIO() + with redirect_stderr(stderr): + rows = flatten_relation_memberships(relations) + + self.assertEqual({row[0]: row[2] for row in rows}, {1: 200, 2: 200, 3: 200}) + self.assertEqual({row[2] for row in rows}, {200}) + + def test_keeps_same_ref_roots_separate_when_neither_is_subset(self) -> None: + relations = { + 300: relation( + 300, + "I-10", + members=[relation_member(301), relation_member(302)], + ), + 400: relation( + 400, + "I-10", + members=[relation_member(302), relation_member(303)], + ), + 301: relation(301, "I-10", direction="east", members=[way_member(10)]), + 302: relation(302, "I-10", direction="west", members=[way_member(11)]), + 303: relation(303, "I-10", direction="east", members=[way_member(12)]), + } + + stderr = io.StringIO() + with redirect_stderr(stderr): + rows = flatten_relation_memberships(relations) + + root_ids_by_way: dict[int, set[int]] = {} + for way_id, _ref, root_relation_id, *_rest in rows: + root_ids_by_way.setdefault(way_id, set()).add(root_relation_id) + + self.assertEqual(root_ids_by_way[10], {300}) + self.assertEqual(root_ids_by_way[11], {300, 400}) + self.assertEqual(root_ids_by_way[12], {400}) + + +if __name__ == "__main__": + unittest.main()