8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,13 @@
# Change log

### 0.3.7 - 2026-04-29
- Added upfront null/NaN placeholder detection before schema validation, including support for string placeholders such as `"null"` and `"nan"` in feature `properties`.
- Changed `issues` behavior to return all detected per-feature schema issues (not only a single best issue per feature).
- Suppressed noisy `AnyOf` summary messages when more specific field-level errors exist for the same feature.
- Improved human-readable validation messages for enum/type failures with clearer field-level context and actionable remediation text.
- Updated enum formatting to use compact previews for long value lists (pipe-separated values with `and N more` suffix).
- Added new regression coverage for `tests/assets/issue_3297.zip` and updated related unit test expectations for new message formats and nullish precheck flow.

### 0.3.6 - 2026-04-10
- Fixed https://dev.azure.com/TDEI-UW/TDEI/_workitems/edit/3469
- Added regression coverage for `tests/assets/task_3469.zip` to assert the exact per-feature `issues` payload: `"null" is not one of "down" or "up"` on `FIFA_sidewalks.edges.geojson` feature index `0`.
14 changes: 13 additions & 1 deletion README.md
@@ -13,6 +13,7 @@ This package validates OSW GeoJSON datasets packaged as a ZIP file.
- Extracts the provided ZIP file
- Finds supported OSW dataset files inside the extracted directory
- Validates each file (`edges`, `lines`, `nodes`, `points`, `polygons`, and `zones`) against the matching schema
- Performs an upfront data-quality check for null-like placeholders in feature properties (for example `null`, `NaN`, `"null"`, `"nan"`)
- Runs cross-file integrity checks, such as detecting duplicate `_id` values and verifying that edge and zone references resolve back to nodes
- Returns a `ValidationResult` object with `is_valid`, `errors`, and `issues`

@@ -33,13 +34,24 @@ validator = OSWValidation(zipfile_path='<Zip file path>')
result = validator.validate()
print(result.is_valid)
print(result.errors) # returns up to the first 20 high-level errors by default
print(result.issues) # per-file or per-feature issues
print(result.issues) # detailed per-feature issues, capped to first 20 by default

result = validator.validate(max_errors=10)
print(result.is_valid)
print(result.errors) # returns up to the first 10 high-level errors
print(result.issues) # capped by the same max_errors limit
```
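
Each entry in `issues` is a plain dict with `filename`, `feature_index`, and `error_message` keys (as built in `validate_osw_errors` in the source below), so a minimal reporting loop looks like this sketch (the zip path is a placeholder):

```python
# Minimal sketch: report one line per detected issue.
result = OSWValidation(zipfile_path='dataset.zip').validate()
for issue in result.issues:
    # error_message is a list of strings for the feature
    print(f"{issue['filename']} [feature {issue['feature_index']}]: "
          f"{'; '.join(issue['error_message'])}")
```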

## Error behavior

- `errors`: high-level validation messages, capped by `max_errors` (default `20`).
- `issues`: detailed per-feature validation issues, also capped by `max_errors`.
- If null-like placeholders are found in feature `properties`, validation fails early, before schema checks, with actionable messages such as:
  - `Invalid value at 'climb': "null". Null/NaN placeholders are not allowed; provide a valid value or remove this property.`
- For enum validation, long allowed-value lists are summarized as:
  - the first 5 values joined by `|`
  - followed by `| and N more` when applicable (see the sketch below).
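
A minimal sketch of that preview rule (the `enum_preview` name and the exact spacing are assumptions; the package's real formatting lives in its internal helpers):

```python
def enum_preview(values, limit=5):
    # Join the first `limit` allowed values with pipes; summarize the rest.
    shown = " | ".join(str(v) for v in values[:limit])
    if len(values) > limit:
        shown += f" | and {len(values) - limit} more"
    return shown

# enum_preview(["down", "up", "left", "right", "both", "none", "n/a"])
# -> 'down | up | left | right | both | and 2 more'
```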

You can also override schemas.

117 changes: 87 additions & 30 deletions src/python_osw_validation/__init__.py
@@ -1,6 +1,7 @@
import os
import gc
import json
import math
import traceback
from typing import Dict, Any, Optional, List, Tuple
import geopandas as gpd
@@ -11,9 +12,9 @@
from .version import __version__
from .helpers import (
    _add_additional_properties_hint,
    _err_kind,
    _feature_index_from_error,
    _pretty_message,
    _rank_for,
)

SCHEMA_PATH = os.path.join(os.path.dirname(__file__), 'schema')
@@ -129,6 +130,29 @@ def _schema_key_from_text(self, text: Optional[str]) -> Optional[str]:

        return None

    def _is_nullish_value(self, value: Any) -> bool:
        if value is None:
            return True
        if isinstance(value, str) and value.strip().lower() in {"null", "nan"}:
            return True
        return isinstance(value, float) and math.isnan(value)

    def _collect_nullish_property_paths(self, obj: Any, prefix: str = "") -> List[Tuple[str, Any]]:
        paths: List[Tuple[str, Any]] = []
        if isinstance(obj, dict):
            for key, value in obj.items():
                next_prefix = f"{prefix}.{key}" if prefix else str(key)
                paths.extend(self._collect_nullish_property_paths(value, next_prefix))
            return paths
        if isinstance(obj, list):
            for idx, value in enumerate(obj):
                next_prefix = f"{prefix}[{idx}]" if prefix else f"[{idx}]"
                paths.extend(self._collect_nullish_property_paths(value, next_prefix))
            return paths
        if self._is_nullish_value(obj):
            paths.append((prefix or "value", obj))
        return paths
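    # Illustrative walk-through: for properties like
    #   {"climb": "null", "tags": [1, float("nan")]}
    # the collector returns [('climb', 'null'), ('tags[1]', nan)],
    # since _is_nullish_value accepts None, NaN floats, and "null"/"nan" strings.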

    def _contains_disallowed_features_for_02(self, geojson_data: Dict[str, Any]) -> set:
        """Detect Tree coverage or Custom content in legacy 0.2 datasets.

@@ -197,6 +221,12 @@ def pick_schema_for_file(self, file_path: str, geojson_data: Dict[str, Any]) ->
    # Core validation entrypoint
    # ----------------------------
    def validate(self, max_errors=20) -> ValidationResult:
        def _finalize(is_valid: bool, errors: Optional[List[str]] = None) -> ValidationResult:
            final_errors = self.errors if errors is None else errors
            final_errors = (final_errors or [])[:max_errors]
            final_issues = (self.issues or [])[:max_errors]
            return ValidationResult(is_valid, final_errors, final_issues)
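        # Illustrative: with the default max_errors=20, both `errors` and
        # `issues` are truncated to at most 20 entries before being returned.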

        zip_handler = None
        OSW_DATASET: Dict[str, Optional[gpd.GeoDataFrame]] = {}
        validator = None
@@ -211,7 +241,7 @@ def validate(self, max_errors=20) -> ValidationResult:
                    filename=self.zipfile_path,
                    feature_index=None
                )
                return ValidationResult(False, self.errors, self.issues)
                return _finalize(False)

            # Validate the folder structure
            validator = ExtractedDataValidator(self.extracted_dir)
@@ -222,7 +252,7 @@
                    filename=upload_name,
                    feature_index=None
                )
                return ValidationResult(False, self.errors, self.issues)
                return _finalize(False)

            # Per-file schema validation → populate self.issues (fixme-like)
            for file in validator.files:
@@ -232,7 +262,7 @@
                    break

            if self.errors:
                return ValidationResult(False, self.errors, self.issues)
                return _finalize(False)

            # Load GeoDataFrames for integrity checks
            for file in validator.files:
@@ -414,9 +444,9 @@ def validate(self, max_errors=20) -> ValidationResult:
                    break

            if self.errors:
                return ValidationResult(False, self.errors, self.issues)
                return _finalize(False)
            else:
                return ValidationResult(True, [], self.issues)
                return _finalize(True, [])

        except Exception as e:
            self.log_errors(
@@ -425,7 +455,7 @@ def validate(self, max_errors=20) -> ValidationResult:
                feature_index=None
            )
            traceback.print_exc()
            return ValidationResult(False, self.errors, self.issues)
            return _finalize(False)
        finally:
            # Cleanup extracted files
            try:
@@ -488,6 +518,37 @@ def validate_osw_errors(self, file_path: str, max_errors: int) -> bool:
        except OSError:
            return False

        filename = os.path.basename(file_path)

        # Upfront guard: reject null/NaN values in feature properties.
        # This runs before schema validation to surface data quality issues first.
        features = geojson_data.get("features", []) if isinstance(geojson_data, dict) else []
        found_nullish = False
        for idx, feature in enumerate(features):
            if not isinstance(feature, dict):
                continue
            props = feature.get("properties")
            if not isinstance(props, dict):
                continue
            bad_paths = self._collect_nullish_property_paths(props)
            for path, bad_value in bad_paths:
                if len(self.errors) >= max_errors:
                    return False
                found_nullish = True
                rendered = f'"{bad_value}"' if isinstance(bad_value, str) else str(bad_value)
                msg = (
                    f"Invalid value at '{path}': {rendered}. "
                    f"Null/NaN placeholders are not allowed; provide a valid value or remove this property."
                )
                self.errors.append(f"Validation error: {msg}")
                self.issues.append({
                    "filename": filename,
                    "feature_index": idx,
                    "error_message": [msg],
                })
        if found_nullish:
            return False
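        # Illustrative emitted message for properties {"climb": "null"}:
        #   Invalid value at 'climb': "null". Null/NaN placeholders are not
        #   allowed; provide a valid value or remove this property.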

        schema_url = geojson_data.get('$schema')
        if isinstance(schema_url, str) and '0.2/schema.json' in schema_url:
            reasons = self._contains_disallowed_features_for_02(geojson_data)
@@ -518,15 +579,9 @@ def validate_osw_errors(self, file_path: str, max_errors: int) -> bool:
        schema = self.load_osw_schema(schema_path)
        validator = jsonschema_rs.Draft7Validator(schema)

        filename = os.path.basename(file_path)

        # Per-feature best error accumulator (streaming)
        # feature_idx -> (rank_tuple, error_obj)
        best_by_feature: Dict[Optional[int], Tuple[tuple, Any]] = {}
        feature_order: List[Optional[int]] = []  # preserve first-seen order

        # Legacy cap
        legacy_count = 0
        collected_issues: List[Dict[str, Any]] = []

        # --- STREAM over errors; STOP as soon as legacy hits the cap ---
        for err in validator.iter_errors(geojson_data):
@@ -539,26 +594,28 @@ def validate_osw_errors(self, file_path: str, max_errors: int) -> bool:
                # We've reached the legacy cap; stop work to match original performance
                break

            # Track the best error per feature
            # Keep every issue (no per-feature collapsing)
            fidx = _feature_index_from_error(err)
            r = _rank_for(err)
            prev = best_by_feature.get(fidx)
            if prev is None:
                best_by_feature[fidx] = (r, err)
                feature_order.append(fidx)
            else:
                if r < prev[0]:
                    best_by_feature[fidx] = (r, err)

        # Build per-feature issues (one concise message per feature) in first-seen order
        for fidx in feature_order:
            _, best_err = best_by_feature[fidx]
            pretty = _pretty_message(best_err, schema)
            self.issues.append({
            collected_issues.append({
                "filename": filename,
                "feature_index": fidx if fidx is not None else -1,
                "error_message": [pretty],
                "error_message": [_pretty_message(err, schema)],
                "_kind": _err_kind(err),
            })

        # Drop noisy AnyOf summaries when specific field-level errors exist
        # for the same feature.
        has_specific_by_feature: Dict[int, bool] = {}
        for issue in collected_issues:
            fidx = issue["feature_index"]
            if issue.get("_kind") != "AnyOf":
                has_specific_by_feature[fidx] = True

        for issue in collected_issues:
            if issue.get("_kind") == "AnyOf" and has_specific_by_feature.get(issue["feature_index"], False):
                continue
            issue.pop("_kind", None)
            self.issues.append(issue)
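        # Illustrative: a feature reporting both an Enum and an AnyOf issue keeps
        # only the Enum issue; a feature whose only issue is AnyOf keeps it.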

        # Mirror original boolean behavior: False when we exactly hit the cap
        return len(self.errors) < max_errors