8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,13 @@
# Change log

### 0.3.7 - 2026-04-29
- Added upfront null/NaN placeholder detection before schema validation, including support for string placeholders such as `"null"` and `"nan"` in feature `properties`.
- Changed `issues` behavior to return all detected per-feature schema issues (not only a single best issue per feature).
- Suppressed noisy `AnyOf` summary messages when more specific field-level errors exist for the same feature.
- Improved human-readable validation messages for enum/type failures with clearer field-level context and actionable remediation text.
- Updated enum formatting to use compact previews for long value lists (pipe-separated values with `and N more` suffix).
- Added new regression coverage for `tests/assets/issue_3297.zip` and updated related unit test expectations for new message formats and nullish precheck flow.

### 0.3.6 - 2026-04-10
- Fixed https://dev.azure.com/TDEI-UW/TDEI/_workitems/edit/3469
- Added regression coverage for `tests/assets/task_3469.zip` to assert the exact per-feature `issues` payload: `"null" is not one of "down" or "up"` on `FIFA_sidewalks.edges.geojson` feature index `0`.
14 changes: 13 additions & 1 deletion README.md
@@ -13,6 +13,7 @@ This package validates OSW GeoJSON datasets packaged as a ZIP file.
- Extracts the provided ZIP file
- Finds supported OSW dataset files inside the extracted directory
- Validates each file (`edges`, `lines`, `nodes`, `points`, `polygons`, and `zones`) against the matching schema
- Performs an upfront data-quality check for null-like placeholders in feature properties (for example `null`, `NaN`, `"null"`, `"nan"`)
- Runs cross-file integrity checks, such as detecting duplicate `_id` values and verifying that edge and zone references resolve back to nodes
- Returns a `ValidationResult` object with `is_valid`, `errors`, and `issues`

@@ -33,13 +34,24 @@ validator = OSWValidation(zipfile_path='<Zip file path>')
result = validator.validate()
print(result.is_valid)
print(result.errors) # returns up to the first 20 high-level errors by default
print(result.issues) # per-file or per-feature issues
print(result.issues) # detailed per-feature issues, capped to first 20 by default

result = validator.validate(max_errors=10)
print(result.is_valid)
print(result.errors) # returns up to the first 10 high-level errors
print(result.issues) # capped by the same max_errors limit
```
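
Each entry in `issues` is a plain dict with `filename`, `feature_index`, and `error_message` keys (as built in `validate_osw_errors` in the source below), so a minimal reporting loop looks like this sketch (the zip path is a placeholder):

```python
# Minimal sketch: report one line per detected issue.
result = OSWValidation(zipfile_path='dataset.zip').validate()
for issue in result.issues:
    # error_message is a list of strings for the feature
    print(f"{issue['filename']} [feature {issue['feature_index']}]: "
          f"{'; '.join(issue['error_message'])}")
```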

## Error behavior

- `errors`: high-level validation messages, capped by `max_errors` (default `20`).
- `issues`: detailed per-feature validation issues, also capped by `max_errors`.
- If null-like placeholders are found in feature `properties`, validation fails early, before schema checks, with actionable messages such as:
  - `Invalid value at 'climb': "null". Null/NaN placeholders are not allowed; provide a valid value or remove this property.`
- For enum validation, long allowed-value lists are summarized as:
  - the first 5 values joined by `|`
  - followed by `| and N more` when applicable (see the sketch below).
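
A minimal sketch of that preview rule (the `enum_preview` name and the exact spacing are assumptions; the package's real formatting lives in its internal helpers):

```python
def enum_preview(values, limit=5):
    # Join the first `limit` allowed values with pipes; summarize the rest.
    shown = " | ".join(str(v) for v in values[:limit])
    if len(values) > limit:
        shown += f" | and {len(values) - limit} more"
    return shown

# enum_preview(["down", "up", "left", "right", "both", "none", "n/a"])
# -> 'down | up | left | right | both | and 2 more'
```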

You can also override schemas.

117 changes: 87 additions & 30 deletions src/python_osw_validation/__init__.py
@@ -1,6 +1,7 @@
import os
import gc
import json
import math
import traceback
from typing import Dict, Any, Optional, List, Tuple
import geopandas as gpd
@@ -11,9 +12,9 @@
from .version import __version__
from .helpers import (
    _add_additional_properties_hint,
    _err_kind,
    _feature_index_from_error,
    _pretty_message,
    _rank_for,
)

SCHEMA_PATH = os.path.join(os.path.dirname(__file__), 'schema')
@@ -129,6 +130,29 @@ def _schema_key_from_text(self, text: Optional[str]) -> Optional[str]:

        return None

    def _is_nullish_value(self, value: Any) -> bool:
        if value is None:
            return True
        if isinstance(value, str) and value.strip().lower() in {"null", "nan"}:
            return True
        return isinstance(value, float) and math.isnan(value)

    def _collect_nullish_property_paths(self, obj: Any, prefix: str = "") -> List[Tuple[str, Any]]:
        paths: List[Tuple[str, Any]] = []
        if isinstance(obj, dict):
            for key, value in obj.items():
                next_prefix = f"{prefix}.{key}" if prefix else str(key)
                paths.extend(self._collect_nullish_property_paths(value, next_prefix))
            return paths
        if isinstance(obj, list):
            for idx, value in enumerate(obj):
                next_prefix = f"{prefix}[{idx}]" if prefix else f"[{idx}]"
                paths.extend(self._collect_nullish_property_paths(value, next_prefix))
            return paths
        if self._is_nullish_value(obj):
            paths.append((prefix or "value", obj))
        return paths
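    # Illustrative walk-through: for properties like
    #   {"climb": "null", "tags": [1, float("nan")]}
    # the collector returns [('climb', 'null'), ('tags[1]', nan)],
    # since _is_nullish_value accepts None, NaN floats, and "null"/"nan" strings.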

    def _contains_disallowed_features_for_02(self, geojson_data: Dict[str, Any]) -> set:
        """Detect Tree coverage or Custom content in legacy 0.2 datasets.

@@ -197,6 +221,12 @@ def pick_schema_for_file(self, file_path: str, geojson_data: Dict[str, Any]) ->
    # Core validation entrypoint
    # ----------------------------
    def validate(self, max_errors=20) -> ValidationResult:
        def _finalize(is_valid: bool, errors: Optional[List[str]] = None) -> ValidationResult:
            final_errors = self.errors if errors is None else errors
            final_errors = (final_errors or [])[:max_errors]
            final_issues = (self.issues or [])[:max_errors]
            return ValidationResult(is_valid, final_errors, final_issues)
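        # Illustrative: with the default max_errors=20, both `errors` and
        # `issues` are truncated to at most 20 entries before being returned.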

        zip_handler = None
        OSW_DATASET: Dict[str, Optional[gpd.GeoDataFrame]] = {}
        validator = None
@@ -211,7 +241,7 @@ def validate(self, max_errors=20) -> ValidationResult:
                    filename=self.zipfile_path,
                    feature_index=None
                )
                return ValidationResult(False, self.errors, self.issues)
                return _finalize(False)

            # Validate the folder structure
            validator = ExtractedDataValidator(self.extracted_dir)
@@ -222,7 +252,7 @@
                    filename=upload_name,
                    feature_index=None
                )
                return ValidationResult(False, self.errors, self.issues)
                return _finalize(False)

            # Per-file schema validation → populate self.issues (fixme-like)
            for file in validator.files:
@@ -232,7 +262,7 @@
                    break

            if self.errors:
                return ValidationResult(False, self.errors, self.issues)
                return _finalize(False)

            # Load GeoDataFrames for integrity checks
            for file in validator.files:
@@ -414,9 +444,9 @@ def validate(self, max_errors=20) -> ValidationResult:
                    break

            if self.errors:
                return ValidationResult(False, self.errors, self.issues)
                return _finalize(False)
            else:
                return ValidationResult(True, [], self.issues)
                return _finalize(True, [])

        except Exception as e:
            self.log_errors(
@@ -425,7 +455,7 @@ def validate(self, max_errors=20) -> ValidationResult:
                feature_index=None
            )
            traceback.print_exc()
            return ValidationResult(False, self.errors, self.issues)
            return _finalize(False)
        finally:
            # Cleanup extracted files
            try:
@@ -488,6 +518,37 @@ def validate_osw_errors(self, file_path: str, max_errors: int) -> bool:
        except OSError:
            return False

        filename = os.path.basename(file_path)

        # Upfront guard: reject null/NaN values in feature properties.
        # This runs before schema validation to surface data quality issues first.
        features = geojson_data.get("features", []) if isinstance(geojson_data, dict) else []
        found_nullish = False
        for idx, feature in enumerate(features):
            if not isinstance(feature, dict):
                continue
            props = feature.get("properties")
            if not isinstance(props, dict):
                continue
            bad_paths = self._collect_nullish_property_paths(props)
            for path, bad_value in bad_paths:
                if len(self.errors) >= max_errors:
                    return False
                found_nullish = True
                rendered = f'"{bad_value}"' if isinstance(bad_value, str) else str(bad_value)
                msg = (
                    f"Invalid value at '{path}': {rendered}. "
                    f"Null/NaN placeholders are not allowed; provide a valid value or remove this property."
                )
                self.errors.append(f"Validation error: {msg}")
                self.issues.append({
                    "filename": filename,
                    "feature_index": idx,
                    "error_message": [msg],
                })
        if found_nullish:
            return False
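        # Illustrative emitted message for properties {"climb": "null"}:
        #   Invalid value at 'climb': "null". Null/NaN placeholders are not
        #   allowed; provide a valid value or remove this property.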

        schema_url = geojson_data.get('$schema')
        if isinstance(schema_url, str) and '0.2/schema.json' in schema_url:
            reasons = self._contains_disallowed_features_for_02(geojson_data)
@@ -518,15 +579,9 @@ def validate_osw_errors(self, file_path: str, max_errors: int) -> bool:
        schema = self.load_osw_schema(schema_path)
        validator = jsonschema_rs.Draft7Validator(schema)

        filename = os.path.basename(file_path)

        # Per-feature best error accumulator (streaming)
        # feature_idx -> (rank_tuple, error_obj)
        best_by_feature: Dict[Optional[int], Tuple[tuple, Any]] = {}
        feature_order: List[Optional[int]] = []  # preserve first-seen order

        # Legacy cap
        legacy_count = 0
        collected_issues: List[Dict[str, Any]] = []

        # --- STREAM over errors; STOP as soon as legacy hits the cap ---
        for err in validator.iter_errors(geojson_data):
@@ -539,26 +594,28 @@ def validate_osw_errors(self, file_path: str, max_errors: int) -> bool:
                # We've reached the legacy cap; stop work to match original performance
                break

            # Track the best error per feature
            # Keep every issue (no per-feature collapsing)
            fidx = _feature_index_from_error(err)
            r = _rank_for(err)
            prev = best_by_feature.get(fidx)
            if prev is None:
                best_by_feature[fidx] = (r, err)
                feature_order.append(fidx)
            else:
                if r < prev[0]:
                    best_by_feature[fidx] = (r, err)

        # Build per-feature issues (one concise message per feature) in first-seen order
        for fidx in feature_order:
            _, best_err = best_by_feature[fidx]
            pretty = _pretty_message(best_err, schema)
            self.issues.append({
            collected_issues.append({
                "filename": filename,
                "feature_index": fidx if fidx is not None else -1,
                "error_message": [pretty],
                "error_message": [_pretty_message(err, schema)],
                "_kind": _err_kind(err),
            })

        # Drop noisy AnyOf summaries when specific field-level errors exist
        # for the same feature.
        has_specific_by_feature: Dict[int, bool] = {}
        for issue in collected_issues:
            fidx = issue["feature_index"]
            if issue.get("_kind") != "AnyOf":
                has_specific_by_feature[fidx] = True

        for issue in collected_issues:
            if issue.get("_kind") == "AnyOf" and has_specific_by_feature.get(issue["feature_index"], False):
                continue
            issue.pop("_kind", None)
            self.issues.append(issue)
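        # Illustrative: a feature reporting both an Enum and an AnyOf issue keeps
        # only the Enum issue; a feature whose only issue is AnyOf keeps it.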

        # Mirror original boolean behavior: False when we exactly hit the cap
        return len(self.errors) < max_errors