From 96ce799d504af720f5dfbb3843b7683fec3436ba Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Sun, 10 May 2026 15:51:26 -0700 Subject: [PATCH 1/2] Start #1413 joined-row aggregation work From 1d3e477961cb344db3dce4f5f01fe9da1b1650fa Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Sun, 10 May 2026 16:11:12 -0700 Subject: [PATCH 2/2] Fix #1413 IC4 CASE aggregation lowering --- CHANGELOG.md | 1 + graphistry/compute/gfql/cypher/lowering.py | 27 +++--- .../compute/gfql/cypher/test_lowering.py | 82 +++++++++++++++++++ 3 files changed, 100 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cb094d12c1..9f447f0a84 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - **GFQL / Cypher ORDER BY on stringified-list columns uses Cypher list-orderability (#1359, meta #1353 item #1)**: When a list-valued property is stored as a string column (e.g. round-tripped through CSV / Arrow string columns), `ORDER BY` previously fell back to lex string sort, which mishandles negative numbers because `"-"` < `"2"` in ASCII (e.g. `"[1, -20]"` sorted before `"[1, 2]"`). Added `order_detect_stringified_list_series` + `parse_stringified_list_series` in `graphistry/compute/gfql/row/ordering.py`, and routed the row pipeline through `build_list_sort_columns` after `ast.literal_eval`-parsing the string entries when the column is fully list-shaped (`^\[.*\]$` per-row). Python-list-typed columns continue through the existing list-aware path unchanged. Includes pygraphistry-side regression coverage on both Python-list and stringified-list inputs (`test_string_cypher_order_by_python_list_column_uses_list_orderability`, `test_string_cypher_order_by_stringified_list_column_uses_list_orderability`). 
The matching TCK port-level fixture/runner fixes that flip the 14 `with-orderBy` wrong-row scenarios to `success_matches_expected` are tracked in [tck-gfql #36](https://github.com/graphistry/tck-gfql/issues/36). ### Fixed +- **GFQL / Cypher joined-row aggregation CASE chained-comparison lowering (#1413, #880)**: Cypher lowering now rewrites chained comparisons inside searched `CASE WHEN ... THEN` conditions before row-expression validation, so LDBC IC4/new-topics-style joined-row aggregation queries using `$startDate <= post.creationDate < $endDate` no longer fail the local GFQL subset gate. The rewrite is constrained to unquoted CASE conditions, preserves unrelated CASE comparison bodies, and adds adversarial regression coverage for multiple chained CASE arms and multiple joined-row aggregation CASE flags in `graphistry/tests/compute/gfql/cypher/test_lowering.py`. - **GFQL / Cypher temporal historical named-zone canonicalization + comparison parity (`#1406`, `#1353`)**: Direct-Cypher datetime canonicalization now applies Neo4j/TCK-compatible historical timezone offsets for pre-standard-time `Europe/Stockholm` named-zone literals in `graphistry/compute/gfql/temporal_text.py` (notably `1818-07-21` -> `+00:53:28`). This closes the residual wrong-row case `expr-temporal2-6-5` and keeps equality/comparison behavior consistent when one side is zone-derived and the other is explicit offset text. Added focused regression coverage in `graphistry/tests/compute/gfql/cypher/test_temporal_text.py` and `graphistry/tests/compute/gfql/cypher/test_lowering.py`. 
- **GFQL / Cypher structural list/map equality now preserves null-unknown semantics (#1405, #1353)**: Direct-Cypher and row-pipeline comparison evaluation in `graphistry/compute/gfql/row/pipeline.py` now uses recursive tri-valued structural equality for list/map families under `=`, `!=`, and `<>`, so nested null comparisons return `null` instead of collapsing to Python `true/false` (for example `[[1], [2]] = [[1], [null]]`, `{k: null} = {k: null}`). Added regression coverage in `graphistry/tests/compute/test_gfql.py`, plus expanded ORDER BY nested non-primitive (raw + stringified map/list) pandas/cuDF parity amplification in `graphistry/tests/compute/gfql/test_row_pipeline_ops.py`, including RAPIDS 25.02/26.02 dgx validation. - **GFQL / Cypher tri-valued list/map/null expression semantics tranche C (#1407, #1353)**: Row-expression equality/membership now preserves openCypher three-valued null semantics for nested list/map comparisons in `graphistry/compute/gfql/row/pipeline.py` by using recursive value equality with null-unknown propagation and routing `IN` through the same tri-valued comparator. This fixes direct-Cypher wrong-row outcomes where nested null comparisons were previously collapsed to booleans (`expr-comparison1-6-5`, `expr-comparison1-7-{12..16}`, `expr-list5-{21,29,31,34}`). Optional single-MATCH empty-row projection synthesis now infers deterministic `IS NULL` / `IS NOT NULL` outputs for optional-alias operands instead of nulling all projected columns (`expr-null1-3`, `expr-null2-3`) via `projection_planning.py` + `lowering.py` empty-result-row wiring. Added focused regressions in `graphistry/tests/compute/gfql/cypher/test_lowering.py`. 
diff --git a/graphistry/compute/gfql/cypher/lowering.py b/graphistry/compute/gfql/cypher/lowering.py index f55f675c33..67c7ac15e0 100644 --- a/graphistry/compute/gfql/cypher/lowering.py +++ b/graphistry/compute/gfql/cypher/lowering.py @@ -592,16 +592,23 @@ def _replace(match: re.Match[str]) -> str: def _rewrite_chained_comparison_expr(expr_text: str) -> str: - match = _CYPHER_CHAINED_COMPARISON_RE.fullmatch(expr_text) - if match is None: - return expr_text - left = match.group("left").strip() - middle = match.group("middle").strip() - right = match.group("right").strip() - if any(token in segment.upper() for token in {" AND", " OR", " XOR"} for segment in (left, middle, right)): - return expr_text - return f"({left} {match.group('op1')} {middle}) AND ({middle} {match.group('op2')} {right})" - + def _rewrite_match_text(text: str) -> str: + match = _CYPHER_CHAINED_COMPARISON_RE.fullmatch(text) + if match is None: + return text + left = match.group("left").strip() + middle = match.group("middle").strip() + right = match.group("right").strip() + if any(token in segment.upper() for token in {" AND", " OR", " XOR"} for segment in (left, middle, right)): + return text + return f"({left} {match.group('op1')} {middle}) AND ({middle} {match.group('op2')} {right})" + + def _replace_case_condition(match: re.Match[str]) -> str: + condition = match.group("condition") + return match.group(0) if (rewritten_condition := _rewrite_match_text(condition)) == condition else f"{match.group('prefix')}{rewritten_condition}{match.group('suffix')}" + + case_rewritten = _rewrite_unquoted_expr_segments(expr_text, rewrite=lambda segment: re.sub(r"(?P<prefix>\bWHEN\s+)(?P<condition>.*?)(?P<suffix>\s+THEN\b)", _replace_case_condition, segment, flags=re.IGNORECASE | re.DOTALL)) + return case_rewritten if case_rewritten != expr_text or expr_text.lstrip().upper().startswith("CASE ") else _rewrite_match_text(expr_text) def _unsupported(message: str, *, field: str, value: Any, line: int, column: int) -> GFQLValidationError: 
return GFQLValidationError( diff --git a/graphistry/tests/compute/gfql/cypher/test_lowering.py b/graphistry/tests/compute/gfql/cypher/test_lowering.py index 0e4d50d053..de838bd8ca 100644 --- a/graphistry/tests/compute/gfql/cypher/test_lowering.py +++ b/graphistry/tests/compute/gfql/cypher/test_lowering.py @@ -3273,6 +3273,45 @@ def test_string_cypher_supports_generic_match_where_chained_comparison() -> None assert result._nodes.to_dict(orient="records") == [] +def test_issue_1413_searched_case_rewrites_multiple_chained_when_arms() -> None: + graph = _mk_graph( + pd.DataFrame({"id": ["a", "b", "c"], "score": [5, 15, 25]}), + pd.DataFrame({"s": [], "d": []}), + ) + + result = graph.gfql( + "MATCH (n) RETURN n.id AS id, " + "CASE WHEN 0 <= n.score < 10 THEN 'low' " + "WHEN 10 <= n.score < 20 THEN 'mid' " + "ELSE 'high' END AS bucket ORDER BY id" + ) + + assert result._nodes.to_dict(orient="records") == [ + {"id": "a", "bucket": "low"}, + {"id": "b", "bucket": "mid"}, + {"id": "c", "bucket": "high"}, + ] + + +def test_issue_1413_searched_case_preserves_unrelated_comparisons() -> None: + graph = _mk_graph( + pd.DataFrame({"id": ["a", "b", "c"], "score": [-1, 5, 15]}), + pd.DataFrame({"s": [], "d": []}), + ) + + result = graph.gfql( + "MATCH (n) RETURN n.id AS id, " + "CASE WHEN n.score > 0 THEN n.score < 10 ELSE false END AS inRange " + "ORDER BY id" + ) + + assert result._nodes.to_dict(orient="records") == [ + {"id": "a", "inRange": False}, + {"id": "b", "inRange": True}, + {"id": "c", "inRange": False}, + ] + + def test_string_cypher_supports_distinct_with_aggregate_grouping() -> None: graph = _mk_graph( pd.DataFrame( @@ -13203,6 +13242,49 @@ def test_string_cypher_multi_alias_with_three_stage_case_aggregation() -> None: assert all(r["postCount"] > 0 for r in records) +def test_issue_1413_ic4_new_topics_exact_ldbc_reference_query() -> None: + """IC-4 new-topics exact LDBC reference shape: joined rows + CASE aggregation (#1413, #880).""" + graph = _mk_ic4_shape_graph() 
+ result = graph.gfql( + "MATCH (person:Person {id: $personId})-[:KNOWS]-(friend:Person), " + "(friend)<-[:HAS_CREATOR]-(post:Post)-[:HAS_TAG]->(tag) " + "WITH DISTINCT tag, post " + "WITH tag, " + "CASE WHEN $startDate <= post.creationDate < $endDate THEN 1 ELSE 0 END AS valid, " + "CASE WHEN post.creationDate < $startDate THEN 1 ELSE 0 END AS inValid " + "WITH tag, sum(valid) AS postCount, sum(inValid) AS inValidPostCount " + "WHERE postCount>0 AND inValidPostCount=0 " + "RETURN tag.name AS tagName, postCount " + "ORDER BY postCount DESC, tagName ASC " + "LIMIT 10", + params={"personId": "p1", "startDate": 150, "endDate": 350}, + ) + assert result._nodes.to_dict(orient="records") == [ + {"tagName": "TagB", "postCount": 1}, + ] + + +def test_issue_1413_ic4_new_topics_multiple_chained_case_flags() -> None: + graph = _mk_ic4_shape_graph() + result = graph.gfql( + "MATCH (person:Person {id: $personId})-[:KNOWS]-(friend:Person), " + "(friend)<-[:HAS_CREATOR]-(post:Post)-[:HAS_TAG]->(tag) " + "WITH DISTINCT tag, post " + "WITH tag, " + "CASE WHEN $startDate <= post.creationDate < $endDate THEN 1 ELSE 0 END AS valid, " + "CASE WHEN $invalidStart <= post.creationDate < $startDate THEN 1 ELSE 0 END AS inValid " + "WITH tag, sum(valid) AS postCount, sum(inValid) AS inValidPostCount " + "WHERE postCount > 0 AND inValidPostCount = 0 " + "RETURN tag.name AS tagName, postCount " + "ORDER BY postCount DESC, tagName ASC " + "LIMIT 10", + params={"personId": "p1", "invalidStart": 0, "startDate": 150, "endDate": 350}, + ) + assert result._nodes.to_dict(orient="records") == [ + {"tagName": "TagB", "postCount": 1}, + ] + + def test_issue_1038_ic4_return_side_case_expression_regression_lock() -> None: """Regression lock for #1038: RETURN-side CASE over IC4-shaped post timestamp ranges.""" graph = _mk_graph(