Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html
- **GFQL / Cypher ORDER BY on stringified-list columns uses Cypher list-orderability (#1359, meta #1353 item #1)**: When a list-valued property is stored as a string column (e.g. round-tripped through CSV / Arrow string columns), `ORDER BY` previously fell back to lex string sort, which mishandles negative numbers because `"-"` < `"2"` in ASCII (e.g. `"[1, -20]"` sorted before `"[1, 2]"`). Added `order_detect_stringified_list_series` + `parse_stringified_list_series` in `graphistry/compute/gfql/row/ordering.py`, and routed the row pipeline through `build_list_sort_columns` after `ast.literal_eval`-parsing the string entries when the column is fully list-shaped (`^\[.*\]$` per-row). Python-list-typed columns continue through the existing list-aware path unchanged. Includes pygraphistry-side regression coverage on both Python-list and stringified-list inputs (`test_string_cypher_order_by_python_list_column_uses_list_orderability`, `test_string_cypher_order_by_stringified_list_column_uses_list_orderability`). The matching TCK port-level fixture/runner fixes that flip the 14 `with-orderBy` wrong-row scenarios to `success_matches_expected` are tracked in [tck-gfql #36](https://github.com/graphistry/tck-gfql/issues/36).

### Fixed
- **GFQL / Cypher IC3 carried-row reentry joined aggregation (#1413, #880)**: Cypher lowering now admits the IC3/cross-country-style `WITH person, collect(city) AS cities MATCH ...` bounded-reentry shape with multiple post-reentry `WITH` stages, carries the collected city list alongside the `person` whole-row alias, and evaluates whole-row node membership such as `NOT friendCity IN cities` against collected entity lists. Added adversarial regressions for `collect(city)` and `collect(DISTINCT city)` with searched CASE aggregation and post-aggregate filtering in `graphistry/tests/compute/gfql/cypher/test_lowering.py`.
- **GFQL / Cypher joined-row aggregation CASE chained-comparison lowering (#1413, #880)**: Cypher lowering now rewrites chained comparisons inside searched `CASE WHEN ... THEN` conditions before row-expression validation, so LDBC IC4/new-topics-style joined-row aggregation queries using `$startDate <= post.creationDate < $endDate` no longer fail the local GFQL subset gate. The rewrite is constrained to unquoted CASE conditions, preserves unrelated CASE comparison bodies, and adds adversarial regression coverage for multiple chained CASE arms and multiple joined-row aggregation CASE flags in `graphistry/tests/compute/gfql/cypher/test_lowering.py`.
- **GFQL / Cypher temporal historical named-zone canonicalization + comparison parity (`#1406`, `#1353`)**: Direct-Cypher datetime canonicalization now applies Neo4j/TCK-compatible historical timezone offsets for pre-standard-time `Europe/Stockholm` named-zone literals in `graphistry/compute/gfql/temporal_text.py` (notably `1818-07-21` -> `+00:53:28`). This closes the residual wrong-row case `expr-temporal2-6-5` and keeps equality/comparison behavior consistent when one side is zone-derived and the other is explicit offset text. Added focused regression coverage in `graphistry/tests/compute/gfql/cypher/test_temporal_text.py` and `graphistry/tests/compute/gfql/cypher/test_lowering.py`.
- **GFQL / Cypher structural list/map equality now preserves null-unknown semantics (#1405, #1353)**: Direct-Cypher and row-pipeline comparison evaluation in `graphistry/compute/gfql/row/pipeline.py` now uses recursive tri-valued structural equality for list/map families under `=`, `!=`, and `<>`, so nested null comparisons return `null` instead of collapsing to Python `true/false` (for example `[[1], [2]] = [[1], [null]]`, `{k: null} = {k: null}`). Added regression coverage in `graphistry/tests/compute/test_gfql.py`, plus expanded ORDER BY nested non-primitive (raw + stringified map/list) pandas/cuDF parity amplification in `graphistry/tests/compute/gfql/test_row_pipeline_ops.py`, including RAPIDS 25.02/26.02 dgx validation.
Expand Down
2 changes: 1 addition & 1 deletion bin/ci_cypher_surface_guard_baseline.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@
"max_properties": 0
}
},
"lowering_py_max_lines": 8862
"lowering_py_max_lines": 8973
}
269 changes: 191 additions & 78 deletions graphistry/compute/gfql/cypher/lowering.py

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion graphistry/compute/gfql/cypher/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1610,12 +1610,16 @@ def query_body(self, meta: Any, items: Sequence[Any]) -> CypherQuery:
elif idx != len(stages) - 1:
with_stages.append(stage)
if reentry_match_clauses:
too_many_suffix_withs = (
len(reentry_match_clauses) > 1
and len(with_stages) > len(reentry_match_clauses) + 1
)
if (
stages[0].clause.kind != "with"
or stages[-1].clause.kind != "return"
or any(stage.clause.kind != "with" for stage in stages[:-1])
or len(with_stages) < len(reentry_match_clauses)
or len(with_stages) > len(reentry_match_clauses) + 1
or too_many_suffix_withs
):
first_match = reentry_match_clauses[0]
raise _to_syntax_error(
Expand Down
10 changes: 6 additions & 4 deletions graphistry/compute/gfql/cypher/reentry/compiletime.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,10 +243,12 @@ def _compile_bounded_reentry_query(
span=first_unwind.span,
)
query = rewritten_query
if not query.reentry_matches or len(query.with_stages) not in {
len(query.reentry_matches),
len(query.reentry_matches) + 1,
}:
too_few_withs = len(query.with_stages) < len(query.reentry_matches)
too_many_suffix_withs = (
len(query.reentry_matches) > 1
and len(query.with_stages) > len(query.reentry_matches) + 1
)
if not query.reentry_matches or too_few_withs or too_many_suffix_withs:
raise _unsupported_at_span(
"Cypher MATCH after WITH is only supported for alternating MATCH ... WITH ... MATCH ... [WITH ... MATCH ...] ... [WITH] RETURN read shapes in the local compiler",
field="match",
Expand Down
17 changes: 17 additions & 0 deletions graphistry/compute/gfql/cypher/reentry/rewrite.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,23 @@ def _rewrite_reentry_expr_to_hidden_properties(
has_non_source = bool(non_source_carried_props)
if not carried_columns and not has_non_source:
return expr
needs_scalar_rewrite = any(
re.search(rf"(?<![A-Za-z0-9_]){re.escape(output_name)}(?![A-Za-z0-9_])", expr.text)
or _reentry_hidden_column_name(output_name) in expr.text
for output_name in carried_columns
)
needs_non_source_rewrite = False
if non_source_carried_props:
needs_non_source_rewrite = any(
re.search(
rf"(?<![A-Za-z0-9_]){re.escape(alias_name)}\.{re.escape(prop)}(?![A-Za-z0-9_])",
expr.text,
)
for alias_name, props in non_source_carried_props.items()
for prop in props
)
if not needs_scalar_rewrite and not needs_non_source_rewrite:
return expr
normalized_text = expr.text
for output_name in carried_columns:
hidden_name = _reentry_hidden_column_name(output_name)
Expand Down
54 changes: 39 additions & 15 deletions graphistry/compute/gfql/row/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -1020,16 +1020,30 @@ def _gfql_eval_expr_ast(self, table_df: Any, node: Any) -> Tuple[bool, Any]:
alias_names.append(extra_arg.value)
else:
return False, None
source_alias = alias_names[0]
if source_alias not in table_df.columns:
source_alias_name = alias_names[0]
source_alias = source_alias_name
source_table_df = table_df
entity_id = getattr(self, "_node" if fn == "__node_keys__" else "_edge", None)
prefixed_id_col = f"{source_alias_name}.{entity_id}" if entity_id else None
if prefixed_id_col is not None and prefixed_id_col in table_df.columns:
prefix = f"{source_alias_name}."
alias_cols = [
col for col in table_df.columns
if isinstance(col, str) and col.startswith(prefix)
]
source_table_df = table_df[alias_cols].copy().rename(
columns={col: col[len(prefix):] for col in alias_cols}
)
source_alias = str(entity_id)
elif source_alias not in table_df.columns:
return False, None
out = entity_keys_series(
table_df,
source_table_df,
alias_col=source_alias,
table=("nodes" if fn == "__node_keys__" else "edges"),
excluded=tuple(alias_names),
)
null_mask = self._gfql_null_mask(table_df, table_df[source_alias])
null_mask = self._gfql_null_mask(source_table_df, source_table_df[source_alias])
if hasattr(out, "where"):
out = self._gfql_mask_fill(out, null_mask, None)
return True, out
Expand All @@ -1044,22 +1058,32 @@ def _gfql_eval_expr_ast(self, table_df: Any, node: Any) -> Tuple[bool, Any]:
entity_alias_names.append(extra_arg.value)
else:
return False, None
source_alias = entity_alias_names[0]
if source_alias not in table_df.columns:
# On bindings-row tables, resolve alias to alias.{node_id} (#880)
node_id = getattr(self, "_node", None)
id_col = f"{source_alias}.{node_id}" if node_id else None
if id_col is not None and id_col in table_df.columns:
source_alias = id_col
else:
return False, None
source_alias_name = entity_alias_names[0]
source_alias = source_alias_name
source_table_df = table_df
# On bindings-row tables, prefer alias.{id} even when a same-name
# marker column exists for the alias.
entity_id = getattr(self, "_node" if fn == "__node_entity__" else "_edge", None)
id_col = f"{source_alias_name}.{entity_id}" if entity_id else None
if id_col is not None and id_col in table_df.columns:
prefix = f"{source_alias_name}."
alias_cols = [
col for col in table_df.columns
if isinstance(col, str) and col.startswith(prefix)
]
source_table_df = table_df[alias_cols].copy().rename(
columns={col: col[len(prefix):] for col in alias_cols}
)
source_alias = str(entity_id)
elif source_alias not in table_df.columns:
return False, None
out = self._gfql_format_entity_series(
table_df,
source_table_df,
alias_col=source_alias,
table=("nodes" if fn == "__node_entity__" else "edges"),
excluded=tuple(entity_alias_names),
)
null_mask = self._gfql_null_mask(table_df, table_df[source_alias])
null_mask = self._gfql_null_mask(source_table_df, source_table_df[source_alias])
if hasattr(out, "where"):
out = self._gfql_mask_fill(out, null_mask, None)
return True, out
Expand Down
81 changes: 81 additions & 0 deletions graphistry/tests/compute/gfql/cypher/test_lowering.py
Original file line number Diff line number Diff line change
Expand Up @@ -3520,6 +3520,13 @@ def test_string_cypher_failfast_relationship_whole_row_grouped_count_star_bounda
graph.gfql("MATCH (a:L)-[rel]->(b) RETURN a, count(*)")


def test_string_cypher_failfast_optional_match_collect_null_whole_row_return_boundary() -> None:
    """OPTIONAL MATCH + collect(x) next to a whole-row RETURN must fail fast.

    Returning both the whole-row alias ``n`` and ``collect(x)`` exceeds the
    local compiler's supported shape, so lowering raises a validation error
    (matched on its "one MATCH source alias" message) instead of mis-compiling.
    """
    # Single node, zero edges: the OPTIONAL MATCH could only bind x to null.
    graph = _mk_graph(pd.DataFrame({"id": ["n1"]}), pd.DataFrame({"s": [], "d": []}))

    with pytest.raises(GFQLValidationError, match="one MATCH source alias"):
        graph.gfql("MATCH (n) OPTIONAL MATCH (n)-[:NOT_EXIST]->(x) RETURN n, collect(x)")


def test_string_cypher_supports_optional_match_collect_alias_property() -> None:
graph = _mk_graph(
pd.DataFrame(
Expand Down Expand Up @@ -12674,6 +12681,35 @@ def test_string_cypher_rejects_obviously_non_boolean_operands_in_boolean_ops(que
# ── Multi-alias WITH projection from connected MATCH (#880 / IC-4 shape) ──


def _mk_ic3_cross_country_shape_graph() -> _CypherTestGraph:
    """Graph for IC-3-style carried row + collected city list reentry tests."""
    # One person (p1) located in cityA, knowing three friends, each located
    # in their own city (friend2 shares p1's city, cityA).
    person_flags = [True, False, True, True, True, False, False]
    nodes_df = pd.DataFrame(
        {
            "id": ["p1", "cityA", "friend1", "friend2", "friend3", "cityB", "cityC"],
            "label__Person": person_flags,
            # Every node is exactly one of Person/City, so City is the complement.
            "label__City": [not is_person for is_person in person_flags],
            "name": ["", "CityA", "", "", "", "CityB", "CityC"],
        }
    )
    # Edge rows kept in the original insertion order: p1's location, p1's
    # KNOWS edges, then each friend's location.
    edge_rows = [
        ("p1", "cityA", "IS_LOCATED_IN"),
        ("p1", "friend1", "KNOWS"),
        ("p1", "friend2", "KNOWS"),
        ("p1", "friend3", "KNOWS"),
        ("friend1", "cityB", "IS_LOCATED_IN"),
        ("friend2", "cityA", "IS_LOCATED_IN"),
        ("friend3", "cityC", "IS_LOCATED_IN"),
    ]
    edges_df = pd.DataFrame(edge_rows, columns=["s", "d", "type"])
    return _mk_graph(nodes_df, edges_df)


def _mk_ic4_shape_graph() -> _CypherTestGraph:
"""Graph for IC-4 multi-alias WITH tests: person-KNOWS-friend, post-HAS_CREATOR->friend, post-HAS_TAG->tag."""
return _mk_graph(
Expand Down Expand Up @@ -13285,6 +13321,51 @@ def test_issue_1413_ic4_new_topics_multiple_chained_case_flags() -> None:
]


def test_issue_1413_ic3_cross_country_carried_row_collect_list_reentry_case_sum() -> None:
    """IC-3 shape (#1413): carried person row + collect(city) list through reentry.

    Exercises `NOT friendCity IN cities` against a collected entity list,
    CASE-flag projection, and a second aggregating WITH (sum of the flags),
    ordered by friendId.  friend2 is excluded because its city (cityA) is in
    the collected `cities` list for p1.
    """
    graph = _mk_ic3_cross_country_shape_graph()
    result = graph.gfql(
        "MATCH (person:Person {id: $personId})-[:IS_LOCATED_IN]->(city:City) "
        "WITH person, collect(city) AS cities "
        "MATCH (person)-[:KNOWS]-(friend:Person)-[:IS_LOCATED_IN]->(friendCity:City) "
        "WHERE NOT person = friend AND NOT friendCity IN cities "
        "WITH friend, "
        "CASE WHEN friendCity.name = $countryXName THEN 1 ELSE 0 END AS messageX, "
        "CASE WHEN friendCity.name = $countryYName THEN 1 ELSE 0 END AS messageY "
        "WITH friend, sum(messageX) AS xCount, sum(messageY) AS yCount "
        "RETURN friend.id AS friendId, xCount, yCount "
        "ORDER BY friendId ASC",
        params={"personId": "p1", "countryXName": "CityB", "countryYName": "CityC"},
    )

    assert result._nodes.to_dict(orient="records") == [
        {"friendId": "friend1", "xCount": 1, "yCount": 0},
        {"friendId": "friend3", "xCount": 0, "yCount": 1},
    ]


def test_issue_1413_ic3_collect_distinct_entity_membership_with_post_aggregate_where() -> None:
    """IC-3 variant (#1413): collect(DISTINCT city) + post-aggregate WHERE.

    Differs from the non-DISTINCT test by using a parenthesized
    `NOT (friendCity IN cities)`, DISTINCT collection, a WHERE filter after
    the final aggregating WITH, and ORDER BY yCount DESC.  friend2 drops out
    (its city is in `cities`, so both flags would be 0 and fail the WHERE).
    """
    graph = _mk_ic3_cross_country_shape_graph()
    result = graph.gfql(
        "MATCH (person:Person {id: $personId})-[:IS_LOCATED_IN]->(city:City) "
        "WITH person, collect(DISTINCT city) AS cities "
        "MATCH (person)-[:KNOWS]-(friend:Person)-[:IS_LOCATED_IN]->(friendCity:City) "
        "WHERE NOT (friendCity IN cities) "
        "WITH friend, "
        "CASE WHEN friendCity.name = $countryXName THEN 1 ELSE 0 END AS messageX, "
        "CASE WHEN friendCity.name = $countryYName THEN 1 ELSE 0 END AS messageY "
        "WITH friend, sum(messageX) AS xCount, sum(messageY) AS yCount "
        "WHERE xCount > 0 OR yCount > 0 "
        "RETURN friend.id AS friendId, xCount, yCount "
        "ORDER BY yCount DESC, friendId ASC",
        params={"personId": "p1", "countryXName": "CityB", "countryYName": "CityC"},
    )

    assert result._nodes.to_dict(orient="records") == [
        {"friendId": "friend3", "xCount": 0, "yCount": 1},
        {"friendId": "friend1", "xCount": 1, "yCount": 0},
    ]


def test_issue_1038_ic4_return_side_case_expression_regression_lock() -> None:
"""Regression lock for #1038: RETURN-side CASE over IC4-shaped post timestamp ranges."""
graph = _mk_graph(
Expand Down
Loading