From 28501d870b91870cc8e68293368ebcfdbc7e4ced Mon Sep 17 00:00:00 2001 From: dmitry Date: Thu, 14 May 2026 14:39:09 +0300 Subject: [PATCH] fix neighbors_v2 edge-type filter for Kuzu label IN bug - Replace label(e) IN $list with OR of scalar label(e) = $p (Kuzu 0.11.x). - Avoid typed union rel patterns here: RETURN columns differ across rel types. - Add regression test for strict-subset edge_types; adjust FakeGraph stub. - Document Kuzu Cypher pitfalls in AGENTS.md, kuzu_queries.py, tests/README. Fixes HumanBean17/java-codebase-rag#119. Co-authored-by: Cursor --- AGENTS.md | 16 ++++++++++++++++ kuzu_queries.py | 5 +++++ mcp_v2.py | 17 +++++++++++++---- tests/README.md | 2 ++ tests/test_mcp_v2.py | 36 +++++++++++++++++++++++++++++++++++- 5 files changed, 71 insertions(+), 5 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 74b7beb..5f998a0 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -85,6 +85,22 @@ Read these directly. Don't rely on rule files to mirror them. "Re-index required" callout and bump `ontology_version` when enrichment semantics change. +## Kuzu Cypher pitfalls + +When adding or editing Cypher run against Kuzu (for example in +`kuzu_queries.py`, `mcp_v2.py`, or any `KuzuGraph._rows` caller): + +- **Do not filter relationship types with** `label(e) IN $list` **or** + `label(e) IN ["A","B"]` **in** `WHERE`. On supported versions this can + be ignored or wrong; prefer **OR of scalar equalities** + (`label(e) = $p OR label(e) = $q …`) with bound parameters, after + validating labels against an allowlist (see `neighbors_v2` in + `mcp_v2.py`). +- **Typed union patterns** like `-[e:CALLS|HTTP_CALLS]->` are only safe if + every column you `RETURN` from `e` exists on **all** of those + relationship types in the graph schema. Otherwise prefer untyped `[e]` + plus explicit label filtering, or split queries. + ## Workflow - Branch from `master`. Branch names: `cursor/` (CLI work), diff --git a/kuzu_queries.py b/kuzu_queries.py index 95fe39a..0aa41c3 100644 --- a/kuzu_queries.py +++ b/kuzu_queries.py @@ -5,6 +5,11 @@ The Kuzu database is opened read-only and cached per-process. This module is intentionally dependency-light: nothing here imports LanceDB or sentence-transformers. + +Cypher pitfalls (see also ``AGENTS.md``): avoid ``label(e) IN $list`` in ``WHERE`` for +relationship-type filters; use OR of ``label(e) = $param`` with bound parameters. +Typed unions ``-[e:A|B]-`` require every ``RETURN`` column on ``e`` to exist on all +listed rel types, or the binder may fail. """ from __future__ import annotations diff --git a/mcp_v2.py b/mcp_v2.py index df7e34f..d2ebfc2 100644 --- a/mcp_v2.py +++ b/mcp_v2.py @@ -532,6 +532,12 @@ def neighbors_v2( ) -> NeighborsOutput: try: _NEIGHBOR_EDGE_TYPES_ADAPTER.validate_python(edge_types) + # Kuzu 0.11.x can drop `label(e) IN $list` in WHERE; use OR of scalar equalities instead. + # Typed unions like `[e:CALLS|HTTP_CALLS]` fail the binder when RETURN references rel + # columns that exist on only some of the union members. + labels = list(dict.fromkeys(edge_types)) + label_params = [f"l{i}" for i in range(len(labels))] + label_predicate = "(" + " OR ".join(f"label(e) = ${name}" for name in label_params) + ")" g = graph or KuzuGraph.get() raw_filter = _coerce_filter(filter) nf = ( @@ -543,27 +549,30 @@ def neighbors_v2( results: list[Edge] = [] for origin_id in origins: _resolve_node_kind(g, origin_id) + q_params = {"id": origin_id, **dict(zip(label_params, labels, strict=True))} if direction == "out": rows = g._rows( # noqa: SLF001 - "MATCH (a)-[e]->(b) WHERE a.id = $id AND label(e) IN $edge_types " + "MATCH (a)-[e]->(b) WHERE a.id = $id AND " + f"{label_predicate} " "RETURN b.id AS other_id, label(e) AS edge_type, e.confidence AS confidence, " "e.strategy AS strategy, e.match AS match, e.mechanism AS mechanism, " "e.annotation AS annotation, e.field_or_param AS field_or_param, " "e.source AS source, e.call_site_line AS call_site_line, " "e.call_site_byte AS call_site_byte, e.arg_count AS arg_count, " "e.resolved AS resolved", - {"id": origin_id, "edge_types": edge_types}, + q_params, ) else: rows = g._rows( # noqa: SLF001 - "MATCH (a)<-[e]-(b) WHERE a.id = $id AND label(e) IN $edge_types " + "MATCH (a)<-[e]-(b) WHERE a.id = $id AND " + f"{label_predicate} " "RETURN b.id AS other_id, label(e) AS edge_type, e.confidence AS confidence, " "e.strategy AS strategy, e.match AS match, e.mechanism AS mechanism, " "e.annotation AS annotation, e.field_or_param AS field_or_param, " "e.source AS source, e.call_site_line AS call_site_line, " "e.call_site_byte AS call_site_byte, e.arg_count AS arg_count, " "e.resolved AS resolved", - {"id": origin_id, "edge_types": edge_types}, + q_params, ) for row in rows: other_id = str(row.get("other_id") or "") diff --git a/tests/README.md b/tests/README.md index 23b3c15..96a5f18 100644 --- a/tests/README.md +++ b/tests/README.md @@ -29,6 +29,8 @@ cd /path/to/java-codebase-rag .venv/bin/python -m pytest tests -v ``` +**Kuzu Cypher:** When writing queries or asserting on edge filters, follow the pitfalls note in [`AGENTS.md`](../AGENTS.md) (avoid `label(e) IN $list` for type filters; be careful with typed union rel patterns). + ## CI merge gate and fixture tiers **Merge gate (mechanical):** [`.github/workflows/test.yml`](../.github/workflows/test.yml) runs `pytest tests` with `JAVA_CODEBASE_RAG_RUN_HEAVY=0` on every pull request and on every push to `master`. Branch protection on `master` requires the `test` status check to pass before merge and disables force-push. Break-glass policy: `enforce_admins: false` so the sole maintainer can bypass for emergency hotfixes — explain the bypass in the merge commit. diff --git a/tests/test_mcp_v2.py b/tests/test_mcp_v2.py index 3474236..fa29d43 100644 --- a/tests/test_mcp_v2.py +++ b/tests/test_mcp_v2.py @@ -27,6 +27,18 @@ def _method_id_with_calls(kuzu_graph, direction: str) -> str: return str(rows[0]["id"]) +def _method_id_declares_client_and_other_out_edge(kuzu_graph) -> str | None: + """A method with DECLARES_CLIENT plus another out-label (Kuzu #119 strict-subset case).""" + for pattern in ( + "MATCH (m:Symbol {kind: 'method'})-[:DECLARES_CLIENT]->() MATCH (m)-[:CALLS]->() RETURN m.id AS id LIMIT 1", + "MATCH (m:Symbol {kind: 'method'})-[:DECLARES_CLIENT]->() MATCH (m)-[:HTTP_CALLS]->() RETURN m.id AS id LIMIT 1", + ): + rows = kuzu_graph._rows(pattern) # noqa: SLF001 + if rows: + return str(rows[0]["id"]) + return None + + def _first_route_with_handler(kuzu_graph) -> str: for route in kuzu_graph.list_routes(limit=200): if kuzu_graph.find_route_handlers(route_id=route["id"]): @@ -281,6 +293,24 @@ def test_neighbors_out_calls(kuzu_graph) -> None: assert isinstance(out.results, list) +def test_neighbors_edge_types_strict_subset_respects_label_filter(kuzu_graph) -> None: + """Regression (#119): Kuzu can drop `label(e) IN $list`; use OR of `label(e) = $p` instead.""" + mid = _method_id_declares_client_and_other_out_edge(kuzu_graph) + if mid is None: + pytest.skip("no method with DECLARES_CLIENT and CALLS or HTTP_CALLS out-edges") + dc_rows = kuzu_graph._rows( # noqa: SLF001 + "MATCH (m:Symbol)-[e:DECLARES_CLIENT]->() WHERE m.id = $id RETURN count(e) AS n", + {"id": mid}, + ) + assert dc_rows + want_dc = int(dc_rows[0]["n"]) + assert want_dc >= 1 + out = neighbors_v2(mid, direction="out", edge_types=["DECLARES_CLIENT"], graph=kuzu_graph) + assert out.success is True + assert all(e.edge_type == "DECLARES_CLIENT" for e in out.results) + assert len(out.results) == want_dc + + def test_neighbors_route_in_exposes_returns_handler(kuzu_graph) -> None: route_id = _first_route_with_handler(kuzu_graph) out = neighbors_v2(route_id, direction="in", edge_types=["EXPOSES"], graph=kuzu_graph) @@ -291,7 +321,11 @@ def test_neighbors_route_in_exposes_returns_handler(kuzu_graph) -> None: def test_neighbors_route_in_http_calls_returns_callers(kuzu_graph) -> None: class FakeGraph: def _rows(self, query: str, params: dict[str, Any] | None = None) -> list[dict[str, Any]]: - if "MATCH (a)<-[e]-(b)" in query: + if ( + "MATCH (a)<-[e]-(b)" in query + and "WHERE a.id" in query + and "RETURN b.id AS other_id" in query + ): return [{"other_id": "sym:caller", "edge_type": "HTTP_CALLS", "confidence": 0.8, "match": "cross_service"}] if "MATCH (n:Symbol)" in query: return [