Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,22 @@ Read these directly. Don't rely on rule files to mirror them.
"Re-index required" callout and bump `ontology_version` when
enrichment semantics change.

## Kuzu Cypher pitfalls

When adding or editing Cypher run against Kuzu (for example in
`kuzu_queries.py`, `mcp_v2.py`, or any `KuzuGraph._rows` caller):

- **Do not filter relationship types with** `label(e) IN $list` **or**
`label(e) IN ["A","B"]` **in** `WHERE`. On supported Kuzu versions (observed
on 0.11.x) the predicate can be silently dropped, so edges of other types
leak into the result; prefer **OR of scalar equalities**
(`label(e) = $p OR label(e) = $q …`) with bound parameters, after
validating labels against an allowlist (see `neighbors_v2` in
`mcp_v2.py`).
- **Typed union patterns** like `-[e:CALLS|HTTP_CALLS]->` are only safe if
every column you `RETURN` from `e` exists on **all** of those
relationship types in the graph schema; otherwise the binder may reject the
query. In that case prefer untyped `[e]` plus explicit label filtering, or
split the query per relationship type.

## Workflow

- Branch from `master`. Branch names: `cursor/<topic>` (CLI work),
Expand Down
5 changes: 5 additions & 0 deletions kuzu_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@

The Kuzu database is opened read-only and cached per-process. This module is
intentionally dependency-light: nothing here imports LanceDB or sentence-transformers.

Cypher pitfalls (see also ``AGENTS.md``): avoid ``label(e) IN $list`` in ``WHERE`` for
relationship-type filters; use OR of ``label(e) = $param`` with bound parameters.
Typed unions ``-[e:A|B]-`` require every ``RETURN`` column on ``e`` to exist on all
listed rel types, or the binder may fail.
"""
from __future__ import annotations

Expand Down
17 changes: 13 additions & 4 deletions mcp_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,12 @@ def neighbors_v2(
) -> NeighborsOutput:
try:
_NEIGHBOR_EDGE_TYPES_ADAPTER.validate_python(edge_types)
# Kuzu 0.11.x can drop `label(e) IN $list` in WHERE; use OR of scalar equalities instead.
# Typed unions like `[e:CALLS|HTTP_CALLS]` fail the binder when RETURN references rel
# columns that exist on only some of the union members.
labels = list(dict.fromkeys(edge_types))
label_params = [f"l{i}" for i in range(len(labels))]
label_predicate = "(" + " OR ".join(f"label(e) = ${name}" for name in label_params) + ")"
g = graph or KuzuGraph.get()
raw_filter = _coerce_filter(filter)
nf = (
Expand All @@ -543,27 +549,30 @@ def neighbors_v2(
results: list[Edge] = []
for origin_id in origins:
_resolve_node_kind(g, origin_id)
q_params = {"id": origin_id, **dict(zip(label_params, labels, strict=True))}
if direction == "out":
rows = g._rows( # noqa: SLF001
"MATCH (a)-[e]->(b) WHERE a.id = $id AND label(e) IN $edge_types "
"MATCH (a)-[e]->(b) WHERE a.id = $id AND "
f"{label_predicate} "
"RETURN b.id AS other_id, label(e) AS edge_type, e.confidence AS confidence, "
"e.strategy AS strategy, e.match AS match, e.mechanism AS mechanism, "
"e.annotation AS annotation, e.field_or_param AS field_or_param, "
"e.source AS source, e.call_site_line AS call_site_line, "
"e.call_site_byte AS call_site_byte, e.arg_count AS arg_count, "
"e.resolved AS resolved",
{"id": origin_id, "edge_types": edge_types},
q_params,
)
else:
rows = g._rows( # noqa: SLF001
"MATCH (a)<-[e]-(b) WHERE a.id = $id AND label(e) IN $edge_types "
"MATCH (a)<-[e]-(b) WHERE a.id = $id AND "
f"{label_predicate} "
"RETURN b.id AS other_id, label(e) AS edge_type, e.confidence AS confidence, "
"e.strategy AS strategy, e.match AS match, e.mechanism AS mechanism, "
"e.annotation AS annotation, e.field_or_param AS field_or_param, "
"e.source AS source, e.call_site_line AS call_site_line, "
"e.call_site_byte AS call_site_byte, e.arg_count AS arg_count, "
"e.resolved AS resolved",
{"id": origin_id, "edge_types": edge_types},
q_params,
)
for row in rows:
other_id = str(row.get("other_id") or "")
Expand Down
2 changes: 2 additions & 0 deletions tests/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ cd /path/to/java-codebase-rag
.venv/bin/python -m pytest tests -v
```

**Kuzu Cypher:** When writing queries or asserting on edge filters, follow the pitfalls note in [`AGENTS.md`](../AGENTS.md) (avoid `label(e) IN $list` for type filters; be careful with typed union rel patterns).

## CI merge gate and fixture tiers

**Merge gate (mechanical):** [`.github/workflows/test.yml`](../.github/workflows/test.yml) runs `pytest tests` with `JAVA_CODEBASE_RAG_RUN_HEAVY=0` on every pull request and on every push to `master`. Branch protection on `master` requires the `test` status check to pass before merge and disables force-push. Break-glass policy: `enforce_admins: false` so the sole maintainer can bypass for emergency hotfixes — explain the bypass in the merge commit.
Expand Down
36 changes: 35 additions & 1 deletion tests/test_mcp_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,18 @@ def _method_id_with_calls(kuzu_graph, direction: str) -> str:
return str(rows[0]["id"])


def _method_id_declares_client_and_other_out_edge(kuzu_graph) -> str | None:
"""A method with DECLARES_CLIENT plus another out-label (Kuzu #119 strict-subset case)."""
for pattern in (
"MATCH (m:Symbol {kind: 'method'})-[:DECLARES_CLIENT]->() MATCH (m)-[:CALLS]->() RETURN m.id AS id LIMIT 1",
"MATCH (m:Symbol {kind: 'method'})-[:DECLARES_CLIENT]->() MATCH (m)-[:HTTP_CALLS]->() RETURN m.id AS id LIMIT 1",
):
rows = kuzu_graph._rows(pattern) # noqa: SLF001
if rows:
return str(rows[0]["id"])
return None


def _first_route_with_handler(kuzu_graph) -> str:
for route in kuzu_graph.list_routes(limit=200):
if kuzu_graph.find_route_handlers(route_id=route["id"]):
Expand Down Expand Up @@ -281,6 +293,24 @@ def test_neighbors_out_calls(kuzu_graph) -> None:
assert isinstance(out.results, list)


def test_neighbors_edge_types_strict_subset_respects_label_filter(kuzu_graph) -> None:
    """Regression (#119): a strict-subset ``edge_types`` filter must actually be applied.

    Kuzu can drop ``label(e) IN $list``; the query is expected to use an OR of
    ``label(e) = $p`` instead. The test picks a method that has DECLARES_CLIENT plus
    another out-edge label, asks ``neighbors_v2`` for DECLARES_CLIENT only, and checks
    that nothing else comes back and that the count matches a direct typed-pattern count.
    Skipped when the fixture graph has no such method.
    """
    method_id = _method_id_declares_client_and_other_out_edge(kuzu_graph)
    if method_id is None:
        pytest.skip("no method with DECLARES_CLIENT and CALLS or HTTP_CALLS out-edges")

    # Ground truth: count DECLARES_CLIENT out-edges via a typed pattern, independent of label().
    count_rows = kuzu_graph._rows(  # noqa: SLF001
        "MATCH (m:Symbol)-[e:DECLARES_CLIENT]->() WHERE m.id = $id RETURN count(e) AS n",
        {"id": method_id},
    )
    assert count_rows
    expected_count = int(count_rows[0]["n"])
    assert expected_count >= 1

    response = neighbors_v2(
        method_id,
        direction="out",
        edge_types=["DECLARES_CLIENT"],
        graph=kuzu_graph,
    )
    assert response.success is True
    # If the label filter were dropped, the method's other out-edges would appear here.
    assert all(edge.edge_type == "DECLARES_CLIENT" for edge in response.results)
    assert len(response.results) == expected_count


def test_neighbors_route_in_exposes_returns_handler(kuzu_graph) -> None:
route_id = _first_route_with_handler(kuzu_graph)
out = neighbors_v2(route_id, direction="in", edge_types=["EXPOSES"], graph=kuzu_graph)
Expand All @@ -291,7 +321,11 @@ def test_neighbors_route_in_exposes_returns_handler(kuzu_graph) -> None:
def test_neighbors_route_in_http_calls_returns_callers(kuzu_graph) -> None:
class FakeGraph:
def _rows(self, query: str, params: dict[str, Any] | None = None) -> list[dict[str, Any]]:
if "MATCH (a)<-[e]-(b)" in query:
if (
"MATCH (a)<-[e]-(b)" in query
and "WHERE a.id" in query
and "RETURN b.id AS other_id" in query
):
return [{"other_id": "sym:caller", "edge_type": "HTTP_CALLS", "confidence": 0.8, "match": "cross_service"}]
if "MATCH (n:Symbol)" in query:
return [
Expand Down
Loading