diff --git a/README.md b/README.md index 84e6b76..ab17a3a 100644 --- a/README.md +++ b/README.md @@ -361,7 +361,7 @@ For `reprocess`, the pipeline runs `cocoindex` with `cwd` set to the bundle dire ## 6. Graph layer -A deterministic property graph derived from tree-sitter Java parsing lives next to the LanceDB tables under the index directory (default `${JAVA_CODEBASE_RAG_INDEX_DIR:-./.java-codebase-rag}/code_graph.kuzu`). Current ontology version: **14** (see [`docs/EDGE-NAVIGATION.md`](./docs/EDGE-NAVIGATION.md) for edge shapes). +A deterministic property graph derived from tree-sitter Java parsing lives next to the LanceDB tables under the index directory (default `${JAVA_CODEBASE_RAG_INDEX_DIR:-./.java-codebase-rag}/code_graph.kuzu`). Current ontology version: **15** (see [`docs/EDGE-NAVIGATION.md`](./docs/EDGE-NAVIGATION.md) for MCP-traversable edge shapes). ### Node kinds @@ -370,10 +370,11 @@ A deterministic property graph derived from tree-sitter Java parsing lives next | `Symbol` | `package`, `file`, `class`, `interface`, `enum`, `record`, `annotation`, `method`, `constructor` | | `Route` | HTTP endpoint or async listener (one row per declared route) | | `Client` | Outbound HTTP / messaging call site | +| `UnresolvedCallSite` | Receiver-failure call site (`chained_receiver`, `phantom_unresolved_receiver`) — not a `Symbol`; ids use the `ucs:` prefix | -Unresolved targets become **phantom** nodes (`resolved=false`, FQN guessed from imports / `java.lang`). +Known-receiver-external JDK / Spring / Lombok callees stay on **`CALLS`** as phantom **method** symbols (`resolved=false`). Receiver-failure sites (unresolved receiver or chained receiver) are **`UnresolvedCallSite`** nodes linked by **`UNRESOLVED_AT`** (not in `EDGE_SCHEMA`; use `describe(method_id).unresolved_call_sites`, `neighbors(..., include_unresolved=True)`, or `java-codebase-rag unresolved-calls`). 
-### Edge types (10) +### Edge types (MCP-traversable) | Edge | Direction | Meaning | |---|---|---| @@ -388,7 +389,7 @@ Unresolved targets become **phantom** nodes (`resolved=false`, FQN guessed from | `HTTP_CALLS` | client → route | Cross-service HTTP call (caller-side Client to target Route). | | `ASYNC_CALLS` | producer → route | Cross-service async (Kafka, Rabbit, JMS, …). | -JDK / Spring / Lombok callees are represented as **phantom** method symbols at index time. Caller/callee traversals default to `exclude_external=true` so those edges are filtered by FQN prefix without dropping them from the graph. +Caller/callee traversals default to `exclude_external=true` on **`find_callers`** so library FQN prefixes are filtered without dropping edges from the graph. ### Call-graph notes @@ -426,7 +427,7 @@ Resolution order for `microservice`: Current ontology version is **15**. Any index built before this version must be rebuilt via `cocoindex update ... --full-reprocess -f` or a full `java-codebase-rag reprocess` (no selective flags) so vectors and graph stay aligned. Until re-indexed, the server defensively JSON-decodes string-form list columns so nothing explodes, but filters like `array_contains` will not work. -Ontology **15** (CALLS-NOISE PR-1) adds `CALLS.callee_declaring_role`, `GraphMeta.pass3_unresolved_phantom_receiver` / `pass3_unresolved_chained`, and **supertype-walk dedup** at build time: duplicate interface + concrete candidates at the same call site collapse to one `CALLS` row (row counts per method may drop after re-index, not only a new column). PR-2 adds `edge_filter` on `neighbors`; PR-3 moves true receiver-failure rows off `CALLS`. +Ontology **15** (CALLS-NOISE) adds `CALLS.callee_declaring_role`, `GraphMeta.pass3_unresolved_phantom_receiver` / `pass3_unresolved_chained`, and **supertype-walk dedup** at build time. PR-2 adds `edge_filter` on `neighbors`. 
@dataclass
class UnresolvedCallSiteRow:
    """One receiver-failure call site, kept apart from ``CallsRow``."""

    id: str  # ``ucs:``-prefixed key produced by ``_unresolved_call_site_id``
    caller_id: str  # node id of the member containing the call
    call_site_line: int
    call_site_byte: int
    arg_count: int
    callee_simple: str
    receiver_expr: str  # "" when the call site has no receiver expression
    reason: str  # callers in this file pass "chained_receiver" or "phantom_unresolved_receiver"


def _unresolved_call_site_id(caller_id: str, call: CallSite) -> str:
    """Return the ``ucs:`` id for ``call`` made from the member ``caller_id``."""
    return "ucs:{}:{}:{}".format(caller_id, call.line, call.byte)


def _emit_unresolved_call_site(
    tables: GraphTables,
    stats: CallResolutionStats,
    *,
    caller_id: str,
    call: CallSite,
    reason: str,
) -> None:
    """Record a receiver-failure call site and bump the matching counter.

    Appends one ``UnresolvedCallSiteRow`` to ``tables.unresolved_call_site_rows``.
    ``stats.phantom_chained`` is incremented when ``reason`` is
    ``"chained_receiver"``; every other reason increments ``stats.phantom_other``.
    """
    site = UnresolvedCallSiteRow(
        id=_unresolved_call_site_id(caller_id, call),
        caller_id=caller_id,
        call_site_line=call.line,
        call_site_byte=call.byte,
        arg_count=call.arg_count,
        callee_simple=call.callee_simple,
        # A missing/empty receiver is normalised to "" so the column is never None.
        receiver_expr=call.receiver_expr if call.receiver_expr else "",
        reason=reason,
    )
    tables.unresolved_call_site_rows.append(site)

    counter = "phantom_chained" if reason == "chained_receiver" else "phantom_other"
    setattr(stats, counter, getattr(stats, counter) + 1)
- pid = _phantom_method_id( - tables, receiver_fqn=None, receiver_expr=call.receiver_expr, - callee=call.callee_simple, arg_count=call.arg_count, - ) - _emit_call_edge( - tables, stats, src_id=member.node_id, dst_id=pid, call=call, - confidence=0.0, strategy="chained_receiver", resolved=False, + _emit_unresolved_call_site( + tables, stats, caller_id=member.node_id, call=call, reason="chained_receiver", ) return if recv_type is None: - # Unresolved-receiver phantoms also carry empty microservice attribution. - pid = _phantom_method_id( - tables, receiver_fqn=None, receiver_expr=call.receiver_expr, - callee=call.callee_simple, arg_count=call.arg_count, - ) - _emit_call_edge( - tables, stats, src_id=member.node_id, dst_id=pid, call=call, - confidence=0.0, strategy="phantom", resolved=False, + _emit_unresolved_call_site( + tables, stats, + caller_id=member.node_id, + call=call, + reason="phantom_unresolved_receiver", ) return @@ -1413,16 +1438,18 @@ def pass3_calls(tables: GraphTables, asts: dict[str, JavaFileAst], *, verbose: b _process_file_calls(file_ast, rel_path, tables, stats) except Exception as e: log.error("Call extraction failed for %s: %s", rel_path, e) - pct_chained = 100.0 * stats.phantom_chained / max(1, stats.total) - pct_callee_unres = 100.0 * stats.callee_unresolved / max(1, stats.total) - pct_phantom_recv = 100.0 * stats.phantom_other / max(1, stats.total) + denom_calls = max(1, stats.total) + denom_sites = max(1, stats.total + stats.phantom_chained + stats.phantom_other) + pct_chained = 100.0 * stats.phantom_chained / denom_sites + pct_callee_unres = 100.0 * stats.callee_unresolved / denom_calls + pct_phantom_recv = 100.0 * stats.phantom_other / denom_sites tables.pass3_skipped_cross_service = int(stats.skipped_cross_service) tables.pass3_unresolved_phantom_receiver = int(stats.phantom_other) tables.pass3_unresolved_chained = int(stats.phantom_chained) msg = ( - f"Call resolution: {stats.total} sites, {stats.phantom_chained} chained phantoms " - 
# Node table holding receiver-failure call sites; ``id`` is the primary key.
_SCHEMA_UNRESOLVED_CALL_SITE = "CREATE NODE TABLE UnresolvedCallSite({columns}, PRIMARY KEY(id))".format(
    columns=", ".join((
        "id STRING",
        "caller_id STRING",
        "call_site_line INT64",
        "call_site_byte INT64",
        "arg_count INT64",
        "callee_simple STRING",
        "receiver_expr STRING",
        "reason STRING",
    )),
)
# Relationship from a Symbol to each UnresolvedCallSite; carries no properties.
_SCHEMA_UNRESOLVED_AT = (
    "CREATE REL TABLE UNRESOLVED_AT"
    "(FROM Symbol TO UnresolvedCallSite)"
)
kuzu.Connection) -> None: _SCHEMA_DECLARES, _SCHEMA_OVERRIDES, _SCHEMA_CALLS, + _SCHEMA_UNRESOLVED_AT, _SCHEMA_EXPOSES, _SCHEMA_DECLARES_CLIENT, _SCHEMA_DECLARES_PRODUCER, @@ -2743,6 +2781,33 @@ def _write_edges(conn: kuzu.Connection, tables: GraphTables) -> None: ), }) + _CREATE_UNRESOLVED = ( + "CREATE (:UnresolvedCallSite {" + "id: $id, caller_id: $caller_id, call_site_line: $line, call_site_byte: $byte, " + "arg_count: $argc, callee_simple: $callee, receiver_expr: $recv, reason: $reason" + "})" + ) + _CREATE_UNRESOLVED_AT = ( + "MATCH (a:Symbol {id: $caller}), (u:UnresolvedCallSite {id: $ucs}) " + "CREATE (a)-[:UNRESOLVED_AT]->(u)" + ) + seen_ucs: set[str] = set() + for row in tables.unresolved_call_site_rows: + if row.id in seen_ucs: + continue + seen_ucs.add(row.id) + conn.execute(_CREATE_UNRESOLVED, { + "id": row.id, + "caller_id": row.caller_id, + "line": row.call_site_line, + "byte": row.call_site_byte, + "argc": row.arg_count, + "callee": row.callee_simple, + "recv": row.receiver_expr, + "reason": row.reason, + }) + conn.execute(_CREATE_UNRESOLVED_AT, {"caller": row.caller_id, "ucs": row.id}) + def _write_routes_and_exposes(conn: kuzu.Connection, tables: GraphTables) -> None: for row in tables.routes_rows: diff --git a/docs/AGENT-GUIDE.md b/docs/AGENT-GUIDE.md index 79a07ea..23f4b7c 100644 --- a/docs/AGENT-GUIDE.md +++ b/docs/AGENT-GUIDE.md @@ -213,13 +213,13 @@ Identifier lookup; three statuses above. Args: `identifier`, optional `hint_kind One hop. Args: `ids` (string or array), **`direction`**, **`edge_types`**, `limit` (default 25), `offset`, optional `filter` on the other node, optional **`edge_filter`** (`edge_types` must be exactly `['CALLS']` — no composed dot-keys or second stored label; fail-loud otherwise). 
-**Multiple origin ids:** each id loads the full CALLS stream (or generic hop) in list order; `offset`/`limit` apply to the **concatenated** edge list (`ids[0]` edges first, then `ids[1]`, …), not global source order across origins — a large first origin can leave no rows for later ids within the same page. High fan-out methods are slow; prefer one id per call or a smaller `limit`. +**Multiple origin ids:** each id loads the full CALLS stream (or generic hop) in list order; `offset`/`limit` apply to the **concatenated** edge list (`ids[0]` edges first, then `ids[1]`, …), not global source order across origins — a large first origin can leave no rows for later ids within the same page. High fan-out methods are slow; prefer one id per call or a smaller `limit`. **Hints:** `TPL_NEIGHBORS_CALLS_HIGH_FANOUT` / `TPL_NEIGHBORS_CALLS_HAS_UNRESOLVED` fire only for a **single** origin id (multi-origin CALLS skips those nudges). Returns **edges** with `attrs` (`confidence`, `strategy`, `match`, … on cross-service edges) and **`other`** node. **Cross-service edges** (`HTTP_CALLS`, `ASYNC_CALLS`): read `attrs.confidence` and `attrs.match` — low confidence or `unresolved`/`phantom`/`ambiguous` means treat as a resolver signal, not ground truth. -**`CALLS` edges:** source-ordered (`call_site_line`, `call_site_byte`). `attrs.resolved=false` or low `attrs.confidence` may be JDK/external or unresolved static sites — still a lower bound, not exhaustive runtime behaviour. **`filter` + `edge_filter` together** load the ordered CALLS stream then apply callee `NodeFilter` in Python — expect higher latency on hot methods than `edge_filter` alone. Optional **`edge_filter`** projects before pagination: `min_confidence`; `include_strategies` / `exclude_strategies` (mutually exclusive); `callee_declaring_role`, `callee_declaring_roles`, `exclude_callee_declaring_roles` (`["OTHER"]` also drops known-external rows). 
**`filter.role` filters the neighbor method (usually `OTHER`), not the callee stereotype** — use `edge_filter.callee_declaring_role` for repository/service hops. **`exclude_external` applies to `find_callers` / `find_callees` only** (FQN-prefix); trim JDK noise on CALLS via `edge_filter`. Accessor noise: role excludes help; getter/setter heuristics in [`propose/AGENT-SKILLS-AND-COMMANDS-PROPOSE.md`](../propose/AGENT-SKILLS-AND-COMMANDS-PROPOSE.md) `/mini-map`. +**`CALLS` edges:** source-ordered (`call_site_line`, `call_site_byte`). After ontology 15 PR-3, true receiver-failure sites are **not** on `CALLS` — they are `UnresolvedCallSite` nodes (`reason`: `chained_receiver` or `phantom_unresolved_receiver`; ids use the `ucs:` prefix, `other.kind=unresolved_call_site` — **not** describable via `describe(id=…)`). `UNRESOLVED_AT` is graph storage only (not in `EDGE_SCHEMA` / `neighbors` edge_types). `attrs.resolved=false` on remaining `CALLS` rows means known-receiver-external (JDK/Spring) callees, not receiver failure. **`include_unresolved=True`** (CALLS + `direction=out` only) interleaves unresolved sites with resolved `CALLS` (`row_kind` discriminator); **mutually exclusive with `edge_filter`**. **`dedup_calls=True`** collapses identical `(origin, callee)` `CALLS` to one row with `call_site_lines`. **`filter` + `edge_filter` together** load the ordered CALLS stream then apply callee `NodeFilter` in Python — expect higher latency on hot methods than `edge_filter` alone. Optional **`edge_filter`** projects before pagination: `min_confidence`; `include_strategies` / `exclude_strategies` (mutually exclusive); `callee_declaring_role`, `callee_declaring_roles`, `exclude_callee_declaring_roles` (`["OTHER"]` also drops known-external rows). **`filter.role` filters the neighbor method (usually `OTHER`), not the callee stereotype** — use `edge_filter.callee_declaring_role` for repository/service hops. 
**`exclude_external` applies to `find_callers` / `find_callees` only** (FQN-prefix); trim JDK noise on `neighbors` CALLS via `edge_filter`. Accessor noise: role excludes help; getter/setter heuristics in [`propose/AGENT-SKILLS-AND-COMMANDS-PROPOSE.md`](../propose/AGENT-SKILLS-AND-COMMANDS-PROPOSE.md) `/mini-map`. ### Ontology glossary diff --git a/docs/EDGE-NAVIGATION.md b/docs/EDGE-NAVIGATION.md index a07ed08..9e79055 100644 --- a/docs/EDGE-NAVIGATION.md +++ b/docs/EDGE-NAVIGATION.md @@ -252,3 +252,20 @@ - `member_subject`: neighbors(['{id}'],'out',['DECLARES_PRODUCER']) then neighbors(producer_ids,'out',['ASYNC_CALLS']) - `route_subject`: neighbors(['{id}'],'in',['ASYNC_CALLS']) then neighbors(producer_ids,'in',['DECLARES_PRODUCER']) for declaring method - `alien_subject`: ASYNC_CALLS connects Producer→Route; use DECLARES_PRODUCER from a method Symbol, or neighbors(producer_id,'out',['ASYNC_CALLS']) from a Producer id + + +## Graph storage (not MCP `neighbors` edge_types) + +### `UnresolvedCallSite` + `UNRESOLVED_AT` (ontology 15 / CALLS-NOISE PR-3) + +Receiver-failure call sites (`chained_receiver`, `phantom_unresolved_receiver`) are **not** `CALLS` rows. They are `UnresolvedCallSite` nodes (`id` prefix `ucs:`) linked from the caller method Symbol via `UNRESOLVED_AT`. + +| Surface | How to read them | +| --- | --- | +| `describe(method_id)` | `record.data.unresolved_call_sites` (capped at 5) + footer when more exist | +| `neighbors(..., ['CALLS'], include_unresolved=True)` | Interleaved transcript; `row_kind='unresolved_call_site'`; `other.kind=unresolved_call_site` | +| CLI | `java-codebase-rag unresolved-calls list|stats` | + +- **Not** in `EDGE_SCHEMA` — do not pass `UNRESOLVED_AT` to `neighbors(edge_types=…)`. +- **`describe(ucs:…)`** is invalid (fail-loud); describe the **caller method** instead. 
+- Fresh graphs: `CALLS.strategy` no longer includes `phantom` or `chained_receiver` for receiver failure (those literals remain on HTTP/ASYNC `match` and brownfield resolver sets). diff --git a/docs/JAVA-CODEBASE-RAG-CLI.md b/docs/JAVA-CODEBASE-RAG-CLI.md index cbc6c32..9c5655d 100644 --- a/docs/JAVA-CODEBASE-RAG-CLI.md +++ b/docs/JAVA-CODEBASE-RAG-CLI.md @@ -159,6 +159,17 @@ Explains **why a path** is ignored or not ignored by the layered ignore rules (b java-codebase-rag diagnose-ignore src/main/generated/Foo.java --source-root /path/to/java/repo ``` +### `unresolved-calls` + +Lists or aggregates **receiver-failure** call sites stored as `UnresolvedCallSite` (not on `CALLS` after ontology 15 PR-3). Reasons: `phantom_unresolved_receiver`, `chained_receiver`. + +```bash +java-codebase-rag unresolved-calls stats --by microservice --source-root /path/to/java/repo --index-dir /path/to/.java-codebase-rag +java-codebase-rag unresolved-calls list --method-id sym:... --limit 100 --source-root /path/to/java/repo --index-dir /path/to/.java-codebase-rag +``` + +`stats --by` accepts `reason`, `microservice`, or `caller_role` (declaring type role of the caller method). + ## Analysis: `analyze-pr` Maps a **unified diff** to changed symbols, blast radius, routes touched, and risk band. Requires a **built Kuzu graph** at `<index-dir>/code_graph.kuzu`.
def _open_graph_for_unresolved_calls(args: argparse.Namespace) -> "KuzuGraph | None":
    """Apply the resolved operator config and open the Kuzu graph.

    Returns the graph, or ``None`` after emitting the "Kuzu graph not found"
    failure payload when no graph exists.
    """
    cfg = _resolved_from_ns(args)
    _startup_hints(cfg)
    cfg.apply_to_os_environ()
    from kuzu_queries import KuzuGraph  # lazy

    if KuzuGraph.exists():
        return KuzuGraph.get()
    _emit({"success": False, "message": "Kuzu graph not found"})
    return None


def _cmd_unresolved_calls_list(args: argparse.Namespace) -> int:
    """Handle ``unresolved-calls list``: emit matching UnresolvedCallSite rows.

    Returns 0 on success and 1 when the Kuzu graph is missing.
    """
    graph = _open_graph_for_unresolved_calls(args)
    if graph is None:
        return 1
    sites = graph.list_unresolved_call_sites(
        method_id=args.method_id,
        reason=args.reason,
        microservice=args.microservice,
        callee_simple=args.callee_simple,
        limit=int(args.limit),
    )
    _emit({"success": True, "count": len(sites), "sites": sites})
    return 0


def _cmd_unresolved_calls_stats(args: argparse.Namespace) -> int:
    """Handle ``unresolved-calls stats``: emit per-bucket counts plus their sum.

    Returns 0 on success and 1 when the Kuzu graph is missing.
    """
    graph = _open_graph_for_unresolved_calls(args)
    if graph is None:
        return 1
    buckets = graph.stats_unresolved_call_sites(by=args.by)
    # A missing or null ``n`` counts as zero toward the grand total.
    total = sum(int(bucket.get("n") or 0) for bucket in buckets)
    _emit({"success": True, "total": total, "by": args.by, "buckets": buckets})
    return 0
summary.\n" " tables List Lance tables and row counts.\n" - " diagnose-ignore Show which ignore-pattern layer decided a path's fate.\n\n" + " diagnose-ignore Show which ignore-pattern layer decided a path's fate.\n" + " unresolved-calls List or aggregate receiver-failure call sites (not in CALLS).\n\n" "Analysis (work with code changes):\n" " analyze-pr Compute blast-radius + risk score for a unified diff.\n\n" "Run `java-codebase-rag --help` for command-specific options." @@ -656,6 +695,39 @@ def build_parser() -> argparse.ArgumentParser: group.add_argument("--diff-file", type=str) group.add_argument("--diff-stdin", action="store_true") analyze.set_defaults(handler=_cmd_analyze_pr) + + unresolved = subparsers.add_parser( + "unresolved-calls", + help="List or aggregate UnresolvedCallSite rows (receiver-failure call sites).", + ) + _add_index_embedding_flags(unresolved) + unresolved_sub = unresolved.add_subparsers(dest="unresolved_command", required=True) + + uc_list = unresolved_sub.add_parser("list", help="List unresolved call sites.") + _add_index_embedding_flags(uc_list) + uc_list.add_argument("--method-id", type=str, default=None, help="Caller Symbol id") + uc_list.add_argument( + "--reason", + type=str, + default=None, + choices=sorted(VALID_UNRESOLVED_CALL_REASONS), + help="Filter by UnresolvedCallSite.reason", + ) + uc_list.add_argument("--microservice", type=str, default=None) + uc_list.add_argument("--callee-simple", type=str, default=None, dest="callee_simple") + uc_list.add_argument("--limit", type=int, default=100) + uc_list.set_defaults(handler=_cmd_unresolved_calls_list) + + uc_stats = unresolved_sub.add_parser("stats", help="Aggregate unresolved call site counts.") + _add_index_embedding_flags(uc_stats) + uc_stats.add_argument( + "--by", + type=str, + choices=("reason", "microservice", "caller_role"), + default="reason", + ) + uc_stats.set_defaults(handler=_cmd_unresolved_calls_stats) + return parser diff --git a/java_ontology.py b/java_ontology.py 
# Brownfield / fallback edge resolution strategies (hints v2 neighbors fuzzy signal).
# ``phantom`` / ``chained_receiver`` are deliberately absent: after PR-3 receiver
# failures live on ``UnresolvedCallSite`` rather than on CALLS edge strategies.
FUZZY_STRATEGY_SET: frozenset[str] = frozenset((
    "implicit_super",
    "layer_b_fqn",
    "layer_c_source",
    "overload_ambiguous",
))

# Closed set of values accepted for ``UnresolvedCallSite.reason``.
VALID_UNRESOLVED_CALL_REASONS: frozenset[str] = frozenset((
    "chained_receiver",
    "phantom_unresolved_receiver",
))
+ "phantom", + "chained_receiver", "layer_b_ann", "layer_a_meta", "codebase_route", diff --git a/kuzu_queries.py b/kuzu_queries.py index 1792d35..6a9ba0b 100644 --- a/kuzu_queries.py +++ b/kuzu_queries.py @@ -737,6 +737,114 @@ def neighbor_calls_for_symbol( params["limit"] = limit return self._rows(q, params) + def count_unresolved_for_caller(self, caller_id: str) -> int: + rows = self._rows( + "MATCH (:Symbol {id: $id})-[:UNRESOLVED_AT]->(u:UnresolvedCallSite) " + "RETURN count(u) AS n", + {"id": caller_id}, + ) + return int(rows[0].get("n") or 0) if rows else 0 + + def unresolved_sites_for_caller( + self, + caller_id: str, + *, + direction: Literal["in", "out"] = "out", + ) -> list[dict[str, Any]]: + if direction != "out": + return [] + return self._rows( + "MATCH (:Symbol {id: $id})-[:UNRESOLVED_AT]->(u:UnresolvedCallSite) " + "RETURN u.id AS id, u.caller_id AS caller_id, u.call_site_line AS call_site_line, " + "u.call_site_byte AS call_site_byte, u.arg_count AS arg_count, " + "u.callee_simple AS callee_simple, u.receiver_expr AS receiver_expr, " + "u.reason AS reason " + "ORDER BY u.call_site_line, u.call_site_byte", + {"id": caller_id}, + ) + + def unresolved_sites_for_describe( + self, + method_id: str, + *, + inline_limit: int = 5, + ) -> tuple[list[dict[str, Any]], int]: + total_rows = self._rows( + "MATCH (:Symbol {id: $id})-[:UNRESOLVED_AT]->(u:UnresolvedCallSite) " + "RETURN count(u) AS n", + {"id": method_id}, + ) + total = int(total_rows[0].get("n") or 0) if total_rows else 0 + if total == 0: + return [], 0 + rows = self._rows( + "MATCH (:Symbol {id: $id})-[:UNRESOLVED_AT]->(u:UnresolvedCallSite) " + "RETURN u.call_site_line AS line, u.reason AS reason, " + "u.callee_simple AS callee_simple, u.receiver_expr AS receiver_expr " + "ORDER BY u.call_site_line, u.call_site_byte " + f"LIMIT {int(inline_limit)}", + {"id": method_id}, + ) + return rows, total + + def list_unresolved_call_sites( + self, + *, + method_id: str | None = None, + reason: str | None = 
None, + microservice: str | None = None, + callee_simple: str | None = None, + limit: int = 100, + ) -> list[dict[str, Any]]: + wh_parts: list[str] = [] + params: dict[str, Any] = {"lim": int(limit)} + if method_id: + wh_parts.append("caller.id = $method_id") + params["method_id"] = method_id + if reason: + wh_parts.append("u.reason = $reason") + params["reason"] = reason + if microservice: + wh_parts.append("caller.microservice = $microservice") + params["microservice"] = microservice + if callee_simple: + wh_parts.append("u.callee_simple = $callee_simple") + params["callee_simple"] = callee_simple + where = ("WHERE " + " AND ".join(wh_parts)) if wh_parts else "" + return self._rows( + "MATCH (caller:Symbol)-[:UNRESOLVED_AT]->(u:UnresolvedCallSite) " + f"{where} " + "RETURN u.id AS id, caller.id AS caller_id, caller.fqn AS caller_fqn, " + "caller.microservice AS microservice, u.call_site_line AS call_site_line, " + "u.call_site_byte AS call_site_byte, u.arg_count AS arg_count, " + "u.callee_simple AS callee_simple, u.receiver_expr AS receiver_expr, " + "u.reason AS reason " + "ORDER BY u.call_site_line, u.call_site_byte " + "LIMIT $lim", + params, + ) + + def stats_unresolved_call_sites( + self, + *, + by: Literal["reason", "microservice", "caller_role"], + ) -> list[dict[str, Any]]: + if by == "reason": + return self._rows( + "MATCH (:Symbol)-[:UNRESOLVED_AT]->(u:UnresolvedCallSite) " + "RETURN u.reason AS bucket, count(*) AS n ORDER BY n DESC", + ) + if by == "microservice": + return self._rows( + "MATCH (caller:Symbol)-[:UNRESOLVED_AT]->(:UnresolvedCallSite) " + "RETURN caller.microservice AS bucket, count(*) AS n ORDER BY n DESC", + ) + return self._rows( + "MATCH (caller:Symbol)-[:UNRESOLVED_AT]->(:UnresolvedCallSite) " + "MATCH (parent:Symbol)-[:DECLARES]->(caller) " + "RETURN parent.role AS bucket, count(*) AS n ORDER BY n DESC", + ) + def _edge_row_count_from_method_ids(self, method_ids: list[str], rel: str) -> int: """Count outgoing ``rel`` edges from 
# Minimum CALLS row count at which the high-fanout hint is offered.
_CALLS_HIGH_FANOUT_THRESHOLD = 10

# ``{n}`` is the CALLS row count; doubled braces survive ``str.format`` as literals.
TPL_NEIGHBORS_CALLS_HIGH_FANOUT = " ".join((
    "{n} CALLS on this method;",
    "the noisy axes are callee_declaring_role and per-call-site multiplicity.",
    "Try edge_filter={{callee_declaring_role: 'SERVICE'}} for delegation hops,",
    "edge_filter={{exclude_callee_declaring_roles: ['ENTITY','DTO']}} to drop accessor noise,",
    "edge_filter={{min_confidence: 0.5}} to trim low-confidence rows",
    "(exclude_external is find_callers-only, not neighbors),",
    "or dedup_calls=True to collapse identical callees.",
))

# ``{n}`` is the number of rows shown; ``{k}`` the method's unresolved call sites.
TPL_NEIGHBORS_CALLS_HAS_UNRESOLVED = " ".join((
    "{n} CALLS shown;",
    "this method also has {k} unresolved call sites",
    "(see describe(method_id).unresolved_call_sites,",
    "or call neighbors with include_unresolved=True",
    "for a source-ordered interleaved view —",
    "note include_unresolved is mutually exclusive with edge_filter).",
))
def neighbors_calls_fanout_hints(payload: dict[str, Any]) -> list[tuple[int, str]]:
    """Build high-fanout and unresolved-site nudges for a CALLS-only neighbors payload (PR-3).

    Returns no hints unless ``requested_edge_types`` is exactly ``["CALLS"]``
    and ``include_unresolved`` is falsy. Otherwise it may add:

    * the high-fanout hint, when no edge filter was provided and the CALLS row
      count (``calls_row_count``, else the page size) reaches the threshold;
    * the has-unresolved hint, when ``unresolved_count`` is positive.
    """
    edge_types = payload.get("requested_edge_types")
    calls_only = isinstance(edge_types, list) and edge_types == ["CALLS"]
    if not calls_only or payload.get("include_unresolved"):
        return []

    shown = len(list(payload.get("results") or []))
    # A missing or zero ``calls_row_count`` falls back to the rows on this page.
    total_calls = int(payload.get("calls_row_count") or 0) or shown
    unresolved_total = int(payload.get("unresolved_count") or 0)

    hints: list[tuple[int, str]] = []
    if not payload.get("edge_filter_provided") and total_calls >= _CALLS_HIGH_FANOUT_THRESHOLD:
        hints.append(
            (PRIORITY_LEAF_FOLLOWUP, TPL_NEIGHBORS_CALLS_HIGH_FANOUT.format(n=total_calls))
        )
    if unresolved_total > 0:
        message = TPL_NEIGHBORS_CALLS_HAS_UNRESOLVED.format(n=shown, k=unresolved_total)
        hints.append((PRIORITY_LEAF_FOLLOWUP, message))
    return hints
id: str - kind: Literal["symbol", "route", "client", "producer"] + kind: Literal["symbol", "route", "client", "producer", "unresolved_call_site"] fqn: str symbol_kind: str | None = None microservice: str | None = None @@ -550,7 +550,11 @@ class ResolveOutput(BaseModel): hints: list[str] = Field(default_factory=list, description=MCP_HINTS_FIELD_DESCRIPTION) -def _node_kind_from_id(id_str: str) -> Literal["symbol", "route", "client", "producer"]: +def _node_kind_from_id( + id_str: str, +) -> Literal["symbol", "route", "client", "producer", "unresolved_call_site"]: + if id_str.startswith("ucs:"): + return "unresolved_call_site" if id_str.startswith("sym:"): return "symbol" if id_str.startswith("route:") or id_str.startswith("r:"): @@ -562,7 +566,10 @@ def _node_kind_from_id(id_str: str) -> Literal["symbol", "route", "client", "pro raise ValueError(f"Unknown id prefix for `{id_str}`") -def _resolve_node_kind(graph: KuzuGraph, node_id: str) -> Literal["symbol", "route", "client", "producer"]: +def _resolve_node_kind( + graph: KuzuGraph, + node_id: str, +) -> Literal["symbol", "route", "client", "producer", "unresolved_call_site"]: try: return _node_kind_from_id(node_id) except ValueError: @@ -1005,6 +1012,13 @@ def find_v2( return FindOutput(success=False, message=str(exc), hints=[], limit=None, offset=None) +_DESCRIBE_UCS_ID_MESSAGE = ( + "UnresolvedCallSite ids (ucs:…) are not describable — use describe(caller_method_id) " + "for record.data.unresolved_call_sites, neighbors(..., include_unresolved=True), " + "or java-codebase-rag unresolved-calls list --method-id " +) + + def describe_v2( id: str | None = None, fqn: str | None = None, @@ -1016,6 +1030,8 @@ def describe_v2( has_fqn = bool(fqn and str(fqn).strip()) if not has_id and not has_fqn: return DescribeOutput(success=False, message="id or fqn required", hints=[]) + if has_id and str(id).strip().startswith("ucs:"): + return DescribeOutput(success=False, message=_DESCRIBE_UCS_ID_MESSAGE, hints=[]) hint_message: 
str | None = None node_id: str if has_id: @@ -1036,12 +1052,33 @@ def describe_v2( "then describe(id=...) on the chosen node" ) kind = _resolve_node_kind(g, node_id) + if kind == "unresolved_call_site": + return DescribeOutput(success=False, message=_DESCRIBE_UCS_ID_MESSAGE, hints=[]) row = _load_node_record(g, node_id, kind) if row is None: return DescribeOutput(success=False, message=f"No node found for `{node_id}`", hints=[]) ref = _node_ref_from_row(kind, row) edge_summary = _edge_summary_for_node(g, node_id, kind=kind, row=row) - record = NodeRecord(id=ref.id, kind=kind, fqn=ref.fqn, data=row, edge_summary=edge_summary) + data = dict(row) + if kind == "symbol" and str(row.get("kind") or "") in _METHOD_SYMBOL_KINDS_FOR_OVERRIDE_ROLLUP: + inline, total = g.unresolved_sites_for_describe(node_id) + if total > 0: + data["unresolved_call_sites_total"] = total + data["unresolved_call_sites"] = [ + { + "line": int(r.get("line") or 0), + "reason": str(r.get("reason") or ""), + "callee_simple": str(r.get("callee_simple") or ""), + "receiver_expr": str(r.get("receiver_expr") or ""), + } + for r in inline + ] + if total > len(inline): + data["unresolved_call_sites_footer"] = ( + f"{total} unresolved call sites — see " + f"java-codebase-rag unresolved-calls list --method-id {node_id} for the full list" + ) + record = NodeRecord(id=ref.id, kind=kind, fqn=ref.fqn, data=data, edge_summary=edge_summary) return DescribeOutput( success=True, record=record, @@ -1426,12 +1463,74 @@ def resolve_v2( def _neighbor_edge_attrs(row: dict[str, Any]) -> dict[str, Any]: - return { + attrs = { k: v for k, v in row.items() if k not in {"other_id", "edge_type", "stored_edge_type"} and v not in (None, "") } + attrs.setdefault("row_kind", "resolved") + return attrs + + +def _unresolved_site_to_edge(origin_id: str, row: dict[str, Any]) -> Edge: + ucs_id = str(row.get("id") or "") + callee = str(row.get("callee_simple") or "") + line = int(row.get("call_site_line") or 0) + byte = 
int(row.get("call_site_byte") or 0) + return Edge( + origin_id=origin_id, + edge_type="CALLS", + direction="out", + other=NodeRef(id=ucs_id, kind="unresolved_call_site", fqn="", name=callee), + attrs={ + "row_kind": "unresolved_call_site", + "unresolved_call_site_id": ucs_id, + "reason": str(row.get("reason") or ""), + "call_site_line": line, + "call_site_byte": byte, + "arg_count": int(row.get("arg_count") or 0), + "callee_simple": callee, + "receiver_expr": str(row.get("receiver_expr") or ""), + }, + ) + + +def _calls_transcript_sort_key(edge: Edge) -> tuple[int, int, int]: + attrs = edge.attrs or {} + line = int(attrs.get("call_site_line") or 0) + byte = int(attrs.get("call_site_byte") or 0) + kind_rank = 0 if str(attrs.get("row_kind") or "resolved") == "resolved" else 1 + return (line, byte, kind_rank) + + +def _dedup_call_edges(edges: list[Edge]) -> list[Edge]: + """Collapse resolved CALLS rows sharing (origin_id, other.id); unresolved rows pass through.""" + resolved: list[Edge] = [] + unresolved: list[Edge] = [] + for e in edges: + if str((e.attrs or {}).get("row_kind") or "resolved") == "unresolved_call_site": + unresolved.append(e) + else: + resolved.append(e) + groups: dict[tuple[str, str], list[Edge]] = {} + for e in resolved: + key = (e.origin_id, e.other.id) + groups.setdefault(key, []).append(e) + collapsed: list[Edge] = [] + for group in groups.values(): + ordered = sorted(group, key=_calls_transcript_sort_key) + canonical = ordered[0] + lines = sorted( + {int((x.attrs or {}).get("call_site_line") or 0) for x in group}, + ) + attrs = dict(canonical.attrs or {}) + attrs["call_site_count"] = len(group) + attrs["call_site_lines"] = lines + collapsed.append(canonical.model_copy(update={"attrs": attrs})) + merged = collapsed + unresolved + merged.sort(key=_calls_transcript_sort_key) + return merged def _edgefilter_pushdown_kwargs(ef: EdgeFilter | None) -> dict[str, Any]: @@ -1488,9 +1587,17 @@ def _neighbors_calls_for_origin( ef: EdgeFilter | None, 
offset: int, limit: int | None, + include_unresolved: bool = False, + dedup_calls: bool = False, ) -> list[Edge]: pushdown = _edgefilter_pushdown_kwargs(ef) - sql_pagination = nf is None and limit is not None + needs_full_stream = ( + nf is not None + or dedup_calls + or include_unresolved + or limit is None + ) + sql_pagination = not needs_full_stream and limit is not None if sql_pagination: rows = g.neighbor_calls_for_symbol( origin_id, @@ -1510,6 +1617,12 @@ def _neighbors_calls_for_origin( **pushdown, ) edges = _rows_to_call_edges(g, origin_id=origin_id, direction=direction, rows=rows, nf=nf) + if include_unresolved and direction == "out": + ucs_rows = g.unresolved_sites_for_caller(origin_id, direction=direction) + edges.extend(_unresolved_site_to_edge(origin_id, r) for r in ucs_rows) + edges.sort(key=_calls_transcript_sort_key) + if dedup_calls: + edges = _dedup_call_edges(edges) if limit is None: return edges return edges[offset : offset + limit] @@ -1526,6 +1639,8 @@ def neighbors_v2( offset: int = 0, filter: NodeFilter | dict[str, Any] | str | None = None, edge_filter: EdgeFilter | dict[str, Any] | str | None = None, + include_unresolved: bool = False, + dedup_calls: bool = False, graph: Any | None = None, ) -> NeighborsOutput: try: @@ -1567,6 +1682,30 @@ def neighbors_v2( except ValueError as exc: _log_fail_loud("edge_filter") return NeighborsOutput(success=False, message=str(exc), hints=[], requested_edge_types=[]) + if include_unresolved and ef is not None: + return NeighborsOutput( + success=False, + message=( + "include_unresolved=True is incompatible with edge_filter; " + "UnresolvedCallSite rows have no edge attributes to filter on" + ), + hints=[], + requested_edge_types=requested_edge_types, + ) + if include_unresolved and requested_edge_types != ["CALLS"]: + return NeighborsOutput( + success=False, + message="include_unresolved requires edge_types=['CALLS']", + hints=[], + requested_edge_types=requested_edge_types, + ) + if include_unresolved and 
direction != "out": + return NeighborsOutput( + success=False, + message='include_unresolved requires direction="out"', + hints=[], + requested_edge_types=requested_edge_types, + ) if ef and (err := _edgefilter_applicability_error(requested_edge_types, ef)): _log_fail_loud("edge_filter") return NeighborsOutput( @@ -1589,6 +1728,11 @@ def neighbors_v2( origins = [ids] if isinstance(ids, str) else list(ids) results: list[Edge] = [] unfiltered_calls_count: int | None = None + unresolved_count: int | None = None + calls_row_count: int | None = None + if use_calls_path and len(origins) == 1 and direction == "out": + unresolved_count = g.count_unresolved_for_caller(origins[0]) + calls_row_count = g.count_calls_for_symbol(origins[0], direction=direction) for origin_id in origins: origin_kind = _resolve_node_kind(g, origin_id) if composed_keys: @@ -1613,7 +1757,12 @@ def neighbors_v2( requested_edge_types=requested_edge_types, ) if use_calls_path: - paginate_in_sql = len(origins) == 1 and nf is None + paginate_in_sql = ( + len(origins) == 1 + and nf is None + and not include_unresolved + and not dedup_calls + ) try: origin_edges = _neighbors_calls_for_origin( g, @@ -1623,6 +1772,8 @@ def neighbors_v2( ef=ef, offset=offset if paginate_in_sql else 0, limit=limit if paginate_in_sql else None, + include_unresolved=include_unresolved, + dedup_calls=dedup_calls, ) except ValueError as exc: return NeighborsOutput( @@ -1731,7 +1882,12 @@ def neighbors_v2( "subject_record": subject_record, "node_filter": nf.model_dump(exclude_none=True) if nf else None, "edge_filter": ef.model_dump(exclude_none=True) if ef else None, + "edge_filter_provided": ef is not None, + "include_unresolved": include_unresolved, + "dedup_calls": dedup_calls, "unfiltered_calls_count": unfiltered_calls_count, + "unresolved_count": unresolved_count, + "calls_row_count": calls_row_count, } return NeighborsOutput( success=True, diff --git a/propose/CALLS-NOISE-AND-RESOLUTION-PROPOSE.md 
b/propose/completed/CALLS-NOISE-AND-RESOLUTION-PROPOSE.md similarity index 99% rename from propose/CALLS-NOISE-AND-RESOLUTION-PROPOSE.md rename to propose/completed/CALLS-NOISE-AND-RESOLUTION-PROPOSE.md index f2bd820..429e787 100644 --- a/propose/CALLS-NOISE-AND-RESOLUTION-PROPOSE.md +++ b/propose/completed/CALLS-NOISE-AND-RESOLUTION-PROPOSE.md @@ -1,6 +1,6 @@ # CALLS-NOISE-AND-RESOLUTION — clean the CALLS edge by removing one bucket and projecting the other -**Status**: under review +**Status**: landed (PR-3) **Author**: Dmitriy Teriaev + Perplexity Computer **Date**: 2026-05-18 **Tracks**: [#177](https://github.com/HumanBean17/java-codebase-rag/issues/177) diff --git a/scripts/generate_edge_navigation.py b/scripts/generate_edge_navigation.py index 3fa695e..52c6992 100644 --- a/scripts/generate_edge_navigation.py +++ b/scripts/generate_edge_navigation.py @@ -18,6 +18,24 @@ _COMPOSED_MEMBER_EDGE_NAMES = frozenset({"EXPOSES", "DECLARES_CLIENT", "DECLARES_PRODUCER"}) +_GRAPH_STORAGE_APPENDIX = """ +## Graph storage (not MCP `neighbors` edge_types) + +### `UnresolvedCallSite` + `UNRESOLVED_AT` (ontology 15 / CALLS-NOISE PR-3) + +Receiver-failure call sites (`chained_receiver`, `phantom_unresolved_receiver`) are **not** `CALLS` rows. They are `UnresolvedCallSite` nodes (`id` prefix `ucs:`) linked from the caller method Symbol via `UNRESOLVED_AT`. + +| Surface | How to read them | +| --- | --- | +| `describe(method_id)` | `record.data.unresolved_call_sites` (capped at 5) + footer when more exist | +| `neighbors(..., ['CALLS'], include_unresolved=True)` | Interleaved transcript; `row_kind='unresolved_call_site'`; `other.kind=unresolved_call_site` | +| CLI | `java-codebase-rag unresolved-calls list|stats` | + +- **Not** in `EDGE_SCHEMA` — do not pass `UNRESOLVED_AT` to `neighbors(edge_types=…)`. +- **`describe(ucs:…)`** is invalid (fail-loud); describe the **caller method** instead. 
+- Fresh graphs: `CALLS.strategy` no longer includes `phantom` or `chained_receiver` for receiver failure (those literals remain on HTTP/ASYNC `match` and brownfield resolver sets). +""" + _DEFAULT_OUT = _REPO_ROOT / "docs" / "EDGE-NAVIGATION.md" _BANNER = ( "# Edge Navigation Schema\n\n" @@ -75,6 +93,7 @@ def generate_markdown() -> str: parts.append("") for spec in EDGE_SCHEMA.values(): parts.extend(_render_edge(spec)) + parts.append(_GRAPH_STORAGE_APPENDIX.rstrip()) return "\n".join(parts).rstrip() + "\n" diff --git a/server.py b/server.py index a1d86ec..74fe6ea 100644 --- a/server.py +++ b/server.py @@ -502,7 +502,22 @@ async def neighbors( description=( "Optional EdgeFilter on CALLS edge attributes (edge_types=['CALLS'] only). Use " "callee_declaring_role for callee stereotype projection — not NodeFilter.role on method neighbors. " - "Prefer a JSON object; a JSON-encoded string is accepted." + "Mutually exclusive with include_unresolved. Prefer a JSON object; a JSON-encoded string is accepted." + ), + ), + include_unresolved: bool = Field( + default=False, + description=( + "When true with edge_types=['CALLS'] and direction='out', interleave UnresolvedCallSite " + "rows (row_kind='unresolved_call_site') with resolved CALLS in source order. " + "Mutually exclusive with edge_filter." + ), + ), + dedup_calls: bool = Field( + default=False, + description=( + "When true with edge_types=['CALLS'], collapse identical (origin, callee) CALLS to one row " + "with call_site_count and call_site_lines; unresolved sites are not deduped." 
), ), ) -> mcp_v2.NeighborsOutput: @@ -515,6 +530,8 @@ async def neighbors( offset, filter, edge_filter, + include_unresolved, + dedup_calls, None, ) diff --git a/tests/test_ast_graph_build.py b/tests/test_ast_graph_build.py index d571a56..5366215 100644 --- a/tests/test_ast_graph_build.py +++ b/tests/test_ast_graph_build.py @@ -52,9 +52,9 @@ def test_schema_has_all_expected_tables(kuzu_db_path: Path) -> None: # We only assert the tables we depend on are present. The builder is # free to add more (e.g. CALLS later) without breaking this test. expected = { - "Symbol", "Route", "Client", "GraphMeta", - "EXTENDS", "IMPLEMENTS", "INJECTS", "DECLARES", "OVERRIDES", "CALLS", "EXPOSES", - "DECLARES_CLIENT", "DECLARES_PRODUCER", + "Symbol", "UnresolvedCallSite", "Route", "Client", "GraphMeta", + "EXTENDS", "IMPLEMENTS", "INJECTS", "DECLARES", "OVERRIDES", "CALLS", "UNRESOLVED_AT", + "EXPOSES", "DECLARES_CLIENT", "DECLARES_PRODUCER", } missing = expected - tables assert not missing, f"missing schema tables: {missing}; saw {tables}" @@ -347,3 +347,43 @@ def test_cli_entrypoint_runs(tmp_path: Path, corpus_root: Path) -> None: assert target.exists() conn = _connect(target) assert _scalar(conn, "MATCH (s:Symbol) RETURN count(s)") > 0 + + +def test_pass3_no_phantom_chained_calls_rows(kuzu_db_path: Path) -> None: + """HV19 — receiver-failure strategies must not appear on CALLS after PR-3.""" + conn = _connect(kuzu_db_path) + n = _scalar( + conn, + "MATCH ()-[c:CALLS]->() " + "WHERE c.strategy IN ['phantom','chained_receiver'] RETURN count(c)", + ) + assert n == 0, f"expected zero phantom/chained_receiver CALLS rows, got {n}" + + +def test_pass3_unresolved_call_site_emitted(kuzu_db_path: Path) -> None: + conn = _connect(kuzu_db_path) + n_ucs = _scalar(conn, "MATCH (u:UnresolvedCallSite) RETURN count(u)") + n_rel = _scalar(conn, "MATCH ()-[:UNRESOLVED_AT]->() RETURN count(*)") + assert n_ucs >= 1, "bank fixture should emit UnresolvedCallSite rows" + assert n_rel == n_ucs + reasons 
= { + r[0] + for r in conn.execute( + "MATCH (u:UnresolvedCallSite) RETURN DISTINCT u.reason" + ) + } + assert reasons <= {"phantom_unresolved_receiver", "chained_receiver"} + assert len(reasons) >= 1 + + +def test_pass3_known_external_calls_preserved(kuzu_db_path: Path) -> None: + """HV37 — JDK/external callee stays on CALLS with resolved=False, not phantom strategy.""" + conn = _connect(kuzu_db_path) + rows = conn.execute( + "MATCH (src:Symbol)-[c:CALLS]->(dst:Symbol) " + "WHERE c.resolved = false AND c.strategy <> 'overload_ambiguous' " + "RETURN c.strategy AS s LIMIT 20" + ) + found = [str(r[0]) for r in rows] + assert found, "bank fixture should have known-external CALLS rows" + assert all(s not in ("phantom", "chained_receiver") for s in found), found diff --git a/tests/test_call_graph_smoke_roundtrip.py b/tests/test_call_graph_smoke_roundtrip.py index 03090cd..24c5303 100644 --- a/tests/test_call_graph_smoke_roundtrip.py +++ b/tests/test_call_graph_smoke_roundtrip.py @@ -167,17 +167,24 @@ def test_overload_distinct_arities_single_targets(kuzu_db_path_call_graph_smoke: def test_expr_qualified_method_ref_chained_receiver(kuzu_db_path_call_graph_smoke: Path) -> None: - """§7.1 #18 (graph): expression-qualified `getX()::trim` → chained_receiver phantom.""" + """§7.1 #18 (graph): expression-qualified `getX()::trim` → chained_receiver UnresolvedCallSite.""" db = kuzu_db_path_call_graph_smoke conn = _connect(db) - rows = _rows( + calls = _rows( conn, "MATCH (src:Symbol)-[c:CALLS]->(dst:Symbol) " "WHERE src.fqn STARTS WITH 'smoke.NestedCalls#m' AND dst.name = 'trim' " - "RETURN c.strategy AS s, c.resolved AS r LIMIT 5", + "RETURN count(*) AS n", + ) + assert int(calls[0][0]) == 0, "trim chained-receiver site must not be a CALLS row" + ucs = _rows( + conn, + "MATCH (src:Symbol)-[:UNRESOLVED_AT]->(u:UnresolvedCallSite) " + "WHERE src.fqn STARTS WITH 'smoke.NestedCalls#m' AND u.callee_simple = 'trim' " + "RETURN u.reason AS reason LIMIT 5", ) - assert rows, "expected a 
trim call site from NestedCalls.m" - assert any(str(r[0]) == "chained_receiver" and r[1] is False for r in rows), rows + assert ucs, "expected trim unresolved site from NestedCalls.m" + assert any(str(r[0]) == "chained_receiver" for r in ucs), ucs def test_anonymous_class_calls_attributed_to_synthetic_member(kuzu_db_path_call_graph_smoke: Path) -> None: diff --git a/tests/test_java_codebase_rag_cli.py b/tests/test_java_codebase_rag_cli.py index ba86fee..84cdd46 100644 --- a/tests/test_java_codebase_rag_cli.py +++ b/tests/test_java_codebase_rag_cli.py @@ -42,9 +42,17 @@ def _base_env(corpus_root: Path, kuzu_db_path: Path | None = None) -> dict[str, return env -def _run_cli(args: list[str], *, env: dict[str, str], stdin: str | None = None) -> subprocess.CompletedProcess: +def _java_codebase_rag_exe() -> str: + venv_exe = Path(sys.executable).parent / "java-codebase-rag" + if venv_exe.is_file(): + return str(venv_exe) exe = shutil.which("java-codebase-rag") assert exe is not None, "expected installed java-codebase-rag entrypoint" + return exe + + +def _run_cli(args: list[str], *, env: dict[str, str], stdin: str | None = None) -> subprocess.CompletedProcess: + exe = _java_codebase_rag_exe() return subprocess.run( [exe, *args], capture_output=True, @@ -76,7 +84,7 @@ def test_cli_erase_refuses_non_tty_without_yes(tmp_path: Path) -> None: env["JAVA_CODEBASE_RAG_INDEX_DIR"] = str(idx) env["JAVA_CODEBASE_RAG_SOURCE_ROOT"] = str(tmp_path) proc = subprocess.run( - [shutil.which("java-codebase-rag"), "erase", "--source-root", str(tmp_path), "--index-dir", str(idx)], + [_java_codebase_rag_exe(), "erase", "--source-root", str(tmp_path), "--index-dir", str(idx)], capture_output=True, text=True, env=env, @@ -439,6 +447,53 @@ def test_cli_tables_lists_known_table(corpus_root, kuzu_db_path) -> None: assert "graph" in payload +def test_cli_unresolved_calls_list_and_stats(corpus_root, kuzu_db_path) -> None: + env = _base_env(corpus_root, kuzu_db_path) + stats_proc = _run_cli( + 
["unresolved-calls", "stats", "--source-root", str(corpus_root), "--by", "reason"], + env=env, + ) + assert stats_proc.returncode == 0, stats_proc.stderr + stats = json.loads(stats_proc.stdout) + assert stats.get("success") is True + assert int(stats.get("total") or 0) >= 1 + assert stats.get("buckets") + + list_proc = _run_cli( + [ + "unresolved-calls", + "list", + "--source-root", + str(corpus_root), + "--reason", + "chained_receiver", + "--limit", + "5", + ], + env=env, + ) + assert list_proc.returncode == 0, list_proc.stderr + listed = json.loads(list_proc.stdout) + assert listed.get("success") is True + sites = listed.get("sites") or [] + assert sites + assert all(str(s.get("id") or "").startswith("ucs:") for s in sites) + assert all(s.get("reason") == "chained_receiver" for s in sites) + + bad_reason = _run_cli( + [ + "unresolved-calls", + "list", + "--source-root", + str(corpus_root), + "--reason", + "phantom", + ], + env=env, + ) + assert bad_reason.returncode != 0 + + def test_cli_diagnose_ignore_walked_path(corpus_root, kuzu_db_path) -> None: env = _base_env(corpus_root, kuzu_db_path) path = "chat-assign/src/main/java/com/bank/chat/assign/service/ChatManagementService.java" diff --git a/tests/test_kuzu_queries.py b/tests/test_kuzu_queries.py index e38db77..067ffc1 100644 --- a/tests/test_kuzu_queries.py +++ b/tests/test_kuzu_queries.py @@ -294,6 +294,12 @@ def test_trace_flow_structural_edges_not_starved_by_calls(kuzu_graph) -> None: ] +def test_find_callers_no_phantom_chained_strategy(kuzu_graph) -> None: + edges = kuzu_graph.find_callers("save", depth=1, limit=100) + for e in edges: + assert e.strategy not in ("phantom", "chained_receiver") + + def test_find_callers_assign_method(kuzu_graph) -> None: needle = "com.bank.chat.assign.service.ChatManagementService#assign(AssignmentRequest)" edges = kuzu_graph.find_callers(needle, depth=1, limit=50) diff --git a/tests/test_mcp_hints.py b/tests/test_mcp_hints.py index 2dd62e5..b48dec1 100644 --- 
a/tests/test_mcp_hints.py +++ b/tests/test_mcp_hints.py @@ -470,8 +470,9 @@ def test_hints_neighbors_fuzzy_strategy_annotation_absent() -> None: def test_hints_neighbors_fuzzy_strategy_calls_phantom_emits() -> None: + """CALLS fuzzy hint uses remaining strategies (overload_ambiguous), not removed phantom/chained.""" payload = _neighbors_hint_payload( - [_edge_result(strategy="phantom", edge_type="CALLS")], + [_edge_result(strategy="overload_ambiguous", edge_type="CALLS")], requested_edge_types=["CALLS"], ) hints = generate_hints("neighbors", payload) @@ -489,7 +490,7 @@ def test_hints_neighbors_declares_no_strategy_attrs_empty() -> None: def test_hints_neighbors_multi_origin_fuzzy_emits_once() -> None: payload = _neighbors_hint_payload( [ - _edge_result(strategy="phantom", edge_type="CALLS"), + _edge_result(strategy="overload_ambiguous", edge_type="CALLS"), _edge_result(strategy="annotation", edge_type="CALLS"), ], requested_edge_types=["CALLS"], @@ -498,6 +499,41 @@ def test_hints_neighbors_multi_origin_fuzzy_emits_once() -> None: assert hints.count(mcp_hints.TPL_NEIGHBORS_FUZZY_STRATEGY) == 1 +def test_hints_neighbors_calls_high_fanout(kuzu_graph) -> None: + mid = client_message_processor_process_id(kuzu_graph) + out = neighbors_v2(mid, direction="out", edge_types=["CALLS"], limit=25, graph=kuzu_graph) + assert out.success is True + assert len(out.results) == 25 + total_calls = kuzu_graph.count_calls_for_symbol(mid, direction="out") + assert total_calls >= 10 + assert mcp_hints.TPL_NEIGHBORS_CALLS_HIGH_FANOUT.format(n=total_calls) in out.hints + assert mcp_hints.TPL_NEIGHBORS_CALLS_HIGH_FANOUT.format(n=len(out.results)) not in out.hints + + +def test_hints_neighbors_calls_has_unresolved(kuzu_graph) -> None: + mid = client_message_processor_process_id(kuzu_graph) + unresolved = kuzu_graph.count_unresolved_for_caller(mid) + assert unresolved >= 1 + out = neighbors_v2(mid, direction="out", edge_types=["CALLS"], limit=5, graph=kuzu_graph) + assert out.success is True 
+ want = mcp_hints.TPL_NEIGHBORS_CALLS_HAS_UNRESOLVED.format(n=len(out.results), k=unresolved) + assert want in out.hints + + +def test_hints_neighbors_calls_high_fanout_suppressed_with_edge_filter(kuzu_graph) -> None: + mid = client_message_processor_process_id(kuzu_graph) + out = neighbors_v2( + mid, + direction="out", + edge_types=["CALLS"], + edge_filter={"callee_declaring_role": "SERVICE"}, + limit=500, + graph=kuzu_graph, + ) + assert out.success is True + assert not any("CALLS on this method" in h for h in out.hints) + + def test_hints_neighbors_layer_a_meta_no_fuzzy_hint() -> None: payload = _neighbors_hint_payload( [_edge_result(strategy="layer_a_meta", edge_type="CALLS")], diff --git a/tests/test_mcp_v2.py b/tests/test_mcp_v2.py index e6e9454..103808c 100644 --- a/tests/test_mcp_v2.py +++ b/tests/test_mcp_v2.py @@ -17,6 +17,7 @@ from java_ontology import VALID_RESOLVE_REASONS from mcp_v2 import ( + Edge, NodeFilter, _NODEFILTER_APPLICABLE_FIELDS, describe_v2, @@ -1486,3 +1487,134 @@ def test_neighbors_calls_perf_empty_filter_client_message_processor(kuzu_graph) assert median_sec <= float(baseline) * 1.5 +def test_neighbors_include_unresolved_interleaved_order(kuzu_graph) -> None: + mid = client_message_processor_process_id(kuzu_graph) + out = neighbors_v2( + mid, + direction="out", + edge_types=["CALLS"], + include_unresolved=True, + limit=500, + graph=kuzu_graph, + ) + assert out.success is True + assert out.results + kinds = [e.attrs.get("row_kind") for e in out.results] + assert "unresolved_call_site" in kinds + assert "resolved" in kinds + ucs_edges = [e for e in out.results if (e.attrs or {}).get("row_kind") == "unresolved_call_site"] + assert ucs_edges + for e in ucs_edges: + assert e.other.kind == "unresolved_call_site" + assert e.other.id.startswith("ucs:") + assert not e.other.id.startswith("sym:") + keys = [ + ( + int(e.attrs.get("call_site_line") or 0), + int(e.attrs.get("call_site_byte") or 0), + 0 if e.attrs.get("row_kind") == "resolved" else 
1, + ) + for e in out.results + ] + assert keys == sorted(keys) + + +def test_neighbors_include_unresolved_edge_filter_mutex(kuzu_graph) -> None: + mid = client_message_processor_process_id(kuzu_graph) + out = neighbors_v2( + mid, + direction="out", + edge_types=["CALLS"], + include_unresolved=True, + edge_filter={"min_confidence": 0.0}, + graph=kuzu_graph, + ) + assert out.success is False + assert "incompatible" in (out.message or "").lower() + + +def test_neighbors_dedup_calls_collapses_identical_dst(kuzu_graph) -> None: + rows = kuzu_graph._rows( # noqa: SLF001 + "MATCH (m:Symbol)-[c:CALLS]->(dst:Symbol) " + "WITH m, dst, collect(c.call_site_line) AS lines " + "WHERE size(lines) > 1 " + "RETURN m.id AS mid, dst.id AS did LIMIT 1", + ) + if not rows: + pytest.skip("no duplicate (caller,callee) CALLS pair in bank fixture") + mid = str(rows[0]["mid"]) + flat = neighbors_v2( + mid, direction="out", edge_types=["CALLS"], limit=500, graph=kuzu_graph, + ) + deduped = neighbors_v2( + mid, + direction="out", + edge_types=["CALLS"], + dedup_calls=True, + limit=500, + graph=kuzu_graph, + ) + assert flat.success and deduped.success + assert len(deduped.results) < len(flat.results) + multi = [e for e in deduped.results if int((e.attrs or {}).get("call_site_count") or 0) > 1] + assert multi, "dedup_calls should emit call_site_count on collapsed rows" + + +def test_describe_ucs_id_not_describable(kuzu_graph) -> None: + rows = kuzu_graph._rows( # noqa: SLF001 + "MATCH (u:UnresolvedCallSite) RETURN u.id AS id LIMIT 1", + ) + assert rows + ucs_id = str(rows[0]["id"]) + assert ucs_id.startswith("ucs:") + out = describe_v2(ucs_id, graph=kuzu_graph) + assert out.success is False + assert out.record is None + assert "not describable" in (out.message or "").lower() + assert "unresolved-calls" in (out.message or "").lower() + + +def test_neighbors_dedup_calls_include_unresolved(kuzu_graph) -> None: + mid = client_message_processor_process_id(kuzu_graph) + out = neighbors_v2( + mid, + 
direction="out", + edge_types=["CALLS"], + include_unresolved=True, + dedup_calls=True, + limit=500, + graph=kuzu_graph, + ) + assert out.success is True + kinds = {str((e.attrs or {}).get("row_kind") or "resolved") for e in out.results} + assert "resolved" in kinds + assert "unresolved_call_site" in kinds + keys = [_calls_transcript_sort_key_from_edge(e) for e in out.results] + assert keys == sorted(keys) + resolved = [e for e in out.results if (e.attrs or {}).get("row_kind") == "resolved"] + assert any(int((e.attrs or {}).get("call_site_count") or 0) > 1 for e in resolved) + + +def _calls_transcript_sort_key_from_edge(edge: Edge) -> tuple[int, int, int]: + attrs = edge.attrs or {} + line = int(attrs.get("call_site_line") or 0) + byte = int(attrs.get("call_site_byte") or 0) + kind_rank = 0 if str(attrs.get("row_kind") or "resolved") == "resolved" else 1 + return (line, byte, kind_rank) + + +def test_describe_unresolved_call_sites_rollup_cap_footer_and_total(kuzu_graph) -> None: + mid = client_message_processor_process_id(kuzu_graph) + out = describe_v2(mid, graph=kuzu_graph) + assert out.success and out.record + data = out.record.data + total = int(data.get("unresolved_call_sites_total") or 0) + assert total >= 6, "ClientMessageProcessor#process should have multiple unresolved sites" + inline = data.get("unresolved_call_sites") or [] + assert 1 <= len(inline) <= 5 + if total > len(inline): + footer = str(data.get("unresolved_call_sites_footer") or "") + assert "unresolved-calls list" in footer + assert mid in footer + + diff --git a/tests/test_schema_consistency.py b/tests/test_schema_consistency.py index 2f30413..a76bb09 100644 --- a/tests/test_schema_consistency.py +++ b/tests/test_schema_consistency.py @@ -25,6 +25,9 @@ "ast_java.py", ) +# Stored graph rels that are not MCP ``EdgeType`` / ``EDGE_SCHEMA`` entries. 
+_DDL_EDGES_EXCLUDED_FROM_EDGE_SCHEMA = frozenset({"UNRESOLVED_AT"}) + def _ddl_endpoints() -> dict[str, tuple[str, str]]: text = _BUILD_AST_GRAPH.read_text(encoding="utf-8") @@ -46,7 +49,7 @@ def _strategy_literals_in_emitters() -> set[str]: def test_schema_consistency_all_ddl_endpoints_match_edge_schema() -> None: ddl = _ddl_endpoints() schema_names = set(EDGE_SCHEMA) - ddl_names = set(ddl) + ddl_names = set(ddl) - _DDL_EDGES_EXCLUDED_FROM_EDGE_SCHEMA assert schema_names == ddl_names, ( f"EDGE_SCHEMA keys {sorted(schema_names)} != DDL edges {sorted(ddl_names)}" )