Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,9 @@ Resolution order for `microservice`:

### Re-index required when ontology changes

Current ontology version is **14**. Any index built before this version must be rebuilt via `cocoindex update ... --full-reprocess -f` or a full `java-codebase-rag reprocess` (no selective flags) so vectors and graph stay aligned. Until re-indexed, the server defensively JSON-decodes string-form list columns so nothing explodes, but filters like `array_contains` will not work.
Current ontology version is **15**. Any index built before this version must be rebuilt via `cocoindex update ... --full-reprocess -f` or a full `java-codebase-rag reprocess` (no selective flags) so vectors and graph stay aligned. Until re-indexed, the server defensively JSON-decodes string-form list columns so nothing explodes, but filters like `array_contains` will not work.

Ontology **15** (CALLS-NOISE PR-1) adds `CALLS.callee_declaring_role`, `GraphMeta.pass3_unresolved_phantom_receiver` / `pass3_unresolved_chained`, and **supertype-walk dedup** at build time: duplicate interface + concrete candidates at the same call site collapse to one `CALLS` row, so per-method `CALLS` row counts may drop after re-index (the change is not limited to the new column). PR-2 adds `edge_filter` on `neighbors`; PR-3 moves true receiver-failure rows off `CALLS`.

Ontology **14** introduces `EDGE_SCHEMA` in `java_ontology.py` as the canonical edge navigation schema (see `docs/EDGE-NAVIGATION.md`). **`HTTP_CALLS` is `Client → Route`** (SCHEMA-V2 PR-B). **`ASYNC_CALLS` is `Producer → Route`** with `DECLARES_PRODUCER` (SCHEMA-V2 PR-C). Run one full reprocess after upgrading through the SCHEMA-V2 sequence (or when you need the v14 ontology gate).

Expand Down
3 changes: 2 additions & 1 deletion ast_java.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,9 @@
# Phase 9: `@CodebaseAsyncRoute` replaces same-method built-in `@KafkaListener` routes in graph composition.
# Phase 10: `@CodebaseHttpClient` rename + `CodebaseHttpMethod` enum; inbound HTTP layer-C replaces built-in rows.
# Phase 11: `EDGE_SCHEMA` in `java_ontology.py` (canonical edge navigation schema; v14 re-index).
# Phase 12: CALLS `callee_declaring_role`, supertype-walk dedup, pass3 unresolved counters (v15 re-index).
# Bumps whenever extraction / enrichment semantics change.
ONTOLOGY_VERSION = 14
ONTOLOGY_VERSION = 15

ROLE_ANNOTATIONS: dict[str, str] = {
# Spring Web
Expand Down
102 changes: 100 additions & 2 deletions build_ast_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ class CallsRow:
strategy: str = "phantom"
source: str = "static"
resolved: bool = True
callee_declaring_role: str = "OTHER"


@dataclass
Expand Down Expand Up @@ -380,7 +381,11 @@ class GraphTables:
parse_errors: int = 0
skipped_files: int = 0
pass3_skipped_cross_service: int = 0
pass3_unresolved_phantom_receiver: int = 0
pass3_unresolved_chained: int = 0
cross_service_resolution: str = "auto"
# Populated in _write_nodes (same overrides + meta_chain as Symbol.role).
type_role_by_node_id: dict[str, str] = field(default_factory=dict)


# ---------- file walk (see `path_filtering.iter_java_source_files`) ----------
Expand Down Expand Up @@ -1129,6 +1134,81 @@ def _phantom_method_id(
return pid


def _method_signature_matches_call(member: MemberEntry, call: CallSite) -> bool:
    """Tell whether ``member`` is arity-compatible with the call site ``call``.

    A negative ``call.arg_count`` always matches (presumably it marks an
    unknown argument count -- confirm against the CallSite producer).
    Otherwise the number of declared parameters on ``member.decl`` must
    equal the call's argument count.
    """
    wanted = call.arg_count
    return wanted < 0 or len(member.decl.parameters) == wanted


def _is_strict_supertype_of(tables: GraphTables, super_fqn: str, subtype_fqn: str) -> bool:
    """Return True when ``super_fqn`` is a proper, transitive supertype of ``subtype_fqn``.

    Walks the supertype graph breadth-first, starting from the direct
    supertypes of ``subtype_fqn`` as reported by ``_direct_supertype_fqns``.

    Args:
        tables: Graph tables; only ``tables.types`` (FQN -> type entry) is read here.
        super_fqn: Fully-qualified name of the candidate supertype.
        subtype_fqn: Fully-qualified name of the type whose ancestry is walked.

    Returns:
        ``False`` when both names are equal (the relation is strict) or when
        ``subtype_fqn`` is not present in ``tables.types``; otherwise whether
        ``super_fqn`` is reachable through the supertype walk.
    """
    # Function-scope import keeps this block self-contained.
    from collections import deque

    if super_fqn == subtype_fqn:
        return False
    entry = tables.types.get(subtype_fqn)
    if entry is None:
        return False
    visited: set[str] = set()
    # deque gives O(1) pops from the left; list.pop(0) shifts every element.
    queue = deque(_direct_supertype_fqns(entry, tables))
    while queue:
        tfqn = queue.popleft()
        # Compare before the membership test so a supertype that is not
        # itself indexed in tables.types can still match.
        if tfqn == super_fqn:
            return True
        # Skip already-expanded types (cycle guard) and types we cannot expand.
        if tfqn in visited or tfqn not in tables.types:
            continue
        visited.add(tfqn)
        queue.extend(_direct_supertype_fqns(tables.types[tfqn], tables))
    return False


def _callee_declaring_role_at_write(
    tables: GraphTables,
    dst_id: str,
    *,
    member_by_id: dict[str, MemberEntry],
) -> str:
    """Resolve the role string to store on a CALLS edge for callee ``dst_id``.

    The role is read from ``tables.type_role_by_node_id`` using the callee
    member's ``parent_id``, so it mirrors the role recorded for the declaring
    type. ``"OTHER"`` is returned when the callee is a phantom, is not a known
    member, or its parent has no recorded role.
    """
    fallback = "OTHER"
    # Phantom callees never consult the member index.
    callee = None if dst_id in tables.phantoms else member_by_id.get(dst_id)
    if callee is None:
        return fallback
    return tables.type_role_by_node_id.get(callee.parent_id, fallback)


def _collapse_supertype_duplicates(
    candidates: list[MemberEntry],
    recv_type_fqn: str,
    call: CallSite,
    tables: GraphTables,
) -> list[MemberEntry]:
    """§3.3.1 supertype-walk dedup for one call site's candidate callees.

    The candidate list is reduced to a single entry only when all of the
    following hold:

    * exactly one candidate is declared on ``recv_type_fqn`` and is
      arity-compatible with ``call``;
    * at least one other candidate is declared on a strict supertype of
      ``recv_type_fqn`` and has the same ``decl.signature``;
    * every candidate is either that receiver-declared member or one of those
      supertype duplicates.

    In every other case the input list is returned untouched.
    """
    if len(candidates) < 2:
        return candidates

    # Candidates declared directly on the receiver type that fit the call arity.
    on_receiver = []
    for cand in candidates:
        if cand.parent_fqn == recv_type_fqn and _method_signature_matches_call(cand, call):
            on_receiver.append(cand)
    if len(on_receiver) != 1:
        return candidates
    (winner,) = on_receiver

    # Same-signature declarations sitting on strict supertypes of the receiver.
    inherited_ids = set()
    for cand in candidates:
        if cand is winner:
            continue
        if not _is_strict_supertype_of(tables, cand.parent_fqn, recv_type_fqn):
            continue
        if cand.decl.signature == winner.decl.signature:
            inherited_ids.add(cand.node_id)
    if not inherited_ids:
        return candidates

    # Any candidate outside {winner} + supertype duplicates blocks the collapse.
    explained = inherited_ids | {winner.node_id}
    for cand in candidates:
        if cand.node_id not in explained:
            return candidates

    log.debug(
        "pass3 supertype dedup %s -> %s",
        [cand.node_id for cand in candidates],
        winner.node_id,
    )
    return [winner]


def _emit_call_edge(
tables: GraphTables,
stats: CallResolutionStats,
Expand Down Expand Up @@ -1269,6 +1349,9 @@ def _resolve_and_emit_call(
)
return

if len(candidates) > 1 and edge_strat != "overload_ambiguous":
candidates = _collapse_supertype_duplicates(candidates, recv_type, call, tables)

if len(candidates) == 1:
candidate = candidates[0]
ref_arity: int | None = None
Expand Down Expand Up @@ -1334,6 +1417,8 @@ def pass3_calls(tables: GraphTables, asts: dict[str, JavaFileAst], *, verbose: b
pct_callee_unres = 100.0 * stats.callee_unresolved / max(1, stats.total)
pct_phantom_recv = 100.0 * stats.phantom_other / max(1, stats.total)
tables.pass3_skipped_cross_service = int(stats.skipped_cross_service)
tables.pass3_unresolved_phantom_receiver = int(stats.phantom_other)
tables.pass3_unresolved_chained = int(stats.phantom_chained)
msg = (
f"Call resolution: {stats.total} sites, {stats.phantom_chained} chained phantoms "
f"({pct_chained:.1f}%), {stats.callee_unresolved} unresolved callee "
Expand Down Expand Up @@ -2262,6 +2347,8 @@ def _micro_factor(member: MemberEntry | None) -> float:
"async_calls_match_breakdown STRING, "
"cross_service_calls_total INT64, "
"pass3_skipped_cross_service INT64, "
"pass3_unresolved_phantom_receiver INT64, "
"pass3_unresolved_chained INT64, "
"pass4_exposes_suppressed_feign INT64, "
"cross_service_resolution STRING"
")"
Expand Down Expand Up @@ -2316,7 +2403,8 @@ def _micro_factor(member: MemberEntry | None) -> float:
_SCHEMA_CALLS = (
"CREATE REL TABLE CALLS(FROM Symbol TO Symbol, "
"call_site_line INT64, call_site_byte INT64, arg_count INT64, "
"confidence DOUBLE, strategy STRING, source STRING, resolved BOOLEAN)"
"confidence DOUBLE, strategy STRING, source STRING, resolved BOOLEAN, "
"callee_declaring_role STRING)"
)
_SCHEMA_EXPOSES = (
"CREATE REL TABLE EXPOSES(FROM Symbol TO Route, "
Expand Down Expand Up @@ -2445,6 +2533,7 @@ def _write_nodes(
overrides=overrides,
meta_chain=mch,
)
tables.type_role_by_node_id[entry.node_id] = role
conn.execute(_CREATE_SYMBOL, _node_row(
id=entry.node_id, kind=d.kind, name=d.name, fqn=d.fqn,
package=entry.package,
Expand Down Expand Up @@ -2503,7 +2592,8 @@ def _write_nodes(
"MATCH (a:Symbol {id: $src}), (b:Symbol {id: $dst}) "
"CREATE (a)-[:CALLS {"
"call_site_line: $line, call_site_byte: $byte, arg_count: $argc, "
"confidence: $conf, strategy: $strat, source: $src_kind, resolved: $resolved"
"confidence: $conf, strategy: $strat, source: $src_kind, resolved: $resolved, "
"callee_declaring_role: $callee_declaring_role"
"}]->(b)"
)

Expand Down Expand Up @@ -2637,6 +2727,7 @@ def _write_edges(conn: kuzu.Connection, tables: GraphTables) -> None:
seen_calls.add(key)
unique_calls.append(row)

member_by_id = {m.node_id: m for m in tables.members}
for row in unique_calls:
conn.execute(_CREATE_CALL, {
"src": row.src_id, "dst": row.dst_id,
Expand All @@ -2647,6 +2738,9 @@ def _write_edges(conn: kuzu.Connection, tables: GraphTables) -> None:
"strat": row.strategy,
"src_kind": row.source,
"resolved": row.resolved,
"callee_declaring_role": _callee_declaring_role_at_write(
tables, row.dst_id, member_by_id=member_by_id,
),
})


Expand Down Expand Up @@ -2788,6 +2882,8 @@ def _write_meta(conn: kuzu.Connection, tables: GraphTables, source_root: Path) -
"async_calls_match_breakdown: $async_calls_match_breakdown, "
"cross_service_calls_total: $cross_service_calls_total, "
"pass3_skipped_cross_service: $pass3_skipped_cross_service, "
"pass3_unresolved_phantom_receiver: $pass3_unresolved_phantom_receiver, "
"pass3_unresolved_chained: $pass3_unresolved_chained, "
"pass4_exposes_suppressed_feign: $pass4_exposes_suppressed_feign, "
"cross_service_resolution: $cross_service_resolution})",
{
Expand Down Expand Up @@ -2821,6 +2917,8 @@ def _write_meta(conn: kuzu.Connection, tables: GraphTables, source_root: Path) -
"async_calls_match_breakdown": json.dumps(async_match),
"cross_service_calls_total": int(call_stats.cross_service_calls_total),
"pass3_skipped_cross_service": int(tables.pass3_skipped_cross_service),
"pass3_unresolved_phantom_receiver": int(tables.pass3_unresolved_phantom_receiver),
"pass3_unresolved_chained": int(tables.pass3_unresolved_chained),
"pass4_exposes_suppressed_feign": int(st.exposes_suppressed_feign),
"cross_service_resolution": str(tables.cross_service_resolution),
},
Expand Down
10 changes: 6 additions & 4 deletions docs/AGENT-GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@
> `neighbors` arguments, pass stringified JSON, or use vector search for
> questions the graph answers exactly. This guide keeps them on the rails.
>
> Calibrated against ontology version **14** (see `ast_java.ONTOLOGY_VERSION` /
> Calibrated against ontology version **15** (see `ast_java.ONTOLOGY_VERSION` /
> `java_ontology.EDGE_SCHEMA` + valid sets): canonical edge navigation schema in
> `docs/EDGE-NAVIGATION.md`. v14 re-index required; `HTTP_CALLS` is `Client → Route`;
> `Producer` + `DECLARES_PRODUCER` and `ASYNC_CALLS` (`Producer → Route`) ship in v14.
> `docs/EDGE-NAVIGATION.md`. v15 re-index required — `CALLS.callee_declaring_role`,
> supertype-walk dedup (fewer duplicate-site rows), and `GraphMeta` pass3 unresolved
> counters; PR-2 adds `edge_filter` on `neighbors`. v14: `HTTP_CALLS` is `Client → Route`;
> `Producer` + `DECLARES_PRODUCER` and `ASYNC_CALLS` (`Producer → Route`).
> Still includes stored `OVERRIDES` Symbol→Symbol edges and v12 HTTP brownfield
> (`@CodebaseHttpClient`, shared `CodebaseHttpMethod` enum, inbound layer-C HTTP routes
> replace same-method built-in rows). **Design rationale:** navigation surface and tools —
Expand Down Expand Up @@ -264,7 +266,7 @@ Virtual keys (`OVERRIDDEN_BY`, …) are **not** valid `neighbors` arguments —
- **Mixed flat + composed `edge_types`:** flat edges are appended before composed edges, then `limit`/`offset` apply. A small `limit` with e.g. `["DECLARES", "DECLARES.DECLARES_CLIENT"]` may return only member Symbols and no Clients — use the dot-key alone when enumerating terminals.
- **Confidence:** Cross-service edges (`HTTP_CALLS`, `ASYNC_CALLS`) carry confidence, strategy, and match metadata on `edge.attrs` (`attrs.confidence`, `attrs.strategy`, `attrs.match`). Low confidence means the resolver had to guess at the route binding — treat it as a **resolver gap signal**, not a hallucination. Report low-confidence edges with their confidence value, not as facts. Intra-service edges (`CALLS`, `INJECTS`, `IMPLEMENTS`, `EXTENDS`, `DECLARES`, `DECLARES_CLIENT`, `EXPOSES`, `OVERRIDES`) faithfully represent the static graph; the resolved set is still a **lower bound** under reflection / dynamic dispatch (see *What this MCP is NOT*).

### Ontology glossary (version 14)
### Ontology glossary (version 15)

Source of truth: `java_ontology.py` (`EDGE_SCHEMA`, valid sets). Strings are case-sensitive. Edge navigation: [`docs/EDGE-NAVIGATION.md`](./EDGE-NAVIGATION.md) — for `HTTP_CALLS`, traverse via `DECLARES_CLIENT` from a method Symbol or `neighbors` outbound from a Client id; for `ASYNC_CALLS`, traverse via `DECLARES_PRODUCER` or outbound from a Producer id.

Expand Down
1 change: 1 addition & 0 deletions docs/EDGE-NAVIGATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@
- `strategy` (`STRING`) — call-graph resolution strategy literal
- `source` (`STRING`) — call-graph source tag
- `resolved` (`BOOLEAN`) — True iff callee Symbol was resolved in-graph
- `callee_declaring_role` (`STRING`) — role of the Symbol that declares the callee method

**Typical traversals**:

Expand Down
5 changes: 5 additions & 0 deletions java_ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,11 @@ class EdgeSpec:
EdgeAttr("strategy", "STRING", "call-graph resolution strategy literal"),
EdgeAttr("source", "STRING", "call-graph source tag"),
EdgeAttr("resolved", "BOOLEAN", "True iff callee Symbol was resolved in-graph"),
EdgeAttr(
"callee_declaring_role",
"STRING",
"role of the Symbol that declares the callee method",
),
),
purpose="intra-codebase method call from caller method to callee method",
member_only=True,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package smoke;

/** Minimal interface + concrete same-site stub for pass3 supertype-walk dedup (PR-1). */
@interface Repository {
    // Marker annotation local to package smoke: declares no elements.
}

// Interface half of the interface + concrete pair: declares save(Object).
@Repository
interface JpaStyleRepo {
    void save(Object entity);
}

// Concrete half of the pair: overrides JpaStyleRepo.save(Object) with the same signature.
@Repository
class JpaStyleRepoImpl implements JpaStyleRepo {
    @Override
    public void save(Object entity) {
        // No-op body.
    }
}

// Caller for the dedup fixture: holds the repository as the concrete
// JpaStyleRepoImpl type (not the interface) and calls save(...) once.
public class SupertypeDedupPatterns {
    private final JpaStyleRepoImpl repo;

    SupertypeDedupPatterns(JpaStyleRepoImpl repo) {
        this.repo = repo;
    }

    void persist(Object entity) {
        // Single call site; the receiver's declared type is JpaStyleRepoImpl,
        // which both declares save(Object) and inherits it from JpaStyleRepo.
        repo.save(entity);
    }
}
92 changes: 92 additions & 0 deletions tests/test_ast_graph_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
import kuzu
import pytest

from _builders import build_kuzu_to
from ast_java import ONTOLOGY_VERSION
from graph_enrich import _load_brownfield_overrides, collect_annotation_meta_chain


def _connect(db_path: Path) -> kuzu.Connection:
Expand Down Expand Up @@ -58,6 +60,96 @@ def test_schema_has_all_expected_tables(kuzu_db_path: Path) -> None:
assert not missing, f"missing schema tables: {missing}; saw {tables}"


def test_graph_meta_unresolved_counters_present(kuzu_db_path: Path) -> None:
    """GraphMeta carries both pass3 unresolved counters as non-null, non-negative values."""
    query = (
        "MATCH (m:GraphMeta) RETURN m.pass3_unresolved_phantom_receiver, "
        "m.pass3_unresolved_chained"
    )
    conn = _connect(kuzu_db_path)
    result = conn.execute(query)
    assert result.has_next(), "expected GraphMeta row"
    record = result.get_next()
    # Column 0: phantom-receiver counter; column 1: chained counter.
    for column in (0, 1):
        counter = record[column]
        assert counter is not None and int(counter) >= 0


def test_calls_callee_declaring_role_matches_parent_symbol_role_yaml_brownfield(
    tmp_path: Path,
) -> None:
    """YAML role_overrides on declaring type → edge attr matches parent Symbol.role."""
    # Reset both caches before building -- presumably so the YAML written below
    # is read fresh rather than served from an earlier test's cached result
    # (TODO confirm against graph_enrich).
    _load_brownfield_overrides.cache_clear()
    collect_annotation_meta_chain.cache_clear()
    # Lay out a throwaway project: one Java source file plus a config file.
    root = tmp_path / "proj"
    java_dir = root / "src/main/java/smoke"
    java_dir.mkdir(parents=True)
    # Fixture: ConfigOnlyService carries only the custom @LegacyServiceMarker
    # annotation; Caller.run invokes ConfigOnlyService.handle().
    (java_dir / "BrownfieldCallRole.java").write_text(
        """
package smoke;

@interface LegacyServiceMarker { }

@LegacyServiceMarker
class ConfigOnlyService {
void handle() { }
}

class Caller {
void run(ConfigOnlyService svc) {
svc.handle();
}
}
""".strip()
        + "\n",
        encoding="utf-8",
    )
    # Config maps the custom annotation to the SERVICE role.
    (root / ".java-codebase-rag.yml").write_text(
        "role_overrides:\n"
        " annotations:\n"
        " LegacyServiceMarker: SERVICE\n",
        encoding="utf-8",
    )
    # NOTE(review): max_pass=3 presumably stops the build after the CALLS pass -- confirm in _builders.
    db_path = build_kuzu_to(root, tmp_path / "g.kuzu", max_pass=3)
    conn = _connect(db_path)
    # Invariant: no CALLS edge may disagree with the role of its callee's parent Symbol.
    mismatches = _scalar(
        conn,
        "MATCH ()-[c:CALLS]->(dst:Symbol) "
        "MATCH (parent:Symbol {id: dst.parent_id}) "
        "WHERE c.callee_declaring_role <> parent.role "
        "RETURN count(*)",
    )
    assert mismatches == 0
    # Targeted check: calls into ConfigOnlyService carry exactly the overridden role.
    roles = _column(
        conn,
        "MATCH ()-[c:CALLS]->(dst:Symbol) "
        "MATCH (parent:Symbol {id: dst.parent_id}) "
        "WHERE parent.fqn = 'smoke.ConfigOnlyService' "
        "RETURN DISTINCT c.callee_declaring_role",
    )
    assert roles == ["SERVICE"]


def test_pass3_callee_declaring_role_bank_annotated_types(kuzu_db_path: Path) -> None:
    """CALLS edges into annotated declaring types carry the matching role.

    Callees declared on ``Service``-annotated SERVICE types must exist and all
    report ``SERVICE``; callees declared on ``Repository``-annotated types, if
    any are present, must all report ``REPOSITORY``.
    """
    conn = _connect(kuzu_db_path)

    service_query = (
        "MATCH (src:Symbol)-[c:CALLS]->(dst:Symbol) "
        "MATCH (parent:Symbol {id: dst.parent_id}) "
        "WHERE 'Service' IN parent.annotations AND parent.role = 'SERVICE' "
        "RETURN c.callee_declaring_role LIMIT 20"
    )
    service_roles = _column(conn, service_query)
    assert service_roles, "expected CALLS to @Service-declared callees on bank-chat-system"
    # Non-empty list whose stringified values form exactly {"SERVICE"}.
    assert {str(role) for role in service_roles} == {"SERVICE"}, service_roles

    repository_query = (
        "MATCH (src:Symbol)-[c:CALLS]->(dst:Symbol) "
        "MATCH (parent:Symbol {id: dst.parent_id}) "
        "WHERE 'Repository' IN parent.annotations "
        "RETURN DISTINCT c.callee_declaring_role"
    )
    repository_roles = _column(conn, repository_query)
    # An empty result passes trivially; otherwise only "REPOSITORY" is allowed.
    assert {str(role) for role in repository_roles} <= {"REPOSITORY"}, repository_roles


def test_graph_meta_present_and_versioned(kuzu_db_path: Path) -> None:
conn = _connect(kuzu_db_path)
r = conn.execute(
Expand Down
Loading
Loading