From f025094702e7e9de5136da5aced607c496a66dcb Mon Sep 17 00:00:00 2001 From: Dmitry Teryaev Date: Wed, 13 May 2026 18:09:33 +0300 Subject: [PATCH] add selective reprocess and phases_run contract Co-authored-by: Cursor --- README.md | 10 +- docs/JAVA-CODEBASE-RAG-CLI.md | 41 +++- java_codebase_rag/cli.py | 155 ++++++++++++++- server.py | 21 +- tests/test_java_codebase_rag_cli.py | 298 ++++++++++++++++++++++++++++ 5 files changed, 504 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 9f5fbaa..79e7ed8 100644 --- a/README.md +++ b/README.md @@ -220,7 +220,7 @@ async_producer_overrides: - **The file must be at `source_root`**, not in `$HOME`. The MCP server reads `JAVA_CODEBASE_RAG_SOURCE_ROOT` to find it; the CLI uses `--source-root` (else cwd). - **Don't commit secrets** into this YAML — it sits next to your source tree and is read by every operator who clones it. -- **Rebuild after editing brownfield overrides.** `java-codebase-rag reprocess` rebuilds Lance + Kuzu so the new overrides take effect. Editing `embedding.model` also requires reprocess (the embeddings in Lance must match the reader's model). +- **Rebuild after editing brownfield overrides.** Run a full `java-codebase-rag reprocess` (no flags) so Lance and Kuzu stay coherent, or use `--graph-only` / `--vectors-only` when you know only one store needs invalidation. Editing `embedding.model` requires a vector rebuild (`reprocess` or `--vectors-only`). - **Diagnose what's loaded.** `java-codebase-rag meta` prints the resolved config and each value's `*_source` (`cli` / `env` / `yaml` / `default`) — see `embedding_model_source`, `embedding_device_source`, `index_dir_source`. - **`embedding.model` and `$` in directory names.** `expandvars` treats `$VAR` / `${VAR}` like the shell. HuggingFace hub ids never contain `$`. 
If a local filesystem path contains a literal `$` in a directory name, use an absolute path that avoids `$`-expansion patterns, or expect `expandvars` to interpret `$` sequences. @@ -293,7 +293,7 @@ Shared flags on all subcommands: `--source-root`, `--index-dir`, `--embedding-mo |---|---|---| | Lifecycle | `init` | First-time index; refuses if the index dir already has artifacts. | | Lifecycle | `increment` | CocoIndex catch-up (Lance only); prints a stderr warning that Kuzu is unchanged until `reprocess`. | -| Lifecycle | `reprocess` | Full Lance reprocess + full Kuzu rebuild (full indexing pipeline). | +| Lifecycle | `reprocess` | Default: full Lance reprocess + full Kuzu rebuild. Optional `--vectors-only` / `--graph-only` (mutually exclusive) for a single phase. | | Lifecycle | `erase` | Deletes index artifacts; requires `--yes` or interactive TTY confirm. | | Introspection | `meta`, `tables`, `diagnose-ignore` | Health, table listing, ignore-layer diagnostics. | | Analysis | `analyze-pr` | Blast-radius / risk from a unified diff. | @@ -359,7 +359,7 @@ JAVA_CODEBASE_RAG_INDEX_DIR=/path/to/.java-codebase-rag .venv/bin/python search_ ### Building the graph standalone -`java-codebase-rag reprocess` runs `cocoindex update` with a full reprocess flag, then invokes `build_ast_graph.py` to rebuild Kuzu under the resolved index directory. To rebuild only the graph: +`java-codebase-rag reprocess` (default, no flags) runs `cocoindex update` with a full reprocess flag, then invokes `build_ast_graph.py` to rebuild Kuzu under the resolved index directory. For a **graph-only** rebuild from the CLI, prefer `java-codebase-rag reprocess --graph-only` (see [`docs/JAVA-CODEBASE-RAG-CLI.md`](./docs/JAVA-CODEBASE-RAG-CLI.md)). To invoke the graph builder directly: ```bash # Scan the current working directory @@ -439,7 +439,7 @@ Resolution order for `microservice`: ### Re-index required when ontology changes -Current ontology version is **12**. 
Any index built before this version must be rebuilt via `cocoindex update ... --full-reprocess -f` or `java-codebase-rag reprocess`. Until re-indexed, the server defensively JSON-decodes string-form list columns so nothing explodes, but filters like `array_contains` will not work. +Current ontology version is **12**. Any index built before this version must be rebuilt via `cocoindex update ... --full-reprocess -f` or a full `java-codebase-rag reprocess` (no selective flags) so vectors and graph stay aligned. Until re-indexed, the server defensively JSON-decodes string-form list columns so nothing explodes, but filters like `array_contains` will not work. Ontology **12** renames `@CodebaseClient` to `@CodebaseHttpClient`, types HTTP `method` as the shared `CodebaseHttpMethod` enum on both inbound and outbound stubs, and makes inbound layer-C HTTP routes **replace** same-method built-in Spring rows (no merge). Rebuild after upgrading so `meta_chain` keys and annotation simple names match the extractor. @@ -687,7 +687,7 @@ public Reply callJoinOperator(Request req) { /* ... */ } public void publishFollowUp(Event e) { /* ... */ } ``` -Resolution order in code: built-in inference → config annotation maps → meta-annotation walk → `@CodebaseRole` / `@CodebaseCapability` → `role_overrides.fqn` (highest priority for explicit per-type config). Route composition uses the same first-pass index, then `@CodebaseHttpRoute` / `@CodebaseAsyncRoute`, then `route_overrides.fqn`. Rebuild Lance + Kuzu (`java-codebase-rag reprocess` or `build_ast_graph.py`) after changing overrides. +Resolution order in code: built-in inference → config annotation maps → meta-annotation walk → `@CodebaseRole` / `@CodebaseCapability` → `role_overrides.fqn` (highest priority for explicit per-type config). Route composition uses the same first-pass index, then `@CodebaseHttpRoute` / `@CodebaseAsyncRoute`, then `route_overrides.fqn`. 
Rebuild the affected store (`java-codebase-rag reprocess`, or `--vectors-only` / `--graph-only` when appropriate, or `build_ast_graph.py` for graph-only manual runs) after changing overrides. ### 7.4 Caller-side overrides diff --git a/docs/JAVA-CODEBASE-RAG-CLI.md b/docs/JAVA-CODEBASE-RAG-CLI.md index 6c74cdc..524724e 100644 --- a/docs/JAVA-CODEBASE-RAG-CLI.md +++ b/docs/JAVA-CODEBASE-RAG-CLI.md @@ -19,7 +19,7 @@ If `java-codebase-rag` is missing, run the module entrypoint: ## Output mode -- **TTY:** human-readable `pprint` of the payload on stdout. +- **TTY:** human-readable `pprint` of the payload on stdout (except **successful selective `reprocess`** with `--vectors-only` / `--graph-only`, which prints `Rebuilt:` / `Skipped:` lines instead of dumping the full dict). - **Piped / non-TTY:** **single JSON object** per invocation on stdout (no trailing noise). Use this in scripts and CI. Example: @@ -78,8 +78,8 @@ Relative paths for `diagnose-ignore ` are resolved against the MCP/CLI pro | Code | Typical meaning | | ---- | ---------------- | | `0` | Success (payload may still report logical failures inside JSON for some commands — always parse stdout in scripts). | -| `1` | Subcommand-specific failure (e.g. `analyze-pr` cannot read diff, graph missing, invalid path for `diagnose-ignore`). | -| `2` | No subcommand / help printed; **`init`** refused because the index dir is non-empty; **`erase`** refused in non-TTY without `--yes`; **`meta`** when graph payload reports `success: false`; unhandled internal error in `main`. | +| `1` | Subcommand-specific failure (e.g. `analyze-pr` cannot read diff, graph missing, invalid path for `diagnose-ignore`). For **`reprocess`**, a **requested phase subprocess** ran and exited non-zero (see `phases_run` in stdout JSON). 
| +| `2` | No subcommand / help printed; **`init`** refused because the index dir is non-empty; **`erase`** refused in non-TTY without `--yes`; **`meta`** when graph payload reports `success: false`; unhandled internal error in `main`. For **`reprocess`**, invalid flag combination (handled like other argparse errors), or a **setup failure before any phase subprocess was spawned** (`phases_run: []` in the JSON payload — e.g. cocoindex binary missing next to this Python, flow file missing). | ## Lifecycle subcommands @@ -101,10 +101,41 @@ java-codebase-rag increment --source-root /path/to/java/repo --index-dir /path/t ### `reprocess` -Full **Lance reprocess** + **full Kuzu rebuild** (full indexing pipeline). +**Default (no extra flags):** full **Lance** reprocess (cocoindex `--full-reprocess`) then full **Kuzu** rebuild via `build_ast_graph.py`, in that order. This remains the recommended **coherence** operation when both stores might be out of date. + +**Selective flags (mutually exclusive):** + +- `--vectors-only` — runs only the cocoindex full reprocess phase; does **not** invoke the graph builder. +- `--graph-only` — runs only `build_ast_graph.py`; does **not** invoke cocoindex. + +Passing **both** flags is rejected by argparse **before** any subprocess runs. The error is printed on **stderr** in this form (wording may vary slightly with Python/argparse version): + +```text +java-codebase-rag: argument --graph-only: not allowed with argument --vectors-only +``` + +Use `java-codebase-rag reprocess --help` for the live synopsis. + +#### Drift warning (stderr) + +After a **successful** selective run, the CLI prints **exactly one** line to **stderr** naming the store that was **not** rebuilt. **`--quiet` does not suppress this line** (quiet only affects subprocess verbosity). There is no extra exit code for drift; scripts should treat stderr as informational. 
+ +#### JSON payload: `phases_run` + +The stdout JSON includes an additive list field `phases_run`: which phases actually **spawned** subprocesses, in order (`"vectors"`, `"graph"`). Examples: + +- Default success after both phases: `["vectors", "graph"]` +- Default run where cocoindex fails before the graph step: `["vectors"]` (graph never started) +- `--vectors-only` success: `["vectors"]` +- `--graph-only` success: `["graph"]` +- Setup failure before any phase (missing cocoindex binary, missing bundled flow file, or pipeline preflight `126`/`127` stubs): `[]` + +Because `exit_code` and `graph_exit_code` can be `null` in multiple situations, **prefer branching on `phases_run` first**, then on the relevant per-phase exit field. **Asymmetry:** `--vectors-only` reports the cocoindex process in `exit_code` (and leaves `graph_exit_code` null); `--graph-only` leaves top-level `exit_code` null and reports the graph builder in `graph_exit_code`, so scripts that only read `exit_code` miss graph-only outcomes unless they branch on `phases_run` / `graph_exit_code`. ```bash java-codebase-rag reprocess --source-root /path/to/java/repo --index-dir /path/to/.java-codebase-rag --quiet +java-codebase-rag reprocess --source-root /path/to/java/repo --index-dir /path/to/.java-codebase-rag --vectors-only --quiet +java-codebase-rag reprocess --source-root /path/to/java/repo --index-dir /path/to/.java-codebase-rag --graph-only --quiet ``` ### `erase` @@ -192,7 +223,7 @@ java-codebase-rag analyze-pr --diff-file /tmp/pr.diff --source-root /path/to/jav ## Graph-only escape hatch -To rebuild **only** Kuzu (no Lance re-embed), call the graph builder directly: +Prefer **`java-codebase-rag reprocess --graph-only`** when you only need Kuzu rebuilt from the current source tree (the Lance vectors are left untouched and are not re-embedded).
To run the graph builder **without** going through the CLI (advanced / scripting): ```bash .venv/bin/python build_ast_graph.py --source-root /path/to/java/repo --kuzu-path /path/to/.java-codebase-rag/code_graph.kuzu --verbose diff --git a/java_codebase_rag/cli.py b/java_codebase_rag/cli.py index 491c5f7..9c5dea0 100644 --- a/java_codebase_rag/cli.py +++ b/java_codebase_rag/cli.py @@ -44,6 +44,55 @@ "This alias will be removed in the next release." ) +_REPROCESS_DRIFT_VECTORS_ONLY = ( + "java-codebase-rag reprocess: rebuilt vectors only; graph (code_graph.kuzu) was NOT rebuilt " + "and may now reflect a stale source snapshot." +) + + +def _reprocess_drift_graph_only_line(index_dir: Path) -> str: + return ( + "java-codebase-rag reprocess: rebuilt graph only; vectors (Lance tables under " + f"{index_dir}) were NOT rebuilt and may now reflect a stale source snapshot." + ) + + +def _reprocess_exit_code(payload: dict[str, Any]) -> int: + if payload.get("success"): + return 0 + phases_run = payload.get("phases_run") or [] + if not phases_run: + return 2 + return 1 + + +# Preflight detection must stay aligned with stub CompletedProcess shapes in +# java_codebase_rag/pipeline.py (missing cocoindex / flow / build_ast_graph.py). 
+def _is_cocoindex_preflight_blocker(coco: Any) -> bool: + """True when ``run_cocoindex_update`` returned without spawning cocoindex.""" + return bool(coco.returncode in (126, 127) and len(getattr(coco, "args", ()) or ()) <= 1) + + +def _is_graph_preflight_blocker(g: Any) -> bool: + """True when ``run_build_ast_graph`` returned without spawning the builder.""" + return bool(g.returncode in (126, 127) and len(getattr(g, "args", ()) or ()) <= 1) + + +def _emit_reprocess_selective_tty(*, mode: str) -> None: + if mode == "vectors": + print("Rebuilt: vectors") + print("Skipped: graph (use `java-codebase-rag reprocess --graph-only` or `reprocess` to refresh)") + else: + print("Rebuilt: graph") + print("Skipped: vectors (use `java-codebase-rag reprocess --vectors-only` or `reprocess` to refresh)") + + +def _emit_reprocess_outcome(payload: dict[str, Any], *, selective_tty_mode: str | None = None) -> None: + if payload.get("success") and selective_tty_mode and sys.stdout.isatty(): + _emit_reprocess_selective_tty(mode=selective_tty_mode) + return + _emit(payload) + def _jsonable(value: Any) -> Any: if hasattr(value, "model_dump"): @@ -176,18 +225,90 @@ def _cmd_increment(args: argparse.Namespace) -> int: def _cmd_reprocess(args: argparse.Namespace) -> int: - import server # lazy: pulls sentence_transformers/torch/lancedb/kuzu - cfg = _resolved_from_ns(args) _startup_hints(cfg) cfg.apply_to_os_environ() + env = cfg.subprocess_env() + vectors_only = bool(getattr(args, "vectors_only", False)) + graph_only = bool(getattr(args, "graph_only", False)) + + if vectors_only: + coco = run_cocoindex_update(env, full_reprocess=True, quiet=bool(args.quiet)) + if _is_cocoindex_preflight_blocker(coco): + payload: dict[str, Any] = { + "success": False, + "exit_code": None, + "stdout": clip(coco.stdout, 8000), + "stderr": clip(coco.stderr, 8000), + "message": coco.stderr.strip() or f"cocoindex setup exit {coco.returncode}", + "graph_exit_code": None, + "graph_stdout": "", + "graph_stderr": 
"", + "phases_run": [], + } + _emit_reprocess_outcome(payload) + return _reprocess_exit_code(payload) + ok = coco.returncode == 0 + payload = { + "success": ok, + "exit_code": coco.returncode, + "stdout": clip(coco.stdout, 8000), + "stderr": clip(coco.stderr, 8000), + "message": None if ok else f"cocoindex exit {coco.returncode}", + "graph_exit_code": None, + "graph_stdout": "", + "graph_stderr": "", + "phases_run": ["vectors"], + } + if ok: + print(_REPROCESS_DRIFT_VECTORS_ONLY, file=sys.stderr) + _emit_reprocess_outcome(payload, selective_tty_mode="vectors" if ok else None) + return _reprocess_exit_code(payload) + + if graph_only: + g = run_build_ast_graph( + source_root=cfg.source_root, + kuzu_path=cfg.kuzu_path, + verbose=not args.quiet, + env=env, + ) + if _is_graph_preflight_blocker(g): + payload = { + "success": False, + "exit_code": None, + "stdout": "", + "stderr": "", + "message": g.stderr.strip() or f"graph builder setup exit {g.returncode}", + "graph_exit_code": None, + "graph_stdout": clip(g.stdout, 4000), + "graph_stderr": clip(g.stderr, 4000), + "phases_run": [], + } + _emit_reprocess_outcome(payload) + return _reprocess_exit_code(payload) + ok = g.returncode == 0 + payload = { + "success": ok, + "exit_code": None, + "stdout": "", + "stderr": "", + "message": None if ok else f"graph builder exit {g.returncode}", + "graph_exit_code": g.returncode, + "graph_stdout": clip(g.stdout, 4000), + "graph_stderr": clip(g.stderr, 4000), + "phases_run": ["graph"], + } + if ok: + print(_reprocess_drift_graph_only_line(cfg.index_dir), file=sys.stderr) + _emit_reprocess_outcome(payload, selective_tty_mode="graph" if ok else None) + return _reprocess_exit_code(payload) + + import server # lazy: pulls sentence_transformers/torch/lancedb/kuzu + result = asyncio.run(server.run_refresh_pipeline(quiet=bool(args.quiet))) payload = result.model_dump() - if payload.get("success"): - _emit(payload) - return 0 - _emit(payload) - return 2 if payload.get("exit_code") is None 
else 1 + _emit_reprocess_outcome(payload) + return _reprocess_exit_code(payload) def _cmd_erase(args: argparse.Namespace) -> int: @@ -342,7 +463,7 @@ def build_parser() -> argparse.ArgumentParser: "Lifecycle (manage the index):\n" " init Create a fresh index from a Java repository.\n" " increment Pick up changes since the last index update (Lance only).\n" - " reprocess Rebuild the entire index from scratch.\n" + " reprocess Full vector + graph rebuild (default); optional --vectors-only / --graph-only.\n" " erase Delete the index from disk.\n\n" "Introspection (inspect the index):\n" " meta Print ontology version, edge counts, and table summary.\n" @@ -383,11 +504,25 @@ def build_parser() -> argparse.ArgumentParser: reprocess = subparsers.add_parser( "reprocess", - help="Rebuild the entire index from scratch.", - description="Full Lance reprocess plus Kuzu graph rebuild (same as the legacy refresh pipeline).", + help="Rebuild vectors and/or Kuzu (default: both full phases).", + description=( + "Default: full Lance reprocess (cocoindex --full-reprocess) then full Kuzu graph rebuild. " + "Use --vectors-only or --graph-only to run a single phase (mutually exclusive)." + ), ) _add_index_embedding_flags(reprocess) reprocess.add_argument("--quiet", action="store_true") + _rex = reprocess.add_mutually_exclusive_group() + _rex.add_argument( + "--vectors-only", + action="store_true", + help="Run only the Lance/cocoindex full reprocess phase (no graph builder).", + ) + _rex.add_argument( + "--graph-only", + action="store_true", + help="Run only build_ast_graph.py (no cocoindex / Lance reprocess).", + ) reprocess.set_defaults(handler=_cmd_reprocess) erase = subparsers.add_parser( diff --git a/server.py b/server.py index c8afc55..e4eade7 100644 --- a/server.py +++ b/server.py @@ -53,6 +53,15 @@ class GraphMetaOutput(BaseModel): class RefreshIndexOutput(BaseModel): + """Structured result for ``run_refresh_pipeline`` / CLI ``reprocess`` JSON. 
+ + ``phases_run`` records which phase subprocesses actually started; the CLI maps + failures to exit **2** when it is empty (setup / nothing spawned) and exit **1** + when it is non-empty (build failure). Callers constructing this model manually + must set ``phases_run`` accordingly — omitting it leaves the default ``[]``, + which the CLI treats like a preflight failure. + """ + success: bool exit_code: int | None = None stdout: str = "" @@ -61,6 +70,7 @@ class RefreshIndexOutput(BaseModel): graph_exit_code: int | None = None graph_stdout: str = "" graph_stderr: str = "" + phases_run: list[Literal["vectors", "graph"]] = Field(default_factory=list) class IndexInfoOutput(BaseModel): @@ -189,6 +199,7 @@ async def run_refresh_pipeline(*, quiet: bool = False) -> RefreshIndexOutput: return RefreshIndexOutput( success=False, message=f"cocoindex not found next to Python: {cocoindex_bin}", + phases_run=[], ) flow_path = root / "java_index_flow_lancedb.py" bundle_dir = Path(__file__).resolve().parent @@ -200,6 +211,7 @@ async def run_refresh_pipeline(*, quiet: bool = False) -> RefreshIndexOutput: return RefreshIndexOutput( success=False, message=f"java_index_flow_lancedb.py not found under {root} nor {bundle_dir}", + phases_run=[], ) try: proc = await asyncio.create_subprocess_exec( @@ -215,10 +227,15 @@ async def run_refresh_pipeline(*, quiet: bool = False) -> RefreshIndexOutput: ) out_b, err_b = await proc.communicate() except Exception as exc: - return RefreshIndexOutput(success=False, message=f"spawn failed: {exc!s}") + return RefreshIndexOutput( + success=False, + message=f"spawn failed: {exc!s}", + phases_run=[], + ) out = out_b.decode(errors="replace") err = err_b.decode(errors="replace") ok = proc.returncode == 0 + phases_run: list[Literal["vectors", "graph"]] = ["vectors"] graph_code: int | None = None graph_out = "" graph_err = "" @@ -243,6 +260,7 @@ async def run_refresh_pipeline(*, quiet: bool = False) -> RefreshIndexOutput: stdout=asyncio.subprocess.PIPE, 
stderr=asyncio.subprocess.PIPE, ) + phases_run = ["vectors", "graph"] gout_b, gerr_b = await gproc.communicate() graph_code = gproc.returncode graph_out = gout_b.decode(errors="replace") @@ -264,6 +282,7 @@ async def run_refresh_pipeline(*, quiet: bool = False) -> RefreshIndexOutput: graph_exit_code=graph_code, graph_stdout=graph_out[-4000:] if len(graph_out) > 4000 else graph_out, graph_stderr=graph_err[-4000:] if len(graph_err) > 4000 else graph_err, + phases_run=phases_run, ) diff --git a/tests/test_java_codebase_rag_cli.py b/tests/test_java_codebase_rag_cli.py index 0c6a977..ba86fee 100644 --- a/tests/test_java_codebase_rag_cli.py +++ b/tests/test_java_codebase_rag_cli.py @@ -506,6 +506,304 @@ def test_cli_analyze_pr_with_diff_stdin(corpus_root, kuzu_db_path) -> None: assert "risk_score" in payload +def test_reprocess_vectors_only_skips_graph( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +) -> None: + idx = tmp_path / "idx_vo" + idx.mkdir() + monkeypatch.setenv("JAVA_CODEBASE_RAG_INDEX_DIR", str(idx)) + monkeypatch.setenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", str(tmp_path)) + + def fake_coco(*_a: object, **_k: object) -> subprocess.CompletedProcess[str]: + return subprocess.CompletedProcess( + args=["coco", "u", "t", "f"], + returncode=0, + stdout="", + stderr="", + ) + + def graph_should_not_run(**_kwargs: object) -> subprocess.CompletedProcess[str]: + raise AssertionError("graph builder must not run for --vectors-only") + + monkeypatch.setattr(cli_mod, "run_cocoindex_update", fake_coco) + monkeypatch.setattr(cli_mod, "run_build_ast_graph", graph_should_not_run) + + class _NonTty(io.StringIO): + def isatty(self) -> bool: + return False + + nout = _NonTty() + monkeypatch.setattr(cli_mod.sys, "stdout", nout) + rc = cli_mod.main( + ["reprocess", "--source-root", str(tmp_path), "--index-dir", str(idx), "--vectors-only"], + ) + assert rc == 0 + payload = json.loads(nout.getvalue()) + assert payload["phases_run"] == ["vectors"] + + +def 
test_reprocess_graph_only_skips_vectors( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +) -> None: + idx = tmp_path / "idx_go" + idx.mkdir() + monkeypatch.setenv("JAVA_CODEBASE_RAG_INDEX_DIR", str(idx)) + monkeypatch.setenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", str(tmp_path)) + + def coco_should_not_run(*_a: object, **_k: object) -> subprocess.CompletedProcess[str]: + raise AssertionError("cocoindex must not run for --graph-only") + + def fake_graph(**_kwargs: object) -> subprocess.CompletedProcess[str]: + return subprocess.CompletedProcess( + args=["py", "build_ast_graph.py"], + returncode=0, + stdout="", + stderr="", + ) + + monkeypatch.setattr(cli_mod, "run_cocoindex_update", coco_should_not_run) + monkeypatch.setattr(cli_mod, "run_build_ast_graph", fake_graph) + out = io.StringIO() + monkeypatch.setattr(cli_mod.sys, "stdout", out) + rc = cli_mod.main( + ["reprocess", "--source-root", str(tmp_path), "--index-dir", str(idx), "--graph-only"], + ) + assert rc == 0 + assert json.loads(out.getvalue())["phases_run"] == ["graph"] + + +def test_reprocess_mutually_exclusive_flags_rejected(tmp_path: Path) -> None: + buf = io.StringIO() + with contextlib.redirect_stderr(buf): + rc = cli_mod.main( + [ + "reprocess", + "--source-root", + str(tmp_path), + "--vectors-only", + "--graph-only", + ], + ) + assert rc == 2 + err = buf.getvalue() + assert "not allowed with argument" in err or "mutually exclusive" in err.lower() + + +def test_reprocess_graph_only_build_failure_returns_exit_1( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +) -> None: + idx = tmp_path / "idx_gf" + idx.mkdir() + monkeypatch.setenv("JAVA_CODEBASE_RAG_INDEX_DIR", str(idx)) + monkeypatch.setenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", str(tmp_path)) + + def fake_graph(**_kwargs: object) -> subprocess.CompletedProcess[str]: + return subprocess.CompletedProcess( + args=["py", "build_ast_graph.py"], + returncode=9, + stdout="", + stderr="boom", + ) + + monkeypatch.setattr(cli_mod, "run_build_ast_graph", 
fake_graph) + out = io.StringIO() + monkeypatch.setattr(cli_mod.sys, "stdout", out) + rc = cli_mod.main( + ["reprocess", "--source-root", str(tmp_path), "--index-dir", str(idx), "--graph-only"], + ) + assert rc == 1 + payload = json.loads(out.getvalue()) + assert payload["phases_run"] == ["graph"] + assert payload["graph_exit_code"] == 9 + + +def test_reprocess_vectors_only_emits_graph_stale_warning( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +) -> None: + idx = tmp_path / "idx_wv" + idx.mkdir() + monkeypatch.setenv("JAVA_CODEBASE_RAG_INDEX_DIR", str(idx)) + monkeypatch.setenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", str(tmp_path)) + + def fake_coco(*_a: object, **_k: object) -> subprocess.CompletedProcess[str]: + return subprocess.CompletedProcess( + args=["coco", "u", "t", "f"], + returncode=0, + stdout="", + stderr="", + ) + + monkeypatch.setattr(cli_mod, "run_cocoindex_update", fake_coco) + monkeypatch.setattr( + cli_mod, + "run_build_ast_graph", + lambda **_k: subprocess.CompletedProcess(args=[], returncode=0, stdout="", stderr=""), + ) + err = io.StringIO() + out = io.StringIO() + monkeypatch.setattr(cli_mod.sys, "stderr", err) + monkeypatch.setattr(cli_mod.sys, "stdout", out) + rc = cli_mod.main( + ["reprocess", "--source-root", str(tmp_path), "--index-dir", str(idx), "--vectors-only"], + ) + assert rc == 0 + assert "code_graph.kuzu" in err.getvalue() + + +def test_reprocess_graph_only_emits_vectors_stale_warning( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +) -> None: + idx = tmp_path / "idx_wg" + idx.mkdir() + monkeypatch.setenv("JAVA_CODEBASE_RAG_INDEX_DIR", str(idx)) + monkeypatch.setenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", str(tmp_path)) + + def fake_graph(**_kwargs: object) -> subprocess.CompletedProcess[str]: + return subprocess.CompletedProcess( + args=["py", "build_ast_graph.py"], + returncode=0, + stdout="", + stderr="", + ) + + monkeypatch.setattr(cli_mod, "run_build_ast_graph", fake_graph) + err = io.StringIO() + out = io.StringIO() + 
monkeypatch.setattr(cli_mod.sys, "stderr", err) + monkeypatch.setattr(cli_mod.sys, "stdout", out) + rc = cli_mod.main( + ["reprocess", "--source-root", str(tmp_path), "--index-dir", str(idx), "--graph-only"], + ) + assert rc == 0 + assert "Lance tables under" in err.getvalue() + assert str(idx) in err.getvalue() + + +def test_reprocess_vectors_only_setup_failure_returns_exit_2_without_phase( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +) -> None: + idx = tmp_path / "idx_vs" + idx.mkdir() + monkeypatch.setenv("JAVA_CODEBASE_RAG_INDEX_DIR", str(idx)) + monkeypatch.setenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", str(tmp_path)) + + def fake_coco(*_a: object, **_k: object) -> subprocess.CompletedProcess[str]: + return subprocess.CompletedProcess( + args=["/nonexistent/cocoindex"], + returncode=127, + stdout="", + stderr="cocoindex not found next to Python", + ) + + monkeypatch.setattr(cli_mod, "run_cocoindex_update", fake_coco) + err = io.StringIO() + out = io.StringIO() + monkeypatch.setattr(cli_mod.sys, "stderr", err) + monkeypatch.setattr(cli_mod.sys, "stdout", out) + rc = cli_mod.main( + ["reprocess", "--source-root", str(tmp_path), "--index-dir", str(idx), "--vectors-only"], + ) + assert rc == 2 + assert json.loads(out.getvalue())["phases_run"] == [] + assert "rebuilt vectors only" not in err.getvalue().lower() + + +def test_reprocess_graph_only_setup_failure_returns_exit_2_without_phase( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +) -> None: + idx = tmp_path / "idx_gs" + idx.mkdir() + monkeypatch.setenv("JAVA_CODEBASE_RAG_INDEX_DIR", str(idx)) + monkeypatch.setenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", str(tmp_path)) + + def fake_graph(**_kwargs: object) -> subprocess.CompletedProcess[str]: + return subprocess.CompletedProcess( + args=[], + returncode=126, + stdout="", + stderr="build_ast_graph.py not found", + ) + + monkeypatch.setattr(cli_mod, "run_build_ast_graph", fake_graph) + err = io.StringIO() + out = io.StringIO() + monkeypatch.setattr(cli_mod.sys, 
"stderr", err) + monkeypatch.setattr(cli_mod.sys, "stdout", out) + rc = cli_mod.main( + ["reprocess", "--source-root", str(tmp_path), "--index-dir", str(idx), "--graph-only"], + ) + assert rc == 2 + assert json.loads(out.getvalue())["phases_run"] == [] + assert "rebuilt graph only" not in err.getvalue().lower() + + +def test_reprocess_no_flag_cocoindex_failure_records_vectors_only( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +) -> None: + import server as server_mod + + idx = tmp_path / "idx_nf" + idx.mkdir() + monkeypatch.setenv("JAVA_CODEBASE_RAG_INDEX_DIR", str(idx)) + monkeypatch.setenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", str(tmp_path)) + + async def fake_refresh(*, quiet: bool = False) -> server_mod.RefreshIndexOutput: + return server_mod.RefreshIndexOutput( + success=False, + exit_code=1, + stdout="out", + stderr="err", + message="cocoindex exit 1", + graph_exit_code=None, + graph_stdout="", + graph_stderr="", + phases_run=["vectors"], + ) + + monkeypatch.setattr(server_mod, "run_refresh_pipeline", fake_refresh) + out = io.StringIO() + monkeypatch.setattr(cli_mod.sys, "stdout", out) + rc = cli_mod.main( + ["reprocess", "--source-root", str(tmp_path), "--index-dir", str(idx)], + ) + assert rc == 1 + payload = json.loads(out.getvalue()) + assert payload["phases_run"] == ["vectors"] + + +def test_reprocess_pretty_output_lists_rebuilt_and_skipped( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +) -> None: + idx = tmp_path / "idx_po" + idx.mkdir() + monkeypatch.setenv("JAVA_CODEBASE_RAG_INDEX_DIR", str(idx)) + monkeypatch.setenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", str(tmp_path)) + + def fake_coco(*_a: object, **_k: object) -> subprocess.CompletedProcess[str]: + return subprocess.CompletedProcess( + args=["coco", "u", "t", "f"], + returncode=0, + stdout="", + stderr="", + ) + + monkeypatch.setattr(cli_mod, "run_cocoindex_update", fake_coco) + + class TtyOut(io.StringIO): + def isatty(self) -> bool: + return True + + tty = TtyOut() + 
monkeypatch.setattr(cli_mod.sys, "stdout", tty) + rc = cli_mod.main( + ["reprocess", "--source-root", str(tmp_path), "--index-dir", str(idx), "--vectors-only"], + ) + assert rc == 0 + text = tty.getvalue() + assert "Rebuilt: vectors" in text + assert "Skipped: graph" in text + + def test_cli_reprocess_builds_kuzu_path(corpus_root, tmp_path) -> None: if not _cocoindex_available(): pytest.skip("cocoindex CLI missing")