From b22bee68732904306dfe320983b981914188345e Mon Sep 17 00:00:00 2001 From: ZhangDapao Date: Sat, 14 Mar 2026 11:43:55 +0800 Subject: [PATCH] feat(rag): add local retrieval pipeline and search tooling --- .gitignore | 4 + .opencode/plugins/rag_context.ts | 338 +++++++++++++++++ .opencode/rag.ts | 428 +++++++++++++++++++++ .opencode/skills/rag-pipeline/SKILL.md | 93 +++++ .opencode/tool/rag_search.ts | 149 ++++++++ .opencode/tool/rag_search.txt | 17 + script/rag/build-offline-bundle.sh | 132 +++++++ script/rag/build-vector-index.py | 402 ++++++++++++++++++++ script/rag/clean-text.py | 52 +++ script/rag/cmd/rag-bootstrap.sh | 82 ++++ script/rag/cmd/rag-init.sh | 14 + script/rag/cmd/rag-update.sh | 14 + script/rag/compare-structured.py | 52 +++ script/rag/convert-dir-to-text.sh | 136 +++++++ script/rag/debug-rag-state.py | 91 +++++ script/rag/install-docling.sh | 91 +++++ script/rag/install-offline-bundle.sh | 95 +++++ script/rag/install-tesseract.sh | 72 ++++ script/rag/install-vector.sh | 79 ++++ script/rag/merge-image-ocr.py | 97 +++++ script/rag/rag-pipeline.py | 427 +++++++++++++++++++++ script/rag/requirements-docling.txt | 1 + script/rag/requirements-llamaindex.txt | 2 + script/rag/requirements-vector.txt | 2 + script/rag/search-vector-index.py | 365 ++++++++++++++++++ script/rag/structure-text.py | 307 +++++++++++++++ script/rag/url-to-text.sh | 449 ++++++++++++++++++++++ specs/rag-docling-deploy.zh.md | 507 +++++++++++++++++++++++++ specs/rag-enhance-architecture.zh.md | 266 +++++++++++++ specs/rag-llm-prompt-protocol.zh.md | 309 +++++++++++++++ specs/rag-progressive-disclosure.zh.md | 365 ++++++++++++++++++ specs/rag-updates-history.zh.md | 226 +++++++++++ 32 files changed, 5664 insertions(+) create mode 100644 .opencode/plugins/rag_context.ts create mode 100644 .opencode/rag.ts create mode 100644 .opencode/skills/rag-pipeline/SKILL.md create mode 100644 .opencode/tool/rag_search.ts create mode 100644 .opencode/tool/rag_search.txt create mode 100755 
script/rag/build-offline-bundle.sh create mode 100755 script/rag/build-vector-index.py create mode 100755 script/rag/clean-text.py create mode 100755 script/rag/cmd/rag-bootstrap.sh create mode 100644 script/rag/cmd/rag-init.sh create mode 100644 script/rag/cmd/rag-update.sh create mode 100755 script/rag/compare-structured.py create mode 100755 script/rag/convert-dir-to-text.sh create mode 100755 script/rag/debug-rag-state.py create mode 100755 script/rag/install-docling.sh create mode 100755 script/rag/install-offline-bundle.sh create mode 100755 script/rag/install-tesseract.sh create mode 100755 script/rag/install-vector.sh create mode 100755 script/rag/merge-image-ocr.py create mode 100644 script/rag/rag-pipeline.py create mode 100644 script/rag/requirements-docling.txt create mode 100644 script/rag/requirements-llamaindex.txt create mode 100644 script/rag/requirements-vector.txt create mode 100644 script/rag/search-vector-index.py create mode 100755 script/rag/structure-text.py create mode 100755 script/rag/url-to-text.sh create mode 100644 specs/rag-docling-deploy.zh.md create mode 100644 specs/rag-enhance-architecture.zh.md create mode 100644 specs/rag-llm-prompt-protocol.zh.md create mode 100644 specs/rag-progressive-disclosure.zh.md create mode 100644 specs/rag-updates-history.zh.md diff --git a/.gitignore b/.gitignore index bf78c046d4b..1efa395e597 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,10 @@ a.out target .scripts .direnv/ +.venv-docling/ +.rag/ +__pycache__/ +*.pyc # Local dev files opencode-dev diff --git a/.opencode/plugins/rag_context.ts b/.opencode/plugins/rag_context.ts new file mode 100644 index 00000000000..7191740c3d5 --- /dev/null +++ b/.opencode/plugins/rag_context.ts @@ -0,0 +1,338 @@ +/// +import type { Plugin } from "@opencode-ai/plugin" +import path from "path" +import { + allow, + allowExpand, + audit, + base, + cluster, + collection, + decide, + key, + model, + parse, + py, + reset, + rewriteMode, + rewriteModel, + 
rewriteQueries, + reuseSec, + root, + row, + session, + stateBlock, + strip, + summary, + topk, + chars, + db, +} from "../rag" + +type Msg = { + info?: { + role?: string + id?: string + sessionID?: string + sessionId?: string + } + parts?: Array<{ + type?: string + text?: string + synthetic?: boolean + }> +} + +function sid(msgs: Msg[], idx: number) { + const direct = msgs[idx]?.info?.sessionID || msgs[idx]?.info?.sessionId + if (direct) return String(direct) + for (let i = idx; i >= 0; i--) { + const v = msgs[i]?.info?.sessionID || msgs[i]?.info?.sessionId + if (v) return String(v) + } + return "default" +} + +function uid(msgs: Msg[], idx: number) { + const v = msgs[idx]?.info?.id + if (!v) return "" + return String(v) +} + +function next(status: string) { + if (status === "new_evidence") return "call_rag_search_delta_if_needed" + if (status === "weak_match") return "call_rag_search_delta_or_refine_query" + if (status === "no_new_evidence") return "reuse_known_state_or_call_rag_search_state" + if (status === "cluster_throttled") return "avoid_repeating_same_search" + if (status === "retrieval_error") return "retry_or_check_rag_backend" + return "refine_query_or_call_rag_search" +} + +function mark( + hit: ReturnType, + input: { query: string; status: string; reason: string; total?: number; rewrites?: string[] }, +) { + hit.last_query = input.query + hit.last_status = input.status + hit.last_reason = input.reason + hit.last_checked = Date.now() + hit.total_hits = input.total || 0 + hit.delta = [] + hit.hits = [] + hit.top = [] + hit.overlap = 0 + hit.rewrites = input.rewrites || [input.query] +} + +const RagContextPlugin: Plugin = async ({ worktree, $ }) => { + return { + "tool.definition": async (input, output) => { + if (input.toolID !== "rag_search") return + output.description = [ + output.description, + "", + "Call this tool with valid JSON arguments only.", + 'Use query as a plain string value. 
Do not insert extra quotes inside the query string.', + 'Valid example: {"query":"luckfox-pico zero 传输文件方式","mode":"delta","node_type":"text","top_k":3}', + 'Invalid example: {"query":"luck"fox-pico zero","mode":"brief"}', + ].join("\n") + }, + "tool.execute.before": async (input, output) => { + if (input.tool !== "rag_search") return + if (allowExpand()) return + if (output.args?.mode !== "expand") return + output.args = { + ...output.args, + mode: "delta", + top_k: Math.min(Number(output.args?.top_k || 3), 3), + } + }, + "experimental.chat.messages.transform": async (_input, output) => { + if (process.env.RAG_AUTO_INJECT === "0") return + const msgs = output.messages as Msg[] + if (!Array.isArray(msgs) || !msgs.length) return + let idx = -1 + for (let i = msgs.length - 1; i >= 0; i--) { + if (msgs[i].info?.role === "user") { + idx = i + break + } + } + if (idx < 0) return + const loop = msgs.slice(idx + 1).some((msg) => msg.info?.role === "assistant") + const parts = Array.isArray(msgs[idx].parts) ? 
msgs[idx].parts : [] + let textPart: { type?: string; text?: string; synthetic?: boolean } | undefined + for (let i = parts.length - 1; i >= 0; i--) { + const part = parts[i] + if (part?.type === "text" && typeof part.text === "string" && !part.synthetic) { + textPart = part + break + } + } + if (!textPart?.text) return + + const clean = strip(textPart.text) + const query = clean.trim().slice(0, 800) + if (!query) return + + const sessionID = sid(msgs, idx) + const userID = uid(msgs, idx) + const st = session(sessionID) + const keyName = cluster(query) + const hit = row(st, keyName) + const now = Date.now() + const baseDir = root(worktree) + const python = py(baseDir) + const script = path.join(baseDir, "script", "rag", "search-vector-index.py") + const dbPath = db(baseDir) + const same = st.last_user_id === userID && st.last_query === query && st.last_cluster === keyName + const cached = !!hit.last_status && (loop || (same && now - hit.last_checked <= reuseSec() * 1000)) + + if (cached) { + textPart.text = `${clean}\n\n${stateBlock(keyName, hit, next(hit.last_status))}` + await audit(worktree, { + channel: "rag_context", + event: "context_meta", + sessionID, + userID, + query, + cluster: keyName, + loop, + used_cache: true, + status: hit.last_status, + reason: hit.last_reason, + total_hits: hit.total_hits, + delta_hits: hit.delta.length, + known_hits: hit.known_hits, + overlap: hit.overlap, + rewrites: hit.rewrites, + top_hits: summary(hit.top, 3), + emitted_context: false, + }) + return + } + + if (!allow(hit)) { + mark(hit, { + query, + status: "cluster_throttled", + reason: "cluster_window_limit", + }) + st.last_user_id = userID + st.last_query = query + st.last_cluster = keyName + textPart.text = `${clean}\n\n${stateBlock(keyName, hit, next(hit.last_status))}` + await audit(worktree, { + channel: "rag_context", + event: "context_meta", + sessionID, + userID, + query, + cluster: keyName, + loop, + used_cache: false, + status: hit.last_status, + reason: 
hit.last_reason, + total_hits: hit.total_hits, + delta_hits: hit.delta.length, + known_hits: hit.known_hits, + overlap: hit.overlap, + rewrites: hit.rewrites, + top_hits: [], + emitted_context: false, + }) + return + } + + const res = + await $`${python} ${script} --query ${query} --db-path ${dbPath} --collection ${collection()} --model ${model()} --top-k ${topk()} --node-type text --show-text-chars ${chars()} --base-url ${base()} --api-key ${key()} --format json --rewrite ${rewriteMode()} --rewrite-model ${rewriteModel()} --rewrite-queries ${rewriteQueries()}` + .quiet() + .nothrow() + const raw = res.stdout.toString() + + if (res.exitCode !== 0) { + mark(hit, { + query, + status: "retrieval_error", + reason: "backend_error", + }) + st.last_user_id = userID + st.last_query = query + st.last_cluster = keyName + textPart.text = `${clean}\n\n${stateBlock(keyName, hit, next(hit.last_status))}` + await audit(worktree, { + channel: "rag_context", + event: "search_fail", + sessionID, + userID, + query, + cluster: keyName, + loop, + code: res.exitCode, + stderr: res.stderr.toString().slice(0, 1200), + status: hit.last_status, + reason: hit.last_reason, + emitted_context: false, + }) + return + } + + let resData = { hits: [], rewrites: [query], keywords: [], rewrite_mode: "none" } as ReturnType + try { + resData = parse(raw) + } catch { + mark(hit, { + query, + status: "retrieval_error", + reason: "parse_error", + }) + st.last_user_id = userID + st.last_query = query + st.last_cluster = keyName + textPart.text = `${clean}\n\n${stateBlock(keyName, hit, next(hit.last_status))}` + await audit(worktree, { + channel: "rag_context", + event: "parse_fail", + sessionID, + userID, + query, + cluster: keyName, + loop, + raw: raw.slice(0, 1200), + status: hit.last_status, + reason: hit.last_reason, + emitted_context: false, + }) + return + } + + const out = decide(hit, resData.hits, query, resData.rewrites) + st.last_user_id = userID + st.last_query = query + st.last_cluster = 
keyName + textPart.text = `${clean}\n\n${stateBlock(keyName, hit, out.next)}` + await audit(worktree, { + channel: "rag_context", + event: "context_search", + sessionID, + userID, + query, + cluster: keyName, + loop, + used_cache: false, + status: out.status, + reason: out.reason, + total_hits: out.total, + delta_hits: out.delta.length, + known_hits: out.known, + overlap: out.overlap, + rewrite_mode: resData.rewrite_mode, + rewrites: hit.rewrites, + keywords: resData.keywords, + top_hits: summary(hit.top, 3), + delta_fps: out.delta.map((x) => ({ + fp: `${x.text_file || x.source_url || ""}#${x.chunk_id || x.image_id || x.section_title || ""}`, + source_url: x.source_url || "", + section_title: x.section_title || "", + chunk_id: x.chunk_id || "", + })), + emitted_context: false, + }) + }, + "experimental.chat.system.transform": async (_input, output) => { + if (process.env.RAG_AUTO_INJECT === "0") return + output.system.push("RAG protocol: parse on every model step. rag_context injects retrieval meta only, not full evidence.") + output.system.push( + "If rag_state status=new_evidence and you still need facts, call rag_search with mode=delta first. Use mode=brief only when delta is insufficient.", + ) + output.system.push( + "If rag_state status=no_new_evidence, reuse current state. Do not repeat the same retrieval unless the query becomes more specific.", + ) + output.system.push( + "Do not call rag_search mode=expand in normal QA. Use expand only for explicit debugging or evidence inspection.", + ) + output.system.push( + "Do not execute script/rag/search-vector-index.py directly from shell for QA retrieval. Use rag_search only.", + ) + output.system.push( + 'When calling rag_search, emit valid JSON arguments. 
query must be one plain string value, without nested or broken quotation marks.', + ) + output.system.push( + "For long or noisy questions, trust rag_state rewrite metadata and prefer rag_search results derived from rewritten retrieval queries.", + ) + }, + "experimental.session.compacting": async (input, output) => { + const id = String((input as { sessionID?: string })?.sessionID || "default") + const st = reset(id) + await audit(worktree, { + channel: "rag_context", + event: "state_reset", + sessionID: id, + epoch: st.epoch, + }) + return output + }, + } +} + +export default RagContextPlugin diff --git a/.opencode/rag.ts b/.opencode/rag.ts new file mode 100644 index 00000000000..042399e9831 --- /dev/null +++ b/.opencode/rag.ts @@ -0,0 +1,428 @@ +import path from "path" +import { appendFile, mkdir } from "node:fs/promises" + +export type Hit = { + score?: number + rerank_score?: number + source_url?: string + section_title?: string + text_preview?: string + chunk_id?: string + image_id?: string + text_file?: string + matched_queries?: string[] + hit_count?: number +} + +export type SearchResult = { + hits: Hit[] + rewrites: string[] + keywords: string[] + rewrite_mode: string +} + +type Row = { + seen: Set + window: number[] + last_query: string + last_status: string + last_reason: string + last_checked: number + total_hits: number + known_hits: number + overlap: number + delta: Hit[] + hits: Hit[] + top: Hit[] + rewrites: string[] +} + +type Session = { + epoch: number + last_user_id: string + last_query: string + last_cluster: string + rows: Map +} + +const STORE = new Map() +const STOP = new Set([ + "的", + "了", + "和", + "是", + "怎么", + "如何", + "请问", + "一下", + "关于", + "教程", + "方法", + "方式", + "what", + "how", + "the", + "a", + "an", + "to", + "for", + "of", + "in", +]) +const SYN: Record = { + flash: "烧录", + burn: "烧录", + firmware: "固件", + image: "镜像", + electerm: "electerm", + luckfox: "luckfox", + pico: "pico", + zero: "zero", +} + +export function topk() { + 
const n = Number.parseInt(process.env.RAG_TOP_K ?? "4", 10) + if (Number.isFinite(n) && n > 0) return n + return 4 +} + +export function use() { + const n = Number.parseInt(process.env.RAG_CONTEXT_HITS ?? "2", 10) + if (Number.isFinite(n) && n > 0) return n + return 2 +} + +export function chars() { + const n = Number.parseInt(process.env.RAG_CONTEXT_CHARS ?? "120", 10) + if (Number.isFinite(n) && n >= 40) return n + return 120 +} + +export function expandChars() { + const n = Number.parseInt(process.env.RAG_EXPAND_CHARS ?? "420", 10) + if (Number.isFinite(n) && n >= 120) return n + return 420 +} + +export function simCut() { + const n = Number.parseFloat(process.env.RAG_OVERLAP_THRESHOLD ?? "0.8") + if (Number.isFinite(n) && n > 0 && n <= 1) return n + return 0.8 +} + +export function weakCut() { + const n = Number.parseFloat(process.env.RAG_WEAK_SCORE ?? "0.42") + if (Number.isFinite(n) && n > 0 && n < 1) return n + return 0.42 +} + +export function clusterWindowSec() { + const n = Number.parseInt(process.env.RAG_CLUSTER_WINDOW_SEC ?? "30", 10) + if (Number.isFinite(n) && n > 0) return n + return 30 +} + +export function clusterMax() { + const n = Number.parseInt(process.env.RAG_CLUSTER_MAX_FULL ?? "2", 10) + if (Number.isFinite(n) && n > 0) return n + return 2 +} + +export function reuseSec() { + const n = Number.parseInt(process.env.RAG_REUSE_SEC ?? "8", 10) + if (Number.isFinite(n) && n >= 0) return n + return 8 +} + +export function model() { + const v = process.env.RAG_EMBED_MODEL + if (v) return v + return "qwen3-embedding:4b" +} + +export function rewriteMode() { + const v = process.env.RAG_REWRITE_MODE + if (v) return v + return "auto" +} + +export function rewriteModel() { + const v = process.env.RAG_REWRITE_MODEL + if (v) return v + return process.env.RAG_STRUCT_MODEL || process.env.OPENAI_MODEL || "gpt-4o-mini" +} + +export function rewriteQueries() { + const n = Number.parseInt(process.env.RAG_REWRITE_QUERIES ?? 
"3", 10) + if (Number.isFinite(n) && n > 0) return n + return 3 +} + +export function collection() { + const v = process.env.RAG_COLLECTION + if (v) return v + return "rag_chunks" +} + +export function base() { + const v = process.env.RAG_BASE_URL || process.env.OPENAI_BASE_URL + if (v) return v + return "http://127.0.0.1:11434/v1" +} + +export function key() { + const v = process.env.RAG_API_KEY || process.env.OPENAI_API_KEY || process.env.MINIMAX_API_KEY + if (v) return v + return "ollama" +} + +export function debug() { + return process.env.RAG_DEBUG_LOG === "1" || process.env.RAG_DEBUG === "1" +} + +export function allowExpand() { + return process.env.RAG_ALLOW_EXPAND_TOOL === "1" +} + +export function root(input: string) { + const env = process.env.RAG_WORKTREE + if (env) return env + if (input && input !== "/") return input + return process.cwd() +} + +export function py(rootDir: string) { + const env = process.env.RAG_DOCLING_PYTHON_BIN + if (env) return env + return path.join(rootDir, ".venv-docling", "bin", "python") +} + +export function db(rootDir: string) { + const env = process.env.RAG_DB_PATH + if (env) return env + return path.join(rootDir, ".rag", "vector", "qdrant") +} + +export function clip(text: string, n: number) { + const s = String(text || "").replace(/\s+/g, " ").trim() + if (s.length <= n) return s + return `${s.slice(0, n).trim()} ...` +} + +export function strip(text: string) { + return text + .replace(/\n*[\s\S]*?<\/rag_context>\n*/g, "\n") + .replace(/\n*[\s\S]*?<\/rag_state>\n*/g, "\n") + .replace(/\n{3,}/g, "\n\n") + .trim() +} + +export function terms(query: string) { + const rows = (query.toLowerCase().match(/[\p{Script=Han}]+|[a-z0-9_-]+/gu) || []) + .map((x) => x.trim()) + .filter(Boolean) + const out: string[] = [] + for (const raw of rows) { + const v = SYN[raw] || raw + if (!v || STOP.has(v)) continue + out.push(v) + } + return [...new Set(out)].sort() +} + +export function cluster(query: string) { + const rows = terms(query) + 
if (!rows.length) return `q:${clip(query.toLowerCase(), 48)}` + return rows.slice(0, 8).join("|") +} + +export function fp(hit: Hit) { + const src = hit.text_file || hit.source_url || "" + const id = hit.chunk_id || hit.image_id || hit.section_title || clip(String(hit.text_preview || ""), 36) + return `${src}#${id}` +} + +export function parse(raw: string) { + const data = JSON.parse(raw) + const hits = Array.isArray(data?.hits) ? data.hits : [] + const rewrites = Array.isArray(data?.rewrite?.queries) ? data.rewrite.queries.filter((x: unknown) => typeof x === "string") : [] + const keywords = Array.isArray(data?.rewrite?.keywords) ? data.rewrite.keywords.filter((x: unknown) => typeof x === "string") : [] + return { + hits: hits as Hit[], + rewrites, + keywords, + rewrite_mode: String(data?.rewrite?.mode || "none"), + } as SearchResult +} + +export function session(id: string) { + const cur = STORE.get(id) + if (cur) return cur + const next: Session = { + epoch: 0, + last_user_id: "", + last_query: "", + last_cluster: "", + rows: new Map(), + } + STORE.set(id, next) + return next +} + +export function row(st: Session, key: string) { + const cur = st.rows.get(key) + if (cur) return cur + const next: Row = { + seen: new Set(), + window: [], + last_query: "", + last_status: "", + last_reason: "", + last_checked: 0, + total_hits: 0, + known_hits: 0, + overlap: 0, + delta: [], + hits: [], + top: [], + rewrites: [], + } + st.rows.set(key, next) + return next +} + +export function allow(row: Row) { + const now = Date.now() + const win = clusterWindowSec() * 1000 + row.window = row.window.filter((x) => now - x <= win) + if (row.window.length >= clusterMax()) return false + row.window.push(now) + return true +} + +export function decide(row: Row, hits: Hit[], query: string, rewrites?: string[]) { + const keys = hits.map(fp) + const fresh = hits.filter((hit) => !row.seen.has(fp(hit))) + const shared = keys.filter((key) => row.seen.has(key)).length + const ov = keys.length ? 
shared / keys.length : 0 + const top = Number(hits[0]?.score || 0) + const status = !hits.length + ? "need_refine" + : !fresh.length && ov >= simCut() + ? "no_new_evidence" + : top < weakCut() + ? "weak_match" + : "new_evidence" + const reason = !hits.length + ? "empty_hits" + : !fresh.length && ov >= simCut() + ? "high_overlap" + : top < weakCut() + ? "low_score" + : fresh.length < hits.length + ? "delta_available" + : "fresh_hits" + const next = + status === "need_refine" + ? "refine_query_or_call_rag_search" + : status === "no_new_evidence" + ? "reuse_known_evidence_or_call_rag_search_state" + : status === "weak_match" + ? "call_rag_search_delta_or_refine_query" + : "call_rag_search_delta_if_more_detail_needed" + for (const key of keys) row.seen.add(key) + row.last_query = query + row.last_status = status + row.last_reason = reason + row.last_checked = Date.now() + row.total_hits = hits.length + row.known_hits = row.seen.size + row.overlap = ov + row.delta = fresh + row.hits = hits + row.top = hits.slice(0, 3) + row.rewrites = rewrites && rewrites.length ? 
rewrites : [query] + return { status, reason, next, overlap: ov, delta: fresh, hits, known: row.known_hits, total: hits.length } +} + +export function stateBlock(key: string, row: Row, next?: string) { + const top = row.top[0] + return [ + "", + `status=${row.last_status || "need_refine"}`, + `reason=${row.last_reason || "empty_hits"}`, + `cluster=${key}`, + `total_hits=${row.total_hits}`, + `delta_hits=${row.delta.length}`, + `known_hits=${row.known_hits}`, + `overlap=${Number(row.overlap || 0).toFixed(4)}`, + `top_source=${top?.source_url || ""}`, + `top_section=${clip(top?.section_title || "", 48)}`, + `rewrite_queries=${JSON.stringify(row.rewrites)}`, + `next_action=${next || "call_rag_search_delta_if_needed"}`, + "", + ].join("\n") +} + +export function brief(hits: Hit[], limit: number) { + if (!hits.length) return "no_rag_hit" + return hits + .slice(0, Math.max(1, limit)) + .map((hit, i) => + [ + `[${i + 1}]`, + `source=${hit.source_url || ""}`, + `section=${clip(hit.section_title || "", 48)}`, + `summary=${clip(hit.text_preview || "", chars())}`, + ].join(" "), + ) + .join("\n") +} + +export function expand(hits: Hit[], limit: number) { + if (!hits.length) return "no_rag_hit" + return hits + .slice(0, Math.max(1, limit)) + .map((hit, i) => + [ + `[${i + 1}] score=${Number(hit.score || 0).toFixed(4)}`, + `source=${hit.source_url || ""}`, + `section=${hit.section_title || ""}`, + `chunk=${hit.chunk_id || hit.image_id || ""}`, + `text=${clip(hit.text_preview || "", expandChars())}`, + ].join("\n"), + ) + .join("\n\n") +} + +export function summary(hits: Hit[], limit: number) { + return hits.slice(0, Math.max(1, limit)).map((hit) => ({ + score: Number(hit.score || 0), + rerank_score: Number(hit.rerank_score || 0), + source_url: hit.source_url || "", + section_title: hit.section_title || "", + chunk_id: hit.chunk_id || "", + image_id: hit.image_id || "", + text_preview: clip(hit.text_preview || "", chars()), + fp: fp(hit), + matched_queries: 
Array.isArray(hit.matched_queries) ? hit.matched_queries : [], + hit_count: Number(hit.hit_count || 0), + })) +} + +export async function audit(worktree: string, data: Record) { + if (!debug()) return + const dir = path.join(root(worktree), ".rag", "log") + await mkdir(dir, { recursive: true }) + await appendFile(path.join(dir, "rag_debug.jsonl"), `${JSON.stringify({ ts: new Date().toISOString(), ...data })}\n`, "utf-8") +} + +export function reset(id: string) { + const st = session(id) + st.epoch += 1 + st.rows.clear() + st.last_user_id = "" + st.last_query = "" + st.last_cluster = "" + return st +} diff --git a/.opencode/skills/rag-pipeline/SKILL.md b/.opencode/skills/rag-pipeline/SKILL.md new file mode 100644 index 00000000000..5eb4081f8a7 --- /dev/null +++ b/.opencode/skills/rag-pipeline/SKILL.md @@ -0,0 +1,93 @@ +--- +name: rag-pipeline +description: Run standardized rag init/update pipeline with minimal options and manifest-based sync +compatibility: opencode +--- + +## Goal + +Use two commands only: + +1. `rag-init` for first build +2. `rag-update` for incremental sync + +If the target repo does not contain this pipeline yet, bootstrap first: + +```bash +bash script/rag/cmd/rag-bootstrap.sh --target +``` + +## Required Inputs + +1. source type: `structured` | `dir` | `url` +2. source path (or url list) +3. embedding model +4. collection name + +## Exposed Options + +Only expose these options to users by default: + +1. `--source` +2. `--struct-mode` + `--struct-model` +3. `--embed-model` +4. `--url` / `--url-file` / `--input-dir` / `--scan-dir` +5. `--collection` + +Keep low-level knobs hidden unless users ask explicitly: + +1. chunk size / overlap +2. OCR engine internals +3. 
retry/backoff internals + +## Commands + +### Initial build + +Structured-only init: + +```bash +bash script/rag/cmd/rag-init.sh --source structured --scan-dir .rag/text --glob "**/*.structured.json" --embed-model qwen3-embedding:4b --collection rag_chunks +``` + +Directory init: + +```bash +bash script/rag/cmd/rag-init.sh --source dir --input-dir --text-out-dir .rag/text/dir --embed-model qwen3-embedding:4b --collection rag_chunks +``` + +URL init: + +```bash +bash script/rag/cmd/rag-init.sh --source url --url --ocr-images --image-inline marker --url-text-dir .rag/text/url --embed-model qwen3-embedding:4b --collection rag_chunks +``` + +### Incremental update + +```bash +bash script/rag/cmd/rag-update.sh --source structured --scan-dir .rag/text --glob "**/*.structured.json" --embed-model qwen3-embedding:4b --collection rag_chunks +``` + +## Behavior Rules + +1. Do not expose chunk-size/overlap or low-level OCR internals unless user explicitly asks. +2. Keep defaults: + - `--struct-mode llamaindex` + - `--inline-ocr strip` + - `--image-inline marker` +3. If collection or embedding model changes, allow full rebuild. +4. Keep state in `--manifest` (default `.rag/state/manifest.json`) to support incremental update. +5. Runtime retrieval policy: + - prefer plugin auto-inject with `` meta on every model step + - use `rag_search` to progressively reveal evidence text + - avoid repeated retrieval in the same query cluster unless new evidence appears + - use `rag_search` mode progressively: `state` -> `delta` -> `brief` + - use `expand` only for explicit debugging or when the user asks to inspect evidence details +6. Debugging: + - enable with `RAG_DEBUG=1` + - inspect `.rag/log/rag_debug.jsonl` + - summarize quickly with `python script/rag/debug-rag-state.py --tail 100` +7. 
On failure, return: + - exact command + - stderr summary + - recovery action diff --git a/.opencode/tool/rag_search.ts b/.opencode/tool/rag_search.ts new file mode 100644 index 00000000000..1dfd760133f --- /dev/null +++ b/.opencode/tool/rag_search.ts @@ -0,0 +1,149 @@ +/// +import { tool } from "@opencode-ai/plugin" +import path from "path" +import DESCRIPTION from "./rag_search.txt" +import { + allowExpand, + audit, + base, + brief, + chars, + cluster, + collection, + db, + decide, + expand, + expandChars, + key, + model, + parse, + py, + rewriteMode, + rewriteModel, + rewriteQueries, + root, + row, + session, + stateBlock, + summary, +} from "../rag" + +export default tool({ + description: DESCRIPTION, + args: { + query: tool.schema.string().describe("Search query text"), + top_k: tool.schema.number().describe("Maximum hits to return").default(3), + node_type: tool.schema.enum(["any", "text", "image"]).describe("Filter node type").default("text"), + mode: tool.schema.enum(["state", "delta", "brief", "expand"]).describe("Result disclosure mode").default("delta"), + }, + async execute(args, ctx) { + const baseDir = root(ctx?.worktree || ctx?.directory || process.cwd()) + const python = py(baseDir) + const script = path.join(baseDir, "script", "rag", "search-vector-index.py") + const dbPath = db(baseDir) + const show = args.mode === "expand" ? 
expandChars() : chars() + const res = + await Bun.$`${python} ${script} --query ${args.query} --db-path ${dbPath} --collection ${collection()} --model ${model()} --top-k ${args.top_k} --node-type ${args.node_type} --show-text-chars ${show} --base-url ${base()} --api-key ${key()} --format json --rewrite ${rewriteMode()} --rewrite-model ${rewriteModel()} --rewrite-queries ${rewriteQueries()}` + .quiet() + .nothrow() + const out = res.stdout.toString().trim() + const sessionID = String(ctx?.sessionID || ctx?.sessionId || baseDir) + const keyName = cluster(args.query) + const st = session(sessionID) + const hit = row(st, keyName) + + if (res.exitCode !== 0) { + const err = res.stderr.toString().trim() + await audit(baseDir, { + channel: "rag_search", + event: "tool_error", + sessionID, + query: args.query, + cluster: keyName, + mode: args.mode, + code: res.exitCode, + stderr: err.slice(0, 1200), + stdout: out.slice(0, 1200), + }) + return JSON.stringify( + { + error: "rag_search_failed", + exit_code: res.exitCode, + worktree: baseDir, + python, + script, + db_path: dbPath, + collection: collection(), + model: model(), + base_url: base(), + mode: args.mode, + stderr: err.slice(0, 1200), + stdout: out.slice(0, 1200), + hint: "verify OPENAI_BASE_URL/OPENAI_API_KEY, collection exists, and venv has openai/qdrant-client", + }, + null, + 2, + ) + } + + let dataRes = { hits: [], rewrites: [args.query], keywords: [], rewrite_mode: "none" } as ReturnType + try { + dataRes = parse(out) + } catch { + await audit(baseDir, { + channel: "rag_search", + event: "tool_parse_fail", + sessionID, + query: args.query, + cluster: keyName, + mode: args.mode, + raw: out.slice(0, 1200), + }) + return out.slice(0, 1000) + } + + const data = decide(hit, dataRes.hits, args.query, dataRes.rewrites) + const head = stateBlock(keyName, hit, data.next) + const body = + args.mode === "state" + ? "" + : args.mode === "expand" + ? allowExpand() + ? 
expand(dataRes.hits, args.top_k) + : "expand_blocked=1\nhint=use mode=delta or mode=brief unless debugging with RAG_ALLOW_EXPAND_TOOL=1" + : args.mode === "brief" + ? brief(dataRes.hits, args.top_k) + : data.delta.length + ? brief(data.delta, args.top_k) + : "no_new_delta" + + await audit(baseDir, { + channel: "rag_search", + event: "tool_search", + sessionID, + query: args.query, + cluster: keyName, + mode: args.mode, + node_type: args.node_type, + status: data.status, + reason: data.reason, + total_hits: data.total, + delta_hits: data.delta.length, + known_hits: data.known, + overlap: data.overlap, + rewrite_mode: dataRes.rewrite_mode, + top_hits: summary(hit.top, 3), + delta_fps: data.delta.map((x) => ({ + fp: `${x.text_file || x.source_url || ""}#${x.chunk_id || x.image_id || x.section_title || ""}`, + source_url: x.source_url || "", + section_title: x.section_title || "", + chunk_id: x.chunk_id || "", + })), + emitted_context: args.mode !== "state", + rewrites: hit.rewrites, + keywords: dataRes.keywords, + }) + + return body ? `${head}\n${body}` : head + }, +}) diff --git a/.opencode/tool/rag_search.txt b/.opencode/tool/rag_search.txt new file mode 100644 index 00000000000..d38d2b8e510 --- /dev/null +++ b/.opencode/tool/rag_search.txt @@ -0,0 +1,17 @@ +Search local RAG vector index and return ranked evidence snippets for the current query. 
+ +Use this tool when: +- the user asks about project docs, internal wiki, SOP, or known indexed materials +- you need grounded context before answering + +Behavior: +- reads local qdrant index under .rag/vector/qdrant +- may rewrite long queries into multiple focused retrieval queries before searching +- shares the same session/cluster state used by `rag_context` +- supports progressive disclosure via mode: + - `state`: retrieval state only, no evidence body + - `delta`: only new evidence within current query cluster (default) + - `brief`: short evidence list for current hits + - `expand`: richer per-hit details for follow-up drilling +- default output is compact and should not dump full raw retrieval payload +- do not use `expand` in normal QA unless the user explicitly asks to inspect evidence details diff --git a/script/rag/build-offline-bundle.sh b/script/rag/build-offline-bundle.sh new file mode 100755 index 00000000000..68353a56caa --- /dev/null +++ b/script/rag/build-offline-bundle.sh @@ -0,0 +1,132 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd) +OUT=${RAG_OFFLINE_OUT:-"$ROOT/.rag/offline/bundle"} +PY=${RAG_DOCLING_PYTHON:-python3} +LANGS=${RAG_TESS_LANGS:-"eng chi-sim"} +DOC_REQ=${RAG_DOCLING_REQUIREMENTS:-"$ROOT/script/rag/requirements-docling.txt"} +LLM_REQ=${RAG_LLAMA_REQUIREMENTS:-"$ROOT/script/rag/requirements-llamaindex.txt"} +VECTOR_REQ=${RAG_VECTOR_REQUIREMENTS:-"$ROOT/script/rag/requirements-vector.txt"} +INCLUDE_LLM=false +INCLUDE_VECTOR=false + +usage() { + cat <<'EOF' +Build an offline bundle for Ubuntu hosts with limited mirror/network access. 
+ +Usage: + script/rag/build-offline-bundle.sh [--out DIR] [--python BIN] [--langs "eng chi-sim"] [--include-llamaindex] [--include-vectordb] + +Options: + --out DIR Bundle output directory (default: ./.rag/offline/bundle) + --python BIN Python executable used for wheel download (default: python3) + --langs "a b" Tesseract language packs (default: "eng chi-sim") + --include-llamaindex Also download llamaindex wheels + --include-vectordb Also download vector db wheels (qdrant-client/openai) + -h, --help Show help +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --out) + OUT="$2" + shift 2 + ;; + --python) + PY="$2" + shift 2 + ;; + --langs) + LANGS="$2" + shift 2 + ;; + --include-llamaindex) + INCLUDE_LLM=true + shift + ;; + --include-vectordb) + INCLUDE_VECTOR=true + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "unknown argument: $1" >&2 + usage + exit 1 + ;; + esac +done + +if ! command -v "$PY" >/dev/null 2>&1; then + echo "python executable not found: $PY" >&2 + exit 1 +fi +if ! 
command -v apt-get >/dev/null 2>&1; then + echo "apt-get not found, this script targets Debian/Ubuntu" >&2 + exit 1 +fi + +rm -rf "$OUT" +mkdir -p "$OUT/wheelhouse" "$OUT/deb" "$OUT/script/rag" + +TMP=$(mktemp -d) +trap 'rm -rf "$TMP"' EXIT +"$PY" -m venv "$TMP/venv" + +"$TMP/venv/bin/python" -m pip install -U pip +"$TMP/venv/bin/pip" download -r "$DOC_REQ" -d "$OUT/wheelhouse" + +if [[ "$INCLUDE_LLM" == "true" && -f "$LLM_REQ" ]]; then + "$TMP/venv/bin/pip" download -r "$LLM_REQ" -d "$OUT/wheelhouse" +fi +if [[ "$INCLUDE_VECTOR" == "true" && -f "$VECTOR_REQ" ]]; then + "$TMP/venv/bin/pip" download -r "$VECTOR_REQ" -d "$OUT/wheelhouse" +fi + +declare -a PKGS=("tesseract-ocr") +read -ra ITEMS <<<"$LANGS" +for l in "${ITEMS[@]}"; do + [[ -z "$l" ]] && continue + PKGS+=("tesseract-ocr-${l//_/-}") +done + +if command -v apt-rdepends >/dev/null 2>&1; then + mapfile -t ALL < <( + apt-rdepends "${PKGS[@]}" 2>/dev/null | + awk '/^[a-zA-Z0-9]/ { print $1 }' | + rg -v '^(Reading|Building|Depends|PreDepends|Recommends|Suggests)$' | + sort -u + ) +else + echo "warning: apt-rdepends not installed, only top-level tesseract packages will be downloaded." 
>&2 + ALL=("${PKGS[@]}") +fi + +( + cd "$OUT/deb" + apt-get download "${ALL[@]}" +) + +cp "$ROOT/script/rag/install-docling.sh" "$OUT/script/rag/" +cp "$ROOT/script/rag/install-tesseract.sh" "$OUT/script/rag/" +cp "$ROOT/script/rag/install-vector.sh" "$OUT/script/rag/" +cp "$ROOT/script/rag/install-offline-bundle.sh" "$OUT/script/rag/" 2>/dev/null || true +cp "$ROOT/script/rag/build-vector-index.py" "$OUT/script/rag/" 2>/dev/null || true +cp "$ROOT/script/rag/search-vector-index.py" "$OUT/script/rag/" 2>/dev/null || true +cp "$ROOT/script/rag/requirements-docling.txt" "$OUT/script/rag/" +if [[ -f "$LLM_REQ" ]]; then + cp "$LLM_REQ" "$OUT/script/rag/" +fi +if [[ -f "$VECTOR_REQ" ]]; then + cp "$VECTOR_REQ" "$OUT/script/rag/" +fi + +sha256sum "$OUT"/wheelhouse/* "$OUT"/deb/* >"$OUT/SHA256SUMS.txt" +tar -C "$(dirname "$OUT")" -czf "${OUT%/}.tar.gz" "$(basename "$OUT")" +echo "bundle directory: $OUT" +echo "bundle archive: ${OUT%/}.tar.gz" diff --git a/script/rag/build-vector-index.py b/script/rag/build-vector-index.py new file mode 100755 index 00000000000..37ea3debc54 --- /dev/null +++ b/script/rag/build-vector-index.py @@ -0,0 +1,402 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +import os +import re +import sys +import time +import uuid +from pathlib import Path + +IMAGE_OCR_RE = re.compile(r"\[IMAGE_OCR\][\s\S]*?\[/IMAGE_OCR\]") + + +def clean(text: str) -> str: + return re.sub(r"\s+", " ", text).strip() + + +def normalize_text(text: str, strip_inline_ocr: bool) -> str: + body = text or "" + if strip_inline_ocr: + body = IMAGE_OCR_RE.sub(" ", body) + return clean(body) + + +def is_rate_limit_error(e: Exception) -> bool: + s = str(e).lower() + return "rate limit" in s or "too many requests" in s or "429" in s + + +def embed_texts( + client, + model: str, + texts: list[str], + max_retries: int, + retry_initial: float, +) -> list[list[float]]: + n = 0 + delay = max(0.2, retry_initial) + while True: + try: + r = 
client.embeddings.create(model=model, input=texts) + return [item.embedding for item in r.data] + except Exception as e: + if not is_rate_limit_error(e) or n >= max_retries: + raise + n += 1 + print( + f"[embed] rate limit; retry {n}/{max_retries} after {delay:.1f}s", + file=sys.stderr, + ) + time.sleep(delay) + delay = min(delay * 2, 30) + + +def list_inputs(paths: list[str], input_dir: str, glob: str) -> list[Path]: + files = [Path(p) for p in paths] + if input_dir: + files.extend(sorted(Path(input_dir).glob(glob))) + out = [] + seen = set() + for path in files: + p = path.resolve() + if p in seen: + continue + seen.add(p) + if p.is_file(): + out.append(p) + return out + + +def doc_key(path: Path, root: Path) -> str: + p = path.resolve() + try: + return str(p.relative_to(root.resolve())) + except ValueError: + return str(p) + + +def delete_keys(direct: list[str], file_path: str) -> list[str]: + out = [x for x in direct if x] + if file_path: + p = Path(file_path) + if p.exists(): + out.extend( + line.strip() + for line in p.read_text(encoding="utf-8", errors="ignore").splitlines() + if line.strip() + ) + return sorted(set(out)) + + +def merge_images(data: dict) -> list[dict]: + if isinstance(data.get("image_nodes"), list): + out = [] + for item in data["image_nodes"]: + iid = item.get("image_id") or item.get("id") + if not iid: + continue + out.append( + { + "id": iid, + "section_ids": item.get("section_ids", []), + "source_url": item.get("source_url", ""), + "alt": item.get("alt", ""), + "ocr_text": item.get("ocr_text", ""), + } + ) + return out + + image_map = {} + for sec in data.get("sections", []): + for item in sec.get("images", []): + iid = item.get("id") + if not iid: + continue + row = image_map.get(iid) or { + "id": iid, + "section_ids": [], + "source_url": item.get("url", ""), + "alt": item.get("alt", ""), + "ocr_text": item.get("ocr_text", ""), + } + sid = sec.get("id") + if sid and sid not in row["section_ids"]: + row["section_ids"].append(sid) + if 
not row["source_url"]: + row["source_url"] = item.get("url", "") + if not row["alt"]: + row["alt"] = item.get("alt", "") + if not row["ocr_text"]: + row["ocr_text"] = item.get("ocr_text", "") + image_map[iid] = row + return list(image_map.values()) + + +def load_nodes( + paths: list[Path], + include_images: bool, + strip_inline_ocr: bool, + image_min_chars: int, + root: Path, +) -> list[dict]: + rows = [] + for path in paths: + data = json.loads(path.read_text(encoding="utf-8", errors="ignore")) + source_url = data.get("source_url", "") + text_file = data.get("text_file", str(path)) + key = doc_key(path, root) + for i, item in enumerate(data.get("chunks", [])): + text = normalize_text(item.get("text", ""), strip_inline_ocr) + if not text: + continue + raw = f"{path}:{item.get('id', i)}" + pid = str(uuid.uuid5(uuid.NAMESPACE_URL, raw)) + meta = item.get("metadata") or {} + rows.append( + { + "id": pid, + "text": text, + "payload": { + "node_type": "text", + "chunk_id": item.get("id", f"chunk-{i}"), + "section_id": item.get("section_id", ""), + "section_title": item.get("section_title", ""), + "source_url": meta.get("source_url") or source_url, + "text_file": meta.get("text_file") or text_file, + "doc_key": key, + "image_ids": item.get("image_ids", []), + "char_len": meta.get("char_len", len(text)), + "text": text, + "raw_id": raw, + }, + } + ) + if not include_images: + continue + for i, item in enumerate(merge_images(data)): + iid = item.get("id") + txt = clean( + "\n".join( + x + for x in [ + f"[IMAGE:{iid}]", + item.get("alt", ""), + item.get("ocr_text", ""), + ] + if x + ) + ) + if len(clean((item.get("alt", "") + " " + item.get("ocr_text", "")).strip())) < image_min_chars: + continue + raw = f"{path}:image:{iid}:{i}" + pid = str(uuid.uuid5(uuid.NAMESPACE_URL, raw)) + rows.append( + { + "id": pid, + "text": txt, + "payload": { + "node_type": "image", + "image_id": iid, + "section_ids": item.get("section_ids", []), + "section_title": "", + "source_url": 
item.get("source_url", "") or source_url, + "text_file": text_file, + "doc_key": key, + "image_ids": [iid], + "char_len": len(txt), + "text": txt, + "alt": item.get("alt", ""), + "ocr_text": item.get("ocr_text", ""), + "raw_id": raw, + }, + } + ) + return rows + + +def has_collection(client: QdrantClient, name: str) -> bool: + if hasattr(client, "collection_exists"): + return bool(client.collection_exists(name)) + cols = client.get_collections().collections + return any(c.name == name for c in cols) + + +def delete_doc_keys(client, models, collection: str, keys: list[str]) -> int: + if not keys: + return 0 + if not has_collection(client, collection): + return 0 + for key in keys: + client.delete( + collection_name=collection, + points_selector=models.Filter( + must=[models.FieldCondition(key="doc_key", match=models.MatchValue(value=key))] + ), + wait=True, + ) + return len(keys) + + +def main() -> None: + p = argparse.ArgumentParser() + p.add_argument("--input", action="append", default=[]) + p.add_argument("--input-dir", default="") + p.add_argument("--glob", default="*.structured.json") + p.add_argument("--db-path", default=".rag/vector/qdrant") + p.add_argument("--collection", default="rag_chunks") + p.add_argument("--model", default="nomic-embed-text") + p.add_argument("--base-url", default="") + p.add_argument("--api-key", default="") + p.add_argument("--batch-size", type=int, default=16) + p.add_argument("--max-retries", type=int, default=6) + p.add_argument("--retry-initial", type=float, default=1.5) + p.add_argument("--no-image-nodes", action="store_true") + p.add_argument("--keep-inline-ocr", action="store_true") + p.add_argument("--image-min-chars", type=int, default=2) + p.add_argument("--root", default=".") + p.add_argument("--delete-doc-key", action="append", default=[]) + p.add_argument("--delete-doc-keys-file", default="") + p.add_argument("--recreate", action="store_true") + args = p.parse_args() + + try: + from openai import OpenAI + from 
qdrant_client import QdrantClient, models + except ModuleNotFoundError as e: + raise SystemExit( + f"missing dependency: {e.name}. run: bash script/rag/install-vector.sh" + ) from e + + inputs = list_inputs(args.input, args.input_dir, args.glob) + root = Path(args.root) + del_keys = delete_keys(args.delete_doc_key, args.delete_doc_keys_file) + + rows = ( + load_nodes( + inputs, + include_images=not args.no_image_nodes, + strip_inline_ocr=not args.keep_inline_ocr, + image_min_chars=max(0, args.image_min_chars), + root=root, + ) + if inputs + else [] + ) + if not rows and not del_keys: + raise SystemExit("no input files and no delete doc keys; nothing to do") + + key = args.api_key or os.getenv("OPENAI_API_KEY") or os.getenv("MINIMAX_API_KEY") or "ollama" + base = args.base_url or os.getenv("OPENAI_BASE_URL") or "http://127.0.0.1:11434/v1" + embed = OpenAI(api_key=key, base_url=base) if rows else None + + db_path = Path(args.db_path) + db_path.mkdir(parents=True, exist_ok=True) + qdrant = QdrantClient(path=str(db_path)) + deleted = 0 + + if args.recreate and has_collection(qdrant, args.collection): + qdrant.delete_collection(collection_name=args.collection) + if del_keys: + deleted = delete_doc_keys(qdrant, models, args.collection, del_keys) + + if not rows: + count = qdrant.count(collection_name=args.collection, exact=True).count if has_collection(qdrant, args.collection) else 0 + print( + json.dumps( + { + "db_path": str(db_path), + "collection": args.collection, + "input_files": 0, + "inserted": 0, + "deleted_doc_keys": deleted, + "collection_count": count, + "text_nodes": 0, + "image_nodes": 0, + "vector_size": 0, + "embedding_model": args.model, + "embedding_base_url": base, + }, + ensure_ascii=False, + indent=2, + ) + ) + return + + vec0 = embed_texts( + embed, + args.model, + [rows[0]["text"]], + args.max_retries, + args.retry_initial, + )[0] + dim = len(vec0) + if dim <= 0: + raise SystemExit("embedding result is empty") + if not has_collection(qdrant, 
args.collection): + qdrant.create_collection( + collection_name=args.collection, + vectors_config=models.VectorParams(size=dim, distance=models.Distance.COSINE), + ) + + batch_size = max(1, args.batch_size) + total = 0 + batch = [{"id": rows[0]["id"], "vector": vec0, "payload": rows[0]["payload"]}] + for i in range(1, len(rows), batch_size): + seg = rows[i : i + batch_size] + vecs = embed_texts( + embed, + args.model, + [x["text"] for x in seg], + args.max_retries, + args.retry_initial, + ) + batch.extend( + { + "id": seg[j]["id"], + "vector": vecs[j], + "payload": seg[j]["payload"], + } + for j in range(len(seg)) + ) + + for i in range(0, len(batch), batch_size): + seg = batch[i : i + batch_size] + qdrant.upsert( + collection_name=args.collection, + points=[ + models.PointStruct(id=item["id"], vector=item["vector"], payload=item["payload"]) + for item in seg + ], + wait=True, + ) + total += len(seg) + + count = qdrant.count(collection_name=args.collection, exact=True).count + text_nodes = sum(1 for x in rows if x["payload"].get("node_type") == "text") + image_nodes = sum(1 for x in rows if x["payload"].get("node_type") == "image") + print( + json.dumps( + { + "db_path": str(db_path), + "collection": args.collection, + "input_files": len(inputs), + "inserted": total, + "deleted_doc_keys": deleted, + "collection_count": count, + "text_nodes": text_nodes, + "image_nodes": image_nodes, + "vector_size": dim, + "embedding_model": args.model, + "embedding_base_url": base, + }, + ensure_ascii=False, + indent=2, + ) + ) + + +if __name__ == "__main__": + main() diff --git a/script/rag/clean-text.py b/script/rag/clean-text.py new file mode 100755 index 00000000000..72c9ea64134 --- /dev/null +++ b/script/rag/clean-text.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import re +from pathlib import Path + + +def normalize(text: str) -> str: + text = text.replace("\r\n", "\n").replace("\r", "\n") + text = re.sub(r"[ \t]+", " ", 
text) + text = re.sub(r"\n{3,}", "\n\n", text) + return text.strip() + "\n" + + +def drop_noise(lines: list[str]) -> list[str]: + out = [] + seen = set() + for line in lines: + row = line.strip() + if not row: + out.append("") + continue + if row.startswith("[上一页 ") or row.startswith("[下一页 "): + continue + if row.startswith("- [") and row.endswith(")"): + continue + if row == "": + continue + key = re.sub(r"\s+", " ", row) + if key in seen and len(key) > 80: + continue + seen.add(key) + out.append(line) + return out + + +def main() -> None: + p = argparse.ArgumentParser() + p.add_argument("--input", required=True) + p.add_argument("--output", required=True) + args = p.parse_args() + + src = Path(args.input).read_text(encoding="utf-8", errors="ignore") + rows = drop_noise(src.splitlines()) + out = normalize("\n".join(rows)) + Path(args.output).write_text(out, encoding="utf-8") + + +if __name__ == "__main__": + main() + diff --git a/script/rag/cmd/rag-bootstrap.sh b/script/rag/cmd/rag-bootstrap.sh new file mode 100755 index 00000000000..722983be004 --- /dev/null +++ b/script/rag/cmd/rag-bootstrap.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd) +TARGET="" +WITH_OPENCODE=true + +usage() { + cat <<'EOF' +Copy RAG pipeline scripts and optional OpenCode assets to another project. + +Usage: + bash script/rag/cmd/rag-bootstrap.sh --target /path/to/target [--no-opencode] + +Options: + --target DIR Target project root + --no-opencode Do not copy .opencode plugin/tool/skill files + -h, --help Show help +EOF +} + +copy_dir() { + local src="$1" + local dst="$2" + mkdir -p "$dst" + if command -v rsync >/dev/null 2>&1; then + rsync -a --exclude '__pycache__' --exclude '*.pyc' "$src"/ "$dst"/ + return + fi + find "$src" -type d -name "__pycache__" -prune -o -type f ! 
-name '*.pyc' -print | while read -r file; do + rel=${file#"$src"/} + mkdir -p "$dst/$(dirname "$rel")" + cp -f "$file" "$dst/$rel" + done +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --target) + TARGET="$2" + shift 2 + ;; + --no-opencode) + WITH_OPENCODE=false + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "unknown argument: $1" >&2 + usage + exit 1 + ;; + esac +done + +if [[ -z "$TARGET" ]]; then + echo "--target is required" >&2 + usage + exit 1 +fi + +mkdir -p "$TARGET/script" +copy_dir "$ROOT/script/rag" "$TARGET/script/rag" + +if [[ "$WITH_OPENCODE" == "true" ]]; then + mkdir -p "$TARGET/.opencode/tool" "$TARGET/.opencode/plugins" "$TARGET/.opencode/skills/rag-pipeline" + cp -f "$ROOT/.opencode/tool/rag_search.ts" "$TARGET/.opencode/tool/rag_search.ts" + cp -f "$ROOT/.opencode/tool/rag_search.txt" "$TARGET/.opencode/tool/rag_search.txt" + cp -f "$ROOT/.opencode/plugins/rag_context.ts" "$TARGET/.opencode/plugins/rag_context.ts" + cp -f "$ROOT/.opencode/skills/rag-pipeline/SKILL.md" "$TARGET/.opencode/skills/rag-pipeline/SKILL.md" + cp -f "$ROOT/.opencode/rag.ts" "$TARGET/.opencode/rag.ts" +fi + +echo "bootstrap_done target=$TARGET with_opencode=$WITH_OPENCODE" +echo "next:" +echo " 1) cd $TARGET" +echo " 2) bash script/rag/install-docling.sh" +echo " 3) bash script/rag/install-vector.sh" +echo " 4) bash script/rag/cmd/rag-init.sh --help" diff --git a/script/rag/cmd/rag-init.sh b/script/rag/cmd/rag-init.sh new file mode 100644 index 00000000000..bfb728b640c --- /dev/null +++ b/script/rag/cmd/rag-init.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." 
&& pwd) +PY=${RAG_DOCLING_PYTHON_BIN:-} +if [[ -z "$PY" ]]; then + if [[ -x "$ROOT/.venv-docling/bin/python" ]]; then + PY="$ROOT/.venv-docling/bin/python" + else + PY="python3" + fi +fi + +exec "$PY" "$ROOT/script/rag/rag-pipeline.py" init "$@" diff --git a/script/rag/cmd/rag-update.sh b/script/rag/cmd/rag-update.sh new file mode 100644 index 00000000000..1c518a8879e --- /dev/null +++ b/script/rag/cmd/rag-update.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd) +PY=${RAG_DOCLING_PYTHON_BIN:-} +if [[ -z "$PY" ]]; then + if [[ -x "$ROOT/.venv-docling/bin/python" ]]; then + PY="$ROOT/.venv-docling/bin/python" + else + PY="python3" + fi +fi + +exec "$PY" "$ROOT/script/rag/rag-pipeline.py" update "$@" diff --git a/script/rag/compare-structured.py b/script/rag/compare-structured.py new file mode 100755 index 00000000000..623eecb7ce9 --- /dev/null +++ b/script/rag/compare-structured.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +import re +from pathlib import Path + +OCR_RE = re.compile(r"\[IMAGE_OCR\][\s\S]*?\[/IMAGE_OCR\]") + + +def load(path: Path) -> dict: + return json.loads(path.read_text(encoding="utf-8", errors="ignore")) + + +def metrics(data: dict) -> dict: + chunks = data.get("chunks", []) + sections = data.get("sections", []) + image_nodes = data.get("image_nodes", []) + nodes = data.get("nodes", []) + txt = [x.get("text", "") for x in chunks] + chars = [len(x) for x in txt] + with_ocr = sum(1 for x in txt if "[IMAGE_OCR]" in x) + ocr_blocks = sum(len(OCR_RE.findall(x)) for x in txt) + linked = sum(1 for x in chunks if (x.get("image_ids") or [])) + return { + "chunks": len(chunks), + "sections": len(sections), + "image_nodes": len(image_nodes), + "nodes": len(nodes), + "chunks_with_image_refs": linked, + "chunks_with_inline_ocr": with_ocr, + "inline_ocr_blocks_in_chunks": ocr_blocks, + "avg_chunk_chars": 0 if not chars 
else round(sum(chars) / len(chars), 2), + } + + +def main() -> None: + p = argparse.ArgumentParser() + p.add_argument("--old", required=True) + p.add_argument("--new", required=True) + args = p.parse_args() + + old = metrics(load(Path(args.old))) + new = metrics(load(Path(args.new))) + keys = sorted(set(old) | set(new)) + diff = {k: (new.get(k, 0) - old.get(k, 0)) for k in keys} + print(json.dumps({"old": old, "new": new, "delta_new_minus_old": diff}, ensure_ascii=False, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/script/rag/convert-dir-to-text.sh b/script/rag/convert-dir-to-text.sh new file mode 100755 index 00000000000..43118755a3f --- /dev/null +++ b/script/rag/convert-dir-to-text.sh @@ -0,0 +1,136 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd) +DOC=${RAG_DOCLING_BIN:-"$ROOT/.venv-docling/bin/docling"} +IN="" +OUT=${RAG_TEXT_FILES_OUTPUT:-"$ROOT/.rag/text/files"} +EXT="pdf docx pptx html htm md txt csv xls xlsx xml" + +usage() { + cat <<'EOF' +Convert supported files in a directory to text with docling. + +Usage: + script/rag/convert-dir-to-text.sh --input DIR [--output DIR] [--ext "pdf docx html"] + +Options: + --input DIR Source directory (required) + --output DIR Text output directory (default: ./.rag/text/files) + --ext "a b c" Extensions to include (default: pdf docx pptx html htm md txt csv xls xlsx xml) + --docling-bin PATH docling executable (default: ./.venv-docling/bin/docling) + -h, --help Show help +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --input) + IN="$2" + shift 2 + ;; + --output) + OUT="$2" + shift 2 + ;; + --ext) + EXT="$2" + shift 2 + ;; + --docling-bin) + DOC="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "unknown argument: $1" >&2 + usage + exit 1 + ;; + esac +done + +if [[ -z "$IN" ]]; then + echo "--input is required" >&2 + usage + exit 1 +fi + +if [[ ! 
-d "$IN" ]]; then + echo "input directory not found: $IN" >&2 + exit 1 +fi + +if [[ ! -x "$DOC" ]]; then + echo "docling not found: $DOC" >&2 + exit 1 +fi + +mkdir -p "$OUT" +SUCCESS_LOG="$OUT/_success.log" +FAIL_LOG="$OUT/_failed.log" +RUN_LOG="$OUT/_run.log" +: >"$SUCCESS_LOG" +: >"$FAIL_LOG" +: >"$RUN_LOG" + +declare -a FIND_EXPR=() +read -ra PARTS <<<"$EXT" +for i in "${!PARTS[@]}"; do + e="${PARTS[$i]}" + [[ -z "$e" ]] && continue + if [[ "$i" -gt 0 ]]; then + FIND_EXPR+=("-o") + fi + FIND_EXPR+=("-iname" "*.$e") +done + +if [[ "${#FIND_EXPR[@]}" -eq 0 ]]; then + echo "no valid extensions in --ext" >&2 + exit 1 +fi + +TMP=$(mktemp -d) +trap 'rm -rf "$TMP"' EXIT + +mapfile -t FILES < <(find "$IN" -type f \( "${FIND_EXPR[@]}" \) | sort) +if [[ "${#FILES[@]}" -eq 0 ]]; then + echo "no files matched in: $IN" + exit 0 +fi + +OK=0 +BAD=0 + +for f in "${FILES[@]}"; do + rel=${f#"$IN"/} + target="$OUT/${rel%.*}.txt" + mkdir -p "$(dirname "$target")" + + work="$TMP/out" + rm -rf "$work" + mkdir -p "$work" + + if "$DOC" "$f" --to text --output "$work" --abort-on-error >>"$RUN_LOG" 2>&1; then + b=$(basename "${f%.*}") + src="$work/$b.txt" + if [[ -f "$src" ]]; then + mv "$src" "$target" + printf '%s\n' "$target" >>"$SUCCESS_LOG" + OK=$((OK + 1)) + continue + fi + fi + + printf '%s\n' "$f" >>"$FAIL_LOG" + BAD=$((BAD + 1)) +done + +echo "done: total=${#FILES[@]} success=$OK failed=$BAD" +echo "success log: $SUCCESS_LOG" +echo "failed log: $FAIL_LOG" +echo "run log: $RUN_LOG" + diff --git a/script/rag/debug-rag-state.py b/script/rag/debug-rag-state.py new file mode 100755 index 00000000000..962014f0ade --- /dev/null +++ b/script/rag/debug-rag-state.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +from collections import Counter +from pathlib import Path + + +def read_rows(path: Path) -> list[dict]: + rows = [] + if not path.exists(): + return rows + with path.open("r", encoding="utf-8", errors="ignore") as f: + 
for line in f: + line = line.strip() + if not line: + continue + try: + rows.append(json.loads(line)) + except Exception: + continue + return rows + + +def main() -> None: + p = argparse.ArgumentParser() + p.add_argument("--log", default=".rag/log/rag_debug.jsonl") + p.add_argument("--tail", type=int, default=80) + p.add_argument("--session", default="") + p.add_argument("--channel", default="") + p.add_argument("--full", action="store_true") + args = p.parse_args() + + path = Path(args.log) + rows = read_rows(path) + if args.session: + rows = [x for x in rows if str(x.get("sessionID", "")) == args.session] + if args.channel: + rows = [x for x in rows if str(x.get("channel", "")) == args.channel] + if not rows: + raise SystemExit(f"no debug rows found in: {path}") + + view = rows[-max(1, args.tail) :] + events = Counter(str(x.get("event", "")) for x in view) + statuses = Counter(str(x.get("status", "")) for x in view if x.get("status")) + clusters = Counter(str(x.get("cluster", "")) for x in view if x.get("cluster")) + channels = Counter(str(x.get("channel", "")) for x in view if x.get("channel")) + modes = Counter(str(x.get("mode", "")) for x in view if x.get("mode")) + + print(json.dumps({ + "log": str(path), + "rows_total": len(rows), + "rows_view": len(view), + "channels": dict(channels), + "events": dict(events), + "statuses": dict(statuses), + "modes": dict(modes), + "top_clusters": clusters.most_common(10), + }, ensure_ascii=False, indent=2)) + + print("\nlast_rows:") + for item in view[-20:]: + keep = item if args.full else { + "ts": item.get("ts", ""), + "channel": item.get("channel", ""), + "event": item.get("event", ""), + "sessionID": item.get("sessionID", ""), + "query": item.get("query", ""), + "cluster": item.get("cluster", ""), + "mode": item.get("mode", ""), + "loop": item.get("loop", ""), + "used_cache": item.get("used_cache", ""), + "status": item.get("status", ""), + "reason": item.get("reason", ""), + "rewrite_mode": item.get("rewrite_mode", 
""), + "keywords": item.get("keywords", []), + "total_hits": item.get("total_hits", ""), + "delta_hits": item.get("delta_hits", ""), + "known_hits": item.get("known_hits", ""), + "overlap": item.get("overlap", ""), + "top_hits": item.get("top_hits", []), + "delta_fps": item.get("delta_fps", []), + "rewrites": item.get("rewrites", []), + "emitted_context": item.get("emitted_context", ""), + } + print(json.dumps(keep, ensure_ascii=False)) + + +if __name__ == "__main__": + main() diff --git a/script/rag/install-docling.sh b/script/rag/install-docling.sh new file mode 100755 index 00000000000..53870ccdee9 --- /dev/null +++ b/script/rag/install-docling.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd) +VENV=${RAG_DOCLING_VENV:-"$ROOT/.venv-docling"} +PY=${RAG_DOCLING_PYTHON:-python3} +REQ=${RAG_DOCLING_REQUIREMENTS:-"$ROOT/script/rag/requirements-docling.txt"} +WHEEL=${RAG_DOCLING_WHEELHOUSE:-} + +usage() { + cat <<'EOF' +Install docling into a dedicated virtual environment. + +Usage: + script/rag/install-docling.sh [--venv PATH] [--python BIN] [--requirements FILE] [--wheelhouse DIR] + +Options: + --venv PATH Virtualenv path (default: ./.venv-docling) + --python BIN Python executable (default: python3) + --requirements FILE Requirements file (default: script/rag/requirements-docling.txt) + --wheelhouse DIR Offline wheels directory, enables --no-index install + -h, --help Show help +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --venv) + VENV="$2" + shift 2 + ;; + --python) + PY="$2" + shift 2 + ;; + --requirements) + REQ="$2" + shift 2 + ;; + --wheelhouse) + WHEEL="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "unknown argument: $1" >&2 + usage + exit 1 + ;; + esac +done + +if ! command -v "$PY" >/dev/null 2>&1; then + echo "python executable not found: $PY" >&2 + exit 1 +fi + +if [[ ! 
-d "$VENV" ]]; then + "$PY" -m venv "$VENV" +fi + +declare -a PIP=("$VENV/bin/python" "-m" "pip" "--disable-pip-version-check") + +if [[ -n "$WHEEL" ]]; then + if [[ ! -d "$WHEEL" ]]; then + echo "wheelhouse directory not found: $WHEEL" >&2 + exit 1 + fi + if [[ -f "$REQ" ]]; then + "${PIP[@]}" install --no-index --find-links "$WHEEL" -r "$REQ" + else + "${PIP[@]}" install --no-index --find-links "$WHEEL" docling + fi + "$VENV/bin/docling" --version + echo "docling installed in: $VENV" + exit 0 +fi + +"${PIP[@]}" install -U pip setuptools wheel + +if [[ -f "$REQ" ]]; then + "${PIP[@]}" install -r "$REQ" +else + "${PIP[@]}" install docling +fi + +"$VENV/bin/docling" --version +echo "docling installed in: $VENV" diff --git a/script/rag/install-offline-bundle.sh b/script/rag/install-offline-bundle.sh new file mode 100755 index 00000000000..3055716a097 --- /dev/null +++ b/script/rag/install-offline-bundle.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd) +BUNDLE=${RAG_OFFLINE_BUNDLE:-"$ROOT/.rag/offline/bundle"} +VENV=${RAG_DOCLING_VENV:-"$ROOT/.venv-docling"} +INSTALL_LLM=false +INSTALL_VECTOR=false + +usage() { + cat <<'EOF' +Install docling+tesseract from an offline bundle. 
+ +Usage: + script/rag/install-offline-bundle.sh [--bundle DIR] [--venv PATH] [--install-llamaindex] [--install-vectordb] + +Options: + --bundle DIR Offline bundle directory (default: ./.rag/offline/bundle) + --venv PATH Venv install path (default: ./.venv-docling) + --install-llamaindex Install llamaindex wheels if available in bundle + --install-vectordb Install vector db wheels if available in bundle + -h, --help Show help +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --bundle) + BUNDLE="$2" + shift 2 + ;; + --venv) + VENV="$2" + shift 2 + ;; + --install-llamaindex) + INSTALL_LLM=true + shift + ;; + --install-vectordb) + INSTALL_VECTOR=true + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "unknown argument: $1" >&2 + usage + exit 1 + ;; + esac +done + +if [[ ! -d "$BUNDLE" ]]; then + echo "bundle directory not found: $BUNDLE" >&2 + exit 1 +fi +if [[ ! -d "$BUNDLE/wheelhouse" ]]; then + echo "wheelhouse not found: $BUNDLE/wheelhouse" >&2 + exit 1 +fi + +SUDO="" +if [[ "${EUID:-$(id -u)}" -ne 0 ]]; then + if command -v sudo >/dev/null 2>&1; then + SUDO="sudo" + else + echo "sudo not found and current user is not root." 
>&2 + exit 1 + fi +fi + +if ls "$BUNDLE/deb/"*.deb >/dev/null 2>&1; then + $SUDO apt-get install -y "$BUNDLE"/deb/*.deb +fi + +bash "$ROOT/script/rag/install-docling.sh" \ + --venv "$VENV" \ + --requirements "$BUNDLE/script/rag/requirements-docling.txt" \ + --wheelhouse "$BUNDLE/wheelhouse" + +if [[ "$INSTALL_LLM" == "true" && -f "$BUNDLE/script/rag/requirements-llamaindex.txt" ]]; then + "$VENV/bin/python" -m pip --disable-pip-version-check install \ + --no-index --find-links "$BUNDLE/wheelhouse" \ + -r "$BUNDLE/script/rag/requirements-llamaindex.txt" +fi +if [[ "$INSTALL_VECTOR" == "true" && -f "$BUNDLE/script/rag/requirements-vector.txt" ]]; then + "$VENV/bin/python" -m pip --disable-pip-version-check install \ + --no-index --find-links "$BUNDLE/wheelhouse" \ + -r "$BUNDLE/script/rag/requirements-vector.txt" +fi + +echo "offline install completed" diff --git a/script/rag/install-tesseract.sh b/script/rag/install-tesseract.sh new file mode 100755 index 00000000000..c7d2241bd37 --- /dev/null +++ b/script/rag/install-tesseract.sh @@ -0,0 +1,72 @@ +#!/usr/bin/env bash +set -euo pipefail + +LANGS=${RAG_TESS_LANGS:-"eng chi-sim"} +NO_UPDATE=false + +usage() { + cat <<'EOF' +Install tesseract OCR and language packs on Debian/Ubuntu. + +Usage: + script/rag/install-tesseract.sh [--langs "eng chi-sim"] [--no-update] + +Options: + --langs "a b" Language packs to install (default: "eng chi-sim") + --no-update Skip apt update + -h, --help Show help +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --langs) + LANGS="$2" + shift 2 + ;; + --no-update) + NO_UPDATE=true + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "unknown argument: $1" >&2 + usage + exit 1 + ;; + esac +done + +if ! command -v apt-get >/dev/null 2>&1; then + echo "apt-get not found. This script currently supports Debian/Ubuntu only." 
>&2 + exit 1 +fi + +SUDO="" +if [[ "${EUID:-$(id -u)}" -ne 0 ]]; then + if command -v sudo >/dev/null 2>&1; then + SUDO="sudo" + else + echo "sudo not found and current user is not root." >&2 + exit 1 + fi +fi + +declare -a PKGS=("tesseract-ocr") +read -ra ITEMS <<<"$LANGS" +for l in "${ITEMS[@]}"; do + [[ -z "$l" ]] && continue + PKGS+=("tesseract-ocr-${l//_/-}") +done + +if [[ "$NO_UPDATE" != "true" ]]; then + $SUDO apt-get update +fi +$SUDO apt-get install -y "${PKGS[@]}" + +tesseract --version | head -n 2 +tesseract --list-langs | sed -n '1,40p' +echo "tesseract installed" diff --git a/script/rag/install-vector.sh b/script/rag/install-vector.sh new file mode 100755 index 00000000000..881786e23b2 --- /dev/null +++ b/script/rag/install-vector.sh @@ -0,0 +1,79 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd) +VENV=${RAG_DOCLING_VENV:-"$ROOT/.venv-docling"} +PY=${RAG_DOCLING_PYTHON:-python3} +REQ=${RAG_VECTOR_REQUIREMENTS:-"$ROOT/script/rag/requirements-vector.txt"} +WHEEL=${RAG_DOCLING_WHEELHOUSE:-} + +usage() { + cat <<'EOF' +Install vector database dependencies into the existing rag virtual environment. + +Usage: + script/rag/install-vector.sh [--venv PATH] [--python BIN] [--requirements FILE] [--wheelhouse DIR] + +Options: + --venv PATH Virtualenv path (default: ./.venv-docling) + --python BIN Python executable (default: python3) + --requirements FILE Requirements file (default: script/rag/requirements-vector.txt) + --wheelhouse DIR Offline wheels directory, enables --no-index install + -h, --help Show help +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --venv) + VENV="$2" + shift 2 + ;; + --python) + PY="$2" + shift 2 + ;; + --requirements) + REQ="$2" + shift 2 + ;; + --wheelhouse) + WHEEL="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "unknown argument: $1" >&2 + usage + exit 1 + ;; + esac +done + +if ! 
command -v "$PY" >/dev/null 2>&1; then + echo "python executable not found: $PY" >&2 + exit 1 +fi + +if [[ ! -d "$VENV" ]]; then + "$PY" -m venv "$VENV" +fi + +declare -a PIP=("$VENV/bin/python" "-m" "pip" "--disable-pip-version-check") + +if [[ -n "$WHEEL" ]]; then + if [[ ! -d "$WHEEL" ]]; then + echo "wheelhouse directory not found: $WHEEL" >&2 + exit 1 + fi + "${PIP[@]}" install --no-index --find-links "$WHEEL" -r "$REQ" + echo "vector dependencies installed in: $VENV" + exit 0 +fi + +"${PIP[@]}" install -U pip setuptools wheel +"${PIP[@]}" install -r "$REQ" +echo "vector dependencies installed in: $VENV" diff --git a/script/rag/merge-image-ocr.py b/script/rag/merge-image-ocr.py new file mode 100755 index 00000000000..57de0fb886c --- /dev/null +++ b/script/rag/merge-image-ocr.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +import re +from datetime import datetime, timezone +from pathlib import Path + + +def read(path: Path) -> str: + if not path.exists(): + return "" + return path.read_text(encoding="utf-8", errors="ignore") + + +def clean(text: str) -> str: + return re.sub(r"\s+", " ", text).strip() + + +def snippet(text: str, n: int) -> str: + if len(text) <= n: + return text + return text[:n].rstrip() + " ..." 
def inline_block(image_id: str, text: str, limit: int, mode: str) -> str:
    """Render the inline replacement for one image marker.

    mode 'none' drops the marker entirely; mode 'marker' (or any mode when no
    OCR text is available) keeps a bare [IMAGE:id] tag; otherwise the truncated
    OCR text is embedded between [IMAGE_OCR] fences.
    """
    if mode == "none":
        return ""
    tag = f"[IMAGE:{image_id}]"
    if mode == "marker" or not text:
        return tag
    # Same truncation rule as snippet(): keep a clipped prefix with ' ...'.
    body = text if len(text) <= limit else f"{text[:limit].rstrip()} ..."
    return f"{tag}\n[IMAGE_OCR]\n{body}\n[/IMAGE_OCR]"
def urls(args) -> list[str]:
    """Collect URLs from repeated --url flags plus --url-file, deduped in order.

    Lines in the URL file that are blank or start with '#' are ignored; the
    file is silently skipped if it does not exist.
    """
    collected: list[str] = [u for u in args.url if u]
    if args.url_file:
        src = Path(args.url_file)
        if src.exists():
            for raw_line in src.read_text(encoding="utf-8", errors="ignore").splitlines():
                candidate = raw_line.strip()
                if candidate and not candidate.startswith("#"):
                    collected.append(candidate)
    # dict.fromkeys keeps first-seen order while dropping duplicates.
    return list(dict.fromkeys(collected))
def clean_path(txt: Path) -> Path:
    """Derive the `.clean.txt` sibling path for a source text file.

    For `foo.txt` this yields `foo.clean.txt`; any other name simply gets the
    `.clean.txt` suffix appended.
    """
    stem = txt.name
    if stem.endswith(".txt"):
        stem = stem[: -len(".txt")]
    return txt.with_name(f"{stem}.clean.txt")
def pick_txt(stdout: str) -> Path:
    """Return the last non-blank line of url-to-text output as a Path.

    Raises SystemExit when the subprocess produced no usable output at all.
    """
    last: str | None = None
    for line in stdout.splitlines():
        stripped = line.strip()
        if stripped:
            last = stripped
    if last is None:
        raise SystemExit("url-to-text returned empty output")
    return Path(last)
def init_cmd(e: Env, args) -> None:
    """Full rebuild: refresh source material, recreate the index, write a fresh manifest.

    The input set comes from --source ('dir', 'url', or already-structured
    files on disk); an empty set aborts with SystemExit. Prints a JSON summary.
    """
    if args.source == "dir":
        structured = refresh_dir(e, args)
    elif args.source == "url":
        structured = refresh_url(e, args)
    else:
        structured = scan_all(args)
    if not structured:
        raise SystemExit("no structured files found for init")
    # recreate=True wipes the collection so stale chunks never survive an init.
    result = index(e, args, structured, recreate=True, delete_keys=[])
    write_manifest(Path(args.manifest), manifest(structured, args.root, args))
    summary = {
        "mode": "init",
        "files": len(structured),
        "manifest": args.manifest,
        "index": result,
    }
    print(json.dumps(summary, ensure_ascii=False, indent=2))
ensure_ascii=False, + indent=2, + ) + ) + return + + changed = [k for k, v in new_docs.items() if old_docs.get(k, {}).get("sha256") != v.get("sha256")] + removed = [k for k in old_docs if k not in new_docs] + if not changed and not removed: + write_manifest(Path(args.manifest), new) + print( + json.dumps( + { + "mode": "update", + "changed": 0, + "removed": 0, + "manifest": args.manifest, + "index": None, + }, + ensure_ascii=False, + indent=2, + ) + ) + return + + pick = {k: Path(args.root) / new_docs[k]["path"] for k in changed} + res = index(e, args, [p for p in pick.values() if p.exists()], recreate=False, delete_keys=sorted(set(changed + removed))) + write_manifest(Path(args.manifest), new) + print( + json.dumps( + { + "mode": "update", + "changed": len(changed), + "removed": len(removed), + "manifest": args.manifest, + "index": res, + }, + ensure_ascii=False, + indent=2, + ) + ) + + +def add_common(sp) -> None: + struct_mode = os.getenv("RAG_STRUCT_MODE", "llamaindex") + if struct_mode not in {"rule", "llamaindex"}: + struct_mode = "llamaindex" + sp.add_argument("--root", default=".") + sp.add_argument("--python", default="./.venv-docling/bin/python") + sp.add_argument("--source", choices=["structured", "dir", "url"], default="structured") + sp.add_argument("--scan-dir", default=".rag/text") + sp.add_argument("--glob", default="**/*.structured.json") + sp.add_argument("--input-dir", default="") + sp.add_argument("--text-out-dir", default=".rag/text/dir") + sp.add_argument("--url", action="append", default=[]) + sp.add_argument("--url-file", default="") + sp.add_argument("--url-text-dir", default=".rag/text/url") + sp.add_argument("--ocr-images", action="store_true") + sp.add_argument("--image-inline", choices=["marker", "ocr", "none"], default="marker") + sp.add_argument("--struct-mode", choices=["rule", "llamaindex"], default=struct_mode) + sp.add_argument("--struct-model", default=os.getenv("RAG_STRUCT_MODEL", "gpt-4o-mini")) + 
sp.add_argument("--inline-ocr", choices=["strip", "keep"], default="strip") + sp.add_argument("--embed-model", default="qwen3-embedding:4b") + sp.add_argument("--db-path", default=".rag/vector/qdrant") + sp.add_argument("--collection", default="rag_chunks") + sp.add_argument("--manifest", default=".rag/state/manifest.json") + + +def main() -> None: + p = argparse.ArgumentParser() + sub = p.add_subparsers(dest="cmd", required=True) + p_init = sub.add_parser("init") + add_common(p_init) + p_update = sub.add_parser("update") + add_common(p_update) + args = p.parse_args() + args.root = Path(args.root).resolve() + e = env(args.root, args.python) + + if args.cmd == "init": + init_cmd(e, args) + return + if args.cmd == "update": + update_cmd(e, args) + return + raise SystemExit("unknown cmd") + + +if __name__ == "__main__": + try: + main() + except subprocess.CalledProcessError as e: + print( + json.dumps( + { + "error": "command_failed", + "cmd": e.cmd, + "code": e.returncode, + "stdout": e.stdout if isinstance(e.stdout, str) else "", + "stderr": e.stderr if isinstance(e.stderr, str) else "", + }, + ensure_ascii=False, + indent=2, + ), + file=sys.stderr, + ) + raise SystemExit(e.returncode) diff --git a/script/rag/requirements-docling.txt b/script/rag/requirements-docling.txt new file mode 100644 index 00000000000..e195be9fd62 --- /dev/null +++ b/script/rag/requirements-docling.txt @@ -0,0 +1 @@ +docling==2.77.0 diff --git a/script/rag/requirements-llamaindex.txt b/script/rag/requirements-llamaindex.txt new file mode 100644 index 00000000000..7aaa93fb77e --- /dev/null +++ b/script/rag/requirements-llamaindex.txt @@ -0,0 +1,2 @@ +llama-index +llama-index-llms-openai diff --git a/script/rag/requirements-vector.txt b/script/rag/requirements-vector.txt new file mode 100644 index 00000000000..e21b5db2afa --- /dev/null +++ b/script/rag/requirements-vector.txt @@ -0,0 +1,2 @@ +qdrant-client +openai diff --git a/script/rag/search-vector-index.py 
def pick_json(text: str) -> dict:
    """Extract and parse the outermost {...} span from a possibly noisy LLM reply.

    Raises ValueError when no brace-delimited span exists; json.loads may still
    raise if the span is not valid JSON.
    """
    left, right = text.find("{"), text.rfind("}")
    if left < 0 or right <= left:
        raise ValueError("no json object found in rewrite response")
    return json.loads(text[left : right + 1])
f"summary={clip(item.get('text_preview', ''), 120)}", + ] + ) + ) + return state + "\n" + "\n".join(body) + + +def auto_format(value: str) -> str: + if value != "auto": + return value + if os.getenv("OPENCODE") == "1": + return "state" + return "json" + + +def need_rewrite(query: str) -> bool: + text = str(query or "").strip() + if len(text) >= 48: + return True + if text.count(" ") >= 5: + return True + marks = ["并且", "以及", "同时", "还有", "怎么", "如何", "步骤", "方式", "版本", "命令"] + return sum(1 for x in marks if x in text) >= 2 + + +def auto_rewrite(value: str, model: str, query: str) -> str: + if value != "auto": + return value + if model and need_rewrite(query): + return "llm" + return "off" + + +def embed_query(client, model: str, text: str) -> list[float]: + r = client.embeddings.create(model=model, input=[text]) + return r.data[0].embedding + + +def rewrite_query(client, model: str, query: str, limit: int) -> dict: + if not model: + return {"mode": "off", "queries": [query], "keywords": []} + prompt = "\n".join( + [ + "你是RAG检索改写器。", + "目标:从长问题中提取真正的检索目标,去掉语义噪声。", + "输出必须是 JSON 对象,不要输出解释。", + f"最多给出 {max(1, limit)} 条 queries。", + '返回格式:{"queries":["..."],"keywords":["..."]}', + "要求:queries 应短、准、可用于 embedding 检索;keywords 只保留设备名、动作、文档对象、错误码、版本等关键信息。", + f"原始问题:{query}", + ] + ) + try: + res = client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": prompt}], + temperature=0.1, + ) + text = res.choices[0].message.content or "" + except Exception: + return {"mode": "llm_error", "queries": [query], "keywords": []} + try: + data = pick_json(text) + except Exception: + return {"mode": "llm_fallback", "queries": [query], "keywords": []} + queries = uniq([str(x) for x in data.get("queries", [])])[: max(1, limit)] + if query not in queries: + queries.insert(0, query) + keywords = uniq([str(x) for x in data.get("keywords", [])])[:8] + return { + "mode": "llm", + "queries": uniq(queries)[: max(1, limit)], + "keywords": keywords, + } + + +def 
def fp(payload: dict) -> str:
    """Build a stable dedupe fingerprint 'source#identifier' for one hit payload.

    Source prefers the local text file over the source URL; the identifier
    falls back chunk_id -> image_id -> section_title.
    """
    source = payload.get("text_file", "") or payload.get("source_url", "")
    identifier = (
        payload.get("chunk_id", "")
        or payload.get("image_id", "")
        or payload.get("section_title", "")
    )
    return f"{str(source)}#{str(identifier)}"
+ ids = payload.get("image_ids", []) + if not isinstance(ids, list): + ids = [] + ext = ( + [] + if args.no_related_images + else related_images( + qdrant, + models, + args.collection, + [str(x) for x in ids if x], + args.show_text_chars, + ) + ) + out.append( + { + "fp": fp(payload), + "query": query, + "rank": rank, + "score": float(item.score), + "node_type": payload.get("node_type", "text"), + "image_id": payload.get("image_id", ""), + "chunk_id": payload.get("chunk_id", ""), + "section_title": payload.get("section_title", ""), + "source_url": payload.get("source_url", ""), + "text_file": payload.get("text_file", ""), + "image_ids": ids, + "related_images": ext, + "text_preview": preview, + } + ) + return out + + +def merge_hits(rows: list[list[dict]], primary: str, top_k: int) -> list[dict]: + merged: dict[str, dict] = {} + for batch in rows: + for item in batch: + cur = merged.get(item["fp"]) + if not cur: + merged[item["fp"]] = { + **item, + "matched_queries": [item["query"]], + "hit_count": 1, + "max_score": float(item["score"]), + "rrf": 1.0 / (60 + int(item["rank"])), + "primary_match": 1 if item["query"] == primary else 0, + } + continue + if item["query"] not in cur["matched_queries"]: + cur["matched_queries"].append(item["query"]) + cur["hit_count"] += 1 + cur["max_score"] = max(float(cur["max_score"]), float(item["score"])) + cur["rrf"] += 1.0 / (60 + int(item["rank"])) + if item["query"] == primary: + cur["primary_match"] = 1 + if float(item["score"]) > float(cur["score"]): + cur.update( + { + "score": float(item["score"]), + "node_type": item["node_type"], + "image_id": item["image_id"], + "chunk_id": item["chunk_id"], + "section_title": item["section_title"], + "source_url": item["source_url"], + "text_file": item["text_file"], + "image_ids": item["image_ids"], + "related_images": item["related_images"], + "text_preview": item["text_preview"], + } + ) + out = [] + for item in merged.values(): + item["rerank_score"] = ( + 0.45 * 
float(item["max_score"]) + + 0.35 * float(item["rrf"]) + + 0.12 * float(item["hit_count"]) + + 0.08 * float(item["primary_match"]) + ) + item.pop("fp", None) + item.pop("query", None) + item.pop("rank", None) + item.pop("max_score", None) + item.pop("rrf", None) + item.pop("primary_match", None) + out.append(item) + out.sort(key=lambda x: (float(x.get("rerank_score", 0)), float(x.get("score", 0))), reverse=True) + return out[: max(1, top_k)] + + +def main() -> None: + p = argparse.ArgumentParser() + p.add_argument("--query", required=True) + p.add_argument("--db-path", default=".rag/vector/qdrant") + p.add_argument("--collection", default="rag_chunks") + p.add_argument("--model", default="nomic-embed-text") + p.add_argument("--base-url", default="") + p.add_argument("--api-key", default="") + p.add_argument("--top-k", type=int, default=5) + p.add_argument("--per-query-k", type=int, default=5) + p.add_argument("--show-text-chars", type=int, default=240) + p.add_argument("--node-type", choices=["any", "text", "image"], default="any") + p.add_argument("--no-related-images", action="store_true") + p.add_argument("--format", choices=["auto", "json", "state", "brief"], default="auto") + p.add_argument("--rewrite", choices=["auto", "off", "llm"], default="auto") + p.add_argument("--rewrite-model", default=os.getenv("RAG_REWRITE_MODEL", "")) + p.add_argument("--rewrite-queries", type=int, default=int(os.getenv("RAG_REWRITE_QUERIES", "3"))) + args = p.parse_args() + + try: + from openai import OpenAI + from qdrant_client import QdrantClient, models + except ModuleNotFoundError as e: + raise SystemExit( + f"missing dependency: {e.name}. 
run: bash script/rag/install-vector.sh" + ) from e + + key = args.api_key or os.getenv("OPENAI_API_KEY") or os.getenv("MINIMAX_API_KEY") or "ollama" + base = args.base_url or os.getenv("OPENAI_BASE_URL") or "http://127.0.0.1:11434/v1" + client = OpenAI(api_key=key, base_url=base) + rewrite_mode = auto_rewrite(args.rewrite, args.rewrite_model, args.query) + rewrite = ( + rewrite_query(client, args.rewrite_model, args.query, max(1, args.rewrite_queries)) + if rewrite_mode == "llm" + else {"mode": "off", "queries": [args.query], "keywords": []} + ) + queries = uniq([args.query, *rewrite.get("queries", [])])[: max(1, args.rewrite_queries)] + + db = Path(args.db_path) + if not db.exists(): + raise SystemExit(f"db path not found: {db}") + + qdrant = QdrantClient(path=str(db)) + rows = [] + for query in queries: + vec = embed_query(client, args.model, query) + points = search(qdrant, models, args.collection, vec, max(args.top_k, args.per_query_k), args.node_type) + rows.append(collect(points, qdrant, models, args, query)) + + out = merge_hits(rows, queries[0], args.top_k) + rewrite["queries"] = queries + fmt = auto_format(args.format) + if fmt == "state": + print(render_state(args.query, out, rewrite)) + return + if fmt == "brief": + print(render_brief(args.query, out, rewrite, args.top_k)) + return + print(json.dumps({"query": args.query, "rewrite": rewrite, "hits": out}, ensure_ascii=False, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/script/rag/structure-text.py b/script/rag/structure-text.py new file mode 100755 index 00000000000..3d158cb80c4 --- /dev/null +++ b/script/rag/structure-text.py @@ -0,0 +1,307 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import importlib.util +import json +import os +import re +import sys +import time +from datetime import datetime, timezone +from pathlib import Path + +LAST_LLM_AT = 0.0 +IMAGE_ID_RE = re.compile(r"\[IMAGE:([^\]]+)\]") +IMAGE_OCR_RE = 
re.compile(r"\[IMAGE_OCR\][\s\S]*?\[/IMAGE_OCR\]") + + +def read(path: Path) -> str: + return path.read_text(encoding="utf-8", errors="ignore") + + +def clean(text: str) -> str: + return re.sub(r"\s+", " ", text).strip() + + +def strip_inline_ocr(text: str) -> str: + out = IMAGE_OCR_RE.sub("", text) + out = re.sub(r"\n{3,}", "\n\n", out) + return out.strip() + + +def image_ids(text: str) -> list[str]: + return sorted(set(IMAGE_ID_RE.findall(text))) + + +def split_sections(text: str) -> list[dict]: + rows = [] + title = "document" + buf = [] + for line in text.splitlines(): + if re.match(r"^#{1,6}\s+", line): + body = "\n".join(buf).strip() + if body: + rows.append({"title": title, "text": body}) + title = re.sub(r"^#{1,6}\s+", "", line).strip() + buf = [] + continue + buf.append(line) + body = "\n".join(buf).strip() + if body: + rows.append({"title": title, "text": body}) + return rows + + +def chunk_text(text: str, size: int, overlap: int) -> list[str]: + if len(text) <= size: + return [text] + out = [] + i = 0 + while i < len(text): + out.append(text[i : i + size]) + if i + size >= len(text): + break + i += max(1, size - overlap) + return out + + +def rule_summary(text: str, n: int = 280) -> str: + s = clean(text) + if len(s) <= n: + return s + return s[:n].rstrip() + " ..." 
def is_rate_limit_error(e: Exception) -> bool:
    """Best-effort detection of provider throttling from an exception message.

    Matches common phrasings and the HTTP 429 status code as substrings of the
    lowercased message.
    """
    message = str(e).lower()
    return any(token in message for token in ("rate limit", "too many requests", "429"))
+ ) + + def compat() -> str: + from openai import OpenAI as OpenAIClient + + client = OpenAIClient( + api_key=key(), + base_url=os.getenv("OPENAI_BASE_URL") or None, + ) + res = client.chat.completions.create( + model=model, + temperature=0, + messages=[{"role": "user", "content": prompt}], + ) + msg = res.choices[0].message.content if res.choices else "" + return clean(msg or "") + + from llama_index.llms.openai import OpenAI + + try: + return with_retry( + lambda: clean( + OpenAI( + model=model, + temperature=0, + api_base=os.getenv("OPENAI_BASE_URL"), + api_key=key(), + ).complete(prompt).text + ), + min_interval=min_interval, + max_retries=max_retries, + retry_initial=retry_initial, + ) + except ValueError as e: + if "Unknown model" not in str(e): + raise + if not os.getenv("OPENAI_BASE_URL"): + raise SystemExit( + f"Unknown model '{model}'. Set OPENAI_BASE_URL to your compatible endpoint, " + "for example: https://api.minimaxi.com/v1" + ) + return with_retry( + compat, + min_interval=min_interval, + max_retries=max_retries, + retry_initial=retry_initial, + ) + + +def main() -> None: + p = argparse.ArgumentParser() + p.add_argument("--text", required=True) + p.add_argument("--images", required=False, default="") + p.add_argument("--output", required=True) + p.add_argument("--source-url", required=False, default="") + p.add_argument("--mode", choices=["rule", "llamaindex"], default="rule") + p.add_argument("--model", default="gpt-4o-mini") + p.add_argument("--llm-min-interval", type=float, default=1.0) + p.add_argument("--llm-max-retries", type=int, default=6) + p.add_argument("--llm-retry-initial", type=float, default=1.5) + p.add_argument("--inline-ocr", choices=["strip", "keep"], default="strip") + p.add_argument("--chunk-size", type=int, default=1600) + p.add_argument("--chunk-overlap", type=int, default=200) + args = p.parse_args() + + text_path = Path(args.text) + src = read(text_path) + sections = split_sections(src) + + image_rows = [] + image_map = {} + 
if args.images: + rows = json.loads(read(Path(args.images))) + image_rows = rows.get("images", []) + for item in image_rows: + image_map[item["id"]] = item + + out_sections = [] + chunks = [] + nodes = [] + for si, sec in enumerate(sections): + body = strip_inline_ocr(sec["text"]) if args.inline_ocr == "strip" else sec["text"] + ids = image_ids(body) + summary = rule_summary(body) + if args.mode == "llamaindex": + summary = llama_summary( + body, + args.model, + min_interval=args.llm_min_interval, + max_retries=args.llm_max_retries, + retry_initial=args.llm_retry_initial, + ) + + out_sections.append( + { + "id": f"sec-{si}", + "title": sec["title"], + "summary": summary, + "image_ids": ids, + "images": [image_map[i] for i in ids if i in image_map], + "text": body, + } + ) + + parts = chunk_text(body, args.chunk_size, args.chunk_overlap) + for ci, body in enumerate(parts): + ids2 = image_ids(body) + chunk = { + "id": f"sec-{si}-chunk-{ci}", + "type": "text", + "section_id": f"sec-{si}", + "section_title": sec["title"], + "text": body, + "image_ids": ids2, + "metadata": { + "source_url": args.source_url, + "text_file": str(text_path), + "char_len": len(body), + }, + } + chunks.append(chunk) + nodes.append(chunk) + + image_nodes = [] + for item in image_rows: + iid = item.get("id") + if not iid: + continue + refs = [sec["id"] for sec in out_sections if iid in sec["image_ids"]] + text = clean("\n".join(x for x in [item.get("alt", ""), item.get("ocr_text", "")] if x)) + image = { + "id": f"image-{iid}", + "type": "image", + "image_id": iid, + "section_ids": refs, + "source_url": item.get("url", ""), + "alt": item.get("alt", ""), + "ocr_text": item.get("ocr_text", ""), + "text": text, + "metadata": { + "source_url": args.source_url, + "text_file": str(text_path), + "ocr_chars": item.get("ocr_chars", len(item.get("ocr_text", "") or "")), + "status": item.get("status", ""), + }, + } + image_nodes.append(image) + nodes.append(image) + + out = { + "source_url": 
args.source_url, + "text_file": str(text_path), + "generated_at": datetime.now(timezone.utc).isoformat(), + "mode": args.mode, + "inline_ocr": args.inline_ocr, + "sections": out_sections, + "chunks": chunks, + "image_nodes": image_nodes, + "nodes": nodes, + } + Path(args.output).write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8") + + +if __name__ == "__main__": + main() diff --git a/script/rag/url-to-text.sh b/script/rag/url-to-text.sh new file mode 100755 index 00000000000..bd600008225 --- /dev/null +++ b/script/rag/url-to-text.sh @@ -0,0 +1,449 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd) +DOC=${RAG_DOCLING_BIN:-"$ROOT/.venv-docling/bin/docling"} +PY=${RAG_DOCLING_PYTHON_BIN:-"$ROOT/.venv-docling/bin/python"} +OUT=${RAG_TEXT_URL_OUTPUT:-"$ROOT/.rag/text/url"} +HTML=${RAG_TEXT_URL_HTML:-"$ROOT/.rag/html/url"} +URL="" +NAME="" +KEEP_HTML=false +OCR_IMAGES=false +IMAGE_LIMIT=${RAG_TEXT_URL_IMAGE_LIMIT:-30} +OCR_ENGINE=${RAG_TEXT_URL_OCR_ENGINE:-} +OCR_LANG=${RAG_TEXT_URL_OCR_LANG:-} +OCR_ARTIFACTS=${RAG_TEXT_URL_OCR_ARTIFACTS:-} +OCR_PSM=${RAG_TEXT_URL_OCR_PSM:-} +IMAGE_INLINE=${RAG_TEXT_URL_IMAGE_INLINE:-marker} +USER=${RAG_TEXT_URL_USER:-} +PASS=${RAG_TEXT_URL_PASSWORD:-} +COOKIE=${RAG_TEXT_URL_COOKIE:-} +COOKIE_FILE=${RAG_TEXT_URL_COOKIE_FILE:-} +PROXY=${RAG_TEXT_URL_PROXY:-} +NO_PROXY_MODE=false +INSECURE=false +declare -a HDR=() + +usage() { + cat <<'EOF' +Fetch one URL as HTML, then convert it to plain text with docling. 
+ +Usage: + script/rag/url-to-text.sh --url URL [--name NAME] [--output DIR] [--html-dir DIR] [--header "K: V"] [--user USER --password PASS] [--cookie "a=b"] [--cookie-file FILE] [--proxy URL] [--no-proxy] [--insecure] [--keep-html] [--ocr-images] [--image-limit N] [--ocr-engine NAME] [--ocr-lang CODE] [--psm N] [--image-inline MODE] + +Options: + --url URL Source URL to fetch + --name NAME Output file stem (default: generated from URL) + --output DIR Text output directory (default: ./.rag/text/url) + --html-dir DIR Downloaded HTML directory (default: ./.rag/html/url) + --header "K: V" Extra request header for curl (repeatable) + --user USER HTTP auth username for URL fetch + --password PASS HTTP auth password for URL fetch (or set RAG_TEXT_URL_PASSWORD) + --cookie "k=v;..." Cookie header value + --cookie-file FILE Netscape cookie file used by curl + --proxy URL Proxy for curl requests + --no-proxy Bypass proxy for all hosts (adds --noproxy "*") + --insecure Allow insecure TLS for intranet/self-signed cert + --keep-html Keep downloaded HTML file + --ocr-images OCR text in resources and append to output txt + --image-limit N Max images to OCR when --ocr-images is enabled (default: 30) + --ocr-engine NAME OCR engine for image OCR (for example: tesseract, rapidocr, auto) + --ocr-lang CODE OCR language list (for example: eng or eng,chi_sim) + --psm N OCR page segmentation mode, 0-13 (useful for tesseract) + --image-inline MODE Inline image strategy: marker|ocr|none (default: marker) + --artifacts-path PATH Local docling artifacts path for OCR-related models + --docling-bin PATH docling executable (default: ./.venv-docling/bin/docling) + --python-bin PATH python executable used to parse html img tags (default: ./.venv-docling/bin/python) + -h, --help Show help +EOF +} + +slug() { + printf '%s' "$1" | + sed -E 's#https?://##; s#[^a-zA-Z0-9._-]+#-#g; s#-+#-#g; s#(^-|-$)##g' | + cut -c1-120 +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --url) + URL="$2" + shift 2 + ;; + 
--name) + NAME="$2" + shift 2 + ;; + --output) + OUT="$2" + shift 2 + ;; + --html-dir) + HTML="$2" + shift 2 + ;; + --header) + HDR+=("$2") + shift 2 + ;; + --user) + USER="$2" + shift 2 + ;; + --password) + PASS="$2" + shift 2 + ;; + --cookie) + COOKIE="$2" + shift 2 + ;; + --cookie-file) + COOKIE_FILE="$2" + shift 2 + ;; + --proxy) + PROXY="$2" + shift 2 + ;; + --no-proxy) + NO_PROXY_MODE=true + shift + ;; + --insecure) + INSECURE=true + shift + ;; + --keep-html) + KEEP_HTML=true + shift + ;; + --ocr-images) + OCR_IMAGES=true + shift + ;; + --image-limit) + IMAGE_LIMIT="$2" + shift 2 + ;; + --ocr-engine) + OCR_ENGINE="$2" + shift 2 + ;; + --ocr-lang) + OCR_LANG="$2" + shift 2 + ;; + --psm) + OCR_PSM="$2" + shift 2 + ;; + --image-inline) + IMAGE_INLINE="$2" + shift 2 + ;; + --artifacts-path) + OCR_ARTIFACTS="$2" + shift 2 + ;; + --docling-bin) + DOC="$2" + shift 2 + ;; + --python-bin) + PY="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "unknown argument: $1" >&2 + usage + exit 1 + ;; + esac +done + +if [[ -z "$URL" ]]; then + echo "--url is required" >&2 + usage + exit 1 +fi + +if [[ ! -x "$DOC" ]]; then + echo "docling not found: $DOC" >&2 + exit 1 +fi + +if ! command -v curl >/dev/null 2>&1; then + echo "curl not found" >&2 + exit 1 +fi + +if [[ -n "$COOKIE_FILE" && ! -f "$COOKIE_FILE" ]]; then + echo "cookie file not found: $COOKIE_FILE" >&2 + exit 1 +fi + +if [[ "$OCR_IMAGES" == "true" && ! -x "$PY" ]]; then + echo "python not found or not executable: $PY" >&2 + exit 1 +fi + +if [[ "$OCR_IMAGES" == "true" ]]; then + if [[ -z "$OCR_ENGINE" ]]; then + if command -v tesseract >/dev/null 2>&1; then + OCR_ENGINE="tesseract" + if [[ -z "$OCR_LANG" ]]; then + OCR_LANG="eng,chi_sim" + fi + echo "image OCR engine selected: tesseract" >&2 + else + OCR_ENGINE="auto" + echo "image OCR engine selected: auto (tesseract not found)" >&2 + fi + fi + + if [[ "$OCR_ENGINE" == "tesseract" ]]; then + if ! 
command -v tesseract >/dev/null 2>&1; then + echo "tesseract not found, install it first: sudo apt install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-chi-sim" >&2 + exit 1 + fi + if [[ -z "$OCR_LANG" ]]; then + OCR_LANG="eng,chi_sim" + fi + if [[ -z "$OCR_PSM" ]]; then + OCR_PSM="6" + fi + fi + echo "image OCR config: engine=$OCR_ENGINE lang=${OCR_LANG:-} psm=${OCR_PSM:-}" >&2 +fi + +if [[ -n "$OCR_PSM" ]] && ! [[ "$OCR_PSM" =~ ^[0-9]+$ ]]; then + echo "invalid --psm: $OCR_PSM" >&2 + exit 1 +fi +if [[ "$IMAGE_INLINE" != "marker" && "$IMAGE_INLINE" != "ocr" && "$IMAGE_INLINE" != "none" ]]; then + echo "invalid --image-inline: $IMAGE_INLINE (expected marker|ocr|none)" >&2 + exit 1 +fi + +if [[ -z "$NAME" ]]; then + NAME=$(slug "$URL") +fi + +if [[ -z "$NAME" ]]; then + NAME="page-$(date +%Y%m%d-%H%M%S)" +fi + +mkdir -p "$OUT" "$HTML" +HTML_FILE="$HTML/$NAME.html" + +declare -a CURL_CMD=("curl" "-fsSL") +if [[ "$NO_PROXY_MODE" == "true" ]]; then + CURL_CMD+=("--noproxy" "*") +elif [[ -n "$PROXY" ]]; then + CURL_CMD+=("--proxy" "$PROXY") +fi +if [[ "$INSECURE" == "true" ]]; then + CURL_CMD+=("-k") +fi +if [[ -n "$USER" ]]; then + CURL_CMD+=("-u" "$USER:$PASS") +fi +if [[ -n "$COOKIE" ]]; then + CURL_CMD+=("-H" "Cookie: $COOKIE") +fi +if [[ -n "$COOKIE_FILE" ]]; then + CURL_CMD+=("-b" "$COOKIE_FILE") +fi +CURL_CMD+=("$URL" "-o" "$HTML_FILE") +for h in "${HDR[@]}"; do + CURL_CMD+=("-H" "$h") +done +"${CURL_CMD[@]}" + +"$DOC" "$HTML_FILE" --from html --to text --output "$OUT" --abort-on-error + +TXT_FILE="$OUT/$NAME.txt" +if [[ ! -f "$TXT_FILE" ]]; then + FALLBACK=$(find "$OUT" -maxdepth 1 -type f -name "$NAME*.txt" | head -n 1 || true) + if [[ -n "$FALLBACK" ]]; then + TXT_FILE="$FALLBACK" + fi +fi + +if [[ ! 
-f "$TXT_FILE" ]]; then + echo "docling conversion finished but no txt was found for: $NAME" >&2 + exit 1 +fi + +if [[ "$OCR_IMAGES" == "true" ]]; then + TMP=$(mktemp -d) + trap 'rm -rf "$TMP"' EXIT + IMG_LIST="$TMP/image_urls.txt" + IMG_META="$TMP/image_meta.json" + IMG_DIR="$TMP/images" + OCR_DIR="$TMP/ocr" + mkdir -p "$IMG_DIR" "$OCR_DIR" + + "$PY" - "$URL" "$HTML_FILE" "$IMG_LIST" "$IMG_META" <<'PY' +import json +import pathlib +import sys +from urllib.parse import urljoin +from bs4 import BeautifulSoup + +base = sys.argv[1] +html_path = pathlib.Path(sys.argv[2]) +out = pathlib.Path(sys.argv[3]) +meta = pathlib.Path(sys.argv[4]) +raw = html_path.read_text(encoding="utf-8", errors="ignore") +soup = BeautifulSoup(raw, "html.parser") +seen = set() +rows = [] +for n in soup.find_all("img"): + src = (n.get("src") or n.get("data-src") or n.get("data-original") or "").strip() + if not src: + continue + if src.startswith("data:"): + continue + u = urljoin(base, src) + if not u or u in seen: + continue + seen.add(u) + rows.append( + { + "id": f"img-{len(rows)}", + "url": u, + "alt": (n.get("alt") or "").strip(), + } + ) +out.write_text("\n".join(row["url"] for row in rows), encoding="utf-8") +meta.write_text(json.dumps(rows, ensure_ascii=False, indent=2), encoding="utf-8") +PY + + mapfile -t IMAGES <"$IMG_LIST" + MAX="$IMAGE_LIMIT" + if ! [[ "$MAX" =~ ^[0-9]+$ ]]; then + echo "invalid --image-limit: $MAX" >&2 + exit 1 + fi + + OCR_OK=0 + OCR_BAD=0 + OCR_DONE=0 + OCR_LOG="$OUT/$NAME.image_ocr.log" + : >"$OCR_LOG" + for u in "${IMAGES[@]}"; do + if [[ "$OCR_DONE" -ge "$MAX" ]]; then + break + fi + clean="${u%%\?*}" + ext="${clean##*.}" + if [[ "$ext" == "$clean" ]] || [[ ! 
"$ext" =~ ^[A-Za-z0-9]{1,6}$ ]]; then + ext="img" + fi + f="$IMG_DIR/img-$OCR_DONE.$ext" + declare -a CURL_IMAGE=("curl" "-fsSL") + if [[ "$NO_PROXY_MODE" == "true" ]]; then + CURL_IMAGE+=("--noproxy" "*") + elif [[ -n "$PROXY" ]]; then + CURL_IMAGE+=("--proxy" "$PROXY") + fi + if [[ "$INSECURE" == "true" ]]; then + CURL_IMAGE+=("-k") + fi + if [[ -n "$USER" ]]; then + CURL_IMAGE+=("-u" "$USER:$PASS") + fi + if [[ -n "$COOKIE" ]]; then + CURL_IMAGE+=("-H" "Cookie: $COOKIE") + fi + if [[ -n "$COOKIE_FILE" ]]; then + CURL_IMAGE+=("-b" "$COOKIE_FILE") + fi + CURL_IMAGE+=("$u" "-o" "$f" "-H" "Referer: $URL") + for h in "${HDR[@]}"; do + CURL_IMAGE+=("-H" "$h") + done + if ! "${CURL_IMAGE[@]}" >/dev/null 2>&1; then + OCR_BAD=$((OCR_BAD + 1)) + OCR_DONE=$((OCR_DONE + 1)) + continue + fi + + t="$OCR_DIR/$(basename "$f").txt" + if [[ "$OCR_ENGINE" == "tesseract" ]]; then + declare -a TESS=("tesseract" "$f" "stdout") + if [[ -n "$OCR_LANG" ]]; then + TESS+=("-l" "${OCR_LANG//,/+}") + fi + if [[ -n "$OCR_PSM" ]]; then + TESS+=("--psm" "$OCR_PSM") + fi + if "${TESS[@]}" >"$t" 2>>"$OCR_LOG"; then + : + else + OCR_BAD=$((OCR_BAD + 1)) + OCR_DONE=$((OCR_DONE + 1)) + continue + fi + else + declare -a OCR_CMD=("$DOC" "$f" "--from" "image" "--to" "text" "--output" "$OCR_DIR" "--ocr" "--force-ocr" "--abort-on-error") + if [[ -n "$OCR_ENGINE" ]]; then + OCR_CMD+=("--ocr-engine" "$OCR_ENGINE") + fi + if [[ -n "$OCR_LANG" ]]; then + OCR_CMD+=("--ocr-lang" "$OCR_LANG") + fi + if [[ -n "$OCR_ARTIFACTS" ]]; then + OCR_CMD+=("--artifacts-path" "$OCR_ARTIFACTS") + fi + if [[ -n "$OCR_PSM" ]]; then + OCR_CMD+=("--psm" "$OCR_PSM") + fi + if "${OCR_CMD[@]}" >>"$OCR_LOG" 2>&1; then + : + else + OCR_BAD=$((OCR_BAD + 1)) + OCR_DONE=$((OCR_DONE + 1)) + continue + fi + fi + + if [[ -s "$t" ]] && grep -q '[^[:space:]]' "$t"; then + OCR_OK=$((OCR_OK + 1)) + else + OCR_BAD=$((OCR_BAD + 1)) + fi + OCR_DONE=$((OCR_DONE + 1)) + done + + SIDECAR="$OUT/$NAME.images.json" + RAW_TXT="$OUT/$NAME.raw.txt" + 
"$PY" "$ROOT/script/rag/merge-image-ocr.py" \ + --text "$TXT_FILE" \ + --meta "$IMG_META" \ + --ocr-dir "$OCR_DIR" \ + --sidecar "$SIDECAR" \ + --raw "$RAW_TXT" \ + --inline-mode "$IMAGE_INLINE" \ + --source-url "$URL" + + echo "image_ocr_total=${#IMAGES[@]} scanned=$OCR_DONE success=$OCR_OK failed=$OCR_BAD" >&2 + echo "image_sidecar=$SIDECAR" >&2 + if [[ "${#IMAGES[@]}" -gt 0 && "$OCR_OK" -eq 0 ]]; then + echo "image OCR produced no text; inspect log: $OCR_LOG" >&2 + echo "hint: try --ocr-lang chi_sim or eng,chi_sim with --psm 6; if page images are tiny/icons, OCR may return empty." >&2 + fi +fi + +if [[ "$KEEP_HTML" != "true" ]]; then + rm -f "$HTML_FILE" +fi + +echo "$TXT_FILE" diff --git a/specs/rag-docling-deploy.zh.md b/specs/rag-docling-deploy.zh.md new file mode 100644 index 00000000000..9e17aa8adcf --- /dev/null +++ b/specs/rag-docling-deploy.zh.md @@ -0,0 +1,507 @@ +# RAG 文本化部署手册(Docling) + +本手册记录从环境准备到文本产出的完整步骤,适合在本地或内网机器复用。 + +## 1. 环境准备 + +在 Debian/Ubuntu 上安装 Python 虚拟环境能力: + +```bash +sudo apt update +sudo apt install -y python3 python3-venv python3-full curl +``` + +验证版本: + +```bash +python3 --version +curl --version | head -n 1 +``` + +## 2. 安装 Docling(隔离 venv) + +在仓库根目录执行: + +```bash +cd /home/zhang/01-my_code/09-my-opencode/opencode-worktrees/rag-enhance +bash script/rag/install-docling.sh +``` + +脚本行为: + +1. 创建 `./.venv-docling` +2. 升级 `pip/setuptools/wheel` +3. 安装 `script/rag/requirements-docling.txt` 中的 `docling` +4. 输出 `docling --version` 作为健康检查 + +可选参数: + +```bash +bash script/rag/install-docling.sh \ + --venv /opt/rag/.venv-docling \ + --python python3 \ + --requirements script/rag/requirements-docling.txt +``` + +内网离线安装(本地 wheel 仓): + +```bash +bash script/rag/install-docling.sh \ + --venv /opt/rag/.venv-docling \ + --requirements script/rag/requirements-docling.txt \ + --wheelhouse /opt/rag/docling-wheelhouse +``` + +## 3. 
激活环境(可选) + +脚本默认直接调用绝对路径,不强制激活;如需手动调试可激活: + +```bash +source .venv-docling/bin/activate +docling --version +``` + +## 3.1 安装 Tesseract(方案 A,推荐内网) + +在 Debian/Ubuntu 上执行: + +```bash +bash script/rag/install-tesseract.sh +``` + +默认安装: + +- `tesseract-ocr` +- `tesseract-ocr-eng` +- `tesseract-ocr-chi-sim` + +可自定义语言包: + +```bash +bash script/rag/install-tesseract.sh --langs "eng chi-sim" +``` + +## 4. URL 抓取 HTML 并转换为 text + +单 URL: + +```bash +bash script/rag/url-to-text.sh \ + --url "https://example.com" +``` + +开启图片 OCR(识别页面 `img` 里的文字): + +```bash +bash script/rag/url-to-text.sh \ + --url "https://example.com" \ + --ocr-images \ + --image-limit 30 \ + --image-inline marker +``` + +说明:当 `--ocr-images` 启用且系统存在 `tesseract` 时,脚本会默认优先使用 `tesseract`(更适合内网离线)。 +且该路径会直接调用系统 `tesseract`,避免 docling 的 OSD 包装层导致的部分图片误报失败。 + +`--image-inline` 说明: + +1. `marker`:仅保留 `[IMAGE:img-x]` 占位,OCR 文本只放 sidecar(推荐,避免污染 chunk) +2. `ocr`:将 OCR 内联到正文(老行为) +3. `none`:移除图片占位 + +指定 OCR 引擎/语言: + +```bash +bash script/rag/url-to-text.sh \ + --url "https://example.com" \ + --ocr-images \ + --ocr-engine tesseract \ + --ocr-lang eng,chi_sim \ + --psm 6 +``` + +代理控制(避免被错误代理拦住): + +```bash +# 强制绕过代理 +bash script/rag/url-to-text.sh --url "https://example.com" --no-proxy + +# 显式指定代理 +bash script/rag/url-to-text.sh --url "https://example.com" --proxy "http://proxy.local:7890" +``` + +输出默认为: + +- HTML 暂存目录:`./.rag/html/url/` +- 文本目录:`./.rag/text/url/` + +带认证头示例: + +```bash +bash script/rag/url-to-text.sh \ + --url "https://intranet.example.local/doc?id=123" \ + --header "Authorization: Bearer " \ + --header "Cookie: session=" \ + --name "intranet-doc-123" \ + --ocr-images \ + --keep-html +``` + +账号密码认证(Basic/Digest 场景): + +```bash +bash script/rag/url-to-text.sh \ + --url "https://intranet.example.local/doc/123" \ + --user "your_user" \ + --password "your_password" \ + --ocr-images +``` + +Cookie 文件认证(SSO 登录后导出的 cookie): + +```bash +bash script/rag/url-to-text.sh \ + --url 
"https://intranet.example.local/doc/123" \ + --cookie-file /path/to/cookies.txt \ + --ocr-images +``` + +LDAP/SSO 场景说明: + +1. LDAP 只负责身份认证,`url-to-text.sh` 不能直接“输入 LDAP”完成网页表单登录 +2. 脚本本质是 `curl` 抓取,通常需要有效 session(Cookie)或网关支持 Basic Auth +3. 你的内网若是 LDAP + SSO(CAS/OIDC/SAML),推荐先在浏览器登录,再导出 `cookies.txt` 给 `--cookie-file` + +命令标准输出会打印生成的 `.txt` 路径,可直接接入后续 embedding 流程。 + +图片相关输出文件(`--ocr-images`): + +1. 主文本:`.txt`(`` 会被替换为 `[IMAGE:img-x]` + 就地 OCR) +2. 原始文本备份:`.raw.txt` +3. 图片 sidecar:`.images.json`(包含 `id/url/alt/ocr_text/status`) +4. OCR 运行日志:`.image_ocr.log` + +说明: + +1. 默认只提取 HTML 可见文本,不做图片 OCR +2. `--ocr-images` 会解析页面 `` 链接并逐张 OCR,并就地写回到图片占位符附近 +3. 若页面是前端渲染(图片不在原始 HTML),需要先用浏览器渲染后再抓取 HTML 或导出 PDF 再转文本 + +### 图片 OCR 常见问题 + +如果你看到“图片无法识别”或 `image_ocr_total` 有值但 `success=0`,通常是 OCR 模型未就绪: + +1. `docling` 的 `rapidocr/auto` 首次运行可能需要联网下载模型 +2. 内网环境需预下载模型并同步缓存,或改用本机 `tesseract` + +你给的日志 `wiki.luckfox.com-zh-Luckfox-Pico-Zero-Overview.image_ocr.log` 显示: + +1. 模型下载是成功的(`Successfully saved`) +2. 失败原因是 `RapidOCR returned empty result`(检测不到文字) +3. 因此该问题不只是“无法访问”,更像是该页面图片内容对 RapidOCR 不友好 +4. 当前切换到 tesseract 后,报错多为 `OSD failed / Too few characters`,可通过 `--psm 6` 降低此类问题 + +推荐排查顺序: + +```bash +# 1) 查看脚本 stderr 给出的 image_ocr.log(默认在输出目录,如 ./.rag/text/url/.image_ocr.log) + +# 2) 若能用系统 OCR,安装 tesseract 后强制使用 +sudo apt install -y tesseract-ocr tesseract-ocr-eng +bash script/rag/url-to-text.sh --url "https://example.com" --ocr-images --ocr-engine tesseract --ocr-lang eng + +# 3) 若必须用 docling 默认 OCR,则在可联网机器先完成一次图片 OCR 预热, +# 再把相关缓存目录复制到内网机器(例如 ~/.cache/rapidocr、~/.cache/docling) +``` + +## 5. 
批量目录转 text + +把资料目录递归转换成文本,并保持子目录结构: + +```bash +bash script/rag/convert-dir-to-text.sh \ + --input /data/rag/raw \ + --output /data/rag/text +``` + +默认处理扩展名: + +`pdf docx pptx html htm md txt csv xls xlsx xml` + +自定义扩展名: + +```bash +bash script/rag/convert-dir-to-text.sh \ + --input /data/rag/raw \ + --output /data/rag/text \ + --ext "pdf docx html" +``` + +转换日志: + +- 成功清单:`/data/rag/text/_success.log` +- 失败清单:`/data/rag/text/_failed.log` +- 运行日志:`/data/rag/text/_run.log` + +## 6. 内网离线打包与安装(Ubuntu 22.04) + +在可联网机器打包: + +```bash +bash script/rag/build-offline-bundle.sh \ + --out /tmp/rag-offline-bundle \ + --langs "eng chi-sim" \ + --include-llamaindex \ + --include-vectordb +``` + +产物: + +1. 目录:`/tmp/rag-offline-bundle` +2. 压缩包:`/tmp/rag-offline-bundle.tar.gz` + +拷贝到内网目标机后安装: + +```bash +tar -xzf rag-offline-bundle.tar.gz +bash script/rag/install-offline-bundle.sh \ + --bundle ./rag-offline-bundle \ + --venv ./.venv-docling \ + --install-llamaindex \ + --install-vectordb +``` + +## 7. 数据清洗与结构化 + +清洗文本: + +```bash +./.venv-docling/bin/python script/rag/clean-text.py \ + --input .rag/text/url/.txt \ + --output .rag/text/url/.clean.txt +``` + +结构化输出(规则模式): + +```bash +./.venv-docling/bin/python script/rag/structure-text.py \ + --text .rag/text/url/.clean.txt \ + --images .rag/text/url/.images.json \ + --output .rag/text/url/.structured.json \ + --source-url "https://example.com" \ + --mode rule \ + --inline-ocr strip +``` + +结构化输出(LlamaIndex): + +```bash +export OPENAI_API_KEY=... +./.venv-docling/bin/python script/rag/structure-text.py \ + --text .rag/text/url/.clean.txt \ + --images .rag/text/url/.images.json \ + --output .rag/text/url/.structured.json \ + --source-url "https://example.com" \ + --mode llamaindex \ + --model gpt-4o-mini +``` + +结构化结果包含: + +1. `sections`:章节级标题、摘要、正文、关联图片 metadata +2. `chunks`:可直接喂 embedding 的分块 + `image_ids` + 来源 metadata + +## 8. 
备用离线方式(wheelhouse 手工流程) + +若内网机器不能直接访问公网,建议在可联网机器提前准备 wheel 包: + +```bash +mkdir -p /tmp/docling-wheelhouse +python3 -m venv /tmp/docling-venv +/tmp/docling-venv/bin/python -m pip install -U pip +/tmp/docling-venv/bin/pip download -r script/rag/requirements-docling.txt -d /tmp/docling-wheelhouse +tar -C /tmp -czf docling-wheelhouse.tar.gz docling-wheelhouse +``` + +将 `docling-wheelhouse.tar.gz` 拷贝到内网机器后: + +```bash +tar -xzf docling-wheelhouse.tar.gz +python3 -m venv .venv-docling +.venv-docling/bin/python -m pip install -U pip +.venv-docling/bin/pip install --no-index --find-links ./docling-wheelhouse -r script/rag/requirements-docling.txt +``` + +## 9. 最小验收 + +```bash +./.venv-docling/bin/docling --version +bash script/rag/url-to-text.sh --url "https://example.com" +``` + +满足以下条件即通过: + +1. `docling --version` 正常返回版本信息 +2. URL 转换命令输出一个 `.txt` 文件路径 +3. 对应 `.txt` 文件可读取并包含页面正文 + +## 10. 向量库落地(Qdrant 本地持久化 + Ollama Embedding) + +安装向量依赖: + +```bash +bash script/rag/install-vector.sh +``` + +准备 Ollama embedding 模型(建议): + +```bash +ollama pull nomic-embed-text +``` + +设置 OpenAI 兼容环境变量(Ollama): + +```bash +export OPENAI_BASE_URL="http://127.0.0.1:11434/v1" +export OPENAI_API_KEY="ollama" +``` + +构建向量索引(单文件): + +```bash +./.venv-docling/bin/python script/rag/build-vector-index.py \ + --input .rag/text/url/.structured.json \ + --db-path .rag/vector/qdrant \ + --collection rag_chunks \ + --model nomic-embed-text \ + --recreate +``` + +构建向量索引(目录批量): + +```bash +./.venv-docling/bin/python script/rag/build-vector-index.py \ + --input-dir .rag/text/url \ + --glob "*.structured.json" \ + --db-path .rag/vector/qdrant \ + --collection rag_chunks \ + --model nomic-embed-text +``` + +检索验证: + +```bash +./.venv-docling/bin/python script/rag/search-vector-index.py \ + --query "如何刷写镜像到 Luckfox Pico Zero" \ + --db-path .rag/vector/qdrant \ + --collection rag_chunks \ + --model nomic-embed-text \ + --top-k 5 +``` + +向量脚本产物说明: + +1. 向量库目录:`.rag/vector/qdrant` +2. 集合名:默认 `rag_chunks` +3. 
每条向量 payload 包含:`node_type(text/image)`、`chunk_id`、`section_title`、`source_url`、`image_ids`、`text` + +## 11. OpenCode 注入 RAG 上下文 + +已提供两种接入方式: + +1. 自定义工具:`.opencode/tool/rag_search.ts`(手动调用) +2. 自动注入插件:`.opencode/plugins/rag_context.ts`(每轮用户消息前自动检索 top-k 注入 ``) + +建议环境变量: + +```bash +export OPENAI_BASE_URL="http://192.168.0.99:11434/v1" +export OPENAI_API_KEY="ollama" +export RAG_STRUCT_MODE="llamaindex" +export RAG_STRUCT_MODEL="gpt-4o-mini" +export RAG_EMBED_MODEL="qwen3-embedding:4b" +export RAG_COLLECTION="rag_chunks" +export RAG_TOP_K=4 +export RAG_CONTEXT_HITS=2 +export RAG_CONTEXT_CHARS=120 +export RAG_AUTO_INJECT=1 +``` + +关闭自动注入: + +```bash +export RAG_AUTO_INJECT=0 +``` + +可选调试(排查“是否注入成功”): + +```bash +export RAG_DEBUG_LOG=1 +``` + +插件会写入:`.rag/log/rag_context.log` + +可选覆盖(当 OpenAI 兼容地址或密钥与默认环境不同): + +```bash +export RAG_BASE_URL="http://192.168.0.99:11434/v1" +export RAG_API_KEY="ollama" +``` + +## 12. Agent 一键编排(Skill) + +已新增技能文件:`.opencode/skills/rag-pipeline/SKILL.md` + +建议通过统一入口命令执行: + +初始化(首建): + +```bash +bash script/rag/cmd/rag-init.sh --source structured --scan-dir .rag/text --glob "**/*.structured.json" --embed-model qwen3-embedding:4b --collection rag_chunks +``` + +增量更新: + +```bash +bash script/rag/cmd/rag-update.sh --source structured --scan-dir .rag/text --glob "**/*.structured.json" --embed-model qwen3-embedding:4b --collection rag_chunks +``` + +该流程会维护 manifest(默认 `.rag/state/manifest.json`)用于判断: + +1. `changed`:内容 hash 变化,执行“先删旧 doc_key,再 upsert 新向量” +2. `removed`:文件消失,执行按 doc_key 删除 +3. embedding 模型或 collection 变化,自动触发全量重建 + +建议只暴露这些高层选项给用户: + +1. `--source` +2. `--struct-mode`/`--struct-model` +3. `--embed-model` +4. 数据来源参数(`--url`/`--url-file`/`--input-dir`/`--scan-dir`) +5. `--collection` + +其余算法细节(chunk、重试、OCR 引擎细节)默认不暴露。 + +## 13. 迁移到其他项目 + +在当前仓库执行: + +```bash +bash script/rag/cmd/rag-bootstrap.sh --target /path/to/target-project +``` + +默认会复制: + +1. `script/rag/*`(安装、转换、结构化、索引、检索、init/update) +2. `.opencode/tool/rag_search.*` +3. 
`.opencode/plugins/rag_context.ts` +4. `.opencode/skills/rag-pipeline/SKILL.md` + +目标项目里继续执行: + +```bash +cd /path/to/target-project +bash script/rag/install-docling.sh +bash script/rag/install-vector.sh +bash script/rag/cmd/rag-init.sh --help +``` diff --git a/specs/rag-enhance-architecture.zh.md b/specs/rag-enhance-architecture.zh.md new file mode 100644 index 00000000000..261f5a851df --- /dev/null +++ b/specs/rag-enhance-architecture.zh.md @@ -0,0 +1,266 @@ +# RAG Enhance 架构设计说明(rag-enhance) + +## 1. 目标与设计原则 + +### 1.1 目标 + +1. 在 OpenCode 对话中提供稳定的本地 RAG 能力(内网可部署) +2. 降低重复检索与重复注入导致的推理循环 +3. 控制上下文窗口占用,优先增量披露 +4. 提供可观测调试手段,便于快速定位问题 + +### 1.2 原则 + +1. 优先改插件与脚本,不侵入 opencode core +2. 结构化协议先行(`` + ``) +3. 去重与增量优先于硬编码“单次限制” +4. 参数可配置,默认值保守 + +## 2. 总体架构 + +### 2.1 模块分层 + +1. 数据准备层:`script/rag/url-to-text.sh`、`convert-dir-to-text.sh`、`clean-text.py` +2. 结构化层:`script/rag/structure-text.py`(rule/llamaindex) +3. 向量索引层:`script/rag/build-vector-index.py` + Qdrant local +4. 检索层:`script/rag/search-vector-index.py` +5. 编排层:`script/rag/rag-pipeline.py` + `cmd/rag-init.sh`/`cmd/rag-update.sh` +6. 交互层: + - 自动注入插件:`.opencode/plugins/rag_context.ts` + - 手动工具:`.opencode/tool/rag_search.ts` + - 共享状态模块:`.opencode/rag.ts` + +### 2.2 运行路径 + +1. 离线/内网数据进入文本化 +2. 文本结构化为 section/chunk/image 节点 +3. embedding 写入 Qdrant(payload 包含 source、section、doc_key 等) +4. 对话时:插件读取检索结果并注入状态 meta +5. 长 query 会先执行 rewrite + multi-query retrieval + merge/rerank +6. 模型必要时再调用 `rag_search(mode=state|delta|brief|expand)` 渐进补证据 + +## 3. 文档处理与切分策略 + +### 3.1 当前切分策略 + +1. 按 Markdown 标题(`#`)拆 section +2. section 内按固定窗口切 chunk(默认 `chunk_size=1600`, `chunk_overlap=200`) +3. 图片 OCR 独立为 image node,避免污染正文 chunk + +说明:当前不是句法感知切分,`overlap` 用于缓解边界截断,但不能完全消除语义断裂。 + +### 3.2 结构化与 LLM + +1. `structure-text.py` 直接执行时默认 `mode=rule` +2. `rag-pipeline.py` 默认 `RAG_STRUCT_MODE=llamaindex` +3. llamaindex 模式下调用 OpenAI 兼容接口做 section summary + +## 4. 检索交互协议(RAG-LLM) + +### 4.1 注入块 + +插件当前向用户消息注入一个主逻辑块: + +1. 
`<rag_state>`:检索状态协议(短)
+
+说明:正文证据当前主要通过 `rag_search` 渐进披露,不再由自动注入直接提供。
+
+示例:
+
+```text
+<rag_state>
+status=no_new_evidence
+reason=high_overlap
+cluster=luckfox|zero|烧录
+delta_hits=0
+known_hits=3
+next_action=reuse_known_evidence_or_refine_query
+</rag_state>
+```
+
+### 4.2 status 枚举
+
+1. `new_evidence`
+2. `no_new_evidence`
+3. `weak_match`
+4. `need_refine`
+5. `cluster_throttled`
+6. `retrieval_error`
+7. `state_reset`
+
+### 4.3 reason 典型值
+
+1. `fresh_hits`
+2. `delta_available`
+3. `high_overlap`
+4. `low_score`
+5. `empty_hits`
+6. `cluster_window_limit`
+7. `backend_error`
+8. `parse_error`
+9. `compaction_epoch_changed`
+10. `cached_recent_result`
+
+## 5. 去重、增量与局部限流
+
+### 5.1 Query Cluster
+
+`query_cluster` 为“检索意图簇”,由 query 规范化词项生成(停用词过滤+同义词归一+排序)。
+
+用途:
+
+1. 将近义 query 归为同簇
+2. 对同簇做局部预算与节流
+3. 避免全局限流误伤其他主题
+
+### 5.2 重复检测
+
+1. 命中 fingerprint:`text_file/source + chunk_id/image_id/section`
+2. overlap = 交集 / 当前命中数
+3. `overlap >= RAG_OVERLAP_THRESHOLD` 且无新增时,标记 `no_new_evidence`
+
+### 5.3 增量注入
+
+1. 仅注入“未见过”的 delta hits
+2. 无 delta 时只注入 `<rag_state>`,不重复注入上下文正文
+3. 同 query 的短时间重复触发走缓存复用(`RAG_REUSE_SEC`)
+
+### 5.4 局部限流
+
+1. 仅针对同一 cluster
+2. 时间窗:`RAG_CLUSTER_WINDOW_SEC`
+3. 上限:`RAG_CLUSTER_MAX_FULL`
+4. 超限状态:`cluster_throttled`
+
+## 6. 渐进式披露
+
+`rag_search` 支持模式:
+
+1. `state`:只返回检索状态
+2. `delta`:同 query cluster 仅新增证据(默认)
+3. `brief`:当前命中的短摘要
+4. `expand`:扩展细节(用于二次追问)
+
+策略:
+
+1. 默认由插件持续注入 `rag_state`
+2. 模型需要证据时优先 `delta`
+3. `brief`/`expand` 仅在需要更多正文时使用
+
+## 7. 会话生命周期与 compact
+
+### 7.1 loop 触发
+
+OpenCode loop 每步都会触发 `experimental.chat.messages.transform`,因此插件必须具备状态机去重能力。
+
+### 7.2 compaction 重置
+
+插件实现 `experimental.session.compacting`:
+
+1. session `epoch + 1`
+2. 清空 seen hit 与 cluster 窗口
+3. 标记 `state_reset`
+
+目的:防止 compaction 后继续引用旧上下文状态。
+
+## 8. 配置参数
+
+### 8.1 基础连接
+
+1. `OPENAI_BASE_URL` / `OPENAI_API_KEY`
+2. `RAG_BASE_URL` / `RAG_API_KEY`(覆盖)
+3. `RAG_WORKTREE`
+4. `RAG_DOCLING_PYTHON_BIN`
+5. `RAG_DB_PATH`
+
+### 8.2 检索与注入
+
+1. `RAG_TOP_K`(默认 4)
+2. 
`RAG_CONTEXT_HITS`(默认 2) +3. `RAG_CONTEXT_CHARS`(默认 120) +4. `RAG_EXPAND_CHARS`(默认 420) +5. `RAG_REWRITE_MODE`(默认 `auto`) +6. `RAG_REWRITE_MODEL` +7. `RAG_REWRITE_QUERIES`(默认 3) + +### 8.3 控制与阈值 + +1. `RAG_AUTO_INJECT`(`0` 关闭) +2. `RAG_OVERLAP_THRESHOLD`(默认 0.8) +3. `RAG_WEAK_SCORE`(默认 0.42) +4. `RAG_CLUSTER_WINDOW_SEC`(默认 30) +5. `RAG_CLUSTER_MAX_FULL`(默认 2) +6. `RAG_REUSE_SEC`(默认 8) + +### 8.4 调试 + +1. `RAG_DEBUG=1` 或 `RAG_DEBUG_LOG=1` +2. 日志:`.rag/log/rag_debug.jsonl` +3. 查看:`script/rag/debug-rag-state.py --tail 100` + +## 9. 典型问题与解决方案 + +### 9.1 问题:循环检索与重复思考 + +原因:loop 多步触发 + 命中不充分 + 无状态去重。 + +解决: + +1. `query_cluster` 局部限流 +2. overlap 去重 +3. delta 注入 +4. cache reuse + +### 9.2 问题:TUI 回显过多 + +原因:工具多轮调用 + 大块文本注入。 + +解决: + +1. 默认 `brief` +2. `RAG_CONTEXT_HITS` 降低 +3. 强制“禁止 dump 原始 JSON/rag_context”系统提示 +4. 必要时仅保留 plugin,禁用显式 `rag_search` + +### 9.3 问题:手工命令成功但插件失败 + +常见:worktree 识别为 `/`。 + +解决: + +1. 显式配置 `RAG_WORKTREE` +2. 显式配置 `RAG_DOCLING_PYTHON_BIN` +3. 显式配置 `RAG_DB_PATH` + +### 9.4 问题:compaction 后行为异常 + +原因:检索状态与压缩后消息不一致。 + +解决: + +1. 在 `experimental.session.compacting` 事件重置 RAG 状态 + +## 10. 运维与回归检查清单 + +1. 检索可用:`search-vector-index.py` 手工命令返回 hits +2. 集合存在:Qdrant `rag_chunks` 可见 +3. 插件注入:日志出现 `event=inject` +4. 无新增命中:出现 `status=no_new_evidence` +5. 局部限流触发:出现 `event=cluster_throttled` +6. compact 后:出现 `event=state_reset` + +## 11. 代码锚点(便于回溯) + +1. 自动注入状态机:`.opencode/plugins/rag_context.ts` +2. 工具渐进披露:`.opencode/tool/rag_search.ts` +3. 调试脚本:`script/rag/debug-rag-state.py` +4. 结构化切分:`script/rag/structure-text.py` +5. 编排入口:`script/rag/rag-pipeline.py` + +## 12. 后续可演进方向 + +1. 语义切分(句法/段落边界)替代纯字符窗口 +2. query cluster 从词法升级到 embedding 聚类 +3. reranker 引入(重排 top-k) +4. `expand` 模式支持按 `chunk_id` 精确拉取 +5. 将状态机下沉到独立模块,支持单元测试 diff --git a/specs/rag-llm-prompt-protocol.zh.md b/specs/rag-llm-prompt-protocol.zh.md new file mode 100644 index 00000000000..d5f5e867f20 --- /dev/null +++ b/specs/rag-llm-prompt-protocol.zh.md @@ -0,0 +1,309 @@ +# RAG 输出给 LLM 的当前协议 + +## 1. 
范围 + +这份文档只描述当前代码里真正输出给 LLM 的内容,不描述 debug 日志,也不描述理想设计。 + +当前协议由三部分组成: + +1. 自动注入的 `` +2. 系统提示里的 RAG 协议说明 +3. `rag_search` 工具定义与工具返回 + +相关实现文件: + +1. `.opencode/rag.ts` +2. `.opencode/plugins/rag_context.ts` +3. `.opencode/tool/rag_search.ts` + +## 2. 自动注入块 + +### 2.1 注入位置 + +`rag_context` 会在 `experimental.chat.messages.transform` 阶段,把 `` 注入到当前最新的 user text 中。 + +当前默认行为: + +1. 自动注入只注入检索 meta +2. 不自动注入正文 `` +3. 正文证据主要由 `rag_search` 按需补充 + +### 2.2 当前字段 + +当前注入给 LLM 的 `` 字段来自 `.opencode/rag.ts` 的 `stateBlock()`: + +```text + +status=... +reason=... +cluster=... +total_hits=... +delta_hits=... +known_hits=... +overlap=... +top_source=... +top_section=... +rewrite_queries=... +next_action=... + +``` + +字段含义: + +1. `status` + 当前 `session + cluster` 最近一次有效检索状态 +2. `reason` + 对应状态的原因 +3. `cluster` + 当前 query 归一化后的检索意图簇 +4. `total_hits` + 当前最近一次检索返回的总命中数 +5. `delta_hits` + 相对当前 cluster 已知证据,本轮新增命中数 +6. `known_hits` + 当前 cluster 已记录的累计命中数 +7. `overlap` + 本轮结果和已知命中的重合比例 +8. `top_source` + 当前 top hit 的来源 URL +9. `top_section` + 当前 top hit 的 section 标题 +10. `rewrite_queries` + 当前底层检索实际使用的 rewrite query 列表 +11. `next_action` + 给 LLM 的下一步建议动作 + +### 2.3 当前不输出给 LLM 的字段 + +下面这些字段当前只写入 debug 日志,不直接注入给 LLM: + +1. `event` +2. `channel` +3. `loop` +4. `used_cache` +5. `rewrite_mode` +6. `keywords` +7. `top_hits` +8. `delta_fps` +9. `emitted_context` + +因此,LLM 不会直接看到“这一步是 `context_search` 还是 `context_meta`”,也不会直接看到完整 hit 列表。 + +## 3. 系统提示协议 + +`rag_context` 还会在 `experimental.chat.system.transform` 中追加 RAG 协议说明。 + +当前系统提示的核心约束是: + +1. 每一步先解析 `` +2. `rag_context` 只注入 retrieval meta,不注入正文 +3. 如果 `status=new_evidence` 且仍需要事实细节,优先调用 `rag_search mode=delta` +4. 如果 `status=no_new_evidence`,优先复用当前状态,不要重复检索 +5. 普通问答不要调用 `mode=expand` +6. 不要直接通过 shell 执行 `script/rag/search-vector-index.py` 做问答检索 +7. 调用 `rag_search` 时,参数必须是合法 JSON +8. 对于长 query 或噪声 query,优先信任 rewrite 后的检索结果 + +这部分不是结构化字段,而是对 LLM 的操作协议说明。 + +## 4. `rag_search` 工具协议 + +### 4.1 工具入参 + +当前 `rag_search` 暴露给 LLM 的主要入参是: + +1. 
`query` +2. `top_k` +3. `node_type` +4. `mode` + +其中: + +1. `query` 是普通字符串 +2. `top_k` 是返回条数 +3. `node_type` 目前主要是 `text` 或 `image` +4. `mode` 控制渐进式披露层级 + +### 4.2 工具模式 + +当前支持的模式: + +1. `state` +2. `delta` +3. `brief` +4. `expand` + +推荐顺序: + +1. `state` +2. `delta` +3. `brief` +4. `expand` + +默认约束: + +1. 普通 QA 下优先 `delta` +2. `expand` 默认受限,仅用于调试或显式证据展开 + +### 4.3 工具返回 + +`rag_search` 的返回不是原始 JSON,而是给 LLM 的文本协议。 + +当前工具返回的第一部分始终是: + +1. `` + +然后按 `mode` 决定是否追加正文: + +1. `state` + 只返回 `` +2. `delta` + 返回 `` + 本轮新增命中的短摘要 +3. `brief` + 返回 `` + 当前命中的短摘要 +4. `expand` + 返回 `` + 更长文本 + +### 4.4 摘要格式 + +`brief` 和 `delta` 当前使用 `.opencode/rag.ts` 里的 `brief()` 生成,格式类似: + +```text +[1] source=... section=... summary=... +[2] source=... section=... summary=... +``` + +`expand` 当前使用 `.opencode/rag.ts` 里的 `expand()`,会给更长的 `score/source/section/text`。 + +## 5. LLM 实际看到的内容 + +从 prompt 协议角度看,LLM 当前会看到三类信息: + +1. 用户原始问题 +2. 自动注入的 `` +3. 系统提示里的 RAG 使用规则 + +如果模型主动调用 `rag_search`,还会额外看到: + +1. 工具参数 schema +2. 工具返回的 `` +3. 工具返回的摘要或扩展正文 + +因此当前架构下: + +1. 自动注入负责给状态 +2. 工具调用负责给正文 + +## 6. 当前典型工作流 + +### 6.1 自动注入阶段 + +模型先看到: + +```text +用户问题 + + +status=new_evidence +reason=fresh_hits +cluster=luckfox|文件传输 +total_hits=4 +delta_hits=4 +known_hits=4 +overlap=0.0000 +top_source=https://wiki.luckfox.com/... +top_section=ADB 传输文件 +rewrite_queries=["Luckfox Pico Zero 文件传输","adb 文件传输"] +next_action=call_rag_search_delta_if_more_detail_needed + +``` + +这时模型应该先基于状态判断: + +1. 是否已有足够信息直接回答 +2. 是否需要调用 `rag_search mode=delta` +3. 是否应该缩小或改写 query + +### 6.2 工具补充阶段 + +如果模型调用: + +```json +{"query":"Luckfox Pico Zero 文件传输方式","mode":"delta","node_type":"text","top_k":4} +``` + +它会看到类似返回: + +```text + +status=new_evidence +reason=delta_available +cluster=luckfox|文件传输方式 +total_hits=4 +delta_hits=2 +known_hits=6 +overlap=0.5000 +top_source=https://wiki.luckfox.com/... 
+top_section=ADB 传输文件 +rewrite_queries=["Luckfox Pico Zero 文件传输方式","adb push pull 文件传输"] +next_action=call_rag_search_delta_if_more_detail_needed + +[1] source=https://wiki.luckfox.com/... section=ADB 传输文件 summary=... +[2] source=https://wiki.luckfox.com/... section=SCP 传输文件 summary=... +``` + +这时模型拿到的就不只是状态,还有正文摘要。 + +## 7. 当前语义边界 + +### 7.1 `status` 的语义 + +当前 `.status` 表示: + +1. 当前 `session + cluster` 最近一次有效检索结果的状态 + +它不等价于: + +1. “当前这一个 loop step 刚刚重新搜索得到的新状态” + +因此,如果当前 step 只是复用了缓存状态,LLM 看到的 `status=new_evidence`,实际语义更接近: + +1. 当前 cluster 的已知状态是 `new_evidence` + +而不是: + +1. 本 step 又重新找到了新证据 + +### 7.2 `next_action` 的语义 + +`next_action` 是建议,不是硬约束。 + +LLM 仍然可以: + +1. 直接回答 +2. 选择更具体的 query +3. 调 `rag_search` +4. 放弃继续检索 + +但系统提示已经对推荐行为做了收敛。 + +## 8. 当前已知限制 + +1. `event/context_meta/context_search` 只在 debug 日志里,LLM 不可见 +2. LLM 不能直接看到完整命中列表,除非主动调用 `rag_search` +3. `status` 当前更接近 cluster 持久状态,不是严格的 step 状态 +4. 自动注入与工具调用虽然共享状态,但 query cluster 仍可能因为 agent rewrite 而不同 + +## 9. 结论 + +当前真正输出给 LLM 的协议可以概括为: + +1. 自动注入 `` 提供检索 meta +2. 系统提示解释如何使用这些 meta +3. `rag_search` 提供分层的正文证据披露 + +因此,当前系统不是“自动把所有 RAG 内容都塞进 prompt”,而是: + +1. 先给状态 +2. 再由模型按需索取正文 + diff --git a/specs/rag-progressive-disclosure.zh.md b/specs/rag-progressive-disclosure.zh.md new file mode 100644 index 00000000000..0931a86568f --- /dev/null +++ b/specs/rag-progressive-disclosure.zh.md @@ -0,0 +1,365 @@ +# RAG 渐进式披露当前实现说明 + +## 1. 范围 + +这份文档描述当前代码里的真实实现,不是理想设计。 + +当前“渐进式披露”系统由三部分组成: + +1. 自动注入:`.opencode/plugins/rag_context.ts` +2. 显式检索工具:`.opencode/tool/rag_search.ts` +3. 共享状态与公共逻辑:`.opencode/rag.ts` + +底层检索脚本仍然是: + +1. `script/rag/search-vector-index.py` + +## 2. 当前目标 + +当前实现要解决的是: + +1. 在 ReAct 式 loop 中持续给模型提供检索状态 +2. 不在每一轮 loop 中重复注入相同正文 +3. 把 `rag_context` 和 `rag_search` 统一为同一套渐进式披露系统 +4. 提供可追踪的 JSONL 调试日志 + +## 3. 当前架构 + +### 3.1 自动注入链路 + +`rag_context` 当前只负责注入检索 meta 信息,不再自动注入正文摘要。 + +它每次在 `experimental.chat.messages.transform` 被调用时会: + +1. 找到当前会话里最新的 user text +2. 去掉旧的 `` / `` +3. 生成 query cluster +4. 
查询共享状态 +5. 必要时调用底层检索脚本 +6. 只把 `` 注回用户消息 + +这意味着: + +1. 模型在 loop 中每一步都能看到当前的 RAG 状态 +2. 是否继续调 `rag_search`,由模型自己判断 + +### 3.2 显式工具链路 + +`rag_search` 当前负责渐进式补充证据。 + +支持模式: + +1. `state` +2. `delta` +3. `brief` +4. `expand` + +推荐顺序: + +1. `state` +2. `delta` +3. `brief` +4. `expand` + +其中: + +1. `state` 只返回状态 +2. `delta` 只返回新增证据 +3. `brief` 返回短摘要 +4. `expand` 返回扩展文本,默认受限 + +### 3.3 共享状态 + +自动注入和显式工具现在都使用同一个共享状态模块: + +1. `.opencode/rag.ts` + +共享状态粒度是: + +1. `session` +2. `cluster` + +每个 cluster 当前维护的信息包括: + +1. `seen` +2. `window` +3. `last_query` +4. `last_status` +5. `last_reason` +6. `last_checked` +7. `total_hits` +8. `known_hits` +9. `overlap` +10. `delta` +11. `hits` +12. `top` +13. `rewrites` + +因此当前 `rag_context` 和 `rag_search` 已经不是两套独立状态机,而是同一状态系统的两个入口。 + +## 4. 自动注入的当前规则 + +### 4.1 注入内容 + +自动注入当前只注入: + +1. `` + +不再自动注入正文 ``。 + +这样做的目的: + +1. 让模型在每一步都能看到检索状态 +2. 把正文披露权交给 `rag_search` +3. 避免 loop 中重复刷证据文本 + +### 4.2 何时触发 + +自动注入不是只在“用户第一次提问”时触发。 + +当前实现里,只要: + +1. `experimental.chat.messages.transform` 被调用 +2. 最新 user text 还存在 + +插件就会再次运行。 + +区别在于: + +1. 首次进入当前 query 时,通常会实际检索 +2. 后续 loop 更常见的是复用共享状态,只重新注入 `` + +### 4.3 缓存与复用 + +自动注入会优先复用共享状态,条件包括: + +1. 同一 user query +2. 同一 cluster +3. 在 `RAG_REUSE_SEC` 时间窗内 +4. 或已经进入 assistant loop 阶段 + +如果命中缓存,插件不会重新检索,而是直接注入当前 cluster 的状态。 + +### 4.4 局部限流 + +每个 cluster 单独维护时间窗: + +1. `RAG_CLUSTER_WINDOW_SEC` +2. `RAG_CLUSTER_MAX_FULL` + +超过上限后,状态会变成: + +1. `cluster_throttled` + +## 5. 当前状态机 + +当前状态枚举: + +1. `new_evidence` +2. `no_new_evidence` +3. `weak_match` +4. `need_refine` +5. `cluster_throttled` +6. `retrieval_error` +7. `state_reset` + +典型 reason: + +1. `fresh_hits` +2. `delta_available` +3. `high_overlap` +4. `low_score` +5. `empty_hits` +6. `cluster_window_limit` +7. `backend_error` +8. `parse_error` +9. `cached_recent_result` +10. `compaction_epoch_changed` + +## 6. 什么叫“渐进式披露” + +### 6.1 自动注入侧 + +自动注入侧的渐进式披露体现在: + +1. 首轮只建立状态并记录 hits +2. 后续 loop 主要复用状态 +3. 自动注入不再负责正文披露 + +换句话说,当前自动注入承担的是: + +1. 
渐进提供 meta + +而不是: + +1. 渐进提供正文 + +### 6.2 工具侧 + +显式工具侧的渐进式披露体现在: + +1. `state` 只给状态 +2. `delta` 只给新增证据 +3. `brief` 给短摘要 +4. `expand` 给更多文本 + +这才是当前正文证据的主要披露链路。 + +## 7. Query Cluster + +当前 cluster 生成方式: + +1. query 小写化 +2. 中英文词项切分 +3. 去停用词 +4. 同义词归一 +5. 排序拼接 + +作用: + +1. 把近义问题归到同一局部检索意图 +2. 支持同 cluster 去重 +3. 支持同 cluster 限流 + +## 8. 底层检索脚本的当前角色 + +`search-vector-index.py` 仍然只负责: + +1. embedding query +2. 检索向量库 +3. 返回 hits + +当前输出格式支持: + +1. `json` +2. `state` +3. `brief` +4. `auto` + +当前约束: + +1. `rag_context` 强制 `--format json` +2. `rag_search` 也强制 `--format json` +3. 只有 shell 直接运行脚本时,`OPENCODE=1` 下默认输出 `state` + +这样做是为了: + +1. 插件和工具都自己控制披露层级 +2. 终端里不要直接泄漏 hits 正文 + +### 8.1 当前 rewrite 与 multi-query 检索 + +当前底层检索脚本已经支持: + +1. LLM query rewrite +2. multi-query retrieval +3. merge 去重 +4. simple rerank + +流程如下: + +1. 原始 query 输入 +2. LLM 产出 `queries` 和 `keywords` +3. 每个 rewrite query 单独向量检索 +4. 多路结果按 fingerprint merge +5. 用简单规则做 rerank +6. 输出最终 `top_k` + +当前 rerank 不是独立 reranker 模型,而是规则组合: + +1. `max_score` +2. `reciprocal_rank` +3. `hit_count` +4. `primary_match` + +## 9. 调试日志 + +### 9.1 日志文件 + +当前统一日志: + +1. `.rag/log/rag_debug.jsonl` + +### 9.2 当前记录的链路 + +现在会同时记录: + +1. `rag_context` +2. `rag_search` + +通过字段区分: + +1. `channel` +2. `event` + +### 9.3 当前重点字段 + +当前日志里重点字段包括: + +1. `channel` +2. `event` +3. `sessionID` +4. `query` +5. `cluster` +6. `mode` +7. `loop` +8. `used_cache` +9. `status` +10. `reason` +11. `total_hits` +12. `delta_hits` +13. `known_hits` +14. `overlap` +15. `rewrites` +16. `keywords` +17. `rewrite_mode` +18. `top_hits` +19. `delta_fps` +20. `emitted_context` + +### 9.4 当前怎么判断渐进式披露生效 + +看同一 `sessionID + cluster` 的连续日志: + +1. 首次检索: + - `status=new_evidence` + - `delta_hits>0` +2. 后续 loop: + - `channel=rag_context` + - `event=context_meta` + - `used_cache=true` +3. 后续主动补证据: + - `channel=rag_search` + - `event=tool_search` + - `mode=delta|brief|expand` + +这说明当前系统是在“先提供状态,再按需补正文”。 + +## 10. 终端与 TUI 控制 + +当前实现已经做了三层控制: + +1. 检索子进程使用 `.quiet()` +2. 
shell 直接跑脚本时默认只输出 `state` +3. `expand` 默认受限 + +当前目标不是完全隐藏检索,而是: + +1. 不让底层脚本 stdout 直接污染终端 +2. 不让自动注入链路在 loop 中刷大段正文 + +## 11. 当前限制 + +1. 自动注入只提供 meta,不提供正文,需要模型自行决定是否调 `rag_search` +2. 还没有 decomposition +3. 当前 rerank 还是简单规则,不是专门 reranker 模型 +4. debug 已能看到 top hits 和 delta 指纹,但还没有记录 assistant reasoning 原文 +5. 多模态 embedding 还未接入当前渐进披露链路 + +## 12. 关键代码锚点 + +1. 共享状态:`.opencode/rag.ts` +2. 自动注入:`.opencode/plugins/rag_context.ts` +3. 渐进检索工具:`.opencode/tool/rag_search.ts` +4. 底层检索:`script/rag/search-vector-index.py` +5. 调试查看:`script/rag/debug-rag-state.py` diff --git a/specs/rag-updates-history.zh.md b/specs/rag-updates-history.zh.md new file mode 100644 index 00000000000..de35f2ca049 --- /dev/null +++ b/specs/rag-updates-history.zh.md @@ -0,0 +1,226 @@ +# RAG Enhance 变更回溯记录 + +## 1. 目的 + +这份文档用于记录本分支上 RAG 增强相关的关键演进,方便后续回溯问题来源、定位设计变更和重新部署时核对差异。 + +## 2. 第一阶段:基础 RAG 流水线落地 + +这一阶段完成了基础数据链路: + +1. 文档转文本 +2. 文本清洗 +3. 结构化输出 +4. embedding 落库 +5. 本地向量检索 + +主要脚本: + +1. `script/rag/url-to-text.sh` +2. `script/rag/convert-dir-to-text.sh` +3. `script/rag/clean-text.py` +4. `script/rag/structure-text.py` +5. `script/rag/build-vector-index.py` +6. `script/rag/search-vector-index.py` + +## 3. 第二阶段:OpenCode 插件化接入 + +这一阶段引入了 OpenCode 集成层: + +1. 自动注入插件:`.opencode/plugins/rag_context.ts` +2. 手动工具:`.opencode/tool/rag_search.ts` +3. skill:`.opencode/skills/rag-pipeline/SKILL.md` + +目标是: + +1. 让 agent 在对话中可使用本地 RAG +2. 支持插件迁移到其他项目 +3. 用 `rag-bootstrap.sh` / `install.sh` 完成交付 + +## 4. 第三阶段:图片 OCR 与结构化关联 + +这一阶段处理了图片与正文的关联问题: + +1. 图片 OCR 从纯追加文本改成与 image node 关联 +2. 结构化输出中保留 image metadata +3. 向量检索命中正文时,可挂出 `related_images` + +目标是: + +1. 不直接污染正文 section +2. 在命中 chunk 时仍然能关联图片信息 + +## 5. 第四阶段:初版渐进式披露 + +这一阶段第一次引入: + +1. `` +2. `` +3. overlap 去重 +4. cluster 局部限流 +5. debug 日志 + +初版实现特点: + +1. 自动注入会注入状态和正文摘要 +2. `rag_search` 自己维护一套独立状态 +3. debug 主要看状态,证据可见性较弱 + +当时解决的问题: + +1. 检索循环 +2. 重复注入 +3. context 窗口浪费 + +## 6. 第五阶段:终端/TUI 回显治理 + +这一阶段重点修了“检索输出污染终端/TUI”的问题。 + +核心修复: + +1. 
`rag_search.ts` 和 `rag_context.ts` 调检索脚本时补 `.quiet()` +2. 两条链路都强制 `search-vector-index.py --format json` +3. `search-vector-index.py` 在 `OPENCODE=1` 下默认只输出 `state` +4. `rag_search expand` 默认拦截 + +目标是: + +1. 检索子进程不再把 stdout 直接打印到终端 +2. 工具链路不再因为 parse fail 回退成整段文本回显 + +## 7. 第六阶段:非法 JSON tool args 缓解 + +这一阶段修复了模型调用 `rag_search` 时偶发生成坏 JSON 的问题。 + +核心修复: + +1. 在 `tool.definition` 中补充合法/非法 JSON 示例 +2. 在 system prompt 中明确要求 `query` 必须是单个普通字符串 + +目标是: + +1. 降低模型把 query 引号拼坏的概率 + +注意: + +1. 这类问题是模型生成错误,无法 100% 从代码层彻底消除 + +## 8. 第七阶段:共享状态统一 + +这一阶段把 `rag_context` 和 `rag_search` 统一进同一套共享状态系统。 + +新增文件: + +1. `.opencode/rag.ts` + +统一后: + +1. 两条链路共享 session/cluster 状态 +2. 共享 `seen` +3. 共享 `total_hits / known_hits / overlap` +4. 共享 `top_hits` +5. 共享 `rewrites` + +这一阶段的设计变化很关键: + +1. `rag_context` 不再自动注入正文,只注入检索 meta +2. `rag_search` 成为正文证据的渐进式补充入口 + +## 9. 第八阶段:ReAct loop 对齐 + +这一阶段是为适配 OpenCode 的 ReAct 式 loop。 + +变化点: + +1. `rag_context` 不再只在“第一次用户提问前”工作 +2. 在 loop 中也会再次运行 +3. 但后续更常见的是复用缓存状态,只重复注入 `` + +目标是: + +1. 在推理过程中让模型持续看到当前检索状态 +2. 由模型自行决定是否继续调用 `rag_search` + +## 10. 第九阶段:debug 日志增强 + +这一阶段把 debug 从“状态日志”增强成“过程日志”。 + +现在统一记录到: + +1. `.rag/log/rag_debug.jsonl` + +日志覆盖: + +1. `rag_context` +2. `rag_search` + +主要新增字段: + +1. `channel` +2. `mode` +3. `loop` +4. `used_cache` +5. `top_hits` +6. `delta_fps` +7. `rewrites` +8. `emitted_context` + +目的: + +1. 可追踪每一次状态注入 +2. 可追踪每一次显式检索 +3. 可回溯当前 cluster 的命中情况 + +## 11. 第十阶段:query rewrite 与 multi-query retrieval + +这一阶段在底层检索脚本里加入了: + +1. LLM query rewrite +2. 多 query 独立召回 +3. merge 去重 +4. simple rerank + +当前实现方式: + +1. LLM 输出 `queries` 和 `keywords` +2. 每个 query 单独做 embedding 检索 +3. 按 chunk fingerprint 合并候选 +4. 结合 `max_score / reciprocal_rank / hit_count / primary_match` 做重排 + +目标是: + +1. 降低长 query 的语义噪声 +2. 提高多视角召回能力 +3. 给后续 decomposition 留出接口 + +## 12. 当前结论 + +到当前版本为止,系统已经形成了下面的职责分离: + +1. `rag_context` + - 持续注入 RAG meta + - 在 loop 中复用共享状态 + - 不主动注入正文 + +2. 
`rag_search` + - 按 `state -> delta -> brief -> expand` 渐进补证据 + - 与自动注入共享同一状态 + +3. `debug` + - 统一记录自动注入与显式检索 + - 便于后续对 query、cluster、命中和状态做回放 + +## 13. 仍未完成的方向 + +当前明确还没有完成的方向: + +1. decomposition +2. 专门 reranker +3. assistant reasoning 原文级别的日志追踪 +4. 多模态 embedding 接入当前渐进式披露系统 + +## 14. 对应文档 + +1. 当前实现说明:`specs/rag-progressive-disclosure.zh.md` +2. 总体架构:`specs/rag-enhance-architecture.zh.md` +3. 本回溯文档:`specs/rag-updates-history.zh.md`