diff --git a/.gitignore b/.gitignore
index bf78c046d4b..1efa395e597 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,6 +22,10 @@ a.out
target
.scripts
.direnv/
+.venv-docling/
+.rag/
+__pycache__/
+*.pyc
# Local dev files
opencode-dev
diff --git a/.opencode/plugins/rag_context.ts b/.opencode/plugins/rag_context.ts
new file mode 100644
index 00000000000..7191740c3d5
--- /dev/null
+++ b/.opencode/plugins/rag_context.ts
@@ -0,0 +1,338 @@
+/// <reference types="@opencode-ai/plugin" />
+import type { Plugin } from "@opencode-ai/plugin"
+import path from "path"
+import {
+ allow,
+ allowExpand,
+ audit,
+ base,
+ cluster,
+ collection,
+ decide,
+ key,
+ model,
+ parse,
+ py,
+ reset,
+ rewriteMode,
+ rewriteModel,
+ rewriteQueries,
+ reuseSec,
+ root,
+ row,
+ session,
+ stateBlock,
+ strip,
+ summary,
+ topk,
+ chars,
+ db,
+} from "../rag"
+
+type Msg = {
+ info?: {
+ role?: string
+ id?: string
+ sessionID?: string
+ sessionId?: string
+ }
+ parts?: Array<{
+ type?: string
+ text?: string
+ synthetic?: boolean
+ }>
+}
+
+function sid(msgs: Msg[], idx: number) {
+ const direct = msgs[idx]?.info?.sessionID || msgs[idx]?.info?.sessionId
+ if (direct) return String(direct)
+ for (let i = idx; i >= 0; i--) {
+ const v = msgs[i]?.info?.sessionID || msgs[i]?.info?.sessionId
+ if (v) return String(v)
+ }
+ return "default"
+}
+
+function uid(msgs: Msg[], idx: number) {
+ const v = msgs[idx]?.info?.id
+ if (!v) return ""
+ return String(v)
+}
+
+function next(status: string) {
+ if (status === "new_evidence") return "call_rag_search_delta_if_needed"
+ if (status === "weak_match") return "call_rag_search_delta_or_refine_query"
+ if (status === "no_new_evidence") return "reuse_known_state_or_call_rag_search_state"
+ if (status === "cluster_throttled") return "avoid_repeating_same_search"
+ if (status === "retrieval_error") return "retry_or_check_rag_backend"
+ return "refine_query_or_call_rag_search"
+}
+
+function mark(
+ hit: ReturnType<typeof row>,
+ input: { query: string; status: string; reason: string; total?: number; rewrites?: string[] },
+) {
+ hit.last_query = input.query
+ hit.last_status = input.status
+ hit.last_reason = input.reason
+ hit.last_checked = Date.now()
+ hit.total_hits = input.total || 0
+ hit.delta = []
+ hit.hits = []
+ hit.top = []
+ hit.overlap = 0
+ hit.rewrites = input.rewrites || [input.query]
+}
+
+const RagContextPlugin: Plugin = async ({ worktree, $ }) => {
+ return {
+ "tool.definition": async (input, output) => {
+ if (input.toolID !== "rag_search") return
+ output.description = [
+ output.description,
+ "",
+ "Call this tool with valid JSON arguments only.",
+ 'Use query as a plain string value. Do not insert extra quotes inside the query string.',
+ 'Valid example: {"query":"luckfox-pico zero 传输文件方式","mode":"delta","node_type":"text","top_k":3}',
+ 'Invalid example: {"query":"luck"fox-pico zero","mode":"brief"}',
+ ].join("\n")
+ },
+ "tool.execute.before": async (input, output) => {
+ if (input.tool !== "rag_search") return
+ if (allowExpand()) return
+ if (output.args?.mode !== "expand") return
+ output.args = {
+ ...output.args,
+ mode: "delta",
+ top_k: Math.min(Number(output.args?.top_k || 3), 3),
+ }
+ },
+ "experimental.chat.messages.transform": async (_input, output) => {
+ if (process.env.RAG_AUTO_INJECT === "0") return
+ const msgs = output.messages as Msg[]
+ if (!Array.isArray(msgs) || !msgs.length) return
+ let idx = -1
+ for (let i = msgs.length - 1; i >= 0; i--) {
+ if (msgs[i].info?.role === "user") {
+ idx = i
+ break
+ }
+ }
+ if (idx < 0) return
+ const loop = msgs.slice(idx + 1).some((msg) => msg.info?.role === "assistant")
+ const parts = Array.isArray(msgs[idx].parts) ? msgs[idx].parts : []
+ let textPart: { type?: string; text?: string; synthetic?: boolean } | undefined
+ for (let i = parts.length - 1; i >= 0; i--) {
+ const part = parts[i]
+ if (part?.type === "text" && typeof part.text === "string" && !part.synthetic) {
+ textPart = part
+ break
+ }
+ }
+ if (!textPart?.text) return
+
+ const clean = strip(textPart.text)
+ const query = clean.trim().slice(0, 800)
+ if (!query) return
+
+ const sessionID = sid(msgs, idx)
+ const userID = uid(msgs, idx)
+ const st = session(sessionID)
+ const keyName = cluster(query)
+ const hit = row(st, keyName)
+ const now = Date.now()
+ const baseDir = root(worktree)
+ const python = py(baseDir)
+ const script = path.join(baseDir, "script", "rag", "search-vector-index.py")
+ const dbPath = db(baseDir)
+ const same = st.last_user_id === userID && st.last_query === query && st.last_cluster === keyName
+ const cached = !!hit.last_status && (loop || (same && now - hit.last_checked <= reuseSec() * 1000))
+
+ if (cached) {
+ textPart.text = `${clean}\n\n${stateBlock(keyName, hit, next(hit.last_status))}`
+ await audit(worktree, {
+ channel: "rag_context",
+ event: "context_meta",
+ sessionID,
+ userID,
+ query,
+ cluster: keyName,
+ loop,
+ used_cache: true,
+ status: hit.last_status,
+ reason: hit.last_reason,
+ total_hits: hit.total_hits,
+ delta_hits: hit.delta.length,
+ known_hits: hit.known_hits,
+ overlap: hit.overlap,
+ rewrites: hit.rewrites,
+ top_hits: summary(hit.top, 3),
+ emitted_context: false,
+ })
+ return
+ }
+
+ if (!allow(hit)) {
+ mark(hit, {
+ query,
+ status: "cluster_throttled",
+ reason: "cluster_window_limit",
+ })
+ st.last_user_id = userID
+ st.last_query = query
+ st.last_cluster = keyName
+ textPart.text = `${clean}\n\n${stateBlock(keyName, hit, next(hit.last_status))}`
+ await audit(worktree, {
+ channel: "rag_context",
+ event: "context_meta",
+ sessionID,
+ userID,
+ query,
+ cluster: keyName,
+ loop,
+ used_cache: false,
+ status: hit.last_status,
+ reason: hit.last_reason,
+ total_hits: hit.total_hits,
+ delta_hits: hit.delta.length,
+ known_hits: hit.known_hits,
+ overlap: hit.overlap,
+ rewrites: hit.rewrites,
+ top_hits: [],
+ emitted_context: false,
+ })
+ return
+ }
+
+ const res =
+ await $`${python} ${script} --query ${query} --db-path ${dbPath} --collection ${collection()} --model ${model()} --top-k ${topk()} --node-type text --show-text-chars ${chars()} --base-url ${base()} --api-key ${key()} --format json --rewrite ${rewriteMode()} --rewrite-model ${rewriteModel()} --rewrite-queries ${rewriteQueries()}`
+ .quiet()
+ .nothrow()
+ const raw = res.stdout.toString()
+
+ if (res.exitCode !== 0) {
+ mark(hit, {
+ query,
+ status: "retrieval_error",
+ reason: "backend_error",
+ })
+ st.last_user_id = userID
+ st.last_query = query
+ st.last_cluster = keyName
+ textPart.text = `${clean}\n\n${stateBlock(keyName, hit, next(hit.last_status))}`
+ await audit(worktree, {
+ channel: "rag_context",
+ event: "search_fail",
+ sessionID,
+ userID,
+ query,
+ cluster: keyName,
+ loop,
+ code: res.exitCode,
+ stderr: res.stderr.toString().slice(0, 1200),
+ status: hit.last_status,
+ reason: hit.last_reason,
+ emitted_context: false,
+ })
+ return
+ }
+
+ let resData = { hits: [], rewrites: [query], keywords: [], rewrite_mode: "none" } as ReturnType<typeof parse>
+ try {
+ resData = parse(raw)
+ } catch {
+ mark(hit, {
+ query,
+ status: "retrieval_error",
+ reason: "parse_error",
+ })
+ st.last_user_id = userID
+ st.last_query = query
+ st.last_cluster = keyName
+ textPart.text = `${clean}\n\n${stateBlock(keyName, hit, next(hit.last_status))}`
+ await audit(worktree, {
+ channel: "rag_context",
+ event: "parse_fail",
+ sessionID,
+ userID,
+ query,
+ cluster: keyName,
+ loop,
+ raw: raw.slice(0, 1200),
+ status: hit.last_status,
+ reason: hit.last_reason,
+ emitted_context: false,
+ })
+ return
+ }
+
+ const out = decide(hit, resData.hits, query, resData.rewrites)
+ st.last_user_id = userID
+ st.last_query = query
+ st.last_cluster = keyName
+ textPart.text = `${clean}\n\n${stateBlock(keyName, hit, out.next)}`
+ await audit(worktree, {
+ channel: "rag_context",
+ event: "context_search",
+ sessionID,
+ userID,
+ query,
+ cluster: keyName,
+ loop,
+ used_cache: false,
+ status: out.status,
+ reason: out.reason,
+ total_hits: out.total,
+ delta_hits: out.delta.length,
+ known_hits: out.known,
+ overlap: out.overlap,
+ rewrite_mode: resData.rewrite_mode,
+ rewrites: hit.rewrites,
+ keywords: resData.keywords,
+ top_hits: summary(hit.top, 3),
+ delta_fps: out.delta.map((x) => ({
+ fp: `${x.text_file || x.source_url || ""}#${x.chunk_id || x.image_id || x.section_title || ""}`,
+ source_url: x.source_url || "",
+ section_title: x.section_title || "",
+ chunk_id: x.chunk_id || "",
+ })),
+ emitted_context: false,
+ })
+ },
+ "experimental.chat.system.transform": async (_input, output) => {
+ if (process.env.RAG_AUTO_INJECT === "0") return
+ output.system.push("RAG protocol: parse on every model step. rag_context injects retrieval meta only, not full evidence.")
+ output.system.push(
+ "If rag_state status=new_evidence and you still need facts, call rag_search with mode=delta first. Use mode=brief only when delta is insufficient.",
+ )
+ output.system.push(
+ "If rag_state status=no_new_evidence, reuse current state. Do not repeat the same retrieval unless the query becomes more specific.",
+ )
+ output.system.push(
+ "Do not call rag_search mode=expand in normal QA. Use expand only for explicit debugging or evidence inspection.",
+ )
+ output.system.push(
+ "Do not execute script/rag/search-vector-index.py directly from shell for QA retrieval. Use rag_search only.",
+ )
+ output.system.push(
+ 'When calling rag_search, emit valid JSON arguments. query must be one plain string value, without nested or broken quotation marks.',
+ )
+ output.system.push(
+ "For long or noisy questions, trust rag_state rewrite metadata and prefer rag_search results derived from rewritten retrieval queries.",
+ )
+ },
+ "experimental.session.compacting": async (input, output) => {
+ const id = String((input as { sessionID?: string })?.sessionID || "default")
+ const st = reset(id)
+ await audit(worktree, {
+ channel: "rag_context",
+ event: "state_reset",
+ sessionID: id,
+ epoch: st.epoch,
+ })
+ return output
+ },
+ }
+}
+
+export default RagContextPlugin
diff --git a/.opencode/rag.ts b/.opencode/rag.ts
new file mode 100644
index 00000000000..042399e9831
--- /dev/null
+++ b/.opencode/rag.ts
@@ -0,0 +1,428 @@
+import path from "path"
+import { appendFile, mkdir } from "node:fs/promises"
+
+export type Hit = {
+ score?: number
+ rerank_score?: number
+ source_url?: string
+ section_title?: string
+ text_preview?: string
+ chunk_id?: string
+ image_id?: string
+ text_file?: string
+ matched_queries?: string[]
+ hit_count?: number
+}
+
+export type SearchResult = {
+ hits: Hit[]
+ rewrites: string[]
+ keywords: string[]
+ rewrite_mode: string
+}
+
+type Row = {
+ seen: Set<string>
+ window: number[]
+ last_query: string
+ last_status: string
+ last_reason: string
+ last_checked: number
+ total_hits: number
+ known_hits: number
+ overlap: number
+ delta: Hit[]
+ hits: Hit[]
+ top: Hit[]
+ rewrites: string[]
+}
+
+type Session = {
+ epoch: number
+ last_user_id: string
+ last_query: string
+ last_cluster: string
+ rows: Map<string, Row>
+}
+
+const STORE = new Map<string, Session>()
+const STOP = new Set([
+ "的",
+ "了",
+ "和",
+ "是",
+ "怎么",
+ "如何",
+ "请问",
+ "一下",
+ "关于",
+ "教程",
+ "方法",
+ "方式",
+ "what",
+ "how",
+ "the",
+ "a",
+ "an",
+ "to",
+ "for",
+ "of",
+ "in",
+])
+const SYN: Record<string, string> = {
+ flash: "烧录",
+ burn: "烧录",
+ firmware: "固件",
+ image: "镜像",
+ electerm: "electerm",
+ luckfox: "luckfox",
+ pico: "pico",
+ zero: "zero",
+}
+
+export function topk() {
+ const n = Number.parseInt(process.env.RAG_TOP_K ?? "4", 10)
+ if (Number.isFinite(n) && n > 0) return n
+ return 4
+}
+
+export function use() {
+ const n = Number.parseInt(process.env.RAG_CONTEXT_HITS ?? "2", 10)
+ if (Number.isFinite(n) && n > 0) return n
+ return 2
+}
+
+export function chars() {
+ const n = Number.parseInt(process.env.RAG_CONTEXT_CHARS ?? "120", 10)
+ if (Number.isFinite(n) && n >= 40) return n
+ return 120
+}
+
+export function expandChars() {
+ const n = Number.parseInt(process.env.RAG_EXPAND_CHARS ?? "420", 10)
+ if (Number.isFinite(n) && n >= 120) return n
+ return 420
+}
+
+export function simCut() {
+ const n = Number.parseFloat(process.env.RAG_OVERLAP_THRESHOLD ?? "0.8")
+ if (Number.isFinite(n) && n > 0 && n <= 1) return n
+ return 0.8
+}
+
+export function weakCut() {
+ const n = Number.parseFloat(process.env.RAG_WEAK_SCORE ?? "0.42")
+ if (Number.isFinite(n) && n > 0 && n < 1) return n
+ return 0.42
+}
+
+export function clusterWindowSec() {
+ const n = Number.parseInt(process.env.RAG_CLUSTER_WINDOW_SEC ?? "30", 10)
+ if (Number.isFinite(n) && n > 0) return n
+ return 30
+}
+
+export function clusterMax() {
+ const n = Number.parseInt(process.env.RAG_CLUSTER_MAX_FULL ?? "2", 10)
+ if (Number.isFinite(n) && n > 0) return n
+ return 2
+}
+
+export function reuseSec() {
+ const n = Number.parseInt(process.env.RAG_REUSE_SEC ?? "8", 10)
+ if (Number.isFinite(n) && n >= 0) return n
+ return 8
+}
+
+export function model() {
+ const v = process.env.RAG_EMBED_MODEL
+ if (v) return v
+ return "qwen3-embedding:4b"
+}
+
+export function rewriteMode() {
+ const v = process.env.RAG_REWRITE_MODE
+ if (v) return v
+ return "auto"
+}
+
+export function rewriteModel() {
+ const v = process.env.RAG_REWRITE_MODEL
+ if (v) return v
+ return process.env.RAG_STRUCT_MODEL || process.env.OPENAI_MODEL || "gpt-4o-mini"
+}
+
+export function rewriteQueries() {
+ const n = Number.parseInt(process.env.RAG_REWRITE_QUERIES ?? "3", 10)
+ if (Number.isFinite(n) && n > 0) return n
+ return 3
+}
+
+export function collection() {
+ const v = process.env.RAG_COLLECTION
+ if (v) return v
+ return "rag_chunks"
+}
+
+export function base() {
+ const v = process.env.RAG_BASE_URL || process.env.OPENAI_BASE_URL
+ if (v) return v
+ return "http://127.0.0.1:11434/v1"
+}
+
+export function key() {
+ const v = process.env.RAG_API_KEY || process.env.OPENAI_API_KEY || process.env.MINIMAX_API_KEY
+ if (v) return v
+ return "ollama"
+}
+
+export function debug() {
+ return process.env.RAG_DEBUG_LOG === "1" || process.env.RAG_DEBUG === "1"
+}
+
+export function allowExpand() {
+ return process.env.RAG_ALLOW_EXPAND_TOOL === "1"
+}
+
+export function root(input: string) {
+ const env = process.env.RAG_WORKTREE
+ if (env) return env
+ if (input && input !== "/") return input
+ return process.cwd()
+}
+
+export function py(rootDir: string) {
+ const env = process.env.RAG_DOCLING_PYTHON_BIN
+ if (env) return env
+ return path.join(rootDir, ".venv-docling", "bin", "python")
+}
+
+export function db(rootDir: string) {
+ const env = process.env.RAG_DB_PATH
+ if (env) return env
+ return path.join(rootDir, ".rag", "vector", "qdrant")
+}
+
+export function clip(text: string, n: number) {
+ const s = String(text || "").replace(/\s+/g, " ").trim()
+ if (s.length <= n) return s
+ return `${s.slice(0, n).trim()} ...`
+}
+
+export function strip(text: string) {
+ return text
+ .replace(/\n*<rag_context[^>]*>[\s\S]*?<\/rag_context>\n*/g, "\n")
+ .replace(/\n*<rag_state[^>]*>[\s\S]*?<\/rag_state>\n*/g, "\n")
+ .replace(/\n{3,}/g, "\n\n")
+ .trim()
+}
+
+export function terms(query: string) {
+ const rows = (query.toLowerCase().match(/[\p{Script=Han}]+|[a-z0-9_-]+/gu) || [])
+ .map((x) => x.trim())
+ .filter(Boolean)
+ const out: string[] = []
+ for (const raw of rows) {
+ const v = SYN[raw] || raw
+ if (!v || STOP.has(v)) continue
+ out.push(v)
+ }
+ return [...new Set(out)].sort()
+}
+
+export function cluster(query: string) {
+ const rows = terms(query)
+ if (!rows.length) return `q:${clip(query.toLowerCase(), 48)}`
+ return rows.slice(0, 8).join("|")
+}
+
+export function fp(hit: Hit) {
+ const src = hit.text_file || hit.source_url || ""
+ const id = hit.chunk_id || hit.image_id || hit.section_title || clip(String(hit.text_preview || ""), 36)
+ return `${src}#${id}`
+}
+
+export function parse(raw: string) {
+ const data = JSON.parse(raw)
+ const hits = Array.isArray(data?.hits) ? data.hits : []
+ const rewrites = Array.isArray(data?.rewrite?.queries) ? data.rewrite.queries.filter((x: unknown) => typeof x === "string") : []
+ const keywords = Array.isArray(data?.rewrite?.keywords) ? data.rewrite.keywords.filter((x: unknown) => typeof x === "string") : []
+ return {
+ hits: hits as Hit[],
+ rewrites,
+ keywords,
+ rewrite_mode: String(data?.rewrite?.mode || "none"),
+ } as SearchResult
+}
+
+export function session(id: string) {
+ const cur = STORE.get(id)
+ if (cur) return cur
+ const next: Session = {
+ epoch: 0,
+ last_user_id: "",
+ last_query: "",
+ last_cluster: "",
+ rows: new Map(),
+ }
+ STORE.set(id, next)
+ return next
+}
+
+export function row(st: Session, key: string) {
+ const cur = st.rows.get(key)
+ if (cur) return cur
+ const next: Row = {
+ seen: new Set(),
+ window: [],
+ last_query: "",
+ last_status: "",
+ last_reason: "",
+ last_checked: 0,
+ total_hits: 0,
+ known_hits: 0,
+ overlap: 0,
+ delta: [],
+ hits: [],
+ top: [],
+ rewrites: [],
+ }
+ st.rows.set(key, next)
+ return next
+}
+
+export function allow(row: Row) {
+ const now = Date.now()
+ const win = clusterWindowSec() * 1000
+ row.window = row.window.filter((x) => now - x <= win)
+ if (row.window.length >= clusterMax()) return false
+ row.window.push(now)
+ return true
+}
+
+export function decide(row: Row, hits: Hit[], query: string, rewrites?: string[]) {
+ const keys = hits.map(fp)
+ const fresh = hits.filter((hit) => !row.seen.has(fp(hit)))
+ const shared = keys.filter((key) => row.seen.has(key)).length
+ const ov = keys.length ? shared / keys.length : 0
+ const top = Number(hits[0]?.score || 0)
+ const status = !hits.length
+ ? "need_refine"
+ : !fresh.length && ov >= simCut()
+ ? "no_new_evidence"
+ : top < weakCut()
+ ? "weak_match"
+ : "new_evidence"
+ const reason = !hits.length
+ ? "empty_hits"
+ : !fresh.length && ov >= simCut()
+ ? "high_overlap"
+ : top < weakCut()
+ ? "low_score"
+ : fresh.length < hits.length
+ ? "delta_available"
+ : "fresh_hits"
+ const next =
+ status === "need_refine"
+ ? "refine_query_or_call_rag_search"
+ : status === "no_new_evidence"
+ ? "reuse_known_evidence_or_call_rag_search_state"
+ : status === "weak_match"
+ ? "call_rag_search_delta_or_refine_query"
+ : "call_rag_search_delta_if_more_detail_needed"
+ for (const key of keys) row.seen.add(key)
+ row.last_query = query
+ row.last_status = status
+ row.last_reason = reason
+ row.last_checked = Date.now()
+ row.total_hits = hits.length
+ row.known_hits = row.seen.size
+ row.overlap = ov
+ row.delta = fresh
+ row.hits = hits
+ row.top = hits.slice(0, 3)
+ row.rewrites = rewrites && rewrites.length ? rewrites : [query]
+ return { status, reason, next, overlap: ov, delta: fresh, hits, known: row.known_hits, total: hits.length }
+}
+
+export function stateBlock(key: string, row: Row, next?: string) {
+ const top = row.top[0]
+ return [
+ "",
+ `status=${row.last_status || "need_refine"}`,
+ `reason=${row.last_reason || "empty_hits"}`,
+ `cluster=${key}`,
+ `total_hits=${row.total_hits}`,
+ `delta_hits=${row.delta.length}`,
+ `known_hits=${row.known_hits}`,
+ `overlap=${Number(row.overlap || 0).toFixed(4)}`,
+ `top_source=${top?.source_url || ""}`,
+ `top_section=${clip(top?.section_title || "", 48)}`,
+ `rewrite_queries=${JSON.stringify(row.rewrites)}`,
+ `next_action=${next || "call_rag_search_delta_if_needed"}`,
+ "",
+ ].join("\n")
+}
+
+export function brief(hits: Hit[], limit: number) {
+ if (!hits.length) return "no_rag_hit"
+ return hits
+ .slice(0, Math.max(1, limit))
+ .map((hit, i) =>
+ [
+ `[${i + 1}]`,
+ `source=${hit.source_url || ""}`,
+ `section=${clip(hit.section_title || "", 48)}`,
+ `summary=${clip(hit.text_preview || "", chars())}`,
+ ].join(" "),
+ )
+ .join("\n")
+}
+
+export function expand(hits: Hit[], limit: number) {
+ if (!hits.length) return "no_rag_hit"
+ return hits
+ .slice(0, Math.max(1, limit))
+ .map((hit, i) =>
+ [
+ `[${i + 1}] score=${Number(hit.score || 0).toFixed(4)}`,
+ `source=${hit.source_url || ""}`,
+ `section=${hit.section_title || ""}`,
+ `chunk=${hit.chunk_id || hit.image_id || ""}`,
+ `text=${clip(hit.text_preview || "", expandChars())}`,
+ ].join("\n"),
+ )
+ .join("\n\n")
+}
+
+export function summary(hits: Hit[], limit: number) {
+ return hits.slice(0, Math.max(1, limit)).map((hit) => ({
+ score: Number(hit.score || 0),
+ rerank_score: Number(hit.rerank_score || 0),
+ source_url: hit.source_url || "",
+ section_title: hit.section_title || "",
+ chunk_id: hit.chunk_id || "",
+ image_id: hit.image_id || "",
+ text_preview: clip(hit.text_preview || "", chars()),
+ fp: fp(hit),
+ matched_queries: Array.isArray(hit.matched_queries) ? hit.matched_queries : [],
+ hit_count: Number(hit.hit_count || 0),
+ }))
+}
+
+export async function audit(worktree: string, data: Record<string, unknown>) {
+ if (!debug()) return
+ const dir = path.join(root(worktree), ".rag", "log")
+ await mkdir(dir, { recursive: true })
+ await appendFile(path.join(dir, "rag_debug.jsonl"), `${JSON.stringify({ ts: new Date().toISOString(), ...data })}\n`, "utf-8")
+}
+
+export function reset(id: string) {
+ const st = session(id)
+ st.epoch += 1
+ st.rows.clear()
+ st.last_user_id = ""
+ st.last_query = ""
+ st.last_cluster = ""
+ return st
+}
diff --git a/.opencode/skills/rag-pipeline/SKILL.md b/.opencode/skills/rag-pipeline/SKILL.md
new file mode 100644
index 00000000000..5eb4081f8a7
--- /dev/null
+++ b/.opencode/skills/rag-pipeline/SKILL.md
@@ -0,0 +1,93 @@
+---
+name: rag-pipeline
+description: Run standardized rag init/update pipeline with minimal options and manifest-based sync
+compatibility: opencode
+---
+
+## Goal
+
+Use two commands only:
+
+1. `rag-init` for first build
+2. `rag-update` for incremental sync
+
+If the target repo does not contain this pipeline yet, bootstrap first:
+
+```bash
+bash script/rag/cmd/rag-bootstrap.sh --target <target-repo-dir>
+```
+
+## Required Inputs
+
+1. source type: `structured` | `dir` | `url`
+2. source path (or url list)
+3. embedding model
+4. collection name
+
+## Exposed Options
+
+Only expose these options to users by default:
+
+1. `--source`
+2. `--struct-mode` + `--struct-model`
+3. `--embed-model`
+4. `--url` / `--url-file` / `--input-dir` / `--scan-dir`
+5. `--collection`
+
+Keep low-level knobs hidden unless users ask explicitly:
+
+1. chunk size / overlap
+2. OCR engine internals
+3. retry/backoff internals
+
+## Commands
+
+### Initial build
+
+Structured-only init:
+
+```bash
+bash script/rag/cmd/rag-init.sh --source structured --scan-dir .rag/text --glob "**/*.structured.json" --embed-model qwen3-embedding:4b --collection rag_chunks
+```
+
+Directory init:
+
+```bash
+bash script/rag/cmd/rag-init.sh --source dir --input-dir <input-dir> --text-out-dir .rag/text/dir --embed-model qwen3-embedding:4b --collection rag_chunks
+```
+
+URL init:
+
+```bash
+bash script/rag/cmd/rag-init.sh --source url --url <url> --ocr-images --image-inline marker --url-text-dir .rag/text/url --embed-model qwen3-embedding:4b --collection rag_chunks
+```
+
+### Incremental update
+
+```bash
+bash script/rag/cmd/rag-update.sh --source structured --scan-dir .rag/text --glob "**/*.structured.json" --embed-model qwen3-embedding:4b --collection rag_chunks
+```
+
+## Behavior Rules
+
+1. Do not expose chunk-size/overlap or low-level OCR internals unless user explicitly asks.
+2. Keep defaults:
+ - `--struct-mode llamaindex`
+ - `--inline-ocr strip`
+ - `--image-inline marker`
+3. If collection or embedding model changes, allow full rebuild.
+4. Keep state in `--manifest` (default `.rag/state/manifest.json`) to support incremental update.
+5. Runtime retrieval policy:
+ - prefer plugin auto-inject with `<rag_state>` meta on every model step
+ - use `rag_search` to progressively reveal evidence text
+ - avoid repeated retrieval in the same query cluster unless new evidence appears
+ - use `rag_search` mode progressively: `state` -> `delta` -> `brief`
+ - use `expand` only for explicit debugging or when the user asks to inspect evidence details
+6. Debugging:
+ - enable with `RAG_DEBUG=1`
+ - inspect `.rag/log/rag_debug.jsonl`
+ - summarize quickly with `python script/rag/debug-rag-state.py --tail 100`
+7. On failure, return:
+ - exact command
+ - stderr summary
+ - recovery action
diff --git a/.opencode/tool/rag_search.ts b/.opencode/tool/rag_search.ts
new file mode 100644
index 00000000000..1dfd760133f
--- /dev/null
+++ b/.opencode/tool/rag_search.ts
@@ -0,0 +1,149 @@
+/// <reference types="@opencode-ai/plugin" />
+import { tool } from "@opencode-ai/plugin"
+import path from "path"
+import DESCRIPTION from "./rag_search.txt"
+import {
+ allowExpand,
+ audit,
+ base,
+ brief,
+ chars,
+ cluster,
+ collection,
+ db,
+ decide,
+ expand,
+ expandChars,
+ key,
+ model,
+ parse,
+ py,
+ rewriteMode,
+ rewriteModel,
+ rewriteQueries,
+ root,
+ row,
+ session,
+ stateBlock,
+ summary,
+} from "../rag"
+
+export default tool({
+ description: DESCRIPTION,
+ args: {
+ query: tool.schema.string().describe("Search query text"),
+ top_k: tool.schema.number().describe("Maximum hits to return").default(3),
+ node_type: tool.schema.enum(["any", "text", "image"]).describe("Filter node type").default("text"),
+ mode: tool.schema.enum(["state", "delta", "brief", "expand"]).describe("Result disclosure mode").default("delta"),
+ },
+ async execute(args, ctx) {
+ const baseDir = root(ctx?.worktree || ctx?.directory || process.cwd())
+ const python = py(baseDir)
+ const script = path.join(baseDir, "script", "rag", "search-vector-index.py")
+ const dbPath = db(baseDir)
+ const show = args.mode === "expand" ? expandChars() : chars()
+ const res =
+ await Bun.$`${python} ${script} --query ${args.query} --db-path ${dbPath} --collection ${collection()} --model ${model()} --top-k ${args.top_k} --node-type ${args.node_type} --show-text-chars ${show} --base-url ${base()} --api-key ${key()} --format json --rewrite ${rewriteMode()} --rewrite-model ${rewriteModel()} --rewrite-queries ${rewriteQueries()}`
+ .quiet()
+ .nothrow()
+ const out = res.stdout.toString().trim()
+ const sessionID = String(ctx?.sessionID || ctx?.sessionId || baseDir)
+ const keyName = cluster(args.query)
+ const st = session(sessionID)
+ const hit = row(st, keyName)
+
+ if (res.exitCode !== 0) {
+ const err = res.stderr.toString().trim()
+ await audit(baseDir, {
+ channel: "rag_search",
+ event: "tool_error",
+ sessionID,
+ query: args.query,
+ cluster: keyName,
+ mode: args.mode,
+ code: res.exitCode,
+ stderr: err.slice(0, 1200),
+ stdout: out.slice(0, 1200),
+ })
+ return JSON.stringify(
+ {
+ error: "rag_search_failed",
+ exit_code: res.exitCode,
+ worktree: baseDir,
+ python,
+ script,
+ db_path: dbPath,
+ collection: collection(),
+ model: model(),
+ base_url: base(),
+ mode: args.mode,
+ stderr: err.slice(0, 1200),
+ stdout: out.slice(0, 1200),
+ hint: "verify OPENAI_BASE_URL/OPENAI_API_KEY, collection exists, and venv has openai/qdrant-client",
+ },
+ null,
+ 2,
+ )
+ }
+
+ let dataRes = { hits: [], rewrites: [args.query], keywords: [], rewrite_mode: "none" } as ReturnType<typeof parse>
+ try {
+ dataRes = parse(out)
+ } catch {
+ await audit(baseDir, {
+ channel: "rag_search",
+ event: "tool_parse_fail",
+ sessionID,
+ query: args.query,
+ cluster: keyName,
+ mode: args.mode,
+ raw: out.slice(0, 1200),
+ })
+ return out.slice(0, 1000)
+ }
+
+ const data = decide(hit, dataRes.hits, args.query, dataRes.rewrites)
+ const head = stateBlock(keyName, hit, data.next)
+ const body =
+ args.mode === "state"
+ ? ""
+ : args.mode === "expand"
+ ? allowExpand()
+ ? expand(dataRes.hits, args.top_k)
+ : "expand_blocked=1\nhint=use mode=delta or mode=brief unless debugging with RAG_ALLOW_EXPAND_TOOL=1"
+ : args.mode === "brief"
+ ? brief(dataRes.hits, args.top_k)
+ : data.delta.length
+ ? brief(data.delta, args.top_k)
+ : "no_new_delta"
+
+ await audit(baseDir, {
+ channel: "rag_search",
+ event: "tool_search",
+ sessionID,
+ query: args.query,
+ cluster: keyName,
+ mode: args.mode,
+ node_type: args.node_type,
+ status: data.status,
+ reason: data.reason,
+ total_hits: data.total,
+ delta_hits: data.delta.length,
+ known_hits: data.known,
+ overlap: data.overlap,
+ rewrite_mode: dataRes.rewrite_mode,
+ top_hits: summary(hit.top, 3),
+ delta_fps: data.delta.map((x) => ({
+ fp: `${x.text_file || x.source_url || ""}#${x.chunk_id || x.image_id || x.section_title || ""}`,
+ source_url: x.source_url || "",
+ section_title: x.section_title || "",
+ chunk_id: x.chunk_id || "",
+ })),
+ emitted_context: args.mode !== "state",
+ rewrites: hit.rewrites,
+ keywords: dataRes.keywords,
+ })
+
+ return body ? `${head}\n${body}` : head
+ },
+})
diff --git a/.opencode/tool/rag_search.txt b/.opencode/tool/rag_search.txt
new file mode 100644
index 00000000000..d38d2b8e510
--- /dev/null
+++ b/.opencode/tool/rag_search.txt
@@ -0,0 +1,17 @@
+Search local RAG vector index and return ranked evidence snippets for the current query.
+
+Use this tool when:
+- the user asks about project docs, internal wiki, SOP, or known indexed materials
+- you need grounded context before answering
+
+Behavior:
+- reads local qdrant index under .rag/vector/qdrant
+- may rewrite long queries into multiple focused retrieval queries before searching
+- shares the same session/cluster state used by `rag_context`
+- supports progressive disclosure via mode:
+ - `state`: retrieval state only, no evidence body
+ - `delta`: only new evidence within current query cluster (default)
+ - `brief`: short evidence list for current hits
+ - `expand`: richer per-hit details for follow-up drilling
+- default output is compact and should not dump full raw retrieval payload
+- do not use `expand` in normal QA unless the user explicitly asks to inspect evidence details
diff --git a/script/rag/build-offline-bundle.sh b/script/rag/build-offline-bundle.sh
new file mode 100755
index 00000000000..68353a56caa
--- /dev/null
+++ b/script/rag/build-offline-bundle.sh
@@ -0,0 +1,132 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)
+OUT=${RAG_OFFLINE_OUT:-"$ROOT/.rag/offline/bundle"}
+PY=${RAG_DOCLING_PYTHON:-python3}
+LANGS=${RAG_TESS_LANGS:-"eng chi-sim"}
+DOC_REQ=${RAG_DOCLING_REQUIREMENTS:-"$ROOT/script/rag/requirements-docling.txt"}
+LLM_REQ=${RAG_LLAMA_REQUIREMENTS:-"$ROOT/script/rag/requirements-llamaindex.txt"}
+VECTOR_REQ=${RAG_VECTOR_REQUIREMENTS:-"$ROOT/script/rag/requirements-vector.txt"}
+INCLUDE_LLM=false
+INCLUDE_VECTOR=false
+
+usage() {
+ cat <<'EOF'
+Build an offline bundle for Ubuntu hosts with limited mirror/network access.
+
+Usage:
+ script/rag/build-offline-bundle.sh [--out DIR] [--python BIN] [--langs "eng chi-sim"] [--include-llamaindex] [--include-vectordb]
+
+Options:
+ --out DIR Bundle output directory (default: ./.rag/offline/bundle)
+ --python BIN Python executable used for wheel download (default: python3)
+ --langs "a b" Tesseract language packs (default: "eng chi-sim")
+ --include-llamaindex Also download llamaindex wheels
+ --include-vectordb Also download vector db wheels (qdrant-client/openai)
+ -h, --help Show help
+EOF
+}
+
+while [[ $# -gt 0 ]]; do
+ case "$1" in
+ --out)
+ OUT="$2"
+ shift 2
+ ;;
+ --python)
+ PY="$2"
+ shift 2
+ ;;
+ --langs)
+ LANGS="$2"
+ shift 2
+ ;;
+ --include-llamaindex)
+ INCLUDE_LLM=true
+ shift
+ ;;
+ --include-vectordb)
+ INCLUDE_VECTOR=true
+ shift
+ ;;
+ -h|--help)
+ usage
+ exit 0
+ ;;
+ *)
+ echo "unknown argument: $1" >&2
+ usage
+ exit 1
+ ;;
+ esac
+done
+
+if ! command -v "$PY" >/dev/null 2>&1; then
+ echo "python executable not found: $PY" >&2
+ exit 1
+fi
+if ! command -v apt-get >/dev/null 2>&1; then
+ echo "apt-get not found, this script targets Debian/Ubuntu" >&2
+ exit 1
+fi
+
+rm -rf "$OUT"
+mkdir -p "$OUT/wheelhouse" "$OUT/deb" "$OUT/script/rag"
+
+TMP=$(mktemp -d)
+trap 'rm -rf "$TMP"' EXIT
+"$PY" -m venv "$TMP/venv"
+
+"$TMP/venv/bin/python" -m pip install -U pip
+"$TMP/venv/bin/pip" download -r "$DOC_REQ" -d "$OUT/wheelhouse"
+
+if [[ "$INCLUDE_LLM" == "true" && -f "$LLM_REQ" ]]; then
+ "$TMP/venv/bin/pip" download -r "$LLM_REQ" -d "$OUT/wheelhouse"
+fi
+if [[ "$INCLUDE_VECTOR" == "true" && -f "$VECTOR_REQ" ]]; then
+ "$TMP/venv/bin/pip" download -r "$VECTOR_REQ" -d "$OUT/wheelhouse"
+fi
+
+declare -a PKGS=("tesseract-ocr")
+read -ra ITEMS <<<"$LANGS"
+for l in "${ITEMS[@]}"; do
+ [[ -z "$l" ]] && continue
+ PKGS+=("tesseract-ocr-${l//_/-}")
+done
+
+if command -v apt-rdepends >/dev/null 2>&1; then
+ mapfile -t ALL < <(
+ apt-rdepends "${PKGS[@]}" 2>/dev/null |
+ awk '/^[a-zA-Z0-9]/ { print $1 }' |
+ rg -v '^(Reading|Building|Depends|PreDepends|Recommends|Suggests)$' |
+ sort -u
+ )
+else
+ echo "warning: apt-rdepends not installed, only top-level tesseract packages will be downloaded." >&2
+ ALL=("${PKGS[@]}")
+fi
+
+(
+ cd "$OUT/deb"
+ apt-get download "${ALL[@]}"
+)
+
+cp "$ROOT/script/rag/install-docling.sh" "$OUT/script/rag/"
+cp "$ROOT/script/rag/install-tesseract.sh" "$OUT/script/rag/"
+cp "$ROOT/script/rag/install-vector.sh" "$OUT/script/rag/"
+cp "$ROOT/script/rag/install-offline-bundle.sh" "$OUT/script/rag/" 2>/dev/null || true
+cp "$ROOT/script/rag/build-vector-index.py" "$OUT/script/rag/" 2>/dev/null || true
+cp "$ROOT/script/rag/search-vector-index.py" "$OUT/script/rag/" 2>/dev/null || true
+cp "$ROOT/script/rag/requirements-docling.txt" "$OUT/script/rag/"
+if [[ -f "$LLM_REQ" ]]; then
+ cp "$LLM_REQ" "$OUT/script/rag/"
+fi
+if [[ -f "$VECTOR_REQ" ]]; then
+ cp "$VECTOR_REQ" "$OUT/script/rag/"
+fi
+
+sha256sum "$OUT"/wheelhouse/* "$OUT"/deb/* >"$OUT/SHA256SUMS.txt"
+tar -C "$(dirname "$OUT")" -czf "${OUT%/}.tar.gz" "$(basename "$OUT")"
+echo "bundle directory: $OUT"
+echo "bundle archive: ${OUT%/}.tar.gz"
diff --git a/script/rag/build-vector-index.py b/script/rag/build-vector-index.py
new file mode 100755
index 00000000000..37ea3debc54
--- /dev/null
+++ b/script/rag/build-vector-index.py
@@ -0,0 +1,402 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import re
+import sys
+import time
+import uuid
+from pathlib import Path
+
# Matches inline OCR payloads of the form [IMAGE_OCR]...[/IMAGE_OCR]
# (non-greedy, spanning newlines via [\s\S]).
IMAGE_OCR_RE = re.compile(r"\[IMAGE_OCR\][\s\S]*?\[/IMAGE_OCR\]")


def clean(text: str) -> str:
    """Collapse every whitespace run to a single space and trim the ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
+
+
def normalize_text(text: str, strip_inline_ocr: bool) -> str:
    """Clean chunk text for embedding, optionally dropping inline [IMAGE_OCR] blocks first."""
    raw = text or ""
    return clean(IMAGE_OCR_RE.sub(" ", raw) if strip_inline_ocr else raw)
+
+
def is_rate_limit_error(e: Exception) -> bool:
    """Heuristically detect rate-limit failures from the exception message."""
    msg = str(e).lower()
    return any(token in msg for token in ("rate limit", "too many requests", "429"))
+
+
def embed_texts(
    client,
    model: str,
    texts: list[str],
    max_retries: int,
    retry_initial: float,
) -> list[list[float]]:
    """Embed `texts` via an OpenAI-compatible client, retrying rate limits with exponential backoff.

    Non-rate-limit errors, or exhausting `max_retries`, re-raise immediately.
    The backoff delay starts at max(0.2, retry_initial) and doubles up to 30s.
    """
    attempts = 0
    wait = max(0.2, retry_initial)
    while True:
        try:
            resp = client.embeddings.create(model=model, input=texts)
        except Exception as e:
            if not is_rate_limit_error(e) or attempts >= max_retries:
                raise
            attempts += 1
            print(
                f"[embed] rate limit; retry {attempts}/{max_retries} after {wait:.1f}s",
                file=sys.stderr,
            )
            time.sleep(wait)
            wait = min(wait * 2, 30)
        else:
            return [item.embedding for item in resp.data]
+
+
def list_inputs(paths: list[str], input_dir: str, glob: str) -> list[Path]:
    """Resolve explicit paths plus an optional directory glob into a deduplicated list of existing files."""
    candidates = [Path(p) for p in paths]
    if input_dir:
        candidates += sorted(Path(input_dir).glob(glob))
    result: list[Path] = []
    seen: set[Path] = set()
    for candidate in candidates:
        resolved = candidate.resolve()
        if resolved in seen:
            continue
        seen.add(resolved)
        # Directories and missing paths are silently skipped.
        if resolved.is_file():
            result.append(resolved)
    return result
+
+
def doc_key(path: Path, root: Path) -> str:
    """Key a document by its path relative to `root`, falling back to the absolute path."""
    resolved = path.resolve()
    try:
        return str(resolved.relative_to(root.resolve()))
    except ValueError:
        # Path lives outside root; use the absolute form.
        return str(resolved)
+
+
def delete_keys(direct: list[str], file_path: str) -> list[str]:
    """Collect doc keys to delete from an explicit list plus an optional newline-delimited file.

    Blank entries are dropped; the result is deduplicated and sorted.
    """
    keys = [k for k in direct if k]
    if file_path:
        listing = Path(file_path)
        if listing.exists():
            text = listing.read_text(encoding="utf-8", errors="ignore")
            keys.extend(stripped for stripped in (s.strip() for s in text.splitlines()) if stripped)
    return sorted(set(keys))
+
+
def merge_images(data: dict) -> list[dict]:
    """Normalize image metadata into uniform rows.

    Prefers an explicit top-level `image_nodes` list; otherwise merges the
    per-section `images` entries, accumulating section ids and filling each
    text field from the first non-empty occurrence.
    """
    explicit = data.get("image_nodes")
    if isinstance(explicit, list):
        rows = []
        for node in explicit:
            node_id = node.get("image_id") or node.get("id")
            if not node_id:
                continue
            rows.append(
                {
                    "id": node_id,
                    "section_ids": node.get("section_ids", []),
                    "source_url": node.get("source_url", ""),
                    "alt": node.get("alt", ""),
                    "ocr_text": node.get("ocr_text", ""),
                }
            )
        return rows

    merged: dict = {}
    for section in data.get("sections", []):
        section_id = section.get("id")
        for img in section.get("images", []):
            img_id = img.get("id")
            if not img_id:
                continue
            entry = merged.get(img_id)
            if entry is None:
                entry = {
                    "id": img_id,
                    "section_ids": [],
                    "source_url": img.get("url", ""),
                    "alt": img.get("alt", ""),
                    "ocr_text": img.get("ocr_text", ""),
                }
            if section_id and section_id not in entry["section_ids"]:
                entry["section_ids"].append(section_id)
            # First-non-empty wins for each text field.
            for field, src_key in (("source_url", "url"), ("alt", "alt"), ("ocr_text", "ocr_text")):
                if not entry[field]:
                    entry[field] = img.get(src_key, "")
            merged[img_id] = entry
    return list(merged.values())
+
+
def load_nodes(
    paths: list[Path],
    include_images: bool,
    strip_inline_ocr: bool,
    image_min_chars: int,
    root: Path,
) -> list[dict]:
    """Flatten structured-JSON documents into embeddable rows.

    Produces one "text" row per non-empty chunk and, when include_images is
    True, one "image" row per merged image whose alt+OCR text reaches
    image_min_chars. Each row is {"id", "text", "payload"}; the id is a
    deterministic UUIDv5 of "<path>:<chunk id>" (or the image variant), so
    re-indexing the same inputs upserts instead of duplicating points.
    """
    rows = []
    for path in paths:
        data = json.loads(path.read_text(encoding="utf-8", errors="ignore"))
        source_url = data.get("source_url", "")
        text_file = data.get("text_file", str(path))
        key = doc_key(path, root)
        for i, item in enumerate(data.get("chunks", [])):
            # Optionally drop inline [IMAGE_OCR] blocks before embedding.
            text = normalize_text(item.get("text", ""), strip_inline_ocr)
            if not text:
                continue
            # Stable raw id: falls back to the chunk's list index when it has no id.
            raw = f"{path}:{item.get('id', i)}"
            pid = str(uuid.uuid5(uuid.NAMESPACE_URL, raw))
            meta = item.get("metadata") or {}
            rows.append(
                {
                    "id": pid,
                    "text": text,
                    "payload": {
                        "node_type": "text",
                        "chunk_id": item.get("id", f"chunk-{i}"),
                        "section_id": item.get("section_id", ""),
                        "section_title": item.get("section_title", ""),
                        # Chunk-level metadata overrides the document-level value.
                        "source_url": meta.get("source_url") or source_url,
                        "text_file": meta.get("text_file") or text_file,
                        "doc_key": key,
                        "image_ids": item.get("image_ids", []),
                        "char_len": meta.get("char_len", len(text)),
                        # Text is duplicated into the payload so search hits can be shown verbatim.
                        "text": text,
                        "raw_id": raw,
                    },
                }
            )
        if not include_images:
            continue
        for i, item in enumerate(merge_images(data)):
            iid = item.get("id")
            # Embeddable text: marker line plus whatever alt/OCR text exists.
            txt = clean(
                "\n".join(
                    x
                    for x in [
                        f"[IMAGE:{iid}]",
                        item.get("alt", ""),
                        item.get("ocr_text", ""),
                    ]
                    if x
                )
            )
            # Skip images whose human-readable text (alt + OCR, marker excluded)
            # is too short to be a useful retrieval target.
            if len(clean((item.get("alt", "") + " " + item.get("ocr_text", "")).strip())) < image_min_chars:
                continue
            raw = f"{path}:image:{iid}:{i}"
            pid = str(uuid.uuid5(uuid.NAMESPACE_URL, raw))
            rows.append(
                {
                    "id": pid,
                    "text": txt,
                    "payload": {
                        "node_type": "image",
                        "image_id": iid,
                        "section_ids": item.get("section_ids", []),
                        "section_title": "",
                        "source_url": item.get("source_url", "") or source_url,
                        "text_file": text_file,
                        "doc_key": key,
                        "image_ids": [iid],
                        "char_len": len(txt),
                        "text": txt,
                        "alt": item.get("alt", ""),
                        "ocr_text": item.get("ocr_text", ""),
                        "raw_id": raw,
                    },
                }
            )
    return rows
+
+
def has_collection(client, name: str) -> bool:
    """Return True if the Qdrant collection `name` exists.

    `client` is a qdrant_client.QdrantClient. The annotation is intentionally
    omitted: QdrantClient is imported lazily inside main(), so the previous
    module-level `client: QdrantClient` annotation named a symbol that does
    not exist at module scope — harmless only because of
    `from __future__ import annotations`, and broken for any runtime hint
    resolution (e.g. typing.get_type_hints).

    Prefers the modern collection_exists() API and falls back to scanning
    get_collections() for older client versions.
    """
    if hasattr(client, "collection_exists"):
        return bool(client.collection_exists(name))
    cols = client.get_collections().collections
    return any(c.name == name for c in cols)
+
+
def delete_doc_keys(client, models, collection: str, keys: list[str]) -> int:
    """Delete every point whose `doc_key` payload matches one of `keys`.

    No-ops (returns 0) when there are no keys or the collection is absent;
    otherwise returns the number of keys processed.
    """
    if not keys or not has_collection(client, collection):
        return 0
    for doc in keys:
        selector = models.Filter(
            must=[models.FieldCondition(key="doc_key", match=models.MatchValue(value=doc))]
        )
        client.delete(collection_name=collection, points_selector=selector, wait=True)
    return len(keys)
+
+
def main() -> None:
    """CLI entry point: embed structured-JSON chunks and upsert them into a local Qdrant collection.

    Supports pure-deletion runs (only --delete-doc-key/--delete-doc-keys-file),
    incremental upserts (deterministic point ids from load_nodes), and full
    rebuilds (--recreate). Prints a JSON summary to stdout.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--input", action="append", default=[])
    p.add_argument("--input-dir", default="")
    p.add_argument("--glob", default="*.structured.json")
    p.add_argument("--db-path", default=".rag/vector/qdrant")
    p.add_argument("--collection", default="rag_chunks")
    p.add_argument("--model", default="nomic-embed-text")
    p.add_argument("--base-url", default="")
    p.add_argument("--api-key", default="")
    p.add_argument("--batch-size", type=int, default=16)
    p.add_argument("--max-retries", type=int, default=6)
    p.add_argument("--retry-initial", type=float, default=1.5)
    p.add_argument("--no-image-nodes", action="store_true")
    p.add_argument("--keep-inline-ocr", action="store_true")
    p.add_argument("--image-min-chars", type=int, default=2)
    p.add_argument("--root", default=".")
    p.add_argument("--delete-doc-key", action="append", default=[])
    p.add_argument("--delete-doc-keys-file", default="")
    p.add_argument("--recreate", action="store_true")
    args = p.parse_args()

    # Lazy imports so argument parsing/--help work without the optional deps,
    # and so a missing dependency produces an actionable message.
    try:
        from openai import OpenAI
        from qdrant_client import QdrantClient, models
    except ModuleNotFoundError as e:
        raise SystemExit(
            f"missing dependency: {e.name}. run: bash script/rag/install-vector.sh"
        ) from e

    inputs = list_inputs(args.input, args.input_dir, args.glob)
    root = Path(args.root)
    del_keys = delete_keys(args.delete_doc_key, args.delete_doc_keys_file)

    rows = (
        load_nodes(
            inputs,
            include_images=not args.no_image_nodes,
            strip_inline_ocr=not args.keep_inline_ocr,
            image_min_chars=max(0, args.image_min_chars),
            root=root,
        )
        if inputs
        else []
    )
    if not rows and not del_keys:
        raise SystemExit("no input files and no delete doc keys; nothing to do")

    # Credential/base-URL precedence: CLI flag > env var > local Ollama default.
    # "ollama" is a placeholder key for servers that do not check auth.
    key = args.api_key or os.getenv("OPENAI_API_KEY") or os.getenv("MINIMAX_API_KEY") or "ollama"
    base = args.base_url or os.getenv("OPENAI_BASE_URL") or "http://127.0.0.1:11434/v1"
    # Only build an embedding client when there is something to embed.
    embed = OpenAI(api_key=key, base_url=base) if rows else None

    db_path = Path(args.db_path)
    db_path.mkdir(parents=True, exist_ok=True)
    qdrant = QdrantClient(path=str(db_path))
    deleted = 0

    if args.recreate and has_collection(qdrant, args.collection):
        qdrant.delete_collection(collection_name=args.collection)
    if del_keys:
        deleted = delete_doc_keys(qdrant, models, args.collection, del_keys)

    # Pure-deletion run: report and stop before touching the embedding backend.
    if not rows:
        count = qdrant.count(collection_name=args.collection, exact=True).count if has_collection(qdrant, args.collection) else 0
        print(
            json.dumps(
                {
                    "db_path": str(db_path),
                    "collection": args.collection,
                    "input_files": 0,
                    "inserted": 0,
                    "deleted_doc_keys": deleted,
                    "collection_count": count,
                    "text_nodes": 0,
                    "image_nodes": 0,
                    "vector_size": 0,
                    "embedding_model": args.model,
                    "embedding_base_url": base,
                },
                ensure_ascii=False,
                indent=2,
            )
        )
        return

    # Embed the first row alone to discover the vector dimension before the
    # collection is (possibly) created.
    vec0 = embed_texts(
        embed,
        args.model,
        [rows[0]["text"]],
        args.max_retries,
        args.retry_initial,
    )[0]
    dim = len(vec0)
    if dim <= 0:
        raise SystemExit("embedding result is empty")
    if not has_collection(qdrant, args.collection):
        qdrant.create_collection(
            collection_name=args.collection,
            vectors_config=models.VectorParams(size=dim, distance=models.Distance.COSINE),
        )

    # Embed the remaining rows in batches; row 0 is already embedded above,
    # so the loop starts at index 1.
    batch_size = max(1, args.batch_size)
    total = 0
    batch = [{"id": rows[0]["id"], "vector": vec0, "payload": rows[0]["payload"]}]
    for i in range(1, len(rows), batch_size):
        seg = rows[i : i + batch_size]
        vecs = embed_texts(
            embed,
            args.model,
            [x["text"] for x in seg],
            args.max_retries,
            args.retry_initial,
        )
        batch.extend(
            {
                "id": seg[j]["id"],
                "vector": vecs[j],
                "payload": seg[j]["payload"],
            }
            for j in range(len(seg))
        )

    # Upsert (insert-or-replace by id) in the same batch size.
    for i in range(0, len(batch), batch_size):
        seg = batch[i : i + batch_size]
        qdrant.upsert(
            collection_name=args.collection,
            points=[
                models.PointStruct(id=item["id"], vector=item["vector"], payload=item["payload"])
                for item in seg
            ],
            wait=True,
        )
        total += len(seg)

    # Final summary for the indexing run.
    count = qdrant.count(collection_name=args.collection, exact=True).count
    text_nodes = sum(1 for x in rows if x["payload"].get("node_type") == "text")
    image_nodes = sum(1 for x in rows if x["payload"].get("node_type") == "image")
    print(
        json.dumps(
            {
                "db_path": str(db_path),
                "collection": args.collection,
                "input_files": len(inputs),
                "inserted": total,
                "deleted_doc_keys": deleted,
                "collection_count": count,
                "text_nodes": text_nodes,
                "image_nodes": image_nodes,
                "vector_size": dim,
                "embedding_model": args.model,
                "embedding_base_url": base,
            },
            ensure_ascii=False,
            indent=2,
        )
    )


if __name__ == "__main__":
    main()
diff --git a/script/rag/clean-text.py b/script/rag/clean-text.py
new file mode 100755
index 00000000000..72c9ea64134
--- /dev/null
+++ b/script/rag/clean-text.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import re
+from pathlib import Path
+
+
def normalize(text: str) -> str:
    """Unify line endings, squeeze horizontal whitespace, cap blank-line runs at one, ensure a trailing newline."""
    unified = text.replace("\r\n", "\n").replace("\r", "\n")
    unified = re.sub(r"[ \t]+", " ", unified)
    unified = re.sub(r"\n{3,}", "\n\n", unified)
    return unified.strip() + "\n"
+
+
def drop_noise(lines: list[str]) -> list[str]:
    """Filter navigation/link noise and long duplicate lines from extracted text.

    Keeps blank lines (normalized to ""), drops pagination markers
    ("[上一页 ...", "[下一页 ..."), markdown bullets that are purely links
    ("- [...](...)"), and any line whose whitespace-squeezed form is longer
    than 80 chars and has already been seen. Short repeats (headings, table
    labels) are intentionally kept.
    """
    out = []
    seen = set()
    for line in lines:
        row = line.strip()
        if not row:
            # Preserve paragraph breaks as empty strings.
            out.append("")
            continue
        # Scraped-page pagination markers.
        if row.startswith("[上一页 ") or row.startswith("[下一页 "):
            continue
        # Markdown bullet that is purely a link: "- [text](url)".
        if row.startswith("- [") and row.endswith(")"):
            continue
        # (Removed unreachable `if row == "": continue` — row is already
        # stripped and the falsy case is handled above.)
        key = re.sub(r"\s+", " ", row)
        # Deduplicate only long lines; the whitespace-squeezed form is the key.
        if key in seen and len(key) > 80:
            continue
        seen.add(key)
        out.append(line)
    return out
+
+
def main() -> None:
    """CLI: read --input, strip noise lines, normalize whitespace, write --output."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True)
    parser.add_argument("--output", required=True)
    opts = parser.parse_args()

    raw = Path(opts.input).read_text(encoding="utf-8", errors="ignore")
    kept = drop_noise(raw.splitlines())
    Path(opts.output).write_text(normalize("\n".join(kept)), encoding="utf-8")


if __name__ == "__main__":
    main()
+
diff --git a/script/rag/cmd/rag-bootstrap.sh b/script/rag/cmd/rag-bootstrap.sh
new file mode 100755
index 00000000000..722983be004
--- /dev/null
+++ b/script/rag/cmd/rag-bootstrap.sh
@@ -0,0 +1,82 @@
#!/usr/bin/env bash
set -euo pipefail

# Repo root: this script lives in script/rag/cmd/, three levels below it.
ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)
TARGET=""           # destination project root (required, via --target)
WITH_OPENCODE=true  # also copy .opencode assets unless --no-opencode

# Print CLI help (heredoc content is user-facing output, kept verbatim).
usage() {
  cat <<'EOF'
Copy RAG pipeline scripts and optional OpenCode assets to another project.

Usage:
  bash script/rag/cmd/rag-bootstrap.sh --target /path/to/target [--no-opencode]

Options:
  --target DIR    Target project root
  --no-opencode   Do not copy .opencode plugin/tool/skill files
  -h, --help      Show help
EOF
}
+
# Copy a directory tree, excluding __pycache__ directories and *.pyc files.
# Prefers rsync; otherwise falls back to a find|cp loop.
copy_dir() {
  local src="$1"
  local dst="$2"
  mkdir -p "$dst"
  if command -v rsync >/dev/null 2>&1; then
    rsync -a --exclude '__pycache__' --exclude '*.pyc' "$src"/ "$dst"/
    return
  fi
  # NUL-delimited stream so filenames containing spaces, leading/trailing
  # whitespace, or newlines survive the loop (the previous newline-delimited
  # `read` mangled such names).
  find "$src" -type d -name "__pycache__" -prune -o -type f ! -name '*.pyc' -print0 |
    while IFS= read -r -d '' file; do
      rel=${file#"$src"/}
      mkdir -p "$dst/$(dirname "$rel")"
      cp -f "$file" "$dst/$rel"
    done
}
+
# CLI parsing: value flags consume two positional args.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --target)
      TARGET="$2"
      shift 2
      ;;
    --no-opencode)
      WITH_OPENCODE=false
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "unknown argument: $1" >&2
      usage
      exit 1
      ;;
  esac
done

if [[ -z "$TARGET" ]]; then
  echo "--target is required" >&2
  usage
  exit 1
fi

# Always copy the pipeline scripts themselves.
mkdir -p "$TARGET/script"
copy_dir "$ROOT/script/rag" "$TARGET/script/rag"

# Optionally copy the OpenCode tool/plugin/skill assets that wire the
# pipeline into the editor.
if [[ "$WITH_OPENCODE" == "true" ]]; then
  mkdir -p "$TARGET/.opencode/tool" "$TARGET/.opencode/plugins" "$TARGET/.opencode/skills/rag-pipeline"
  cp -f "$ROOT/.opencode/tool/rag_search.ts" "$TARGET/.opencode/tool/rag_search.ts"
  cp -f "$ROOT/.opencode/tool/rag_search.txt" "$TARGET/.opencode/tool/rag_search.txt"
  cp -f "$ROOT/.opencode/plugins/rag_context.ts" "$TARGET/.opencode/plugins/rag_context.ts"
  cp -f "$ROOT/.opencode/skills/rag-pipeline/SKILL.md" "$TARGET/.opencode/skills/rag-pipeline/SKILL.md"
  cp -f "$ROOT/.opencode/rag.ts" "$TARGET/.opencode/rag.ts"
fi

# Summary plus the follow-up steps to run inside the target project.
echo "bootstrap_done target=$TARGET with_opencode=$WITH_OPENCODE"
echo "next:"
echo " 1) cd $TARGET"
echo " 2) bash script/rag/install-docling.sh"
echo " 3) bash script/rag/install-vector.sh"
echo " 4) bash script/rag/cmd/rag-init.sh --help"
diff --git a/script/rag/cmd/rag-init.sh b/script/rag/cmd/rag-init.sh
new file mode 100644
index 00000000000..bfb728b640c
--- /dev/null
+++ b/script/rag/cmd/rag-init.sh
@@ -0,0 +1,14 @@
#!/usr/bin/env bash
set -euo pipefail

# Thin launcher for the "init" subcommand of the RAG pipeline.
ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)

# Interpreter preference: env override > docling venv > system python3.
PY="${RAG_DOCLING_PYTHON_BIN:-}"
if [[ -z "$PY" && -x "$ROOT/.venv-docling/bin/python" ]]; then
  PY="$ROOT/.venv-docling/bin/python"
fi
PY="${PY:-python3}"

exec "$PY" "$ROOT/script/rag/rag-pipeline.py" init "$@"
diff --git a/script/rag/cmd/rag-update.sh b/script/rag/cmd/rag-update.sh
new file mode 100644
index 00000000000..1c518a8879e
--- /dev/null
+++ b/script/rag/cmd/rag-update.sh
@@ -0,0 +1,14 @@
#!/usr/bin/env bash
set -euo pipefail

# Thin launcher for the "update" subcommand of the RAG pipeline.
ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)

# Interpreter preference: env override > docling venv > system python3.
PY="${RAG_DOCLING_PYTHON_BIN:-}"
if [[ -z "$PY" && -x "$ROOT/.venv-docling/bin/python" ]]; then
  PY="$ROOT/.venv-docling/bin/python"
fi
PY="${PY:-python3}"

exec "$PY" "$ROOT/script/rag/rag-pipeline.py" update "$@"
diff --git a/script/rag/compare-structured.py b/script/rag/compare-structured.py
new file mode 100755
index 00000000000..623eecb7ce9
--- /dev/null
+++ b/script/rag/compare-structured.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+import re
+from pathlib import Path
+
# Inline OCR payloads: [IMAGE_OCR]...[/IMAGE_OCR], non-greedy, across newlines.
OCR_RE = re.compile(r"\[IMAGE_OCR\][\s\S]*?\[/IMAGE_OCR\]")


def load(path: Path) -> dict:
    """Read and parse one structured-JSON document."""
    return json.loads(path.read_text(encoding="utf-8", errors="ignore"))


def metrics(data: dict) -> dict:
    """Compute count and size statistics for a structured document."""
    chunks = data.get("chunks", [])
    texts = [c.get("text", "") for c in chunks]
    lengths = [len(t) for t in texts]
    return {
        "chunks": len(chunks),
        "sections": len(data.get("sections", [])),
        "image_nodes": len(data.get("image_nodes", [])),
        "nodes": len(data.get("nodes", [])),
        "chunks_with_image_refs": sum(1 for c in chunks if (c.get("image_ids") or [])),
        "chunks_with_inline_ocr": sum(1 for t in texts if "[IMAGE_OCR]" in t),
        "inline_ocr_blocks_in_chunks": sum(len(OCR_RE.findall(t)) for t in texts),
        "avg_chunk_chars": round(sum(lengths) / len(lengths), 2) if lengths else 0,
    }
+
+
def main() -> None:
    """CLI: print metrics for two structured JSON files plus their new-minus-old delta."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--old", required=True)
    parser.add_argument("--new", required=True)
    opts = parser.parse_args()

    before = metrics(load(Path(opts.old)))
    after = metrics(load(Path(opts.new)))
    all_keys = sorted(set(before) | set(after))
    delta = {k: after.get(k, 0) - before.get(k, 0) for k in all_keys}
    print(json.dumps({"old": before, "new": after, "delta_new_minus_old": delta}, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
diff --git a/script/rag/convert-dir-to-text.sh b/script/rag/convert-dir-to-text.sh
new file mode 100755
index 00000000000..43118755a3f
--- /dev/null
+++ b/script/rag/convert-dir-to-text.sh
@@ -0,0 +1,136 @@
#!/usr/bin/env bash
set -euo pipefail

# Batch-convert documents under a directory to plain text via the docling CLI.
ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)
DOC=${RAG_DOCLING_BIN:-"$ROOT/.venv-docling/bin/docling"}
IN=""
OUT=${RAG_TEXT_FILES_OUTPUT:-"$ROOT/.rag/text/files"}
EXT="pdf docx pptx html htm md txt csv xls xlsx xml"

usage() {
  cat <<'EOF'
Convert supported files in a directory to text with docling.

Usage:
  script/rag/convert-dir-to-text.sh --input DIR [--output DIR] [--ext "pdf docx html"]

Options:
  --input DIR         Source directory (required)
  --output DIR        Text output directory (default: ./.rag/text/files)
  --ext "a b c"       Extensions to include (default: pdf docx pptx html htm md txt csv xls xlsx xml)
  --docling-bin PATH  docling executable (default: ./.venv-docling/bin/docling)
  -h, --help          Show help
EOF
}

# CLI parsing: value flags consume two positional args.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --input)
      IN="$2"
      shift 2
      ;;
    --output)
      OUT="$2"
      shift 2
      ;;
    --ext)
      EXT="$2"
      shift 2
      ;;
    --docling-bin)
      DOC="$2"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "unknown argument: $1" >&2
      usage
      exit 1
      ;;
  esac
done

if [[ -z "$IN" ]]; then
  echo "--input is required" >&2
  usage
  exit 1
fi

if [[ ! -d "$IN" ]]; then
  echo "input directory not found: $IN" >&2
  exit 1
fi

if [[ ! -x "$DOC" ]]; then
  echo "docling not found: $DOC" >&2
  exit 1
fi

# Truncate per-run logs up front so each run reports only its own results.
mkdir -p "$OUT"
SUCCESS_LOG="$OUT/_success.log"
FAIL_LOG="$OUT/_failed.log"
RUN_LOG="$OUT/_run.log"
: >"$SUCCESS_LOG"
: >"$FAIL_LOG"
: >"$RUN_LOG"

# Build a find(1) OR-expression: -iname "*.ext1" -o -iname "*.ext2" ...
declare -a FIND_EXPR=()
read -ra PARTS <<<"$EXT"
for i in "${!PARTS[@]}"; do
  e="${PARTS[$i]}"
  [[ -z "$e" ]] && continue
  if [[ "$i" -gt 0 ]]; then
    FIND_EXPR+=("-o")
  fi
  FIND_EXPR+=("-iname" "*.$e")
done

if [[ "${#FIND_EXPR[@]}" -eq 0 ]]; then
  echo "no valid extensions in --ext" >&2
  exit 1
fi

TMP=$(mktemp -d)
trap 'rm -rf "$TMP"' EXIT

mapfile -t FILES < <(find "$IN" -type f \( "${FIND_EXPR[@]}" \) | sort)
if [[ "${#FILES[@]}" -eq 0 ]]; then
  echo "no files matched in: $IN"
  exit 0
fi

OK=0
BAD=0

for f in "${FILES[@]}"; do
  # Mirror the input's relative layout under $OUT, swapping the extension to .txt.
  rel=${f#"$IN"/}
  target="$OUT/${rel%.*}.txt"
  mkdir -p "$(dirname "$target")"

  # docling writes into a scratch dir; start clean for each file.
  work="$TMP/out"
  rm -rf "$work"
  mkdir -p "$work"

  if "$DOC" "$f" --to text --output "$work" --abort-on-error >>"$RUN_LOG" 2>&1; then
    # docling names its output after the input's basename.
    b=$(basename "${f%.*}")
    src="$work/$b.txt"
    if [[ -f "$src" ]]; then
      mv "$src" "$target"
      printf '%s\n' "$target" >>"$SUCCESS_LOG"
      OK=$((OK + 1))
      continue
    fi
  fi

  # Reached on docling failure OR a missing output file.
  printf '%s\n' "$f" >>"$FAIL_LOG"
  BAD=$((BAD + 1))
done

echo "done: total=${#FILES[@]} success=$OK failed=$BAD"
echo "success log: $SUCCESS_LOG"
echo "failed log: $FAIL_LOG"
echo "run log: $RUN_LOG"
+
diff --git a/script/rag/debug-rag-state.py b/script/rag/debug-rag-state.py
new file mode 100755
index 00000000000..962014f0ade
--- /dev/null
+++ b/script/rag/debug-rag-state.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+from collections import Counter
+from pathlib import Path
+
+
def read_rows(path: Path) -> list[dict]:
    """Load JSONL rows from `path`, skipping blank and unparsable lines; [] if the file is missing."""
    rows: list[dict] = []
    if not path.exists():
        return rows
    with path.open("r", encoding="utf-8", errors="ignore") as handle:
        for raw in handle:
            candidate = raw.strip()
            if not candidate:
                continue
            try:
                parsed = json.loads(candidate)
            except Exception:
                # Tolerate torn/corrupt lines in the debug log.
                continue
            rows.append(parsed)
    return rows
+
+
def main() -> None:
    """Summarize the RAG plugin's JSONL debug log: event/status/cluster counts plus the last rows."""
    p = argparse.ArgumentParser()
    p.add_argument("--log", default=".rag/log/rag_debug.jsonl")
    p.add_argument("--tail", type=int, default=80)
    p.add_argument("--session", default="")
    p.add_argument("--channel", default="")
    p.add_argument("--full", action="store_true")
    args = p.parse_args()

    path = Path(args.log)
    rows = read_rows(path)
    # Optional exact-match filters applied before any aggregation.
    if args.session:
        rows = [x for x in rows if str(x.get("sessionID", "")) == args.session]
    if args.channel:
        rows = [x for x in rows if str(x.get("channel", "")) == args.channel]
    if not rows:
        raise SystemExit(f"no debug rows found in: {path}")

    # Aggregate over the tail window only (always at least one row).
    view = rows[-max(1, args.tail) :]
    events = Counter(str(x.get("event", "")) for x in view)
    statuses = Counter(str(x.get("status", "")) for x in view if x.get("status"))
    clusters = Counter(str(x.get("cluster", "")) for x in view if x.get("cluster"))
    channels = Counter(str(x.get("channel", "")) for x in view if x.get("channel"))
    modes = Counter(str(x.get("mode", "")) for x in view if x.get("mode"))

    print(json.dumps({
        "log": str(path),
        "rows_total": len(rows),
        "rows_view": len(view),
        "channels": dict(channels),
        "events": dict(events),
        "statuses": dict(statuses),
        "modes": dict(modes),
        "top_clusters": clusters.most_common(10),
    }, ensure_ascii=False, indent=2))

    # Echo the 20 most recent rows: --full prints each raw row, otherwise a
    # fixed projection of the commonly inspected fields.
    print("\nlast_rows:")
    for item in view[-20:]:
        keep = item if args.full else {
            "ts": item.get("ts", ""),
            "channel": item.get("channel", ""),
            "event": item.get("event", ""),
            "sessionID": item.get("sessionID", ""),
            "query": item.get("query", ""),
            "cluster": item.get("cluster", ""),
            "mode": item.get("mode", ""),
            "loop": item.get("loop", ""),
            "used_cache": item.get("used_cache", ""),
            "status": item.get("status", ""),
            "reason": item.get("reason", ""),
            "rewrite_mode": item.get("rewrite_mode", ""),
            "keywords": item.get("keywords", []),
            "total_hits": item.get("total_hits", ""),
            "delta_hits": item.get("delta_hits", ""),
            "known_hits": item.get("known_hits", ""),
            "overlap": item.get("overlap", ""),
            "top_hits": item.get("top_hits", []),
            "delta_fps": item.get("delta_fps", []),
            "rewrites": item.get("rewrites", []),
            "emitted_context": item.get("emitted_context", ""),
        }
        print(json.dumps(keep, ensure_ascii=False))


if __name__ == "__main__":
    main()
diff --git a/script/rag/install-docling.sh b/script/rag/install-docling.sh
new file mode 100755
index 00000000000..53870ccdee9
--- /dev/null
+++ b/script/rag/install-docling.sh
@@ -0,0 +1,91 @@
#!/usr/bin/env bash
set -euo pipefail

# Defaults are overridable via env vars or the CLI flags parsed below.
ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)
VENV=${RAG_DOCLING_VENV:-"$ROOT/.venv-docling"}
PY=${RAG_DOCLING_PYTHON:-python3}
REQ=${RAG_DOCLING_REQUIREMENTS:-"$ROOT/script/rag/requirements-docling.txt"}
WHEEL=${RAG_DOCLING_WHEELHOUSE:-}

usage() {
  cat <<'EOF'
Install docling into a dedicated virtual environment.

Usage:
  script/rag/install-docling.sh [--venv PATH] [--python BIN] [--requirements FILE] [--wheelhouse DIR]

Options:
  --venv PATH          Virtualenv path (default: ./.venv-docling)
  --python BIN         Python executable (default: python3)
  --requirements FILE  Requirements file (default: script/rag/requirements-docling.txt)
  --wheelhouse DIR     Offline wheels directory, enables --no-index install
  -h, --help           Show help
EOF
}

# CLI parsing: value flags consume two positional args.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --venv)
      VENV="$2"
      shift 2
      ;;
    --python)
      PY="$2"
      shift 2
      ;;
    --requirements)
      REQ="$2"
      shift 2
      ;;
    --wheelhouse)
      WHEEL="$2"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "unknown argument: $1" >&2
      usage
      exit 1
      ;;
  esac
done

if ! command -v "$PY" >/dev/null 2>&1; then
  echo "python executable not found: $PY" >&2
  exit 1
fi

# Create the venv only if it does not already exist (idempotent re-runs).
if [[ ! -d "$VENV" ]]; then
  "$PY" -m venv "$VENV"
fi

declare -a PIP=("$VENV/bin/python" "-m" "pip" "--disable-pip-version-check")

# Offline path: install strictly from the wheelhouse, never the network.
# Falls back to the bare "docling" package when no requirements file exists.
if [[ -n "$WHEEL" ]]; then
  if [[ ! -d "$WHEEL" ]]; then
    echo "wheelhouse directory not found: $WHEEL" >&2
    exit 1
  fi
  if [[ -f "$REQ" ]]; then
    "${PIP[@]}" install --no-index --find-links "$WHEEL" -r "$REQ"
  else
    "${PIP[@]}" install --no-index --find-links "$WHEEL" docling
  fi
  "$VENV/bin/docling" --version
  echo "docling installed in: $VENV"
  exit 0
fi

# Online path: refresh build tooling first, then install docling.
"${PIP[@]}" install -U pip setuptools wheel

if [[ -f "$REQ" ]]; then
  "${PIP[@]}" install -r "$REQ"
else
  "${PIP[@]}" install docling
fi

# Smoke-test the installed CLI before declaring success.
"$VENV/bin/docling" --version
echo "docling installed in: $VENV"
diff --git a/script/rag/install-offline-bundle.sh b/script/rag/install-offline-bundle.sh
new file mode 100755
index 00000000000..3055716a097
--- /dev/null
+++ b/script/rag/install-offline-bundle.sh
@@ -0,0 +1,95 @@
#!/usr/bin/env bash
set -euo pipefail

# Install the pre-built offline bundle (apt .debs + python wheelhouse).
ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)
BUNDLE=${RAG_OFFLINE_BUNDLE:-"$ROOT/.rag/offline/bundle"}
VENV=${RAG_DOCLING_VENV:-"$ROOT/.venv-docling"}
INSTALL_LLM=false
INSTALL_VECTOR=false

usage() {
  cat <<'EOF'
Install docling+tesseract from an offline bundle.

Usage:
  script/rag/install-offline-bundle.sh [--bundle DIR] [--venv PATH] [--install-llamaindex] [--install-vectordb]

Options:
  --bundle DIR          Offline bundle directory (default: ./.rag/offline/bundle)
  --venv PATH           Venv install path (default: ./.venv-docling)
  --install-llamaindex  Install llamaindex wheels if available in bundle
  --install-vectordb    Install vector db wheels if available in bundle
  -h, --help            Show help
EOF
}

# CLI parsing.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --bundle)
      BUNDLE="$2"
      shift 2
      ;;
    --venv)
      VENV="$2"
      shift 2
      ;;
    --install-llamaindex)
      INSTALL_LLM=true
      shift
      ;;
    --install-vectordb)
      INSTALL_VECTOR=true
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "unknown argument: $1" >&2
      usage
      exit 1
      ;;
  esac
done

# The wheelhouse is the minimum viable bundle content.
if [[ ! -d "$BUNDLE" ]]; then
  echo "bundle directory not found: $BUNDLE" >&2
  exit 1
fi
if [[ ! -d "$BUNDLE/wheelhouse" ]]; then
  echo "wheelhouse not found: $BUNDLE/wheelhouse" >&2
  exit 1
fi

# Escalate via sudo only when not already root.
SUDO=""
if [[ "${EUID:-$(id -u)}" -ne 0 ]]; then
  if command -v sudo >/dev/null 2>&1; then
    SUDO="sudo"
  else
    echo "sudo not found and current user is not root." >&2
    exit 1
  fi
fi

# System packages (tesseract + dependencies) from the bundled .deb files.
if ls "$BUNDLE/deb/"*.deb >/dev/null 2>&1; then
  $SUDO apt-get install -y "$BUNDLE"/deb/*.deb
fi

# docling from the wheelhouse; the installer goes fully offline (--no-index)
# because --wheelhouse is passed.
bash "$ROOT/script/rag/install-docling.sh" \
  --venv "$VENV" \
  --requirements "$BUNDLE/script/rag/requirements-docling.txt" \
  --wheelhouse "$BUNDLE/wheelhouse"

# Optional extras: only when requested AND present in the bundle.
if [[ "$INSTALL_LLM" == "true" && -f "$BUNDLE/script/rag/requirements-llamaindex.txt" ]]; then
  "$VENV/bin/python" -m pip --disable-pip-version-check install \
    --no-index --find-links "$BUNDLE/wheelhouse" \
    -r "$BUNDLE/script/rag/requirements-llamaindex.txt"
fi
if [[ "$INSTALL_VECTOR" == "true" && -f "$BUNDLE/script/rag/requirements-vector.txt" ]]; then
  "$VENV/bin/python" -m pip --disable-pip-version-check install \
    --no-index --find-links "$BUNDLE/wheelhouse" \
    -r "$BUNDLE/script/rag/requirements-vector.txt"
fi

echo "offline install completed"
diff --git a/script/rag/install-tesseract.sh b/script/rag/install-tesseract.sh
new file mode 100755
index 00000000000..c7d2241bd37
--- /dev/null
+++ b/script/rag/install-tesseract.sh
@@ -0,0 +1,72 @@
#!/usr/bin/env bash
set -euo pipefail

# Space-separated tesseract language packs (apt package name suffixes).
LANGS=${RAG_TESS_LANGS:-"eng chi-sim"}
NO_UPDATE=false

usage() {
  cat <<'EOF'
Install tesseract OCR and language packs on Debian/Ubuntu.

Usage:
  script/rag/install-tesseract.sh [--langs "eng chi-sim"] [--no-update]

Options:
  --langs "a b"  Language packs to install (default: "eng chi-sim")
  --no-update    Skip apt update
  -h, --help     Show help
EOF
}

# CLI parsing.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --langs)
      LANGS="$2"
      shift 2
      ;;
    --no-update)
      NO_UPDATE=true
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "unknown argument: $1" >&2
      usage
      exit 1
      ;;
  esac
done

# Everything below drives apt, so bail out early on non-Debian systems.
if ! command -v apt-get >/dev/null 2>&1; then
  echo "apt-get not found. This script currently supports Debian/Ubuntu only." >&2
  exit 1
fi

# Escalate via sudo only when not already root.
SUDO=""
if [[ "${EUID:-$(id -u)}" -ne 0 ]]; then
  if command -v sudo >/dev/null 2>&1; then
    SUDO="sudo"
  else
    echo "sudo not found and current user is not root." >&2
    exit 1
  fi
fi

# Base OCR engine plus one language pack per requested language
# (underscores normalized to dashes, e.g. chi_sim -> tesseract-ocr-chi-sim).
declare -a PKGS=("tesseract-ocr")
read -ra ITEMS <<<"$LANGS"
for l in "${ITEMS[@]}"; do
  [[ -z "$l" ]] && continue
  PKGS+=("tesseract-ocr-${l//_/-}")
done

if [[ "$NO_UPDATE" != "true" ]]; then
  $SUDO apt-get update
fi
$SUDO apt-get install -y "${PKGS[@]}"

# Smoke test: show the version and the first page of installed languages.
tesseract --version | head -n 2
tesseract --list-langs | sed -n '1,40p'
echo "tesseract installed"
diff --git a/script/rag/install-vector.sh b/script/rag/install-vector.sh
new file mode 100755
index 00000000000..881786e23b2
--- /dev/null
+++ b/script/rag/install-vector.sh
@@ -0,0 +1,79 @@
#!/usr/bin/env bash
set -euo pipefail

# Defaults are overridable via env vars or the CLI flags parsed below.
# Shares the docling venv so the whole pipeline runs from one interpreter.
ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)
VENV=${RAG_DOCLING_VENV:-"$ROOT/.venv-docling"}
PY=${RAG_DOCLING_PYTHON:-python3}
REQ=${RAG_VECTOR_REQUIREMENTS:-"$ROOT/script/rag/requirements-vector.txt"}
WHEEL=${RAG_DOCLING_WHEELHOUSE:-}

usage() {
  cat <<'EOF'
Install vector database dependencies into the existing rag virtual environment.

Usage:
  script/rag/install-vector.sh [--venv PATH] [--python BIN] [--requirements FILE] [--wheelhouse DIR]

Options:
  --venv PATH          Virtualenv path (default: ./.venv-docling)
  --python BIN         Python executable (default: python3)
  --requirements FILE  Requirements file (default: script/rag/requirements-vector.txt)
  --wheelhouse DIR     Offline wheels directory, enables --no-index install
  -h, --help           Show help
EOF
}

# CLI parsing: value flags consume two positional args.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --venv)
      VENV="$2"
      shift 2
      ;;
    --python)
      PY="$2"
      shift 2
      ;;
    --requirements)
      REQ="$2"
      shift 2
      ;;
    --wheelhouse)
      WHEEL="$2"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "unknown argument: $1" >&2
      usage
      exit 1
      ;;
  esac
done

if ! command -v "$PY" >/dev/null 2>&1; then
  echo "python executable not found: $PY" >&2
  exit 1
fi

# Create the venv only if it does not already exist (idempotent re-runs).
if [[ ! -d "$VENV" ]]; then
  "$PY" -m venv "$VENV"
fi

declare -a PIP=("$VENV/bin/python" "-m" "pip" "--disable-pip-version-check")

# Offline path: install strictly from the wheelhouse, never the network.
# NOTE(review): unlike install-docling.sh there is no fallback when $REQ is
# missing — pip will fail with its own error; confirm that is intended.
if [[ -n "$WHEEL" ]]; then
  if [[ ! -d "$WHEEL" ]]; then
    echo "wheelhouse directory not found: $WHEEL" >&2
    exit 1
  fi
  "${PIP[@]}" install --no-index --find-links "$WHEEL" -r "$REQ"
  echo "vector dependencies installed in: $VENV"
  exit 0
fi

# Online path: refresh build tooling first, then install the requirements.
"${PIP[@]}" install -U pip setuptools wheel
"${PIP[@]}" install -r "$REQ"
echo "vector dependencies installed in: $VENV"
diff --git a/script/rag/merge-image-ocr.py b/script/rag/merge-image-ocr.py
new file mode 100755
index 00000000000..57de0fb886c
--- /dev/null
+++ b/script/rag/merge-image-ocr.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+import re
+from datetime import datetime, timezone
+from pathlib import Path
+
+
def read(path: Path) -> str:
    """Return the file's text, or '' when the file does not exist.

    Undecodable bytes are dropped (errors='ignore').
    """
    if path.exists():
        return path.read_text(encoding="utf-8", errors="ignore")
    return ""
+
+
def clean(text: str) -> str:
    """Collapse whitespace runs to single spaces and trim the ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
+
+
def snippet(text: str, n: int) -> str:
    """Truncate text to at most n chars, appending ' ...' when cut."""
    if len(text) > n:
        return text[:n].rstrip() + " ..."
    return text
+
+
def inline_block(image_id: str, text: str, limit: int, mode: str) -> str:
    """Render the inline replacement for one image placeholder.

    mode 'none' drops the image entirely; mode 'marker' (or missing OCR
    text) emits just the [IMAGE:id] tag; mode 'ocr' additionally embeds
    the truncated OCR text inside an [IMAGE_OCR] block.
    """
    if mode == "none":
        return ""
    tag = f"[IMAGE:{image_id}]"
    if mode == "marker" or not text:
        return tag
    return f"{tag}\n[IMAGE_OCR]\n{snippet(text, limit)}\n[/IMAGE_OCR]"
+
+
def main() -> None:
    """Merge per-image OCR output back into a docling text export.

    Replaces each docling image placeholder in --text with an inline
    block for the corresponding --meta row (matched by document order),
    and writes a JSON sidecar describing every image's OCR status.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--text", required=True)
    p.add_argument("--meta", required=True)
    p.add_argument("--ocr-dir", required=True)
    p.add_argument("--sidecar", required=True)
    p.add_argument("--source-url", required=True)
    p.add_argument("--raw", required=False, default="")
    p.add_argument("--inline-limit", type=int, default=2000)
    p.add_argument("--inline-mode", choices=["ocr", "marker", "none"], default="marker")
    args = p.parse_args()

    text_path = Path(args.text)
    meta_path = Path(args.meta)
    ocr_dir = Path(args.ocr_dir)
    sidecar_path = Path(args.sidecar)

    raw = read(text_path)
    if args.raw:
        # Preserve an untouched copy of the original text when requested.
        Path(args.raw).write_text(raw, encoding="utf-8")

    rows = json.loads(read(meta_path) or "[]")
    items = []
    for i, row in enumerate(rows):
        image_id = row.get("id") or f"img-{i}"
        # OCR output files are named <image_id>*.txt; take the first match.
        files = sorted(ocr_dir.glob(f"{image_id}*.txt"))
        ocr_text = clean(read(files[0])) if files else ""
        items.append(
            {
                "id": image_id,
                "index": i,
                "url": row.get("url", ""),
                "alt": row.get("alt", ""),
                "ocr_text": ocr_text,
                "ocr_chars": len(ocr_text),
                "status": "ok" if ocr_text else "empty",
            }
        )

    # BUG FIX: the pattern was empty (re.compile(r"")), which matches at
    # every position, so each count=1 substitution prepended the block at
    # offset 0 instead of replacing a placeholder. Docling's markdown
    # export marks pictures with "<!-- image -->" placeholders; match that.
    # NOTE(review): confirm the upstream export really emits this marker.
    marker = re.compile(r"<!--\s*image\s*-->")
    text = raw
    # Replace placeholders in document order, one per metadata row.
    n = min(len(items), len(marker.findall(raw)))
    for i in range(n):
        block = inline_block(items[i]["id"], items[i]["ocr_text"], args.inline_limit, args.inline_mode)
        # Callable replacement so backslashes in block are not treated as
        # regex group references by re.sub.
        text = marker.sub(lambda _: block, text, count=1)

    text_path.write_text(text, encoding="utf-8")

    sidecar = {
        "source_url": args.source_url,
        "text_file": str(text_path),
        "raw_file": args.raw,
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "images": items,
    }
    sidecar_path.write_text(json.dumps(sidecar, ensure_ascii=False, indent=2), encoding="utf-8")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/script/rag/rag-pipeline.py b/script/rag/rag-pipeline.py
new file mode 100644
index 00000000000..f3572e3b4fc
--- /dev/null
+++ b/script/rag/rag-pipeline.py
@@ -0,0 +1,427 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import os
+import subprocess
+import sys
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+
+
def now() -> str:
    """Current UTC time as an ISO-8601 string (with +00:00 offset)."""
    stamp = datetime.now(timezone.utc)
    return stamp.isoformat()
+
+
def sha(path: Path) -> str:
    """SHA-256 hex digest of a file, streamed in 1 MiB chunks."""
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        while True:
            block = handle.read(1024 * 1024)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
+
+
def rel(path: Path, root: Path) -> str:
    """Path relative to root when possible, otherwise the absolute path."""
    resolved = path.resolve()
    try:
        return str(resolved.relative_to(root.resolve()))
    except ValueError:
        # path lives outside root; keep the absolute form.
        return str(resolved)
+
+
def run(cmd: list[str], *, capture: bool = False) -> str:
    """Run cmd, raising CalledProcessError on failure.

    Returns stdout when capture=True, otherwise '' (output inherited).
    """
    if not capture:
        subprocess.run(cmd, check=True)
        return ""
    proc = subprocess.run(cmd, check=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return proc.stdout
+
+
def urls(args) -> list[str]:
    """Collect URLs from --url flags plus an optional --url-file.

    Blank lines and '#' comments in the file are skipped; the result is
    de-duplicated while preserving first-seen order.
    """
    collected = [u for u in args.url if u]
    if args.url_file:
        source = Path(args.url_file)
        if source.exists():
            for line in source.read_text(encoding="utf-8", errors="ignore").splitlines():
                entry = line.strip()
                if entry and not entry.startswith("#"):
                    collected.append(entry)
    # dict.fromkeys keeps insertion order while dropping duplicates.
    return list(dict.fromkeys(collected))
+
+
def txt_files(dir_path: Path) -> list[Path]:
    """All convertible .txt files under dir_path, sorted.

    Derived artifacts (*.clean.txt, *.raw.txt) are excluded. The log-name
    set is defensive only: rglob('*.txt') cannot match *.log names.
    """
    skip_names = {"_success.log", "_failed.log", "_run.log"}
    result: list[Path] = []
    for candidate in sorted(dir_path.rglob("*.txt")):
        base = candidate.name
        if base in skip_names:
            continue
        if base.endswith((".clean.txt", ".raw.txt")):
            continue
        result.append(candidate)
    return result
+
+
def structured_files(scan_dir: Path, glob: str) -> list[Path]:
    """Regular files under scan_dir matching glob, sorted."""
    matches = [p for p in scan_dir.glob(glob) if p.is_file()]
    return sorted(matches)
+
+
def clean_path(txt: Path) -> Path:
    """Sibling path for the cleaned text: foo.txt -> foo.clean.txt."""
    stem = txt.name[:-4] if txt.name.endswith(".txt") else txt.name
    return txt.with_name(stem + ".clean.txt")
+
+
def structured_path(txt: Path) -> Path:
    """Sibling path for structured output: foo.txt -> foo.structured.json."""
    stem = txt.name[:-4] if txt.name.endswith(".txt") else txt.name
    return txt.with_name(stem + ".structured.json")
+
+
@dataclass
class Env:
    # Resolved locations of the pipeline's helper scripts and interpreter.
    root: Path  # repository root
    py: Path  # python interpreter used to run the helper scripts
    url_to_text: Path  # fetches one URL and converts it to text
    convert_dir: Path  # converts a directory of documents to text
    clean_text: Path  # whitespace/noise cleaner
    structure_text: Path  # section/chunk structurer
    build_index: Path  # vector index builder
+
+
def env(root: Path, py: str) -> Env:
    """Build the Env for a repository root and python interpreter path."""
    scripts = root / "script" / "rag"
    return Env(
        root=root,
        py=Path(py),
        url_to_text=scripts / "url-to-text.sh",
        convert_dir=scripts / "convert-dir-to-text.sh",
        clean_text=scripts / "clean-text.py",
        structure_text=scripts / "structure-text.py",
        build_index=scripts / "build-vector-index.py",
    )
+
+
def process_txt(e: Env, txt: Path, args, source_url: str = "") -> Path:
    """Clean one text file and build its structured JSON.

    Runs clean-text.py then structure-text.py via the configured python,
    forwarding the images sidecar (when present), the source URL, and —
    in llamaindex mode — the summarization model. Returns the structured
    output path.
    """
    c = clean_path(txt)
    s = structured_path(txt)
    run([str(e.py), str(e.clean_text), "--input", str(txt), "--output", str(c)])
    cmd = [
        str(e.py),
        str(e.structure_text),
        "--text",
        str(c),
        "--output",
        str(s),
        "--mode",
        args.struct_mode,
        "--inline-ocr",
        args.inline_ocr,
    ]
    # Sidecar written by the URL fetcher: <stem>.images.json next to the txt.
    img = txt.with_name(txt.name[:-4] + ".images.json") if txt.name.endswith(".txt") else txt.with_name(txt.name + ".images.json")
    if img.exists():
        cmd.extend(["--images", str(img)])
    if source_url:
        cmd.extend(["--source-url", source_url])
    if args.struct_mode == "llamaindex":
        cmd.extend(["--model", args.struct_model])
    run(cmd)
    return s
+
+
def refresh_dir(e: Env, args) -> list[Path]:
    """Convert every document under --input-dir to text, then clean and
    structure each produced .txt; returns the structured JSON paths."""
    src = Path(args.input_dir)
    out = Path(args.text_out_dir)
    out.mkdir(parents=True, exist_ok=True)
    run(["bash", str(e.convert_dir), "--input", str(src), "--output", str(out)])
    return [process_txt(e, txt, args) for txt in txt_files(out)]
+
+
def pick_txt(stdout: str) -> Path:
    """Last non-blank line of url-to-text output, as a Path.

    Raises SystemExit when the output contains no usable line.
    """
    lines = [ln.strip() for ln in stdout.splitlines() if ln.strip()]
    if not lines:
        raise SystemExit("url-to-text returned empty output")
    return Path(lines[-1])
+
+
def refresh_url(e: Env, args) -> list[Path]:
    """Fetch each configured URL to text and structure it.

    Returns the structured JSON paths; exits when no URL was supplied.
    """
    all_urls = urls(args)
    if not all_urls:
        raise SystemExit("no url provided: use --url or --url-file")
    out = []
    for url in all_urls:
        cmd = [
            "bash",
            str(e.url_to_text),
            "--url",
            url,
            "--output",
            args.url_text_dir,
            "--image-inline",
            args.image_inline,
        ]
        if args.ocr_images:
            cmd.append("--ocr-images")
        # url-to-text.sh prints the produced txt path as its last line.
        txt = pick_txt(run(cmd, capture=True))
        out.append(process_txt(e, txt, args, source_url=url))
    return out
+
+
def manifest(paths: list[Path], root: Path, args) -> dict:
    """Build the manifest dict describing the indexed structured files.

    Each docs entry records the root-relative path, content sha256 and
    source URL; top-level fields capture the indexing configuration so a
    later update run can detect incompatible changes.
    """
    docs = {}
    for p in paths:
        key = rel(p, root)
        data = json.loads(p.read_text(encoding="utf-8", errors="ignore"))
        docs[key] = {
            "path": key,
            "sha256": sha(p),
            "source_url": data.get("source_url", ""),
            "updated_at": now(),
        }
    return {
        "version": 1,
        "generated_at": now(),
        "root": str(root.resolve()),
        "collection": args.collection,
        "embedding_model": args.embed_model,
        "struct_mode": args.struct_mode,
        "struct_model": args.struct_model,
        "docs": docs,
    }
+
+
def load_manifest(path: Path) -> dict:
    """Parse the manifest JSON; return {} when missing or unreadable."""
    if not path.exists():
        return {}
    try:
        raw = path.read_text(encoding="utf-8", errors="ignore")
        return json.loads(raw)
    except Exception:
        # A corrupt manifest simply forces a fresh diff baseline.
        return {}
+
+
def write_manifest(path: Path, data: dict) -> None:
    """Serialize data as pretty JSON, creating parent directories."""
    path.parent.mkdir(parents=True, exist_ok=True)
    body = json.dumps(data, ensure_ascii=False, indent=2)
    path.write_text(body, encoding="utf-8")
+
+
def index(e: Env, args, files: list[Path], recreate: bool, delete_keys: list[str]) -> dict:
    """Invoke build-vector-index.py and return its parsed JSON report.

    files are (re)indexed, delete_keys are removed from the collection
    first, and recreate drops the whole collection before indexing.
    """
    cmd = [
        str(e.py),
        str(e.build_index),
        "--db-path",
        args.db_path,
        "--collection",
        args.collection,
        "--model",
        args.embed_model,
        "--root",
        str(args.root),
    ]
    for f in files:
        cmd.extend(["--input", str(f)])
    for key in delete_keys:
        cmd.extend(["--delete-doc-key", key])
    if recreate:
        cmd.append("--recreate")
    out = run(cmd, capture=True)
    return json.loads(out)
+
+
def scan_all(args) -> list[Path]:
    """All structured JSON files under --scan-dir matching --glob."""
    return structured_files(Path(args.scan_dir), args.glob)
+
+
def init_cmd(e: Env, args) -> None:
    """Full (re)build: gather structured files per --source, recreate the
    vector collection, and write a fresh manifest. Prints a JSON report."""
    if args.source == "dir":
        files = refresh_dir(e, args)
    elif args.source == "url":
        files = refresh_url(e, args)
    else:
        # source "structured": reuse whatever structured JSON already exists.
        files = scan_all(args)
    if not files:
        raise SystemExit("no structured files found for init")
    res = index(e, args, files, recreate=True, delete_keys=[])
    man = manifest(files, args.root, args)
    write_manifest(Path(args.manifest), man)
    print(
        json.dumps(
            {
                "mode": "init",
                "files": len(files),
                "manifest": args.manifest,
                "index": res,
            },
            ensure_ascii=False,
            indent=2,
        )
    )
+
+
def update_cmd(e: Env, args) -> None:
    """Incremental update: refresh sources, diff against the stored
    manifest by sha256, and reindex only changed/removed docs.

    Falls back to a full rebuild when the collection name or embedding
    model no longer matches the manifest. Prints a JSON report.
    """
    if args.source == "dir":
        refresh_dir(e, args)
    elif args.source == "url":
        refresh_url(e, args)

    files = scan_all(args)
    old = load_manifest(Path(args.manifest))
    old_docs = old.get("docs", {})
    if not files:
        # Nothing on disk anymore: purge every previously indexed doc.
        new = manifest([], args.root, args)
        removed = sorted(old_docs.keys())
        res = None
        if removed:
            res = index(e, args, [], recreate=False, delete_keys=removed)
        write_manifest(Path(args.manifest), new)
        print(
            json.dumps(
                {
                    "mode": "update",
                    "changed": 0,
                    "removed": len(removed),
                    "manifest": args.manifest,
                    "index": res,
                },
                ensure_ascii=False,
                indent=2,
            )
        )
        return

    new = manifest(files, args.root, args)
    new_docs = new.get("docs", {})

    # A different collection or embedding model invalidates all vectors.
    force_full = False
    if old:
        if old.get("collection") != args.collection or old.get("embedding_model") != args.embed_model:
            force_full = True

    if force_full:
        res = index(e, args, files, recreate=True, delete_keys=[])
        write_manifest(Path(args.manifest), new)
        print(
            json.dumps(
                {
                    "mode": "update",
                    "reason": "collection_or_embedding_changed",
                    "files": len(files),
                    "manifest": args.manifest,
                    "index": res,
                },
                ensure_ascii=False,
                indent=2,
            )
        )
        return

    # changed: new or hash-differing docs; removed: docs gone from disk.
    changed = [k for k, v in new_docs.items() if old_docs.get(k, {}).get("sha256") != v.get("sha256")]
    removed = [k for k in old_docs if k not in new_docs]
    if not changed and not removed:
        write_manifest(Path(args.manifest), new)
        print(
            json.dumps(
                {
                    "mode": "update",
                    "changed": 0,
                    "removed": 0,
                    "manifest": args.manifest,
                    "index": None,
                },
                ensure_ascii=False,
                indent=2,
            )
        )
        return

    pick = {k: Path(args.root) / new_docs[k]["path"] for k in changed}
    # Changed keys are also passed as deletes so stale chunks cannot linger.
    res = index(e, args, [p for p in pick.values() if p.exists()], recreate=False, delete_keys=sorted(set(changed + removed)))
    write_manifest(Path(args.manifest), new)
    print(
        json.dumps(
            {
                "mode": "update",
                "changed": len(changed),
                "removed": len(removed),
                "manifest": args.manifest,
                "index": res,
            },
            ensure_ascii=False,
            indent=2,
        )
    )
+
+
def add_common(sp) -> None:
    """Register the flags shared by the init and update subcommands."""
    # RAG_STRUCT_MODE may preselect the structuring backend; any value
    # other than the two known modes falls back to "llamaindex".
    default_mode = os.getenv("RAG_STRUCT_MODE", "llamaindex")
    if default_mode not in {"rule", "llamaindex"}:
        default_mode = "llamaindex"
    sp.add_argument("--root", default=".")
    sp.add_argument("--python", default="./.venv-docling/bin/python")
    sp.add_argument("--source", choices=["structured", "dir", "url"], default="structured")
    sp.add_argument("--scan-dir", default=".rag/text")
    sp.add_argument("--glob", default="**/*.structured.json")
    sp.add_argument("--input-dir", default="")
    sp.add_argument("--text-out-dir", default=".rag/text/dir")
    sp.add_argument("--url", action="append", default=[])
    sp.add_argument("--url-file", default="")
    sp.add_argument("--url-text-dir", default=".rag/text/url")
    sp.add_argument("--ocr-images", action="store_true")
    sp.add_argument("--image-inline", choices=["marker", "ocr", "none"], default="marker")
    sp.add_argument("--struct-mode", choices=["rule", "llamaindex"], default=default_mode)
    sp.add_argument("--struct-model", default=os.getenv("RAG_STRUCT_MODEL", "gpt-4o-mini"))
    sp.add_argument("--inline-ocr", choices=["strip", "keep"], default="strip")
    sp.add_argument("--embed-model", default="qwen3-embedding:4b")
    sp.add_argument("--db-path", default=".rag/vector/qdrant")
    sp.add_argument("--collection", default="rag_chunks")
    sp.add_argument("--manifest", default=".rag/state/manifest.json")
+
+
def main() -> None:
    """CLI entry point: dispatch to the init or update subcommand."""
    parser = argparse.ArgumentParser()
    sub = parser.add_subparsers(dest="cmd", required=True)
    add_common(sub.add_parser("init"))
    add_common(sub.add_parser("update"))
    args = parser.parse_args()
    args.root = Path(args.root).resolve()
    runtime = env(args.root, args.python)

    if args.cmd == "init":
        init_cmd(runtime, args)
    elif args.cmd == "update":
        update_cmd(runtime, args)
    else:
        # Unreachable with required=True, kept as a safety net.
        raise SystemExit("unknown cmd")
+
+
+if __name__ == "__main__":
+ try:
+ main()
+ except subprocess.CalledProcessError as e:
+ print(
+ json.dumps(
+ {
+ "error": "command_failed",
+ "cmd": e.cmd,
+ "code": e.returncode,
+ "stdout": e.stdout if isinstance(e.stdout, str) else "",
+ "stderr": e.stderr if isinstance(e.stderr, str) else "",
+ },
+ ensure_ascii=False,
+ indent=2,
+ ),
+ file=sys.stderr,
+ )
+ raise SystemExit(e.returncode)
diff --git a/script/rag/requirements-docling.txt b/script/rag/requirements-docling.txt
new file mode 100644
index 00000000000..e195be9fd62
--- /dev/null
+++ b/script/rag/requirements-docling.txt
@@ -0,0 +1 @@
+docling==2.77.0
diff --git a/script/rag/requirements-llamaindex.txt b/script/rag/requirements-llamaindex.txt
new file mode 100644
index 00000000000..7aaa93fb77e
--- /dev/null
+++ b/script/rag/requirements-llamaindex.txt
@@ -0,0 +1,2 @@
+llama-index
+llama-index-llms-openai
diff --git a/script/rag/requirements-vector.txt b/script/rag/requirements-vector.txt
new file mode 100644
index 00000000000..e21b5db2afa
--- /dev/null
+++ b/script/rag/requirements-vector.txt
@@ -0,0 +1,2 @@
+qdrant-client
+openai
diff --git a/script/rag/search-vector-index.py b/script/rag/search-vector-index.py
new file mode 100644
index 00000000000..0b7d6185158
--- /dev/null
+++ b/script/rag/search-vector-index.py
@@ -0,0 +1,365 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+import os
+from pathlib import Path
+
+
def clip(text: str, n: int) -> str:
    """Whitespace-normalize text and truncate to n chars with ' ...'."""
    normalized = " ".join(str(text or "").split())
    if len(normalized) <= n:
        return normalized
    return normalized[:n].rstrip() + " ..."
+
+
def uniq(rows: list[str]) -> list[str]:
    """Strip entries and drop blanks/duplicates, preserving order."""
    seen: set = set()
    result: list = []
    for entry in rows:
        cleaned = str(entry or "").strip()
        if cleaned and cleaned not in seen:
            seen.add(cleaned)
            result.append(cleaned)
    return result
+
+
def pick_json(text: str) -> dict:
    """Parse the outermost {...} object embedded in an LLM response.

    Raises ValueError when no brace pair is found; json decoding errors
    propagate from json.loads.
    """
    start = text.find("{")
    end = text.rfind("}")
    if start < 0 or end <= start:
        raise ValueError("no json object found in rewrite response")
    return json.loads(text[start : end + 1])
+
+
def render_state(query: str, hits: list[dict], rewrite: dict) -> str:
    """Render the compact key=value "state" block for a search result.

    NOTE(review): the joined list begins and ends with "" — these only
    emit blank delimiter lines. They look like wrapper markup (e.g. an
    opening/closing tag) that may have been lost upstream; confirm what
    the consumer of the state format actually expects.
    """
    top = hits[0] if hits else {}
    status = "new_evidence" if hits else "need_refine"
    reason = "top_hits_available" if hits else "empty_hits"
    next_action = "use_delta_or_brief_only_if_needed" if hits else "refine_query_with_device_or_step"
    return "\n".join(
        [
            "",
            f"query={clip(query, 80)}",
            f"status={status}",
            f"reason={reason}",
            f"total_hits={len(hits)}",
            f"top_source={top.get('source_url', '')}",
            f"top_section={clip(top.get('section_title', ''), 48)}",
            f"rewrite_mode={rewrite.get('mode', 'none')}",
            f"rewrite_queries={json.dumps(rewrite.get('queries', []), ensure_ascii=False)}",
            f"next_action={next_action}",
            "",
        ]
    )
+
+
def render_brief(query: str, hits: list[dict], rewrite: dict, top_k: int) -> str:
    """State block plus one summary line per top hit (brief format)."""
    state = render_state(query, hits, rewrite)
    if not hits:
        return state
    body = []
    for i, item in enumerate(hits[: max(1, top_k)], start=1):
        body.append(
            " ".join(
                [
                    f"[{i}]",
                    f"source={item.get('source_url', '')}",
                    f"section={clip(item.get('section_title', ''), 48)}",
                    f"summary={clip(item.get('text_preview', ''), 120)}",
                ]
            )
        )
    return state + "\n" + "\n".join(body)
+
+
def auto_format(value: str) -> str:
    """Resolve format 'auto': 'state' inside opencode (OPENCODE=1),
    'json' otherwise; explicit values pass through unchanged."""
    if value != "auto":
        return value
    return "state" if os.getenv("OPENCODE") == "1" else "json"
+
+
def need_rewrite(query: str) -> bool:
    """Heuristic: long or multi-clause questions benefit from rewriting."""
    text = str(query or "").strip()
    if len(text) >= 48 or text.count(" ") >= 5:
        return True
    marks = ["并且", "以及", "同时", "还有", "怎么", "如何", "步骤", "方式", "版本", "命令"]
    # Two or more topic markers suggest a compound question.
    return sum(1 for mark in marks if mark in text) >= 2
+
+
def auto_rewrite(value: str, model: str, query: str) -> str:
    """Resolve rewrite 'auto': use the LLM only when a model is configured
    and the query looks complex; explicit values pass through."""
    if value != "auto":
        return value
    return "llm" if model and need_rewrite(query) else "off"
+
+
def embed_query(client, model: str, text: str) -> list[float]:
    """Embed one query string via the OpenAI-compatible embeddings API."""
    r = client.embeddings.create(model=model, input=[text])
    return r.data[0].embedding
+
+
def rewrite_query(client, model: str, query: str, limit: int) -> dict:
    """Ask the LLM to distill a verbose question into short retrieval
    queries and keywords.

    Returns {"mode", "queries", "keywords"}. mode records the outcome:
    "off" (no model), "llm" (success), "llm_error" (request failed) or
    "llm_fallback" (response had no parseable JSON). The original query
    is always kept as the first entry of queries.
    """
    if not model:
        return {"mode": "off", "queries": [query], "keywords": []}
    prompt = "\n".join(
        [
            "你是RAG检索改写器。",
            "目标:从长问题中提取真正的检索目标,去掉语义噪声。",
            "输出必须是 JSON 对象,不要输出解释。",
            f"最多给出 {max(1, limit)} 条 queries。",
            '返回格式:{"queries":["..."],"keywords":["..."]}',
            "要求:queries 应短、准、可用于 embedding 检索;keywords 只保留设备名、动作、文档对象、错误码、版本等关键信息。",
            f"原始问题:{query}",
        ]
    )
    try:
        res = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
        )
        text = res.choices[0].message.content or ""
    except Exception:
        # Network/provider failure: degrade to the original query.
        return {"mode": "llm_error", "queries": [query], "keywords": []}
    try:
        data = pick_json(text)
    except Exception:
        # Unparseable response: same degradation, different mode tag.
        return {"mode": "llm_fallback", "queries": [query], "keywords": []}
    queries = uniq([str(x) for x in data.get("queries", [])])[: max(1, limit)]
    if query not in queries:
        queries.insert(0, query)
    keywords = uniq([str(x) for x in data.get("keywords", [])])[:8]
    return {
        "mode": "llm",
        "queries": uniq(queries)[: max(1, limit)],
        "keywords": keywords,
    }
+
+
def related_images(qdrant, models, collection: str, ids: list[str], text_chars: int) -> list[dict]:
    """Look up the stored image node for each image id and return small
    previews (source URL plus truncated image text).

    Ids with no matching point are silently skipped; at most one point
    per id is read via a filtered scroll.
    """
    out = []
    for iid in ids:
        flt = models.Filter(
            must=[
                models.FieldCondition(key="node_type", match=models.MatchValue(value="image")),
                models.FieldCondition(key="image_id", match=models.MatchValue(value=iid)),
            ]
        )
        points, _ = qdrant.scroll(
            collection_name=collection,
            scroll_filter=flt,
            with_payload=True,
            limit=1,
        )
        if not points:
            continue
        payload = points[0].payload or {}
        text = str(payload.get("text", ""))
        # Never truncate below 20 chars so previews stay meaningful.
        n = max(20, text_chars)
        preview = text if len(text) <= n else text[:n].rstrip() + " ..."
        out.append(
            {
                "image_id": iid,
                "source_url": payload.get("source_url", ""),
                "text_preview": preview,
            }
        )
    return out
+
+
def search(qdrant, models, collection: str, vec: list[float], limit: int, node_type: str):
    """Vector-search the collection, optionally filtered by node_type.

    Prefers the newer query_points API when the installed qdrant-client
    provides it, falling back to the older search method otherwise.
    """
    flt = None
    if node_type != "any":
        flt = models.Filter(
            must=[models.FieldCondition(key="node_type", match=models.MatchValue(value=node_type))]
        )
    if hasattr(qdrant, "query_points"):
        res = qdrant.query_points(
            collection_name=collection,
            query=vec,
            limit=max(1, limit),
            with_payload=True,
            query_filter=flt,
        )
        return res.points
    return qdrant.search(
        collection_name=collection,
        query_vector=vec,
        limit=max(1, limit),
        with_payload=True,
        query_filter=flt,
    )
+
+
def fp(payload: dict) -> str:
    """Stable fingerprint for a hit: '<file-or-url>#<chunk/image/section>'."""
    src = str(payload.get("text_file", "") or payload.get("source_url", ""))
    ident = str(
        payload.get("chunk_id", "")
        or payload.get("image_id", "")
        or payload.get("section_title", "")
    )
    return f"{src}#{ident}"
+
+
def collect(points, qdrant, models, args, query: str) -> list[dict]:
    """Convert raw qdrant points into flat hit dicts for one query.

    Each hit carries its rank/score, payload fields, a truncated text
    preview and (unless --no-related-images) previews of the image nodes
    referenced by the chunk's image_ids.
    """
    out = []
    for rank, item in enumerate(points, start=1):
        payload = item.payload or {}
        text = str(payload.get("text", ""))
        # Never truncate below 20 chars so previews stay meaningful.
        n = max(20, args.show_text_chars)
        preview = text if len(text) <= n else text[:n].rstrip() + " ..."
        ids = payload.get("image_ids", [])
        if not isinstance(ids, list):
            ids = []
        ext = (
            []
            if args.no_related_images
            else related_images(
                qdrant,
                models,
                args.collection,
                [str(x) for x in ids if x],
                args.show_text_chars,
            )
        )
        out.append(
            {
                "fp": fp(payload),
                "query": query,
                "rank": rank,
                "score": float(item.score),
                "node_type": payload.get("node_type", "text"),
                "image_id": payload.get("image_id", ""),
                "chunk_id": payload.get("chunk_id", ""),
                "section_title": payload.get("section_title", ""),
                "source_url": payload.get("source_url", ""),
                "text_file": payload.get("text_file", ""),
                "image_ids": ids,
                "related_images": ext,
                "text_preview": preview,
            }
        )
    return out
+
+
def merge_hits(rows: list[list[dict]], primary: str, top_k: int) -> list[dict]:
    """Fuse per-query hit lists into one ranked list.

    Hits sharing a fingerprint are merged: the best-scoring occurrence
    supplies the display fields, while the hit count, an RRF-style rank
    sum, the max score, and a primary-query bonus are combined into
    rerank_score. Scratch fields are dropped before returning the top_k
    entries sorted by (rerank_score, score) descending.
    """
    display_fields = (
        "node_type", "image_id", "chunk_id", "section_title", "source_url",
        "text_file", "image_ids", "related_images", "text_preview",
    )
    fused: dict = {}
    for batch in rows:
        for hit in batch:
            rrf_part = 1.0 / (60 + int(hit["rank"]))
            entry = fused.get(hit["fp"])
            if entry is None:
                fused[hit["fp"]] = {
                    **hit,
                    "matched_queries": [hit["query"]],
                    "hit_count": 1,
                    "max_score": float(hit["score"]),
                    "rrf": rrf_part,
                    "primary_match": 1 if hit["query"] == primary else 0,
                }
                continue
            if hit["query"] not in entry["matched_queries"]:
                entry["matched_queries"].append(hit["query"])
            entry["hit_count"] += 1
            entry["max_score"] = max(float(entry["max_score"]), float(hit["score"]))
            entry["rrf"] += rrf_part
            if hit["query"] == primary:
                entry["primary_match"] = 1
            if float(hit["score"]) > float(entry["score"]):
                # The stronger occurrence wins the display fields.
                entry["score"] = float(hit["score"])
                for field in display_fields:
                    entry[field] = hit[field]
    ranked = []
    for entry in fused.values():
        entry["rerank_score"] = (
            0.45 * float(entry["max_score"])
            + 0.35 * float(entry["rrf"])
            + 0.12 * float(entry["hit_count"])
            + 0.08 * float(entry["primary_match"])
        )
        for scratch in ("fp", "query", "rank", "max_score", "rrf", "primary_match"):
            entry.pop(scratch, None)
        ranked.append(entry)
    ranked.sort(key=lambda e: (float(e.get("rerank_score", 0)), float(e.get("score", 0))), reverse=True)
    return ranked[: max(1, top_k)]
+
+
def main() -> None:
    """CLI: embed the query (plus optional LLM rewrites), search the
    local qdrant store once per query, fuse the hit lists, and print the
    result in json/state/brief format."""
    p = argparse.ArgumentParser()
    p.add_argument("--query", required=True)
    p.add_argument("--db-path", default=".rag/vector/qdrant")
    p.add_argument("--collection", default="rag_chunks")
    p.add_argument("--model", default="nomic-embed-text")
    p.add_argument("--base-url", default="")
    p.add_argument("--api-key", default="")
    p.add_argument("--top-k", type=int, default=5)
    p.add_argument("--per-query-k", type=int, default=5)
    p.add_argument("--show-text-chars", type=int, default=240)
    p.add_argument("--node-type", choices=["any", "text", "image"], default="any")
    p.add_argument("--no-related-images", action="store_true")
    p.add_argument("--format", choices=["auto", "json", "state", "brief"], default="auto")
    p.add_argument("--rewrite", choices=["auto", "off", "llm"], default="auto")
    p.add_argument("--rewrite-model", default=os.getenv("RAG_REWRITE_MODEL", ""))
    p.add_argument("--rewrite-queries", type=int, default=int(os.getenv("RAG_REWRITE_QUERIES", "3")))
    args = p.parse_args()

    # Import lazily so --help works without the optional dependencies.
    try:
        from openai import OpenAI
        from qdrant_client import QdrantClient, models
    except ModuleNotFoundError as e:
        raise SystemExit(
            f"missing dependency: {e.name}. run: bash script/rag/install-vector.sh"
        ) from e

    # Defaults target a local Ollama OpenAI-compatible endpoint.
    key = args.api_key or os.getenv("OPENAI_API_KEY") or os.getenv("MINIMAX_API_KEY") or "ollama"
    base = args.base_url or os.getenv("OPENAI_BASE_URL") or "http://127.0.0.1:11434/v1"
    client = OpenAI(api_key=key, base_url=base)
    rewrite_mode = auto_rewrite(args.rewrite, args.rewrite_model, args.query)
    rewrite = (
        rewrite_query(client, args.rewrite_model, args.query, max(1, args.rewrite_queries))
        if rewrite_mode == "llm"
        else {"mode": "off", "queries": [args.query], "keywords": []}
    )
    # The original query always searches first; rewrites follow.
    queries = uniq([args.query, *rewrite.get("queries", [])])[: max(1, args.rewrite_queries)]

    db = Path(args.db_path)
    if not db.exists():
        raise SystemExit(f"db path not found: {db}")

    qdrant = QdrantClient(path=str(db))
    rows = []
    for query in queries:
        vec = embed_query(client, args.model, query)
        points = search(qdrant, models, args.collection, vec, max(args.top_k, args.per_query_k), args.node_type)
        rows.append(collect(points, qdrant, models, args, query))

    out = merge_hits(rows, queries[0], args.top_k)
    rewrite["queries"] = queries
    fmt = auto_format(args.format)
    if fmt == "state":
        print(render_state(args.query, out, rewrite))
        return
    if fmt == "brief":
        print(render_brief(args.query, out, rewrite, args.top_k))
        return
    print(json.dumps({"query": args.query, "rewrite": rewrite, "hits": out}, ensure_ascii=False, indent=2))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/script/rag/structure-text.py b/script/rag/structure-text.py
new file mode 100755
index 00000000000..3d158cb80c4
--- /dev/null
+++ b/script/rag/structure-text.py
@@ -0,0 +1,307 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import importlib.util
+import json
+import os
+import re
+import sys
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+
# time.monotonic() timestamp of the most recent LLM call; used by throttle().
LAST_LLM_AT = 0.0
# Inline image marker: [IMAGE:<id>]; group 1 captures the id.
IMAGE_ID_RE = re.compile(r"\[IMAGE:([^\]]+)\]")
# A full inline OCR block including its delimiters (non-greedy body).
IMAGE_OCR_RE = re.compile(r"\[IMAGE_OCR\][\s\S]*?\[/IMAGE_OCR\]")
+
+
def read(path: Path) -> str:
    """Return the file's contents, ignoring undecodable bytes."""
    with path.open("r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()
+
+
def clean(text: str) -> str:
    """Collapse whitespace runs into single spaces and trim the ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
+
+
def strip_inline_ocr(text: str) -> str:
    """Remove [IMAGE_OCR]...[/IMAGE_OCR] blocks and squeeze the runs of
    blank lines the removal leaves behind."""
    without_ocr = re.sub(r"\[IMAGE_OCR\][\s\S]*?\[/IMAGE_OCR\]", "", text)
    squeezed = re.sub(r"\n{3,}", "\n\n", without_ocr)
    return squeezed.strip()
+
+
def image_ids(text: str) -> list[str]:
    """Unique [IMAGE:<id>] ids referenced in text, sorted alphabetically."""
    found = re.findall(r"\[IMAGE:([^\]]+)\]", text)
    return sorted(set(found))
+
+
def split_sections(text: str) -> list[dict]:
    """Split markdown-ish text into sections at #-style headings.

    Content before the first heading is titled 'document'. Sections whose
    body is empty (e.g. consecutive headings) are dropped, and the
    dropped heading's title is lost.
    """
    heading = re.compile(r"^#{1,6}\s+")
    sections: list = []
    current_title = "document"
    pending: list = []

    def flush() -> None:
        # Emit the accumulated body (if any) under the current title.
        body = "\n".join(pending).strip()
        if body:
            sections.append({"title": current_title, "text": body})

    for line in text.splitlines():
        if heading.match(line):
            flush()
            current_title = heading.sub("", line).strip()
            pending = []
        else:
            pending.append(line)
    flush()
    return sections
+
+
def chunk_text(text: str, size: int, overlap: int) -> list[str]:
    """Cut text into windows of `size` chars overlapping by `overlap`.

    Text that already fits in one window is returned as a single-element
    list unchanged.
    """
    if len(text) <= size:
        return [text]
    step = max(1, size - overlap)
    pieces = []
    start = 0
    while start < len(text):
        pieces.append(text[start : start + size])
        if start + size >= len(text):
            break
        start += step
    return pieces
+
+
def rule_summary(text: str, n: int = 280) -> str:
    """Cheap extractive summary: normalized text truncated to n chars."""
    normalized = re.sub(r"\s+", " ", text).strip()
    if len(normalized) <= n:
        return normalized
    return normalized[:n].rstrip() + " ..."
+
+
def throttle(interval: float) -> None:
    """Sleep just long enough to keep at least `interval` seconds between
    LLM calls; no-op when interval <= 0. Updates the shared LAST_LLM_AT
    timestamp (module global, single-threaded use assumed)."""
    global LAST_LLM_AT
    if interval <= 0:
        return
    now = time.monotonic()
    # Remaining time until the next call is allowed.
    wait = LAST_LLM_AT + interval - now
    if wait > 0:
        time.sleep(wait)
    LAST_LLM_AT = time.monotonic()
+
+
def is_rate_limit_error(e: Exception) -> bool:
    """Best-effort detection of provider rate-limit errors by message."""
    message = str(e).lower()
    needles = ("rate limit", "too many requests", "429")
    return any(needle in message for needle in needles)
+
+
def with_retry(
    fn,
    *,
    min_interval: float,
    max_retries: int,
    retry_initial: float,
) -> str:
    """Call fn with inter-call throttling and exponential backoff on
    rate-limit errors.

    Non-rate-limit exceptions, or exhausting max_retries, propagate. The
    backoff starts at retry_initial (floored to 0.1s), doubles per retry,
    and is capped at 30 seconds.
    """
    delay = max(0.1, retry_initial)
    n = 0
    while True:
        throttle(min_interval)
        try:
            return fn()
        except Exception as e:
            if not is_rate_limit_error(e) or n >= max_retries:
                raise
            n += 1
            print(
                f"[llm] rate limit; retry {n}/{max_retries} after {delay:.1f}s",
                file=sys.stderr,
            )
            time.sleep(delay)
            delay = min(delay * 2, 30)
+
+
def llama_summary(
    text: str,
    model: str,
    *,
    min_interval: float,
    max_retries: int,
    retry_initial: float,
) -> str:
    """Summarize text via llama-index's OpenAI LLM, with throttling and
    rate-limit retries.

    Falls back to a raw OpenAI-compatible chat call when llama-index
    rejects the model name but OPENAI_BASE_URL points at a custom
    endpoint. Raises SystemExit when llama-index or an API key is absent.
    """
    if importlib.util.find_spec("llama_index.llms.openai") is None:
        raise SystemExit(
            "llama-index is not installed in this Python environment. "
            "Use ./.venv-docling/bin/python -m pip install -r script/rag/requirements-llamaindex.txt"
        )

    # Cap the prompt body so requests stay small and predictable in cost.
    prompt = (
        "Summarize the following text in Chinese, keep factual key points in 3 sentences max.\n\n"
        f"{text[:6000]}"
    )

    def key() -> str:
        # Resolved lazily so the error names the accepted variables.
        k = os.getenv("OPENAI_API_KEY") or os.getenv("MINIMAX_API_KEY")
        if k:
            return k
        raise SystemExit(
            "OPENAI_API_KEY is required for --mode llamaindex "
            "(MINIMAX_API_KEY is also accepted)."
        )

    def compat() -> str:
        # Plain OpenAI-compatible chat call that bypasses llama-index.
        from openai import OpenAI as OpenAIClient

        client = OpenAIClient(
            api_key=key(),
            base_url=os.getenv("OPENAI_BASE_URL") or None,
        )
        res = client.chat.completions.create(
            model=model,
            temperature=0,
            messages=[{"role": "user", "content": prompt}],
        )
        msg = res.choices[0].message.content if res.choices else ""
        return clean(msg or "")

    from llama_index.llms.openai import OpenAI

    try:
        return with_retry(
            lambda: clean(
                OpenAI(
                    model=model,
                    temperature=0,
                    api_base=os.getenv("OPENAI_BASE_URL"),
                    api_key=key(),
                ).complete(prompt).text
            ),
            min_interval=min_interval,
            max_retries=max_retries,
            retry_initial=retry_initial,
        )
    except ValueError as e:
        # llama-index raises ValueError("Unknown model ...") for model
        # names it cannot map; anything else is a real error.
        if "Unknown model" not in str(e):
            raise
        if not os.getenv("OPENAI_BASE_URL"):
            raise SystemExit(
                f"Unknown model '{model}'. Set OPENAI_BASE_URL to your compatible endpoint, "
                "for example: https://api.minimaxi.com/v1"
            )
        return with_retry(
            compat,
            min_interval=min_interval,
            max_retries=max_retries,
            retry_initial=retry_initial,
        )
+
+
def main() -> None:
    """Build the structured JSON (sections, chunks, image nodes) for one
    cleaned text file and write it to --output."""
    p = argparse.ArgumentParser()
    p.add_argument("--text", required=True)
    p.add_argument("--images", required=False, default="")
    p.add_argument("--output", required=True)
    p.add_argument("--source-url", required=False, default="")
    p.add_argument("--mode", choices=["rule", "llamaindex"], default="rule")
    p.add_argument("--model", default="gpt-4o-mini")
    p.add_argument("--llm-min-interval", type=float, default=1.0)
    p.add_argument("--llm-max-retries", type=int, default=6)
    p.add_argument("--llm-retry-initial", type=float, default=1.5)
    p.add_argument("--inline-ocr", choices=["strip", "keep"], default="strip")
    p.add_argument("--chunk-size", type=int, default=1600)
    p.add_argument("--chunk-overlap", type=int, default=200)
    args = p.parse_args()

    text_path = Path(args.text)
    src = read(text_path)
    sections = split_sections(src)

    # Optional images sidecar (from merge-image-ocr): id -> metadata row.
    image_rows = []
    image_map = {}
    if args.images:
        rows = json.loads(read(Path(args.images)))
        image_rows = rows.get("images", [])
        for item in image_rows:
            image_map[item["id"]] = item

    out_sections = []
    chunks = []
    nodes = []
    for si, sec in enumerate(sections):
        body = strip_inline_ocr(sec["text"]) if args.inline_ocr == "strip" else sec["text"]
        ids = image_ids(body)
        # Cheap rule summary first; replaced by the LLM in llamaindex mode.
        summary = rule_summary(body)
        if args.mode == "llamaindex":
            summary = llama_summary(
                body,
                args.model,
                min_interval=args.llm_min_interval,
                max_retries=args.llm_max_retries,
                retry_initial=args.llm_retry_initial,
            )

        out_sections.append(
            {
                "id": f"sec-{si}",
                "title": sec["title"],
                "summary": summary,
                "image_ids": ids,
                "images": [image_map[i] for i in ids if i in image_map],
                "text": body,
            }
        )

        # NOTE(review): the inner loop variable reuses the name `body`,
        # shadowing the section body. Harmless today — the section body is
        # not used after this point — but fragile to future edits.
        parts = chunk_text(body, args.chunk_size, args.chunk_overlap)
        for ci, body in enumerate(parts):
            ids2 = image_ids(body)
            chunk = {
                "id": f"sec-{si}-chunk-{ci}",
                "type": "text",
                "section_id": f"sec-{si}",
                "section_title": sec["title"],
                "text": body,
                "image_ids": ids2,
                "metadata": {
                    "source_url": args.source_url,
                    "text_file": str(text_path),
                    "char_len": len(body),
                },
            }
            chunks.append(chunk)
            nodes.append(chunk)

    # One retrieval node per image, linked back to referencing sections.
    image_nodes = []
    for item in image_rows:
        iid = item.get("id")
        if not iid:
            continue
        refs = [sec["id"] for sec in out_sections if iid in sec["image_ids"]]
        # Embeddable text for the image: alt text plus OCR text, normalized.
        text = clean("\n".join(x for x in [item.get("alt", ""), item.get("ocr_text", "")] if x))
        image = {
            "id": f"image-{iid}",
            "type": "image",
            "image_id": iid,
            "section_ids": refs,
            "source_url": item.get("url", ""),
            "alt": item.get("alt", ""),
            "ocr_text": item.get("ocr_text", ""),
            "text": text,
            "metadata": {
                "source_url": args.source_url,
                "text_file": str(text_path),
                "ocr_chars": item.get("ocr_chars", len(item.get("ocr_text", "") or "")),
                "status": item.get("status", ""),
            },
        }
        image_nodes.append(image)
        nodes.append(image)

    out = {
        "source_url": args.source_url,
        "text_file": str(text_path),
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "mode": args.mode,
        "inline_ocr": args.inline_ocr,
        "sections": out_sections,
        "chunks": chunks,
        "image_nodes": image_nodes,
        "nodes": nodes,
    }
    Path(args.output).write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/script/rag/url-to-text.sh b/script/rag/url-to-text.sh
new file mode 100755
index 00000000000..bd600008225
--- /dev/null
+++ b/script/rag/url-to-text.sh
@@ -0,0 +1,449 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)
+DOC=${RAG_DOCLING_BIN:-"$ROOT/.venv-docling/bin/docling"}
+PY=${RAG_DOCLING_PYTHON_BIN:-"$ROOT/.venv-docling/bin/python"}
+OUT=${RAG_TEXT_URL_OUTPUT:-"$ROOT/.rag/text/url"}
+HTML=${RAG_TEXT_URL_HTML:-"$ROOT/.rag/html/url"}
+URL=""
+NAME=""
+KEEP_HTML=false
+OCR_IMAGES=false
+IMAGE_LIMIT=${RAG_TEXT_URL_IMAGE_LIMIT:-30}
+OCR_ENGINE=${RAG_TEXT_URL_OCR_ENGINE:-}
+OCR_LANG=${RAG_TEXT_URL_OCR_LANG:-}
+OCR_ARTIFACTS=${RAG_TEXT_URL_OCR_ARTIFACTS:-}
+OCR_PSM=${RAG_TEXT_URL_OCR_PSM:-}
+IMAGE_INLINE=${RAG_TEXT_URL_IMAGE_INLINE:-marker}
+USER=${RAG_TEXT_URL_USER:-}
+PASS=${RAG_TEXT_URL_PASSWORD:-}
+COOKIE=${RAG_TEXT_URL_COOKIE:-}
+COOKIE_FILE=${RAG_TEXT_URL_COOKIE_FILE:-}
+PROXY=${RAG_TEXT_URL_PROXY:-}
+NO_PROXY_MODE=false
+INSECURE=false
+declare -a HDR=()
+
+usage() {
+ cat <<'EOF'
+Fetch one URL as HTML, then convert it to plain text with docling.
+
+Usage:
+ script/rag/url-to-text.sh --url URL [--name NAME] [--output DIR] [--html-dir DIR] [--header "K: V"] [--user USER --password PASS] [--cookie "a=b"] [--cookie-file FILE] [--proxy URL] [--no-proxy] [--insecure] [--keep-html] [--ocr-images] [--image-limit N] [--ocr-engine NAME] [--ocr-lang CODE] [--psm N] [--image-inline MODE]
+
+Options:
+ --url URL Source URL to fetch
+ --name NAME Output file stem (default: generated from URL)
+ --output DIR Text output directory (default: ./.rag/text/url)
+ --html-dir DIR Downloaded HTML directory (default: ./.rag/html/url)
+ --header "K: V" Extra request header for curl (repeatable)
+ --user USER HTTP auth username for URL fetch
+ --password PASS HTTP auth password for URL fetch (or set RAG_TEXT_URL_PASSWORD)
+ --cookie "k=v;..." Cookie header value
+ --cookie-file FILE Netscape cookie file used by curl
+ --proxy URL Proxy for curl requests
+ --no-proxy Bypass proxy for all hosts (adds --noproxy "*")
+ --insecure Allow insecure TLS for intranet/self-signed cert
+ --keep-html Keep downloaded HTML file
+  --ocr-images         OCR text in <img> resources and append to output txt
+ --image-limit N Max images to OCR when --ocr-images is enabled (default: 30)
+ --ocr-engine NAME OCR engine for image OCR (for example: tesseract, rapidocr, auto)
+ --ocr-lang CODE OCR language list (for example: eng or eng,chi_sim)
+ --psm N OCR page segmentation mode, 0-13 (useful for tesseract)
+ --image-inline MODE Inline image strategy: marker|ocr|none (default: marker)
+ --artifacts-path PATH Local docling artifacts path for OCR-related models
+ --docling-bin PATH docling executable (default: ./.venv-docling/bin/docling)
+ --python-bin PATH python executable used to parse html img tags (default: ./.venv-docling/bin/python)
+ -h, --help Show help
+EOF
+}
+
+slug() {
+ printf '%s' "$1" |
+ sed -E 's#https?://##; s#[^a-zA-Z0-9._-]+#-#g; s#-+#-#g; s#(^-|-$)##g' |
+ cut -c1-120
+}
+
+while [[ $# -gt 0 ]]; do
+ case "$1" in
+ --url)
+ URL="$2"
+ shift 2
+ ;;
+ --name)
+ NAME="$2"
+ shift 2
+ ;;
+ --output)
+ OUT="$2"
+ shift 2
+ ;;
+ --html-dir)
+ HTML="$2"
+ shift 2
+ ;;
+ --header)
+ HDR+=("$2")
+ shift 2
+ ;;
+ --user)
+ USER="$2"
+ shift 2
+ ;;
+ --password)
+ PASS="$2"
+ shift 2
+ ;;
+ --cookie)
+ COOKIE="$2"
+ shift 2
+ ;;
+ --cookie-file)
+ COOKIE_FILE="$2"
+ shift 2
+ ;;
+ --proxy)
+ PROXY="$2"
+ shift 2
+ ;;
+ --no-proxy)
+ NO_PROXY_MODE=true
+ shift
+ ;;
+ --insecure)
+ INSECURE=true
+ shift
+ ;;
+ --keep-html)
+ KEEP_HTML=true
+ shift
+ ;;
+ --ocr-images)
+ OCR_IMAGES=true
+ shift
+ ;;
+ --image-limit)
+ IMAGE_LIMIT="$2"
+ shift 2
+ ;;
+ --ocr-engine)
+ OCR_ENGINE="$2"
+ shift 2
+ ;;
+ --ocr-lang)
+ OCR_LANG="$2"
+ shift 2
+ ;;
+ --psm)
+ OCR_PSM="$2"
+ shift 2
+ ;;
+ --image-inline)
+ IMAGE_INLINE="$2"
+ shift 2
+ ;;
+ --artifacts-path)
+ OCR_ARTIFACTS="$2"
+ shift 2
+ ;;
+ --docling-bin)
+ DOC="$2"
+ shift 2
+ ;;
+ --python-bin)
+ PY="$2"
+ shift 2
+ ;;
+ -h|--help)
+ usage
+ exit 0
+ ;;
+ *)
+ echo "unknown argument: $1" >&2
+ usage
+ exit 1
+ ;;
+ esac
+done
+
+if [[ -z "$URL" ]]; then
+ echo "--url is required" >&2
+ usage
+ exit 1
+fi
+
+if [[ ! -x "$DOC" ]]; then
+ echo "docling not found: $DOC" >&2
+ exit 1
+fi
+
+if ! command -v curl >/dev/null 2>&1; then
+ echo "curl not found" >&2
+ exit 1
+fi
+
+if [[ -n "$COOKIE_FILE" && ! -f "$COOKIE_FILE" ]]; then
+ echo "cookie file not found: $COOKIE_FILE" >&2
+ exit 1
+fi
+
+if [[ "$OCR_IMAGES" == "true" && ! -x "$PY" ]]; then
+ echo "python not found or not executable: $PY" >&2
+ exit 1
+fi
+
+if [[ "$OCR_IMAGES" == "true" ]]; then
+ if [[ -z "$OCR_ENGINE" ]]; then
+ if command -v tesseract >/dev/null 2>&1; then
+ OCR_ENGINE="tesseract"
+ if [[ -z "$OCR_LANG" ]]; then
+ OCR_LANG="eng,chi_sim"
+ fi
+ echo "image OCR engine selected: tesseract" >&2
+ else
+ OCR_ENGINE="auto"
+ echo "image OCR engine selected: auto (tesseract not found)" >&2
+ fi
+ fi
+
+ if [[ "$OCR_ENGINE" == "tesseract" ]]; then
+ if ! command -v tesseract >/dev/null 2>&1; then
+ echo "tesseract not found, install it first: sudo apt install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-chi-sim" >&2
+ exit 1
+ fi
+ if [[ -z "$OCR_LANG" ]]; then
+ OCR_LANG="eng,chi_sim"
+ fi
+ if [[ -z "$OCR_PSM" ]]; then
+ OCR_PSM="6"
+ fi
+ fi
+ echo "image OCR config: engine=$OCR_ENGINE lang=${OCR_LANG:-} psm=${OCR_PSM:-}" >&2
+fi
+
+if [[ -n "$OCR_PSM" ]] && ! [[ "$OCR_PSM" =~ ^[0-9]+$ ]]; then
+ echo "invalid --psm: $OCR_PSM" >&2
+ exit 1
+fi
+if [[ "$IMAGE_INLINE" != "marker" && "$IMAGE_INLINE" != "ocr" && "$IMAGE_INLINE" != "none" ]]; then
+ echo "invalid --image-inline: $IMAGE_INLINE (expected marker|ocr|none)" >&2
+ exit 1
+fi
+
+if [[ -z "$NAME" ]]; then
+ NAME=$(slug "$URL")
+fi
+
+if [[ -z "$NAME" ]]; then
+ NAME="page-$(date +%Y%m%d-%H%M%S)"
+fi
+
+mkdir -p "$OUT" "$HTML"
+HTML_FILE="$HTML/$NAME.html"
+
+declare -a CURL_CMD=("curl" "-fsSL")
+if [[ "$NO_PROXY_MODE" == "true" ]]; then
+ CURL_CMD+=("--noproxy" "*")
+elif [[ -n "$PROXY" ]]; then
+ CURL_CMD+=("--proxy" "$PROXY")
+fi
+if [[ "$INSECURE" == "true" ]]; then
+ CURL_CMD+=("-k")
+fi
+if [[ -n "$USER" ]]; then
+ CURL_CMD+=("-u" "$USER:$PASS")
+fi
+if [[ -n "$COOKIE" ]]; then
+ CURL_CMD+=("-H" "Cookie: $COOKIE")
+fi
+if [[ -n "$COOKIE_FILE" ]]; then
+ CURL_CMD+=("-b" "$COOKIE_FILE")
+fi
+CURL_CMD+=("$URL" "-o" "$HTML_FILE")
+for h in "${HDR[@]}"; do
+ CURL_CMD+=("-H" "$h")
+done
+"${CURL_CMD[@]}"
+
+"$DOC" "$HTML_FILE" --from html --to text --output "$OUT" --abort-on-error
+
+TXT_FILE="$OUT/$NAME.txt"
+if [[ ! -f "$TXT_FILE" ]]; then
+ FALLBACK=$(find "$OUT" -maxdepth 1 -type f -name "$NAME*.txt" | head -n 1 || true)
+ if [[ -n "$FALLBACK" ]]; then
+ TXT_FILE="$FALLBACK"
+ fi
+fi
+
+if [[ ! -f "$TXT_FILE" ]]; then
+ echo "docling conversion finished but no txt was found for: $NAME" >&2
+ exit 1
+fi
+
+if [[ "$OCR_IMAGES" == "true" ]]; then
+ TMP=$(mktemp -d)
+ trap 'rm -rf "$TMP"' EXIT
+ IMG_LIST="$TMP/image_urls.txt"
+ IMG_META="$TMP/image_meta.json"
+ IMG_DIR="$TMP/images"
+ OCR_DIR="$TMP/ocr"
+ mkdir -p "$IMG_DIR" "$OCR_DIR"
+
+ "$PY" - "$URL" "$HTML_FILE" "$IMG_LIST" "$IMG_META" <<'PY'
+import json
+import pathlib
+import sys
+from urllib.parse import urljoin
+from bs4 import BeautifulSoup
+
+base = sys.argv[1]
+html_path = pathlib.Path(sys.argv[2])
+out = pathlib.Path(sys.argv[3])
+meta = pathlib.Path(sys.argv[4])
+raw = html_path.read_text(encoding="utf-8", errors="ignore")
+soup = BeautifulSoup(raw, "html.parser")
+seen = set()
+rows = []
+for n in soup.find_all("img"):
+ src = (n.get("src") or n.get("data-src") or n.get("data-original") or "").strip()
+ if not src:
+ continue
+ if src.startswith("data:"):
+ continue
+ u = urljoin(base, src)
+ if not u or u in seen:
+ continue
+ seen.add(u)
+ rows.append(
+ {
+ "id": f"img-{len(rows)}",
+ "url": u,
+ "alt": (n.get("alt") or "").strip(),
+ }
+ )
+out.write_text("\n".join(row["url"] for row in rows), encoding="utf-8")
+meta.write_text(json.dumps(rows, ensure_ascii=False, indent=2), encoding="utf-8")
+PY
+
+ mapfile -t IMAGES <"$IMG_LIST"
+ MAX="$IMAGE_LIMIT"
+ if ! [[ "$MAX" =~ ^[0-9]+$ ]]; then
+ echo "invalid --image-limit: $MAX" >&2
+ exit 1
+ fi
+
+ OCR_OK=0
+ OCR_BAD=0
+ OCR_DONE=0
+ OCR_LOG="$OUT/$NAME.image_ocr.log"
+ : >"$OCR_LOG"
+ for u in "${IMAGES[@]}"; do
+ if [[ "$OCR_DONE" -ge "$MAX" ]]; then
+ break
+ fi
+ clean="${u%%\?*}"
+ ext="${clean##*.}"
+ if [[ "$ext" == "$clean" ]] || [[ ! "$ext" =~ ^[A-Za-z0-9]{1,6}$ ]]; then
+ ext="img"
+ fi
+ f="$IMG_DIR/img-$OCR_DONE.$ext"
+ declare -a CURL_IMAGE=("curl" "-fsSL")
+ if [[ "$NO_PROXY_MODE" == "true" ]]; then
+ CURL_IMAGE+=("--noproxy" "*")
+ elif [[ -n "$PROXY" ]]; then
+ CURL_IMAGE+=("--proxy" "$PROXY")
+ fi
+ if [[ "$INSECURE" == "true" ]]; then
+ CURL_IMAGE+=("-k")
+ fi
+ if [[ -n "$USER" ]]; then
+ CURL_IMAGE+=("-u" "$USER:$PASS")
+ fi
+ if [[ -n "$COOKIE" ]]; then
+ CURL_IMAGE+=("-H" "Cookie: $COOKIE")
+ fi
+ if [[ -n "$COOKIE_FILE" ]]; then
+ CURL_IMAGE+=("-b" "$COOKIE_FILE")
+ fi
+ CURL_IMAGE+=("$u" "-o" "$f" "-H" "Referer: $URL")
+ for h in "${HDR[@]}"; do
+ CURL_IMAGE+=("-H" "$h")
+ done
+ if ! "${CURL_IMAGE[@]}" >/dev/null 2>&1; then
+ OCR_BAD=$((OCR_BAD + 1))
+ OCR_DONE=$((OCR_DONE + 1))
+ continue
+ fi
+
+ t="$OCR_DIR/$(basename "$f").txt"
+ if [[ "$OCR_ENGINE" == "tesseract" ]]; then
+ declare -a TESS=("tesseract" "$f" "stdout")
+ if [[ -n "$OCR_LANG" ]]; then
+ TESS+=("-l" "${OCR_LANG//,/+}")
+ fi
+ if [[ -n "$OCR_PSM" ]]; then
+ TESS+=("--psm" "$OCR_PSM")
+ fi
+ if "${TESS[@]}" >"$t" 2>>"$OCR_LOG"; then
+ :
+ else
+ OCR_BAD=$((OCR_BAD + 1))
+ OCR_DONE=$((OCR_DONE + 1))
+ continue
+ fi
+ else
+ declare -a OCR_CMD=("$DOC" "$f" "--from" "image" "--to" "text" "--output" "$OCR_DIR" "--ocr" "--force-ocr" "--abort-on-error")
+ if [[ -n "$OCR_ENGINE" ]]; then
+ OCR_CMD+=("--ocr-engine" "$OCR_ENGINE")
+ fi
+ if [[ -n "$OCR_LANG" ]]; then
+ OCR_CMD+=("--ocr-lang" "$OCR_LANG")
+ fi
+ if [[ -n "$OCR_ARTIFACTS" ]]; then
+ OCR_CMD+=("--artifacts-path" "$OCR_ARTIFACTS")
+ fi
+ if [[ -n "$OCR_PSM" ]]; then
+ OCR_CMD+=("--psm" "$OCR_PSM")
+ fi
+ if "${OCR_CMD[@]}" >>"$OCR_LOG" 2>&1; then
+ :
+ else
+ OCR_BAD=$((OCR_BAD + 1))
+ OCR_DONE=$((OCR_DONE + 1))
+ continue
+ fi
+ fi
+
+ if [[ -s "$t" ]] && grep -q '[^[:space:]]' "$t"; then
+ OCR_OK=$((OCR_OK + 1))
+ else
+ OCR_BAD=$((OCR_BAD + 1))
+ fi
+ OCR_DONE=$((OCR_DONE + 1))
+ done
+
+ SIDECAR="$OUT/$NAME.images.json"
+ RAW_TXT="$OUT/$NAME.raw.txt"
+ "$PY" "$ROOT/script/rag/merge-image-ocr.py" \
+ --text "$TXT_FILE" \
+ --meta "$IMG_META" \
+ --ocr-dir "$OCR_DIR" \
+ --sidecar "$SIDECAR" \
+ --raw "$RAW_TXT" \
+ --inline-mode "$IMAGE_INLINE" \
+ --source-url "$URL"
+
+ echo "image_ocr_total=${#IMAGES[@]} scanned=$OCR_DONE success=$OCR_OK failed=$OCR_BAD" >&2
+ echo "image_sidecar=$SIDECAR" >&2
+ if [[ "${#IMAGES[@]}" -gt 0 && "$OCR_OK" -eq 0 ]]; then
+ echo "image OCR produced no text; inspect log: $OCR_LOG" >&2
+ echo "hint: try --ocr-lang chi_sim or eng,chi_sim with --psm 6; if page images are tiny/icons, OCR may return empty." >&2
+ fi
+fi
+
+if [[ "$KEEP_HTML" != "true" ]]; then
+ rm -f "$HTML_FILE"
+fi
+
+echo "$TXT_FILE"
diff --git a/specs/rag-docling-deploy.zh.md b/specs/rag-docling-deploy.zh.md
new file mode 100644
index 00000000000..9e17aa8adcf
--- /dev/null
+++ b/specs/rag-docling-deploy.zh.md
@@ -0,0 +1,507 @@
+# RAG 文本化部署手册(Docling)
+
+本手册记录从环境准备到文本产出的完整步骤,适合在本地或内网机器复用。
+
+## 1. 环境准备
+
+在 Debian/Ubuntu 上安装 Python 虚拟环境能力:
+
+```bash
+sudo apt update
+sudo apt install -y python3 python3-venv python3-full curl
+```
+
+验证版本:
+
+```bash
+python3 --version
+curl --version | head -n 1
+```
+
+## 2. 安装 Docling(隔离 venv)
+
+在仓库根目录执行:
+
+```bash
+cd /home/zhang/01-my_code/09-my-opencode/opencode-worktrees/rag-enhance
+bash script/rag/install-docling.sh
+```
+
+脚本行为:
+
+1. 创建 `./.venv-docling`
+2. 升级 `pip/setuptools/wheel`
+3. 安装 `script/rag/requirements-docling.txt` 中的 `docling`
+4. 输出 `docling --version` 作为健康检查
+
+可选参数:
+
+```bash
+bash script/rag/install-docling.sh \
+ --venv /opt/rag/.venv-docling \
+ --python python3 \
+ --requirements script/rag/requirements-docling.txt
+```
+
+内网离线安装(本地 wheel 仓):
+
+```bash
+bash script/rag/install-docling.sh \
+ --venv /opt/rag/.venv-docling \
+ --requirements script/rag/requirements-docling.txt \
+ --wheelhouse /opt/rag/docling-wheelhouse
+```
+
+## 3. 激活环境(可选)
+
+脚本默认直接调用绝对路径,不强制激活;如需手动调试可激活:
+
+```bash
+source .venv-docling/bin/activate
+docling --version
+```
+
+## 3.1 安装 Tesseract(方案 A,推荐内网)
+
+在 Debian/Ubuntu 上执行:
+
+```bash
+bash script/rag/install-tesseract.sh
+```
+
+默认安装:
+
+- `tesseract-ocr`
+- `tesseract-ocr-eng`
+- `tesseract-ocr-chi-sim`
+
+可自定义语言包:
+
+```bash
+bash script/rag/install-tesseract.sh --langs "eng chi-sim"
+```
+
+## 4. URL 抓取 HTML 并转换为 text
+
+单 URL:
+
+```bash
+bash script/rag/url-to-text.sh \
+ --url "https://example.com"
+```
+
+开启图片 OCR(识别页面 `img` 里的文字):
+
+```bash
+bash script/rag/url-to-text.sh \
+ --url "https://example.com" \
+ --ocr-images \
+ --image-limit 30 \
+ --image-inline marker
+```
+
+说明:当 `--ocr-images` 启用且系统存在 `tesseract` 时,脚本会默认优先使用 `tesseract`(更适合内网离线)。
+且该路径会直接调用系统 `tesseract`,避免 docling 的 OSD 包装层导致的部分图片误报失败。
+
+`--image-inline` 说明:
+
+1. `marker`:仅保留 `[IMAGE:img-x]` 占位,OCR 文本只放 sidecar(推荐,避免污染 chunk)
+2. `ocr`:将 OCR 内联到正文(老行为)
+3. `none`:移除图片占位
+
+指定 OCR 引擎/语言:
+
+```bash
+bash script/rag/url-to-text.sh \
+ --url "https://example.com" \
+ --ocr-images \
+ --ocr-engine tesseract \
+ --ocr-lang eng,chi_sim \
+ --psm 6
+```
+
+代理控制(避免被错误代理拦住):
+
+```bash
+# 强制绕过代理
+bash script/rag/url-to-text.sh --url "https://example.com" --no-proxy
+
+# 显式指定代理
+bash script/rag/url-to-text.sh --url "https://example.com" --proxy "http://proxy.local:7890"
+```
+
+输出默认为:
+
+- HTML 暂存目录:`./.rag/html/url/`
+- 文本目录:`./.rag/text/url/`
+
+带认证头示例:
+
+```bash
+bash script/rag/url-to-text.sh \
+ --url "https://intranet.example.local/doc?id=123" \
+  --header "Authorization: Bearer <TOKEN>" \
+  --header "Cookie: session=<SESSION_ID>" \
+ --name "intranet-doc-123" \
+ --ocr-images \
+ --keep-html
+```
+
+账号密码认证(Basic/Digest 场景):
+
+```bash
+bash script/rag/url-to-text.sh \
+ --url "https://intranet.example.local/doc/123" \
+ --user "your_user" \
+ --password "your_password" \
+ --ocr-images
+```
+
+Cookie 文件认证(SSO 登录后导出的 cookie):
+
+```bash
+bash script/rag/url-to-text.sh \
+ --url "https://intranet.example.local/doc/123" \
+ --cookie-file /path/to/cookies.txt \
+ --ocr-images
+```
+
+LDAP/SSO 场景说明:
+
+1. LDAP 只负责身份认证,`url-to-text.sh` 不能直接“输入 LDAP”完成网页表单登录
+2. 脚本本质是 `curl` 抓取,通常需要有效 session(Cookie)或网关支持 Basic Auth
+3. 你的内网若是 LDAP + SSO(CAS/OIDC/SAML),推荐先在浏览器登录,再导出 `cookies.txt` 给 `--cookie-file`
+
+命令标准输出会打印生成的 `.txt` 路径,可直接接入后续 embedding 流程。
+
+图片相关输出文件(`--ocr-images`):
+
+1. 主文本:`<name>.txt`(`<img>` 会被替换为 `[IMAGE:img-x]` + 就地 OCR)
+2. 原始文本备份:`<name>.raw.txt`
+3. 图片 sidecar:`<name>.images.json`(包含 `id/url/alt/ocr_text/status`)
+4. OCR 运行日志:`<name>.image_ocr.log`
+
+说明:
+
+1. 默认只提取 HTML 可见文本,不做图片 OCR
+2. `--ocr-images` 会解析页面 `<img>` 链接并逐张 OCR,并就地写回到图片占位符附近
+3. 若页面是前端渲染(图片不在原始 HTML),需要先用浏览器渲染后再抓取 HTML 或导出 PDF 再转文本
+
+### 图片 OCR 常见问题
+
+如果你看到“图片无法识别”或 `image_ocr_total` 有值但 `success=0`,通常是 OCR 模型未就绪:
+
+1. `docling` 的 `rapidocr/auto` 首次运行可能需要联网下载模型
+2. 内网环境需预下载模型并同步缓存,或改用本机 `tesseract`
+
+你给的日志 `wiki.luckfox.com-zh-Luckfox-Pico-Zero-Overview.image_ocr.log` 显示:
+
+1. 模型下载是成功的(`Successfully saved`)
+2. 失败原因是 `RapidOCR returned empty result`(检测不到文字)
+3. 因此该问题不只是“无法访问”,更像是该页面图片内容对 RapidOCR 不友好
+4. 当前切换到 tesseract 后,报错多为 `OSD failed / Too few characters`,可通过 `--psm 6` 降低此类问题
+
+推荐排查顺序:
+
+```bash
+# 1) 查看脚本 stderr 给出的 image_ocr.log(默认在输出目录,如 ./.rag/text/url/<name>.image_ocr.log)
+
+# 2) 若能用系统 OCR,安装 tesseract 后强制使用
+sudo apt install -y tesseract-ocr tesseract-ocr-eng
+bash script/rag/url-to-text.sh --url "https://example.com" --ocr-images --ocr-engine tesseract --ocr-lang eng
+
+# 3) 若必须用 docling 默认 OCR,则在可联网机器先完成一次图片 OCR 预热,
+# 再把相关缓存目录复制到内网机器(例如 ~/.cache/rapidocr、~/.cache/docling)
+```
+
+## 5. 批量目录转 text
+
+把资料目录递归转换成文本,并保持子目录结构:
+
+```bash
+bash script/rag/convert-dir-to-text.sh \
+ --input /data/rag/raw \
+ --output /data/rag/text
+```
+
+默认处理扩展名:
+
+`pdf docx pptx html htm md txt csv xls xlsx xml`
+
+自定义扩展名:
+
+```bash
+bash script/rag/convert-dir-to-text.sh \
+ --input /data/rag/raw \
+ --output /data/rag/text \
+ --ext "pdf docx html"
+```
+
+转换日志:
+
+- 成功清单:`/data/rag/text/_success.log`
+- 失败清单:`/data/rag/text/_failed.log`
+- 运行日志:`/data/rag/text/_run.log`
+
+## 6. 内网离线打包与安装(Ubuntu 22.04)
+
+在可联网机器打包:
+
+```bash
+bash script/rag/build-offline-bundle.sh \
+ --out /tmp/rag-offline-bundle \
+ --langs "eng chi-sim" \
+ --include-llamaindex \
+ --include-vectordb
+```
+
+产物:
+
+1. 目录:`/tmp/rag-offline-bundle`
+2. 压缩包:`/tmp/rag-offline-bundle.tar.gz`
+
+拷贝到内网目标机后安装:
+
+```bash
+tar -xzf rag-offline-bundle.tar.gz
+bash script/rag/install-offline-bundle.sh \
+ --bundle ./rag-offline-bundle \
+ --venv ./.venv-docling \
+ --install-llamaindex \
+ --install-vectordb
+```
+
+## 7. 数据清洗与结构化
+
+清洗文本:
+
+```bash
+./.venv-docling/bin/python script/rag/clean-text.py \
+  --input .rag/text/url/<name>.txt \
+  --output .rag/text/url/<name>.clean.txt
+```
+
+结构化输出(规则模式):
+
+```bash
+./.venv-docling/bin/python script/rag/structure-text.py \
+  --text .rag/text/url/<name>.clean.txt \
+  --images .rag/text/url/<name>.images.json \
+  --output .rag/text/url/<name>.structured.json \
+ --source-url "https://example.com" \
+ --mode rule \
+ --inline-ocr strip
+```
+
+结构化输出(LlamaIndex):
+
+```bash
+export OPENAI_API_KEY=...
+./.venv-docling/bin/python script/rag/structure-text.py \
+  --text .rag/text/url/<name>.clean.txt \
+  --images .rag/text/url/<name>.images.json \
+  --output .rag/text/url/<name>.structured.json \
+ --source-url "https://example.com" \
+ --mode llamaindex \
+ --model gpt-4o-mini
+```
+
+结构化结果包含:
+
+1. `sections`:章节级标题、摘要、正文、关联图片 metadata
+2. `chunks`:可直接喂 embedding 的分块 + `image_ids` + 来源 metadata
+
+## 8. 备用离线方式(wheelhouse 手工流程)
+
+若内网机器不能直接访问公网,建议在可联网机器提前准备 wheel 包:
+
+```bash
+mkdir -p /tmp/docling-wheelhouse
+python3 -m venv /tmp/docling-venv
+/tmp/docling-venv/bin/python -m pip install -U pip
+/tmp/docling-venv/bin/pip download -r script/rag/requirements-docling.txt -d /tmp/docling-wheelhouse
+tar -C /tmp -czf docling-wheelhouse.tar.gz docling-wheelhouse
+```
+
+将 `docling-wheelhouse.tar.gz` 拷贝到内网机器后:
+
+```bash
+tar -xzf docling-wheelhouse.tar.gz
+python3 -m venv .venv-docling
+.venv-docling/bin/python -m pip install -U pip
+.venv-docling/bin/pip install --no-index --find-links ./docling-wheelhouse -r script/rag/requirements-docling.txt
+```
+
+## 9. 最小验收
+
+```bash
+./.venv-docling/bin/docling --version
+bash script/rag/url-to-text.sh --url "https://example.com"
+```
+
+满足以下条件即通过:
+
+1. `docling --version` 正常返回版本信息
+2. URL 转换命令输出一个 `.txt` 文件路径
+3. 对应 `.txt` 文件可读取并包含页面正文
+
+## 10. 向量库落地(Qdrant 本地持久化 + Ollama Embedding)
+
+安装向量依赖:
+
+```bash
+bash script/rag/install-vector.sh
+```
+
+准备 Ollama embedding 模型(建议):
+
+```bash
+ollama pull nomic-embed-text
+```
+
+设置 OpenAI 兼容环境变量(Ollama):
+
+```bash
+export OPENAI_BASE_URL="http://127.0.0.1:11434/v1"
+export OPENAI_API_KEY="ollama"
+```
+
+构建向量索引(单文件):
+
+```bash
+./.venv-docling/bin/python script/rag/build-vector-index.py \
+  --input .rag/text/url/<name>.structured.json \
+ --db-path .rag/vector/qdrant \
+ --collection rag_chunks \
+ --model nomic-embed-text \
+ --recreate
+```
+
+构建向量索引(目录批量):
+
+```bash
+./.venv-docling/bin/python script/rag/build-vector-index.py \
+ --input-dir .rag/text/url \
+ --glob "*.structured.json" \
+ --db-path .rag/vector/qdrant \
+ --collection rag_chunks \
+ --model nomic-embed-text
+```
+
+检索验证:
+
+```bash
+./.venv-docling/bin/python script/rag/search-vector-index.py \
+ --query "如何刷写镜像到 Luckfox Pico Zero" \
+ --db-path .rag/vector/qdrant \
+ --collection rag_chunks \
+ --model nomic-embed-text \
+ --top-k 5
+```
+
+向量脚本产物说明:
+
+1. 向量库目录:`.rag/vector/qdrant`
+2. 集合名:默认 `rag_chunks`
+3. 每条向量 payload 包含:`node_type(text/image)`、`chunk_id`、`section_title`、`source_url`、`image_ids`、`text`
+
+## 11. OpenCode 注入 RAG 上下文
+
+已提供两种接入方式:
+
+1. 自定义工具:`.opencode/tool/rag_search.ts`(手动调用)
+2. 自动注入插件:`.opencode/plugins/rag_context.ts`(每轮用户消息前自动检索 top-k 注入 `<rag_context>`)
+
+建议环境变量:
+
+```bash
+export OPENAI_BASE_URL="http://192.168.0.99:11434/v1"
+export OPENAI_API_KEY="ollama"
+export RAG_STRUCT_MODE="llamaindex"
+export RAG_STRUCT_MODEL="gpt-4o-mini"
+export RAG_EMBED_MODEL="qwen3-embedding:4b"
+export RAG_COLLECTION="rag_chunks"
+export RAG_TOP_K=4
+export RAG_CONTEXT_HITS=2
+export RAG_CONTEXT_CHARS=120
+export RAG_AUTO_INJECT=1
+```
+
+关闭自动注入:
+
+```bash
+export RAG_AUTO_INJECT=0
+```
+
+可选调试(排查“是否注入成功”):
+
+```bash
+export RAG_DEBUG_LOG=1
+```
+
+插件会写入:`.rag/log/rag_context.log`
+
+可选覆盖(当 OpenAI 兼容地址或密钥与默认环境不同):
+
+```bash
+export RAG_BASE_URL="http://192.168.0.99:11434/v1"
+export RAG_API_KEY="ollama"
+```
+
+## 12. Agent 一键编排(Skill)
+
+已新增技能文件:`.opencode/skills/rag-pipeline/SKILL.md`
+
+建议通过统一入口命令执行:
+
+初始化(首建):
+
+```bash
+bash script/rag/cmd/rag-init.sh --source structured --scan-dir .rag/text --glob "**/*.structured.json" --embed-model qwen3-embedding:4b --collection rag_chunks
+```
+
+增量更新:
+
+```bash
+bash script/rag/cmd/rag-update.sh --source structured --scan-dir .rag/text --glob "**/*.structured.json" --embed-model qwen3-embedding:4b --collection rag_chunks
+```
+
+该流程会维护 manifest(默认 `.rag/state/manifest.json`)用于判断:
+
+1. `changed`:内容 hash 变化,执行“先删旧 doc_key,再 upsert 新向量”
+2. `removed`:文件消失,执行按 doc_key 删除
+3. embedding 模型或 collection 变化,自动触发全量重建
+
+建议只暴露这些高层选项给用户:
+
+1. `--source`
+2. `--struct-mode`/`--struct-model`
+3. `--embed-model`
+4. 数据来源参数(`--url`/`--url-file`/`--input-dir`/`--scan-dir`)
+5. `--collection`
+
+其余算法细节(chunk、重试、OCR 引擎细节)默认不暴露。
+
+## 13. 迁移到其他项目
+
+在当前仓库执行:
+
+```bash
+bash script/rag/cmd/rag-bootstrap.sh --target /path/to/target-project
+```
+
+默认会复制:
+
+1. `script/rag/*`(安装、转换、结构化、索引、检索、init/update)
+2. `.opencode/tool/rag_search.*`
+3. `.opencode/plugins/rag_context.ts`
+4. `.opencode/skills/rag-pipeline/SKILL.md`
+
+目标项目里继续执行:
+
+```bash
+cd /path/to/target-project
+bash script/rag/install-docling.sh
+bash script/rag/install-vector.sh
+bash script/rag/cmd/rag-init.sh --help
+```
diff --git a/specs/rag-enhance-architecture.zh.md b/specs/rag-enhance-architecture.zh.md
new file mode 100644
index 00000000000..261f5a851df
--- /dev/null
+++ b/specs/rag-enhance-architecture.zh.md
@@ -0,0 +1,266 @@
+# RAG Enhance 架构设计说明(rag-enhance)
+
+## 1. 目标与设计原则
+
+### 1.1 目标
+
+1. 在 OpenCode 对话中提供稳定的本地 RAG 能力(内网可部署)
+2. 降低重复检索与重复注入导致的推理循环
+3. 控制上下文窗口占用,优先增量披露
+4. 提供可观测调试手段,便于快速定位问题
+
+### 1.2 原则
+
+1. 优先改插件与脚本,不侵入 opencode core
+2. 结构化协议先行(`<rag_state>` + `<rag_context>`)
+3. 去重与增量优先于硬编码“单次限制”
+4. 参数可配置,默认值保守
+
+## 2. 总体架构
+
+### 2.1 模块分层
+
+1. 数据准备层:`script/rag/url-to-text.sh`、`convert-dir-to-text.sh`、`clean-text.py`
+2. 结构化层:`script/rag/structure-text.py`(rule/llamaindex)
+3. 向量索引层:`script/rag/build-vector-index.py` + Qdrant local
+4. 检索层:`script/rag/search-vector-index.py`
+5. 编排层:`script/rag/rag-pipeline.py` + `cmd/rag-init.sh`/`cmd/rag-update.sh`
+6. 交互层:
+ - 自动注入插件:`.opencode/plugins/rag_context.ts`
+ - 手动工具:`.opencode/tool/rag_search.ts`
+ - 共享状态模块:`.opencode/rag.ts`
+
+### 2.2 运行路径
+
+1. 离线/内网数据进入文本化
+2. 文本结构化为 section/chunk/image 节点
+3. embedding 写入 Qdrant(payload 包含 source、section、doc_key 等)
+4. 对话时:插件读取检索结果并注入状态 meta
+5. 长 query 会先执行 rewrite + multi-query retrieval + merge/rerank
+6. 模型必要时再调用 `rag_search(mode=state|delta|brief|expand)` 渐进补证据
+
+## 3. 文档处理与切分策略
+
+### 3.1 当前切分策略
+
+1. 按 Markdown 标题(`#`)拆 section
+2. section 内按固定窗口切 chunk(默认 `chunk_size=1600`, `chunk_overlap=200`)
+3. 图片 OCR 独立为 image node,避免污染正文 chunk
+
+说明:当前不是句法感知切分,`overlap` 用于缓解边界截断,但不能完全消除语义断裂。
+
+### 3.2 结构化与 LLM
+
+1. `structure-text.py` 直接执行时默认 `mode=rule`
+2. `rag-pipeline.py` 默认 `RAG_STRUCT_MODE=llamaindex`
+3. llamaindex 模式下调用 OpenAI 兼容接口做 section summary
+
+## 4. 检索交互协议(RAG-LLM)
+
+### 4.1 注入块
+
+插件当前向用户消息注入一个主逻辑块:
+
+1. `<rag_state>`:检索状态协议(短)
+
+说明:正文证据当前主要通过 `rag_search` 渐进披露,不再由自动注入直接提供。
+
+示例:
+
+```text
+<rag_state>
+status=no_new_evidence
+reason=high_overlap
+cluster=luckfox|zero|烧录
+delta_hits=0
+known_hits=3
+next_action=reuse_known_evidence_or_refine_query
+</rag_state>
+```
+
+### 4.2 status 枚举
+
+1. `new_evidence`
+2. `no_new_evidence`
+3. `weak_match`
+4. `need_refine`
+5. `cluster_throttled`
+6. `retrieval_error`
+7. `state_reset`
+
+### 4.3 reason 典型值
+
+1. `fresh_hits`
+2. `delta_available`
+3. `high_overlap`
+4. `low_score`
+5. `empty_hits`
+6. `cluster_window_limit`
+7. `backend_error`
+8. `parse_error`
+9. `compaction_epoch_changed`
+10. `cached_recent_result`
+
+## 5. 去重、增量与局部限流
+
+### 5.1 Query Cluster
+
+`query_cluster` 为“检索意图簇”,由 query 规范化词项生成(停用词过滤+同义词归一+排序)。
+
+用途:
+
+1. 将近义 query 归为同簇
+2. 对同簇做局部预算与节流
+3. 避免全局限流误伤其他主题
+
+### 5.2 重复检测
+
+1. 命中 fingerprint:`text_file/source + chunk_id/image_id/section`
+2. overlap = 交集 / 当前命中数
+3. `overlap >= RAG_OVERLAP_THRESHOLD` 且无新增时,标记 `no_new_evidence`
+
+### 5.3 增量注入
+
+1. 仅注入“未见过”的 delta hits
+2. 无 delta 时只注入 `<rag_state>`,不重复注入上下文正文
+3. 同 query 的短时间重复触发走缓存复用(`RAG_REUSE_SEC`)
+
+### 5.4 局部限流
+
+1. 仅针对同一 cluster
+2. 时间窗:`RAG_CLUSTER_WINDOW_SEC`
+3. 上限:`RAG_CLUSTER_MAX_FULL`
+4. 超限状态:`cluster_throttled`
+
+## 6. 渐进式披露
+
+`rag_search` 支持模式:
+
+1. `state`:只返回检索状态
+2. `delta`:同 query cluster 仅新增证据(默认)
+3. `brief`:当前命中的短摘要
+4. `expand`:扩展细节(用于二次追问)
+
+策略:
+
+1. 默认由插件持续注入 `rag_state`
+2. 模型需要证据时优先 `delta`
+3. `brief`/`expand` 仅在需要更多正文时使用
+
+## 7. 会话生命周期与 compact
+
+### 7.1 loop 触发
+
+OpenCode loop 每步都会触发 `experimental.chat.messages.transform`,因此插件必须具备状态机去重能力。
+
+### 7.2 compaction 重置
+
+插件实现 `experimental.session.compacting`:
+
+1. session `epoch + 1`
+2. 清空 seen hit 与 cluster 窗口
+3. 标记 `state_reset`
+
+目的:防止 compaction 后继续引用旧上下文状态。
+
+## 8. 配置参数
+
+### 8.1 基础连接
+
+1. `OPENAI_BASE_URL` / `OPENAI_API_KEY`
+2. `RAG_BASE_URL` / `RAG_API_KEY`(覆盖)
+3. `RAG_WORKTREE`
+4. `RAG_DOCLING_PYTHON_BIN`
+5. `RAG_DB_PATH`
+
+### 8.2 检索与注入
+
+1. `RAG_TOP_K`(默认 4)
+2. `RAG_CONTEXT_HITS`(默认 2)
+3. `RAG_CONTEXT_CHARS`(默认 120)
+4. `RAG_EXPAND_CHARS`(默认 420)
+5. `RAG_REWRITE_MODE`(默认 `auto`)
+6. `RAG_REWRITE_MODEL`
+7. `RAG_REWRITE_QUERIES`(默认 3)
+
+### 8.3 控制与阈值
+
+1. `RAG_AUTO_INJECT`(`0` 关闭)
+2. `RAG_OVERLAP_THRESHOLD`(默认 0.8)
+3. `RAG_WEAK_SCORE`(默认 0.42)
+4. `RAG_CLUSTER_WINDOW_SEC`(默认 30)
+5. `RAG_CLUSTER_MAX_FULL`(默认 2)
+6. `RAG_REUSE_SEC`(默认 8)
+
+### 8.4 调试
+
+1. `RAG_DEBUG=1` 或 `RAG_DEBUG_LOG=1`
+2. 日志:`.rag/log/rag_debug.jsonl`
+3. 查看:`script/rag/debug-rag-state.py --tail 100`
+
+## 9. 典型问题与解决方案
+
+### 9.1 问题:循环检索与重复思考
+
+原因:loop 多步触发 + 命中不充分 + 无状态去重。
+
+解决:
+
+1. `query_cluster` 局部限流
+2. overlap 去重
+3. delta 注入
+4. cache reuse
+
+### 9.2 问题:TUI 回显过多
+
+原因:工具多轮调用 + 大块文本注入。
+
+解决:
+
+1. 默认 `brief`
+2. `RAG_CONTEXT_HITS` 降低
+3. 强制“禁止 dump 原始 JSON/rag_context”系统提示
+4. 必要时仅保留 plugin,禁用显式 `rag_search`
+
+### 9.3 问题:手工命令成功但插件失败
+
+常见:worktree 识别为 `/`。
+
+解决:
+
+1. 显式配置 `RAG_WORKTREE`
+2. 显式配置 `RAG_DOCLING_PYTHON_BIN`
+3. 显式配置 `RAG_DB_PATH`
+
+### 9.4 问题:compaction 后行为异常
+
+原因:检索状态与压缩后消息不一致。
+
+解决:
+
+1. 在 `experimental.session.compacting` 事件重置 RAG 状态
+
+## 10. 运维与回归检查清单
+
+1. 检索可用:`search-vector-index.py` 手工命令返回 hits
+2. 集合存在:Qdrant `rag_chunks` 可见
+3. 插件注入:日志出现 `event=inject`
+4. 无新增命中:出现 `status=no_new_evidence`
+5. 局部限流触发:出现 `event=cluster_throttled`
+6. compact 后:出现 `event=state_reset`
+
+## 11. 代码锚点(便于回溯)
+
+1. 自动注入状态机:`.opencode/plugins/rag_context.ts`
+2. 工具渐进披露:`.opencode/tool/rag_search.ts`
+3. 调试脚本:`script/rag/debug-rag-state.py`
+4. 结构化切分:`script/rag/structure-text.py`
+5. 编排入口:`script/rag/rag-pipeline.py`
+
+## 12. 后续可演进方向
+
+1. 语义切分(句法/段落边界)替代纯字符窗口
+2. query cluster 从词法升级到 embedding 聚类
+3. reranker 引入(重排 top-k)
+4. `expand` 模式支持按 `chunk_id` 精确拉取
+5. 将状态机下沉到独立模块,支持单元测试
diff --git a/specs/rag-llm-prompt-protocol.zh.md b/specs/rag-llm-prompt-protocol.zh.md
new file mode 100644
index 00000000000..d5f5e867f20
--- /dev/null
+++ b/specs/rag-llm-prompt-protocol.zh.md
@@ -0,0 +1,309 @@
+# RAG 输出给 LLM 的当前协议
+
+## 1. 范围
+
+这份文档只描述当前代码里真正输出给 LLM 的内容,不描述 debug 日志,也不描述理想设计。
+
+当前协议由三部分组成:
+
+1. 自动注入的 `<rag_state>`
+2. 系统提示里的 RAG 协议说明
+3. `rag_search` 工具定义与工具返回
+
+相关实现文件:
+
+1. `.opencode/rag.ts`
+2. `.opencode/plugins/rag_context.ts`
+3. `.opencode/tool/rag_search.ts`
+
+## 2. 自动注入块
+
+### 2.1 注入位置
+
+`rag_context` 会在 `experimental.chat.messages.transform` 阶段,把 `<rag_state>` 注入到当前最新的 user text 中。
+
+当前默认行为:
+
+1. 自动注入只注入检索 meta
+2. 不自动注入正文 `<rag_context>`
+3. 正文证据主要由 `rag_search` 按需补充
+
+### 2.2 当前字段
+
+当前注入给 LLM 的 `<rag_state>` 字段来自 `.opencode/rag.ts` 的 `stateBlock()`:
+
+```text
+<rag_state>
+status=...
+reason=...
+cluster=...
+total_hits=...
+delta_hits=...
+known_hits=...
+overlap=...
+top_source=...
+top_section=...
+rewrite_queries=...
+next_action=...
+</rag_state>
+```
+
+字段含义:
+
+1. `status`
+ 当前 `session + cluster` 最近一次有效检索状态
+2. `reason`
+ 对应状态的原因
+3. `cluster`
+ 当前 query 归一化后的检索意图簇
+4. `total_hits`
+ 当前最近一次检索返回的总命中数
+5. `delta_hits`
+ 相对当前 cluster 已知证据,本轮新增命中数
+6. `known_hits`
+ 当前 cluster 已记录的累计命中数
+7. `overlap`
+ 本轮结果和已知命中的重合比例
+8. `top_source`
+ 当前 top hit 的来源 URL
+9. `top_section`
+ 当前 top hit 的 section 标题
+10. `rewrite_queries`
+ 当前底层检索实际使用的 rewrite query 列表
+11. `next_action`
+ 给 LLM 的下一步建议动作
+
+### 2.3 当前不输出给 LLM 的字段
+
+下面这些字段当前只写入 debug 日志,不直接注入给 LLM:
+
+1. `event`
+2. `channel`
+3. `loop`
+4. `used_cache`
+5. `rewrite_mode`
+6. `keywords`
+7. `top_hits`
+8. `delta_fps`
+9. `emitted_context`
+
+因此,LLM 不会直接看到“这一步是 `context_search` 还是 `context_meta`”,也不会直接看到完整 hit 列表。
+
+## 3. 系统提示协议
+
+`rag_context` 还会在 `experimental.chat.system.transform` 中追加 RAG 协议说明。
+
+当前系统提示的核心约束是:
+
+1. 每一步先解析 `<rag_state>`
+2. `rag_context` 只注入 retrieval meta,不注入正文
+3. 如果 `status=new_evidence` 且仍需要事实细节,优先调用 `rag_search mode=delta`
+4. 如果 `status=no_new_evidence`,优先复用当前状态,不要重复检索
+5. 普通问答不要调用 `mode=expand`
+6. 不要直接通过 shell 执行 `script/rag/search-vector-index.py` 做问答检索
+7. 调用 `rag_search` 时,参数必须是合法 JSON
+8. 对于长 query 或噪声 query,优先信任 rewrite 后的检索结果
+
+这部分不是结构化字段,而是对 LLM 的操作协议说明。
+
+## 4. `rag_search` 工具协议
+
+### 4.1 工具入参
+
+当前 `rag_search` 暴露给 LLM 的主要入参是:
+
+1. `query`
+2. `top_k`
+3. `node_type`
+4. `mode`
+
+其中:
+
+1. `query` 是普通字符串
+2. `top_k` 是返回条数
+3. `node_type` 目前主要是 `text` 或 `image`
+4. `mode` 控制渐进式披露层级
+
+### 4.2 工具模式
+
+当前支持的模式:
+
+1. `state`
+2. `delta`
+3. `brief`
+4. `expand`
+
+推荐顺序:
+
+1. `state`
+2. `delta`
+3. `brief`
+4. `expand`
+
+默认约束:
+
+1. 普通 QA 下优先 `delta`
+2. `expand` 默认受限,仅用于调试或显式证据展开
+
+### 4.3 工具返回
+
+`rag_search` 的返回不是原始 JSON,而是给 LLM 的文本协议。
+
+当前工具返回的第一部分始终是:
+
+1. `<rag_state>`
+
+然后按 `mode` 决定是否追加正文:
+
+1. `state`
+   只返回 `<rag_state>`
+2. `delta`
+   返回 `<rag_state>` + 本轮新增命中的短摘要
+3. `brief`
+   返回 `<rag_state>` + 当前命中的短摘要
+4. `expand`
+   返回 `<rag_state>` + 更长文本
+
+### 4.4 摘要格式
+
+`brief` 和 `delta` 当前使用 `.opencode/rag.ts` 里的 `brief()` 生成,格式类似:
+
+```text
+[1] source=... section=... summary=...
+[2] source=... section=... summary=...
+```
+
+`expand` 当前使用 `.opencode/rag.ts` 里的 `expand()`,会给更长的 `score/source/section/text`。
+
+## 5. LLM 实际看到的内容
+
+从 prompt 协议角度看,LLM 当前会看到三类信息:
+
+1. 用户原始问题
+2. 自动注入的 `<rag_state>`
+3. 系统提示里的 RAG 使用规则
+
+如果模型主动调用 `rag_search`,还会额外看到:
+
+1. 工具参数 schema
+2. 工具返回的 `<rag_state>`
+3. 工具返回的摘要或扩展正文
+
+因此当前架构下:
+
+1. 自动注入负责给状态
+2. 工具调用负责给正文
+
+## 6. 当前典型工作流
+
+### 6.1 自动注入阶段
+
+模型先看到:
+
+```text
+用户问题
+
+<rag_state>
+status=new_evidence
+reason=fresh_hits
+cluster=luckfox|文件传输
+total_hits=4
+delta_hits=4
+known_hits=4
+overlap=0.0000
+top_source=https://wiki.luckfox.com/...
+top_section=ADB 传输文件
+rewrite_queries=["Luckfox Pico Zero 文件传输","adb 文件传输"]
+next_action=call_rag_search_delta_if_more_detail_needed
+</rag_state>
+```
+
+这时模型应该先基于状态判断:
+
+1. 是否已有足够信息直接回答
+2. 是否需要调用 `rag_search mode=delta`
+3. 是否应该缩小或改写 query
+
+### 6.2 工具补充阶段
+
+如果模型调用:
+
+```json
+{"query":"Luckfox Pico Zero 文件传输方式","mode":"delta","node_type":"text","top_k":4}
+```
+
+它会看到类似返回:
+
+```text
+<rag_state>
+status=new_evidence
+reason=delta_available
+cluster=luckfox|文件传输方式
+total_hits=4
+delta_hits=2
+known_hits=6
+overlap=0.5000
+top_source=https://wiki.luckfox.com/...
+top_section=ADB 传输文件
+rewrite_queries=["Luckfox Pico Zero 文件传输方式","adb push pull 文件传输"]
+next_action=call_rag_search_delta_if_more_detail_needed
+</rag_state>
+[1] source=https://wiki.luckfox.com/... section=ADB 传输文件 summary=...
+[2] source=https://wiki.luckfox.com/... section=SCP 传输文件 summary=...
+```
+
+这时模型拿到的就不只是状态,还有正文摘要。
+
+## 7. 当前语义边界
+
+### 7.1 `status` 的语义
+
+当前 `<rag_state>.status` 表示:
+
+1. 当前 `session + cluster` 最近一次有效检索结果的状态
+
+它不等价于:
+
+1. “当前这一个 loop step 刚刚重新搜索得到的新状态”
+
+因此,如果当前 step 只是复用了缓存状态,LLM 看到的 `status=new_evidence`,实际语义更接近:
+
+1. 当前 cluster 的已知状态是 `new_evidence`
+
+而不是:
+
+1. 本 step 又重新找到了新证据
+
+### 7.2 `next_action` 的语义
+
+`next_action` 是建议,不是硬约束。
+
+LLM 仍然可以:
+
+1. 直接回答
+2. 选择更具体的 query
+3. 调 `rag_search`
+4. 放弃继续检索
+
+但系统提示已经对推荐行为做了收敛。
+
+## 8. 当前已知限制
+
+1. `event/context_meta/context_search` 只在 debug 日志里,LLM 不可见
+2. LLM 不能直接看到完整命中列表,除非主动调用 `rag_search`
+3. `status` 当前更接近 cluster 持久状态,不是严格的 step 状态
+4. 自动注入与工具调用虽然共享状态,但 query cluster 仍可能因为 agent rewrite 而不同
+
+## 9. 结论
+
+当前真正输出给 LLM 的协议可以概括为:
+
+1. 自动注入 `<rag_state>` 提供检索 meta
+2. 系统提示解释如何使用这些 meta
+3. `rag_search` 提供分层的正文证据披露
+
+因此,当前系统不是“自动把所有 RAG 内容都塞进 prompt”,而是:
+
+1. 先给状态
+2. 再由模型按需索取正文
+
diff --git a/specs/rag-progressive-disclosure.zh.md b/specs/rag-progressive-disclosure.zh.md
new file mode 100644
index 00000000000..0931a86568f
--- /dev/null
+++ b/specs/rag-progressive-disclosure.zh.md
@@ -0,0 +1,365 @@
+# RAG 渐进式披露当前实现说明
+
+## 1. 范围
+
+这份文档描述当前代码里的真实实现,不是理想设计。
+
+当前“渐进式披露”系统由三部分组成:
+
+1. 自动注入:`.opencode/plugins/rag_context.ts`
+2. 显式检索工具:`.opencode/tool/rag_search.ts`
+3. 共享状态与公共逻辑:`.opencode/rag.ts`
+
+底层检索脚本仍然是:
+
+1. `script/rag/search-vector-index.py`
+
+## 2. 当前目标
+
+当前实现要解决的是:
+
+1. 在 ReAct 式 loop 中持续给模型提供检索状态
+2. 不在每一轮 loop 中重复注入相同正文
+3. 把 `rag_context` 和 `rag_search` 统一为同一套渐进式披露系统
+4. 提供可追踪的 JSONL 调试日志
+
+## 3. 当前架构
+
+### 3.1 自动注入链路
+
+`rag_context` 当前只负责注入检索 meta 信息,不再自动注入正文摘要。
+
+它每次在 `experimental.chat.messages.transform` 被调用时会:
+
+1. 找到当前会话里最新的 user text
+2. 去掉旧的 `<rag_state>` / `<rag_context>`
+3. 生成 query cluster
+4. 查询共享状态
+5. 必要时调用底层检索脚本
+6. 只把 `<rag_state>` 注回用户消息
+
+这意味着:
+
+1. 模型在 loop 中每一步都能看到当前的 RAG 状态
+2. 是否继续调 `rag_search`,由模型自己判断
+
+### 3.2 显式工具链路
+
+`rag_search` 当前负责渐进式补充证据。
+
+支持模式:
+
+1. `state`
+2. `delta`
+3. `brief`
+4. `expand`
+
+推荐顺序:
+
+1. `state`
+2. `delta`
+3. `brief`
+4. `expand`
+
+其中:
+
+1. `state` 只返回状态
+2. `delta` 只返回新增证据
+3. `brief` 返回短摘要
+4. `expand` 返回扩展文本,默认受限
+
+### 3.3 共享状态
+
+自动注入和显式工具现在都使用同一个共享状态模块:
+
+1. `.opencode/rag.ts`
+
+共享状态粒度是:
+
+1. `session`
+2. `cluster`
+
+每个 cluster 当前维护的信息包括:
+
+1. `seen`
+2. `window`
+3. `last_query`
+4. `last_status`
+5. `last_reason`
+6. `last_checked`
+7. `total_hits`
+8. `known_hits`
+9. `overlap`
+10. `delta`
+11. `hits`
+12. `top`
+13. `rewrites`
+
+因此当前 `rag_context` 和 `rag_search` 已经不是两套独立状态机,而是同一状态系统的两个入口。
+
+## 4. 自动注入的当前规则
+
+### 4.1 注入内容
+
+自动注入当前只注入:
+
+1. `<rag_state>`
+
+不再自动注入正文 `<rag_context>`。
+
+这样做的目的:
+
+1. 让模型在每一步都能看到检索状态
+2. 把正文披露权交给 `rag_search`
+3. 避免 loop 中重复刷证据文本
+
+### 4.2 何时触发
+
+自动注入不是只在“用户第一次提问”时触发。
+
+当前实现里,只要:
+
+1. `experimental.chat.messages.transform` 被调用
+2. 最新 user text 还存在
+
+插件就会再次运行。
+
+区别在于:
+
+1. 首次进入当前 query 时,通常会实际检索
+2. 后续 loop 更常见的是复用共享状态,只重新注入 `<rag_state>`
+
+### 4.3 缓存与复用
+
+自动注入会优先复用共享状态,条件包括:
+
+1. 同一 user query
+2. 同一 cluster
+3. 在 `RAG_REUSE_SEC` 时间窗内
+4. 或已经进入 assistant loop 阶段
+
+如果命中缓存,插件不会重新检索,而是直接注入当前 cluster 的状态。
+
+### 4.4 局部限流
+
+每个 cluster 单独维护时间窗:
+
+1. `RAG_CLUSTER_WINDOW_SEC`
+2. `RAG_CLUSTER_MAX_FULL`
+
+超过上限后,状态会变成:
+
+1. `cluster_throttled`
+
+## 5. 当前状态机
+
+当前状态枚举:
+
+1. `new_evidence`
+2. `no_new_evidence`
+3. `weak_match`
+4. `need_refine`
+5. `cluster_throttled`
+6. `retrieval_error`
+7. `state_reset`
+
+典型 reason:
+
+1. `fresh_hits`
+2. `delta_available`
+3. `high_overlap`
+4. `low_score`
+5. `empty_hits`
+6. `cluster_window_limit`
+7. `backend_error`
+8. `parse_error`
+9. `cached_recent_result`
+10. `compaction_epoch_changed`
+
+## 6. 什么叫“渐进式披露”
+
+### 6.1 自动注入侧
+
+自动注入侧的渐进式披露体现在:
+
+1. 首轮只建立状态并记录 hits
+2. 后续 loop 主要复用状态
+3. 自动注入不再负责正文披露
+
+换句话说,当前自动注入承担的是:
+
+1. 渐进提供 meta
+
+而不是:
+
+1. 渐进提供正文
+
+### 6.2 工具侧
+
+显式工具侧的渐进式披露体现在:
+
+1. `state` 只给状态
+2. `delta` 只给新增证据
+3. `brief` 给短摘要
+4. `expand` 给更多文本
+
+这才是当前正文证据的主要披露链路。
+
+## 7. Query Cluster
+
+当前 cluster 生成方式:
+
+1. query 小写化
+2. 中英文词项切分
+3. 去停用词
+4. 同义词归一
+5. 排序拼接
+
+作用:
+
+1. 把近义问题归到同一局部检索意图
+2. 支持同 cluster 去重
+3. 支持同 cluster 限流
+
+## 8. 底层检索脚本的当前角色
+
+`search-vector-index.py` 仍然只负责:
+
+1. embedding query
+2. 检索向量库
+3. 返回 hits
+
+当前输出格式支持:
+
+1. `json`
+2. `state`
+3. `brief`
+4. `auto`
+
+当前约束:
+
+1. `rag_context` 强制 `--format json`
+2. `rag_search` 也强制 `--format json`
+3. 只有 shell 直接运行脚本时,`OPENCODE=1` 下默认输出 `state`
+
+这样做是为了:
+
+1. 插件和工具都自己控制披露层级
+2. 终端里不要直接泄漏 hits 正文
+
+### 8.1 当前 rewrite 与 multi-query 检索
+
+当前底层检索脚本已经支持:
+
+1. LLM query rewrite
+2. multi-query retrieval
+3. merge 去重
+4. simple rerank
+
+流程如下:
+
+1. 原始 query 输入
+2. LLM 产出 `queries` 和 `keywords`
+3. 每个 rewrite query 单独向量检索
+4. 多路结果按 fingerprint merge
+5. 用简单规则做 rerank
+6. 输出最终 `top_k`
+
+当前 rerank 不是独立 reranker 模型,而是规则组合:
+
+1. `max_score`
+2. `reciprocal_rank`
+3. `hit_count`
+4. `primary_match`
+
+## 9. 调试日志
+
+### 9.1 日志文件
+
+当前统一日志:
+
+1. `.rag/log/rag_debug.jsonl`
+
+### 9.2 当前记录的链路
+
+现在会同时记录:
+
+1. `rag_context`
+2. `rag_search`
+
+通过字段区分:
+
+1. `channel`
+2. `event`
+
+### 9.3 当前重点字段
+
+当前日志里重点字段包括:
+
+1. `channel`
+2. `event`
+3. `sessionID`
+4. `query`
+5. `cluster`
+6. `mode`
+7. `loop`
+8. `used_cache`
+9. `status`
+10. `reason`
+11. `total_hits`
+12. `delta_hits`
+13. `known_hits`
+14. `overlap`
+15. `rewrites`
+16. `keywords`
+17. `rewrite_mode`
+18. `top_hits`
+19. `delta_fps`
+20. `emitted_context`
+
+### 9.4 当前怎么判断渐进式披露生效
+
+看同一 `sessionID + cluster` 的连续日志:
+
+1. 首次检索:
+ - `status=new_evidence`
+ - `delta_hits>0`
+2. 后续 loop:
+ - `channel=rag_context`
+ - `event=context_meta`
+ - `used_cache=true`
+3. 后续主动补证据:
+ - `channel=rag_search`
+ - `event=tool_search`
+ - `mode=delta|brief|expand`
+
+这说明当前系统是在“先提供状态,再按需补正文”。
+
+## 10. 终端与 TUI 控制
+
+当前实现已经做了三层控制:
+
+1. 检索子进程使用 `.quiet()`
+2. shell 直接跑脚本时默认只输出 `state`
+3. `expand` 默认受限
+
+当前目标不是完全隐藏检索,而是:
+
+1. 不让底层脚本 stdout 直接污染终端
+2. 不让自动注入链路在 loop 中刷大段正文
+
+## 11. 当前限制
+
+1. 自动注入只提供 meta,不提供正文,需要模型自行决定是否调 `rag_search`
+2. 还没有 decomposition
+3. 当前 rerank 还是简单规则,不是专门 reranker 模型
+4. debug 已能看到 top hits 和 delta 指纹,但还没有记录 assistant reasoning 原文
+5. 多模态 embedding 还未接入当前渐进披露链路
+
+## 12. 关键代码锚点
+
+1. 共享状态:`.opencode/rag.ts`
+2. 自动注入:`.opencode/plugins/rag_context.ts`
+3. 渐进检索工具:`.opencode/tool/rag_search.ts`
+4. 底层检索:`script/rag/search-vector-index.py`
+5. 调试查看:`script/rag/debug-rag-state.py`
diff --git a/specs/rag-updates-history.zh.md b/specs/rag-updates-history.zh.md
new file mode 100644
index 00000000000..de35f2ca049
--- /dev/null
+++ b/specs/rag-updates-history.zh.md
@@ -0,0 +1,226 @@
+# RAG Enhance 变更回溯记录
+
+## 1. 目的
+
+这份文档用于记录本分支上 RAG 增强相关的关键演进,方便后续回溯问题来源、定位设计变更和重新部署时核对差异。
+
+## 2. 第一阶段:基础 RAG 流水线落地
+
+这一阶段完成了基础数据链路:
+
+1. 文档转文本
+2. 文本清洗
+3. 结构化输出
+4. embedding 落库
+5. 本地向量检索
+
+主要脚本:
+
+1. `script/rag/url-to-text.sh`
+2. `script/rag/convert-dir-to-text.sh`
+3. `script/rag/clean-text.py`
+4. `script/rag/structure-text.py`
+5. `script/rag/build-vector-index.py`
+6. `script/rag/search-vector-index.py`
+
+## 3. 第二阶段:OpenCode 插件化接入
+
+这一阶段引入了 OpenCode 集成层:
+
+1. 自动注入插件:`.opencode/plugins/rag_context.ts`
+2. 手动工具:`.opencode/tool/rag_search.ts`
+3. skill:`.opencode/skills/rag-pipeline/SKILL.md`
+
+目标是:
+
+1. 让 agent 在对话中可使用本地 RAG
+2. 支持插件迁移到其他项目
+3. 用 `rag-bootstrap.sh` / `install.sh` 完成交付
+
+## 4. 第三阶段:图片 OCR 与结构化关联
+
+这一阶段处理了图片与正文的关联问题:
+
+1. 图片 OCR 从纯追加文本改成与 image node 关联
+2. 结构化输出中保留 image metadata
+3. 向量检索命中正文时,可挂出 `related_images`
+
+目标是:
+
+1. 不直接污染正文 section
+2. 在命中 chunk 时仍然能关联图片信息
+
+## 5. 第四阶段:初版渐进式披露
+
+这一阶段第一次引入:
+
+1. `<rag_state>`
+2. `<rag_context>`
+3. overlap 去重
+4. cluster 局部限流
+5. debug 日志
+
+初版实现特点:
+
+1. 自动注入会注入状态和正文摘要
+2. `rag_search` 自己维护一套独立状态
+3. debug 主要看状态,证据可见性较弱
+
+当时解决的问题:
+
+1. 检索循环
+2. 重复注入
+3. context 窗口浪费
+
+## 6. 第五阶段:终端/TUI 回显治理
+
+这一阶段重点修了“检索输出污染终端/TUI”的问题。
+
+核心修复:
+
+1. `rag_search.ts` 和 `rag_context.ts` 调检索脚本时补 `.quiet()`
+2. 两条链路都强制 `search-vector-index.py --format json`
+3. `search-vector-index.py` 在 `OPENCODE=1` 下默认只输出 `state`
+4. `rag_search expand` 默认拦截
+
+目标是:
+
+1. 检索子进程不再把 stdout 直接打印到终端
+2. 工具链路不再因为 parse fail 回退成整段文本回显
+
+## 7. 第六阶段:非法 JSON tool args 缓解
+
+这一阶段修复了模型调用 `rag_search` 时偶发生成坏 JSON 的问题。
+
+核心修复:
+
+1. 在 `tool.definition` 中补充合法/非法 JSON 示例
+2. 在 system prompt 中明确要求 `query` 必须是单个普通字符串
+
+目标是:
+
+1. 降低模型把 query 引号拼坏的概率
+
+注意:
+
+1. 这类问题是模型生成错误,无法 100% 从代码层彻底消除
+
+## 8. 第七阶段:共享状态统一
+
+这一阶段把 `rag_context` 和 `rag_search` 统一进同一套共享状态系统。
+
+新增文件:
+
+1. `.opencode/rag.ts`
+
+统一后:
+
+1. 两条链路共享 session/cluster 状态
+2. 共享 `seen`
+3. 共享 `total_hits / known_hits / overlap`
+4. 共享 `top_hits`
+5. 共享 `rewrites`
+
+这一阶段的设计变化很关键:
+
+1. `rag_context` 不再自动注入正文,只注入检索 meta
+2. `rag_search` 成为正文证据的渐进式补充入口
+
+## 9. 第八阶段:ReAct loop 对齐
+
+这一阶段是为适配 OpenCode 的 ReAct 式 loop。
+
+变化点:
+
+1. `rag_context` 不再只在“第一次用户提问前”工作
+2. 在 loop 中也会再次运行
+3. 但后续更常见的是复用缓存状态,只重复注入 `<rag_state>`
+
+目标是:
+
+1. 在推理过程中让模型持续看到当前检索状态
+2. 由模型自行决定是否继续调用 `rag_search`
+
+## 10. 第九阶段:debug 日志增强
+
+这一阶段把 debug 从“状态日志”增强成“过程日志”。
+
+现在统一记录到:
+
+1. `.rag/log/rag_debug.jsonl`
+
+日志覆盖:
+
+1. `rag_context`
+2. `rag_search`
+
+主要新增字段:
+
+1. `channel`
+2. `mode`
+3. `loop`
+4. `used_cache`
+5. `top_hits`
+6. `delta_fps`
+7. `rewrites`
+8. `emitted_context`
+
+目的:
+
+1. 可追踪每一次状态注入
+2. 可追踪每一次显式检索
+3. 可回溯当前 cluster 的命中情况
+
+## 11. 第十阶段:query rewrite 与 multi-query retrieval
+
+这一阶段在底层检索脚本里加入了:
+
+1. LLM query rewrite
+2. 多 query 独立召回
+3. merge 去重
+4. simple rerank
+
+当前实现方式:
+
+1. LLM 输出 `queries` 和 `keywords`
+2. 每个 query 单独做 embedding 检索
+3. 按 chunk fingerprint 合并候选
+4. 结合 `max_score / reciprocal_rank / hit_count / primary_match` 做重排
+
+目标是:
+
+1. 降低长 query 的语义噪声
+2. 提高多视角召回能力
+3. 给后续 decomposition 留出接口
+
+## 12. 当前结论
+
+到当前版本为止,系统已经形成了下面的职责分离:
+
+1. `rag_context`
+ - 持续注入 RAG meta
+ - 在 loop 中复用共享状态
+ - 不主动注入正文
+
+2. `rag_search`
+ - 按 `state -> delta -> brief -> expand` 渐进补证据
+ - 与自动注入共享同一状态
+
+3. `debug`
+ - 统一记录自动注入与显式检索
+ - 便于后续对 query、cluster、命中和状态做回放
+
+## 13. 仍未完成的方向
+
+当前明确还没有完成的方向:
+
+1. decomposition
+2. 专门 reranker
+3. assistant reasoning 原文级别的日志追踪
+4. 多模态 embedding 接入当前渐进式披露系统
+
+## 14. 对应文档
+
+1. 当前实现说明:`specs/rag-progressive-disclosure.zh.md`
+2. 总体架构:`specs/rag-enhance-architecture.zh.md`
+3. 本回溯文档:`specs/rag-updates-history.zh.md`