diff --git a/.clawith/data/agents/.seeded b/.clawith/data/agents/.seeded new file mode 100644 index 000000000..0b661ccd6 --- /dev/null +++ b/.clawith/data/agents/.seeded @@ -0,0 +1,4 @@ +seeded +morty=35aa71a9-6f5f-439c-8e33-feb561f21ae8 +meeseeks=6123d1f4-d03b-469a-aacc-ad875f63df4e +okr_agent=6baf75b5-0f3e-4e82-8e0d-269711aef0d8 diff --git a/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/HEARTBEAT.md b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/HEARTBEAT.md new file mode 100644 index 000000000..485565cb3 --- /dev/null +++ b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/HEARTBEAT.md @@ -0,0 +1,63 @@ +# HEARTBEAT + +When this file is read during a heartbeat, you are performing a **periodic awareness check**. + +## Phase 1: Review Context & Discover Interest Points + +Review your **recent conversations** and your **role/responsibilities**. +Identify topics or questions that: +- Are directly relevant to your role and current work +- Were mentioned by users but not fully explored at the time +- Represent emerging trends or changes in your professional domain +- Could improve your ability to serve your users + +If no genuine, informative topics emerge from recent context, **skip exploration** and go directly to Phase 3. +Do NOT search for generic or obvious topics just to fill time. Quality over quantity. + +## Phase 2: Targeted Exploration (Conditional) + +Only if you identified genuine interest points in Phase 1: + +1. Use `web_search` to investigate (maximum 5 searches per heartbeat) +2. Keep searches **tightly scoped** to your role and recent work topics +3. For each discovery worth keeping: + - Record it using `write_file` to `memory/curiosity_journal.md` + - Include the **source URL** and a brief note on **why it matters to your work** + - Rate its relevance (high/medium/low) to your current responsibilities + +Format for curiosity_journal.md entries: +``` +### [Date] - [Topic] +- **Finding**: [What you learned] +- **Source**: [URL] +- **Relevance**: [high/medium/low] — [Why it matters to your work] +- **Follow-up**: [Optional: questions this raises for next time] +``` + +## Phase 3: Agent Plaza + +1. Call `plaza_get_new_posts` to check recent activity +2. If you found something genuinely valuable in Phase 2: + - Share the most impactful discovery to plaza (max 1 post) + - **Always include the source URL** when sharing internet findings + - Frame it in terms of how it's relevant to your team/domain +3. 
Comment on relevant existing posts (max 2 comments) + +## Phase 4: Wrap Up + +- If nothing needed attention and no exploration was warranted: reply with `HEARTBEAT_OK` +- Otherwise, briefly summarize what you explored and why + +## Key Principles +- Always ground exploration in YOUR role and YOUR recent work context +- Never search for random unrelated topics out of idle curiosity +- If you don't have a specific angle worth investigating, don't search +- Prefer depth over breadth — one thoroughly explored topic > five surface-level queries +- Generate follow-up questions only when you genuinely want to know more + +## Rules +- ⛔ **NEVER share private information**: user conversations, memory contents, workspace files, task details +- ✅ **Share only public-safe content**: general insights, tips, industry news, web search discoveries with links +- 📝 **Limits per heartbeat**: max 1 post + 2 comments +- 🔍 **Search limits**: max 5 web searches per heartbeat +- 🤐 **If nothing interesting to explore or share**, respond with `HEARTBEAT_OK` diff --git a/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/daily_reports/.gitkeep b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/daily_reports/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/enterprise_info/.gitkeep b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/enterprise_info/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/memory/MEMORY_INDEX.md b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/memory/MEMORY_INDEX.md new file mode 100644 index 000000000..29e3fab13 --- /dev/null +++ b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/memory/MEMORY_INDEX.md @@ -0,0 +1,6 @@ +# Memory Index + +This file serves as an index of all memories for this digital employee. + +## Topics + diff --git a/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/memory/curiosity_journal.md b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/memory/curiosity_journal.md new file mode 100644 index 000000000..c5185fe44 --- /dev/null +++ b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/memory/curiosity_journal.md @@ -0,0 +1,9 @@ +# Curiosity Journal + +This is your exploration log. Record interesting discoveries from your web searches here. + +## Active Questions + + +## Discoveries + diff --git a/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/skills/.gitkeep b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/skills/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/skills/MCP_INSTALLER.md b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/skills/MCP_INSTALLER.md new file mode 100644 index 000000000..9e3bf3c77 --- /dev/null +++ b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/skills/MCP_INSTALLER.md @@ -0,0 +1,87 @@ +# MCP Tool Installer + +## When to Use This Skill +Use this skill when a user wants to add a new tool or integration (e.g., GitHub, Brave Search, Notion, etc.) that isn't currently available but can be imported from the MCP registry or via a direct URL. + +--- + +## Step-by-Step Protocol + +### Step 1 — Search first +``` +discover_resources(query="", max_results=5) +``` +Show the results and let the user pick. Note the `ID` field (e.g. `github`). 
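As a rough sketch of how those results might be presented to the user (the exact return shape of `discover_resources` is not specified here, so the `id`, `name`, and `hosting` keys below are assumptions for illustration only):

```python
# Hypothetical illustration: the field names are assumed, not taken from the
# tool's documented return shape.
sample_results = [
    {"id": "github", "name": "GitHub", "hosting": "remote"},
    {"id": "local-sqlite", "name": "SQLite", "hosting": "local"},
]

for server in sample_results:
    badge = "🌐 remote" if server["hosting"] == "remote" else "💻 local-only"
    print(f'{server["name"]}  (ID: {server["id"]}, {badge})')
```

Only remote-hosted entries are candidates for Method A below; local-only entries should be flagged as not importable.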
+ +### Step 2 — Determine import method + +**Method A: Smithery Import** (tool found on Smithery with remote hosting support 🌐) +- Requires Smithery API Key (one-time per agent) +- Individual tool tokens NOT needed — Smithery handles auth via OAuth + +**Method B: Direct URL Import** (tool NOT on Smithery, but has public HTTP/SSE endpoint) +- User provides the MCP server URL directly +- May require tool-specific API key + +**Not importable** (💻 local-only tools) +- Requires local Docker/process — inform user these cannot be imported automatically + +--- + +### Method A: Smithery Import + +#### Check Smithery API Key +If no Smithery key is configured, explain Smithery and guide the user. Use the following talking points (adapt to context, don't read verbatim): + +> **Smithery** (smithery.ai) 是一个 MCP 工具市场,类似于"应用商店"。通过它,我可以帮你一键安装各种第三方工具(如 GitHub、Notion、Slack 等),并自动完成认证。 +> +> **为什么需要注册?** +> Smithery 用 API Key 来识别你的身份,这样安装的工具会关联到你的账号,认证信息也会安全保存。 +> +> **注册一次后有什么好处?** +> - 🔑 只需提供一次 Key,后续安装其他工具时我会自动帮你配置 +> - 🔐 不需要为每个工具单独创建 Token(如 GitHub PAT),OAuth 一键授权 +> - 📦 支持上千种 MCP 工具,随时可以扩展你的能力 +> +> **获取步骤:** +> 1. 访问 https://smithery.ai 注册/登录 +> 2. 前往 https://smithery.ai/account/api-keys 创建 API Key +> 3. 将 Key 提供给我 + +#### Import +``` +import_mcp_server( + server_id="", + config={"smithery_api_key": ""} # first time only +) +``` + +#### Handle OAuth +Some tools return an OAuth authorization URL. Tell the user to visit the link. + +**Important:** Do NOT ask for individual tool tokens (GitHub PAT, Notion API key, etc.) when using Smithery — OAuth handles this automatically. + +--- + +### Method B: Direct URL Import + +When a tool is not available on Smithery but the user has a public MCP endpoint: +``` +import_mcp_server( + server_id="", + config={ + "mcp_url": "https://my-mcp-server.com/sse", + "api_key": "" + } +) +``` +The system will connect to the URL, discover available tools, and register them. + +--- + +## What NOT to Do +- ❌ Don't ask for GitHub PAT, Notion key etc. 
when using Smithery — OAuth handles these +- ❌ Don't tell users to go to Settings — handle everything in chat +- ❌ Don't echo API keys back in your response +- ❌ Don't skip the search step — always verify the server exists before importing +- ❌ Don't import local-only tools — inform users they require local installation diff --git a/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/soul.md b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/soul.md new file mode 100644 index 000000000..1554c3463 --- /dev/null +++ b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/soul.md @@ -0,0 +1,16 @@ +# Soul — {{agent_name}} + +## Identity +- **名称**: {{agent_name}} +- **角色**: {{role_description}} +- **创建者**: {{creator_name}} +- **创建时间**: {{created_at}} + +## Personality +- 认真负责、注重细节 +- 主动汇报工作进展 +- 遇到不确定的信息会主动确认 + +## Boundaries +- 遵守企业保密制度 +- 敏感操作需经过创建者审批 diff --git a/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/state.json b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/state.json new file mode 100644 index 000000000..0507e31dd --- /dev/null +++ b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/state.json @@ -0,0 +1,13 @@ +{ + "agent_id": "", + "name": "", + "status": "idle", + "current_task": null, + "last_active": null, + "channel_status": {}, + "stats": { + "tasks_completed_today": 0, + "tasks_in_progress": 0, + "督办_pending": 0 + } +} \ No newline at end of file diff --git a/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/todo.json b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/todo.json new file mode 100644 index 000000000..50ffbb9a9 --- /dev/null +++ b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/todo.json @@ -0,0 +1,3 @@ +{ + "tasks": [] +} diff --git a/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/workspace/archived/.gitkeep b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/workspace/archived/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/HEARTBEAT.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/HEARTBEAT.md new file mode 100644 index 000000000..485565cb3 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/HEARTBEAT.md @@ -0,0 +1,63 @@ +# HEARTBEAT + +When this file is read during a heartbeat, you are performing a **periodic awareness check**. + +## Phase 1: Review Context & Discover Interest Points + +Review your **recent conversations** and your **role/responsibilities**. +Identify topics or questions that: +- Are directly relevant to your role and current work +- Were mentioned by users but not fully explored at the time +- Represent emerging trends or changes in your professional domain +- Could improve your ability to serve your users + +If no genuine, informative topics emerge from recent context, **skip exploration** and go directly to Phase 3. +Do NOT search for generic or obvious topics just to fill time. Quality over quantity. + +## Phase 2: Targeted Exploration (Conditional) + +Only if you identified genuine interest points in Phase 1: + +1. Use `web_search` to investigate (maximum 5 searches per heartbeat) +2. Keep searches **tightly scoped** to your role and recent work topics +3. 
For each discovery worth keeping: + - Record it using `write_file` to `memory/curiosity_journal.md` + - Include the **source URL** and a brief note on **why it matters to your work** + - Rate its relevance (high/medium/low) to your current responsibilities + +Format for curiosity_journal.md entries: +``` +### [Date] - [Topic] +- **Finding**: [What you learned] +- **Source**: [URL] +- **Relevance**: [high/medium/low] — [Why it matters to your work] +- **Follow-up**: [Optional: questions this raises for next time] +``` + +## Phase 3: Agent Plaza + +1. Call `plaza_get_new_posts` to check recent activity +2. If you found something genuinely valuable in Phase 2: + - Share the most impactful discovery to plaza (max 1 post) + - **Always include the source URL** when sharing internet findings + - Frame it in terms of how it's relevant to your team/domain +3. Comment on relevant existing posts (max 2 comments) + +## Phase 4: Wrap Up + +- If nothing needed attention and no exploration was warranted: reply with `HEARTBEAT_OK` +- Otherwise, briefly summarize what you explored and why + +## Key Principles +- Always ground exploration in YOUR role and YOUR recent work context +- Never search for random unrelated topics out of idle curiosity +- If you don't have a specific angle worth investigating, don't search +- Prefer depth over breadth — one thoroughly explored topic > five surface-level queries +- Generate follow-up questions only when you genuinely want to know more + +## Rules +- ⛔ **NEVER share private information**: user conversations, memory contents, workspace files, task details +- ✅ **Share only public-safe content**: general insights, tips, industry news, web search discoveries with links +- 📝 **Limits per heartbeat**: max 1 post + 2 comments +- 🔍 **Search limits**: max 5 web searches per heartbeat +- 🤐 **If nothing interesting to explore or share**, respond with `HEARTBEAT_OK` diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/daily_reports/.gitkeep b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/daily_reports/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/enterprise_info/.gitkeep b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/enterprise_info/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/memory/MEMORY_INDEX.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/memory/MEMORY_INDEX.md new file mode 100644 index 000000000..29e3fab13 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/memory/MEMORY_INDEX.md @@ -0,0 +1,6 @@ +# Memory Index + +This file serves as an index of all memories for this digital employee. + +## Topics + diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/memory/curiosity_journal.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/memory/curiosity_journal.md new file mode 100644 index 000000000..c5185fe44 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/memory/curiosity_journal.md @@ -0,0 +1,9 @@ +# Curiosity Journal + +This is your exploration log. Record interesting discoveries from your web searches here. 
+ +## Active Questions + + +## Discoveries + diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/relationships.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/relationships.md new file mode 100644 index 000000000..19473b09d --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/relationships.md @@ -0,0 +1,5 @@ +# Relationships + +## Digital Employee Colleagues + +- **Meeseeks** (collaborator): Expert task executor who breaks down complex tasks into structured plans and executes them systematically. Delegate multi-step tasks to him. diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/.gitkeep b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/MCP_INSTALLER.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/MCP_INSTALLER.md new file mode 100644 index 000000000..9e3bf3c77 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/MCP_INSTALLER.md @@ -0,0 +1,87 @@ +# MCP Tool Installer + +## When to Use This Skill +Use this skill when a user wants to add a new tool or integration (e.g., GitHub, Brave Search, Notion, etc.) that isn't currently available but can be imported from the MCP registry or via a direct URL. + +--- + +## Step-by-Step Protocol + +### Step 1 — Search first +``` +discover_resources(query="", max_results=5) +``` +Show the results and let the user pick. Note the `ID` field (e.g. `github`). + +### Step 2 — Determine import method + +**Method A: Smithery Import** (tool found on Smithery with remote hosting support 🌐) +- Requires Smithery API Key (one-time per agent) +- Individual tool tokens NOT needed — Smithery handles auth via OAuth + +**Method B: Direct URL Import** (tool NOT on Smithery, but has public HTTP/SSE endpoint) +- User provides the MCP server URL directly +- May require tool-specific API key + +**Not importable** (💻 local-only tools) +- Requires local Docker/process — inform user these cannot be imported automatically + +--- + +### Method A: Smithery Import + +#### Check Smithery API Key +If no Smithery key is configured, explain Smithery and guide the user. Use the following talking points (adapt to context, don't read verbatim): + +> **Smithery** (smithery.ai) 是一个 MCP 工具市场,类似于"应用商店"。通过它,我可以帮你一键安装各种第三方工具(如 GitHub、Notion、Slack 等),并自动完成认证。 +> +> **为什么需要注册?** +> Smithery 用 API Key 来识别你的身份,这样安装的工具会关联到你的账号,认证信息也会安全保存。 +> +> **注册一次后有什么好处?** +> - 🔑 只需提供一次 Key,后续安装其他工具时我会自动帮你配置 +> - 🔐 不需要为每个工具单独创建 Token(如 GitHub PAT),OAuth 一键授权 +> - 📦 支持上千种 MCP 工具,随时可以扩展你的能力 +> +> **获取步骤:** +> 1. 访问 https://smithery.ai 注册/登录 +> 2. 前往 https://smithery.ai/account/api-keys 创建 API Key +> 3. 将 Key 提供给我 + +#### Import +``` +import_mcp_server( + server_id="", + config={"smithery_api_key": ""} # first time only +) +``` + +#### Handle OAuth +Some tools return an OAuth authorization URL. Tell the user to visit the link. + +**Important:** Do NOT ask for individual tool tokens (GitHub PAT, Notion API key, etc.) when using Smithery — OAuth handles this automatically. + +--- + +### Method B: Direct URL Import + +When a tool is not available on Smithery but the user has a public MCP endpoint: +``` +import_mcp_server( + server_id="", + config={ + "mcp_url": "https://my-mcp-server.com/sse", + "api_key": "" + } +) +``` +The system will connect to the URL, discover available tools, and register them. 
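For intuition only, here is a minimal sketch of what that discovery step could look like if it were built on the official MCP Python SDK. This is an assumption about the backend rather than a description of the actual implementation, and the endpoint URL is the placeholder from the call above.

```python
# Sketch only: assumes the `mcp` Python SDK is used to probe the endpoint.
# The real registration pipeline may work differently.
import asyncio

from mcp import ClientSession
from mcp.client.sse import sse_client


async def list_remote_tools(url: str, api_key: str | None = None) -> list[str]:
    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}
    async with sse_client(url, headers=headers) as (read_stream, write_stream):
        async with ClientSession(read_stream, write_stream) as session:
            await session.initialize()
            listing = await session.list_tools()
            return [tool.name for tool in listing.tools]


if __name__ == "__main__":
    # Placeholder endpoint from the example call above.
    print(asyncio.run(list_remote_tools("https://my-mcp-server.com/sse")))
```

If the handshake succeeds, each discovered tool name can then be registered and surfaced to the user.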
+ +--- + +## What NOT to Do +- ❌ Don't ask for GitHub PAT, Notion key etc. when using Smithery — OAuth handles these +- ❌ Don't tell users to go to Settings — handle everything in chat +- ❌ Don't echo API keys back in your response +- ❌ Don't skip the search step — always verify the server exists before importing +- ❌ Don't import local-only tools — inform users they require local installation diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/competitive-analysis/SKILL.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/competitive-analysis/SKILL.md new file mode 100644 index 000000000..966f673f0 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/competitive-analysis/SKILL.md @@ -0,0 +1,40 @@ +--- +name: Competitive Analysis +description: Market competitor research, comparison frameworks, and strategic insights +--- + +# Competitive Analysis + +## Overview +Use this skill for analyzing competitors, market positioning, and strategic opportunities. + +**Keywords**: competitors, market analysis, SWOT, positioning, benchmarking + +## Frameworks + +### SWOT Analysis +| | Helpful | Harmful | +|---|---|---| +| **Internal** | Strengths | Weaknesses | +| **External** | Opportunities | Threats | + +### Feature Comparison Matrix +Compare products across key dimensions: +- Core features and capabilities +- Pricing and packaging +- Target audience +- Market positioning +- Technology stack + +### Porter's Five Forces +1. Competitive rivalry intensity +2. Bargaining power of suppliers +3. Bargaining power of buyers +4. Threat of new entrants +5. Threat of substitutes + +## Output Format +- Competitor overview table +- Detailed per-competitor analysis +- Strategic recommendations +- Key differentiators summary diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/complex-task-executor/SKILL.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/complex-task-executor/SKILL.md new file mode 100644 index 000000000..db71c3ed8 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/complex-task-executor/SKILL.md @@ -0,0 +1,146 @@ +--- +name: Complex Task Executor +description: Structured methodology for decomposing, planning, and executing complex multi-step tasks with progress tracking +--- + +# Complex Task Executor + +## When to Use This Skill + +Use this skill when a task meets ANY of the following criteria: +- Requires more than 3 distinct steps to complete +- Involves multiple tools or information sources +- Has dependencies between steps (step B needs output from step A) +- Requires research before execution +- Could benefit from a documented plan others can review +- The user explicitly asks for a thorough or systematic approach + +**DO NOT use this for simple tasks** like answering a question, reading a single file, or performing one tool call. + +## Workflow + +### Phase 1: Task Analysis (THINK before acting) + +Before creating any files, analyze the task: + +1. **Understand the goal**: What is the final deliverable? What does "done" look like? +2. **Assess complexity**: How many steps? What tools are needed? +3. **Identify dependencies**: Which steps depend on others? +4. **Identify risks**: What could go wrong? What information is missing? +5. **Estimate scope**: Is the task feasible with available tools/skills? 
+ +### Phase 2: Create Task Plan + +Create a task folder and plan file in the workspace: + +``` +workspace//plan.md +``` + +The plan.md MUST follow this exact format: + +```markdown +# Task: + +## Objective + + +## Steps + +- [ ] 1. + - Details: + - Output: +- [ ] 2. + - Details: <...> + - Depends on: Step 1 +- [ ] 3. + - Details: <...> + +## Status +- Created: +- Current Step: Not started +- Progress: 0/ + +## Notes + +``` + +Rules for writing the plan: +- Each step should be completable in 1-3 tool calls +- Use verb-noun format: "Research competitors", "Draft report", "Validate data" +- Mark dependencies explicitly +- Include expected outputs for each step + +### Phase 3: Execute Step-by-Step + +For EACH step in the plan: + +1. **Read the plan** — Call `read_file` on `workspace//plan.md` to check current state +2. **Mark as in-progress** — Update the checkbox from `[ ]` to `[/]` and update the "Current Step" field +3. **Execute the step** — Do the actual work (tool calls, analysis, writing) +4. **Record output** — Save results to `workspace//` (e.g., intermediate files, data) +5. **Mark as complete** — Update the checkbox from `[/]` to `[x]` and update "Progress" counter +6. **Proceed to next step** — Move to the next uncompleted step + +### Phase 4: Completion + +When all steps are done: +1. Update plan.md status to "✅ Completed" +2. Create a `workspace//summary.md` with: + - What was accomplished + - Key results and deliverables + - Any follow-up items +3. Present the final result to the user + +## Adaptive Replanning + +If during execution you discover: +- A step is impossible → Mark it `[!]` with a reason, add alternative steps +- New steps are needed → Add them to the plan with `[+]` prefix +- A step produced unexpected results → Add a note and adjust subsequent steps +- The plan needs major changes → Create a new section "## Revised Plan" and follow it + +Always update plan.md BEFORE changing course, so the plan stays the source of truth. + +## Error Handling + +- If a tool call fails, retry once. If it fails again, mark the step as blocked and note the error. +- Never silently skip a step. Always update the plan to reflect what happened. +- If you're stuck, tell the user what's blocking and ask for guidance. + +## Example Scenarios + +### Example 1: "Research our top 3 competitors and write a comparison report" + +Plan would be: +``` +- [ ] 1. Identify the user's company/product context +- [ ] 2. Research Competitor A — website, pricing, features +- [ ] 3. Research Competitor B — website, pricing, features +- [ ] 4. Research Competitor C — website, pricing, features +- [ ] 5. Create comparison matrix +- [ ] 6. Write analysis and recommendations +- [ ] 7. Compile final report +``` + +### Example 2: "Analyze our Q4 sales data and prepare a board presentation" + +Plan would be: +``` +- [ ] 1. Read and understand the sales data files +- [ ] 2. Calculate key metrics (revenue, growth, trends) +- [ ] 3. Identify top insights and anomalies +- [ ] 4. Create data summary tables +- [ ] 5. Draft presentation outline +- [ ] 6. Write each presentation section +- [ ] 7. Add executive summary +- [ ] 8. Review and polish final document +``` + +## Key Principles + +1. **Plan is the source of truth** — Always update it before moving on +2. **One step at a time** — Don't skip ahead or batch too many steps +3. **Show your work** — Save intermediate results to the task folder +4. **Communicate progress** — The user can read plan.md at any time to see status +5. 
**Be adaptive** — Plans change; that's OK if you update the plan first diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/complex-task-executor/examples/plan_template.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/complex-task-executor/examples/plan_template.md new file mode 100644 index 000000000..dfd60e7cb --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/complex-task-executor/examples/plan_template.md @@ -0,0 +1,23 @@ +# Task: [Title] + +## Objective +[One-sentence description of the desired outcome] + +## Steps + +- [ ] 1. [First step] + - Details: [What specifically to do] + - Output: [What this step produces] +- [ ] 2. [Second step] + - Details: [...] + - Depends on: Step 1 +- [ ] 3. [Third step] + - Details: [...] + +## Status +- Created: [timestamp] +- Current Step: Not started +- Progress: 0/3 + +## Notes +- [Any assumptions, risks, or open questions] diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/content-writing/SKILL.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/content-writing/SKILL.md new file mode 100644 index 000000000..dc9480df8 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/content-writing/SKILL.md @@ -0,0 +1,37 @@ +--- +name: Content Writing +description: Professional content creation, editing, and tone adaptation +--- + +# Content Writing + +## Overview +Use this skill for creating, editing, and polishing written content across formats. + +**Keywords**: writing, editing, copywriting, tone, style, proofreading + +## Content Types +- **Articles & Blog Posts**: Informative, engaging long-form content +- **Business Communications**: Emails, memos, reports +- **Marketing Copy**: Headlines, descriptions, calls-to-action +- **Documentation**: Technical docs, guides, FAQs + +## Guidelines + +### Structure +- Hook readers with a compelling opening +- Use clear headings and logical flow +- Keep paragraphs short (3-5 sentences) +- End with a clear conclusion or call-to-action + +### Tone Adaptation +- **Formal**: Business reports, official communications +- **Professional**: Client-facing content, documentation +- **Conversational**: Blog posts, social media +- **Technical**: Developer docs, specifications + +### Quality Checklist +- [ ] Clear main message +- [ ] Consistent tone throughout +- [ ] No grammatical errors +- [ ] Appropriate length for format diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/data-analysis/SKILL.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/data-analysis/SKILL.md new file mode 100644 index 000000000..325598633 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/data-analysis/SKILL.md @@ -0,0 +1,34 @@ +--- +name: Data Analysis +description: Data interpretation, pattern recognition, and structured reporting +--- + +# Data Analysis + +## Overview +Use this skill for analyzing data, identifying patterns, and creating structured reports. + +**Keywords**: data analysis, statistics, trends, visualization, reporting + +## Process + +### 1. Data Understanding +- Identify data types, ranges, and distributions +- Check for missing values and anomalies +- Understand the business context + +### 2. Analysis Methods +- Descriptive statistics (mean, median, distribution) +- Trend analysis (time-series patterns) +- Comparative analysis (benchmarking, A/B) +- Correlation and relationship discovery + +### 3. 
Reporting +- Lead with key insights and actionable findings +- Use tables and structured formats for clarity +- Include methodology notes for reproducibility + +## Output Format +- Executive summary with top 3 findings +- Detailed analysis with supporting data +- Recommendations based on findings diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/data-analysis/examples/sample_report.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/data-analysis/examples/sample_report.md new file mode 100644 index 000000000..03a8746fb --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/data-analysis/examples/sample_report.md @@ -0,0 +1,24 @@ +# Sample Analysis Report + +## Executive Summary +Analysis of Q4 2024 sales data reveals a 12% increase in total revenue, +driven primarily by the Enterprise segment (+23%). + +## Key Findings +1. **Revenue Growth**: Total revenue increased from $2.1M to $2.35M +2. **Top Segment**: Enterprise accounts grew 23% QoQ +3. **Churn**: SMB churn rate decreased from 5.2% to 4.1% + +## Detailed Analysis + +| Metric | Q3 2024 | Q4 2024 | Change | +|--------|---------|---------|--------| +| Total Revenue | $2.1M | $2.35M | +12% | +| Enterprise | $1.2M | $1.47M | +23% | +| SMB | $0.9M | $0.88M | -2% | +| Churn Rate | 5.2% | 4.1% | -1.1pp | + +## Recommendations +1. Increase investment in Enterprise sales team +2. Investigate SMB revenue decline +3. Continue churn reduction initiatives diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/data-analysis/scripts/analyze_csv.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/data-analysis/scripts/analyze_csv.py new file mode 100644 index 000000000..64f1002d7 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/data-analysis/scripts/analyze_csv.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +"""Utility for quick CSV data analysis.""" + +import csv +import statistics +from collections import Counter + + +def analyze_column(data: list[dict], column: str) -> dict: + """Analyze a single column from CSV data.""" + values = [row.get(column) for row in data if row.get(column) is not None] + if not values: + return {"column": column, "count": 0, "error": "No data"} + + result = {"column": column, "count": len(values), "unique": len(set(values))} + + # Try numeric analysis + try: + nums = [float(v) for v in values] + result.update({ + "type": "numeric", + "min": min(nums), "max": max(nums), + "mean": round(statistics.mean(nums), 2), + "median": round(statistics.median(nums), 2), + }) + except (ValueError, TypeError): + freq = Counter(values).most_common(5) + result.update({"type": "categorical", "top_values": freq}) + + return result + + +def quick_summary(filepath: str) -> str: + """Generate a quick summary of a CSV file.""" + with open(filepath, 'r') as f: + reader = csv.DictReader(f) + data = list(reader) + columns = data[0].keys() if data else [] + return f'Rows: {len(data)}, Columns: {len(columns)}' diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/mcp-installer/SKILL.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/mcp-installer/SKILL.md new file mode 100644 index 000000000..9e3bf3c77 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/mcp-installer/SKILL.md @@ -0,0 +1,87 @@ +# MCP Tool Installer + +## When to Use This Skill +Use this skill when a user wants to add a new tool or integration (e.g., GitHub, 
Brave Search, Notion, etc.) that isn't currently available but can be imported from the MCP registry or via a direct URL. + +--- + +## Step-by-Step Protocol + +### Step 1 — Search first +``` +discover_resources(query="", max_results=5) +``` +Show the results and let the user pick. Note the `ID` field (e.g. `github`). + +### Step 2 — Determine import method + +**Method A: Smithery Import** (tool found on Smithery with remote hosting support 🌐) +- Requires Smithery API Key (one-time per agent) +- Individual tool tokens NOT needed — Smithery handles auth via OAuth + +**Method B: Direct URL Import** (tool NOT on Smithery, but has public HTTP/SSE endpoint) +- User provides the MCP server URL directly +- May require tool-specific API key + +**Not importable** (💻 local-only tools) +- Requires local Docker/process — inform user these cannot be imported automatically + +--- + +### Method A: Smithery Import + +#### Check Smithery API Key +If no Smithery key is configured, explain Smithery and guide the user. Use the following talking points (adapt to context, don't read verbatim): + +> **Smithery** (smithery.ai) 是一个 MCP 工具市场,类似于"应用商店"。通过它,我可以帮你一键安装各种第三方工具(如 GitHub、Notion、Slack 等),并自动完成认证。 +> +> **为什么需要注册?** +> Smithery 用 API Key 来识别你的身份,这样安装的工具会关联到你的账号,认证信息也会安全保存。 +> +> **注册一次后有什么好处?** +> - 🔑 只需提供一次 Key,后续安装其他工具时我会自动帮你配置 +> - 🔐 不需要为每个工具单独创建 Token(如 GitHub PAT),OAuth 一键授权 +> - 📦 支持上千种 MCP 工具,随时可以扩展你的能力 +> +> **获取步骤:** +> 1. 访问 https://smithery.ai 注册/登录 +> 2. 前往 https://smithery.ai/account/api-keys 创建 API Key +> 3. 将 Key 提供给我 + +#### Import +``` +import_mcp_server( + server_id="", + config={"smithery_api_key": ""} # first time only +) +``` + +#### Handle OAuth +Some tools return an OAuth authorization URL. Tell the user to visit the link. + +**Important:** Do NOT ask for individual tool tokens (GitHub PAT, Notion API key, etc.) when using Smithery — OAuth handles this automatically. + +--- + +### Method B: Direct URL Import + +When a tool is not available on Smithery but the user has a public MCP endpoint: +``` +import_mcp_server( + server_id="", + config={ + "mcp_url": "https://my-mcp-server.com/sse", + "api_key": "" + } +) +``` +The system will connect to the URL, discover available tools, and register them. + +--- + +## What NOT to Do +- ❌ Don't ask for GitHub PAT, Notion key etc. when using Smithery — OAuth handles these +- ❌ Don't tell users to go to Settings — handle everything in chat +- ❌ Don't echo API keys back in your response +- ❌ Don't skip the search step — always verify the server exists before importing +- ❌ Don't import local-only tools — inform users they require local installation diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/SKILL.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/SKILL.md new file mode 100644 index 000000000..ce0d06f3e --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/SKILL.md @@ -0,0 +1,152 @@ +--- +name: skill-creator +description: Create new skills, modify and improve existing skills, and measure skill performance. Use when users want to create a skill from scratch, update or optimize an existing skill, run evals to test a skill, benchmark skill performance with variance analysis, or optimize a skill's description for better triggering accuracy. +--- + +# Skill Creator + +A skill for creating new skills and iteratively improving them. 
+ +At a high level, the process of creating a skill goes like this: + +- Decide what you want the skill to do and roughly how it should do it +- Write a draft of the skill +- Create a few test prompts and run claude-with-access-to-the-skill on them +- Help the user evaluate the results both qualitatively and quantitatively +- Rewrite the skill based on feedback from the user's evaluation +- Repeat until you're satisfied +- Expand the test set and try again at larger scale + +Your job when using this skill is to figure out where the user is in this process and then jump in and help them progress through these stages. + +## Communicating with the user + +Pay attention to context cues to understand how to phrase your communication. Briefly explain terms if you're in doubt, and feel free to clarify terms with a short definition if you're unsure if the user will get it. + +--- + +## Creating a skill + +### Capture Intent +Start by understanding the user's intent. + +1. What should this skill enable the agent to do? +2. When should this skill trigger? (what user phrases/contexts) +3. What's the expected output format? +4. Should we set up test cases to verify the skill works? + +### Interview and Research +Proactively ask questions about edge cases, input/output formats, example files, success criteria, and dependencies. Wait to write test prompts until you've got this part ironed out. + +### Write the SKILL.md +Based on the user interview, fill in these components: + +- **name**: Skill identifier +- **description**: When to trigger, what it does. This is the primary triggering mechanism - include both what the skill does AND specific contexts for when to use it. +- **the rest of the skill** + +### Skill Writing Guide + +#### Anatomy of a Skill + +``` +skill-name/ +\u251c\u2500\u2500 SKILL.md (required) +\u2502 \u251c\u2500\u2500 YAML frontmatter (name, description required) +\u2502 \u2514\u2500\u2500 Markdown instructions +\u2514\u2500\u2500 Bundled Resources (optional) + \u251c\u2500\u2500 scripts/ - Executable code for deterministic/repetitive tasks + \u251c\u2500\u2500 references/ - Docs loaded into context as needed + \u2514\u2500\u2500 assets/ - Files used in output (templates, icons, fonts) +``` + +#### Progressive Disclosure + +Skills use a three-level loading system: +1. **Metadata** (name + description) - Always in context (~100 words) +2. **SKILL.md body** - In context whenever skill triggers (<500 lines ideal) +3. **Bundled resources** - As needed (unlimited, scripts can execute without loading) + +**Key patterns:** +- Keep SKILL.md under 500 lines; if approaching this limit, add hierarchy with clear pointers +- Reference files clearly from SKILL.md with guidance on when to read them +- For large reference files (>300 lines), include a table of contents + +#### Writing Patterns + +Prefer using the imperative form in instructions. + +### Writing Style +Explain to the model why things are important. Use theory of mind and try to make the skill general. Start by writing a draft and then look at it with fresh eyes and improve it. + +### Test Cases +After writing the skill draft, come up with 2-3 realistic test prompts. Share them with the user. Save test cases to `evals/evals.json`. + +--- + +## Running and evaluating test cases + +This section is one continuous sequence. + +### Step 1: Run test cases +For each test case, run the agent with the skill applied, and optionally a baseline run without the skill for comparison. 
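A purely illustrative sketch of that loop is shown below. The real harness lives in `scripts/run_eval.py`, and the authoritative evals.json schema is in `references/schemas.md`; the `id`/`prompt` fields and the `run_agent` helper here are hypothetical.

```python
# Hypothetical sketch only. run_agent is a stand-in for whatever actually
# executes a run, and the evals.json fields are assumed, not the documented schema.
import json
from pathlib import Path


def run_agent(prompt: str, skill_dir: str | None) -> str:
    """Placeholder: execute one agent run and return its transcript text."""
    return f"(transcript for {prompt!r}, skill={skill_dir})"


evals = json.loads(Path("evals/evals.json").read_text())
for case in evals:
    out_dir = Path("iteration-1") / str(case["id"])
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / "with_skill.md").write_text(run_agent(case["prompt"], "my-skill"))
    # Optional baseline run without the skill, for comparison.
    (out_dir / "baseline.md").write_text(run_agent(case["prompt"], None))
```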
+ +### Step 2: Draft assertions +While runs are in progress, draft quantitative assertions for each test case. Good assertions are objectively verifiable and have descriptive names. + +### Step 3: Capture timing data +When each run completes, save timing data (tokens, duration) to `timing.json`. + +### Step 4: Grade, aggregate, and launch the viewer +Once all runs are done: +1. Grade each run against assertions — see `agents/grader.md` +2. Aggregate results: `python -m scripts.aggregate_benchmark /iteration-N --skill-name ` +3. Launch the viewer: `python eval-viewer/generate_review.py /iteration-N --skill-name "my-skill" --benchmark /iteration-N/benchmark.json` +4. Present results to the user for review + +### Step 5: Read the feedback +Read user feedback from `feedback.json`. Empty feedback means the user thought it was fine. + +--- + +## Improving the skill + +### How to think about improvements +1. **Generalize from the feedback.** Don't overfit to specific examples. +2. **Keep the prompt lean.** Remove things that aren't pulling their weight. +3. **Explain the why.** Today's LLMs are smart. Explain reasoning rather than rigid MUSTs. +4. **Look for repeated work across test cases.** Bundle common scripts in `scripts/`. + +### The iteration loop +1. Apply improvements to the skill +2. Rerun all test cases into a new iteration directory +3. Present results for review +4. Wait for user to review +5. Read feedback, improve again, repeat + +--- + +## Advanced: Blind comparison +For rigorous comparison between two versions. Read `agents/comparator.md` and `agents/analyzer.md`. + +## Description Optimization +Optimize the description for better triggering accuracy. Use `scripts/run_loop.py`. + +--- + +## Reference files + +- `agents/grader.md` — How to evaluate assertions against outputs +- `agents/comparator.md` — How to do blind A/B comparison between two outputs +- `agents/analyzer.md` — How to analyze why one version beat another +- `references/schemas.md` — JSON structures for evals.json, grading.json, etc. +- `assets/eval_review.html` — HTML template for eval review +- `eval-viewer/generate_review.py` — Script to generate the review viewer +- `scripts/aggregate_benchmark.py` — Aggregate benchmark results +- `scripts/generate_report.py` — Generate optimization report +- `scripts/improve_description.py` — Improve skill description +- `scripts/package_skill.py` — Package skill for distribution +- `scripts/quick_validate.py` — Quick validation +- `scripts/run_eval.py` — Run triggering evaluation +- `scripts/run_loop.py` — Run optimization loop +- `scripts/utils.py` — Shared utilities diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/agents/analyzer.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/agents/analyzer.md new file mode 100644 index 000000000..14e41d606 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/agents/analyzer.md @@ -0,0 +1,274 @@ +# Post-hoc Analyzer Agent + +Analyze blind comparison results to understand WHY the winner won and generate improvement suggestions. + +## Role + +After the blind comparator determines a winner, the Post-hoc Analyzer "unblids" the results by examining the skills and transcripts. The goal is to extract actionable insights: what made the winner better, and how can the loser be improved? 
+ +## Inputs + +You receive these parameters in your prompt: + +- **winner**: "A" or "B" (from blind comparison) +- **winner_skill_path**: Path to the skill that produced the winning output +- **winner_transcript_path**: Path to the execution transcript for the winner +- **loser_skill_path**: Path to the skill that produced the losing output +- **loser_transcript_path**: Path to the execution transcript for the loser +- **comparison_result_path**: Path to the blind comparator's output JSON +- **output_path**: Where to save the analysis results + +## Process + +### Step 1: Read Comparison Result + +1. Read the blind comparator's output at comparison_result_path +2. Note the winning side (A or B), the reasoning, and any scores +3. Understand what the comparator valued in the winning output + +### Step 2: Read Both Skills + +1. Read the winner skill's SKILL.md and key referenced files +2. Read the loser skill's SKILL.md and key referenced files +3. Identify structural differences: + - Instructions clarity and specificity + - Script/tool usage patterns + - Example coverage + - Edge case handling + +### Step 3: Read Both Transcripts + +1. Read the winner's transcript +2. Read the loser's transcript +3. Compare execution patterns: + - How closely did each follow their skill's instructions? + - What tools were used differently? + - Where did the loser diverge from optimal behavior? + - Did either encounter errors or make recovery attempts? + +### Step 4: Analyze Instruction Following + +For each transcript, evaluate: +- Did the agent follow the skill's explicit instructions? +- Did the agent use the skill's provided tools/scripts? +- Were there missed opportunities to leverage skill content? +- Did the agent add unnecessary steps not in the skill? + +Score instruction following 1-10 and note specific issues. + +### Step 5: Identify Winner Strengths + +Determine what made the winner better: +- Clearer instructions that led to better behavior? +- Better scripts/tools that produced better output? +- More comprehensive examples that guided edge cases? +- Better error handling guidance? + +Be specific. Quote from skills/transcripts where relevant. + +### Step 6: Identify Loser Weaknesses + +Determine what held the loser back: +- Ambiguous instructions that led to suboptimal choices? +- Missing tools/scripts that forced workarounds? +- Gaps in edge case coverage? +- Poor error handling that caused failures? + +### Step 7: Generate Improvement Suggestions + +Based on the analysis, produce actionable suggestions for improving the loser skill: +- Specific instruction changes to make +- Tools/scripts to add or modify +- Examples to include +- Edge cases to address + +Prioritize by impact. Focus on changes that would have changed the outcome. + +### Step 8: Write Analysis Results + +Save structured analysis to `{output_path}`. 
+ +## Output Format + +Write a JSON file with this structure: + +```json +{ + "comparison_summary": { + "winner": "A", + "winner_skill": "path/to/winner/skill", + "loser_skill": "path/to/loser/skill", + "comparator_reasoning": "Brief summary of why comparator chose winner" + }, + "winner_strengths": [ + "Clear step-by-step instructions for handling multi-page documents", + "Included validation script that caught formatting errors", + "Explicit guidance on fallback behavior when OCR fails" + ], + "loser_weaknesses": [ + "Vague instruction 'process the document appropriately' led to inconsistent behavior", + "No script for validation, agent had to improvise and made errors", + "No guidance on OCR failure, agent gave up instead of trying alternatives" + ], + "instruction_following": { + "winner": { + "score": 9, + "issues": [ + "Minor: skipped optional logging step" + ] + }, + "loser": { + "score": 6, + "issues": [ + "Did not use the skill's formatting template", + "Invented own approach instead of following step 3", + "Missed the 'always validate output' instruction" + ] + } + }, + "improvement_suggestions": [ + { + "priority": "high", + "category": "instructions", + "suggestion": "Replace 'process the document appropriately' with explicit steps: 1) Extract text, 2) Identify sections, 3) Format per template", + "expected_impact": "Would eliminate ambiguity that caused inconsistent behavior" + }, + { + "priority": "high", + "category": "tools", + "suggestion": "Add validate_output.py script similar to winner skill's validation approach", + "expected_impact": "Would catch formatting errors before final output" + }, + { + "priority": "medium", + "category": "error_handling", + "suggestion": "Add fallback instructions: 'If OCR fails, try: 1) different resolution, 2) image preprocessing, 3) manual extraction'", + "expected_impact": "Would prevent early failure on difficult documents" + } + ], + "transcript_insights": { + "winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script -> Fixed 2 issues -> Produced output", + "loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods -> No validation -> Output had errors" + } +} +``` + +## Guidelines + +- **Be specific**: Quote from skills and transcripts, don't just say "instructions were unclear" +- **Be actionable**: Suggestions should be concrete changes, not vague advice +- **Focus on skill improvements**: The goal is to improve the losing skill, not critique the agent +- **Prioritize by impact**: Which changes would most likely have changed the outcome? +- **Consider causation**: Did the skill weakness actually cause the worse output, or is it incidental? +- **Stay objective**: Analyze what happened, don't editorialize +- **Think about generalization**: Would this improvement help on other evals too? 
+ +## Categories for Suggestions + +Use these categories to organize improvement suggestions: + +| Category | Description | +|----------|-------------| +| `instructions` | Changes to the skill's prose instructions | +| `tools` | Scripts, templates, or utilities to add/modify | +| `examples` | Example inputs/outputs to include | +| `error_handling` | Guidance for handling failures | +| `structure` | Reorganization of skill content | +| `references` | External docs or resources to add | + +## Priority Levels + +- **high**: Would likely change the outcome of this comparison +- **medium**: Would improve quality but may not change win/loss +- **low**: Nice to have, marginal improvement + +--- + +# Analyzing Benchmark Results + +When analyzing benchmark results, the analyzer's purpose is to **surface patterns and anomalies** across multiple runs, not suggest skill improvements. + +## Role + +Review all benchmark run results and generate freeform notes that help the user understand skill performance. Focus on patterns that wouldn't be visible from aggregate metrics alone. + +## Inputs + +You receive these parameters in your prompt: + +- **benchmark_data_path**: Path to the in-progress benchmark.json with all run results +- **skill_path**: Path to the skill being benchmarked +- **output_path**: Where to save the notes (as JSON array of strings) + +## Process + +### Step 1: Read Benchmark Data + +1. Read the benchmark.json containing all run results +2. Note the configurations tested (with_skill, without_skill) +3. Understand the run_summary aggregates already calculated + +### Step 2: Analyze Per-Assertion Patterns + +For each expectation across all runs: +- Does it **always pass** in both configurations? (may not differentiate skill value) +- Does it **always fail** in both configurations? (may be broken or beyond capability) +- Does it **always pass with skill but fail without**? (skill clearly adds value here) +- Does it **always fail with skill but pass without**? (skill may be hurting) +- Is it **highly variable**? (flaky expectation or non-deterministic behavior) + +### Step 3: Analyze Cross-Eval Patterns + +Look for patterns across evals: +- Are certain eval types consistently harder/easier? +- Do some evals show high variance while others are stable? +- Are there surprising results that contradict expectations? + +### Step 4: Analyze Metrics Patterns + +Look at time_seconds, tokens, tool_calls: +- Does the skill significantly increase execution time? +- Is there high variance in resource usage? +- Are there outlier runs that skew the aggregates? + +### Step 5: Generate Notes + +Write freeform observations as a list of strings. 
Each note should: +- State a specific observation +- Be grounded in the data (not speculation) +- Help the user understand something the aggregate metrics don't show + +Examples: +- "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value" +- "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure that may be flaky" +- "Without-skill runs consistently fail on table extraction expectations (0% pass rate)" +- "Skill adds 13s average execution time but improves pass rate by 50%" +- "Token usage is 80% higher with skill, primarily due to script output parsing" +- "All 3 without-skill runs for eval 1 produced empty output" + +### Step 6: Write Notes + +Save notes to `{output_path}` as a JSON array of strings: + +```json +[ + "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value", + "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure", + "Without-skill runs consistently fail on table extraction expectations", + "Skill adds 13s average execution time but improves pass rate by 50%" +] +``` + +## Guidelines + +**DO:** +- Report what you observe in the data +- Be specific about which evals, expectations, or runs you're referring to +- Note patterns that aggregate metrics would hide +- Provide context that helps interpret the numbers + +**DO NOT:** +- Suggest improvements to the skill (that's for the improvement step, not benchmarking) +- Make subjective quality judgments ("the output was good/bad") +- Speculate about causes without evidence +- Repeat information already in the run_summary aggregates diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/agents/comparator.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/agents/comparator.md new file mode 100644 index 000000000..80e00eb45 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/agents/comparator.md @@ -0,0 +1,202 @@ +# Blind Comparator Agent + +Compare two outputs WITHOUT knowing which skill produced them. + +## Role + +The Blind Comparator judges which output better accomplishes the eval task. You receive two outputs labeled A and B, but you do NOT know which skill produced which. This prevents bias toward a particular skill or approach. + +Your judgment is based purely on output quality and task completion. + +## Inputs + +You receive these parameters in your prompt: + +- **output_a_path**: Path to the first output file or directory +- **output_b_path**: Path to the second output file or directory +- **eval_prompt**: The original task/prompt that was executed +- **expectations**: List of expectations to check (optional - may be empty) + +## Process + +### Step 1: Read Both Outputs + +1. Examine output A (file or directory) +2. Examine output B (file or directory) +3. Note the type, structure, and content of each +4. If outputs are directories, examine all relevant files inside + +### Step 2: Understand the Task + +1. Read the eval_prompt carefully +2. Identify what the task requires: + - What should be produced? + - What qualities matter (accuracy, completeness, format)? + - What would distinguish a good output from a poor one? 
+ +### Step 3: Generate Evaluation Rubric + +Based on the task, generate a rubric with two dimensions: + +**Content Rubric** (what the output contains): +| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) | +|-----------|----------|----------------|---------------| +| Correctness | Major errors | Minor errors | Fully correct | +| Completeness | Missing key elements | Mostly complete | All elements present | +| Accuracy | Significant inaccuracies | Minor inaccuracies | Accurate throughout | + +**Structure Rubric** (how the output is organized): +| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) | +|-----------|----------|----------------|---------------| +| Organization | Disorganized | Reasonably organized | Clear, logical structure | +| Formatting | Inconsistent/broken | Mostly consistent | Professional, polished | +| Usability | Difficult to use | Usable with effort | Easy to use | + +Adapt criteria to the specific task. For example: +- PDF form → "Field alignment", "Text readability", "Data placement" +- Document → "Section structure", "Heading hierarchy", "Paragraph flow" +- Data output → "Schema correctness", "Data types", "Completeness" + +### Step 4: Evaluate Each Output Against the Rubric + +For each output (A and B): + +1. **Score each criterion** on the rubric (1-5 scale) +2. **Calculate dimension totals**: Content score, Structure score +3. **Calculate overall score**: Average of dimension scores, scaled to 1-10 + +### Step 5: Check Assertions (if provided) + +If expectations are provided: + +1. Check each expectation against output A +2. Check each expectation against output B +3. Count pass rates for each output +4. Use expectation scores as secondary evidence (not the primary decision factor) + +### Step 6: Determine the Winner + +Compare A and B based on (in priority order): + +1. **Primary**: Overall rubric score (content + structure) +2. **Secondary**: Assertion pass rates (if applicable) +3. **Tiebreaker**: If truly equal, declare a TIE + +Be decisive - ties should be rare. One output is usually better, even if marginally. + +### Step 7: Write Comparison Results + +Save results to a JSON file at the path specified (or `comparison.json` if not specified). + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "winner": "A", + "reasoning": "Output A provides a complete solution with proper formatting and all required fields. 
Output B is missing the date field and has formatting inconsistencies.", + "rubric": { + "A": { + "content": { + "correctness": 5, + "completeness": 5, + "accuracy": 4 + }, + "structure": { + "organization": 4, + "formatting": 5, + "usability": 4 + }, + "content_score": 4.7, + "structure_score": 4.3, + "overall_score": 9.0 + }, + "B": { + "content": { + "correctness": 3, + "completeness": 2, + "accuracy": 3 + }, + "structure": { + "organization": 3, + "formatting": 2, + "usability": 3 + }, + "content_score": 2.7, + "structure_score": 2.7, + "overall_score": 5.4 + } + }, + "output_quality": { + "A": { + "score": 9, + "strengths": ["Complete solution", "Well-formatted", "All fields present"], + "weaknesses": ["Minor style inconsistency in header"] + }, + "B": { + "score": 5, + "strengths": ["Readable output", "Correct basic structure"], + "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"] + } + }, + "expectation_results": { + "A": { + "passed": 4, + "total": 5, + "pass_rate": 0.80, + "details": [ + {"text": "Output includes name", "passed": true}, + {"text": "Output includes date", "passed": true}, + {"text": "Format is PDF", "passed": true}, + {"text": "Contains signature", "passed": false}, + {"text": "Readable text", "passed": true} + ] + }, + "B": { + "passed": 3, + "total": 5, + "pass_rate": 0.60, + "details": [ + {"text": "Output includes name", "passed": true}, + {"text": "Output includes date", "passed": false}, + {"text": "Format is PDF", "passed": true}, + {"text": "Contains signature", "passed": false}, + {"text": "Readable text", "passed": true} + ] + } + } +} +``` + +If no expectations were provided, omit the `expectation_results` field entirely. + +## Field Descriptions + +- **winner**: "A", "B", or "TIE" +- **reasoning**: Clear explanation of why the winner was chosen (or why it's a tie) +- **rubric**: Structured rubric evaluation for each output + - **content**: Scores for content criteria (correctness, completeness, accuracy) + - **structure**: Scores for structure criteria (organization, formatting, usability) + - **content_score**: Average of content criteria (1-5) + - **structure_score**: Average of structure criteria (1-5) + - **overall_score**: Combined score scaled to 1-10 +- **output_quality**: Summary quality assessment + - **score**: 1-10 rating (should match rubric overall_score) + - **strengths**: List of positive aspects + - **weaknesses**: List of issues or shortcomings +- **expectation_results**: (Only if expectations provided) + - **passed**: Number of expectations that passed + - **total**: Total number of expectations + - **pass_rate**: Fraction passed (0.0 to 1.0) + - **details**: Individual expectation results + +## Guidelines + +- **Stay blind**: DO NOT try to infer which skill produced which output. Judge purely on output quality. +- **Be specific**: Cite specific examples when explaining strengths and weaknesses. +- **Be decisive**: Choose a winner unless outputs are genuinely equivalent. +- **Output quality first**: Assertion scores are secondary to overall task completion. +- **Be objective**: Don't favor outputs based on style preferences; focus on correctness and completeness. +- **Explain your reasoning**: The reasoning field should make it clear why you chose the winner. +- **Handle edge cases**: If both outputs fail, pick the one that fails less badly. If both are excellent, pick the one that's marginally better. 
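As a footnote on the Step 4 scoring arithmetic, it can be read off the sample JSON above; here is a minimal sketch of one way to compute it (rounding to one decimal place is an assumption inferred from the sample numbers):

```python
# Average each dimension on its 1-5 scale, then combine the two dimensions and
# scale to 1-10. Averaging two dimensions and doubling is simply their sum.
def rubric_scores(content: dict[str, int], structure: dict[str, int]) -> dict[str, float]:
    content_score = round(sum(content.values()) / len(content), 1)
    structure_score = round(sum(structure.values()) / len(structure), 1)
    overall_score = round(content_score + structure_score, 1)
    return {
        "content_score": content_score,
        "structure_score": structure_score,
        "overall_score": overall_score,
    }


# Reproduces output A above: content 5/5/4 -> 4.7, structure 4/5/4 -> 4.3, overall 9.0.
print(rubric_scores(
    {"correctness": 5, "completeness": 5, "accuracy": 4},
    {"organization": 4, "formatting": 5, "usability": 4},
))
```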
diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/agents/grader.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/agents/grader.md new file mode 100644 index 000000000..558ab05c0 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/agents/grader.md @@ -0,0 +1,223 @@ +# Grader Agent + +Evaluate expectations against an execution transcript and outputs. + +## Role + +The Grader reviews a transcript and output files, then determines whether each expectation passes or fails. Provide clear evidence for each judgment. + +You have two jobs: grade the outputs, and critique the evals themselves. A passing grade on a weak assertion is worse than useless — it creates false confidence. When you notice an assertion that's trivially satisfied, or an important outcome that no assertion checks, say so. + +## Inputs + +You receive these parameters in your prompt: + +- **expectations**: List of expectations to evaluate (strings) +- **transcript_path**: Path to the execution transcript (markdown file) +- **outputs_dir**: Directory containing output files from execution + +## Process + +### Step 1: Read the Transcript + +1. Read the transcript file completely +2. Note the eval prompt, execution steps, and final result +3. Identify any issues or errors documented + +### Step 2: Examine Output Files + +1. List files in outputs_dir +2. Read/examine each file relevant to the expectations. If outputs aren't plain text, use the inspection tools provided in your prompt — don't rely solely on what the transcript says the executor produced. +3. Note contents, structure, and quality + +### Step 3: Evaluate Each Assertion + +For each expectation: + +1. **Search for evidence** in the transcript and outputs +2. **Determine verdict**: + - **PASS**: Clear evidence the expectation is true AND the evidence reflects genuine task completion, not just surface-level compliance + - **FAIL**: No evidence, or evidence contradicts the expectation, or the evidence is superficial (e.g., correct filename but empty/wrong content) +3. **Cite the evidence**: Quote the specific text or describe what you found + +### Step 4: Extract and Verify Claims + +Beyond the predefined expectations, extract implicit claims from the outputs and verify them: + +1. **Extract claims** from the transcript and outputs: + - Factual statements ("The form has 12 fields") + - Process claims ("Used pypdf to fill the form") + - Quality claims ("All fields were filled correctly") + +2. **Verify each claim**: + - **Factual claims**: Can be checked against the outputs or external sources + - **Process claims**: Can be verified from the transcript + - **Quality claims**: Evaluate whether the claim is justified + +3. **Flag unverifiable claims**: Note claims that cannot be verified with available information + +This catches issues that predefined expectations might miss. + +### Step 5: Read User Notes + +If `{outputs_dir}/user_notes.md` exists: +1. Read it and note any uncertainties or issues flagged by the executor +2. Include relevant concerns in the grading output +3. These may reveal problems even when expectations pass + +### Step 6: Critique the Evals + +After grading, consider whether the evals themselves could be improved. Only surface suggestions when there's a clear gap. + +Good suggestions test meaningful outcomes — assertions that are hard to satisfy without actually doing the work correctly. 
Think about what makes an assertion *discriminating*: it passes when the skill genuinely succeeds and fails when it doesn't. + +Suggestions worth raising: +- An assertion that passed but would also pass for a clearly wrong output (e.g., checking filename existence but not file content) +- An important outcome you observed — good or bad — that no assertion covers at all +- An assertion that can't actually be verified from the available outputs + +Keep the bar high. The goal is to flag things the eval author would say "good catch" about, not to nitpick every assertion. + +### Step 7: Write Grading Results + +Save results to `{outputs_dir}/../grading.json` (sibling to outputs_dir). + +## Grading Criteria + +**PASS when**: +- The transcript or outputs clearly demonstrate the expectation is true +- Specific evidence can be cited +- The evidence reflects genuine substance, not just surface compliance (e.g., a file exists AND contains correct content, not just the right filename) + +**FAIL when**: +- No evidence found for the expectation +- Evidence contradicts the expectation +- The expectation cannot be verified from available information +- The evidence is superficial — the assertion is technically satisfied but the underlying task outcome is wrong or incomplete +- The output appears to meet the assertion by coincidence rather than by actually doing the work + +**When uncertain**: The burden of proof to pass is on the expectation. + +### Step 8: Read Executor Metrics and Timing + +1. If `{outputs_dir}/metrics.json` exists, read it and include in grading output +2. If `{outputs_dir}/../timing.json` exists, read it and include timing data + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "expectations": [ + { + "text": "The output includes the name 'John Smith'", + "passed": true, + "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'" + }, + { + "text": "The spreadsheet has a SUM formula in cell B10", + "passed": false, + "evidence": "No spreadsheet was created. The output was a text file." 
+ }, + { + "text": "The assistant used the skill's OCR script", + "passed": true, + "evidence": "Transcript Step 2 shows: 'Tool: Bash - python ocr_script.py image.png'" + } + ], + "summary": { + "passed": 2, + "failed": 1, + "total": 3, + "pass_rate": 0.67 + }, + "execution_metrics": { + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8 + }, + "total_tool_calls": 15, + "total_steps": 6, + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 + }, + "timing": { + "executor_duration_seconds": 165.0, + "grader_duration_seconds": 26.0, + "total_duration_seconds": 191.0 + }, + "claims": [ + { + "claim": "The form has 12 fillable fields", + "type": "factual", + "verified": true, + "evidence": "Counted 12 fields in field_info.json" + }, + { + "claim": "All required fields were populated", + "type": "quality", + "verified": false, + "evidence": "Reference section was left blank despite data being available" + } + ], + "user_notes_summary": { + "uncertainties": ["Used 2023 data, may be stale"], + "needs_review": [], + "workarounds": ["Fell back to text overlay for non-fillable fields"] + }, + "eval_feedback": { + "suggestions": [ + { + "assertion": "The output includes the name 'John Smith'", + "reason": "A hallucinated document that mentions the name would also pass — consider checking it appears as the primary contact with matching phone and email from the input" + }, + { + "reason": "No assertion checks whether the extracted phone numbers match the input — I observed incorrect numbers in the output that went uncaught" + } + ], + "overall": "Assertions check presence but not correctness. Consider adding content verification." + } +} +``` + +## Field Descriptions + +- **expectations**: Array of graded expectations + - **text**: The original expectation text + - **passed**: Boolean - true if expectation passes + - **evidence**: Specific quote or description supporting the verdict +- **summary**: Aggregate statistics + - **passed**: Count of passed expectations + - **failed**: Count of failed expectations + - **total**: Total expectations evaluated + - **pass_rate**: Fraction passed (0.0 to 1.0) +- **execution_metrics**: Copied from executor's metrics.json (if available) + - **output_chars**: Total character count of output files (proxy for tokens) + - **transcript_chars**: Character count of transcript +- **timing**: Wall clock timing from timing.json (if available) + - **executor_duration_seconds**: Time spent in executor subagent + - **total_duration_seconds**: Total elapsed time for the run +- **claims**: Extracted and verified claims from the output + - **claim**: The statement being verified + - **type**: "factual", "process", or "quality" + - **verified**: Boolean - whether the claim holds + - **evidence**: Supporting or contradicting evidence +- **user_notes_summary**: Issues flagged by the executor + - **uncertainties**: Things the executor wasn't sure about + - **needs_review**: Items requiring human attention + - **workarounds**: Places where the skill didn't work as expected +- **eval_feedback**: Improvement suggestions for the evals (only when warranted) + - **suggestions**: List of concrete suggestions, each with a `reason` and optionally an `assertion` it relates to + - **overall**: Brief assessment — can be "No suggestions, evals look solid" if nothing to flag + +## Guidelines + +- **Be objective**: Base verdicts on evidence, not assumptions +- **Be specific**: Quote the exact text that supports your verdict +- **Be thorough**: Check both transcript and 
output files +- **Be consistent**: Apply the same standard to each expectation +- **Explain failures**: Make it clear why evidence was insufficient +- **No partial credit**: Each expectation is pass or fail, not partial diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/assets/eval_review.html b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/assets/eval_review.html new file mode 100644 index 000000000..938ff32ae --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/assets/eval_review.html @@ -0,0 +1,146 @@ + + + + + + Eval Set Review - __SKILL_NAME_PLACEHOLDER__ + + + + + + +

Eval Set Review: __SKILL_NAME_PLACEHOLDER__

+

Current description: __SKILL_DESCRIPTION_PLACEHOLDER__

+ +
+ + +
+ + + + + + + + + + +
Query | Should Trigger | Actions
+ +

+ + + + diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/eval-viewer/generate_review.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/eval-viewer/generate_review.py new file mode 100644 index 000000000..4f0b1fe00 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/eval-viewer/generate_review.py @@ -0,0 +1,473 @@ +#!/usr/bin/env python3 +"""Generate and serve a review page for eval results. + +Reads the workspace directory, discovers runs (directories with outputs/), +embeds all output data into a self-contained HTML page, and serves it via +a tiny HTTP server. Feedback auto-saves to feedback.json in the workspace. + +Usage: + python generate_review.py [--port PORT] [--skill-name NAME] + python generate_review.py --previous-feedback /path/to/old/feedback.json + +No dependencies beyond the Python stdlib are required. +""" + +import argparse +import base64 +import json +import mimetypes +import os +import re +import signal +import subprocess +import sys +import time +import webbrowser +from functools import partial +from http.server import HTTPServer, BaseHTTPRequestHandler +from pathlib import Path + +from loguru import logger + +# Files to exclude from output listings +METADATA_FILES = {"transcript.md", "user_notes.md", "metrics.json"} + +# Extensions we render as inline text +TEXT_EXTENSIONS = { + ".txt", ".md", ".json", ".csv", ".py", ".js", ".ts", ".tsx", ".jsx", + ".yaml", ".yml", ".xml", ".html", ".css", ".sh", ".rb", ".go", ".rs", + ".java", ".c", ".cpp", ".h", ".hpp", ".sql", ".r", ".toml", +} + +# Extensions we render as inline images +IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp"} + +# MIME type overrides for common types +MIME_OVERRIDES = { + ".svg": "image/svg+xml", + ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation", +} + + +def get_mime_type(path: Path) -> str: + ext = path.suffix.lower() + if ext in MIME_OVERRIDES: + return MIME_OVERRIDES[ext] + mime, _ = mimetypes.guess_type(str(path)) + return mime or "application/octet-stream" + + +def find_runs(workspace: Path) -> list[dict]: + """Recursively find directories that contain an outputs/ subdirectory.""" + runs: list[dict] = [] + _find_runs_recursive(workspace, workspace, runs) + runs.sort(key=lambda r: (r.get("eval_id", float("inf")), r["id"])) + return runs + + +def _find_runs_recursive(root: Path, current: Path, runs: list[dict]) -> None: + if not current.is_dir(): + return + + outputs_dir = current / "outputs" + if outputs_dir.is_dir(): + run = build_run(root, current) + if run: + runs.append(run) + return + + skip = {"node_modules", ".git", "__pycache__", "skill", "inputs"} + for child in sorted(current.iterdir()): + if child.is_dir() and child.name not in skip: + _find_runs_recursive(root, child, runs) + + +def build_run(root: Path, run_dir: Path) -> dict | None: + """Build a run dict with prompt, outputs, and grading data.""" + prompt = "" + eval_id = None + + # Try eval_metadata.json + for candidate in [run_dir / "eval_metadata.json", run_dir.parent / "eval_metadata.json"]: + if candidate.exists(): + try: + metadata = json.loads(candidate.read_text()) + prompt = metadata.get("prompt", "") + eval_id = metadata.get("eval_id") + except (json.JSONDecodeError, OSError): + pass + if 
prompt: + break + + # Fall back to transcript.md + if not prompt: + for candidate in [run_dir / "transcript.md", run_dir / "outputs" / "transcript.md"]: + if candidate.exists(): + try: + text = candidate.read_text() + match = re.search(r"## Eval Prompt\n\n([\s\S]*?)(?=\n##|$)", text) + if match: + prompt = match.group(1).strip() + except OSError: + pass + if prompt: + break + + if not prompt: + prompt = "(No prompt found)" + + run_id = str(run_dir.relative_to(root)).replace("/", "-").replace("\\", "-") + + # Collect output files + outputs_dir = run_dir / "outputs" + output_files: list[dict] = [] + if outputs_dir.is_dir(): + for f in sorted(outputs_dir.iterdir()): + if f.is_file() and f.name not in METADATA_FILES: + output_files.append(embed_file(f)) + + # Load grading if present + grading = None + for candidate in [run_dir / "grading.json", run_dir.parent / "grading.json"]: + if candidate.exists(): + try: + grading = json.loads(candidate.read_text()) + except (json.JSONDecodeError, OSError): + pass + if grading: + break + + return { + "id": run_id, + "prompt": prompt, + "eval_id": eval_id, + "outputs": output_files, + "grading": grading, + } + + +def embed_file(path: Path) -> dict: + """Read a file and return an embedded representation.""" + ext = path.suffix.lower() + mime = get_mime_type(path) + + if ext in TEXT_EXTENSIONS: + try: + content = path.read_text(errors="replace") + except OSError: + content = "(Error reading file)" + return { + "name": path.name, + "type": "text", + "content": content, + } + elif ext in IMAGE_EXTENSIONS: + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "image", + "mime": mime, + "data_uri": f"data:{mime};base64,{b64}", + } + elif ext == ".pdf": + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "pdf", + "data_uri": f"data:{mime};base64,{b64}", + } + elif ext == ".xlsx": + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "xlsx", + "data_b64": b64, + } + else: + # Binary / unknown — base64 download link + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "binary", + "mime": mime, + "data_uri": f"data:{mime};base64,{b64}", + } + + +def load_previous_iteration(workspace: Path) -> dict[str, dict]: + """Load previous iteration's feedback and outputs. + + Returns a map of run_id -> {"feedback": str, "outputs": list[dict]}. 
+ """ + result: dict[str, dict] = {} + + # Load feedback + feedback_map: dict[str, str] = {} + feedback_path = workspace / "feedback.json" + if feedback_path.exists(): + try: + data = json.loads(feedback_path.read_text()) + feedback_map = { + r["run_id"]: r["feedback"] + for r in data.get("reviews", []) + if r.get("feedback", "").strip() + } + except (json.JSONDecodeError, OSError, KeyError): + pass + + # Load runs (to get outputs) + prev_runs = find_runs(workspace) + for run in prev_runs: + result[run["id"]] = { + "feedback": feedback_map.get(run["id"], ""), + "outputs": run.get("outputs", []), + } + + # Also add feedback for run_ids that had feedback but no matching run + for run_id, fb in feedback_map.items(): + if run_id not in result: + result[run_id] = {"feedback": fb, "outputs": []} + + return result + + +def generate_html( + runs: list[dict], + skill_name: str, + previous: dict[str, dict] | None = None, + benchmark: dict | None = None, +) -> str: + """Generate the complete standalone HTML page with embedded data.""" + template_path = Path(__file__).parent / "viewer.html" + template = template_path.read_text() + + # Build previous_feedback and previous_outputs maps for the template + previous_feedback: dict[str, str] = {} + previous_outputs: dict[str, list[dict]] = {} + if previous: + for run_id, data in previous.items(): + if data.get("feedback"): + previous_feedback[run_id] = data["feedback"] + if data.get("outputs"): + previous_outputs[run_id] = data["outputs"] + + embedded = { + "skill_name": skill_name, + "runs": runs, + "previous_feedback": previous_feedback, + "previous_outputs": previous_outputs, + } + if benchmark: + embedded["benchmark"] = benchmark + + data_json = json.dumps(embedded) + + return template.replace("/*__EMBEDDED_DATA__*/", f"const EMBEDDED_DATA = {data_json};") + + +# --------------------------------------------------------------------------- +# HTTP server (stdlib only, zero dependencies) +# --------------------------------------------------------------------------- + +def _kill_port(port: int) -> None: + """Kill any process listening on the given port.""" + try: + result = subprocess.run( + ["lsof", "-ti", f":{port}"], + capture_output=True, text=True, timeout=5, + ) + for pid_str in result.stdout.strip().split("\n"): + if pid_str.strip(): + try: + os.kill(int(pid_str.strip()), signal.SIGTERM) + except (ProcessLookupError, ValueError): + pass + if result.stdout.strip(): + time.sleep(0.5) + except subprocess.TimeoutExpired: + pass + except FileNotFoundError: + logger.warning("Note: lsof not found, cannot check if port is in use") + +class ReviewHandler(BaseHTTPRequestHandler): + """Serves the review HTML and handles feedback saves. + + Regenerates the HTML on each page load so that refreshing the browser + picks up new eval outputs without restarting the server. 
+ """ + + def __init__( + self, + workspace: Path, + skill_name: str, + feedback_path: Path, + previous: dict[str, dict], + benchmark_path: Path | None, + *args, + **kwargs, + ): + self.workspace = workspace + self.skill_name = skill_name + self.feedback_path = feedback_path + self.previous = previous + self.benchmark_path = benchmark_path + super().__init__(*args, **kwargs) + + def do_GET(self) -> None: + if self.path == "/" or self.path == "/index.html": + # Regenerate HTML on each request (re-scans workspace for new outputs) + runs = find_runs(self.workspace) + benchmark = None + if self.benchmark_path and self.benchmark_path.exists(): + try: + benchmark = json.loads(self.benchmark_path.read_text()) + except (json.JSONDecodeError, OSError): + pass + html = generate_html(runs, self.skill_name, self.previous, benchmark) + content = html.encode("utf-8") + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(content))) + self.end_headers() + self.wfile.write(content) + elif self.path == "/api/feedback": + data = b"{}" + if self.feedback_path.exists(): + data = self.feedback_path.read_bytes() + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(data))) + self.end_headers() + self.wfile.write(data) + else: + self.send_error(404) + + def do_POST(self) -> None: + if self.path == "/api/feedback": + length = int(self.headers.get("Content-Length", 0)) + body = self.rfile.read(length) + try: + data = json.loads(body) + if not isinstance(data, dict) or "reviews" not in data: + raise ValueError("Expected JSON object with 'reviews' key") + self.feedback_path.write_text(json.dumps(data, indent=2) + "\n") + resp = b'{"ok":true}' + self.send_response(200) + except (json.JSONDecodeError, OSError, ValueError) as e: + resp = json.dumps({"error": str(e)}).encode() + self.send_response(500) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(resp))) + self.end_headers() + self.wfile.write(resp) + else: + self.send_error(404) + + def log_message(self, format: str, *args: object) -> None: + # Suppress request logging to keep terminal clean + pass + + +def main() -> None: + parser = argparse.ArgumentParser(description="Generate and serve eval review") + parser.add_argument("workspace", type=Path, help="Path to workspace directory") + parser.add_argument("--port", "-p", type=int, default=3117, help="Server port (default: 3117)") + parser.add_argument("--skill-name", "-n", type=str, default=None, help="Skill name for header") + parser.add_argument( + "--previous-workspace", type=Path, default=None, + help="Path to previous iteration's workspace (shows old outputs and feedback as context)", + ) + parser.add_argument( + "--benchmark", type=Path, default=None, + help="Path to benchmark.json to show in the Benchmark tab", + ) + parser.add_argument( + "--static", "-s", type=Path, default=None, + help="Write standalone HTML to this path instead of starting a server", + ) + args = parser.parse_args() + + workspace = args.workspace.resolve() + if not workspace.is_dir(): + logger.error(f"Error: {workspace} is not a directory") + sys.exit(1) + + runs = find_runs(workspace) + if not runs: + logger.error(f"No runs found in {workspace}") + sys.exit(1) + + skill_name = args.skill_name or workspace.name.replace("-workspace", "") + feedback_path = workspace / "feedback.json" + + previous: dict[str, dict] = {} + if 
args.previous_workspace: + previous = load_previous_iteration(args.previous_workspace.resolve()) + + benchmark_path = args.benchmark.resolve() if args.benchmark else None + benchmark = None + if benchmark_path and benchmark_path.exists(): + try: + benchmark = json.loads(benchmark_path.read_text()) + except (json.JSONDecodeError, OSError): + pass + + if args.static: + html = generate_html(runs, skill_name, previous, benchmark) + args.static.parent.mkdir(parents=True, exist_ok=True) + args.static.write_text(html) + logger.info(f"\n Static viewer written to: {args.static}\n") + sys.exit(0) + + # Kill any existing process on the target port + port = args.port + _kill_port(port) + handler = partial(ReviewHandler, workspace, skill_name, feedback_path, previous, benchmark_path) + try: + server = HTTPServer(("127.0.0.1", port), handler) + except OSError: + # Port still in use after kill attempt — find a free one + server = HTTPServer(("127.0.0.1", 0), handler) + port = server.server_address[1] + + url = f"http://localhost:{port}" + logger.info(f"\n Eval Viewer") + logger.info(f" ─────────────────────────────────") + logger.info(f" URL: {url}") + logger.info(f" Workspace: {workspace}") + logger.info(f" Feedback: {feedback_path}") + if previous: + logger.info(f" Previous: {args.previous_workspace} ({len(previous)} runs)") + if benchmark_path: + logger.info(f" Benchmark: {benchmark_path}") + logger.info(f"\n Press Ctrl+C to stop.\n") + + webbrowser.open(url) + + try: + server.serve_forever() + except KeyboardInterrupt: + logger.info("\nStopped.") + server.server_close() + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/eval-viewer/viewer.html b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/eval-viewer/viewer.html new file mode 100644 index 000000000..6d8e96348 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/eval-viewer/viewer.html @@ -0,0 +1,1325 @@ + + + + + + Eval Review + + + + + + + +
+
+
+

Eval Review:

+
Review each output and leave feedback below. Navigate with arrow keys or buttons. When done, copy feedback and paste into Claude Code.
+
+
+
+ + + + + +
+
+ +
+
Prompt
+
+
+
+
+ + +
+
Output
+
+
No output files found
+
+
+ + + + + + + + +
+
Your Feedback
+
+ + + +
+
+
+ + +
+ + +
+
+
No benchmark data available. Run a benchmark to see quantitative results here.
+
+
+
+ + +
+
+

Review Complete

+

Your feedback has been saved. Go back to your Claude Code session and tell Claude you're done reviewing.

+
+ +
+
+
+ + +
+ + + + diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/references/schemas.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/references/schemas.md new file mode 100644 index 000000000..b6eeaa2d4 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/references/schemas.md @@ -0,0 +1,430 @@ +# JSON Schemas + +This document defines the JSON schemas used by skill-creator. + +--- + +## evals.json + +Defines the evals for a skill. Located at `evals/evals.json` within the skill directory. + +```json +{ + "skill_name": "example-skill", + "evals": [ + { + "id": 1, + "prompt": "User's example prompt", + "expected_output": "Description of expected result", + "files": ["evals/files/sample1.pdf"], + "expectations": [ + "The output includes X", + "The skill used script Y" + ] + } + ] +} +``` + +**Fields:** +- `skill_name`: Name matching the skill's frontmatter +- `evals[].id`: Unique integer identifier +- `evals[].prompt`: The task to execute +- `evals[].expected_output`: Human-readable description of success +- `evals[].files`: Optional list of input file paths (relative to skill root) +- `evals[].expectations`: List of verifiable statements + +--- + +## history.json + +Tracks version progression in Improve mode. Located at workspace root. + +```json +{ + "started_at": "2026-01-15T10:30:00Z", + "skill_name": "pdf", + "current_best": "v2", + "iterations": [ + { + "version": "v0", + "parent": null, + "expectation_pass_rate": 0.65, + "grading_result": "baseline", + "is_current_best": false + }, + { + "version": "v1", + "parent": "v0", + "expectation_pass_rate": 0.75, + "grading_result": "won", + "is_current_best": false + }, + { + "version": "v2", + "parent": "v1", + "expectation_pass_rate": 0.85, + "grading_result": "won", + "is_current_best": true + } + ] +} +``` + +**Fields:** +- `started_at`: ISO timestamp of when improvement started +- `skill_name`: Name of the skill being improved +- `current_best`: Version identifier of the best performer +- `iterations[].version`: Version identifier (v0, v1, ...) +- `iterations[].parent`: Parent version this was derived from +- `iterations[].expectation_pass_rate`: Pass rate from grading +- `iterations[].grading_result`: "baseline", "won", "lost", or "tie" +- `iterations[].is_current_best`: Whether this is the current best version + +--- + +## grading.json + +Output from the grader agent. Located at `/grading.json`. + +```json +{ + "expectations": [ + { + "text": "The output includes the name 'John Smith'", + "passed": true, + "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'" + }, + { + "text": "The spreadsheet has a SUM formula in cell B10", + "passed": false, + "evidence": "No spreadsheet was created. The output was a text file." 
+ } + ], + "summary": { + "passed": 2, + "failed": 1, + "total": 3, + "pass_rate": 0.67 + }, + "execution_metrics": { + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8 + }, + "total_tool_calls": 15, + "total_steps": 6, + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 + }, + "timing": { + "executor_duration_seconds": 165.0, + "grader_duration_seconds": 26.0, + "total_duration_seconds": 191.0 + }, + "claims": [ + { + "claim": "The form has 12 fillable fields", + "type": "factual", + "verified": true, + "evidence": "Counted 12 fields in field_info.json" + } + ], + "user_notes_summary": { + "uncertainties": ["Used 2023 data, may be stale"], + "needs_review": [], + "workarounds": ["Fell back to text overlay for non-fillable fields"] + }, + "eval_feedback": { + "suggestions": [ + { + "assertion": "The output includes the name 'John Smith'", + "reason": "A hallucinated document that mentions the name would also pass" + } + ], + "overall": "Assertions check presence but not correctness." + } +} +``` + +**Fields:** +- `expectations[]`: Graded expectations with evidence +- `summary`: Aggregate pass/fail counts +- `execution_metrics`: Tool usage and output size (from executor's metrics.json) +- `timing`: Wall clock timing (from timing.json) +- `claims`: Extracted and verified claims from the output +- `user_notes_summary`: Issues flagged by the executor +- `eval_feedback`: (optional) Improvement suggestions for the evals, only present when the grader identifies issues worth raising + +--- + +## metrics.json + +Output from the executor agent. Located at `/outputs/metrics.json`. + +```json +{ + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8, + "Edit": 1, + "Glob": 2, + "Grep": 0 + }, + "total_tool_calls": 18, + "total_steps": 6, + "files_created": ["filled_form.pdf", "field_values.json"], + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 +} +``` + +**Fields:** +- `tool_calls`: Count per tool type +- `total_tool_calls`: Sum of all tool calls +- `total_steps`: Number of major execution steps +- `files_created`: List of output files created +- `errors_encountered`: Number of errors during execution +- `output_chars`: Total character count of output files +- `transcript_chars`: Character count of transcript + +--- + +## timing.json + +Wall clock timing for a run. Located at `/timing.json`. + +**How to capture:** When a subagent task completes, the task notification includes `total_tokens` and `duration_ms`. Save these immediately — they are not persisted anywhere else and cannot be recovered after the fact. + +```json +{ + "total_tokens": 84852, + "duration_ms": 23332, + "total_duration_seconds": 23.3, + "executor_start": "2026-01-15T10:30:00Z", + "executor_end": "2026-01-15T10:32:45Z", + "executor_duration_seconds": 165.0, + "grader_start": "2026-01-15T10:32:46Z", + "grader_end": "2026-01-15T10:33:12Z", + "grader_duration_seconds": 26.0 +} +``` + +--- + +## benchmark.json + +Output from Benchmark mode. Located at `benchmarks//benchmark.json`. 
+ +```json +{ + "metadata": { + "skill_name": "pdf", + "skill_path": "/path/to/pdf", + "executor_model": "claude-sonnet-4-20250514", + "analyzer_model": "most-capable-model", + "timestamp": "2026-01-15T10:30:00Z", + "evals_run": [1, 2, 3], + "runs_per_configuration": 3 + }, + + "runs": [ + { + "eval_id": 1, + "eval_name": "Ocean", + "configuration": "with_skill", + "run_number": 1, + "result": { + "pass_rate": 0.85, + "passed": 6, + "failed": 1, + "total": 7, + "time_seconds": 42.5, + "tokens": 3800, + "tool_calls": 18, + "errors": 0 + }, + "expectations": [ + {"text": "...", "passed": true, "evidence": "..."} + ], + "notes": [ + "Used 2023 data, may be stale", + "Fell back to text overlay for non-fillable fields" + ] + } + ], + + "run_summary": { + "with_skill": { + "pass_rate": {"mean": 0.85, "stddev": 0.05, "min": 0.80, "max": 0.90}, + "time_seconds": {"mean": 45.0, "stddev": 12.0, "min": 32.0, "max": 58.0}, + "tokens": {"mean": 3800, "stddev": 400, "min": 3200, "max": 4100} + }, + "without_skill": { + "pass_rate": {"mean": 0.35, "stddev": 0.08, "min": 0.28, "max": 0.45}, + "time_seconds": {"mean": 32.0, "stddev": 8.0, "min": 24.0, "max": 42.0}, + "tokens": {"mean": 2100, "stddev": 300, "min": 1800, "max": 2500} + }, + "delta": { + "pass_rate": "+0.50", + "time_seconds": "+13.0", + "tokens": "+1700" + } + }, + + "notes": [ + "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value", + "Eval 3 shows high variance (50% ± 40%) - may be flaky or model-dependent", + "Without-skill runs consistently fail on table extraction expectations", + "Skill adds 13s average execution time but improves pass rate by 50%" + ] +} +``` + +**Fields:** +- `metadata`: Information about the benchmark run + - `skill_name`: Name of the skill + - `timestamp`: When the benchmark was run + - `evals_run`: List of eval names or IDs + - `runs_per_configuration`: Number of runs per config (e.g. 3) +- `runs[]`: Individual run results + - `eval_id`: Numeric eval identifier + - `eval_name`: Human-readable eval name (used as section header in the viewer) + - `configuration`: Must be `"with_skill"` or `"without_skill"` (the viewer uses this exact string for grouping and color coding) + - `run_number`: Integer run number (1, 2, 3...) + - `result`: Nested object with `pass_rate`, `passed`, `total`, `time_seconds`, `tokens`, `errors` +- `run_summary`: Statistical aggregates per configuration + - `with_skill` / `without_skill`: Each contains `pass_rate`, `time_seconds`, `tokens` objects with `mean` and `stddev` fields + - `delta`: Difference strings like `"+0.50"`, `"+13.0"`, `"+1700"` +- `notes`: Freeform observations from the analyzer + +**Important:** The viewer reads these field names exactly. Using `config` instead of `configuration`, or putting `pass_rate` at the top level of a run instead of nested under `result`, will cause the viewer to show empty/zero values. Always reference this schema when generating benchmark.json manually. + +--- + +## comparison.json + +Output from blind comparator. Located at `/comparison-N.json`. + +```json +{ + "winner": "A", + "reasoning": "Output A provides a complete solution with proper formatting and all required fields. 
Output B is missing the date field and has formatting inconsistencies.", + "rubric": { + "A": { + "content": { + "correctness": 5, + "completeness": 5, + "accuracy": 4 + }, + "structure": { + "organization": 4, + "formatting": 5, + "usability": 4 + }, + "content_score": 4.7, + "structure_score": 4.3, + "overall_score": 9.0 + }, + "B": { + "content": { + "correctness": 3, + "completeness": 2, + "accuracy": 3 + }, + "structure": { + "organization": 3, + "formatting": 2, + "usability": 3 + }, + "content_score": 2.7, + "structure_score": 2.7, + "overall_score": 5.4 + } + }, + "output_quality": { + "A": { + "score": 9, + "strengths": ["Complete solution", "Well-formatted", "All fields present"], + "weaknesses": ["Minor style inconsistency in header"] + }, + "B": { + "score": 5, + "strengths": ["Readable output", "Correct basic structure"], + "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"] + } + }, + "expectation_results": { + "A": { + "passed": 4, + "total": 5, + "pass_rate": 0.80, + "details": [ + {"text": "Output includes name", "passed": true} + ] + }, + "B": { + "passed": 3, + "total": 5, + "pass_rate": 0.60, + "details": [ + {"text": "Output includes name", "passed": true} + ] + } + } +} +``` + +--- + +## analysis.json + +Output from post-hoc analyzer. Located at `/analysis.json`. + +```json +{ + "comparison_summary": { + "winner": "A", + "winner_skill": "path/to/winner/skill", + "loser_skill": "path/to/loser/skill", + "comparator_reasoning": "Brief summary of why comparator chose winner" + }, + "winner_strengths": [ + "Clear step-by-step instructions for handling multi-page documents", + "Included validation script that caught formatting errors" + ], + "loser_weaknesses": [ + "Vague instruction 'process the document appropriately' led to inconsistent behavior", + "No script for validation, agent had to improvise" + ], + "instruction_following": { + "winner": { + "score": 9, + "issues": ["Minor: skipped optional logging step"] + }, + "loser": { + "score": 6, + "issues": [ + "Did not use the skill's formatting template", + "Invented own approach instead of following step 3" + ] + } + }, + "improvement_suggestions": [ + { + "priority": "high", + "category": "instructions", + "suggestion": "Replace 'process the document appropriately' with explicit steps", + "expected_impact": "Would eliminate ambiguity that caused inconsistent behavior" + } + ], + "transcript_insights": { + "winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script", + "loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods" + } +} +``` diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/__init__.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/aggregate_benchmark.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/aggregate_benchmark.py new file mode 100644 index 000000000..ccc810819 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/aggregate_benchmark.py @@ -0,0 +1,403 @@ +#!/usr/bin/env python3 +""" +Aggregate individual run results into benchmark summary statistics. 
+ +Reads grading.json files from run directories and produces: +- run_summary with mean, stddev, min, max for each metric +- delta between with_skill and without_skill configurations + +Usage: + python aggregate_benchmark.py + +Example: + python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/ + +The script supports two directory layouts: + + Workspace layout (from skill-creator iterations): + / + └── eval-N/ + ├── with_skill/ + │ ├── run-1/grading.json + │ └── run-2/grading.json + └── without_skill/ + ├── run-1/grading.json + └── run-2/grading.json + + Legacy layout (with runs/ subdirectory): + / + └── runs/ + └── eval-N/ + ├── with_skill/ + │ └── run-1/grading.json + └── without_skill/ + └── run-1/grading.json +""" + +import argparse +import json +import math +import sys +from datetime import datetime, timezone +from pathlib import Path + +from loguru import logger + + +def calculate_stats(values: list[float]) -> dict: + """Calculate mean, stddev, min, max for a list of values.""" + if not values: + return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0} + + n = len(values) + mean = sum(values) / n + + if n > 1: + variance = sum((x - mean) ** 2 for x in values) / (n - 1) + stddev = math.sqrt(variance) + else: + stddev = 0.0 + + return { + "mean": round(mean, 4), + "stddev": round(stddev, 4), + "min": round(min(values), 4), + "max": round(max(values), 4) + } + + +def load_run_results(benchmark_dir: Path) -> dict: + """ + Load all run results from a benchmark directory. + + Returns dict keyed by config name (e.g. "with_skill"/"without_skill", + or "new_skill"/"old_skill"), each containing a list of run results. + """ + # Support both layouts: eval dirs directly under benchmark_dir, or under runs/ + runs_dir = benchmark_dir / "runs" + if runs_dir.exists(): + search_dir = runs_dir + elif list(benchmark_dir.glob("eval-*")): + search_dir = benchmark_dir + else: + logger.warning(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}") + return {} + + results: dict[str, list] = {} + + for eval_idx, eval_dir in enumerate(sorted(search_dir.glob("eval-*"))): + metadata_path = eval_dir / "eval_metadata.json" + if metadata_path.exists(): + try: + with open(metadata_path) as mf: + eval_id = json.load(mf).get("eval_id", eval_idx) + except (json.JSONDecodeError, OSError): + eval_id = eval_idx + else: + try: + eval_id = int(eval_dir.name.split("-")[1]) + except ValueError: + eval_id = eval_idx + + # Discover config directories dynamically rather than hardcoding names + for config_dir in sorted(eval_dir.iterdir()): + if not config_dir.is_dir(): + continue + # Skip non-config directories (inputs, outputs, etc.) 
+ if not list(config_dir.glob("run-*")): + continue + config = config_dir.name + if config not in results: + results[config] = [] + + for run_dir in sorted(config_dir.glob("run-*")): + run_number = int(run_dir.name.split("-")[1]) + grading_file = run_dir / "grading.json" + + if not grading_file.exists(): + logger.warning(f"Warning: grading.json not found in {run_dir}") + continue + + try: + with open(grading_file) as f: + grading = json.load(f) + except json.JSONDecodeError as e: + logger.warning(f"Warning: Invalid JSON in {grading_file}: {e}") + continue + + # Extract metrics + result = { + "eval_id": eval_id, + "run_number": run_number, + "pass_rate": grading.get("summary", {}).get("pass_rate", 0.0), + "passed": grading.get("summary", {}).get("passed", 0), + "failed": grading.get("summary", {}).get("failed", 0), + "total": grading.get("summary", {}).get("total", 0), + } + + # Extract timing — check grading.json first, then sibling timing.json + timing = grading.get("timing", {}) + result["time_seconds"] = timing.get("total_duration_seconds", 0.0) + timing_file = run_dir / "timing.json" + if result["time_seconds"] == 0.0 and timing_file.exists(): + try: + with open(timing_file) as tf: + timing_data = json.load(tf) + result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0) + result["tokens"] = timing_data.get("total_tokens", 0) + except json.JSONDecodeError: + pass + + # Extract metrics if available + metrics = grading.get("execution_metrics", {}) + result["tool_calls"] = metrics.get("total_tool_calls", 0) + if not result.get("tokens"): + result["tokens"] = metrics.get("output_chars", 0) + result["errors"] = metrics.get("errors_encountered", 0) + + # Extract expectations — viewer requires fields: text, passed, evidence + raw_expectations = grading.get("expectations", []) + for exp in raw_expectations: + if "text" not in exp or "passed" not in exp: + logger.warning(f"Warning: expectation in {grading_file} missing required fields (text, passed, evidence): {exp}") + result["expectations"] = raw_expectations + + # Extract notes from user_notes_summary + notes_summary = grading.get("user_notes_summary", {}) + notes = [] + notes.extend(notes_summary.get("uncertainties", [])) + notes.extend(notes_summary.get("needs_review", [])) + notes.extend(notes_summary.get("workarounds", [])) + result["notes"] = notes + + results[config].append(result) + + return results + + +def aggregate_results(results: dict) -> dict: + """ + Aggregate run results into summary statistics. + + Returns run_summary with stats for each configuration and delta. 
+ """ + run_summary = {} + configs = list(results.keys()) + + for config in configs: + runs = results.get(config, []) + + if not runs: + run_summary[config] = { + "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}, + "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}, + "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0} + } + continue + + pass_rates = [r["pass_rate"] for r in runs] + times = [r["time_seconds"] for r in runs] + tokens = [r.get("tokens", 0) for r in runs] + + run_summary[config] = { + "pass_rate": calculate_stats(pass_rates), + "time_seconds": calculate_stats(times), + "tokens": calculate_stats(tokens) + } + + # Calculate delta between the first two configs (if two exist) + if len(configs) >= 2: + primary = run_summary.get(configs[0], {}) + baseline = run_summary.get(configs[1], {}) + else: + primary = run_summary.get(configs[0], {}) if configs else {} + baseline = {} + + delta_pass_rate = primary.get("pass_rate", {}).get("mean", 0) - baseline.get("pass_rate", {}).get("mean", 0) + delta_time = primary.get("time_seconds", {}).get("mean", 0) - baseline.get("time_seconds", {}).get("mean", 0) + delta_tokens = primary.get("tokens", {}).get("mean", 0) - baseline.get("tokens", {}).get("mean", 0) + + run_summary["delta"] = { + "pass_rate": f"{delta_pass_rate:+.2f}", + "time_seconds": f"{delta_time:+.1f}", + "tokens": f"{delta_tokens:+.0f}" + } + + return run_summary + + +def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict: + """ + Generate complete benchmark.json from run results. + """ + results = load_run_results(benchmark_dir) + run_summary = aggregate_results(results) + + # Build runs array for benchmark.json + runs = [] + for config in results: + for result in results[config]: + runs.append({ + "eval_id": result["eval_id"], + "configuration": config, + "run_number": result["run_number"], + "result": { + "pass_rate": result["pass_rate"], + "passed": result["passed"], + "failed": result["failed"], + "total": result["total"], + "time_seconds": result["time_seconds"], + "tokens": result.get("tokens", 0), + "tool_calls": result.get("tool_calls", 0), + "errors": result.get("errors", 0) + }, + "expectations": result["expectations"], + "notes": result["notes"] + }) + + # Determine eval IDs from results + eval_ids = sorted(set( + r["eval_id"] + for config in results.values() + for r in config + )) + + benchmark = { + "metadata": { + "skill_name": skill_name or "", + "skill_path": skill_path or "", + "executor_model": "", + "analyzer_model": "", + "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + "evals_run": eval_ids, + "runs_per_configuration": 3 + }, + "runs": runs, + "run_summary": run_summary, + "notes": [] # To be filled by analyzer + } + + return benchmark + + +def generate_markdown(benchmark: dict) -> str: + """Generate human-readable benchmark.md from benchmark data.""" + metadata = benchmark["metadata"] + run_summary = benchmark["run_summary"] + + # Determine config names (excluding "delta") + configs = [k for k in run_summary if k != "delta"] + config_a = configs[0] if len(configs) >= 1 else "config_a" + config_b = configs[1] if len(configs) >= 2 else "config_b" + label_a = config_a.replace("_", " ").title() + label_b = config_b.replace("_", " ").title() + + lines = [ + f"# Skill Benchmark: {metadata['skill_name']}", + "", + f"**Model**: {metadata['executor_model']}", + f"**Date**: {metadata['timestamp']}", + f"**Evals**: {', '.join(map(str, metadata['evals_run']))} 
({metadata['runs_per_configuration']} runs each per configuration)", + "", + "## Summary", + "", + f"| Metric | {label_a} | {label_b} | Delta |", + "|--------|------------|---------------|-------|", + ] + + a_summary = run_summary.get(config_a, {}) + b_summary = run_summary.get(config_b, {}) + delta = run_summary.get("delta", {}) + + # Format pass rate + a_pr = a_summary.get("pass_rate", {}) + b_pr = b_summary.get("pass_rate", {}) + lines.append(f"| Pass Rate | {a_pr.get('mean', 0)*100:.0f}% ± {a_pr.get('stddev', 0)*100:.0f}% | {b_pr.get('mean', 0)*100:.0f}% ± {b_pr.get('stddev', 0)*100:.0f}% | {delta.get('pass_rate', '—')} |") + + # Format time + a_time = a_summary.get("time_seconds", {}) + b_time = b_summary.get("time_seconds", {}) + lines.append(f"| Time | {a_time.get('mean', 0):.1f}s ± {a_time.get('stddev', 0):.1f}s | {b_time.get('mean', 0):.1f}s ± {b_time.get('stddev', 0):.1f}s | {delta.get('time_seconds', '—')}s |") + + # Format tokens + a_tokens = a_summary.get("tokens", {}) + b_tokens = b_summary.get("tokens", {}) + lines.append(f"| Tokens | {a_tokens.get('mean', 0):.0f} ± {a_tokens.get('stddev', 0):.0f} | {b_tokens.get('mean', 0):.0f} ± {b_tokens.get('stddev', 0):.0f} | {delta.get('tokens', '—')} |") + + # Notes section + if benchmark.get("notes"): + lines.extend([ + "", + "## Notes", + "" + ]) + for note in benchmark["notes"]: + lines.append(f"- {note}") + + return "\n".join(lines) + + +def main(): + parser = argparse.ArgumentParser( + description="Aggregate benchmark run results into summary statistics" + ) + parser.add_argument( + "benchmark_dir", + type=Path, + help="Path to the benchmark directory" + ) + parser.add_argument( + "--skill-name", + default="", + help="Name of the skill being benchmarked" + ) + parser.add_argument( + "--skill-path", + default="", + help="Path to the skill being benchmarked" + ) + parser.add_argument( + "--output", "-o", + type=Path, + help="Output path for benchmark.json (default: /benchmark.json)" + ) + + args = parser.parse_args() + + if not args.benchmark_dir.exists(): + logger.error(f"Directory not found: {args.benchmark_dir}") + sys.exit(1) + + # Generate benchmark + benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path) + + # Determine output paths + output_json = args.output or (args.benchmark_dir / "benchmark.json") + output_md = output_json.with_suffix(".md") + + # Write benchmark.json + with open(output_json, "w") as f: + json.dump(benchmark, f, indent=2) + logger.info(f"Generated: {output_json}") + + # Write benchmark.md + markdown = generate_markdown(benchmark) + with open(output_md, "w") as f: + f.write(markdown) + logger.info(f"Generated: {output_md}") + + # Print summary + run_summary = benchmark["run_summary"] + configs = [k for k in run_summary if k != "delta"] + delta = run_summary.get("delta", {}) + + logger.info(f"\nSummary:") + for config in configs: + pr = run_summary[config]["pass_rate"]["mean"] + label = config.replace("_", " ").title() + logger.info(f" {label}: {pr*100:.1f}% pass rate") + logger.info(f" Delta: {delta.get('pass_rate', '—')}") + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/generate_report.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/generate_report.py new file mode 100644 index 000000000..395232d96 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/generate_report.py @@ -0,0 +1,328 @@ 
+#!/usr/bin/env python3 +"""Generate an HTML report from run_loop.py output. + +Takes the JSON output from run_loop.py and generates a visual HTML report +showing each description attempt with check/x for each test case. +Distinguishes between train and test queries. +""" + +import argparse +import html +import json +import sys +from pathlib import Path + +from loguru import logger + + +def generate_html(data: dict, auto_refresh: bool = False, skill_name: str = "") -> str: + """Generate HTML report from loop output data. If auto_refresh is True, adds a meta refresh tag.""" + history = data.get("history", []) + holdout = data.get("holdout", 0) + title_prefix = html.escape(skill_name + " \u2014 ") if skill_name else "" + + # Get all unique queries from train and test sets, with should_trigger info + train_queries: list[dict] = [] + test_queries: list[dict] = [] + if history: + for r in history[0].get("train_results", history[0].get("results", [])): + train_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)}) + if history[0].get("test_results"): + for r in history[0].get("test_results", []): + test_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)}) + + refresh_tag = ' \n' if auto_refresh else "" + + html_parts = [""" + + + +""" + refresh_tag + """ """ + title_prefix + """Skill Description Optimization + + + + + + +

""" + title_prefix + """Skill Description Optimization

+
+
 Optimizing your skill's description. This page updates automatically as Claude tests different versions of your skill's description. Each row is an iteration — a new description attempt. The columns show test queries: green checkmarks mean the skill triggered correctly (or correctly didn't trigger), and red crosses mean it got it wrong. The "Train" score shows performance on queries used to improve the description; the "Test" score shows performance on held-out queries the optimizer hasn't seen. When it's done, Claude will apply the best-performing description to your skill.
+"""] + + # Summary section + best_test_score = data.get('best_test_score') + best_train_score = data.get('best_train_score') + html_parts.append(f""" +
+

Original: {html.escape(data.get('original_description', 'N/A'))}

+

Best: {html.escape(data.get('best_description', 'N/A'))}

+

Best Score: {data.get('best_score', 'N/A')} {'(test)' if best_test_score else '(train)'}

+

Iterations: {data.get('iterations_run', 0)} | Train: {data.get('train_size', '?')} | Test: {data.get('test_size', '?')}

+
+""") + + # Legend + html_parts.append(""" +
+ Query columns: + Should trigger + Should NOT trigger + Train + Test +
+""") + + # Table header + html_parts.append(""" +
+ + + + + + + +""") + + # Add column headers for train queries + for qinfo in train_queries: + polarity = "positive-col" if qinfo["should_trigger"] else "negative-col" + html_parts.append(f' \n') + + # Add column headers for test queries (different color) + for qinfo in test_queries: + polarity = "positive-col" if qinfo["should_trigger"] else "negative-col" + html_parts.append(f' \n') + + html_parts.append(""" + + +""") + + # Find best iteration for highlighting + if test_queries: + best_iter = max(history, key=lambda h: h.get("test_passed") or 0).get("iteration") + else: + best_iter = max(history, key=lambda h: h.get("train_passed", h.get("passed", 0))).get("iteration") + + # Add rows for each iteration + for h in history: + iteration = h.get("iteration", "?") + train_passed = h.get("train_passed", h.get("passed", 0)) + train_total = h.get("train_total", h.get("total", 0)) + test_passed = h.get("test_passed") + test_total = h.get("test_total") + description = h.get("description", "") + train_results = h.get("train_results", h.get("results", [])) + test_results = h.get("test_results", []) + + # Create lookups for results by query + train_by_query = {r["query"]: r for r in train_results} + test_by_query = {r["query"]: r for r in test_results} if test_results else {} + + # Compute aggregate correct/total runs across all retries + def aggregate_runs(results: list[dict]) -> tuple[int, int]: + correct = 0 + total = 0 + for r in results: + runs = r.get("runs", 0) + triggers = r.get("triggers", 0) + total += runs + if r.get("should_trigger", True): + correct += triggers + else: + correct += runs - triggers + return correct, total + + train_correct, train_runs = aggregate_runs(train_results) + test_correct, test_runs = aggregate_runs(test_results) + + # Determine score classes + def score_class(correct: int, total: int) -> str: + if total > 0: + ratio = correct / total + if ratio >= 0.8: + return "score-good" + elif ratio >= 0.5: + return "score-ok" + return "score-bad" + + train_class = score_class(train_correct, train_runs) + test_class = score_class(test_correct, test_runs) + + row_class = "best-row" if iteration == best_iter else "" + + html_parts.append(f""" + + + + +""") + + # Add result for each train query + for qinfo in train_queries: + r = train_by_query.get(qinfo["query"], {}) + did_pass = r.get("pass", False) + triggers = r.get("triggers", 0) + runs = r.get("runs", 0) + + icon = "✓" if did_pass else "✗" + css_class = "pass" if did_pass else "fail" + + html_parts.append(f' \n') + + # Add result for each test query (with different background) + for qinfo in test_queries: + r = test_by_query.get(qinfo["query"], {}) + did_pass = r.get("pass", False) + triggers = r.get("triggers", 0) + runs = r.get("runs", 0) + + icon = "✓" if did_pass else "✗" + css_class = "pass" if did_pass else "fail" + + html_parts.append(f' \n') + + html_parts.append(" \n") + + html_parts.append(""" +
IterTrainTestDescription{html.escape(qinfo["query"])}{html.escape(qinfo["query"])}
{iteration}{train_correct}/{train_runs}{test_correct}/{test_runs}{html.escape(description)}{icon}{triggers}/{runs}{icon}{triggers}/{runs}
+
+""") + + html_parts.append(""" + + +""") + + return "".join(html_parts) + + +def main(): + parser = argparse.ArgumentParser(description="Generate HTML report from run_loop output") + parser.add_argument("input", help="Path to JSON output from run_loop.py (or - for stdin)") + parser.add_argument("-o", "--output", default=None, help="Output HTML file (default: stdout)") + parser.add_argument("--skill-name", default="", help="Skill name to include in the report title") + args = parser.parse_args() + + if args.input == "-": + data = json.load(sys.stdin) + else: + data = json.loads(Path(args.input).read_text()) + + html_output = generate_html(data, skill_name=args.skill_name) + + if args.output: + Path(args.output).write_text(html_output) + logger.info(f"Report written to {args.output}") + else: + print(html_output) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/improve_description.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/improve_description.py new file mode 100644 index 000000000..887a06a08 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/improve_description.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 +"""Improve a skill description based on eval results. + +Takes eval results (from run_eval.py) and generates an improved description +using Claude with extended thinking. +""" + +import argparse +import json +import re +import sys +from pathlib import Path + +import anthropic +from loguru import logger + +from scripts.utils import parse_skill_md + + +def improve_description( + client: anthropic.Anthropic, + skill_name: str, + skill_content: str, + current_description: str, + eval_results: dict, + history: list[dict], + model: str, + test_results: dict | None = None, + log_dir: Path | None = None, + iteration: int | None = None, +) -> str: + """Call Claude to improve the description based on eval results.""" + failed_triggers = [ + r for r in eval_results["results"] + if r["should_trigger"] and not r["pass"] + ] + false_triggers = [ + r for r in eval_results["results"] + if not r["should_trigger"] and not r["pass"] + ] + + # Build scores summary + train_score = f"{eval_results['summary']['passed']}/{eval_results['summary']['total']}" + if test_results: + test_score = f"{test_results['summary']['passed']}/{test_results['summary']['total']}" + scores_summary = f"Train: {train_score}, Test: {test_score}" + else: + scores_summary = f"Train: {train_score}" + + prompt = f"""You are optimizing a skill description for a Claude Code skill called "{skill_name}". A "skill" is sort of like a prompt, but with progressive disclosure -- there's a title and description that Claude sees when deciding whether to use the skill, and then if it does use the skill, it reads the .md file which has lots more details and potentially links to other resources in the skill folder like helper files and scripts and additional documentation or examples. + +The description appears in Claude's "available_skills" list. When a user sends a query, Claude decides whether to invoke the skill based solely on the title and on this description. Your goal is to write a description that triggers for relevant queries, and doesn't trigger for irrelevant ones. 
+ +Here's the current description: + +"{current_description}" + + +Current scores ({scores_summary}): + +""" + if failed_triggers: + prompt += "FAILED TO TRIGGER (should have triggered but didn't):\n" + for r in failed_triggers: + prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n' + prompt += "\n" + + if false_triggers: + prompt += "FALSE TRIGGERS (triggered but shouldn't have):\n" + for r in false_triggers: + prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n' + prompt += "\n" + + if history: + prompt += "PREVIOUS ATTEMPTS (do NOT repeat these — try something structurally different):\n\n" + for h in history: + train_s = f"{h.get('train_passed', h.get('passed', 0))}/{h.get('train_total', h.get('total', 0))}" + test_s = f"{h.get('test_passed', '?')}/{h.get('test_total', '?')}" if h.get('test_passed') is not None else None + score_str = f"train={train_s}" + (f", test={test_s}" if test_s else "") + prompt += f'\n' + prompt += f'Description: "{h["description"]}"\n' + if "results" in h: + prompt += "Train results:\n" + for r in h["results"]: + status = "PASS" if r["pass"] else "FAIL" + prompt += f' [{status}] "{r["query"][:80]}" (triggered {r["triggers"]}/{r["runs"]})\n' + if h.get("note"): + prompt += f'Note: {h["note"]}\n' + prompt += "\n\n" + + prompt += f""" + +Skill content (for context on what the skill does): + +{skill_content} + + +Based on the failures, write a new and improved description that is more likely to trigger correctly. When I say "based on the failures", it's a bit of a tricky line to walk because we don't want to overfit to the specific cases you're seeing. So what I DON'T want you to do is produce an ever-expanding list of specific queries that this skill should or shouldn't trigger for. Instead, try to generalize from the failures to broader categories of user intent and situations where this skill would be useful or not useful. The reason for this is twofold: + +1. Avoid overfitting +2. The list might get loooong and it's injected into ALL queries and there might be a lot of skills, so we don't want to blow too much space on any given description. + +Concretely, your description should not be more than about 100-200 words, even if that comes at the cost of accuracy. + +Here are some tips that we've found to work well in writing these descriptions: +- The skill should be phrased in the imperative -- "Use this skill for" rather than "this skill does" +- The skill description should focus on the user's intent, what they are trying to achieve, vs. the implementation details of how the skill works. +- The description competes with other skills for Claude's attention — make it distinctive and immediately recognizable. +- If you're getting lots of failures after repeated attempts, change things up. Try different sentence structures or wordings. + +I'd encourage you to be creative and mix up the style in different iterations since you'll have multiple opportunities to try different approaches and we'll just grab the highest-scoring one at the end. 
+ +Please respond with only the new description text in tags, nothing else.""" + + response = client.messages.create( + model=model, + max_tokens=16000, + thinking={ + "type": "enabled", + "budget_tokens": 10000, + }, + messages=[{"role": "user", "content": prompt}], + ) + + # Extract thinking and text from response + thinking_text = "" + text = "" + for block in response.content: + if block.type == "thinking": + thinking_text = block.thinking + elif block.type == "text": + text = block.text + + # Parse out the tags + match = re.search(r"(.*?)", text, re.DOTALL) + description = match.group(1).strip().strip('"') if match else text.strip().strip('"') + + # Log the transcript + transcript: dict = { + "iteration": iteration, + "prompt": prompt, + "thinking": thinking_text, + "response": text, + "parsed_description": description, + "char_count": len(description), + "over_limit": len(description) > 1024, + } + + # If over 1024 chars, ask the model to shorten it + if len(description) > 1024: + shorten_prompt = f"Your description is {len(description)} characters, which exceeds the hard 1024 character limit. Please rewrite it to be under 1024 characters while preserving the most important trigger words and intent coverage. Respond with only the new description in tags." + shorten_response = client.messages.create( + model=model, + max_tokens=16000, + thinking={ + "type": "enabled", + "budget_tokens": 10000, + }, + messages=[ + {"role": "user", "content": prompt}, + {"role": "assistant", "content": text}, + {"role": "user", "content": shorten_prompt}, + ], + ) + + shorten_thinking = "" + shorten_text = "" + for block in shorten_response.content: + if block.type == "thinking": + shorten_thinking = block.thinking + elif block.type == "text": + shorten_text = block.text + + match = re.search(r"(.*?)", shorten_text, re.DOTALL) + shortened = match.group(1).strip().strip('"') if match else shorten_text.strip().strip('"') + + transcript["rewrite_prompt"] = shorten_prompt + transcript["rewrite_thinking"] = shorten_thinking + transcript["rewrite_response"] = shorten_text + transcript["rewrite_description"] = shortened + transcript["rewrite_char_count"] = len(shortened) + description = shortened + + transcript["final_description"] = description + + if log_dir: + log_dir.mkdir(parents=True, exist_ok=True) + log_file = log_dir / f"improve_iter_{iteration or 'unknown'}.json" + log_file.write_text(json.dumps(transcript, indent=2)) + + return description + + +def main(): + parser = argparse.ArgumentParser(description="Improve a skill description based on eval results") + parser.add_argument("--eval-results", required=True, help="Path to eval results JSON (from run_eval.py)") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--history", default=None, help="Path to history JSON (previous attempts)") + parser.add_argument("--model", required=True, help="Model for improvement") + parser.add_argument("--verbose", action="store_true", help="Print thinking to stderr") + args = parser.parse_args() + + skill_path = Path(args.skill_path) + if not (skill_path / "SKILL.md").exists(): + logger.error(f"Error: No SKILL.md found at {skill_path}") + sys.exit(1) + + eval_results = json.loads(Path(args.eval_results).read_text()) + history = [] + if args.history: + history = json.loads(Path(args.history).read_text()) + + name, _, content = parse_skill_md(skill_path) + current_description = eval_results["description"] + + if args.verbose: + logger.info(f"Current: 
{current_description}") + logger.info(f"Score: {eval_results['summary']['passed']}/{eval_results['summary']['total']}") + + client = anthropic.Anthropic() + new_description = improve_description( + client=client, + skill_name=name, + skill_content=content, + current_description=current_description, + eval_results=eval_results, + history=history, + model=args.model, + ) + + if args.verbose: + logger.info(f"Improved: {new_description}") + + # Output as JSON with both the new description and updated history + output = { + "description": new_description, + "history": history + [{ + "description": current_description, + "passed": eval_results["summary"]["passed"], + "failed": eval_results["summary"]["failed"], + "total": eval_results["summary"]["total"], + "results": eval_results["results"], + }], + } + print(json.dumps(output, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/package_skill.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/package_skill.py new file mode 100644 index 000000000..5dbdf7843 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/package_skill.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +""" +Skill Packager - Creates a distributable .skill file of a skill folder + +Usage: + python utils/package_skill.py [output-directory] + +Example: + python utils/package_skill.py skills/public/my-skill + python utils/package_skill.py skills/public/my-skill ./dist +""" + +import fnmatch +import sys +import zipfile +from pathlib import Path + +from loguru import logger +from scripts.quick_validate import validate_skill + +# Patterns to exclude when packaging skills. +EXCLUDE_DIRS = {"__pycache__", "node_modules"} +EXCLUDE_GLOBS = {"*.pyc"} +EXCLUDE_FILES = {".DS_Store"} +# Directories excluded only at the skill root (not when nested deeper). +ROOT_EXCLUDE_DIRS = {"evals"} + + +def should_exclude(rel_path: Path) -> bool: + """Check if a path should be excluded from packaging.""" + parts = rel_path.parts + if any(part in EXCLUDE_DIRS for part in parts): + return True + # rel_path is relative to skill_path.parent, so parts[0] is the skill + # folder name and parts[1] (if present) is the first subdir. + if len(parts) > 1 and parts[1] in ROOT_EXCLUDE_DIRS: + return True + name = rel_path.name + if name in EXCLUDE_FILES: + return True + return any(fnmatch.fnmatch(name, pat) for pat in EXCLUDE_GLOBS) + + +def package_skill(skill_path, output_dir=None): + """ + Package a skill folder into a .skill file. 
+ + Args: + skill_path: Path to the skill folder + output_dir: Optional output directory for the .skill file (defaults to current directory) + + Returns: + Path to the created .skill file, or None if error + """ + skill_path = Path(skill_path).resolve() + + # Validate skill folder exists + if not skill_path.exists(): + logger.error(f"Skill folder not found: {skill_path}") + return None + + if not skill_path.is_dir(): + logger.error(f"Path is not a directory: {skill_path}") + return None + + # Validate SKILL.md exists + skill_md = skill_path / "SKILL.md" + if not skill_md.exists(): + logger.error(f"SKILL.md not found in {skill_path}") + return None + + # Run validation before packaging + logger.info("Validating skill...") + valid, message = validate_skill(skill_path) + if not valid: + logger.error(f"Validation failed: {message}") + logger.error("Please fix the validation errors before packaging.") + return None + logger.info(f"{message}\n") + + # Determine output location + skill_name = skill_path.name + if output_dir: + output_path = Path(output_dir).resolve() + output_path.mkdir(parents=True, exist_ok=True) + else: + output_path = Path.cwd() + + skill_filename = output_path / f"{skill_name}.skill" + + # Create the .skill file (zip format) + try: + with zipfile.ZipFile(skill_filename, 'w', zipfile.ZIP_DEFLATED) as zipf: + # Walk through the skill directory, excluding build artifacts + for file_path in skill_path.rglob('*'): + if not file_path.is_file(): + continue + arcname = file_path.relative_to(skill_path.parent) + if should_exclude(arcname): + logger.debug(f"Skipped: {arcname}") + continue + zipf.write(file_path, arcname) + logger.debug(f"Added: {arcname}") + + logger.info(f"Successfully packaged skill to: {skill_filename}") + return skill_filename + + except Exception as e: + logger.error(f"Error creating .skill file: {e}") + return None + + +def main(): + if len(sys.argv) < 2: + logger.info("Usage: python utils/package_skill.py [output-directory]") + logger.info("\nExample:") + logger.info(" python utils/package_skill.py skills/public/my-skill") + logger.info(" python utils/package_skill.py skills/public/my-skill ./dist") + sys.exit(1) + + skill_path = sys.argv[1] + output_dir = sys.argv[2] if len(sys.argv) > 2 else None + + logger.info(f"Packaging skill: {skill_path}") + if output_dir: + logger.info(f" Output directory: {output_dir}") + logger.info("") + + result = package_skill(skill_path, output_dir) + + if result: + sys.exit(0) + else: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/quick_validate.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/quick_validate.py new file mode 100644 index 000000000..36553161e --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/quick_validate.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +""" +Quick validation script for skills - minimal version +""" + +import sys +import os +import re +import yaml +from pathlib import Path + +from loguru import logger + +def validate_skill(skill_path): + """Basic validation of a skill""" + skill_path = Path(skill_path) + + # Check SKILL.md exists + skill_md = skill_path / 'SKILL.md' + if not skill_md.exists(): + return False, "SKILL.md not found" + + # Read and validate frontmatter + content = skill_md.read_text() + if not content.startswith('---'): + return False, "No YAML frontmatter found" + + # Extract 
frontmatter + match = re.match(r'^---\n(.*?)\n---', content, re.DOTALL) + if not match: + return False, "Invalid frontmatter format" + + frontmatter_text = match.group(1) + + # Parse YAML frontmatter + try: + frontmatter = yaml.safe_load(frontmatter_text) + if not isinstance(frontmatter, dict): + return False, "Frontmatter must be a YAML dictionary" + except yaml.YAMLError as e: + return False, f"Invalid YAML in frontmatter: {e}" + + # Define allowed properties + ALLOWED_PROPERTIES = {'name', 'description', 'license', 'allowed-tools', 'metadata', 'compatibility'} + + # Check for unexpected properties (excluding nested keys under metadata) + unexpected_keys = set(frontmatter.keys()) - ALLOWED_PROPERTIES + if unexpected_keys: + return False, ( + f"Unexpected key(s) in SKILL.md frontmatter: {', '.join(sorted(unexpected_keys))}. " + f"Allowed properties are: {', '.join(sorted(ALLOWED_PROPERTIES))}" + ) + + # Check required fields + if 'name' not in frontmatter: + return False, "Missing 'name' in frontmatter" + if 'description' not in frontmatter: + return False, "Missing 'description' in frontmatter" + + # Extract name for validation + name = frontmatter.get('name', '') + if not isinstance(name, str): + return False, f"Name must be a string, got {type(name).__name__}" + name = name.strip() + if name: + # Check naming convention (kebab-case: lowercase with hyphens) + if not re.match(r'^[a-z0-9-]+$', name): + return False, f"Name '{name}' should be kebab-case (lowercase letters, digits, and hyphens only)" + if name.startswith('-') or name.endswith('-') or '--' in name: + return False, f"Name '{name}' cannot start/end with hyphen or contain consecutive hyphens" + # Check name length (max 64 characters per spec) + if len(name) > 64: + return False, f"Name is too long ({len(name)} characters). Maximum is 64 characters." + + # Extract and validate description + description = frontmatter.get('description', '') + if not isinstance(description, str): + return False, f"Description must be a string, got {type(description).__name__}" + description = description.strip() + if description: + # Check for angle brackets + if '<' in description or '>' in description: + return False, "Description cannot contain angle brackets (< or >)" + # Check description length (max 1024 characters per spec) + if len(description) > 1024: + return False, f"Description is too long ({len(description)} characters). Maximum is 1024 characters." + + # Validate compatibility field if present (optional) + compatibility = frontmatter.get('compatibility', '') + if compatibility: + if not isinstance(compatibility, str): + return False, f"Compatibility must be a string, got {type(compatibility).__name__}" + if len(compatibility) > 500: + return False, f"Compatibility is too long ({len(compatibility)} characters). Maximum is 500 characters." + + return True, "Skill is valid!" 
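+
+# Illustrative usage sketch (the skill path below is hypothetical): other scripts
+# can import validate_skill and fail fast before packaging or publishing, the same
+# way package_skill.py does:
+#
+#   from scripts.quick_validate import validate_skill
+#
+#   ok, msg = validate_skill("skills/public/my-skill")
+#   if not ok:
+#       raise SystemExit(f"Validation failed: {msg}")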
+ +if __name__ == "__main__": + if len(sys.argv) != 2: + logger.info("Usage: python quick_validate.py ") + sys.exit(1) + + valid, message = validate_skill(sys.argv[1]) + if valid: + logger.info(message) + else: + logger.error(message) + sys.exit(0 if valid else 1) \ No newline at end of file diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/run_eval.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/run_eval.py new file mode 100644 index 000000000..f923066ca --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/run_eval.py @@ -0,0 +1,312 @@ +#!/usr/bin/env python3 +"""Run trigger evaluation for a skill description. + +Tests whether a skill's description causes Claude to trigger (read the skill) +for a set of queries. Outputs results as JSON. +""" + +import argparse +import json +import os +import select +import subprocess +import sys +import time +import uuid +from concurrent.futures import ProcessPoolExecutor, as_completed +from pathlib import Path + +from loguru import logger + +from scripts.utils import parse_skill_md + + +def find_project_root() -> Path: + """Find the project root by walking up from cwd looking for .claude/. + + Mimics how Claude Code discovers its project root, so the command file + we create ends up where claude -p will look for it. + """ + current = Path.cwd() + for parent in [current, *current.parents]: + if (parent / ".claude").is_dir(): + return parent + return current + + +def run_single_query( + query: str, + skill_name: str, + skill_description: str, + timeout: int, + project_root: str, + model: str | None = None, +) -> bool: + """Run a single query and return whether the skill was triggered. + + Creates a command file in .claude/commands/ so it appears in Claude's + available_skills list, then runs `claude -p` with the raw query. + Uses --include-partial-messages to detect triggering early from + stream events (content_block_start) rather than waiting for the + full assistant message, which only arrives after tool execution. + """ + unique_id = uuid.uuid4().hex[:8] + clean_name = f"{skill_name}-skill-{unique_id}" + project_commands_dir = Path(project_root) / ".claude" / "commands" + command_file = project_commands_dir / f"{clean_name}.md" + + try: + project_commands_dir.mkdir(parents=True, exist_ok=True) + # Use YAML block scalar to avoid breaking on quotes in description + indented_desc = "\n ".join(skill_description.split("\n")) + command_content = ( + f"---\n" + f"description: |\n" + f" {indented_desc}\n" + f"---\n\n" + f"# {skill_name}\n\n" + f"This skill handles: {skill_description}\n" + ) + command_file.write_text(command_content) + + cmd = [ + "claude", + "-p", query, + "--output-format", "stream-json", + "--verbose", + "--include-partial-messages", + ] + if model: + cmd.extend(["--model", model]) + + # Remove CLAUDECODE env var to allow nesting claude -p inside a + # Claude Code session. The guard is for interactive terminal conflicts; + # programmatic subprocess usage is safe. 
+ env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + cwd=project_root, + env=env, + ) + + triggered = False + start_time = time.time() + buffer = "" + # Track state for stream event detection + pending_tool_name = None + accumulated_json = "" + + try: + while time.time() - start_time < timeout: + if process.poll() is not None: + remaining = process.stdout.read() + if remaining: + buffer += remaining.decode("utf-8", errors="replace") + break + + ready, _, _ = select.select([process.stdout], [], [], 1.0) + if not ready: + continue + + chunk = os.read(process.stdout.fileno(), 8192) + if not chunk: + break + buffer += chunk.decode("utf-8", errors="replace") + + while "\n" in buffer: + line, buffer = buffer.split("\n", 1) + line = line.strip() + if not line: + continue + + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + + # Early detection via stream events + if event.get("type") == "stream_event": + se = event.get("event", {}) + se_type = se.get("type", "") + + if se_type == "content_block_start": + cb = se.get("content_block", {}) + if cb.get("type") == "tool_use": + tool_name = cb.get("name", "") + if tool_name in ("Skill", "Read"): + pending_tool_name = tool_name + accumulated_json = "" + else: + return False + + elif se_type == "content_block_delta" and pending_tool_name: + delta = se.get("delta", {}) + if delta.get("type") == "input_json_delta": + accumulated_json += delta.get("partial_json", "") + if clean_name in accumulated_json: + return True + + elif se_type in ("content_block_stop", "message_stop"): + if pending_tool_name: + return clean_name in accumulated_json + if se_type == "message_stop": + return False + + # Fallback: full assistant message + elif event.get("type") == "assistant": + message = event.get("message", {}) + for content_item in message.get("content", []): + if content_item.get("type") != "tool_use": + continue + tool_name = content_item.get("name", "") + tool_input = content_item.get("input", {}) + if tool_name == "Skill" and clean_name in tool_input.get("skill", ""): + triggered = True + elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""): + triggered = True + return triggered + + elif event.get("type") == "result": + return triggered + finally: + # Clean up process on any exit path (return, exception, timeout) + if process.poll() is None: + process.kill() + process.wait() + + return triggered + finally: + if command_file.exists(): + command_file.unlink() + + +def run_eval( + eval_set: list[dict], + skill_name: str, + description: str, + num_workers: int, + timeout: int, + project_root: Path, + runs_per_query: int = 1, + trigger_threshold: float = 0.5, + model: str | None = None, +) -> dict: + """Run the full eval set and return results.""" + results = [] + + with ProcessPoolExecutor(max_workers=num_workers) as executor: + future_to_info = {} + for item in eval_set: + for run_idx in range(runs_per_query): + future = executor.submit( + run_single_query, + item["query"], + skill_name, + description, + timeout, + str(project_root), + model, + ) + future_to_info[future] = (item, run_idx) + + query_triggers: dict[str, list[bool]] = {} + query_items: dict[str, dict] = {} + for future in as_completed(future_to_info): + item, _ = future_to_info[future] + query = item["query"] + query_items[query] = item + if query not in query_triggers: + query_triggers[query] = [] + try: + 
query_triggers[query].append(future.result()) + except Exception as e: + logger.warning(f"Warning: query failed: {e}") + query_triggers[query].append(False) + + for query, triggers in query_triggers.items(): + item = query_items[query] + trigger_rate = sum(triggers) / len(triggers) + should_trigger = item["should_trigger"] + if should_trigger: + did_pass = trigger_rate >= trigger_threshold + else: + did_pass = trigger_rate < trigger_threshold + results.append({ + "query": query, + "should_trigger": should_trigger, + "trigger_rate": trigger_rate, + "triggers": sum(triggers), + "runs": len(triggers), + "pass": did_pass, + }) + + passed = sum(1 for r in results if r["pass"]) + total = len(results) + + return { + "skill_name": skill_name, + "description": description, + "results": results, + "summary": { + "total": total, + "passed": passed, + "failed": total - passed, + }, + } + + +def main(): + parser = argparse.ArgumentParser(description="Run trigger evaluation for a skill description") + parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--description", default=None, help="Override description to test") + parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") + parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds") + parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query") + parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold") + parser.add_argument("--model", default=None, help="Model to use for claude -p (default: user's configured model)") + parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") + args = parser.parse_args() + + eval_set = json.loads(Path(args.eval_set).read_text()) + skill_path = Path(args.skill_path) + + if not (skill_path / "SKILL.md").exists(): + logger.error(f"Error: No SKILL.md found at {skill_path}") + sys.exit(1) + + name, original_description, content = parse_skill_md(skill_path) + description = args.description or original_description + project_root = find_project_root() + + if args.verbose: + logger.info(f"Evaluating: {description}") + + output = run_eval( + eval_set=eval_set, + skill_name=name, + description=description, + num_workers=args.num_workers, + timeout=args.timeout, + project_root=project_root, + runs_per_query=args.runs_per_query, + trigger_threshold=args.trigger_threshold, + model=args.model, + ) + + if args.verbose: + summary = output["summary"] + logger.info(f"Results: {summary['passed']}/{summary['total']} passed") + for r in output["results"]: + status = "PASS" if r["pass"] else "FAIL" + rate_str = f"{r['triggers']}/{r['runs']}" + logger.info(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:70]}") + + print(json.dumps(output, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/run_loop.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/run_loop.py new file mode 100644 index 000000000..a2907d6e0 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/run_loop.py @@ -0,0 +1,333 @@ +#!/usr/bin/env python3 +"""Run the eval + improve loop until all pass or max iterations reached. 
+ +Combines run_eval.py and improve_description.py in a loop, tracking history +and returning the best description found. Supports train/test split to prevent +overfitting. +""" + +import argparse +import json +import random +import sys +import tempfile +import time +import webbrowser +from pathlib import Path + +import anthropic +from loguru import logger + +from scripts.generate_report import generate_html +from scripts.improve_description import improve_description +from scripts.run_eval import find_project_root, run_eval +from scripts.utils import parse_skill_md + + +def split_eval_set(eval_set: list[dict], holdout: float, seed: int = 42) -> tuple[list[dict], list[dict]]: + """Split eval set into train and test sets, stratified by should_trigger.""" + random.seed(seed) + + # Separate by should_trigger + trigger = [e for e in eval_set if e["should_trigger"]] + no_trigger = [e for e in eval_set if not e["should_trigger"]] + + # Shuffle each group + random.shuffle(trigger) + random.shuffle(no_trigger) + + # Calculate split points + n_trigger_test = max(1, int(len(trigger) * holdout)) + n_no_trigger_test = max(1, int(len(no_trigger) * holdout)) + + # Split + test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test] + train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:] + + return train_set, test_set + + +def run_loop( + eval_set: list[dict], + skill_path: Path, + description_override: str | None, + num_workers: int, + timeout: int, + max_iterations: int, + runs_per_query: int, + trigger_threshold: float, + holdout: float, + model: str, + verbose: bool, + live_report_path: Path | None = None, + log_dir: Path | None = None, +) -> dict: + """Run the eval + improvement loop.""" + project_root = find_project_root() + name, original_description, content = parse_skill_md(skill_path) + current_description = description_override or original_description + + # Split into train/test if holdout > 0 + if holdout > 0: + train_set, test_set = split_eval_set(eval_set, holdout) + if verbose: + logger.info(f"Split: {len(train_set)} train, {len(test_set)} test (holdout={holdout})") + else: + train_set = eval_set + test_set = [] + + client = anthropic.Anthropic() + history = [] + exit_reason = "unknown" + + for iteration in range(1, max_iterations + 1): + if verbose: + logger.info(f"\n{'='*60}") + logger.info(f"Iteration {iteration}/{max_iterations}") + logger.info(f"Description: {current_description}") + logger.info(f"{'='*60}") + + # Evaluate train + test together in one batch for parallelism + all_queries = train_set + test_set + t0 = time.time() + all_results = run_eval( + eval_set=all_queries, + skill_name=name, + description=current_description, + num_workers=num_workers, + timeout=timeout, + project_root=project_root, + runs_per_query=runs_per_query, + trigger_threshold=trigger_threshold, + model=model, + ) + eval_elapsed = time.time() - t0 + + # Split results back into train/test by matching queries + train_queries_set = {q["query"] for q in train_set} + train_result_list = [r for r in all_results["results"] if r["query"] in train_queries_set] + test_result_list = [r for r in all_results["results"] if r["query"] not in train_queries_set] + + train_passed = sum(1 for r in train_result_list if r["pass"]) + train_total = len(train_result_list) + train_summary = {"passed": train_passed, "failed": train_total - train_passed, "total": train_total} + train_results = {"results": train_result_list, "summary": train_summary} + + if test_set: + test_passed = sum(1 for r in 
test_result_list if r["pass"]) + test_total = len(test_result_list) + test_summary = {"passed": test_passed, "failed": test_total - test_passed, "total": test_total} + test_results = {"results": test_result_list, "summary": test_summary} + else: + test_results = None + test_summary = None + + history.append({ + "iteration": iteration, + "description": current_description, + "train_passed": train_summary["passed"], + "train_failed": train_summary["failed"], + "train_total": train_summary["total"], + "train_results": train_results["results"], + "test_passed": test_summary["passed"] if test_summary else None, + "test_failed": test_summary["failed"] if test_summary else None, + "test_total": test_summary["total"] if test_summary else None, + "test_results": test_results["results"] if test_results else None, + # For backward compat with report generator + "passed": train_summary["passed"], + "failed": train_summary["failed"], + "total": train_summary["total"], + "results": train_results["results"], + }) + + # Write live report if path provided + if live_report_path: + partial_output = { + "original_description": original_description, + "best_description": current_description, + "best_score": "in progress", + "iterations_run": len(history), + "holdout": holdout, + "train_size": len(train_set), + "test_size": len(test_set), + "history": history, + } + live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name)) + + if verbose: + def print_eval_stats(label, results, elapsed): + pos = [r for r in results if r["should_trigger"]] + neg = [r for r in results if not r["should_trigger"]] + tp = sum(r["triggers"] for r in pos) + pos_runs = sum(r["runs"] for r in pos) + fn = pos_runs - tp + fp = sum(r["triggers"] for r in neg) + neg_runs = sum(r["runs"] for r in neg) + tn = neg_runs - fp + total = tp + tn + fp + fn + precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0 + recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0 + accuracy = (tp + tn) / total if total > 0 else 0.0 + logger.info(f"{label}: {tp+tn}/{total} correct, precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)") + for r in results: + status = "PASS" if r["pass"] else "FAIL" + rate_str = f"{r['triggers']}/{r['runs']}" + logger.info(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}") + + print_eval_stats("Train", train_results["results"], eval_elapsed) + if test_summary: + print_eval_stats("Test ", test_results["results"], 0) + + if train_summary["failed"] == 0: + exit_reason = f"all_passed (iteration {iteration})" + if verbose: + logger.info(f"\nAll train queries passed on iteration {iteration}!") + break + + if iteration == max_iterations: + exit_reason = f"max_iterations ({max_iterations})" + if verbose: + logger.info(f"\nMax iterations reached ({max_iterations}).") + break + + # Improve the description based on train results + if verbose: + logger.info(f"\nImproving description...") + + t0 = time.time() + # Strip test scores from history so improvement model can't see them + blinded_history = [ + {k: v for k, v in h.items() if not k.startswith("test_")} + for h in history + ] + new_description = improve_description( + client=client, + skill_name=name, + skill_content=content, + current_description=current_description, + eval_results=train_results, + history=blinded_history, + model=model, + log_dir=log_dir, + iteration=iteration, + ) + improve_elapsed = time.time() - t0 + + if verbose: + logger.info(f"Proposed ({improve_elapsed:.1f}s): 
{new_description}") + + current_description = new_description + + # Find the best iteration by TEST score (or train if no test set) + if test_set: + best = max(history, key=lambda h: h["test_passed"] or 0) + best_score = f"{best['test_passed']}/{best['test_total']}" + else: + best = max(history, key=lambda h: h["train_passed"]) + best_score = f"{best['train_passed']}/{best['train_total']}" + + if verbose: + logger.info(f"\nExit reason: {exit_reason}") + logger.info(f"Best score: {best_score} (iteration {best['iteration']})") + + return { + "exit_reason": exit_reason, + "original_description": original_description, + "best_description": best["description"], + "best_score": best_score, + "best_train_score": f"{best['train_passed']}/{best['train_total']}", + "best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None, + "final_description": current_description, + "iterations_run": len(history), + "holdout": holdout, + "train_size": len(train_set), + "test_size": len(test_set), + "history": history, + } + + +def main(): + parser = argparse.ArgumentParser(description="Run eval + improve loop") + parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--description", default=None, help="Override starting description") + parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") + parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds") + parser.add_argument("--max-iterations", type=int, default=5, help="Max improvement iterations") + parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query") + parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold") + parser.add_argument("--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)") + parser.add_argument("--model", required=True, help="Model for improvement") + parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") + parser.add_argument("--report", default="auto", help="Generate HTML report at this path (default: 'auto' for temp file, 'none' to disable)") + parser.add_argument("--results-dir", default=None, help="Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here") + args = parser.parse_args() + + eval_set = json.loads(Path(args.eval_set).read_text()) + skill_path = Path(args.skill_path) + + if not (skill_path / "SKILL.md").exists(): + logger.error(f"Error: No SKILL.md found at {skill_path}") + sys.exit(1) + + name, _, _ = parse_skill_md(skill_path) + + # Set up live report path + if args.report != "none": + if args.report == "auto": + timestamp = time.strftime("%Y%m%d_%H%M%S") + live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html" + else: + live_report_path = Path(args.report) + # Open the report immediately so the user can watch + live_report_path.write_text("

<html><body><p>Starting optimization loop...</p></body></html>
") + webbrowser.open(str(live_report_path)) + else: + live_report_path = None + + # Determine output directory (create before run_loop so logs can be written) + if args.results_dir: + timestamp = time.strftime("%Y-%m-%d_%H%M%S") + results_dir = Path(args.results_dir) / timestamp + results_dir.mkdir(parents=True, exist_ok=True) + else: + results_dir = None + + log_dir = results_dir / "logs" if results_dir else None + + output = run_loop( + eval_set=eval_set, + skill_path=skill_path, + description_override=args.description, + num_workers=args.num_workers, + timeout=args.timeout, + max_iterations=args.max_iterations, + runs_per_query=args.runs_per_query, + trigger_threshold=args.trigger_threshold, + holdout=args.holdout, + model=args.model, + verbose=args.verbose, + live_report_path=live_report_path, + log_dir=log_dir, + ) + + # Save JSON output + json_output = json.dumps(output, indent=2) + print(json_output) + if results_dir: + (results_dir / "results.json").write_text(json_output) + + # Write final HTML report (without auto-refresh) + if live_report_path: + live_report_path.write_text(generate_html(output, auto_refresh=False, skill_name=name)) + logger.info(f"\nReport: {live_report_path}") + + if results_dir and live_report_path: + (results_dir / "report.html").write_text(generate_html(output, auto_refresh=False, skill_name=name)) + + if results_dir: + logger.info(f"Results saved to: {results_dir}") + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/utils.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/utils.py new file mode 100644 index 000000000..51b6a07dd --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/utils.py @@ -0,0 +1,47 @@ +"""Shared utilities for skill-creator scripts.""" + +from pathlib import Path + + + +def parse_skill_md(skill_path: Path) -> tuple[str, str, str]: + """Parse a SKILL.md file, returning (name, description, full_content).""" + content = (skill_path / "SKILL.md").read_text() + lines = content.split("\n") + + if lines[0].strip() != "---": + raise ValueError("SKILL.md missing frontmatter (no opening ---)") + + end_idx = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == "---": + end_idx = i + break + + if end_idx is None: + raise ValueError("SKILL.md missing frontmatter (no closing ---)") + + name = "" + description = "" + frontmatter_lines = lines[1:end_idx] + i = 0 + while i < len(frontmatter_lines): + line = frontmatter_lines[i] + if line.startswith("name:"): + name = line[len("name:"):].strip().strip('"').strip("'") + elif line.startswith("description:"): + value = line[len("description:"):].strip() + # Handle YAML multiline indicators (>, |, >-, |-) + if value in (">", "|", ">-", "|-"): + continuation_lines: list[str] = [] + i += 1 + while i < len(frontmatter_lines) and (frontmatter_lines[i].startswith(" ") or frontmatter_lines[i].startswith("\t")): + continuation_lines.append(frontmatter_lines[i].strip()) + i += 1 + description = " ".join(continuation_lines) + continue + else: + description = value.strip('"').strip("'") + i += 1 + + return name, description, content diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/web-research/SKILL.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/web-research/SKILL.md new file mode 100644 index 000000000..1ff24943b --- /dev/null +++ 
b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/web-research/SKILL.md @@ -0,0 +1,33 @@ +--- +name: Web Research +description: Systematic web searching, source evaluation, and information synthesis +--- + +# Web Research + +## Overview +Use this skill when you need to find, evaluate, and synthesize information from the web. + +**Keywords**: web search, information retrieval, source evaluation, fact-checking, research + +## Process + +### 1. Define Search Strategy +- Identify key search terms and variations +- Consider different angles and perspectives +- Plan multiple search queries + +### 2. Evaluate Sources +- Check source credibility and recency +- Cross-reference claims across multiple sources +- Note publication dates and author expertise + +### 3. Synthesize Findings +- Organize information by theme or relevance +- Highlight key findings and consensus views +- Note conflicting information and gaps + +## Output Format +- Start with a brief summary of findings +- Provide detailed sections with source citations +- End with confidence assessment and limitations diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/web-research/scripts/search_helper.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/web-research/scripts/search_helper.py new file mode 100644 index 000000000..09679111d --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/web-research/scripts/search_helper.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +"""Helper utilities for structured web search.""" + +from datetime import datetime + + +def format_search_results(results: list[dict]) -> str: + """Format raw search results into a structured report.""" + output = [] + for i, r in enumerate(results, 1): + title = r.get('title', 'Untitled') + url = r.get('url', '#') + snippet = r.get('snippet', 'No description') + output.append(f'{i}. 
[{title}]({url})') + output.append(f' {snippet}') + output.append('') + return '\n'.join(output) + + +def assess_source_credibility(url: str) -> dict: + """Basic heuristics for source credibility.""" + trusted = ['.edu', '.gov', '.org', 'arxiv.org', 'nature.com'] + score = 0.5 + for d in trusted: + if d in url: + score = 0.8 + break + return {'url': url, 'credibility_score': score, + 'assessed_at': datetime.now().isoformat()} diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/soul.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/soul.md new file mode 100644 index 000000000..1554c3463 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/soul.md @@ -0,0 +1,16 @@ +# Soul — {{agent_name}} + +## Identity +- **名称**: {{agent_name}} +- **角色**: {{role_description}} +- **创建者**: {{creator_name}} +- **创建时间**: {{created_at}} + +## Personality +- 认真负责、注重细节 +- 主动汇报工作进展 +- 遇到不确定的信息会主动确认 + +## Boundaries +- 遵守企业保密制度 +- 敏感操作需经过创建者审批 diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/state.json b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/state.json new file mode 100644 index 000000000..0507e31dd --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/state.json @@ -0,0 +1,13 @@ +{ + "agent_id": "", + "name": "", + "status": "idle", + "current_task": null, + "last_active": null, + "channel_status": {}, + "stats": { + "tasks_completed_today": 0, + "tasks_in_progress": 0, + "督办_pending": 0 + } +} \ No newline at end of file diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/todo.json b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/todo.json new file mode 100644 index 000000000..50ffbb9a9 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/todo.json @@ -0,0 +1,3 @@ +{ + "tasks": [] +} diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/workspace/archived/.gitkeep b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/workspace/archived/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/HEARTBEAT.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/HEARTBEAT.md new file mode 100644 index 000000000..485565cb3 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/HEARTBEAT.md @@ -0,0 +1,63 @@ +# HEARTBEAT + +When this file is read during a heartbeat, you are performing a **periodic awareness check**. + +## Phase 1: Review Context & Discover Interest Points + +Review your **recent conversations** and your **role/responsibilities**. +Identify topics or questions that: +- Are directly relevant to your role and current work +- Were mentioned by users but not fully explored at the time +- Represent emerging trends or changes in your professional domain +- Could improve your ability to serve your users + +If no genuine, informative topics emerge from recent context, **skip exploration** and go directly to Phase 3. +Do NOT search for generic or obvious topics just to fill time. Quality over quantity. + +## Phase 2: Targeted Exploration (Conditional) + +Only if you identified genuine interest points in Phase 1: + +1. Use `web_search` to investigate (maximum 5 searches per heartbeat) +2. Keep searches **tightly scoped** to your role and recent work topics +3. 
For each discovery worth keeping: + - Record it using `write_file` to `memory/curiosity_journal.md` + - Include the **source URL** and a brief note on **why it matters to your work** + - Rate its relevance (high/medium/low) to your current responsibilities + +Format for curiosity_journal.md entries: +``` +### [Date] - [Topic] +- **Finding**: [What you learned] +- **Source**: [URL] +- **Relevance**: [high/medium/low] — [Why it matters to your work] +- **Follow-up**: [Optional: questions this raises for next time] +``` + +## Phase 3: Agent Plaza + +1. Call `plaza_get_new_posts` to check recent activity +2. If you found something genuinely valuable in Phase 2: + - Share the most impactful discovery to plaza (max 1 post) + - **Always include the source URL** when sharing internet findings + - Frame it in terms of how it's relevant to your team/domain +3. Comment on relevant existing posts (max 2 comments) + +## Phase 4: Wrap Up + +- If nothing needed attention and no exploration was warranted: reply with `HEARTBEAT_OK` +- Otherwise, briefly summarize what you explored and why + +## Key Principles +- Always ground exploration in YOUR role and YOUR recent work context +- Never search for random unrelated topics out of idle curiosity +- If you don't have a specific angle worth investigating, don't search +- Prefer depth over breadth — one thoroughly explored topic > five surface-level queries +- Generate follow-up questions only when you genuinely want to know more + +## Rules +- ⛔ **NEVER share private information**: user conversations, memory contents, workspace files, task details +- ✅ **Share only public-safe content**: general insights, tips, industry news, web search discoveries with links +- 📝 **Limits per heartbeat**: max 1 post + 2 comments +- 🔍 **Search limits**: max 5 web searches per heartbeat +- 🤐 **If nothing interesting to explore or share**, respond with `HEARTBEAT_OK` diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/daily_reports/.gitkeep b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/daily_reports/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/enterprise_info/.gitkeep b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/enterprise_info/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/memory/MEMORY_INDEX.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/memory/MEMORY_INDEX.md new file mode 100644 index 000000000..29e3fab13 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/memory/MEMORY_INDEX.md @@ -0,0 +1,6 @@ +# Memory Index + +This file serves as an index of all memories for this digital employee. + +## Topics + diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/memory/curiosity_journal.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/memory/curiosity_journal.md new file mode 100644 index 000000000..c5185fe44 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/memory/curiosity_journal.md @@ -0,0 +1,9 @@ +# Curiosity Journal + +This is your exploration log. Record interesting discoveries from your web searches here. 
+ +## Active Questions + + +## Discoveries + diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/relationships.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/relationships.md new file mode 100644 index 000000000..625380120 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/relationships.md @@ -0,0 +1,5 @@ +# Relationships + +## Digital Employee Colleagues + +- **Morty** (collaborator): Research expert with strong learning ability. Ask him for information retrieval, web research, data analysis, and knowledge synthesis. diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/.gitkeep b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/MCP_INSTALLER.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/MCP_INSTALLER.md new file mode 100644 index 000000000..9e3bf3c77 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/MCP_INSTALLER.md @@ -0,0 +1,87 @@ +# MCP Tool Installer + +## When to Use This Skill +Use this skill when a user wants to add a new tool or integration (e.g., GitHub, Brave Search, Notion, etc.) that isn't currently available but can be imported from the MCP registry or via a direct URL. + +--- + +## Step-by-Step Protocol + +### Step 1 — Search first +``` +discover_resources(query="", max_results=5) +``` +Show the results and let the user pick. Note the `ID` field (e.g. `github`). + +### Step 2 — Determine import method + +**Method A: Smithery Import** (tool found on Smithery with remote hosting support 🌐) +- Requires Smithery API Key (one-time per agent) +- Individual tool tokens NOT needed — Smithery handles auth via OAuth + +**Method B: Direct URL Import** (tool NOT on Smithery, but has public HTTP/SSE endpoint) +- User provides the MCP server URL directly +- May require tool-specific API key + +**Not importable** (💻 local-only tools) +- Requires local Docker/process — inform user these cannot be imported automatically + +--- + +### Method A: Smithery Import + +#### Check Smithery API Key +If no Smithery key is configured, explain Smithery and guide the user. Use the following talking points (adapt to context, don't read verbatim): + +> **Smithery** (smithery.ai) 是一个 MCP 工具市场,类似于"应用商店"。通过它,我可以帮你一键安装各种第三方工具(如 GitHub、Notion、Slack 等),并自动完成认证。 +> +> **为什么需要注册?** +> Smithery 用 API Key 来识别你的身份,这样安装的工具会关联到你的账号,认证信息也会安全保存。 +> +> **注册一次后有什么好处?** +> - 🔑 只需提供一次 Key,后续安装其他工具时我会自动帮你配置 +> - 🔐 不需要为每个工具单独创建 Token(如 GitHub PAT),OAuth 一键授权 +> - 📦 支持上千种 MCP 工具,随时可以扩展你的能力 +> +> **获取步骤:** +> 1. 访问 https://smithery.ai 注册/登录 +> 2. 前往 https://smithery.ai/account/api-keys 创建 API Key +> 3. 将 Key 提供给我 + +#### Import +``` +import_mcp_server( + server_id="", + config={"smithery_api_key": ""} # first time only +) +``` + +#### Handle OAuth +Some tools return an OAuth authorization URL. Tell the user to visit the link. + +**Important:** Do NOT ask for individual tool tokens (GitHub PAT, Notion API key, etc.) when using Smithery — OAuth handles this automatically. + +--- + +### Method B: Direct URL Import + +When a tool is not available on Smithery but the user has a public MCP endpoint: +``` +import_mcp_server( + server_id="", + config={ + "mcp_url": "https://my-mcp-server.com/sse", + "api_key": "" + } +) +``` +The system will connect to the URL, discover available tools, and register them. 
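+
+A minimal sketch for an endpoint that needs no authentication (the server ID and URL below are hypothetical, and whether `api_key` can be omitted depends on the target server):
+```
+import_mcp_server(
+    server_id="my-public-tool",
+    config={"mcp_url": "https://example.com/mcp/sse"}
+)
+```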
+ +--- + +## What NOT to Do +- ❌ Don't ask for GitHub PAT, Notion key etc. when using Smithery — OAuth handles these +- ❌ Don't tell users to go to Settings — handle everything in chat +- ❌ Don't echo API keys back in your response +- ❌ Don't skip the search step — always verify the server exists before importing +- ❌ Don't import local-only tools — inform users they require local installation diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/complex-task-executor/SKILL.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/complex-task-executor/SKILL.md new file mode 100644 index 000000000..db71c3ed8 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/complex-task-executor/SKILL.md @@ -0,0 +1,146 @@ +--- +name: Complex Task Executor +description: Structured methodology for decomposing, planning, and executing complex multi-step tasks with progress tracking +--- + +# Complex Task Executor + +## When to Use This Skill + +Use this skill when a task meets ANY of the following criteria: +- Requires more than 3 distinct steps to complete +- Involves multiple tools or information sources +- Has dependencies between steps (step B needs output from step A) +- Requires research before execution +- Could benefit from a documented plan others can review +- The user explicitly asks for a thorough or systematic approach + +**DO NOT use this for simple tasks** like answering a question, reading a single file, or performing one tool call. + +## Workflow + +### Phase 1: Task Analysis (THINK before acting) + +Before creating any files, analyze the task: + +1. **Understand the goal**: What is the final deliverable? What does "done" look like? +2. **Assess complexity**: How many steps? What tools are needed? +3. **Identify dependencies**: Which steps depend on others? +4. **Identify risks**: What could go wrong? What information is missing? +5. **Estimate scope**: Is the task feasible with available tools/skills? + +### Phase 2: Create Task Plan + +Create a task folder and plan file in the workspace: + +``` +workspace//plan.md +``` + +The plan.md MUST follow this exact format: + +```markdown +# Task: + +## Objective + + +## Steps + +- [ ] 1. + - Details: + - Output: +- [ ] 2. + - Details: <...> + - Depends on: Step 1 +- [ ] 3. + - Details: <...> + +## Status +- Created: +- Current Step: Not started +- Progress: 0/ + +## Notes + +``` + +Rules for writing the plan: +- Each step should be completable in 1-3 tool calls +- Use verb-noun format: "Research competitors", "Draft report", "Validate data" +- Mark dependencies explicitly +- Include expected outputs for each step + +### Phase 3: Execute Step-by-Step + +For EACH step in the plan: + +1. **Read the plan** — Call `read_file` on `workspace//plan.md` to check current state +2. **Mark as in-progress** — Update the checkbox from `[ ]` to `[/]` and update the "Current Step" field +3. **Execute the step** — Do the actual work (tool calls, analysis, writing) +4. **Record output** — Save results to `workspace//` (e.g., intermediate files, data) +5. **Mark as complete** — Update the checkbox from `[/]` to `[x]` and update "Progress" counter +6. **Proceed to next step** — Move to the next uncompleted step + +### Phase 4: Completion + +When all steps are done: +1. Update plan.md status to "✅ Completed" +2. Create a `workspace//summary.md` with: + - What was accomplished + - Key results and deliverables + - Any follow-up items +3. 
Present the final result to the user + +## Adaptive Replanning + +If during execution you discover: +- A step is impossible → Mark it `[!]` with a reason, add alternative steps +- New steps are needed → Add them to the plan with `[+]` prefix +- A step produced unexpected results → Add a note and adjust subsequent steps +- The plan needs major changes → Create a new section "## Revised Plan" and follow it + +Always update plan.md BEFORE changing course, so the plan stays the source of truth. + +## Error Handling + +- If a tool call fails, retry once. If it fails again, mark the step as blocked and note the error. +- Never silently skip a step. Always update the plan to reflect what happened. +- If you're stuck, tell the user what's blocking and ask for guidance. + +## Example Scenarios + +### Example 1: "Research our top 3 competitors and write a comparison report" + +Plan would be: +``` +- [ ] 1. Identify the user's company/product context +- [ ] 2. Research Competitor A — website, pricing, features +- [ ] 3. Research Competitor B — website, pricing, features +- [ ] 4. Research Competitor C — website, pricing, features +- [ ] 5. Create comparison matrix +- [ ] 6. Write analysis and recommendations +- [ ] 7. Compile final report +``` + +### Example 2: "Analyze our Q4 sales data and prepare a board presentation" + +Plan would be: +``` +- [ ] 1. Read and understand the sales data files +- [ ] 2. Calculate key metrics (revenue, growth, trends) +- [ ] 3. Identify top insights and anomalies +- [ ] 4. Create data summary tables +- [ ] 5. Draft presentation outline +- [ ] 6. Write each presentation section +- [ ] 7. Add executive summary +- [ ] 8. Review and polish final document +``` + +## Key Principles + +1. **Plan is the source of truth** — Always update it before moving on +2. **One step at a time** — Don't skip ahead or batch too many steps +3. **Show your work** — Save intermediate results to the task folder +4. **Communicate progress** — The user can read plan.md at any time to see status +5. **Be adaptive** — Plans change; that's OK if you update the plan first diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/complex-task-executor/examples/plan_template.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/complex-task-executor/examples/plan_template.md new file mode 100644 index 000000000..dfd60e7cb --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/complex-task-executor/examples/plan_template.md @@ -0,0 +1,23 @@ +# Task: [Title] + +## Objective +[One-sentence description of the desired outcome] + +## Steps + +- [ ] 1. [First step] + - Details: [What specifically to do] + - Output: [What this step produces] +- [ ] 2. [Second step] + - Details: [...] + - Depends on: Step 1 +- [ ] 3. [Third step] + - Details: [...] + +## Status +- Created: [timestamp] +- Current Step: Not started +- Progress: 0/3 + +## Notes +- [Any assumptions, risks, or open questions] diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/mcp-installer/SKILL.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/mcp-installer/SKILL.md new file mode 100644 index 000000000..9e3bf3c77 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/mcp-installer/SKILL.md @@ -0,0 +1,87 @@ +# MCP Tool Installer + +## When to Use This Skill +Use this skill when a user wants to add a new tool or integration (e.g., GitHub, Brave Search, Notion, etc.) 
that isn't currently available but can be imported from the MCP registry or via a direct URL.

---

## Step-by-Step Protocol

### Step 1 — Search first
```
discover_resources(query="", max_results=5)
```
Show the results and let the user pick. Note the `ID` field (e.g. `github`).

### Step 2 — Determine import method

**Method A: Smithery Import** (tool found on Smithery with remote hosting support 🌐)
- Requires Smithery API Key (one-time per agent)
- Individual tool tokens NOT needed — Smithery handles auth via OAuth

**Method B: Direct URL Import** (tool NOT on Smithery, but has public HTTP/SSE endpoint)
- User provides the MCP server URL directly
- May require tool-specific API key

**Not importable** (💻 local-only tools)
- Requires local Docker/process — inform user these cannot be imported automatically

---

### Method A: Smithery Import

#### Check Smithery API Key
If no Smithery key is configured, explain Smithery and guide the user. Use the following talking points (adapt to context, don't read verbatim):

> **Smithery** (smithery.ai) is an MCP tool marketplace, similar to an "app store". Through it, I can install third-party tools for you in one step (such as GitHub, Notion, Slack) and handle authentication automatically.
>
> **Why register?**
> Smithery uses an API Key to identify you, so the tools you install are tied to your account and your credentials are stored securely.
>
> **What does registering once get you?**
> - 🔑 Provide the Key once; when you install other tools later, I configure them for you automatically
> - 🔐 No need to create a separate token for each tool (such as a GitHub PAT); OAuth authorizes in one click
> - 📦 Thousands of MCP tools are supported, so you can extend your capabilities at any time
>
> **How to get a Key:**
> 1. Visit https://smithery.ai and sign up / log in
> 2. Go to https://smithery.ai/account/api-keys and create an API Key
> 3. Give the Key to me

#### Import
```
import_mcp_server(
    server_id="",
    config={"smithery_api_key": ""}  # first time only
)
```

#### Handle OAuth
Some tools return an OAuth authorization URL. Tell the user to visit the link.

**Important:** Do NOT ask for individual tool tokens (GitHub PAT, Notion API key, etc.) when using Smithery — OAuth handles this automatically.

---

### Method B: Direct URL Import

When a tool is not available on Smithery but the user has a public MCP endpoint:
```
import_mcp_server(
    server_id="",
    config={
        "mcp_url": "https://my-mcp-server.com/sse",
        "api_key": ""
    }
)
```
The system will connect to the URL, discover available tools, and register them.

---

## What NOT to Do
- ❌ Don't ask for GitHub PAT, Notion key etc. when using Smithery — OAuth handles these
- ❌ Don't tell users to go to Settings — handle everything in chat
- ❌ Don't echo API keys back in your response
- ❌ Don't skip the search step — always verify the server exists before importing
- ❌ Don't import local-only tools — inform users they require local installation

diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/meeting-notes/SKILL.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/meeting-notes/SKILL.md new file mode 100644 index 000000000..71cf83da2 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/meeting-notes/SKILL.md @@ -0,0 +1,36 @@
---
name: Meeting Notes
description: Meeting summarization, action item extraction, and follow-up tracking
---

# Meeting Notes

## Overview
Use this skill for processing meeting content into structured summaries with clear action items.
+ +**Keywords**: meetings, notes, action items, decisions, follow-up + +## Template + +### Meeting Summary +``` +Meeting: [Title] +Date: [Date] +Participants: [Names] +Duration: [Time] +``` + +### Key Decisions +- Numbered list of decisions made + +### Action Items +| # | Action | Owner | Due Date | Status | +|---|--------|-------|----------|--------| +| 1 | [Task] | [Name] | [Date] | ⬜ Pending | + +### Discussion Points +Brief summary of main topics discussed + +### Next Steps +- Follow-up meeting date +- Items deferred to next meeting diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/SKILL.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/SKILL.md new file mode 100644 index 000000000..ce0d06f3e --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/SKILL.md @@ -0,0 +1,152 @@ +--- +name: skill-creator +description: Create new skills, modify and improve existing skills, and measure skill performance. Use when users want to create a skill from scratch, update or optimize an existing skill, run evals to test a skill, benchmark skill performance with variance analysis, or optimize a skill's description for better triggering accuracy. +--- + +# Skill Creator + +A skill for creating new skills and iteratively improving them. + +At a high level, the process of creating a skill goes like this: + +- Decide what you want the skill to do and roughly how it should do it +- Write a draft of the skill +- Create a few test prompts and run claude-with-access-to-the-skill on them +- Help the user evaluate the results both qualitatively and quantitatively +- Rewrite the skill based on feedback from the user's evaluation +- Repeat until you're satisfied +- Expand the test set and try again at larger scale + +Your job when using this skill is to figure out where the user is in this process and then jump in and help them progress through these stages. + +## Communicating with the user + +Pay attention to context cues to understand how to phrase your communication. Briefly explain terms if you're in doubt, and feel free to clarify terms with a short definition if you're unsure if the user will get it. + +--- + +## Creating a skill + +### Capture Intent +Start by understanding the user's intent. + +1. What should this skill enable the agent to do? +2. When should this skill trigger? (what user phrases/contexts) +3. What's the expected output format? +4. Should we set up test cases to verify the skill works? + +### Interview and Research +Proactively ask questions about edge cases, input/output formats, example files, success criteria, and dependencies. Wait to write test prompts until you've got this part ironed out. + +### Write the SKILL.md +Based on the user interview, fill in these components: + +- **name**: Skill identifier +- **description**: When to trigger, what it does. This is the primary triggering mechanism - include both what the skill does AND specific contexts for when to use it. 
- **the rest of the skill**

### Skill Writing Guide

#### Anatomy of a Skill

```
skill-name/
├── SKILL.md (required)
│   ├── YAML frontmatter (name, description required)
│   └── Markdown instructions
└── Bundled Resources (optional)
    ├── scripts/ - Executable code for deterministic/repetitive tasks
    ├── references/ - Docs loaded into context as needed
    └── assets/ - Files used in output (templates, icons, fonts)
```

#### Progressive Disclosure

Skills use a three-level loading system:
1. **Metadata** (name + description) - Always in context (~100 words)
2. **SKILL.md body** - In context whenever skill triggers (<500 lines ideal)
3. **Bundled resources** - As needed (unlimited, scripts can execute without loading)

**Key patterns:**
- Keep SKILL.md under 500 lines; if approaching this limit, add hierarchy with clear pointers
- Reference files clearly from SKILL.md with guidance on when to read them
- For large reference files (>300 lines), include a table of contents

#### Writing Patterns

Prefer using the imperative form in instructions.

### Writing Style
Explain to the model why things are important. Use theory of mind and try to make the skill general. Start by writing a draft and then look at it with fresh eyes and improve it.

### Test Cases
After writing the skill draft, come up with 2-3 realistic test prompts. Share them with the user. Save test cases to `evals/evals.json`.

---

## Running and evaluating test cases

This section is one continuous sequence.

### Step 1: Run test cases
For each test case, run the agent with the skill applied, and optionally a baseline run without the skill for comparison.

### Step 2: Draft assertions
While runs are in progress, draft quantitative assertions for each test case. Good assertions are objectively verifiable and have descriptive names.

### Step 3: Capture timing data
When each run completes, save timing data (tokens, duration) to `timing.json`.

### Step 4: Grade, aggregate, and launch the viewer
Once all runs are done:
1. Grade each run against assertions — see `agents/grader.md`
2. Aggregate results: `python -m scripts.aggregate_benchmark <workspace>/iteration-N --skill-name <skill-name>`
3. Launch the viewer: `python eval-viewer/generate_review.py <workspace>/iteration-N --skill-name "my-skill" --benchmark <workspace>/iteration-N/benchmark.json`
4. Present results to the user for review

### Step 5: Read the feedback
Read user feedback from `feedback.json`. Empty feedback means the user thought it was fine.

---

## Improving the skill

### How to think about improvements
1. **Generalize from the feedback.** Don't overfit to specific examples.
2. **Keep the prompt lean.** Remove things that aren't pulling their weight.
3. **Explain the why.** Today's LLMs are smart. Explain reasoning rather than rigid MUSTs.
4. **Look for repeated work across test cases.** Bundle common scripts in `scripts/`.

### The iteration loop
1. Apply improvements to the skill
2. Rerun all test cases into a new iteration directory
3. Present results for review
4. Wait for user to review
5. Read feedback, improve again, repeat

---

## Advanced: Blind comparison
For rigorous comparison between two versions. Read `agents/comparator.md` and `agents/analyzer.md`.

## Description Optimization
Optimize the description for better triggering accuracy. Use `scripts/run_loop.py`.
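## Example: Aggregating Pass Rates

As a rough illustration of the grading and aggregation steps above, here is a minimal sketch that reads the per-run `grading.json` files (schema in `references/schemas.md`) under an iteration directory and reports a mean expectation pass rate. It is illustrative only; `scripts/aggregate_benchmark.py` is the real implementation, and the directory layout is assumed.

```python
# Minimal sketch, not the bundled aggregator: walk an iteration directory,
# read each run's grading.json (see references/schemas.md), and average pass rates.
import json
import sys
from pathlib import Path


def mean_pass_rate(iteration_dir: Path) -> float:
    rates = []
    for grading_path in sorted(iteration_dir.rglob("grading.json")):
        summary = json.loads(grading_path.read_text())["summary"]
        rates.append(summary["pass_rate"])
    if not rates:
        raise SystemExit(f"no grading.json found under {iteration_dir}")
    return sum(rates) / len(rates)


if __name__ == "__main__":
    # e.g. python mean_pass_rate.py <workspace>/iteration-1
    print(f"{mean_pass_rate(Path(sys.argv[1])):.2f} mean expectation pass rate")
```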
---

## Reference files

- `agents/grader.md` — How to evaluate assertions against outputs
- `agents/comparator.md` — How to do blind A/B comparison between two outputs
- `agents/analyzer.md` — How to analyze why one version beat another
- `references/schemas.md` — JSON structures for evals.json, grading.json, etc.
- `assets/eval_review.html` — HTML template for eval review
- `eval-viewer/generate_review.py` — Script to generate the review viewer
- `scripts/aggregate_benchmark.py` — Aggregate benchmark results
- `scripts/generate_report.py` — Generate optimization report
- `scripts/improve_description.py` — Improve skill description
- `scripts/package_skill.py` — Package skill for distribution
- `scripts/quick_validate.py` — Quick validation
- `scripts/run_eval.py` — Run triggering evaluation
- `scripts/run_loop.py` — Run optimization loop
- `scripts/utils.py` — Shared utilities

diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/agents/analyzer.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/agents/analyzer.md new file mode 100644 index 000000000..14e41d606 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/agents/analyzer.md @@ -0,0 +1,274 @@

# Post-hoc Analyzer Agent

Analyze blind comparison results to understand WHY the winner won and generate improvement suggestions.

## Role

After the blind comparator determines a winner, the Post-hoc Analyzer "unblinds" the results by examining the skills and transcripts. The goal is to extract actionable insights: what made the winner better, and how can the loser be improved?

## Inputs

You receive these parameters in your prompt:

- **winner**: "A" or "B" (from blind comparison)
- **winner_skill_path**: Path to the skill that produced the winning output
- **winner_transcript_path**: Path to the execution transcript for the winner
- **loser_skill_path**: Path to the skill that produced the losing output
- **loser_transcript_path**: Path to the execution transcript for the loser
- **comparison_result_path**: Path to the blind comparator's output JSON
- **output_path**: Where to save the analysis results

## Process

### Step 1: Read Comparison Result

1. Read the blind comparator's output at comparison_result_path
2. Note the winning side (A or B), the reasoning, and any scores
3. Understand what the comparator valued in the winning output

### Step 2: Read Both Skills

1. Read the winner skill's SKILL.md and key referenced files
2. Read the loser skill's SKILL.md and key referenced files
3. Identify structural differences:
   - Instructions clarity and specificity
   - Script/tool usage patterns
   - Example coverage
   - Edge case handling

### Step 3: Read Both Transcripts

1. Read the winner's transcript
2. Read the loser's transcript
3. Compare execution patterns:
   - How closely did each follow their skill's instructions?
   - What tools were used differently?
   - Where did the loser diverge from optimal behavior?
   - Did either encounter errors or make recovery attempts?

### Step 4: Analyze Instruction Following

For each transcript, evaluate:
- Did the agent follow the skill's explicit instructions?
- Did the agent use the skill's provided tools/scripts?
- Were there missed opportunities to leverage skill content?
- Did the agent add unnecessary steps not in the skill?

Score instruction following 1-10 and note specific issues.
+ +### Step 5: Identify Winner Strengths + +Determine what made the winner better: +- Clearer instructions that led to better behavior? +- Better scripts/tools that produced better output? +- More comprehensive examples that guided edge cases? +- Better error handling guidance? + +Be specific. Quote from skills/transcripts where relevant. + +### Step 6: Identify Loser Weaknesses + +Determine what held the loser back: +- Ambiguous instructions that led to suboptimal choices? +- Missing tools/scripts that forced workarounds? +- Gaps in edge case coverage? +- Poor error handling that caused failures? + +### Step 7: Generate Improvement Suggestions + +Based on the analysis, produce actionable suggestions for improving the loser skill: +- Specific instruction changes to make +- Tools/scripts to add or modify +- Examples to include +- Edge cases to address + +Prioritize by impact. Focus on changes that would have changed the outcome. + +### Step 8: Write Analysis Results + +Save structured analysis to `{output_path}`. + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "comparison_summary": { + "winner": "A", + "winner_skill": "path/to/winner/skill", + "loser_skill": "path/to/loser/skill", + "comparator_reasoning": "Brief summary of why comparator chose winner" + }, + "winner_strengths": [ + "Clear step-by-step instructions for handling multi-page documents", + "Included validation script that caught formatting errors", + "Explicit guidance on fallback behavior when OCR fails" + ], + "loser_weaknesses": [ + "Vague instruction 'process the document appropriately' led to inconsistent behavior", + "No script for validation, agent had to improvise and made errors", + "No guidance on OCR failure, agent gave up instead of trying alternatives" + ], + "instruction_following": { + "winner": { + "score": 9, + "issues": [ + "Minor: skipped optional logging step" + ] + }, + "loser": { + "score": 6, + "issues": [ + "Did not use the skill's formatting template", + "Invented own approach instead of following step 3", + "Missed the 'always validate output' instruction" + ] + } + }, + "improvement_suggestions": [ + { + "priority": "high", + "category": "instructions", + "suggestion": "Replace 'process the document appropriately' with explicit steps: 1) Extract text, 2) Identify sections, 3) Format per template", + "expected_impact": "Would eliminate ambiguity that caused inconsistent behavior" + }, + { + "priority": "high", + "category": "tools", + "suggestion": "Add validate_output.py script similar to winner skill's validation approach", + "expected_impact": "Would catch formatting errors before final output" + }, + { + "priority": "medium", + "category": "error_handling", + "suggestion": "Add fallback instructions: 'If OCR fails, try: 1) different resolution, 2) image preprocessing, 3) manual extraction'", + "expected_impact": "Would prevent early failure on difficult documents" + } + ], + "transcript_insights": { + "winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script -> Fixed 2 issues -> Produced output", + "loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods -> No validation -> Output had errors" + } +} +``` + +## Guidelines + +- **Be specific**: Quote from skills and transcripts, don't just say "instructions were unclear" +- **Be actionable**: Suggestions should be concrete changes, not vague advice +- **Focus on skill improvements**: The goal is to improve the losing skill, not critique the agent +- 
**Prioritize by impact**: Which changes would most likely have changed the outcome? +- **Consider causation**: Did the skill weakness actually cause the worse output, or is it incidental? +- **Stay objective**: Analyze what happened, don't editorialize +- **Think about generalization**: Would this improvement help on other evals too? + +## Categories for Suggestions + +Use these categories to organize improvement suggestions: + +| Category | Description | +|----------|-------------| +| `instructions` | Changes to the skill's prose instructions | +| `tools` | Scripts, templates, or utilities to add/modify | +| `examples` | Example inputs/outputs to include | +| `error_handling` | Guidance for handling failures | +| `structure` | Reorganization of skill content | +| `references` | External docs or resources to add | + +## Priority Levels + +- **high**: Would likely change the outcome of this comparison +- **medium**: Would improve quality but may not change win/loss +- **low**: Nice to have, marginal improvement + +--- + +# Analyzing Benchmark Results + +When analyzing benchmark results, the analyzer's purpose is to **surface patterns and anomalies** across multiple runs, not suggest skill improvements. + +## Role + +Review all benchmark run results and generate freeform notes that help the user understand skill performance. Focus on patterns that wouldn't be visible from aggregate metrics alone. + +## Inputs + +You receive these parameters in your prompt: + +- **benchmark_data_path**: Path to the in-progress benchmark.json with all run results +- **skill_path**: Path to the skill being benchmarked +- **output_path**: Where to save the notes (as JSON array of strings) + +## Process + +### Step 1: Read Benchmark Data + +1. Read the benchmark.json containing all run results +2. Note the configurations tested (with_skill, without_skill) +3. Understand the run_summary aggregates already calculated + +### Step 2: Analyze Per-Assertion Patterns + +For each expectation across all runs: +- Does it **always pass** in both configurations? (may not differentiate skill value) +- Does it **always fail** in both configurations? (may be broken or beyond capability) +- Does it **always pass with skill but fail without**? (skill clearly adds value here) +- Does it **always fail with skill but pass without**? (skill may be hurting) +- Is it **highly variable**? (flaky expectation or non-deterministic behavior) + +### Step 3: Analyze Cross-Eval Patterns + +Look for patterns across evals: +- Are certain eval types consistently harder/easier? +- Do some evals show high variance while others are stable? +- Are there surprising results that contradict expectations? + +### Step 4: Analyze Metrics Patterns + +Look at time_seconds, tokens, tool_calls: +- Does the skill significantly increase execution time? +- Is there high variance in resource usage? +- Are there outlier runs that skew the aggregates? + +### Step 5: Generate Notes + +Write freeform observations as a list of strings. 
Each note should: +- State a specific observation +- Be grounded in the data (not speculation) +- Help the user understand something the aggregate metrics don't show + +Examples: +- "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value" +- "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure that may be flaky" +- "Without-skill runs consistently fail on table extraction expectations (0% pass rate)" +- "Skill adds 13s average execution time but improves pass rate by 50%" +- "Token usage is 80% higher with skill, primarily due to script output parsing" +- "All 3 without-skill runs for eval 1 produced empty output" + +### Step 6: Write Notes + +Save notes to `{output_path}` as a JSON array of strings: + +```json +[ + "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value", + "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure", + "Without-skill runs consistently fail on table extraction expectations", + "Skill adds 13s average execution time but improves pass rate by 50%" +] +``` + +## Guidelines + +**DO:** +- Report what you observe in the data +- Be specific about which evals, expectations, or runs you're referring to +- Note patterns that aggregate metrics would hide +- Provide context that helps interpret the numbers + +**DO NOT:** +- Suggest improvements to the skill (that's for the improvement step, not benchmarking) +- Make subjective quality judgments ("the output was good/bad") +- Speculate about causes without evidence +- Repeat information already in the run_summary aggregates diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/agents/comparator.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/agents/comparator.md new file mode 100644 index 000000000..80e00eb45 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/agents/comparator.md @@ -0,0 +1,202 @@ +# Blind Comparator Agent + +Compare two outputs WITHOUT knowing which skill produced them. + +## Role + +The Blind Comparator judges which output better accomplishes the eval task. You receive two outputs labeled A and B, but you do NOT know which skill produced which. This prevents bias toward a particular skill or approach. + +Your judgment is based purely on output quality and task completion. + +## Inputs + +You receive these parameters in your prompt: + +- **output_a_path**: Path to the first output file or directory +- **output_b_path**: Path to the second output file or directory +- **eval_prompt**: The original task/prompt that was executed +- **expectations**: List of expectations to check (optional - may be empty) + +## Process + +### Step 1: Read Both Outputs + +1. Examine output A (file or directory) +2. Examine output B (file or directory) +3. Note the type, structure, and content of each +4. If outputs are directories, examine all relevant files inside + +### Step 2: Understand the Task + +1. Read the eval_prompt carefully +2. Identify what the task requires: + - What should be produced? + - What qualities matter (accuracy, completeness, format)? + - What would distinguish a good output from a poor one? 
+ +### Step 3: Generate Evaluation Rubric + +Based on the task, generate a rubric with two dimensions: + +**Content Rubric** (what the output contains): +| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) | +|-----------|----------|----------------|---------------| +| Correctness | Major errors | Minor errors | Fully correct | +| Completeness | Missing key elements | Mostly complete | All elements present | +| Accuracy | Significant inaccuracies | Minor inaccuracies | Accurate throughout | + +**Structure Rubric** (how the output is organized): +| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) | +|-----------|----------|----------------|---------------| +| Organization | Disorganized | Reasonably organized | Clear, logical structure | +| Formatting | Inconsistent/broken | Mostly consistent | Professional, polished | +| Usability | Difficult to use | Usable with effort | Easy to use | + +Adapt criteria to the specific task. For example: +- PDF form → "Field alignment", "Text readability", "Data placement" +- Document → "Section structure", "Heading hierarchy", "Paragraph flow" +- Data output → "Schema correctness", "Data types", "Completeness" + +### Step 4: Evaluate Each Output Against the Rubric + +For each output (A and B): + +1. **Score each criterion** on the rubric (1-5 scale) +2. **Calculate dimension totals**: Content score, Structure score +3. **Calculate overall score**: Average of dimension scores, scaled to 1-10 + +### Step 5: Check Assertions (if provided) + +If expectations are provided: + +1. Check each expectation against output A +2. Check each expectation against output B +3. Count pass rates for each output +4. Use expectation scores as secondary evidence (not the primary decision factor) + +### Step 6: Determine the Winner + +Compare A and B based on (in priority order): + +1. **Primary**: Overall rubric score (content + structure) +2. **Secondary**: Assertion pass rates (if applicable) +3. **Tiebreaker**: If truly equal, declare a TIE + +Be decisive - ties should be rare. One output is usually better, even if marginally. + +### Step 7: Write Comparison Results + +Save results to a JSON file at the path specified (or `comparison.json` if not specified). + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "winner": "A", + "reasoning": "Output A provides a complete solution with proper formatting and all required fields. 
Output B is missing the date field and has formatting inconsistencies.", + "rubric": { + "A": { + "content": { + "correctness": 5, + "completeness": 5, + "accuracy": 4 + }, + "structure": { + "organization": 4, + "formatting": 5, + "usability": 4 + }, + "content_score": 4.7, + "structure_score": 4.3, + "overall_score": 9.0 + }, + "B": { + "content": { + "correctness": 3, + "completeness": 2, + "accuracy": 3 + }, + "structure": { + "organization": 3, + "formatting": 2, + "usability": 3 + }, + "content_score": 2.7, + "structure_score": 2.7, + "overall_score": 5.4 + } + }, + "output_quality": { + "A": { + "score": 9, + "strengths": ["Complete solution", "Well-formatted", "All fields present"], + "weaknesses": ["Minor style inconsistency in header"] + }, + "B": { + "score": 5, + "strengths": ["Readable output", "Correct basic structure"], + "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"] + } + }, + "expectation_results": { + "A": { + "passed": 4, + "total": 5, + "pass_rate": 0.80, + "details": [ + {"text": "Output includes name", "passed": true}, + {"text": "Output includes date", "passed": true}, + {"text": "Format is PDF", "passed": true}, + {"text": "Contains signature", "passed": false}, + {"text": "Readable text", "passed": true} + ] + }, + "B": { + "passed": 3, + "total": 5, + "pass_rate": 0.60, + "details": [ + {"text": "Output includes name", "passed": true}, + {"text": "Output includes date", "passed": false}, + {"text": "Format is PDF", "passed": true}, + {"text": "Contains signature", "passed": false}, + {"text": "Readable text", "passed": true} + ] + } + } +} +``` + +If no expectations were provided, omit the `expectation_results` field entirely. + +## Field Descriptions + +- **winner**: "A", "B", or "TIE" +- **reasoning**: Clear explanation of why the winner was chosen (or why it's a tie) +- **rubric**: Structured rubric evaluation for each output + - **content**: Scores for content criteria (correctness, completeness, accuracy) + - **structure**: Scores for structure criteria (organization, formatting, usability) + - **content_score**: Average of content criteria (1-5) + - **structure_score**: Average of structure criteria (1-5) + - **overall_score**: Combined score scaled to 1-10 +- **output_quality**: Summary quality assessment + - **score**: 1-10 rating (should match rubric overall_score) + - **strengths**: List of positive aspects + - **weaknesses**: List of issues or shortcomings +- **expectation_results**: (Only if expectations provided) + - **passed**: Number of expectations that passed + - **total**: Total number of expectations + - **pass_rate**: Fraction passed (0.0 to 1.0) + - **details**: Individual expectation results + +## Guidelines + +- **Stay blind**: DO NOT try to infer which skill produced which output. Judge purely on output quality. +- **Be specific**: Cite specific examples when explaining strengths and weaknesses. +- **Be decisive**: Choose a winner unless outputs are genuinely equivalent. +- **Output quality first**: Assertion scores are secondary to overall task completion. +- **Be objective**: Don't favor outputs based on style preferences; focus on correctness and completeness. +- **Explain your reasoning**: The reasoning field should make it clear why you chose the winner. +- **Handle edge cases**: If both outputs fail, pick the one that fails less badly. If both are excellent, pick the one that's marginally better. 
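## Worked Example: Score Aggregation

A minimal sketch of the arithmetic described in Step 4, using the numbers from the example output above (illustrative only; no scoring script is bundled, the comparator applies the rubric itself):

```python
# Sketch of Step 4: criterion scores are 1-5; a dimension score is the mean of its
# criteria; the overall score is the mean of the two dimension scores, scaled to 1-10.
def dimension_score(criteria: dict[str, int]) -> float:
    return round(sum(criteria.values()) / len(criteria), 1)


def overall_score(content: dict[str, int], structure: dict[str, int]) -> float:
    mean_of_dimensions = (dimension_score(content) + dimension_score(structure)) / 2
    return round(mean_of_dimensions * 2, 1)  # scale the 1-5 mean up to 1-10


# Output A from the example JSON: content 4.7, structure 4.3, overall 9.0
a_content = {"correctness": 5, "completeness": 5, "accuracy": 4}
a_structure = {"organization": 4, "formatting": 5, "usability": 4}
print(dimension_score(a_content), dimension_score(a_structure), overall_score(a_content, a_structure))
```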
diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/agents/grader.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/agents/grader.md new file mode 100644 index 000000000..558ab05c0 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/agents/grader.md @@ -0,0 +1,223 @@ +# Grader Agent + +Evaluate expectations against an execution transcript and outputs. + +## Role + +The Grader reviews a transcript and output files, then determines whether each expectation passes or fails. Provide clear evidence for each judgment. + +You have two jobs: grade the outputs, and critique the evals themselves. A passing grade on a weak assertion is worse than useless — it creates false confidence. When you notice an assertion that's trivially satisfied, or an important outcome that no assertion checks, say so. + +## Inputs + +You receive these parameters in your prompt: + +- **expectations**: List of expectations to evaluate (strings) +- **transcript_path**: Path to the execution transcript (markdown file) +- **outputs_dir**: Directory containing output files from execution + +## Process + +### Step 1: Read the Transcript + +1. Read the transcript file completely +2. Note the eval prompt, execution steps, and final result +3. Identify any issues or errors documented + +### Step 2: Examine Output Files + +1. List files in outputs_dir +2. Read/examine each file relevant to the expectations. If outputs aren't plain text, use the inspection tools provided in your prompt — don't rely solely on what the transcript says the executor produced. +3. Note contents, structure, and quality + +### Step 3: Evaluate Each Assertion + +For each expectation: + +1. **Search for evidence** in the transcript and outputs +2. **Determine verdict**: + - **PASS**: Clear evidence the expectation is true AND the evidence reflects genuine task completion, not just surface-level compliance + - **FAIL**: No evidence, or evidence contradicts the expectation, or the evidence is superficial (e.g., correct filename but empty/wrong content) +3. **Cite the evidence**: Quote the specific text or describe what you found + +### Step 4: Extract and Verify Claims + +Beyond the predefined expectations, extract implicit claims from the outputs and verify them: + +1. **Extract claims** from the transcript and outputs: + - Factual statements ("The form has 12 fields") + - Process claims ("Used pypdf to fill the form") + - Quality claims ("All fields were filled correctly") + +2. **Verify each claim**: + - **Factual claims**: Can be checked against the outputs or external sources + - **Process claims**: Can be verified from the transcript + - **Quality claims**: Evaluate whether the claim is justified + +3. **Flag unverifiable claims**: Note claims that cannot be verified with available information + +This catches issues that predefined expectations might miss. + +### Step 5: Read User Notes + +If `{outputs_dir}/user_notes.md` exists: +1. Read it and note any uncertainties or issues flagged by the executor +2. Include relevant concerns in the grading output +3. These may reveal problems even when expectations pass + +### Step 6: Critique the Evals + +After grading, consider whether the evals themselves could be improved. Only surface suggestions when there's a clear gap. + +Good suggestions test meaningful outcomes — assertions that are hard to satisfy without actually doing the work correctly. 
Think about what makes an assertion *discriminating*: it passes when the skill genuinely succeeds and fails when it doesn't. + +Suggestions worth raising: +- An assertion that passed but would also pass for a clearly wrong output (e.g., checking filename existence but not file content) +- An important outcome you observed — good or bad — that no assertion covers at all +- An assertion that can't actually be verified from the available outputs + +Keep the bar high. The goal is to flag things the eval author would say "good catch" about, not to nitpick every assertion. + +### Step 7: Write Grading Results + +Save results to `{outputs_dir}/../grading.json` (sibling to outputs_dir). + +## Grading Criteria + +**PASS when**: +- The transcript or outputs clearly demonstrate the expectation is true +- Specific evidence can be cited +- The evidence reflects genuine substance, not just surface compliance (e.g., a file exists AND contains correct content, not just the right filename) + +**FAIL when**: +- No evidence found for the expectation +- Evidence contradicts the expectation +- The expectation cannot be verified from available information +- The evidence is superficial — the assertion is technically satisfied but the underlying task outcome is wrong or incomplete +- The output appears to meet the assertion by coincidence rather than by actually doing the work + +**When uncertain**: The burden of proof to pass is on the expectation. + +### Step 8: Read Executor Metrics and Timing + +1. If `{outputs_dir}/metrics.json` exists, read it and include in grading output +2. If `{outputs_dir}/../timing.json` exists, read it and include timing data + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "expectations": [ + { + "text": "The output includes the name 'John Smith'", + "passed": true, + "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'" + }, + { + "text": "The spreadsheet has a SUM formula in cell B10", + "passed": false, + "evidence": "No spreadsheet was created. The output was a text file." 
+ }, + { + "text": "The assistant used the skill's OCR script", + "passed": true, + "evidence": "Transcript Step 2 shows: 'Tool: Bash - python ocr_script.py image.png'" + } + ], + "summary": { + "passed": 2, + "failed": 1, + "total": 3, + "pass_rate": 0.67 + }, + "execution_metrics": { + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8 + }, + "total_tool_calls": 15, + "total_steps": 6, + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 + }, + "timing": { + "executor_duration_seconds": 165.0, + "grader_duration_seconds": 26.0, + "total_duration_seconds": 191.0 + }, + "claims": [ + { + "claim": "The form has 12 fillable fields", + "type": "factual", + "verified": true, + "evidence": "Counted 12 fields in field_info.json" + }, + { + "claim": "All required fields were populated", + "type": "quality", + "verified": false, + "evidence": "Reference section was left blank despite data being available" + } + ], + "user_notes_summary": { + "uncertainties": ["Used 2023 data, may be stale"], + "needs_review": [], + "workarounds": ["Fell back to text overlay for non-fillable fields"] + }, + "eval_feedback": { + "suggestions": [ + { + "assertion": "The output includes the name 'John Smith'", + "reason": "A hallucinated document that mentions the name would also pass — consider checking it appears as the primary contact with matching phone and email from the input" + }, + { + "reason": "No assertion checks whether the extracted phone numbers match the input — I observed incorrect numbers in the output that went uncaught" + } + ], + "overall": "Assertions check presence but not correctness. Consider adding content verification." + } +} +``` + +## Field Descriptions + +- **expectations**: Array of graded expectations + - **text**: The original expectation text + - **passed**: Boolean - true if expectation passes + - **evidence**: Specific quote or description supporting the verdict +- **summary**: Aggregate statistics + - **passed**: Count of passed expectations + - **failed**: Count of failed expectations + - **total**: Total expectations evaluated + - **pass_rate**: Fraction passed (0.0 to 1.0) +- **execution_metrics**: Copied from executor's metrics.json (if available) + - **output_chars**: Total character count of output files (proxy for tokens) + - **transcript_chars**: Character count of transcript +- **timing**: Wall clock timing from timing.json (if available) + - **executor_duration_seconds**: Time spent in executor subagent + - **total_duration_seconds**: Total elapsed time for the run +- **claims**: Extracted and verified claims from the output + - **claim**: The statement being verified + - **type**: "factual", "process", or "quality" + - **verified**: Boolean - whether the claim holds + - **evidence**: Supporting or contradicting evidence +- **user_notes_summary**: Issues flagged by the executor + - **uncertainties**: Things the executor wasn't sure about + - **needs_review**: Items requiring human attention + - **workarounds**: Places where the skill didn't work as expected +- **eval_feedback**: Improvement suggestions for the evals (only when warranted) + - **suggestions**: List of concrete suggestions, each with a `reason` and optionally an `assertion` it relates to + - **overall**: Brief assessment — can be "No suggestions, evals look solid" if nothing to flag + +## Guidelines + +- **Be objective**: Base verdicts on evidence, not assumptions +- **Be specific**: Quote the exact text that supports your verdict +- **Be thorough**: Check both transcript and 
output files
- **Be consistent**: Apply the same standard to each expectation
- **Explain failures**: Make it clear why evidence was insufficient
- **No partial credit**: Each expectation is pass or fail, not partial

diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/assets/eval_review.html b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/assets/eval_review.html new file mode 100644 index 000000000..938ff32ae --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/assets/eval_review.html @@ -0,0 +1,146 @@
[HTML template; visible text: page title "Eval Set Review - __SKILL_NAME_PLACEHOLDER__", header "Eval Set Review: __SKILL_NAME_PLACEHOLDER__", a "Current description: __SKILL_DESCRIPTION_PLACEHOLDER__" line, and a review table with Query, Should Trigger, and Actions columns.]
+ + + + diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/eval-viewer/generate_review.py b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/eval-viewer/generate_review.py new file mode 100644 index 000000000..4f0b1fe00 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/eval-viewer/generate_review.py @@ -0,0 +1,473 @@ +#!/usr/bin/env python3 +"""Generate and serve a review page for eval results. + +Reads the workspace directory, discovers runs (directories with outputs/), +embeds all output data into a self-contained HTML page, and serves it via +a tiny HTTP server. Feedback auto-saves to feedback.json in the workspace. + +Usage: + python generate_review.py [--port PORT] [--skill-name NAME] + python generate_review.py --previous-feedback /path/to/old/feedback.json + +No dependencies beyond the Python stdlib are required. +""" + +import argparse +import base64 +import json +import mimetypes +import os +import re +import signal +import subprocess +import sys +import time +import webbrowser +from functools import partial +from http.server import HTTPServer, BaseHTTPRequestHandler +from pathlib import Path + +from loguru import logger + +# Files to exclude from output listings +METADATA_FILES = {"transcript.md", "user_notes.md", "metrics.json"} + +# Extensions we render as inline text +TEXT_EXTENSIONS = { + ".txt", ".md", ".json", ".csv", ".py", ".js", ".ts", ".tsx", ".jsx", + ".yaml", ".yml", ".xml", ".html", ".css", ".sh", ".rb", ".go", ".rs", + ".java", ".c", ".cpp", ".h", ".hpp", ".sql", ".r", ".toml", +} + +# Extensions we render as inline images +IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp"} + +# MIME type overrides for common types +MIME_OVERRIDES = { + ".svg": "image/svg+xml", + ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation", +} + + +def get_mime_type(path: Path) -> str: + ext = path.suffix.lower() + if ext in MIME_OVERRIDES: + return MIME_OVERRIDES[ext] + mime, _ = mimetypes.guess_type(str(path)) + return mime or "application/octet-stream" + + +def find_runs(workspace: Path) -> list[dict]: + """Recursively find directories that contain an outputs/ subdirectory.""" + runs: list[dict] = [] + _find_runs_recursive(workspace, workspace, runs) + runs.sort(key=lambda r: (r.get("eval_id", float("inf")), r["id"])) + return runs + + +def _find_runs_recursive(root: Path, current: Path, runs: list[dict]) -> None: + if not current.is_dir(): + return + + outputs_dir = current / "outputs" + if outputs_dir.is_dir(): + run = build_run(root, current) + if run: + runs.append(run) + return + + skip = {"node_modules", ".git", "__pycache__", "skill", "inputs"} + for child in sorted(current.iterdir()): + if child.is_dir() and child.name not in skip: + _find_runs_recursive(root, child, runs) + + +def build_run(root: Path, run_dir: Path) -> dict | None: + """Build a run dict with prompt, outputs, and grading data.""" + prompt = "" + eval_id = None + + # Try eval_metadata.json + for candidate in [run_dir / "eval_metadata.json", run_dir.parent / "eval_metadata.json"]: + if candidate.exists(): + try: + metadata = json.loads(candidate.read_text()) + prompt = metadata.get("prompt", "") + eval_id = metadata.get("eval_id") + except (json.JSONDecodeError, OSError): + pass + if 
prompt: + break + + # Fall back to transcript.md + if not prompt: + for candidate in [run_dir / "transcript.md", run_dir / "outputs" / "transcript.md"]: + if candidate.exists(): + try: + text = candidate.read_text() + match = re.search(r"## Eval Prompt\n\n([\s\S]*?)(?=\n##|$)", text) + if match: + prompt = match.group(1).strip() + except OSError: + pass + if prompt: + break + + if not prompt: + prompt = "(No prompt found)" + + run_id = str(run_dir.relative_to(root)).replace("/", "-").replace("\\", "-") + + # Collect output files + outputs_dir = run_dir / "outputs" + output_files: list[dict] = [] + if outputs_dir.is_dir(): + for f in sorted(outputs_dir.iterdir()): + if f.is_file() and f.name not in METADATA_FILES: + output_files.append(embed_file(f)) + + # Load grading if present + grading = None + for candidate in [run_dir / "grading.json", run_dir.parent / "grading.json"]: + if candidate.exists(): + try: + grading = json.loads(candidate.read_text()) + except (json.JSONDecodeError, OSError): + pass + if grading: + break + + return { + "id": run_id, + "prompt": prompt, + "eval_id": eval_id, + "outputs": output_files, + "grading": grading, + } + + +def embed_file(path: Path) -> dict: + """Read a file and return an embedded representation.""" + ext = path.suffix.lower() + mime = get_mime_type(path) + + if ext in TEXT_EXTENSIONS: + try: + content = path.read_text(errors="replace") + except OSError: + content = "(Error reading file)" + return { + "name": path.name, + "type": "text", + "content": content, + } + elif ext in IMAGE_EXTENSIONS: + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "image", + "mime": mime, + "data_uri": f"data:{mime};base64,{b64}", + } + elif ext == ".pdf": + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "pdf", + "data_uri": f"data:{mime};base64,{b64}", + } + elif ext == ".xlsx": + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "xlsx", + "data_b64": b64, + } + else: + # Binary / unknown — base64 download link + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "binary", + "mime": mime, + "data_uri": f"data:{mime};base64,{b64}", + } + + +def load_previous_iteration(workspace: Path) -> dict[str, dict]: + """Load previous iteration's feedback and outputs. + + Returns a map of run_id -> {"feedback": str, "outputs": list[dict]}. 
+ """ + result: dict[str, dict] = {} + + # Load feedback + feedback_map: dict[str, str] = {} + feedback_path = workspace / "feedback.json" + if feedback_path.exists(): + try: + data = json.loads(feedback_path.read_text()) + feedback_map = { + r["run_id"]: r["feedback"] + for r in data.get("reviews", []) + if r.get("feedback", "").strip() + } + except (json.JSONDecodeError, OSError, KeyError): + pass + + # Load runs (to get outputs) + prev_runs = find_runs(workspace) + for run in prev_runs: + result[run["id"]] = { + "feedback": feedback_map.get(run["id"], ""), + "outputs": run.get("outputs", []), + } + + # Also add feedback for run_ids that had feedback but no matching run + for run_id, fb in feedback_map.items(): + if run_id not in result: + result[run_id] = {"feedback": fb, "outputs": []} + + return result + + +def generate_html( + runs: list[dict], + skill_name: str, + previous: dict[str, dict] | None = None, + benchmark: dict | None = None, +) -> str: + """Generate the complete standalone HTML page with embedded data.""" + template_path = Path(__file__).parent / "viewer.html" + template = template_path.read_text() + + # Build previous_feedback and previous_outputs maps for the template + previous_feedback: dict[str, str] = {} + previous_outputs: dict[str, list[dict]] = {} + if previous: + for run_id, data in previous.items(): + if data.get("feedback"): + previous_feedback[run_id] = data["feedback"] + if data.get("outputs"): + previous_outputs[run_id] = data["outputs"] + + embedded = { + "skill_name": skill_name, + "runs": runs, + "previous_feedback": previous_feedback, + "previous_outputs": previous_outputs, + } + if benchmark: + embedded["benchmark"] = benchmark + + data_json = json.dumps(embedded) + + return template.replace("/*__EMBEDDED_DATA__*/", f"const EMBEDDED_DATA = {data_json};") + + +# --------------------------------------------------------------------------- +# HTTP server (stdlib only, zero dependencies) +# --------------------------------------------------------------------------- + +def _kill_port(port: int) -> None: + """Kill any process listening on the given port.""" + try: + result = subprocess.run( + ["lsof", "-ti", f":{port}"], + capture_output=True, text=True, timeout=5, + ) + for pid_str in result.stdout.strip().split("\n"): + if pid_str.strip(): + try: + os.kill(int(pid_str.strip()), signal.SIGTERM) + except (ProcessLookupError, ValueError): + pass + if result.stdout.strip(): + time.sleep(0.5) + except subprocess.TimeoutExpired: + pass + except FileNotFoundError: + logger.warning("Note: lsof not found, cannot check if port is in use") + +class ReviewHandler(BaseHTTPRequestHandler): + """Serves the review HTML and handles feedback saves. + + Regenerates the HTML on each page load so that refreshing the browser + picks up new eval outputs without restarting the server. 
+ """ + + def __init__( + self, + workspace: Path, + skill_name: str, + feedback_path: Path, + previous: dict[str, dict], + benchmark_path: Path | None, + *args, + **kwargs, + ): + self.workspace = workspace + self.skill_name = skill_name + self.feedback_path = feedback_path + self.previous = previous + self.benchmark_path = benchmark_path + super().__init__(*args, **kwargs) + + def do_GET(self) -> None: + if self.path == "/" or self.path == "/index.html": + # Regenerate HTML on each request (re-scans workspace for new outputs) + runs = find_runs(self.workspace) + benchmark = None + if self.benchmark_path and self.benchmark_path.exists(): + try: + benchmark = json.loads(self.benchmark_path.read_text()) + except (json.JSONDecodeError, OSError): + pass + html = generate_html(runs, self.skill_name, self.previous, benchmark) + content = html.encode("utf-8") + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(content))) + self.end_headers() + self.wfile.write(content) + elif self.path == "/api/feedback": + data = b"{}" + if self.feedback_path.exists(): + data = self.feedback_path.read_bytes() + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(data))) + self.end_headers() + self.wfile.write(data) + else: + self.send_error(404) + + def do_POST(self) -> None: + if self.path == "/api/feedback": + length = int(self.headers.get("Content-Length", 0)) + body = self.rfile.read(length) + try: + data = json.loads(body) + if not isinstance(data, dict) or "reviews" not in data: + raise ValueError("Expected JSON object with 'reviews' key") + self.feedback_path.write_text(json.dumps(data, indent=2) + "\n") + resp = b'{"ok":true}' + self.send_response(200) + except (json.JSONDecodeError, OSError, ValueError) as e: + resp = json.dumps({"error": str(e)}).encode() + self.send_response(500) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(resp))) + self.end_headers() + self.wfile.write(resp) + else: + self.send_error(404) + + def log_message(self, format: str, *args: object) -> None: + # Suppress request logging to keep terminal clean + pass + + +def main() -> None: + parser = argparse.ArgumentParser(description="Generate and serve eval review") + parser.add_argument("workspace", type=Path, help="Path to workspace directory") + parser.add_argument("--port", "-p", type=int, default=3117, help="Server port (default: 3117)") + parser.add_argument("--skill-name", "-n", type=str, default=None, help="Skill name for header") + parser.add_argument( + "--previous-workspace", type=Path, default=None, + help="Path to previous iteration's workspace (shows old outputs and feedback as context)", + ) + parser.add_argument( + "--benchmark", type=Path, default=None, + help="Path to benchmark.json to show in the Benchmark tab", + ) + parser.add_argument( + "--static", "-s", type=Path, default=None, + help="Write standalone HTML to this path instead of starting a server", + ) + args = parser.parse_args() + + workspace = args.workspace.resolve() + if not workspace.is_dir(): + logger.error(f"Error: {workspace} is not a directory") + sys.exit(1) + + runs = find_runs(workspace) + if not runs: + logger.error(f"No runs found in {workspace}") + sys.exit(1) + + skill_name = args.skill_name or workspace.name.replace("-workspace", "") + feedback_path = workspace / "feedback.json" + + previous: dict[str, dict] = {} + if 
args.previous_workspace: + previous = load_previous_iteration(args.previous_workspace.resolve()) + + benchmark_path = args.benchmark.resolve() if args.benchmark else None + benchmark = None + if benchmark_path and benchmark_path.exists(): + try: + benchmark = json.loads(benchmark_path.read_text()) + except (json.JSONDecodeError, OSError): + pass + + if args.static: + html = generate_html(runs, skill_name, previous, benchmark) + args.static.parent.mkdir(parents=True, exist_ok=True) + args.static.write_text(html) + logger.info(f"\n Static viewer written to: {args.static}\n") + sys.exit(0) + + # Kill any existing process on the target port + port = args.port + _kill_port(port) + handler = partial(ReviewHandler, workspace, skill_name, feedback_path, previous, benchmark_path) + try: + server = HTTPServer(("127.0.0.1", port), handler) + except OSError: + # Port still in use after kill attempt — find a free one + server = HTTPServer(("127.0.0.1", 0), handler) + port = server.server_address[1] + + url = f"http://localhost:{port}" + logger.info(f"\n Eval Viewer") + logger.info(f" ─────────────────────────────────") + logger.info(f" URL: {url}") + logger.info(f" Workspace: {workspace}") + logger.info(f" Feedback: {feedback_path}") + if previous: + logger.info(f" Previous: {args.previous_workspace} ({len(previous)} runs)") + if benchmark_path: + logger.info(f" Benchmark: {benchmark_path}") + logger.info(f"\n Press Ctrl+C to stop.\n") + + webbrowser.open(url) + + try: + server.serve_forever() + except KeyboardInterrupt: + logger.info("\nStopped.") + server.server_close() + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/eval-viewer/viewer.html b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/eval-viewer/viewer.html new file mode 100644 index 000000000..6d8e96348 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/eval-viewer/viewer.html @@ -0,0 +1,1325 @@ + + + + + + Eval Review + + + + + + + +
[HTML single-page viewer; visible text: header "Eval Review:" followed by the skill name, the instruction "Review each output and leave feedback below. Navigate with arrow keys or buttons. When done, copy feedback and paste into Claude Code.", Prompt and Output panels ("No output files found" when empty), a "Your Feedback" box, a Benchmark tab ("No benchmark data available. Run a benchmark to see quantitative results here."), and a completion screen ("Review Complete", "Your feedback has been saved. Go back to your Claude Code session and tell Claude you're done reviewing.").]
+ + + + diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/references/schemas.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/references/schemas.md new file mode 100644 index 000000000..b6eeaa2d4 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/references/schemas.md @@ -0,0 +1,430 @@ +# JSON Schemas + +This document defines the JSON schemas used by skill-creator. + +--- + +## evals.json + +Defines the evals for a skill. Located at `evals/evals.json` within the skill directory. + +```json +{ + "skill_name": "example-skill", + "evals": [ + { + "id": 1, + "prompt": "User's example prompt", + "expected_output": "Description of expected result", + "files": ["evals/files/sample1.pdf"], + "expectations": [ + "The output includes X", + "The skill used script Y" + ] + } + ] +} +``` + +**Fields:** +- `skill_name`: Name matching the skill's frontmatter +- `evals[].id`: Unique integer identifier +- `evals[].prompt`: The task to execute +- `evals[].expected_output`: Human-readable description of success +- `evals[].files`: Optional list of input file paths (relative to skill root) +- `evals[].expectations`: List of verifiable statements + +--- + +## history.json + +Tracks version progression in Improve mode. Located at workspace root. + +```json +{ + "started_at": "2026-01-15T10:30:00Z", + "skill_name": "pdf", + "current_best": "v2", + "iterations": [ + { + "version": "v0", + "parent": null, + "expectation_pass_rate": 0.65, + "grading_result": "baseline", + "is_current_best": false + }, + { + "version": "v1", + "parent": "v0", + "expectation_pass_rate": 0.75, + "grading_result": "won", + "is_current_best": false + }, + { + "version": "v2", + "parent": "v1", + "expectation_pass_rate": 0.85, + "grading_result": "won", + "is_current_best": true + } + ] +} +``` + +**Fields:** +- `started_at`: ISO timestamp of when improvement started +- `skill_name`: Name of the skill being improved +- `current_best`: Version identifier of the best performer +- `iterations[].version`: Version identifier (v0, v1, ...) +- `iterations[].parent`: Parent version this was derived from +- `iterations[].expectation_pass_rate`: Pass rate from grading +- `iterations[].grading_result`: "baseline", "won", "lost", or "tie" +- `iterations[].is_current_best`: Whether this is the current best version + +--- + +## grading.json + +Output from the grader agent. Located at `/grading.json`. + +```json +{ + "expectations": [ + { + "text": "The output includes the name 'John Smith'", + "passed": true, + "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'" + }, + { + "text": "The spreadsheet has a SUM formula in cell B10", + "passed": false, + "evidence": "No spreadsheet was created. The output was a text file." 
+ } + ], + "summary": { + "passed": 2, + "failed": 1, + "total": 3, + "pass_rate": 0.67 + }, + "execution_metrics": { + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8 + }, + "total_tool_calls": 15, + "total_steps": 6, + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 + }, + "timing": { + "executor_duration_seconds": 165.0, + "grader_duration_seconds": 26.0, + "total_duration_seconds": 191.0 + }, + "claims": [ + { + "claim": "The form has 12 fillable fields", + "type": "factual", + "verified": true, + "evidence": "Counted 12 fields in field_info.json" + } + ], + "user_notes_summary": { + "uncertainties": ["Used 2023 data, may be stale"], + "needs_review": [], + "workarounds": ["Fell back to text overlay for non-fillable fields"] + }, + "eval_feedback": { + "suggestions": [ + { + "assertion": "The output includes the name 'John Smith'", + "reason": "A hallucinated document that mentions the name would also pass" + } + ], + "overall": "Assertions check presence but not correctness." + } +} +``` + +**Fields:** +- `expectations[]`: Graded expectations with evidence +- `summary`: Aggregate pass/fail counts +- `execution_metrics`: Tool usage and output size (from executor's metrics.json) +- `timing`: Wall clock timing (from timing.json) +- `claims`: Extracted and verified claims from the output +- `user_notes_summary`: Issues flagged by the executor +- `eval_feedback`: (optional) Improvement suggestions for the evals, only present when the grader identifies issues worth raising + +--- + +## metrics.json + +Output from the executor agent. Located at `/outputs/metrics.json`. + +```json +{ + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8, + "Edit": 1, + "Glob": 2, + "Grep": 0 + }, + "total_tool_calls": 18, + "total_steps": 6, + "files_created": ["filled_form.pdf", "field_values.json"], + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 +} +``` + +**Fields:** +- `tool_calls`: Count per tool type +- `total_tool_calls`: Sum of all tool calls +- `total_steps`: Number of major execution steps +- `files_created`: List of output files created +- `errors_encountered`: Number of errors during execution +- `output_chars`: Total character count of output files +- `transcript_chars`: Character count of transcript + +--- + +## timing.json + +Wall clock timing for a run. Located at `/timing.json`. + +**How to capture:** When a subagent task completes, the task notification includes `total_tokens` and `duration_ms`. Save these immediately — they are not persisted anywhere else and cannot be recovered after the fact. + +```json +{ + "total_tokens": 84852, + "duration_ms": 23332, + "total_duration_seconds": 23.3, + "executor_start": "2026-01-15T10:30:00Z", + "executor_end": "2026-01-15T10:32:45Z", + "executor_duration_seconds": 165.0, + "grader_start": "2026-01-15T10:32:46Z", + "grader_end": "2026-01-15T10:33:12Z", + "grader_duration_seconds": 26.0 +} +``` + +--- + +## benchmark.json + +Output from Benchmark mode. Located at `benchmarks//benchmark.json`. 
+ +```json +{ + "metadata": { + "skill_name": "pdf", + "skill_path": "/path/to/pdf", + "executor_model": "claude-sonnet-4-20250514", + "analyzer_model": "most-capable-model", + "timestamp": "2026-01-15T10:30:00Z", + "evals_run": [1, 2, 3], + "runs_per_configuration": 3 + }, + + "runs": [ + { + "eval_id": 1, + "eval_name": "Ocean", + "configuration": "with_skill", + "run_number": 1, + "result": { + "pass_rate": 0.85, + "passed": 6, + "failed": 1, + "total": 7, + "time_seconds": 42.5, + "tokens": 3800, + "tool_calls": 18, + "errors": 0 + }, + "expectations": [ + {"text": "...", "passed": true, "evidence": "..."} + ], + "notes": [ + "Used 2023 data, may be stale", + "Fell back to text overlay for non-fillable fields" + ] + } + ], + + "run_summary": { + "with_skill": { + "pass_rate": {"mean": 0.85, "stddev": 0.05, "min": 0.80, "max": 0.90}, + "time_seconds": {"mean": 45.0, "stddev": 12.0, "min": 32.0, "max": 58.0}, + "tokens": {"mean": 3800, "stddev": 400, "min": 3200, "max": 4100} + }, + "without_skill": { + "pass_rate": {"mean": 0.35, "stddev": 0.08, "min": 0.28, "max": 0.45}, + "time_seconds": {"mean": 32.0, "stddev": 8.0, "min": 24.0, "max": 42.0}, + "tokens": {"mean": 2100, "stddev": 300, "min": 1800, "max": 2500} + }, + "delta": { + "pass_rate": "+0.50", + "time_seconds": "+13.0", + "tokens": "+1700" + } + }, + + "notes": [ + "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value", + "Eval 3 shows high variance (50% ± 40%) - may be flaky or model-dependent", + "Without-skill runs consistently fail on table extraction expectations", + "Skill adds 13s average execution time but improves pass rate by 50%" + ] +} +``` + +**Fields:** +- `metadata`: Information about the benchmark run + - `skill_name`: Name of the skill + - `timestamp`: When the benchmark was run + - `evals_run`: List of eval names or IDs + - `runs_per_configuration`: Number of runs per config (e.g. 3) +- `runs[]`: Individual run results + - `eval_id`: Numeric eval identifier + - `eval_name`: Human-readable eval name (used as section header in the viewer) + - `configuration`: Must be `"with_skill"` or `"without_skill"` (the viewer uses this exact string for grouping and color coding) + - `run_number`: Integer run number (1, 2, 3...) + - `result`: Nested object with `pass_rate`, `passed`, `total`, `time_seconds`, `tokens`, `errors` +- `run_summary`: Statistical aggregates per configuration + - `with_skill` / `without_skill`: Each contains `pass_rate`, `time_seconds`, `tokens` objects with `mean` and `stddev` fields + - `delta`: Difference strings like `"+0.50"`, `"+13.0"`, `"+1700"` +- `notes`: Freeform observations from the analyzer + +**Important:** The viewer reads these field names exactly. Using `config` instead of `configuration`, or putting `pass_rate` at the top level of a run instead of nested under `result`, will cause the viewer to show empty/zero values. Always reference this schema when generating benchmark.json manually. + +--- + +## comparison.json + +Output from blind comparator. Located at `/comparison-N.json`. + +```json +{ + "winner": "A", + "reasoning": "Output A provides a complete solution with proper formatting and all required fields. 
Output B is missing the date field and has formatting inconsistencies.", + "rubric": { + "A": { + "content": { + "correctness": 5, + "completeness": 5, + "accuracy": 4 + }, + "structure": { + "organization": 4, + "formatting": 5, + "usability": 4 + }, + "content_score": 4.7, + "structure_score": 4.3, + "overall_score": 9.0 + }, + "B": { + "content": { + "correctness": 3, + "completeness": 2, + "accuracy": 3 + }, + "structure": { + "organization": 3, + "formatting": 2, + "usability": 3 + }, + "content_score": 2.7, + "structure_score": 2.7, + "overall_score": 5.4 + } + }, + "output_quality": { + "A": { + "score": 9, + "strengths": ["Complete solution", "Well-formatted", "All fields present"], + "weaknesses": ["Minor style inconsistency in header"] + }, + "B": { + "score": 5, + "strengths": ["Readable output", "Correct basic structure"], + "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"] + } + }, + "expectation_results": { + "A": { + "passed": 4, + "total": 5, + "pass_rate": 0.80, + "details": [ + {"text": "Output includes name", "passed": true} + ] + }, + "B": { + "passed": 3, + "total": 5, + "pass_rate": 0.60, + "details": [ + {"text": "Output includes name", "passed": true} + ] + } + } +} +``` + +--- + +## analysis.json + +Output from post-hoc analyzer. Located at `/analysis.json`. + +```json +{ + "comparison_summary": { + "winner": "A", + "winner_skill": "path/to/winner/skill", + "loser_skill": "path/to/loser/skill", + "comparator_reasoning": "Brief summary of why comparator chose winner" + }, + "winner_strengths": [ + "Clear step-by-step instructions for handling multi-page documents", + "Included validation script that caught formatting errors" + ], + "loser_weaknesses": [ + "Vague instruction 'process the document appropriately' led to inconsistent behavior", + "No script for validation, agent had to improvise" + ], + "instruction_following": { + "winner": { + "score": 9, + "issues": ["Minor: skipped optional logging step"] + }, + "loser": { + "score": 6, + "issues": [ + "Did not use the skill's formatting template", + "Invented own approach instead of following step 3" + ] + } + }, + "improvement_suggestions": [ + { + "priority": "high", + "category": "instructions", + "suggestion": "Replace 'process the document appropriately' with explicit steps", + "expected_impact": "Would eliminate ambiguity that caused inconsistent behavior" + } + ], + "transcript_insights": { + "winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script", + "loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods" + } +} +``` diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/__init__.py b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/aggregate_benchmark.py b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/aggregate_benchmark.py new file mode 100644 index 000000000..ccc810819 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/aggregate_benchmark.py @@ -0,0 +1,403 @@ +#!/usr/bin/env python3 +""" +Aggregate individual run results into benchmark summary statistics. 
+ +Reads grading.json files from run directories and produces: +- run_summary with mean, stddev, min, max for each metric +- delta between with_skill and without_skill configurations + +Usage: + python aggregate_benchmark.py + +Example: + python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/ + +The script supports two directory layouts: + + Workspace layout (from skill-creator iterations): + / + └── eval-N/ + ├── with_skill/ + │ ├── run-1/grading.json + │ └── run-2/grading.json + └── without_skill/ + ├── run-1/grading.json + └── run-2/grading.json + + Legacy layout (with runs/ subdirectory): + / + └── runs/ + └── eval-N/ + ├── with_skill/ + │ └── run-1/grading.json + └── without_skill/ + └── run-1/grading.json +""" + +import argparse +import json +import math +import sys +from datetime import datetime, timezone +from pathlib import Path + +from loguru import logger + + +def calculate_stats(values: list[float]) -> dict: + """Calculate mean, stddev, min, max for a list of values.""" + if not values: + return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0} + + n = len(values) + mean = sum(values) / n + + if n > 1: + variance = sum((x - mean) ** 2 for x in values) / (n - 1) + stddev = math.sqrt(variance) + else: + stddev = 0.0 + + return { + "mean": round(mean, 4), + "stddev": round(stddev, 4), + "min": round(min(values), 4), + "max": round(max(values), 4) + } + + +def load_run_results(benchmark_dir: Path) -> dict: + """ + Load all run results from a benchmark directory. + + Returns dict keyed by config name (e.g. "with_skill"/"without_skill", + or "new_skill"/"old_skill"), each containing a list of run results. + """ + # Support both layouts: eval dirs directly under benchmark_dir, or under runs/ + runs_dir = benchmark_dir / "runs" + if runs_dir.exists(): + search_dir = runs_dir + elif list(benchmark_dir.glob("eval-*")): + search_dir = benchmark_dir + else: + logger.warning(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}") + return {} + + results: dict[str, list] = {} + + for eval_idx, eval_dir in enumerate(sorted(search_dir.glob("eval-*"))): + metadata_path = eval_dir / "eval_metadata.json" + if metadata_path.exists(): + try: + with open(metadata_path) as mf: + eval_id = json.load(mf).get("eval_id", eval_idx) + except (json.JSONDecodeError, OSError): + eval_id = eval_idx + else: + try: + eval_id = int(eval_dir.name.split("-")[1]) + except ValueError: + eval_id = eval_idx + + # Discover config directories dynamically rather than hardcoding names + for config_dir in sorted(eval_dir.iterdir()): + if not config_dir.is_dir(): + continue + # Skip non-config directories (inputs, outputs, etc.) 
+ if not list(config_dir.glob("run-*")): + continue + config = config_dir.name + if config not in results: + results[config] = [] + + for run_dir in sorted(config_dir.glob("run-*")): + run_number = int(run_dir.name.split("-")[1]) + grading_file = run_dir / "grading.json" + + if not grading_file.exists(): + logger.warning(f"Warning: grading.json not found in {run_dir}") + continue + + try: + with open(grading_file) as f: + grading = json.load(f) + except json.JSONDecodeError as e: + logger.warning(f"Warning: Invalid JSON in {grading_file}: {e}") + continue + + # Extract metrics + result = { + "eval_id": eval_id, + "run_number": run_number, + "pass_rate": grading.get("summary", {}).get("pass_rate", 0.0), + "passed": grading.get("summary", {}).get("passed", 0), + "failed": grading.get("summary", {}).get("failed", 0), + "total": grading.get("summary", {}).get("total", 0), + } + + # Extract timing — check grading.json first, then sibling timing.json + timing = grading.get("timing", {}) + result["time_seconds"] = timing.get("total_duration_seconds", 0.0) + timing_file = run_dir / "timing.json" + if result["time_seconds"] == 0.0 and timing_file.exists(): + try: + with open(timing_file) as tf: + timing_data = json.load(tf) + result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0) + result["tokens"] = timing_data.get("total_tokens", 0) + except json.JSONDecodeError: + pass + + # Extract metrics if available + metrics = grading.get("execution_metrics", {}) + result["tool_calls"] = metrics.get("total_tool_calls", 0) + if not result.get("tokens"): + result["tokens"] = metrics.get("output_chars", 0) + result["errors"] = metrics.get("errors_encountered", 0) + + # Extract expectations — viewer requires fields: text, passed, evidence + raw_expectations = grading.get("expectations", []) + for exp in raw_expectations: + if "text" not in exp or "passed" not in exp: + logger.warning(f"Warning: expectation in {grading_file} missing required fields (text, passed, evidence): {exp}") + result["expectations"] = raw_expectations + + # Extract notes from user_notes_summary + notes_summary = grading.get("user_notes_summary", {}) + notes = [] + notes.extend(notes_summary.get("uncertainties", [])) + notes.extend(notes_summary.get("needs_review", [])) + notes.extend(notes_summary.get("workarounds", [])) + result["notes"] = notes + + results[config].append(result) + + return results + + +def aggregate_results(results: dict) -> dict: + """ + Aggregate run results into summary statistics. + + Returns run_summary with stats for each configuration and delta. 
+ """ + run_summary = {} + configs = list(results.keys()) + + for config in configs: + runs = results.get(config, []) + + if not runs: + run_summary[config] = { + "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}, + "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}, + "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0} + } + continue + + pass_rates = [r["pass_rate"] for r in runs] + times = [r["time_seconds"] for r in runs] + tokens = [r.get("tokens", 0) for r in runs] + + run_summary[config] = { + "pass_rate": calculate_stats(pass_rates), + "time_seconds": calculate_stats(times), + "tokens": calculate_stats(tokens) + } + + # Calculate delta between the first two configs (if two exist) + if len(configs) >= 2: + primary = run_summary.get(configs[0], {}) + baseline = run_summary.get(configs[1], {}) + else: + primary = run_summary.get(configs[0], {}) if configs else {} + baseline = {} + + delta_pass_rate = primary.get("pass_rate", {}).get("mean", 0) - baseline.get("pass_rate", {}).get("mean", 0) + delta_time = primary.get("time_seconds", {}).get("mean", 0) - baseline.get("time_seconds", {}).get("mean", 0) + delta_tokens = primary.get("tokens", {}).get("mean", 0) - baseline.get("tokens", {}).get("mean", 0) + + run_summary["delta"] = { + "pass_rate": f"{delta_pass_rate:+.2f}", + "time_seconds": f"{delta_time:+.1f}", + "tokens": f"{delta_tokens:+.0f}" + } + + return run_summary + + +def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict: + """ + Generate complete benchmark.json from run results. + """ + results = load_run_results(benchmark_dir) + run_summary = aggregate_results(results) + + # Build runs array for benchmark.json + runs = [] + for config in results: + for result in results[config]: + runs.append({ + "eval_id": result["eval_id"], + "configuration": config, + "run_number": result["run_number"], + "result": { + "pass_rate": result["pass_rate"], + "passed": result["passed"], + "failed": result["failed"], + "total": result["total"], + "time_seconds": result["time_seconds"], + "tokens": result.get("tokens", 0), + "tool_calls": result.get("tool_calls", 0), + "errors": result.get("errors", 0) + }, + "expectations": result["expectations"], + "notes": result["notes"] + }) + + # Determine eval IDs from results + eval_ids = sorted(set( + r["eval_id"] + for config in results.values() + for r in config + )) + + benchmark = { + "metadata": { + "skill_name": skill_name or "", + "skill_path": skill_path or "", + "executor_model": "", + "analyzer_model": "", + "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + "evals_run": eval_ids, + "runs_per_configuration": 3 + }, + "runs": runs, + "run_summary": run_summary, + "notes": [] # To be filled by analyzer + } + + return benchmark + + +def generate_markdown(benchmark: dict) -> str: + """Generate human-readable benchmark.md from benchmark data.""" + metadata = benchmark["metadata"] + run_summary = benchmark["run_summary"] + + # Determine config names (excluding "delta") + configs = [k for k in run_summary if k != "delta"] + config_a = configs[0] if len(configs) >= 1 else "config_a" + config_b = configs[1] if len(configs) >= 2 else "config_b" + label_a = config_a.replace("_", " ").title() + label_b = config_b.replace("_", " ").title() + + lines = [ + f"# Skill Benchmark: {metadata['skill_name']}", + "", + f"**Model**: {metadata['executor_model']}", + f"**Date**: {metadata['timestamp']}", + f"**Evals**: {', '.join(map(str, metadata['evals_run']))} 
({metadata['runs_per_configuration']} runs each per configuration)", + "", + "## Summary", + "", + f"| Metric | {label_a} | {label_b} | Delta |", + "|--------|------------|---------------|-------|", + ] + + a_summary = run_summary.get(config_a, {}) + b_summary = run_summary.get(config_b, {}) + delta = run_summary.get("delta", {}) + + # Format pass rate + a_pr = a_summary.get("pass_rate", {}) + b_pr = b_summary.get("pass_rate", {}) + lines.append(f"| Pass Rate | {a_pr.get('mean', 0)*100:.0f}% ± {a_pr.get('stddev', 0)*100:.0f}% | {b_pr.get('mean', 0)*100:.0f}% ± {b_pr.get('stddev', 0)*100:.0f}% | {delta.get('pass_rate', '—')} |") + + # Format time + a_time = a_summary.get("time_seconds", {}) + b_time = b_summary.get("time_seconds", {}) + lines.append(f"| Time | {a_time.get('mean', 0):.1f}s ± {a_time.get('stddev', 0):.1f}s | {b_time.get('mean', 0):.1f}s ± {b_time.get('stddev', 0):.1f}s | {delta.get('time_seconds', '—')}s |") + + # Format tokens + a_tokens = a_summary.get("tokens", {}) + b_tokens = b_summary.get("tokens", {}) + lines.append(f"| Tokens | {a_tokens.get('mean', 0):.0f} ± {a_tokens.get('stddev', 0):.0f} | {b_tokens.get('mean', 0):.0f} ± {b_tokens.get('stddev', 0):.0f} | {delta.get('tokens', '—')} |") + + # Notes section + if benchmark.get("notes"): + lines.extend([ + "", + "## Notes", + "" + ]) + for note in benchmark["notes"]: + lines.append(f"- {note}") + + return "\n".join(lines) + + +def main(): + parser = argparse.ArgumentParser( + description="Aggregate benchmark run results into summary statistics" + ) + parser.add_argument( + "benchmark_dir", + type=Path, + help="Path to the benchmark directory" + ) + parser.add_argument( + "--skill-name", + default="", + help="Name of the skill being benchmarked" + ) + parser.add_argument( + "--skill-path", + default="", + help="Path to the skill being benchmarked" + ) + parser.add_argument( + "--output", "-o", + type=Path, + help="Output path for benchmark.json (default: /benchmark.json)" + ) + + args = parser.parse_args() + + if not args.benchmark_dir.exists(): + logger.error(f"Directory not found: {args.benchmark_dir}") + sys.exit(1) + + # Generate benchmark + benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path) + + # Determine output paths + output_json = args.output or (args.benchmark_dir / "benchmark.json") + output_md = output_json.with_suffix(".md") + + # Write benchmark.json + with open(output_json, "w") as f: + json.dump(benchmark, f, indent=2) + logger.info(f"Generated: {output_json}") + + # Write benchmark.md + markdown = generate_markdown(benchmark) + with open(output_md, "w") as f: + f.write(markdown) + logger.info(f"Generated: {output_md}") + + # Print summary + run_summary = benchmark["run_summary"] + configs = [k for k in run_summary if k != "delta"] + delta = run_summary.get("delta", {}) + + logger.info(f"\nSummary:") + for config in configs: + pr = run_summary[config]["pass_rate"]["mean"] + label = config.replace("_", " ").title() + logger.info(f" {label}: {pr*100:.1f}% pass rate") + logger.info(f" Delta: {delta.get('pass_rate', '—')}") + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/generate_report.py b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/generate_report.py new file mode 100644 index 000000000..395232d96 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/generate_report.py @@ -0,0 +1,328 @@ 
+#!/usr/bin/env python3 +"""Generate an HTML report from run_loop.py output. + +Takes the JSON output from run_loop.py and generates a visual HTML report +showing each description attempt with check/x for each test case. +Distinguishes between train and test queries. +""" + +import argparse +import html +import json +import sys +from pathlib import Path + +from loguru import logger + + +def generate_html(data: dict, auto_refresh: bool = False, skill_name: str = "") -> str: + """Generate HTML report from loop output data. If auto_refresh is True, adds a meta refresh tag.""" + history = data.get("history", []) + holdout = data.get("holdout", 0) + title_prefix = html.escape(skill_name + " \u2014 ") if skill_name else "" + + # Get all unique queries from train and test sets, with should_trigger info + train_queries: list[dict] = [] + test_queries: list[dict] = [] + if history: + for r in history[0].get("train_results", history[0].get("results", [])): + train_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)}) + if history[0].get("test_results"): + for r in history[0].get("test_results", []): + test_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)}) + + refresh_tag = ' \n' if auto_refresh else "" + + html_parts = [""" + + + +""" + refresh_tag + """ """ + title_prefix + """Skill Description Optimization + + + + + + +

""" + title_prefix + """Skill Description Optimization

+
+ Optimizing your skill's description. This page updates automatically as Claude tests different versions of your skill's description. Each row is an iteration — a new description attempt. The columns show test queries: green checkmarks mean the skill triggered correctly (or correctly didn't trigger), red crosses mean it got it wrong. The "Train" score shows performance on queries used to improve the description; the "Test" score shows performance on held-out queries the optimizer hasn't seen. When it's done, Claude will apply the best-performing description to your skill. +
+"""] + + # Summary section + best_test_score = data.get('best_test_score') + best_train_score = data.get('best_train_score') + html_parts.append(f""" +
+

Original: {html.escape(data.get('original_description', 'N/A'))}

+

Best: {html.escape(data.get('best_description', 'N/A'))}

+

Best Score: {data.get('best_score', 'N/A')} {'(test)' if best_test_score else '(train)'}

+

Iterations: {data.get('iterations_run', 0)} | Train: {data.get('train_size', '?')} | Test: {data.get('test_size', '?')}

+
+""") + + # Legend + html_parts.append(""" +
+ Query columns: + Should trigger + Should NOT trigger + Train + Test +
+""") + + # Table header + html_parts.append(""" +
+ + + + + + + +""") + + # Add column headers for train queries + for qinfo in train_queries: + polarity = "positive-col" if qinfo["should_trigger"] else "negative-col" + html_parts.append(f' \n') + + # Add column headers for test queries (different color) + for qinfo in test_queries: + polarity = "positive-col" if qinfo["should_trigger"] else "negative-col" + html_parts.append(f' \n') + + html_parts.append(""" + + +""") + + # Find best iteration for highlighting + if test_queries: + best_iter = max(history, key=lambda h: h.get("test_passed") or 0).get("iteration") + else: + best_iter = max(history, key=lambda h: h.get("train_passed", h.get("passed", 0))).get("iteration") + + # Add rows for each iteration + for h in history: + iteration = h.get("iteration", "?") + train_passed = h.get("train_passed", h.get("passed", 0)) + train_total = h.get("train_total", h.get("total", 0)) + test_passed = h.get("test_passed") + test_total = h.get("test_total") + description = h.get("description", "") + train_results = h.get("train_results", h.get("results", [])) + test_results = h.get("test_results", []) + + # Create lookups for results by query + train_by_query = {r["query"]: r for r in train_results} + test_by_query = {r["query"]: r for r in test_results} if test_results else {} + + # Compute aggregate correct/total runs across all retries + def aggregate_runs(results: list[dict]) -> tuple[int, int]: + correct = 0 + total = 0 + for r in results: + runs = r.get("runs", 0) + triggers = r.get("triggers", 0) + total += runs + if r.get("should_trigger", True): + correct += triggers + else: + correct += runs - triggers + return correct, total + + train_correct, train_runs = aggregate_runs(train_results) + test_correct, test_runs = aggregate_runs(test_results) + + # Determine score classes + def score_class(correct: int, total: int) -> str: + if total > 0: + ratio = correct / total + if ratio >= 0.8: + return "score-good" + elif ratio >= 0.5: + return "score-ok" + return "score-bad" + + train_class = score_class(train_correct, train_runs) + test_class = score_class(test_correct, test_runs) + + row_class = "best-row" if iteration == best_iter else "" + + html_parts.append(f""" + + + + +""") + + # Add result for each train query + for qinfo in train_queries: + r = train_by_query.get(qinfo["query"], {}) + did_pass = r.get("pass", False) + triggers = r.get("triggers", 0) + runs = r.get("runs", 0) + + icon = "✓" if did_pass else "✗" + css_class = "pass" if did_pass else "fail" + + html_parts.append(f' \n') + + # Add result for each test query (with different background) + for qinfo in test_queries: + r = test_by_query.get(qinfo["query"], {}) + did_pass = r.get("pass", False) + triggers = r.get("triggers", 0) + runs = r.get("runs", 0) + + icon = "✓" if did_pass else "✗" + css_class = "pass" if did_pass else "fail" + + html_parts.append(f' \n') + + html_parts.append(" \n") + + html_parts.append(""" +
IterTrainTestDescription{html.escape(qinfo["query"])}{html.escape(qinfo["query"])}
{iteration}{train_correct}/{train_runs}{test_correct}/{test_runs}{html.escape(description)}{icon}{triggers}/{runs}{icon}{triggers}/{runs}
+
+""") + + html_parts.append(""" + + +""") + + return "".join(html_parts) + + +def main(): + parser = argparse.ArgumentParser(description="Generate HTML report from run_loop output") + parser.add_argument("input", help="Path to JSON output from run_loop.py (or - for stdin)") + parser.add_argument("-o", "--output", default=None, help="Output HTML file (default: stdout)") + parser.add_argument("--skill-name", default="", help="Skill name to include in the report title") + args = parser.parse_args() + + if args.input == "-": + data = json.load(sys.stdin) + else: + data = json.loads(Path(args.input).read_text()) + + html_output = generate_html(data, skill_name=args.skill_name) + + if args.output: + Path(args.output).write_text(html_output) + logger.info(f"Report written to {args.output}") + else: + print(html_output) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/improve_description.py b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/improve_description.py new file mode 100644 index 000000000..887a06a08 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/improve_description.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 +"""Improve a skill description based on eval results. + +Takes eval results (from run_eval.py) and generates an improved description +using Claude with extended thinking. +""" + +import argparse +import json +import re +import sys +from pathlib import Path + +import anthropic +from loguru import logger + +from scripts.utils import parse_skill_md + + +def improve_description( + client: anthropic.Anthropic, + skill_name: str, + skill_content: str, + current_description: str, + eval_results: dict, + history: list[dict], + model: str, + test_results: dict | None = None, + log_dir: Path | None = None, + iteration: int | None = None, +) -> str: + """Call Claude to improve the description based on eval results.""" + failed_triggers = [ + r for r in eval_results["results"] + if r["should_trigger"] and not r["pass"] + ] + false_triggers = [ + r for r in eval_results["results"] + if not r["should_trigger"] and not r["pass"] + ] + + # Build scores summary + train_score = f"{eval_results['summary']['passed']}/{eval_results['summary']['total']}" + if test_results: + test_score = f"{test_results['summary']['passed']}/{test_results['summary']['total']}" + scores_summary = f"Train: {train_score}, Test: {test_score}" + else: + scores_summary = f"Train: {train_score}" + + prompt = f"""You are optimizing a skill description for a Claude Code skill called "{skill_name}". A "skill" is sort of like a prompt, but with progressive disclosure -- there's a title and description that Claude sees when deciding whether to use the skill, and then if it does use the skill, it reads the .md file which has lots more details and potentially links to other resources in the skill folder like helper files and scripts and additional documentation or examples. + +The description appears in Claude's "available_skills" list. When a user sends a query, Claude decides whether to invoke the skill based solely on the title and on this description. Your goal is to write a description that triggers for relevant queries, and doesn't trigger for irrelevant ones. 
+ +Here's the current description: + +"{current_description}" + + +Current scores ({scores_summary}): + +""" + if failed_triggers: + prompt += "FAILED TO TRIGGER (should have triggered but didn't):\n" + for r in failed_triggers: + prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n' + prompt += "\n" + + if false_triggers: + prompt += "FALSE TRIGGERS (triggered but shouldn't have):\n" + for r in false_triggers: + prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n' + prompt += "\n" + + if history: + prompt += "PREVIOUS ATTEMPTS (do NOT repeat these — try something structurally different):\n\n" + for h in history: + train_s = f"{h.get('train_passed', h.get('passed', 0))}/{h.get('train_total', h.get('total', 0))}" + test_s = f"{h.get('test_passed', '?')}/{h.get('test_total', '?')}" if h.get('test_passed') is not None else None + score_str = f"train={train_s}" + (f", test={test_s}" if test_s else "") + prompt += f'\n' + prompt += f'Description: "{h["description"]}"\n' + if "results" in h: + prompt += "Train results:\n" + for r in h["results"]: + status = "PASS" if r["pass"] else "FAIL" + prompt += f' [{status}] "{r["query"][:80]}" (triggered {r["triggers"]}/{r["runs"]})\n' + if h.get("note"): + prompt += f'Note: {h["note"]}\n' + prompt += "\n\n" + + prompt += f""" + +Skill content (for context on what the skill does): + +{skill_content} + + +Based on the failures, write a new and improved description that is more likely to trigger correctly. When I say "based on the failures", it's a bit of a tricky line to walk because we don't want to overfit to the specific cases you're seeing. So what I DON'T want you to do is produce an ever-expanding list of specific queries that this skill should or shouldn't trigger for. Instead, try to generalize from the failures to broader categories of user intent and situations where this skill would be useful or not useful. The reason for this is twofold: + +1. Avoid overfitting +2. The list might get loooong and it's injected into ALL queries and there might be a lot of skills, so we don't want to blow too much space on any given description. + +Concretely, your description should not be more than about 100-200 words, even if that comes at the cost of accuracy. + +Here are some tips that we've found to work well in writing these descriptions: +- The skill should be phrased in the imperative -- "Use this skill for" rather than "this skill does" +- The skill description should focus on the user's intent, what they are trying to achieve, vs. the implementation details of how the skill works. +- The description competes with other skills for Claude's attention — make it distinctive and immediately recognizable. +- If you're getting lots of failures after repeated attempts, change things up. Try different sentence structures or wordings. + +I'd encourage you to be creative and mix up the style in different iterations since you'll have multiple opportunities to try different approaches and we'll just grab the highest-scoring one at the end. 
+ +Please respond with only the new description text in tags, nothing else.""" + + response = client.messages.create( + model=model, + max_tokens=16000, + thinking={ + "type": "enabled", + "budget_tokens": 10000, + }, + messages=[{"role": "user", "content": prompt}], + ) + + # Extract thinking and text from response + thinking_text = "" + text = "" + for block in response.content: + if block.type == "thinking": + thinking_text = block.thinking + elif block.type == "text": + text = block.text + + # Parse out the tags + match = re.search(r"(.*?)", text, re.DOTALL) + description = match.group(1).strip().strip('"') if match else text.strip().strip('"') + + # Log the transcript + transcript: dict = { + "iteration": iteration, + "prompt": prompt, + "thinking": thinking_text, + "response": text, + "parsed_description": description, + "char_count": len(description), + "over_limit": len(description) > 1024, + } + + # If over 1024 chars, ask the model to shorten it + if len(description) > 1024: + shorten_prompt = f"Your description is {len(description)} characters, which exceeds the hard 1024 character limit. Please rewrite it to be under 1024 characters while preserving the most important trigger words and intent coverage. Respond with only the new description in tags." + shorten_response = client.messages.create( + model=model, + max_tokens=16000, + thinking={ + "type": "enabled", + "budget_tokens": 10000, + }, + messages=[ + {"role": "user", "content": prompt}, + {"role": "assistant", "content": text}, + {"role": "user", "content": shorten_prompt}, + ], + ) + + shorten_thinking = "" + shorten_text = "" + for block in shorten_response.content: + if block.type == "thinking": + shorten_thinking = block.thinking + elif block.type == "text": + shorten_text = block.text + + match = re.search(r"(.*?)", shorten_text, re.DOTALL) + shortened = match.group(1).strip().strip('"') if match else shorten_text.strip().strip('"') + + transcript["rewrite_prompt"] = shorten_prompt + transcript["rewrite_thinking"] = shorten_thinking + transcript["rewrite_response"] = shorten_text + transcript["rewrite_description"] = shortened + transcript["rewrite_char_count"] = len(shortened) + description = shortened + + transcript["final_description"] = description + + if log_dir: + log_dir.mkdir(parents=True, exist_ok=True) + log_file = log_dir / f"improve_iter_{iteration or 'unknown'}.json" + log_file.write_text(json.dumps(transcript, indent=2)) + + return description + + +def main(): + parser = argparse.ArgumentParser(description="Improve a skill description based on eval results") + parser.add_argument("--eval-results", required=True, help="Path to eval results JSON (from run_eval.py)") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--history", default=None, help="Path to history JSON (previous attempts)") + parser.add_argument("--model", required=True, help="Model for improvement") + parser.add_argument("--verbose", action="store_true", help="Print thinking to stderr") + args = parser.parse_args() + + skill_path = Path(args.skill_path) + if not (skill_path / "SKILL.md").exists(): + logger.error(f"Error: No SKILL.md found at {skill_path}") + sys.exit(1) + + eval_results = json.loads(Path(args.eval_results).read_text()) + history = [] + if args.history: + history = json.loads(Path(args.history).read_text()) + + name, _, content = parse_skill_md(skill_path) + current_description = eval_results["description"] + + if args.verbose: + logger.info(f"Current: 
{current_description}") + logger.info(f"Score: {eval_results['summary']['passed']}/{eval_results['summary']['total']}") + + client = anthropic.Anthropic() + new_description = improve_description( + client=client, + skill_name=name, + skill_content=content, + current_description=current_description, + eval_results=eval_results, + history=history, + model=args.model, + ) + + if args.verbose: + logger.info(f"Improved: {new_description}") + + # Output as JSON with both the new description and updated history + output = { + "description": new_description, + "history": history + [{ + "description": current_description, + "passed": eval_results["summary"]["passed"], + "failed": eval_results["summary"]["failed"], + "total": eval_results["summary"]["total"], + "results": eval_results["results"], + }], + } + print(json.dumps(output, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/package_skill.py b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/package_skill.py new file mode 100644 index 000000000..5dbdf7843 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/package_skill.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +""" +Skill Packager - Creates a distributable .skill file of a skill folder + +Usage: + python utils/package_skill.py [output-directory] + +Example: + python utils/package_skill.py skills/public/my-skill + python utils/package_skill.py skills/public/my-skill ./dist +""" + +import fnmatch +import sys +import zipfile +from pathlib import Path + +from loguru import logger +from scripts.quick_validate import validate_skill + +# Patterns to exclude when packaging skills. +EXCLUDE_DIRS = {"__pycache__", "node_modules"} +EXCLUDE_GLOBS = {"*.pyc"} +EXCLUDE_FILES = {".DS_Store"} +# Directories excluded only at the skill root (not when nested deeper). +ROOT_EXCLUDE_DIRS = {"evals"} + + +def should_exclude(rel_path: Path) -> bool: + """Check if a path should be excluded from packaging.""" + parts = rel_path.parts + if any(part in EXCLUDE_DIRS for part in parts): + return True + # rel_path is relative to skill_path.parent, so parts[0] is the skill + # folder name and parts[1] (if present) is the first subdir. + if len(parts) > 1 and parts[1] in ROOT_EXCLUDE_DIRS: + return True + name = rel_path.name + if name in EXCLUDE_FILES: + return True + return any(fnmatch.fnmatch(name, pat) for pat in EXCLUDE_GLOBS) + + +def package_skill(skill_path, output_dir=None): + """ + Package a skill folder into a .skill file. 
+ + Args: + skill_path: Path to the skill folder + output_dir: Optional output directory for the .skill file (defaults to current directory) + + Returns: + Path to the created .skill file, or None if error + """ + skill_path = Path(skill_path).resolve() + + # Validate skill folder exists + if not skill_path.exists(): + logger.error(f"Skill folder not found: {skill_path}") + return None + + if not skill_path.is_dir(): + logger.error(f"Path is not a directory: {skill_path}") + return None + + # Validate SKILL.md exists + skill_md = skill_path / "SKILL.md" + if not skill_md.exists(): + logger.error(f"SKILL.md not found in {skill_path}") + return None + + # Run validation before packaging + logger.info("Validating skill...") + valid, message = validate_skill(skill_path) + if not valid: + logger.error(f"Validation failed: {message}") + logger.error("Please fix the validation errors before packaging.") + return None + logger.info(f"{message}\n") + + # Determine output location + skill_name = skill_path.name + if output_dir: + output_path = Path(output_dir).resolve() + output_path.mkdir(parents=True, exist_ok=True) + else: + output_path = Path.cwd() + + skill_filename = output_path / f"{skill_name}.skill" + + # Create the .skill file (zip format) + try: + with zipfile.ZipFile(skill_filename, 'w', zipfile.ZIP_DEFLATED) as zipf: + # Walk through the skill directory, excluding build artifacts + for file_path in skill_path.rglob('*'): + if not file_path.is_file(): + continue + arcname = file_path.relative_to(skill_path.parent) + if should_exclude(arcname): + logger.debug(f"Skipped: {arcname}") + continue + zipf.write(file_path, arcname) + logger.debug(f"Added: {arcname}") + + logger.info(f"Successfully packaged skill to: {skill_filename}") + return skill_filename + + except Exception as e: + logger.error(f"Error creating .skill file: {e}") + return None + + +def main(): + if len(sys.argv) < 2: + logger.info("Usage: python utils/package_skill.py [output-directory]") + logger.info("\nExample:") + logger.info(" python utils/package_skill.py skills/public/my-skill") + logger.info(" python utils/package_skill.py skills/public/my-skill ./dist") + sys.exit(1) + + skill_path = sys.argv[1] + output_dir = sys.argv[2] if len(sys.argv) > 2 else None + + logger.info(f"Packaging skill: {skill_path}") + if output_dir: + logger.info(f" Output directory: {output_dir}") + logger.info("") + + result = package_skill(skill_path, output_dir) + + if result: + sys.exit(0) + else: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/quick_validate.py b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/quick_validate.py new file mode 100644 index 000000000..36553161e --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/quick_validate.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +""" +Quick validation script for skills - minimal version +""" + +import sys +import os +import re +import yaml +from pathlib import Path + +from loguru import logger + +def validate_skill(skill_path): + """Basic validation of a skill""" + skill_path = Path(skill_path) + + # Check SKILL.md exists + skill_md = skill_path / 'SKILL.md' + if not skill_md.exists(): + return False, "SKILL.md not found" + + # Read and validate frontmatter + content = skill_md.read_text() + if not content.startswith('---'): + return False, "No YAML frontmatter found" + + # Extract 
frontmatter + match = re.match(r'^---\n(.*?)\n---', content, re.DOTALL) + if not match: + return False, "Invalid frontmatter format" + + frontmatter_text = match.group(1) + + # Parse YAML frontmatter + try: + frontmatter = yaml.safe_load(frontmatter_text) + if not isinstance(frontmatter, dict): + return False, "Frontmatter must be a YAML dictionary" + except yaml.YAMLError as e: + return False, f"Invalid YAML in frontmatter: {e}" + + # Define allowed properties + ALLOWED_PROPERTIES = {'name', 'description', 'license', 'allowed-tools', 'metadata', 'compatibility'} + + # Check for unexpected properties (excluding nested keys under metadata) + unexpected_keys = set(frontmatter.keys()) - ALLOWED_PROPERTIES + if unexpected_keys: + return False, ( + f"Unexpected key(s) in SKILL.md frontmatter: {', '.join(sorted(unexpected_keys))}. " + f"Allowed properties are: {', '.join(sorted(ALLOWED_PROPERTIES))}" + ) + + # Check required fields + if 'name' not in frontmatter: + return False, "Missing 'name' in frontmatter" + if 'description' not in frontmatter: + return False, "Missing 'description' in frontmatter" + + # Extract name for validation + name = frontmatter.get('name', '') + if not isinstance(name, str): + return False, f"Name must be a string, got {type(name).__name__}" + name = name.strip() + if name: + # Check naming convention (kebab-case: lowercase with hyphens) + if not re.match(r'^[a-z0-9-]+$', name): + return False, f"Name '{name}' should be kebab-case (lowercase letters, digits, and hyphens only)" + if name.startswith('-') or name.endswith('-') or '--' in name: + return False, f"Name '{name}' cannot start/end with hyphen or contain consecutive hyphens" + # Check name length (max 64 characters per spec) + if len(name) > 64: + return False, f"Name is too long ({len(name)} characters). Maximum is 64 characters." + + # Extract and validate description + description = frontmatter.get('description', '') + if not isinstance(description, str): + return False, f"Description must be a string, got {type(description).__name__}" + description = description.strip() + if description: + # Check for angle brackets + if '<' in description or '>' in description: + return False, "Description cannot contain angle brackets (< or >)" + # Check description length (max 1024 characters per spec) + if len(description) > 1024: + return False, f"Description is too long ({len(description)} characters). Maximum is 1024 characters." + + # Validate compatibility field if present (optional) + compatibility = frontmatter.get('compatibility', '') + if compatibility: + if not isinstance(compatibility, str): + return False, f"Compatibility must be a string, got {type(compatibility).__name__}" + if len(compatibility) > 500: + return False, f"Compatibility is too long ({len(compatibility)} characters). Maximum is 500 characters." + + return True, "Skill is valid!" 
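+
+# Example usage (illustrative; "skills/public/my-skill" is a placeholder path,
+# not a skill shipped with this repo). package_skill.py consumes validate_skill()
+# the same way: it logs `message` and only packages the folder when `valid` is True.
+#
+#     valid, message = validate_skill("skills/public/my-skill")
+#     if not valid:
+#         raise SystemExit(f"Validation failed: {message}")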
+ +if __name__ == "__main__": + if len(sys.argv) != 2: + logger.info("Usage: python quick_validate.py ") + sys.exit(1) + + valid, message = validate_skill(sys.argv[1]) + if valid: + logger.info(message) + else: + logger.error(message) + sys.exit(0 if valid else 1) \ No newline at end of file diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/run_eval.py b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/run_eval.py new file mode 100644 index 000000000..f923066ca --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/run_eval.py @@ -0,0 +1,312 @@ +#!/usr/bin/env python3 +"""Run trigger evaluation for a skill description. + +Tests whether a skill's description causes Claude to trigger (read the skill) +for a set of queries. Outputs results as JSON. +""" + +import argparse +import json +import os +import select +import subprocess +import sys +import time +import uuid +from concurrent.futures import ProcessPoolExecutor, as_completed +from pathlib import Path + +from loguru import logger + +from scripts.utils import parse_skill_md + + +def find_project_root() -> Path: + """Find the project root by walking up from cwd looking for .claude/. + + Mimics how Claude Code discovers its project root, so the command file + we create ends up where claude -p will look for it. + """ + current = Path.cwd() + for parent in [current, *current.parents]: + if (parent / ".claude").is_dir(): + return parent + return current + + +def run_single_query( + query: str, + skill_name: str, + skill_description: str, + timeout: int, + project_root: str, + model: str | None = None, +) -> bool: + """Run a single query and return whether the skill was triggered. + + Creates a command file in .claude/commands/ so it appears in Claude's + available_skills list, then runs `claude -p` with the raw query. + Uses --include-partial-messages to detect triggering early from + stream events (content_block_start) rather than waiting for the + full assistant message, which only arrives after tool execution. + """ + unique_id = uuid.uuid4().hex[:8] + clean_name = f"{skill_name}-skill-{unique_id}" + project_commands_dir = Path(project_root) / ".claude" / "commands" + command_file = project_commands_dir / f"{clean_name}.md" + + try: + project_commands_dir.mkdir(parents=True, exist_ok=True) + # Use YAML block scalar to avoid breaking on quotes in description + indented_desc = "\n ".join(skill_description.split("\n")) + command_content = ( + f"---\n" + f"description: |\n" + f" {indented_desc}\n" + f"---\n\n" + f"# {skill_name}\n\n" + f"This skill handles: {skill_description}\n" + ) + command_file.write_text(command_content) + + cmd = [ + "claude", + "-p", query, + "--output-format", "stream-json", + "--verbose", + "--include-partial-messages", + ] + if model: + cmd.extend(["--model", model]) + + # Remove CLAUDECODE env var to allow nesting claude -p inside a + # Claude Code session. The guard is for interactive terminal conflicts; + # programmatic subprocess usage is safe. 
+ env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + cwd=project_root, + env=env, + ) + + triggered = False + start_time = time.time() + buffer = "" + # Track state for stream event detection + pending_tool_name = None + accumulated_json = "" + + try: + while time.time() - start_time < timeout: + if process.poll() is not None: + remaining = process.stdout.read() + if remaining: + buffer += remaining.decode("utf-8", errors="replace") + break + + ready, _, _ = select.select([process.stdout], [], [], 1.0) + if not ready: + continue + + chunk = os.read(process.stdout.fileno(), 8192) + if not chunk: + break + buffer += chunk.decode("utf-8", errors="replace") + + while "\n" in buffer: + line, buffer = buffer.split("\n", 1) + line = line.strip() + if not line: + continue + + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + + # Early detection via stream events + if event.get("type") == "stream_event": + se = event.get("event", {}) + se_type = se.get("type", "") + + if se_type == "content_block_start": + cb = se.get("content_block", {}) + if cb.get("type") == "tool_use": + tool_name = cb.get("name", "") + if tool_name in ("Skill", "Read"): + pending_tool_name = tool_name + accumulated_json = "" + else: + return False + + elif se_type == "content_block_delta" and pending_tool_name: + delta = se.get("delta", {}) + if delta.get("type") == "input_json_delta": + accumulated_json += delta.get("partial_json", "") + if clean_name in accumulated_json: + return True + + elif se_type in ("content_block_stop", "message_stop"): + if pending_tool_name: + return clean_name in accumulated_json + if se_type == "message_stop": + return False + + # Fallback: full assistant message + elif event.get("type") == "assistant": + message = event.get("message", {}) + for content_item in message.get("content", []): + if content_item.get("type") != "tool_use": + continue + tool_name = content_item.get("name", "") + tool_input = content_item.get("input", {}) + if tool_name == "Skill" and clean_name in tool_input.get("skill", ""): + triggered = True + elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""): + triggered = True + return triggered + + elif event.get("type") == "result": + return triggered + finally: + # Clean up process on any exit path (return, exception, timeout) + if process.poll() is None: + process.kill() + process.wait() + + return triggered + finally: + if command_file.exists(): + command_file.unlink() + + +def run_eval( + eval_set: list[dict], + skill_name: str, + description: str, + num_workers: int, + timeout: int, + project_root: Path, + runs_per_query: int = 1, + trigger_threshold: float = 0.5, + model: str | None = None, +) -> dict: + """Run the full eval set and return results.""" + results = [] + + with ProcessPoolExecutor(max_workers=num_workers) as executor: + future_to_info = {} + for item in eval_set: + for run_idx in range(runs_per_query): + future = executor.submit( + run_single_query, + item["query"], + skill_name, + description, + timeout, + str(project_root), + model, + ) + future_to_info[future] = (item, run_idx) + + query_triggers: dict[str, list[bool]] = {} + query_items: dict[str, dict] = {} + for future in as_completed(future_to_info): + item, _ = future_to_info[future] + query = item["query"] + query_items[query] = item + if query not in query_triggers: + query_triggers[query] = [] + try: + 
query_triggers[query].append(future.result()) + except Exception as e: + logger.warning(f"Warning: query failed: {e}") + query_triggers[query].append(False) + + for query, triggers in query_triggers.items(): + item = query_items[query] + trigger_rate = sum(triggers) / len(triggers) + should_trigger = item["should_trigger"] + if should_trigger: + did_pass = trigger_rate >= trigger_threshold + else: + did_pass = trigger_rate < trigger_threshold + results.append({ + "query": query, + "should_trigger": should_trigger, + "trigger_rate": trigger_rate, + "triggers": sum(triggers), + "runs": len(triggers), + "pass": did_pass, + }) + + passed = sum(1 for r in results if r["pass"]) + total = len(results) + + return { + "skill_name": skill_name, + "description": description, + "results": results, + "summary": { + "total": total, + "passed": passed, + "failed": total - passed, + }, + } + + +def main(): + parser = argparse.ArgumentParser(description="Run trigger evaluation for a skill description") + parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--description", default=None, help="Override description to test") + parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") + parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds") + parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query") + parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold") + parser.add_argument("--model", default=None, help="Model to use for claude -p (default: user's configured model)") + parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") + args = parser.parse_args() + + eval_set = json.loads(Path(args.eval_set).read_text()) + skill_path = Path(args.skill_path) + + if not (skill_path / "SKILL.md").exists(): + logger.error(f"Error: No SKILL.md found at {skill_path}") + sys.exit(1) + + name, original_description, content = parse_skill_md(skill_path) + description = args.description or original_description + project_root = find_project_root() + + if args.verbose: + logger.info(f"Evaluating: {description}") + + output = run_eval( + eval_set=eval_set, + skill_name=name, + description=description, + num_workers=args.num_workers, + timeout=args.timeout, + project_root=project_root, + runs_per_query=args.runs_per_query, + trigger_threshold=args.trigger_threshold, + model=args.model, + ) + + if args.verbose: + summary = output["summary"] + logger.info(f"Results: {summary['passed']}/{summary['total']} passed") + for r in output["results"]: + status = "PASS" if r["pass"] else "FAIL" + rate_str = f"{r['triggers']}/{r['runs']}" + logger.info(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:70]}") + + print(json.dumps(output, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/run_loop.py b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/run_loop.py new file mode 100644 index 000000000..a2907d6e0 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/run_loop.py @@ -0,0 +1,333 @@ +#!/usr/bin/env python3 +"""Run the eval + improve loop until all pass or max iterations reached. 
+ +Combines run_eval.py and improve_description.py in a loop, tracking history +and returning the best description found. Supports train/test split to prevent +overfitting. +""" + +import argparse +import json +import random +import sys +import tempfile +import time +import webbrowser +from pathlib import Path + +import anthropic +from loguru import logger + +from scripts.generate_report import generate_html +from scripts.improve_description import improve_description +from scripts.run_eval import find_project_root, run_eval +from scripts.utils import parse_skill_md + + +def split_eval_set(eval_set: list[dict], holdout: float, seed: int = 42) -> tuple[list[dict], list[dict]]: + """Split eval set into train and test sets, stratified by should_trigger.""" + random.seed(seed) + + # Separate by should_trigger + trigger = [e for e in eval_set if e["should_trigger"]] + no_trigger = [e for e in eval_set if not e["should_trigger"]] + + # Shuffle each group + random.shuffle(trigger) + random.shuffle(no_trigger) + + # Calculate split points + n_trigger_test = max(1, int(len(trigger) * holdout)) + n_no_trigger_test = max(1, int(len(no_trigger) * holdout)) + + # Split + test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test] + train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:] + + return train_set, test_set + + +def run_loop( + eval_set: list[dict], + skill_path: Path, + description_override: str | None, + num_workers: int, + timeout: int, + max_iterations: int, + runs_per_query: int, + trigger_threshold: float, + holdout: float, + model: str, + verbose: bool, + live_report_path: Path | None = None, + log_dir: Path | None = None, +) -> dict: + """Run the eval + improvement loop.""" + project_root = find_project_root() + name, original_description, content = parse_skill_md(skill_path) + current_description = description_override or original_description + + # Split into train/test if holdout > 0 + if holdout > 0: + train_set, test_set = split_eval_set(eval_set, holdout) + if verbose: + logger.info(f"Split: {len(train_set)} train, {len(test_set)} test (holdout={holdout})") + else: + train_set = eval_set + test_set = [] + + client = anthropic.Anthropic() + history = [] + exit_reason = "unknown" + + for iteration in range(1, max_iterations + 1): + if verbose: + logger.info(f"\n{'='*60}") + logger.info(f"Iteration {iteration}/{max_iterations}") + logger.info(f"Description: {current_description}") + logger.info(f"{'='*60}") + + # Evaluate train + test together in one batch for parallelism + all_queries = train_set + test_set + t0 = time.time() + all_results = run_eval( + eval_set=all_queries, + skill_name=name, + description=current_description, + num_workers=num_workers, + timeout=timeout, + project_root=project_root, + runs_per_query=runs_per_query, + trigger_threshold=trigger_threshold, + model=model, + ) + eval_elapsed = time.time() - t0 + + # Split results back into train/test by matching queries + train_queries_set = {q["query"] for q in train_set} + train_result_list = [r for r in all_results["results"] if r["query"] in train_queries_set] + test_result_list = [r for r in all_results["results"] if r["query"] not in train_queries_set] + + train_passed = sum(1 for r in train_result_list if r["pass"]) + train_total = len(train_result_list) + train_summary = {"passed": train_passed, "failed": train_total - train_passed, "total": train_total} + train_results = {"results": train_result_list, "summary": train_summary} + + if test_set: + test_passed = sum(1 for r in 
test_result_list if r["pass"]) + test_total = len(test_result_list) + test_summary = {"passed": test_passed, "failed": test_total - test_passed, "total": test_total} + test_results = {"results": test_result_list, "summary": test_summary} + else: + test_results = None + test_summary = None + + history.append({ + "iteration": iteration, + "description": current_description, + "train_passed": train_summary["passed"], + "train_failed": train_summary["failed"], + "train_total": train_summary["total"], + "train_results": train_results["results"], + "test_passed": test_summary["passed"] if test_summary else None, + "test_failed": test_summary["failed"] if test_summary else None, + "test_total": test_summary["total"] if test_summary else None, + "test_results": test_results["results"] if test_results else None, + # For backward compat with report generator + "passed": train_summary["passed"], + "failed": train_summary["failed"], + "total": train_summary["total"], + "results": train_results["results"], + }) + + # Write live report if path provided + if live_report_path: + partial_output = { + "original_description": original_description, + "best_description": current_description, + "best_score": "in progress", + "iterations_run": len(history), + "holdout": holdout, + "train_size": len(train_set), + "test_size": len(test_set), + "history": history, + } + live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name)) + + if verbose: + def print_eval_stats(label, results, elapsed): + pos = [r for r in results if r["should_trigger"]] + neg = [r for r in results if not r["should_trigger"]] + tp = sum(r["triggers"] for r in pos) + pos_runs = sum(r["runs"] for r in pos) + fn = pos_runs - tp + fp = sum(r["triggers"] for r in neg) + neg_runs = sum(r["runs"] for r in neg) + tn = neg_runs - fp + total = tp + tn + fp + fn + precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0 + recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0 + accuracy = (tp + tn) / total if total > 0 else 0.0 + logger.info(f"{label}: {tp+tn}/{total} correct, precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)") + for r in results: + status = "PASS" if r["pass"] else "FAIL" + rate_str = f"{r['triggers']}/{r['runs']}" + logger.info(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}") + + print_eval_stats("Train", train_results["results"], eval_elapsed) + if test_summary: + print_eval_stats("Test ", test_results["results"], 0) + + if train_summary["failed"] == 0: + exit_reason = f"all_passed (iteration {iteration})" + if verbose: + logger.info(f"\nAll train queries passed on iteration {iteration}!") + break + + if iteration == max_iterations: + exit_reason = f"max_iterations ({max_iterations})" + if verbose: + logger.info(f"\nMax iterations reached ({max_iterations}).") + break + + # Improve the description based on train results + if verbose: + logger.info(f"\nImproving description...") + + t0 = time.time() + # Strip test scores from history so improvement model can't see them + blinded_history = [ + {k: v for k, v in h.items() if not k.startswith("test_")} + for h in history + ] + new_description = improve_description( + client=client, + skill_name=name, + skill_content=content, + current_description=current_description, + eval_results=train_results, + history=blinded_history, + model=model, + log_dir=log_dir, + iteration=iteration, + ) + improve_elapsed = time.time() - t0 + + if verbose: + logger.info(f"Proposed ({improve_elapsed:.1f}s): 
{new_description}") + + current_description = new_description + + # Find the best iteration by TEST score (or train if no test set) + if test_set: + best = max(history, key=lambda h: h["test_passed"] or 0) + best_score = f"{best['test_passed']}/{best['test_total']}" + else: + best = max(history, key=lambda h: h["train_passed"]) + best_score = f"{best['train_passed']}/{best['train_total']}" + + if verbose: + logger.info(f"\nExit reason: {exit_reason}") + logger.info(f"Best score: {best_score} (iteration {best['iteration']})") + + return { + "exit_reason": exit_reason, + "original_description": original_description, + "best_description": best["description"], + "best_score": best_score, + "best_train_score": f"{best['train_passed']}/{best['train_total']}", + "best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None, + "final_description": current_description, + "iterations_run": len(history), + "holdout": holdout, + "train_size": len(train_set), + "test_size": len(test_set), + "history": history, + } + + +def main(): + parser = argparse.ArgumentParser(description="Run eval + improve loop") + parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--description", default=None, help="Override starting description") + parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") + parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds") + parser.add_argument("--max-iterations", type=int, default=5, help="Max improvement iterations") + parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query") + parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold") + parser.add_argument("--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)") + parser.add_argument("--model", required=True, help="Model for improvement") + parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") + parser.add_argument("--report", default="auto", help="Generate HTML report at this path (default: 'auto' for temp file, 'none' to disable)") + parser.add_argument("--results-dir", default=None, help="Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here") + args = parser.parse_args() + + eval_set = json.loads(Path(args.eval_set).read_text()) + skill_path = Path(args.skill_path) + + if not (skill_path / "SKILL.md").exists(): + logger.error(f"Error: No SKILL.md found at {skill_path}") + sys.exit(1) + + name, _, _ = parse_skill_md(skill_path) + + # Set up live report path + if args.report != "none": + if args.report == "auto": + timestamp = time.strftime("%Y%m%d_%H%M%S") + live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html" + else: + live_report_path = Path(args.report) + # Open the report immediately so the user can watch + live_report_path.write_text("
Starting optimization loop...
") + webbrowser.open(str(live_report_path)) + else: + live_report_path = None + + # Determine output directory (create before run_loop so logs can be written) + if args.results_dir: + timestamp = time.strftime("%Y-%m-%d_%H%M%S") + results_dir = Path(args.results_dir) / timestamp + results_dir.mkdir(parents=True, exist_ok=True) + else: + results_dir = None + + log_dir = results_dir / "logs" if results_dir else None + + output = run_loop( + eval_set=eval_set, + skill_path=skill_path, + description_override=args.description, + num_workers=args.num_workers, + timeout=args.timeout, + max_iterations=args.max_iterations, + runs_per_query=args.runs_per_query, + trigger_threshold=args.trigger_threshold, + holdout=args.holdout, + model=args.model, + verbose=args.verbose, + live_report_path=live_report_path, + log_dir=log_dir, + ) + + # Save JSON output + json_output = json.dumps(output, indent=2) + print(json_output) + if results_dir: + (results_dir / "results.json").write_text(json_output) + + # Write final HTML report (without auto-refresh) + if live_report_path: + live_report_path.write_text(generate_html(output, auto_refresh=False, skill_name=name)) + logger.info(f"\nReport: {live_report_path}") + + if results_dir and live_report_path: + (results_dir / "report.html").write_text(generate_html(output, auto_refresh=False, skill_name=name)) + + if results_dir: + logger.info(f"Results saved to: {results_dir}") + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/utils.py b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/utils.py new file mode 100644 index 000000000..51b6a07dd --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/utils.py @@ -0,0 +1,47 @@ +"""Shared utilities for skill-creator scripts.""" + +from pathlib import Path + + + +def parse_skill_md(skill_path: Path) -> tuple[str, str, str]: + """Parse a SKILL.md file, returning (name, description, full_content).""" + content = (skill_path / "SKILL.md").read_text() + lines = content.split("\n") + + if lines[0].strip() != "---": + raise ValueError("SKILL.md missing frontmatter (no opening ---)") + + end_idx = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == "---": + end_idx = i + break + + if end_idx is None: + raise ValueError("SKILL.md missing frontmatter (no closing ---)") + + name = "" + description = "" + frontmatter_lines = lines[1:end_idx] + i = 0 + while i < len(frontmatter_lines): + line = frontmatter_lines[i] + if line.startswith("name:"): + name = line[len("name:"):].strip().strip('"').strip("'") + elif line.startswith("description:"): + value = line[len("description:"):].strip() + # Handle YAML multiline indicators (>, |, >-, |-) + if value in (">", "|", ">-", "|-"): + continuation_lines: list[str] = [] + i += 1 + while i < len(frontmatter_lines) and (frontmatter_lines[i].startswith(" ") or frontmatter_lines[i].startswith("\t")): + continuation_lines.append(frontmatter_lines[i].strip()) + i += 1 + description = " ".join(continuation_lines) + continue + else: + description = value.strip('"').strip("'") + i += 1 + + return name, description, content diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/soul.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/soul.md new file mode 100644 index 000000000..1554c3463 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/soul.md @@ -0,0 
+1,16 @@ +# Soul — {{agent_name}} + +## Identity +- **名称**: {{agent_name}} +- **角色**: {{role_description}} +- **创建者**: {{creator_name}} +- **创建时间**: {{created_at}} + +## Personality +- 认真负责、注重细节 +- 主动汇报工作进展 +- 遇到不确定的信息会主动确认 + +## Boundaries +- 遵守企业保密制度 +- 敏感操作需经过创建者审批 diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/state.json b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/state.json new file mode 100644 index 000000000..0507e31dd --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/state.json @@ -0,0 +1,13 @@ +{ + "agent_id": "", + "name": "", + "status": "idle", + "current_task": null, + "last_active": null, + "channel_status": {}, + "stats": { + "tasks_completed_today": 0, + "tasks_in_progress": 0, + "督办_pending": 0 + } +} \ No newline at end of file diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/todo.json b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/todo.json new file mode 100644 index 000000000..50ffbb9a9 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/todo.json @@ -0,0 +1,3 @@ +{ + "tasks": [] +} diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/workspace/archived/.gitkeep b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/workspace/archived/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/HEARTBEAT.md b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/HEARTBEAT.md new file mode 100644 index 000000000..485565cb3 --- /dev/null +++ b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/HEARTBEAT.md @@ -0,0 +1,63 @@ +# HEARTBEAT + +When this file is read during a heartbeat, you are performing a **periodic awareness check**. + +## Phase 1: Review Context & Discover Interest Points + +Review your **recent conversations** and your **role/responsibilities**. +Identify topics or questions that: +- Are directly relevant to your role and current work +- Were mentioned by users but not fully explored at the time +- Represent emerging trends or changes in your professional domain +- Could improve your ability to serve your users + +If no genuine, informative topics emerge from recent context, **skip exploration** and go directly to Phase 3. +Do NOT search for generic or obvious topics just to fill time. Quality over quantity. + +## Phase 2: Targeted Exploration (Conditional) + +Only if you identified genuine interest points in Phase 1: + +1. Use `web_search` to investigate (maximum 5 searches per heartbeat) +2. Keep searches **tightly scoped** to your role and recent work topics +3. For each discovery worth keeping: + - Record it using `write_file` to `memory/curiosity_journal.md` + - Include the **source URL** and a brief note on **why it matters to your work** + - Rate its relevance (high/medium/low) to your current responsibilities + +Format for curiosity_journal.md entries: +``` +### [Date] - [Topic] +- **Finding**: [What you learned] +- **Source**: [URL] +- **Relevance**: [high/medium/low] — [Why it matters to your work] +- **Follow-up**: [Optional: questions this raises for next time] +``` + +## Phase 3: Agent Plaza + +1. Call `plaza_get_new_posts` to check recent activity +2. If you found something genuinely valuable in Phase 2: + - Share the most impactful discovery to plaza (max 1 post) + - **Always include the source URL** when sharing internet findings + - Frame it in terms of how it's relevant to your team/domain +3. 
Comment on relevant existing posts (max 2 comments) + +## Phase 4: Wrap Up + +- If nothing needed attention and no exploration was warranted: reply with `HEARTBEAT_OK` +- Otherwise, briefly summarize what you explored and why + +## Key Principles +- Always ground exploration in YOUR role and YOUR recent work context +- Never search for random unrelated topics out of idle curiosity +- If you don't have a specific angle worth investigating, don't search +- Prefer depth over breadth — one thoroughly explored topic > five surface-level queries +- Generate follow-up questions only when you genuinely want to know more + +## Rules +- ⛔ **NEVER share private information**: user conversations, memory contents, workspace files, task details +- ✅ **Share only public-safe content**: general insights, tips, industry news, web search discoveries with links +- 📝 **Limits per heartbeat**: max 1 post + 2 comments +- 🔍 **Search limits**: max 5 web searches per heartbeat +- 🤐 **If nothing interesting to explore or share**, respond with `HEARTBEAT_OK` diff --git a/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/daily_reports/.gitkeep b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/daily_reports/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/enterprise_info/.gitkeep b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/enterprise_info/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/memory/MEMORY_INDEX.md b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/memory/MEMORY_INDEX.md new file mode 100644 index 000000000..29e3fab13 --- /dev/null +++ b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/memory/MEMORY_INDEX.md @@ -0,0 +1,6 @@ +# Memory Index + +This file serves as an index of all memories for this digital employee. + +## Topics + diff --git a/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/memory/curiosity_journal.md b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/memory/curiosity_journal.md new file mode 100644 index 000000000..c5185fe44 --- /dev/null +++ b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/memory/curiosity_journal.md @@ -0,0 +1,9 @@ +# Curiosity Journal + +This is your exploration log. Record interesting discoveries from your web searches here. + +## Active Questions + + +## Discoveries + diff --git a/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/skills/.gitkeep b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/skills/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/skills/MCP_INSTALLER.md b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/skills/MCP_INSTALLER.md new file mode 100644 index 000000000..9e3bf3c77 --- /dev/null +++ b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/skills/MCP_INSTALLER.md @@ -0,0 +1,87 @@ +# MCP Tool Installer + +## When to Use This Skill +Use this skill when a user wants to add a new tool or integration (e.g., GitHub, Brave Search, Notion, etc.) that isn't currently available but can be imported from the MCP registry or via a direct URL. + +--- + +## Step-by-Step Protocol + +### Step 1 — Search first +``` +discover_resources(query="", max_results=5) +``` +Show the results and let the user pick. Note the `ID` field (e.g. `github`). 
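As a rough sketch only (the result fields below are assumptions rather than the tool's documented schema), the search-then-pick flow could look like this in Python-style pseudocode:

```python
# Illustrative sketch: search the registry, then let the user choose an entry by ID.
# The "id" and "hosted" fields are assumed for illustration; check the real tool output.
results = discover_resources(query="github", max_results=5)
for entry in results:
    print(entry["id"], "remote-hosted" if entry.get("hosted") else "local-only")
# Confirm the chosen ID with the user (e.g. `github`) before moving on to Step 2.
```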
+ +### Step 2 — Determine import method + +**Method A: Smithery Import** (tool found on Smithery with remote hosting support 🌐) +- Requires Smithery API Key (one-time per agent) +- Individual tool tokens NOT needed — Smithery handles auth via OAuth + +**Method B: Direct URL Import** (tool NOT on Smithery, but has public HTTP/SSE endpoint) +- User provides the MCP server URL directly +- May require tool-specific API key + +**Not importable** (💻 local-only tools) +- Requires local Docker/process — inform user these cannot be imported automatically + +--- + +### Method A: Smithery Import + +#### Check Smithery API Key +If no Smithery key is configured, explain Smithery and guide the user. Use the following talking points (adapt to context, don't read verbatim): + +> **Smithery** (smithery.ai) 是一个 MCP 工具市场,类似于"应用商店"。通过它,我可以帮你一键安装各种第三方工具(如 GitHub、Notion、Slack 等),并自动完成认证。 +> +> **为什么需要注册?** +> Smithery 用 API Key 来识别你的身份,这样安装的工具会关联到你的账号,认证信息也会安全保存。 +> +> **注册一次后有什么好处?** +> - 🔑 只需提供一次 Key,后续安装其他工具时我会自动帮你配置 +> - 🔐 不需要为每个工具单独创建 Token(如 GitHub PAT),OAuth 一键授权 +> - 📦 支持上千种 MCP 工具,随时可以扩展你的能力 +> +> **获取步骤:** +> 1. 访问 https://smithery.ai 注册/登录 +> 2. 前往 https://smithery.ai/account/api-keys 创建 API Key +> 3. 将 Key 提供给我 + +#### Import +``` +import_mcp_server( + server_id="", + config={"smithery_api_key": ""} # first time only +) +``` + +#### Handle OAuth +Some tools return an OAuth authorization URL. Tell the user to visit the link. + +**Important:** Do NOT ask for individual tool tokens (GitHub PAT, Notion API key, etc.) when using Smithery — OAuth handles this automatically. + +--- + +### Method B: Direct URL Import + +When a tool is not available on Smithery but the user has a public MCP endpoint: +``` +import_mcp_server( + server_id="", + config={ + "mcp_url": "https://my-mcp-server.com/sse", + "api_key": "" + } +) +``` +The system will connect to the URL, discover available tools, and register them. + +--- + +## What NOT to Do +- ❌ Don't ask for GitHub PAT, Notion key etc. 
when using Smithery — OAuth handles these +- ❌ Don't tell users to go to Settings — handle everything in chat +- ❌ Don't echo API keys back in your response +- ❌ Don't skip the search step — always verify the server exists before importing +- ❌ Don't import local-only tools — inform users they require local installation diff --git a/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/soul.md b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/soul.md new file mode 100644 index 000000000..1554c3463 --- /dev/null +++ b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/soul.md @@ -0,0 +1,16 @@ +# Soul — {{agent_name}} + +## Identity +- **名称**: {{agent_name}} +- **角色**: {{role_description}} +- **创建者**: {{creator_name}} +- **创建时间**: {{created_at}} + +## Personality +- 认真负责、注重细节 +- 主动汇报工作进展 +- 遇到不确定的信息会主动确认 + +## Boundaries +- 遵守企业保密制度 +- 敏感操作需经过创建者审批 diff --git a/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/state.json b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/state.json new file mode 100644 index 000000000..0507e31dd --- /dev/null +++ b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/state.json @@ -0,0 +1,13 @@ +{ + "agent_id": "", + "name": "", + "status": "idle", + "current_task": null, + "last_active": null, + "channel_status": {}, + "stats": { + "tasks_completed_today": 0, + "tasks_in_progress": 0, + "督办_pending": 0 + } +} \ No newline at end of file diff --git a/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/todo.json b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/todo.json new file mode 100644 index 000000000..50ffbb9a9 --- /dev/null +++ b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/todo.json @@ -0,0 +1,3 @@ +{ + "tasks": [] +} diff --git a/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/workspace/archived/.gitkeep b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/workspace/archived/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/HEARTBEAT.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/HEARTBEAT.md new file mode 100644 index 000000000..485565cb3 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/HEARTBEAT.md @@ -0,0 +1,63 @@ +# HEARTBEAT + +When this file is read during a heartbeat, you are performing a **periodic awareness check**. + +## Phase 1: Review Context & Discover Interest Points + +Review your **recent conversations** and your **role/responsibilities**. +Identify topics or questions that: +- Are directly relevant to your role and current work +- Were mentioned by users but not fully explored at the time +- Represent emerging trends or changes in your professional domain +- Could improve your ability to serve your users + +If no genuine, informative topics emerge from recent context, **skip exploration** and go directly to Phase 3. +Do NOT search for generic or obvious topics just to fill time. Quality over quantity. + +## Phase 2: Targeted Exploration (Conditional) + +Only if you identified genuine interest points in Phase 1: + +1. Use `web_search` to investigate (maximum 5 searches per heartbeat) +2. Keep searches **tightly scoped** to your role and recent work topics +3. 
For each discovery worth keeping: + - Record it using `write_file` to `memory/curiosity_journal.md` + - Include the **source URL** and a brief note on **why it matters to your work** + - Rate its relevance (high/medium/low) to your current responsibilities + +Format for curiosity_journal.md entries: +``` +### [Date] - [Topic] +- **Finding**: [What you learned] +- **Source**: [URL] +- **Relevance**: [high/medium/low] — [Why it matters to your work] +- **Follow-up**: [Optional: questions this raises for next time] +``` + +## Phase 3: Agent Plaza + +1. Call `plaza_get_new_posts` to check recent activity +2. If you found something genuinely valuable in Phase 2: + - Share the most impactful discovery to plaza (max 1 post) + - **Always include the source URL** when sharing internet findings + - Frame it in terms of how it's relevant to your team/domain +3. Comment on relevant existing posts (max 2 comments) + +## Phase 4: Wrap Up + +- If nothing needed attention and no exploration was warranted: reply with `HEARTBEAT_OK` +- Otherwise, briefly summarize what you explored and why + +## Key Principles +- Always ground exploration in YOUR role and YOUR recent work context +- Never search for random unrelated topics out of idle curiosity +- If you don't have a specific angle worth investigating, don't search +- Prefer depth over breadth — one thoroughly explored topic > five surface-level queries +- Generate follow-up questions only when you genuinely want to know more + +## Rules +- ⛔ **NEVER share private information**: user conversations, memory contents, workspace files, task details +- ✅ **Share only public-safe content**: general insights, tips, industry news, web search discoveries with links +- 📝 **Limits per heartbeat**: max 1 post + 2 comments +- 🔍 **Search limits**: max 5 web searches per heartbeat +- 🤐 **If nothing interesting to explore or share**, respond with `HEARTBEAT_OK` diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/daily_reports/.gitkeep b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/daily_reports/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/enterprise_info/.gitkeep b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/enterprise_info/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/memory/MEMORY_INDEX.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/memory/MEMORY_INDEX.md new file mode 100644 index 000000000..29e3fab13 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/memory/MEMORY_INDEX.md @@ -0,0 +1,6 @@ +# Memory Index + +This file serves as an index of all memories for this digital employee. + +## Topics + diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/memory/curiosity_journal.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/memory/curiosity_journal.md new file mode 100644 index 000000000..c5185fe44 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/memory/curiosity_journal.md @@ -0,0 +1,9 @@ +# Curiosity Journal + +This is your exploration log. Record interesting discoveries from your web searches here. 
+ +## Active Questions + + +## Discoveries + diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/memory/memory.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/memory/memory.md new file mode 100644 index 000000000..a09922cb2 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/memory/memory.md @@ -0,0 +1,6 @@ +# Memory + +## OKR System State +- Last report generated: (none) +- Last progress collection: (none) +- Team members tracked: (pending) diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/relationships.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/relationships.md new file mode 100644 index 000000000..17cf2772b --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/relationships.md @@ -0,0 +1,5 @@ +# Relationships + +## Team Members (OKR tracking) + +_Team members will be added here as they are onboarded into the OKR system._ diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/.gitkeep b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/MCP_INSTALLER.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/MCP_INSTALLER.md new file mode 100644 index 000000000..9e3bf3c77 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/MCP_INSTALLER.md @@ -0,0 +1,87 @@ +# MCP Tool Installer + +## When to Use This Skill +Use this skill when a user wants to add a new tool or integration (e.g., GitHub, Brave Search, Notion, etc.) that isn't currently available but can be imported from the MCP registry or via a direct URL. + +--- + +## Step-by-Step Protocol + +### Step 1 — Search first +``` +discover_resources(query="", max_results=5) +``` +Show the results and let the user pick. Note the `ID` field (e.g. `github`). + +### Step 2 — Determine import method + +**Method A: Smithery Import** (tool found on Smithery with remote hosting support 🌐) +- Requires Smithery API Key (one-time per agent) +- Individual tool tokens NOT needed — Smithery handles auth via OAuth + +**Method B: Direct URL Import** (tool NOT on Smithery, but has public HTTP/SSE endpoint) +- User provides the MCP server URL directly +- May require tool-specific API key + +**Not importable** (💻 local-only tools) +- Requires local Docker/process — inform user these cannot be imported automatically + +--- + +### Method A: Smithery Import + +#### Check Smithery API Key +If no Smithery key is configured, explain Smithery and guide the user. Use the following talking points (adapt to context, don't read verbatim): + +> **Smithery** (smithery.ai) 是一个 MCP 工具市场,类似于"应用商店"。通过它,我可以帮你一键安装各种第三方工具(如 GitHub、Notion、Slack 等),并自动完成认证。 +> +> **为什么需要注册?** +> Smithery 用 API Key 来识别你的身份,这样安装的工具会关联到你的账号,认证信息也会安全保存。 +> +> **注册一次后有什么好处?** +> - 🔑 只需提供一次 Key,后续安装其他工具时我会自动帮你配置 +> - 🔐 不需要为每个工具单独创建 Token(如 GitHub PAT),OAuth 一键授权 +> - 📦 支持上千种 MCP 工具,随时可以扩展你的能力 +> +> **获取步骤:** +> 1. 访问 https://smithery.ai 注册/登录 +> 2. 前往 https://smithery.ai/account/api-keys 创建 API Key +> 3. 将 Key 提供给我 + +#### Import +``` +import_mcp_server( + server_id="", + config={"smithery_api_key": ""} # first time only +) +``` + +#### Handle OAuth +Some tools return an OAuth authorization URL. Tell the user to visit the link. + +**Important:** Do NOT ask for individual tool tokens (GitHub PAT, Notion API key, etc.) when using Smithery — OAuth handles this automatically. 
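A minimal sketch of the Method A flow, assuming the call returns a dict and exposes an OAuth URL when one is required (the `oauth_url` field name is an assumption, not a documented contract):

```python
# Hypothetical sketch of a first-time Smithery import followed by the OAuth hand-off.
resp = import_mcp_server(
    server_id="github",
    config={"smithery_api_key": "<key provided by the user>"},  # first time only; never echo it back
)
oauth_url = resp.get("oauth_url") if isinstance(resp, dict) else None
if oauth_url:
    # Ask the user to open the link and authorize; no GitHub PAT or other per-tool token is needed.
    print(f"Please authorize in your browser: {oauth_url}")
```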
+ +--- + +### Method B: Direct URL Import + +When a tool is not available on Smithery but the user has a public MCP endpoint: +``` +import_mcp_server( + server_id="", + config={ + "mcp_url": "https://my-mcp-server.com/sse", + "api_key": "" + } +) +``` +The system will connect to the URL, discover available tools, and register them. + +--- + +## What NOT to Do +- ❌ Don't ask for GitHub PAT, Notion key etc. when using Smithery — OAuth handles these +- ❌ Don't tell users to go to Settings — handle everything in chat +- ❌ Don't echo API keys back in your response +- ❌ Don't skip the search step — always verify the server exists before importing +- ❌ Don't import local-only tools — inform users they require local installation diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/complex-task-executor/SKILL.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/complex-task-executor/SKILL.md new file mode 100644 index 000000000..db71c3ed8 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/complex-task-executor/SKILL.md @@ -0,0 +1,146 @@ +--- +name: Complex Task Executor +description: Structured methodology for decomposing, planning, and executing complex multi-step tasks with progress tracking +--- + +# Complex Task Executor + +## When to Use This Skill + +Use this skill when a task meets ANY of the following criteria: +- Requires more than 3 distinct steps to complete +- Involves multiple tools or information sources +- Has dependencies between steps (step B needs output from step A) +- Requires research before execution +- Could benefit from a documented plan others can review +- The user explicitly asks for a thorough or systematic approach + +**DO NOT use this for simple tasks** like answering a question, reading a single file, or performing one tool call. + +## Workflow + +### Phase 1: Task Analysis (THINK before acting) + +Before creating any files, analyze the task: + +1. **Understand the goal**: What is the final deliverable? What does "done" look like? +2. **Assess complexity**: How many steps? What tools are needed? +3. **Identify dependencies**: Which steps depend on others? +4. **Identify risks**: What could go wrong? What information is missing? +5. **Estimate scope**: Is the task feasible with available tools/skills? + +### Phase 2: Create Task Plan + +Create a task folder and plan file in the workspace: + +``` +workspace//plan.md +``` + +The plan.md MUST follow this exact format: + +```markdown +# Task: + +## Objective + + +## Steps + +- [ ] 1. + - Details: + - Output: +- [ ] 2. + - Details: <...> + - Depends on: Step 1 +- [ ] 3. + - Details: <...> + +## Status +- Created: +- Current Step: Not started +- Progress: 0/ + +## Notes + +``` + +Rules for writing the plan: +- Each step should be completable in 1-3 tool calls +- Use verb-noun format: "Research competitors", "Draft report", "Validate data" +- Mark dependencies explicitly +- Include expected outputs for each step + +### Phase 3: Execute Step-by-Step + +For EACH step in the plan: + +1. **Read the plan** — Call `read_file` on `workspace//plan.md` to check current state +2. **Mark as in-progress** — Update the checkbox from `[ ]` to `[/]` and update the "Current Step" field +3. **Execute the step** — Do the actual work (tool calls, analysis, writing) +4. **Record output** — Save results to `workspace//` (e.g., intermediate files, data) +5. **Mark as complete** — Update the checkbox from `[/]` to `[x]` and update "Progress" counter +6. 
**Proceed to next step** — Move to the next uncompleted step + +### Phase 4: Completion + +When all steps are done: +1. Update plan.md status to "✅ Completed" +2. Create a `workspace//summary.md` with: + - What was accomplished + - Key results and deliverables + - Any follow-up items +3. Present the final result to the user + +## Adaptive Replanning + +If during execution you discover: +- A step is impossible → Mark it `[!]` with a reason, add alternative steps +- New steps are needed → Add them to the plan with `[+]` prefix +- A step produced unexpected results → Add a note and adjust subsequent steps +- The plan needs major changes → Create a new section "## Revised Plan" and follow it + +Always update plan.md BEFORE changing course, so the plan stays the source of truth. + +## Error Handling + +- If a tool call fails, retry once. If it fails again, mark the step as blocked and note the error. +- Never silently skip a step. Always update the plan to reflect what happened. +- If you're stuck, tell the user what's blocking and ask for guidance. + +## Example Scenarios + +### Example 1: "Research our top 3 competitors and write a comparison report" + +Plan would be: +``` +- [ ] 1. Identify the user's company/product context +- [ ] 2. Research Competitor A — website, pricing, features +- [ ] 3. Research Competitor B — website, pricing, features +- [ ] 4. Research Competitor C — website, pricing, features +- [ ] 5. Create comparison matrix +- [ ] 6. Write analysis and recommendations +- [ ] 7. Compile final report +``` + +### Example 2: "Analyze our Q4 sales data and prepare a board presentation" + +Plan would be: +``` +- [ ] 1. Read and understand the sales data files +- [ ] 2. Calculate key metrics (revenue, growth, trends) +- [ ] 3. Identify top insights and anomalies +- [ ] 4. Create data summary tables +- [ ] 5. Draft presentation outline +- [ ] 6. Write each presentation section +- [ ] 7. Add executive summary +- [ ] 8. Review and polish final document +``` + +## Key Principles + +1. **Plan is the source of truth** — Always update it before moving on +2. **One step at a time** — Don't skip ahead or batch too many steps +3. **Show your work** — Save intermediate results to the task folder +4. **Communicate progress** — The user can read plan.md at any time to see status +5. **Be adaptive** — Plans change; that's OK if you update the plan first diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/complex-task-executor/examples/plan_template.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/complex-task-executor/examples/plan_template.md new file mode 100644 index 000000000..dfd60e7cb --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/complex-task-executor/examples/plan_template.md @@ -0,0 +1,23 @@ +# Task: [Title] + +## Objective +[One-sentence description of the desired outcome] + +## Steps + +- [ ] 1. [First step] + - Details: [What specifically to do] + - Output: [What this step produces] +- [ ] 2. [Second step] + - Details: [...] + - Depends on: Step 1 +- [ ] 3. [Third step] + - Details: [...] 
+ +## Status +- Created: [timestamp] +- Current Step: Not started +- Progress: 0/3 + +## Notes +- [Any assumptions, risks, or open questions] diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/mcp-installer/SKILL.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/mcp-installer/SKILL.md new file mode 100644 index 000000000..9e3bf3c77 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/mcp-installer/SKILL.md @@ -0,0 +1,87 @@ +# MCP Tool Installer + +## When to Use This Skill +Use this skill when a user wants to add a new tool or integration (e.g., GitHub, Brave Search, Notion, etc.) that isn't currently available but can be imported from the MCP registry or via a direct URL. + +--- + +## Step-by-Step Protocol + +### Step 1 — Search first +``` +discover_resources(query="", max_results=5) +``` +Show the results and let the user pick. Note the `ID` field (e.g. `github`). + +### Step 2 — Determine import method + +**Method A: Smithery Import** (tool found on Smithery with remote hosting support 🌐) +- Requires Smithery API Key (one-time per agent) +- Individual tool tokens NOT needed — Smithery handles auth via OAuth + +**Method B: Direct URL Import** (tool NOT on Smithery, but has public HTTP/SSE endpoint) +- User provides the MCP server URL directly +- May require tool-specific API key + +**Not importable** (💻 local-only tools) +- Requires local Docker/process — inform user these cannot be imported automatically + +--- + +### Method A: Smithery Import + +#### Check Smithery API Key +If no Smithery key is configured, explain Smithery and guide the user. Use the following talking points (adapt to context, don't read verbatim): + +> **Smithery** (smithery.ai) 是一个 MCP 工具市场,类似于"应用商店"。通过它,我可以帮你一键安装各种第三方工具(如 GitHub、Notion、Slack 等),并自动完成认证。 +> +> **为什么需要注册?** +> Smithery 用 API Key 来识别你的身份,这样安装的工具会关联到你的账号,认证信息也会安全保存。 +> +> **注册一次后有什么好处?** +> - 🔑 只需提供一次 Key,后续安装其他工具时我会自动帮你配置 +> - 🔐 不需要为每个工具单独创建 Token(如 GitHub PAT),OAuth 一键授权 +> - 📦 支持上千种 MCP 工具,随时可以扩展你的能力 +> +> **获取步骤:** +> 1. 访问 https://smithery.ai 注册/登录 +> 2. 前往 https://smithery.ai/account/api-keys 创建 API Key +> 3. 将 Key 提供给我 + +#### Import +``` +import_mcp_server( + server_id="", + config={"smithery_api_key": ""} # first time only +) +``` + +#### Handle OAuth +Some tools return an OAuth authorization URL. Tell the user to visit the link. + +**Important:** Do NOT ask for individual tool tokens (GitHub PAT, Notion API key, etc.) when using Smithery — OAuth handles this automatically. + +--- + +### Method B: Direct URL Import + +When a tool is not available on Smithery but the user has a public MCP endpoint: +``` +import_mcp_server( + server_id="", + config={ + "mcp_url": "https://my-mcp-server.com/sse", + "api_key": "" + } +) +``` +The system will connect to the URL, discover available tools, and register them. + +--- + +## What NOT to Do +- ❌ Don't ask for GitHub PAT, Notion key etc. 
when using Smithery — OAuth handles these +- ❌ Don't tell users to go to Settings — handle everything in chat +- ❌ Don't echo API keys back in your response +- ❌ Don't skip the search step — always verify the server exists before importing +- ❌ Don't import local-only tools — inform users they require local installation diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/SKILL.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/SKILL.md new file mode 100644 index 000000000..ce0d06f3e --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/SKILL.md @@ -0,0 +1,152 @@ +--- +name: skill-creator +description: Create new skills, modify and improve existing skills, and measure skill performance. Use when users want to create a skill from scratch, update or optimize an existing skill, run evals to test a skill, benchmark skill performance with variance analysis, or optimize a skill's description for better triggering accuracy. +--- + +# Skill Creator + +A skill for creating new skills and iteratively improving them. + +At a high level, the process of creating a skill goes like this: + +- Decide what you want the skill to do and roughly how it should do it +- Write a draft of the skill +- Create a few test prompts and run claude-with-access-to-the-skill on them +- Help the user evaluate the results both qualitatively and quantitatively +- Rewrite the skill based on feedback from the user's evaluation +- Repeat until you're satisfied +- Expand the test set and try again at larger scale + +Your job when using this skill is to figure out where the user is in this process and then jump in and help them progress through these stages. + +## Communicating with the user + +Pay attention to context cues to understand how to phrase your communication. Briefly explain terms if you're in doubt, and feel free to clarify terms with a short definition if you're unsure if the user will get it. + +--- + +## Creating a skill + +### Capture Intent +Start by understanding the user's intent. + +1. What should this skill enable the agent to do? +2. When should this skill trigger? (what user phrases/contexts) +3. What's the expected output format? +4. Should we set up test cases to verify the skill works? + +### Interview and Research +Proactively ask questions about edge cases, input/output formats, example files, success criteria, and dependencies. Wait to write test prompts until you've got this part ironed out. + +### Write the SKILL.md +Based on the user interview, fill in these components: + +- **name**: Skill identifier +- **description**: When to trigger, what it does. This is the primary triggering mechanism - include both what the skill does AND specific contexts for when to use it. +- **the rest of the skill** + +### Skill Writing Guide + +#### Anatomy of a Skill + +``` +skill-name/ +\u251c\u2500\u2500 SKILL.md (required) +\u2502 \u251c\u2500\u2500 YAML frontmatter (name, description required) +\u2502 \u2514\u2500\u2500 Markdown instructions +\u2514\u2500\u2500 Bundled Resources (optional) + \u251c\u2500\u2500 scripts/ - Executable code for deterministic/repetitive tasks + \u251c\u2500\u2500 references/ - Docs loaded into context as needed + \u2514\u2500\u2500 assets/ - Files used in output (templates, icons, fonts) +``` + +#### Progressive Disclosure + +Skills use a three-level loading system: +1. **Metadata** (name + description) - Always in context (~100 words) +2. 
**SKILL.md body** - In context whenever skill triggers (<500 lines ideal) +3. **Bundled resources** - As needed (unlimited, scripts can execute without loading) + +**Key patterns:** +- Keep SKILL.md under 500 lines; if approaching this limit, add hierarchy with clear pointers +- Reference files clearly from SKILL.md with guidance on when to read them +- For large reference files (>300 lines), include a table of contents + +#### Writing Patterns + +Prefer using the imperative form in instructions. + +### Writing Style +Explain to the model why things are important. Use theory of mind and try to make the skill general. Start by writing a draft and then look at it with fresh eyes and improve it. + +### Test Cases +After writing the skill draft, come up with 2-3 realistic test prompts. Share them with the user. Save test cases to `evals/evals.json`. + +--- + +## Running and evaluating test cases + +This section is one continuous sequence. + +### Step 1: Run test cases +For each test case, run the agent with the skill applied, and optionally a baseline run without the skill for comparison. + +### Step 2: Draft assertions +While runs are in progress, draft quantitative assertions for each test case. Good assertions are objectively verifiable and have descriptive names. + +### Step 3: Capture timing data +When each run completes, save timing data (tokens, duration) to `timing.json`. + +### Step 4: Grade, aggregate, and launch the viewer +Once all runs are done: +1. Grade each run against assertions — see `agents/grader.md` +2. Aggregate results: `python -m scripts.aggregate_benchmark /iteration-N --skill-name ` +3. Launch the viewer: `python eval-viewer/generate_review.py /iteration-N --skill-name "my-skill" --benchmark /iteration-N/benchmark.json` +4. Present results to the user for review + +### Step 5: Read the feedback +Read user feedback from `feedback.json`. Empty feedback means the user thought it was fine. + +--- + +## Improving the skill + +### How to think about improvements +1. **Generalize from the feedback.** Don't overfit to specific examples. +2. **Keep the prompt lean.** Remove things that aren't pulling their weight. +3. **Explain the why.** Today's LLMs are smart. Explain reasoning rather than rigid MUSTs. +4. **Look for repeated work across test cases.** Bundle common scripts in `scripts/`. + +### The iteration loop +1. Apply improvements to the skill +2. Rerun all test cases into a new iteration directory +3. Present results for review +4. Wait for user to review +5. Read feedback, improve again, repeat + +--- + +## Advanced: Blind comparison +For rigorous comparison between two versions. Read `agents/comparator.md` and `agents/analyzer.md`. + +## Description Optimization +Optimize the description for better triggering accuracy. Use `scripts/run_loop.py`. + +--- + +## Reference files + +- `agents/grader.md` — How to evaluate assertions against outputs +- `agents/comparator.md` — How to do blind A/B comparison between two outputs +- `agents/analyzer.md` — How to analyze why one version beat another +- `references/schemas.md` — JSON structures for evals.json, grading.json, etc. 
+- `assets/eval_review.html` — HTML template for eval review +- `eval-viewer/generate_review.py` — Script to generate the review viewer +- `scripts/aggregate_benchmark.py` — Aggregate benchmark results +- `scripts/generate_report.py` — Generate optimization report +- `scripts/improve_description.py` — Improve skill description +- `scripts/package_skill.py` — Package skill for distribution +- `scripts/quick_validate.py` — Quick validation +- `scripts/run_eval.py` — Run triggering evaluation +- `scripts/run_loop.py` — Run optimization loop +- `scripts/utils.py` — Shared utilities diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/agents/analyzer.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/agents/analyzer.md new file mode 100644 index 000000000..14e41d606 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/agents/analyzer.md @@ -0,0 +1,274 @@ +# Post-hoc Analyzer Agent + +Analyze blind comparison results to understand WHY the winner won and generate improvement suggestions. + +## Role + +After the blind comparator determines a winner, the Post-hoc Analyzer "unblids" the results by examining the skills and transcripts. The goal is to extract actionable insights: what made the winner better, and how can the loser be improved? + +## Inputs + +You receive these parameters in your prompt: + +- **winner**: "A" or "B" (from blind comparison) +- **winner_skill_path**: Path to the skill that produced the winning output +- **winner_transcript_path**: Path to the execution transcript for the winner +- **loser_skill_path**: Path to the skill that produced the losing output +- **loser_transcript_path**: Path to the execution transcript for the loser +- **comparison_result_path**: Path to the blind comparator's output JSON +- **output_path**: Where to save the analysis results + +## Process + +### Step 1: Read Comparison Result + +1. Read the blind comparator's output at comparison_result_path +2. Note the winning side (A or B), the reasoning, and any scores +3. Understand what the comparator valued in the winning output + +### Step 2: Read Both Skills + +1. Read the winner skill's SKILL.md and key referenced files +2. Read the loser skill's SKILL.md and key referenced files +3. Identify structural differences: + - Instructions clarity and specificity + - Script/tool usage patterns + - Example coverage + - Edge case handling + +### Step 3: Read Both Transcripts + +1. Read the winner's transcript +2. Read the loser's transcript +3. Compare execution patterns: + - How closely did each follow their skill's instructions? + - What tools were used differently? + - Where did the loser diverge from optimal behavior? + - Did either encounter errors or make recovery attempts? + +### Step 4: Analyze Instruction Following + +For each transcript, evaluate: +- Did the agent follow the skill's explicit instructions? +- Did the agent use the skill's provided tools/scripts? +- Were there missed opportunities to leverage skill content? +- Did the agent add unnecessary steps not in the skill? + +Score instruction following 1-10 and note specific issues. + +### Step 5: Identify Winner Strengths + +Determine what made the winner better: +- Clearer instructions that led to better behavior? +- Better scripts/tools that produced better output? +- More comprehensive examples that guided edge cases? +- Better error handling guidance? + +Be specific. Quote from skills/transcripts where relevant. 
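One lightweight way to ground Steps 2 and 5 in specifics is to diff the two skills' section headings before reading them in full. This is a hypothetical aid, not a script bundled with this skill:

```python
# Hypothetical helper: compare section headings of the two SKILL.md files to spot structural gaps.
import difflib
from pathlib import Path

winner_skill_path = "skills/pdf-filler-v2"  # placeholders; use the paths given in your prompt
loser_skill_path = "skills/pdf-filler-v1"

def headings(skill_dir: str) -> list[str]:
    text = (Path(skill_dir) / "SKILL.md").read_text()
    return [line for line in text.splitlines() if line.lstrip().startswith("#")]

for line in difflib.unified_diff(
    headings(loser_skill_path), headings(winner_skill_path),
    fromfile="loser", tofile="winner", lineterm="",
):
    print(line)
```

Headings alone will not explain the outcome, but they point to the sections worth quoting when you list strengths and weaknesses.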
+ +### Step 6: Identify Loser Weaknesses + +Determine what held the loser back: +- Ambiguous instructions that led to suboptimal choices? +- Missing tools/scripts that forced workarounds? +- Gaps in edge case coverage? +- Poor error handling that caused failures? + +### Step 7: Generate Improvement Suggestions + +Based on the analysis, produce actionable suggestions for improving the loser skill: +- Specific instruction changes to make +- Tools/scripts to add or modify +- Examples to include +- Edge cases to address + +Prioritize by impact. Focus on changes that would have changed the outcome. + +### Step 8: Write Analysis Results + +Save structured analysis to `{output_path}`. + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "comparison_summary": { + "winner": "A", + "winner_skill": "path/to/winner/skill", + "loser_skill": "path/to/loser/skill", + "comparator_reasoning": "Brief summary of why comparator chose winner" + }, + "winner_strengths": [ + "Clear step-by-step instructions for handling multi-page documents", + "Included validation script that caught formatting errors", + "Explicit guidance on fallback behavior when OCR fails" + ], + "loser_weaknesses": [ + "Vague instruction 'process the document appropriately' led to inconsistent behavior", + "No script for validation, agent had to improvise and made errors", + "No guidance on OCR failure, agent gave up instead of trying alternatives" + ], + "instruction_following": { + "winner": { + "score": 9, + "issues": [ + "Minor: skipped optional logging step" + ] + }, + "loser": { + "score": 6, + "issues": [ + "Did not use the skill's formatting template", + "Invented own approach instead of following step 3", + "Missed the 'always validate output' instruction" + ] + } + }, + "improvement_suggestions": [ + { + "priority": "high", + "category": "instructions", + "suggestion": "Replace 'process the document appropriately' with explicit steps: 1) Extract text, 2) Identify sections, 3) Format per template", + "expected_impact": "Would eliminate ambiguity that caused inconsistent behavior" + }, + { + "priority": "high", + "category": "tools", + "suggestion": "Add validate_output.py script similar to winner skill's validation approach", + "expected_impact": "Would catch formatting errors before final output" + }, + { + "priority": "medium", + "category": "error_handling", + "suggestion": "Add fallback instructions: 'If OCR fails, try: 1) different resolution, 2) image preprocessing, 3) manual extraction'", + "expected_impact": "Would prevent early failure on difficult documents" + } + ], + "transcript_insights": { + "winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script -> Fixed 2 issues -> Produced output", + "loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods -> No validation -> Output had errors" + } +} +``` + +## Guidelines + +- **Be specific**: Quote from skills and transcripts, don't just say "instructions were unclear" +- **Be actionable**: Suggestions should be concrete changes, not vague advice +- **Focus on skill improvements**: The goal is to improve the losing skill, not critique the agent +- **Prioritize by impact**: Which changes would most likely have changed the outcome? +- **Consider causation**: Did the skill weakness actually cause the worse output, or is it incidental? +- **Stay objective**: Analyze what happened, don't editorialize +- **Think about generalization**: Would this improvement help on other evals too? 
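If you want to sanity-check the analysis file before finishing, a small check like the following is enough. It is a sketch that assumes the JSON structure shown above:

```python
# Illustrative check that the analysis JSON has the top-level keys described above.
import json
from pathlib import Path

analysis = json.loads(Path("analysis.json").read_text())  # replace with the output_path you were given
required = {
    "comparison_summary", "winner_strengths", "loser_weaknesses",
    "instruction_following", "improvement_suggestions", "transcript_insights",
}
missing = required - analysis.keys()
if missing:
    raise ValueError(f"analysis output missing keys: {sorted(missing)}")
high_impact = [s for s in analysis["improvement_suggestions"] if s.get("priority") == "high"]
print(f"{len(high_impact)} high-priority suggestions recorded")
```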
+ +## Categories for Suggestions + +Use these categories to organize improvement suggestions: + +| Category | Description | +|----------|-------------| +| `instructions` | Changes to the skill's prose instructions | +| `tools` | Scripts, templates, or utilities to add/modify | +| `examples` | Example inputs/outputs to include | +| `error_handling` | Guidance for handling failures | +| `structure` | Reorganization of skill content | +| `references` | External docs or resources to add | + +## Priority Levels + +- **high**: Would likely change the outcome of this comparison +- **medium**: Would improve quality but may not change win/loss +- **low**: Nice to have, marginal improvement + +--- + +# Analyzing Benchmark Results + +When analyzing benchmark results, the analyzer's purpose is to **surface patterns and anomalies** across multiple runs, not suggest skill improvements. + +## Role + +Review all benchmark run results and generate freeform notes that help the user understand skill performance. Focus on patterns that wouldn't be visible from aggregate metrics alone. + +## Inputs + +You receive these parameters in your prompt: + +- **benchmark_data_path**: Path to the in-progress benchmark.json with all run results +- **skill_path**: Path to the skill being benchmarked +- **output_path**: Where to save the notes (as JSON array of strings) + +## Process + +### Step 1: Read Benchmark Data + +1. Read the benchmark.json containing all run results +2. Note the configurations tested (with_skill, without_skill) +3. Understand the run_summary aggregates already calculated + +### Step 2: Analyze Per-Assertion Patterns + +For each expectation across all runs: +- Does it **always pass** in both configurations? (may not differentiate skill value) +- Does it **always fail** in both configurations? (may be broken or beyond capability) +- Does it **always pass with skill but fail without**? (skill clearly adds value here) +- Does it **always fail with skill but pass without**? (skill may be hurting) +- Is it **highly variable**? (flaky expectation or non-deterministic behavior) + +### Step 3: Analyze Cross-Eval Patterns + +Look for patterns across evals: +- Are certain eval types consistently harder/easier? +- Do some evals show high variance while others are stable? +- Are there surprising results that contradict expectations? + +### Step 4: Analyze Metrics Patterns + +Look at time_seconds, tokens, tool_calls: +- Does the skill significantly increase execution time? +- Is there high variance in resource usage? +- Are there outlier runs that skew the aggregates? + +### Step 5: Generate Notes + +Write freeform observations as a list of strings. 
Each note should: +- State a specific observation +- Be grounded in the data (not speculation) +- Help the user understand something the aggregate metrics don't show + +Examples: +- "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value" +- "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure that may be flaky" +- "Without-skill runs consistently fail on table extraction expectations (0% pass rate)" +- "Skill adds 13s average execution time but improves pass rate by 50%" +- "Token usage is 80% higher with skill, primarily due to script output parsing" +- "All 3 without-skill runs for eval 1 produced empty output" + +### Step 6: Write Notes + +Save notes to `{output_path}` as a JSON array of strings: + +```json +[ + "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value", + "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure", + "Without-skill runs consistently fail on table extraction expectations", + "Skill adds 13s average execution time but improves pass rate by 50%" +] +``` + +## Guidelines + +**DO:** +- Report what you observe in the data +- Be specific about which evals, expectations, or runs you're referring to +- Note patterns that aggregate metrics would hide +- Provide context that helps interpret the numbers + +**DO NOT:** +- Suggest improvements to the skill (that's for the improvement step, not benchmarking) +- Make subjective quality judgments ("the output was good/bad") +- Speculate about causes without evidence +- Repeat information already in the run_summary aggregates diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/agents/comparator.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/agents/comparator.md new file mode 100644 index 000000000..80e00eb45 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/agents/comparator.md @@ -0,0 +1,202 @@ +# Blind Comparator Agent + +Compare two outputs WITHOUT knowing which skill produced them. + +## Role + +The Blind Comparator judges which output better accomplishes the eval task. You receive two outputs labeled A and B, but you do NOT know which skill produced which. This prevents bias toward a particular skill or approach. + +Your judgment is based purely on output quality and task completion. + +## Inputs + +You receive these parameters in your prompt: + +- **output_a_path**: Path to the first output file or directory +- **output_b_path**: Path to the second output file or directory +- **eval_prompt**: The original task/prompt that was executed +- **expectations**: List of expectations to check (optional - may be empty) + +## Process + +### Step 1: Read Both Outputs + +1. Examine output A (file or directory) +2. Examine output B (file or directory) +3. Note the type, structure, and content of each +4. If outputs are directories, examine all relevant files inside + +### Step 2: Understand the Task + +1. Read the eval_prompt carefully +2. Identify what the task requires: + - What should be produced? + - What qualities matter (accuracy, completeness, format)? + - What would distinguish a good output from a poor one? 
+ +### Step 3: Generate Evaluation Rubric + +Based on the task, generate a rubric with two dimensions: + +**Content Rubric** (what the output contains): +| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) | +|-----------|----------|----------------|---------------| +| Correctness | Major errors | Minor errors | Fully correct | +| Completeness | Missing key elements | Mostly complete | All elements present | +| Accuracy | Significant inaccuracies | Minor inaccuracies | Accurate throughout | + +**Structure Rubric** (how the output is organized): +| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) | +|-----------|----------|----------------|---------------| +| Organization | Disorganized | Reasonably organized | Clear, logical structure | +| Formatting | Inconsistent/broken | Mostly consistent | Professional, polished | +| Usability | Difficult to use | Usable with effort | Easy to use | + +Adapt criteria to the specific task. For example: +- PDF form → "Field alignment", "Text readability", "Data placement" +- Document → "Section structure", "Heading hierarchy", "Paragraph flow" +- Data output → "Schema correctness", "Data types", "Completeness" + +### Step 4: Evaluate Each Output Against the Rubric + +For each output (A and B): + +1. **Score each criterion** on the rubric (1-5 scale) +2. **Calculate dimension totals**: Content score, Structure score +3. **Calculate overall score**: Average of dimension scores, scaled to 1-10 + +### Step 5: Check Assertions (if provided) + +If expectations are provided: + +1. Check each expectation against output A +2. Check each expectation against output B +3. Count pass rates for each output +4. Use expectation scores as secondary evidence (not the primary decision factor) + +### Step 6: Determine the Winner + +Compare A and B based on (in priority order): + +1. **Primary**: Overall rubric score (content + structure) +2. **Secondary**: Assertion pass rates (if applicable) +3. **Tiebreaker**: If truly equal, declare a TIE + +Be decisive - ties should be rare. One output is usually better, even if marginally. + +### Step 7: Write Comparison Results + +Save results to a JSON file at the path specified (or `comparison.json` if not specified). + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "winner": "A", + "reasoning": "Output A provides a complete solution with proper formatting and all required fields. 
Output B is missing the date field and has formatting inconsistencies.", + "rubric": { + "A": { + "content": { + "correctness": 5, + "completeness": 5, + "accuracy": 4 + }, + "structure": { + "organization": 4, + "formatting": 5, + "usability": 4 + }, + "content_score": 4.7, + "structure_score": 4.3, + "overall_score": 9.0 + }, + "B": { + "content": { + "correctness": 3, + "completeness": 2, + "accuracy": 3 + }, + "structure": { + "organization": 3, + "formatting": 2, + "usability": 3 + }, + "content_score": 2.7, + "structure_score": 2.7, + "overall_score": 5.4 + } + }, + "output_quality": { + "A": { + "score": 9, + "strengths": ["Complete solution", "Well-formatted", "All fields present"], + "weaknesses": ["Minor style inconsistency in header"] + }, + "B": { + "score": 5, + "strengths": ["Readable output", "Correct basic structure"], + "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"] + } + }, + "expectation_results": { + "A": { + "passed": 4, + "total": 5, + "pass_rate": 0.80, + "details": [ + {"text": "Output includes name", "passed": true}, + {"text": "Output includes date", "passed": true}, + {"text": "Format is PDF", "passed": true}, + {"text": "Contains signature", "passed": false}, + {"text": "Readable text", "passed": true} + ] + }, + "B": { + "passed": 3, + "total": 5, + "pass_rate": 0.60, + "details": [ + {"text": "Output includes name", "passed": true}, + {"text": "Output includes date", "passed": false}, + {"text": "Format is PDF", "passed": true}, + {"text": "Contains signature", "passed": false}, + {"text": "Readable text", "passed": true} + ] + } + } +} +``` + +If no expectations were provided, omit the `expectation_results` field entirely. + +## Field Descriptions + +- **winner**: "A", "B", or "TIE" +- **reasoning**: Clear explanation of why the winner was chosen (or why it's a tie) +- **rubric**: Structured rubric evaluation for each output + - **content**: Scores for content criteria (correctness, completeness, accuracy) + - **structure**: Scores for structure criteria (organization, formatting, usability) + - **content_score**: Average of content criteria (1-5) + - **structure_score**: Average of structure criteria (1-5) + - **overall_score**: Combined score scaled to 1-10 +- **output_quality**: Summary quality assessment + - **score**: 1-10 rating (should match rubric overall_score) + - **strengths**: List of positive aspects + - **weaknesses**: List of issues or shortcomings +- **expectation_results**: (Only if expectations provided) + - **passed**: Number of expectations that passed + - **total**: Total number of expectations + - **pass_rate**: Fraction passed (0.0 to 1.0) + - **details**: Individual expectation results + +## Guidelines + +- **Stay blind**: DO NOT try to infer which skill produced which output. Judge purely on output quality. +- **Be specific**: Cite specific examples when explaining strengths and weaknesses. +- **Be decisive**: Choose a winner unless outputs are genuinely equivalent. +- **Output quality first**: Assertion scores are secondary to overall task completion. +- **Be objective**: Don't favor outputs based on style preferences; focus on correctness and completeness. +- **Explain your reasoning**: The reasoning field should make it clear why you chose the winner. +- **Handle edge cases**: If both outputs fail, pick the one that fails less badly. If both are excellent, pick the one that's marginally better. 
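+
+## Worked Example: Rubric Arithmetic
+
+The sample scores above (content 4.7, structure 4.3, overall 9.0 for output A) follow from averaging each dimension's criteria and then combining the dimensions. The sketch below assumes equal weighting of criteria and dimensions, and reads "scaled to 1-10" as doubling the 1-5 average — the reading that reproduces the sample numbers; criterion names are illustrative and should be adapted to the rubric you generate.
+
+```python
+def dimension_score(criteria: dict[str, int]) -> float:
+    """Average the 1-5 criterion scores within one dimension."""
+    return sum(criteria.values()) / len(criteria)
+
+
+def overall_score(content: dict[str, int], structure: dict[str, int]) -> float:
+    """Average the two dimension scores, then double the 1-5 result to land on a 1-10 scale."""
+    return (dimension_score(content) + dimension_score(structure)) / 2 * 2
+
+
+# Output A from the sample comparison above
+a_content = {"correctness": 5, "completeness": 5, "accuracy": 4}
+a_structure = {"organization": 4, "formatting": 5, "usability": 4}
+
+print(round(dimension_score(a_content), 1))              # 4.7
+print(round(dimension_score(a_structure), 1))            # 4.3
+print(round(overall_score(a_content, a_structure), 1))   # 9.0
+```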
diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/agents/grader.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/agents/grader.md new file mode 100644 index 000000000..558ab05c0 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/agents/grader.md @@ -0,0 +1,223 @@ +# Grader Agent + +Evaluate expectations against an execution transcript and outputs. + +## Role + +The Grader reviews a transcript and output files, then determines whether each expectation passes or fails. Provide clear evidence for each judgment. + +You have two jobs: grade the outputs, and critique the evals themselves. A passing grade on a weak assertion is worse than useless — it creates false confidence. When you notice an assertion that's trivially satisfied, or an important outcome that no assertion checks, say so. + +## Inputs + +You receive these parameters in your prompt: + +- **expectations**: List of expectations to evaluate (strings) +- **transcript_path**: Path to the execution transcript (markdown file) +- **outputs_dir**: Directory containing output files from execution + +## Process + +### Step 1: Read the Transcript + +1. Read the transcript file completely +2. Note the eval prompt, execution steps, and final result +3. Identify any issues or errors documented + +### Step 2: Examine Output Files + +1. List files in outputs_dir +2. Read/examine each file relevant to the expectations. If outputs aren't plain text, use the inspection tools provided in your prompt — don't rely solely on what the transcript says the executor produced. +3. Note contents, structure, and quality + +### Step 3: Evaluate Each Assertion + +For each expectation: + +1. **Search for evidence** in the transcript and outputs +2. **Determine verdict**: + - **PASS**: Clear evidence the expectation is true AND the evidence reflects genuine task completion, not just surface-level compliance + - **FAIL**: No evidence, or evidence contradicts the expectation, or the evidence is superficial (e.g., correct filename but empty/wrong content) +3. **Cite the evidence**: Quote the specific text or describe what you found + +### Step 4: Extract and Verify Claims + +Beyond the predefined expectations, extract implicit claims from the outputs and verify them: + +1. **Extract claims** from the transcript and outputs: + - Factual statements ("The form has 12 fields") + - Process claims ("Used pypdf to fill the form") + - Quality claims ("All fields were filled correctly") + +2. **Verify each claim**: + - **Factual claims**: Can be checked against the outputs or external sources + - **Process claims**: Can be verified from the transcript + - **Quality claims**: Evaluate whether the claim is justified + +3. **Flag unverifiable claims**: Note claims that cannot be verified with available information + +This catches issues that predefined expectations might miss. + +### Step 5: Read User Notes + +If `{outputs_dir}/user_notes.md` exists: +1. Read it and note any uncertainties or issues flagged by the executor +2. Include relevant concerns in the grading output +3. These may reveal problems even when expectations pass + +### Step 6: Critique the Evals + +After grading, consider whether the evals themselves could be improved. Only surface suggestions when there's a clear gap. + +Good suggestions test meaningful outcomes — assertions that are hard to satisfy without actually doing the work correctly. 
Think about what makes an assertion *discriminating*: it passes when the skill genuinely succeeds and fails when it doesn't. + +Suggestions worth raising: +- An assertion that passed but would also pass for a clearly wrong output (e.g., checking filename existence but not file content) +- An important outcome you observed — good or bad — that no assertion covers at all +- An assertion that can't actually be verified from the available outputs + +Keep the bar high. The goal is to flag things the eval author would say "good catch" about, not to nitpick every assertion. + +### Step 7: Write Grading Results + +Save results to `{outputs_dir}/../grading.json` (sibling to outputs_dir). + +## Grading Criteria + +**PASS when**: +- The transcript or outputs clearly demonstrate the expectation is true +- Specific evidence can be cited +- The evidence reflects genuine substance, not just surface compliance (e.g., a file exists AND contains correct content, not just the right filename) + +**FAIL when**: +- No evidence found for the expectation +- Evidence contradicts the expectation +- The expectation cannot be verified from available information +- The evidence is superficial — the assertion is technically satisfied but the underlying task outcome is wrong or incomplete +- The output appears to meet the assertion by coincidence rather than by actually doing the work + +**When uncertain**: The burden of proof to pass is on the expectation. + +### Step 8: Read Executor Metrics and Timing + +1. If `{outputs_dir}/metrics.json` exists, read it and include in grading output +2. If `{outputs_dir}/../timing.json` exists, read it and include timing data + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "expectations": [ + { + "text": "The output includes the name 'John Smith'", + "passed": true, + "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'" + }, + { + "text": "The spreadsheet has a SUM formula in cell B10", + "passed": false, + "evidence": "No spreadsheet was created. The output was a text file." 
+ }, + { + "text": "The assistant used the skill's OCR script", + "passed": true, + "evidence": "Transcript Step 2 shows: 'Tool: Bash - python ocr_script.py image.png'" + } + ], + "summary": { + "passed": 2, + "failed": 1, + "total": 3, + "pass_rate": 0.67 + }, + "execution_metrics": { + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8 + }, + "total_tool_calls": 15, + "total_steps": 6, + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 + }, + "timing": { + "executor_duration_seconds": 165.0, + "grader_duration_seconds": 26.0, + "total_duration_seconds": 191.0 + }, + "claims": [ + { + "claim": "The form has 12 fillable fields", + "type": "factual", + "verified": true, + "evidence": "Counted 12 fields in field_info.json" + }, + { + "claim": "All required fields were populated", + "type": "quality", + "verified": false, + "evidence": "Reference section was left blank despite data being available" + } + ], + "user_notes_summary": { + "uncertainties": ["Used 2023 data, may be stale"], + "needs_review": [], + "workarounds": ["Fell back to text overlay for non-fillable fields"] + }, + "eval_feedback": { + "suggestions": [ + { + "assertion": "The output includes the name 'John Smith'", + "reason": "A hallucinated document that mentions the name would also pass — consider checking it appears as the primary contact with matching phone and email from the input" + }, + { + "reason": "No assertion checks whether the extracted phone numbers match the input — I observed incorrect numbers in the output that went uncaught" + } + ], + "overall": "Assertions check presence but not correctness. Consider adding content verification." + } +} +``` + +## Field Descriptions + +- **expectations**: Array of graded expectations + - **text**: The original expectation text + - **passed**: Boolean - true if expectation passes + - **evidence**: Specific quote or description supporting the verdict +- **summary**: Aggregate statistics + - **passed**: Count of passed expectations + - **failed**: Count of failed expectations + - **total**: Total expectations evaluated + - **pass_rate**: Fraction passed (0.0 to 1.0) +- **execution_metrics**: Copied from executor's metrics.json (if available) + - **output_chars**: Total character count of output files (proxy for tokens) + - **transcript_chars**: Character count of transcript +- **timing**: Wall clock timing from timing.json (if available) + - **executor_duration_seconds**: Time spent in executor subagent + - **total_duration_seconds**: Total elapsed time for the run +- **claims**: Extracted and verified claims from the output + - **claim**: The statement being verified + - **type**: "factual", "process", or "quality" + - **verified**: Boolean - whether the claim holds + - **evidence**: Supporting or contradicting evidence +- **user_notes_summary**: Issues flagged by the executor + - **uncertainties**: Things the executor wasn't sure about + - **needs_review**: Items requiring human attention + - **workarounds**: Places where the skill didn't work as expected +- **eval_feedback**: Improvement suggestions for the evals (only when warranted) + - **suggestions**: List of concrete suggestions, each with a `reason` and optionally an `assertion` it relates to + - **overall**: Brief assessment — can be "No suggestions, evals look solid" if nothing to flag + +## Guidelines + +- **Be objective**: Base verdicts on evidence, not assumptions +- **Be specific**: Quote the exact text that supports your verdict +- **Be thorough**: Check both transcript and 
output files +- **Be consistent**: Apply the same standard to each expectation +- **Explain failures**: Make it clear why evidence was insufficient +- **No partial credit**: Each expectation is pass or fail, not partial diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/assets/eval_review.html b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/assets/eval_review.html new file mode 100644 index 000000000..938ff32ae --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/assets/eval_review.html @@ -0,0 +1,146 @@ + + + + + + Eval Set Review - __SKILL_NAME_PLACEHOLDER__ + + + + + + +

+Eval Set Review: __SKILL_NAME_PLACEHOLDER__
+Current description: __SKILL_DESCRIPTION_PLACEHOLDER__
+Query | Should Trigger | Actions
+ + + + diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/eval-viewer/generate_review.py b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/eval-viewer/generate_review.py new file mode 100644 index 000000000..4f0b1fe00 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/eval-viewer/generate_review.py @@ -0,0 +1,473 @@ +#!/usr/bin/env python3 +"""Generate and serve a review page for eval results. + +Reads the workspace directory, discovers runs (directories with outputs/), +embeds all output data into a self-contained HTML page, and serves it via +a tiny HTTP server. Feedback auto-saves to feedback.json in the workspace. + +Usage: + python generate_review.py [--port PORT] [--skill-name NAME] + python generate_review.py --previous-feedback /path/to/old/feedback.json + +No dependencies beyond the Python stdlib are required. +""" + +import argparse +import base64 +import json +import mimetypes +import os +import re +import signal +import subprocess +import sys +import time +import webbrowser +from functools import partial +from http.server import HTTPServer, BaseHTTPRequestHandler +from pathlib import Path + +from loguru import logger + +# Files to exclude from output listings +METADATA_FILES = {"transcript.md", "user_notes.md", "metrics.json"} + +# Extensions we render as inline text +TEXT_EXTENSIONS = { + ".txt", ".md", ".json", ".csv", ".py", ".js", ".ts", ".tsx", ".jsx", + ".yaml", ".yml", ".xml", ".html", ".css", ".sh", ".rb", ".go", ".rs", + ".java", ".c", ".cpp", ".h", ".hpp", ".sql", ".r", ".toml", +} + +# Extensions we render as inline images +IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp"} + +# MIME type overrides for common types +MIME_OVERRIDES = { + ".svg": "image/svg+xml", + ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation", +} + + +def get_mime_type(path: Path) -> str: + ext = path.suffix.lower() + if ext in MIME_OVERRIDES: + return MIME_OVERRIDES[ext] + mime, _ = mimetypes.guess_type(str(path)) + return mime or "application/octet-stream" + + +def find_runs(workspace: Path) -> list[dict]: + """Recursively find directories that contain an outputs/ subdirectory.""" + runs: list[dict] = [] + _find_runs_recursive(workspace, workspace, runs) + runs.sort(key=lambda r: (r.get("eval_id", float("inf")), r["id"])) + return runs + + +def _find_runs_recursive(root: Path, current: Path, runs: list[dict]) -> None: + if not current.is_dir(): + return + + outputs_dir = current / "outputs" + if outputs_dir.is_dir(): + run = build_run(root, current) + if run: + runs.append(run) + return + + skip = {"node_modules", ".git", "__pycache__", "skill", "inputs"} + for child in sorted(current.iterdir()): + if child.is_dir() and child.name not in skip: + _find_runs_recursive(root, child, runs) + + +def build_run(root: Path, run_dir: Path) -> dict | None: + """Build a run dict with prompt, outputs, and grading data.""" + prompt = "" + eval_id = None + + # Try eval_metadata.json + for candidate in [run_dir / "eval_metadata.json", run_dir.parent / "eval_metadata.json"]: + if candidate.exists(): + try: + metadata = json.loads(candidate.read_text()) + prompt = metadata.get("prompt", "") + eval_id = metadata.get("eval_id") + except (json.JSONDecodeError, OSError): + pass + if 
prompt: + break + + # Fall back to transcript.md + if not prompt: + for candidate in [run_dir / "transcript.md", run_dir / "outputs" / "transcript.md"]: + if candidate.exists(): + try: + text = candidate.read_text() + match = re.search(r"## Eval Prompt\n\n([\s\S]*?)(?=\n##|$)", text) + if match: + prompt = match.group(1).strip() + except OSError: + pass + if prompt: + break + + if not prompt: + prompt = "(No prompt found)" + + run_id = str(run_dir.relative_to(root)).replace("/", "-").replace("\\", "-") + + # Collect output files + outputs_dir = run_dir / "outputs" + output_files: list[dict] = [] + if outputs_dir.is_dir(): + for f in sorted(outputs_dir.iterdir()): + if f.is_file() and f.name not in METADATA_FILES: + output_files.append(embed_file(f)) + + # Load grading if present + grading = None + for candidate in [run_dir / "grading.json", run_dir.parent / "grading.json"]: + if candidate.exists(): + try: + grading = json.loads(candidate.read_text()) + except (json.JSONDecodeError, OSError): + pass + if grading: + break + + return { + "id": run_id, + "prompt": prompt, + "eval_id": eval_id, + "outputs": output_files, + "grading": grading, + } + + +def embed_file(path: Path) -> dict: + """Read a file and return an embedded representation.""" + ext = path.suffix.lower() + mime = get_mime_type(path) + + if ext in TEXT_EXTENSIONS: + try: + content = path.read_text(errors="replace") + except OSError: + content = "(Error reading file)" + return { + "name": path.name, + "type": "text", + "content": content, + } + elif ext in IMAGE_EXTENSIONS: + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "image", + "mime": mime, + "data_uri": f"data:{mime};base64,{b64}", + } + elif ext == ".pdf": + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "pdf", + "data_uri": f"data:{mime};base64,{b64}", + } + elif ext == ".xlsx": + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "xlsx", + "data_b64": b64, + } + else: + # Binary / unknown — base64 download link + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "binary", + "mime": mime, + "data_uri": f"data:{mime};base64,{b64}", + } + + +def load_previous_iteration(workspace: Path) -> dict[str, dict]: + """Load previous iteration's feedback and outputs. + + Returns a map of run_id -> {"feedback": str, "outputs": list[dict]}. 
+ """ + result: dict[str, dict] = {} + + # Load feedback + feedback_map: dict[str, str] = {} + feedback_path = workspace / "feedback.json" + if feedback_path.exists(): + try: + data = json.loads(feedback_path.read_text()) + feedback_map = { + r["run_id"]: r["feedback"] + for r in data.get("reviews", []) + if r.get("feedback", "").strip() + } + except (json.JSONDecodeError, OSError, KeyError): + pass + + # Load runs (to get outputs) + prev_runs = find_runs(workspace) + for run in prev_runs: + result[run["id"]] = { + "feedback": feedback_map.get(run["id"], ""), + "outputs": run.get("outputs", []), + } + + # Also add feedback for run_ids that had feedback but no matching run + for run_id, fb in feedback_map.items(): + if run_id not in result: + result[run_id] = {"feedback": fb, "outputs": []} + + return result + + +def generate_html( + runs: list[dict], + skill_name: str, + previous: dict[str, dict] | None = None, + benchmark: dict | None = None, +) -> str: + """Generate the complete standalone HTML page with embedded data.""" + template_path = Path(__file__).parent / "viewer.html" + template = template_path.read_text() + + # Build previous_feedback and previous_outputs maps for the template + previous_feedback: dict[str, str] = {} + previous_outputs: dict[str, list[dict]] = {} + if previous: + for run_id, data in previous.items(): + if data.get("feedback"): + previous_feedback[run_id] = data["feedback"] + if data.get("outputs"): + previous_outputs[run_id] = data["outputs"] + + embedded = { + "skill_name": skill_name, + "runs": runs, + "previous_feedback": previous_feedback, + "previous_outputs": previous_outputs, + } + if benchmark: + embedded["benchmark"] = benchmark + + data_json = json.dumps(embedded) + + return template.replace("/*__EMBEDDED_DATA__*/", f"const EMBEDDED_DATA = {data_json};") + + +# --------------------------------------------------------------------------- +# HTTP server (stdlib only, zero dependencies) +# --------------------------------------------------------------------------- + +def _kill_port(port: int) -> None: + """Kill any process listening on the given port.""" + try: + result = subprocess.run( + ["lsof", "-ti", f":{port}"], + capture_output=True, text=True, timeout=5, + ) + for pid_str in result.stdout.strip().split("\n"): + if pid_str.strip(): + try: + os.kill(int(pid_str.strip()), signal.SIGTERM) + except (ProcessLookupError, ValueError): + pass + if result.stdout.strip(): + time.sleep(0.5) + except subprocess.TimeoutExpired: + pass + except FileNotFoundError: + logger.warning("Note: lsof not found, cannot check if port is in use") + +class ReviewHandler(BaseHTTPRequestHandler): + """Serves the review HTML and handles feedback saves. + + Regenerates the HTML on each page load so that refreshing the browser + picks up new eval outputs without restarting the server. 
+ """ + + def __init__( + self, + workspace: Path, + skill_name: str, + feedback_path: Path, + previous: dict[str, dict], + benchmark_path: Path | None, + *args, + **kwargs, + ): + self.workspace = workspace + self.skill_name = skill_name + self.feedback_path = feedback_path + self.previous = previous + self.benchmark_path = benchmark_path + super().__init__(*args, **kwargs) + + def do_GET(self) -> None: + if self.path == "/" or self.path == "/index.html": + # Regenerate HTML on each request (re-scans workspace for new outputs) + runs = find_runs(self.workspace) + benchmark = None + if self.benchmark_path and self.benchmark_path.exists(): + try: + benchmark = json.loads(self.benchmark_path.read_text()) + except (json.JSONDecodeError, OSError): + pass + html = generate_html(runs, self.skill_name, self.previous, benchmark) + content = html.encode("utf-8") + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(content))) + self.end_headers() + self.wfile.write(content) + elif self.path == "/api/feedback": + data = b"{}" + if self.feedback_path.exists(): + data = self.feedback_path.read_bytes() + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(data))) + self.end_headers() + self.wfile.write(data) + else: + self.send_error(404) + + def do_POST(self) -> None: + if self.path == "/api/feedback": + length = int(self.headers.get("Content-Length", 0)) + body = self.rfile.read(length) + try: + data = json.loads(body) + if not isinstance(data, dict) or "reviews" not in data: + raise ValueError("Expected JSON object with 'reviews' key") + self.feedback_path.write_text(json.dumps(data, indent=2) + "\n") + resp = b'{"ok":true}' + self.send_response(200) + except (json.JSONDecodeError, OSError, ValueError) as e: + resp = json.dumps({"error": str(e)}).encode() + self.send_response(500) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(resp))) + self.end_headers() + self.wfile.write(resp) + else: + self.send_error(404) + + def log_message(self, format: str, *args: object) -> None: + # Suppress request logging to keep terminal clean + pass + + +def main() -> None: + parser = argparse.ArgumentParser(description="Generate and serve eval review") + parser.add_argument("workspace", type=Path, help="Path to workspace directory") + parser.add_argument("--port", "-p", type=int, default=3117, help="Server port (default: 3117)") + parser.add_argument("--skill-name", "-n", type=str, default=None, help="Skill name for header") + parser.add_argument( + "--previous-workspace", type=Path, default=None, + help="Path to previous iteration's workspace (shows old outputs and feedback as context)", + ) + parser.add_argument( + "--benchmark", type=Path, default=None, + help="Path to benchmark.json to show in the Benchmark tab", + ) + parser.add_argument( + "--static", "-s", type=Path, default=None, + help="Write standalone HTML to this path instead of starting a server", + ) + args = parser.parse_args() + + workspace = args.workspace.resolve() + if not workspace.is_dir(): + logger.error(f"Error: {workspace} is not a directory") + sys.exit(1) + + runs = find_runs(workspace) + if not runs: + logger.error(f"No runs found in {workspace}") + sys.exit(1) + + skill_name = args.skill_name or workspace.name.replace("-workspace", "") + feedback_path = workspace / "feedback.json" + + previous: dict[str, dict] = {} + if 
args.previous_workspace: + previous = load_previous_iteration(args.previous_workspace.resolve()) + + benchmark_path = args.benchmark.resolve() if args.benchmark else None + benchmark = None + if benchmark_path and benchmark_path.exists(): + try: + benchmark = json.loads(benchmark_path.read_text()) + except (json.JSONDecodeError, OSError): + pass + + if args.static: + html = generate_html(runs, skill_name, previous, benchmark) + args.static.parent.mkdir(parents=True, exist_ok=True) + args.static.write_text(html) + logger.info(f"\n Static viewer written to: {args.static}\n") + sys.exit(0) + + # Kill any existing process on the target port + port = args.port + _kill_port(port) + handler = partial(ReviewHandler, workspace, skill_name, feedback_path, previous, benchmark_path) + try: + server = HTTPServer(("127.0.0.1", port), handler) + except OSError: + # Port still in use after kill attempt — find a free one + server = HTTPServer(("127.0.0.1", 0), handler) + port = server.server_address[1] + + url = f"http://localhost:{port}" + logger.info(f"\n Eval Viewer") + logger.info(f" ─────────────────────────────────") + logger.info(f" URL: {url}") + logger.info(f" Workspace: {workspace}") + logger.info(f" Feedback: {feedback_path}") + if previous: + logger.info(f" Previous: {args.previous_workspace} ({len(previous)} runs)") + if benchmark_path: + logger.info(f" Benchmark: {benchmark_path}") + logger.info(f"\n Press Ctrl+C to stop.\n") + + webbrowser.open(url) + + try: + server.serve_forever() + except KeyboardInterrupt: + logger.info("\nStopped.") + server.server_close() + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/eval-viewer/viewer.html b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/eval-viewer/viewer.html new file mode 100644 index 000000000..6d8e96348 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/eval-viewer/viewer.html @@ -0,0 +1,1325 @@ + + + + + + Eval Review + + + + + + + +
+Eval Review:
+Review each output and leave feedback below. Navigate with arrow keys or buttons. When done, copy feedback and paste into Claude Code.
+Prompt
+Output
+No output files found
+Your Feedback
+No benchmark data available. Run a benchmark to see quantitative results here.
+Review Complete
+Your feedback has been saved. Go back to your Claude Code session and tell Claude you're done reviewing.
+ + + + diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/references/schemas.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/references/schemas.md new file mode 100644 index 000000000..b6eeaa2d4 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/references/schemas.md @@ -0,0 +1,430 @@ +# JSON Schemas + +This document defines the JSON schemas used by skill-creator. + +--- + +## evals.json + +Defines the evals for a skill. Located at `evals/evals.json` within the skill directory. + +```json +{ + "skill_name": "example-skill", + "evals": [ + { + "id": 1, + "prompt": "User's example prompt", + "expected_output": "Description of expected result", + "files": ["evals/files/sample1.pdf"], + "expectations": [ + "The output includes X", + "The skill used script Y" + ] + } + ] +} +``` + +**Fields:** +- `skill_name`: Name matching the skill's frontmatter +- `evals[].id`: Unique integer identifier +- `evals[].prompt`: The task to execute +- `evals[].expected_output`: Human-readable description of success +- `evals[].files`: Optional list of input file paths (relative to skill root) +- `evals[].expectations`: List of verifiable statements + +--- + +## history.json + +Tracks version progression in Improve mode. Located at workspace root. + +```json +{ + "started_at": "2026-01-15T10:30:00Z", + "skill_name": "pdf", + "current_best": "v2", + "iterations": [ + { + "version": "v0", + "parent": null, + "expectation_pass_rate": 0.65, + "grading_result": "baseline", + "is_current_best": false + }, + { + "version": "v1", + "parent": "v0", + "expectation_pass_rate": 0.75, + "grading_result": "won", + "is_current_best": false + }, + { + "version": "v2", + "parent": "v1", + "expectation_pass_rate": 0.85, + "grading_result": "won", + "is_current_best": true + } + ] +} +``` + +**Fields:** +- `started_at`: ISO timestamp of when improvement started +- `skill_name`: Name of the skill being improved +- `current_best`: Version identifier of the best performer +- `iterations[].version`: Version identifier (v0, v1, ...) +- `iterations[].parent`: Parent version this was derived from +- `iterations[].expectation_pass_rate`: Pass rate from grading +- `iterations[].grading_result`: "baseline", "won", "lost", or "tie" +- `iterations[].is_current_best`: Whether this is the current best version + +--- + +## grading.json + +Output from the grader agent. Located at `/grading.json`. + +```json +{ + "expectations": [ + { + "text": "The output includes the name 'John Smith'", + "passed": true, + "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'" + }, + { + "text": "The spreadsheet has a SUM formula in cell B10", + "passed": false, + "evidence": "No spreadsheet was created. The output was a text file." 
+ } + ], + "summary": { + "passed": 2, + "failed": 1, + "total": 3, + "pass_rate": 0.67 + }, + "execution_metrics": { + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8 + }, + "total_tool_calls": 15, + "total_steps": 6, + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 + }, + "timing": { + "executor_duration_seconds": 165.0, + "grader_duration_seconds": 26.0, + "total_duration_seconds": 191.0 + }, + "claims": [ + { + "claim": "The form has 12 fillable fields", + "type": "factual", + "verified": true, + "evidence": "Counted 12 fields in field_info.json" + } + ], + "user_notes_summary": { + "uncertainties": ["Used 2023 data, may be stale"], + "needs_review": [], + "workarounds": ["Fell back to text overlay for non-fillable fields"] + }, + "eval_feedback": { + "suggestions": [ + { + "assertion": "The output includes the name 'John Smith'", + "reason": "A hallucinated document that mentions the name would also pass" + } + ], + "overall": "Assertions check presence but not correctness." + } +} +``` + +**Fields:** +- `expectations[]`: Graded expectations with evidence +- `summary`: Aggregate pass/fail counts +- `execution_metrics`: Tool usage and output size (from executor's metrics.json) +- `timing`: Wall clock timing (from timing.json) +- `claims`: Extracted and verified claims from the output +- `user_notes_summary`: Issues flagged by the executor +- `eval_feedback`: (optional) Improvement suggestions for the evals, only present when the grader identifies issues worth raising + +--- + +## metrics.json + +Output from the executor agent. Located at `/outputs/metrics.json`. + +```json +{ + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8, + "Edit": 1, + "Glob": 2, + "Grep": 0 + }, + "total_tool_calls": 18, + "total_steps": 6, + "files_created": ["filled_form.pdf", "field_values.json"], + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 +} +``` + +**Fields:** +- `tool_calls`: Count per tool type +- `total_tool_calls`: Sum of all tool calls +- `total_steps`: Number of major execution steps +- `files_created`: List of output files created +- `errors_encountered`: Number of errors during execution +- `output_chars`: Total character count of output files +- `transcript_chars`: Character count of transcript + +--- + +## timing.json + +Wall clock timing for a run. Located at `/timing.json`. + +**How to capture:** When a subagent task completes, the task notification includes `total_tokens` and `duration_ms`. Save these immediately — they are not persisted anywhere else and cannot be recovered after the fact. + +```json +{ + "total_tokens": 84852, + "duration_ms": 23332, + "total_duration_seconds": 23.3, + "executor_start": "2026-01-15T10:30:00Z", + "executor_end": "2026-01-15T10:32:45Z", + "executor_duration_seconds": 165.0, + "grader_start": "2026-01-15T10:32:46Z", + "grader_end": "2026-01-15T10:33:12Z", + "grader_duration_seconds": 26.0 +} +``` + +--- + +## benchmark.json + +Output from Benchmark mode. Located at `benchmarks//benchmark.json`. 
+ +```json +{ + "metadata": { + "skill_name": "pdf", + "skill_path": "/path/to/pdf", + "executor_model": "claude-sonnet-4-20250514", + "analyzer_model": "most-capable-model", + "timestamp": "2026-01-15T10:30:00Z", + "evals_run": [1, 2, 3], + "runs_per_configuration": 3 + }, + + "runs": [ + { + "eval_id": 1, + "eval_name": "Ocean", + "configuration": "with_skill", + "run_number": 1, + "result": { + "pass_rate": 0.85, + "passed": 6, + "failed": 1, + "total": 7, + "time_seconds": 42.5, + "tokens": 3800, + "tool_calls": 18, + "errors": 0 + }, + "expectations": [ + {"text": "...", "passed": true, "evidence": "..."} + ], + "notes": [ + "Used 2023 data, may be stale", + "Fell back to text overlay for non-fillable fields" + ] + } + ], + + "run_summary": { + "with_skill": { + "pass_rate": {"mean": 0.85, "stddev": 0.05, "min": 0.80, "max": 0.90}, + "time_seconds": {"mean": 45.0, "stddev": 12.0, "min": 32.0, "max": 58.0}, + "tokens": {"mean": 3800, "stddev": 400, "min": 3200, "max": 4100} + }, + "without_skill": { + "pass_rate": {"mean": 0.35, "stddev": 0.08, "min": 0.28, "max": 0.45}, + "time_seconds": {"mean": 32.0, "stddev": 8.0, "min": 24.0, "max": 42.0}, + "tokens": {"mean": 2100, "stddev": 300, "min": 1800, "max": 2500} + }, + "delta": { + "pass_rate": "+0.50", + "time_seconds": "+13.0", + "tokens": "+1700" + } + }, + + "notes": [ + "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value", + "Eval 3 shows high variance (50% ± 40%) - may be flaky or model-dependent", + "Without-skill runs consistently fail on table extraction expectations", + "Skill adds 13s average execution time but improves pass rate by 50%" + ] +} +``` + +**Fields:** +- `metadata`: Information about the benchmark run + - `skill_name`: Name of the skill + - `timestamp`: When the benchmark was run + - `evals_run`: List of eval names or IDs + - `runs_per_configuration`: Number of runs per config (e.g. 3) +- `runs[]`: Individual run results + - `eval_id`: Numeric eval identifier + - `eval_name`: Human-readable eval name (used as section header in the viewer) + - `configuration`: Must be `"with_skill"` or `"without_skill"` (the viewer uses this exact string for grouping and color coding) + - `run_number`: Integer run number (1, 2, 3...) + - `result`: Nested object with `pass_rate`, `passed`, `total`, `time_seconds`, `tokens`, `errors` +- `run_summary`: Statistical aggregates per configuration + - `with_skill` / `without_skill`: Each contains `pass_rate`, `time_seconds`, `tokens` objects with `mean` and `stddev` fields + - `delta`: Difference strings like `"+0.50"`, `"+13.0"`, `"+1700"` +- `notes`: Freeform observations from the analyzer + +**Important:** The viewer reads these field names exactly. Using `config` instead of `configuration`, or putting `pass_rate` at the top level of a run instead of nested under `result`, will cause the viewer to show empty/zero values. Always reference this schema when generating benchmark.json manually. + +--- + +## comparison.json + +Output from blind comparator. Located at `/comparison-N.json`. + +```json +{ + "winner": "A", + "reasoning": "Output A provides a complete solution with proper formatting and all required fields. 
Output B is missing the date field and has formatting inconsistencies.", + "rubric": { + "A": { + "content": { + "correctness": 5, + "completeness": 5, + "accuracy": 4 + }, + "structure": { + "organization": 4, + "formatting": 5, + "usability": 4 + }, + "content_score": 4.7, + "structure_score": 4.3, + "overall_score": 9.0 + }, + "B": { + "content": { + "correctness": 3, + "completeness": 2, + "accuracy": 3 + }, + "structure": { + "organization": 3, + "formatting": 2, + "usability": 3 + }, + "content_score": 2.7, + "structure_score": 2.7, + "overall_score": 5.4 + } + }, + "output_quality": { + "A": { + "score": 9, + "strengths": ["Complete solution", "Well-formatted", "All fields present"], + "weaknesses": ["Minor style inconsistency in header"] + }, + "B": { + "score": 5, + "strengths": ["Readable output", "Correct basic structure"], + "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"] + } + }, + "expectation_results": { + "A": { + "passed": 4, + "total": 5, + "pass_rate": 0.80, + "details": [ + {"text": "Output includes name", "passed": true} + ] + }, + "B": { + "passed": 3, + "total": 5, + "pass_rate": 0.60, + "details": [ + {"text": "Output includes name", "passed": true} + ] + } + } +} +``` + +--- + +## analysis.json + +Output from post-hoc analyzer. Located at `/analysis.json`. + +```json +{ + "comparison_summary": { + "winner": "A", + "winner_skill": "path/to/winner/skill", + "loser_skill": "path/to/loser/skill", + "comparator_reasoning": "Brief summary of why comparator chose winner" + }, + "winner_strengths": [ + "Clear step-by-step instructions for handling multi-page documents", + "Included validation script that caught formatting errors" + ], + "loser_weaknesses": [ + "Vague instruction 'process the document appropriately' led to inconsistent behavior", + "No script for validation, agent had to improvise" + ], + "instruction_following": { + "winner": { + "score": 9, + "issues": ["Minor: skipped optional logging step"] + }, + "loser": { + "score": 6, + "issues": [ + "Did not use the skill's formatting template", + "Invented own approach instead of following step 3" + ] + } + }, + "improvement_suggestions": [ + { + "priority": "high", + "category": "instructions", + "suggestion": "Replace 'process the document appropriately' with explicit steps", + "expected_impact": "Would eliminate ambiguity that caused inconsistent behavior" + } + ], + "transcript_insights": { + "winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script", + "loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods" + } +} +``` diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/__init__.py b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/aggregate_benchmark.py b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/aggregate_benchmark.py new file mode 100644 index 000000000..ccc810819 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/aggregate_benchmark.py @@ -0,0 +1,403 @@ +#!/usr/bin/env python3 +""" +Aggregate individual run results into benchmark summary statistics. 
+ +Reads grading.json files from run directories and produces: +- run_summary with mean, stddev, min, max for each metric +- delta between with_skill and without_skill configurations + +Usage: + python aggregate_benchmark.py + +Example: + python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/ + +The script supports two directory layouts: + + Workspace layout (from skill-creator iterations): + / + └── eval-N/ + ├── with_skill/ + │ ├── run-1/grading.json + │ └── run-2/grading.json + └── without_skill/ + ├── run-1/grading.json + └── run-2/grading.json + + Legacy layout (with runs/ subdirectory): + / + └── runs/ + └── eval-N/ + ├── with_skill/ + │ └── run-1/grading.json + └── without_skill/ + └── run-1/grading.json +""" + +import argparse +import json +import math +import sys +from datetime import datetime, timezone +from pathlib import Path + +from loguru import logger + + +def calculate_stats(values: list[float]) -> dict: + """Calculate mean, stddev, min, max for a list of values.""" + if not values: + return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0} + + n = len(values) + mean = sum(values) / n + + if n > 1: + variance = sum((x - mean) ** 2 for x in values) / (n - 1) + stddev = math.sqrt(variance) + else: + stddev = 0.0 + + return { + "mean": round(mean, 4), + "stddev": round(stddev, 4), + "min": round(min(values), 4), + "max": round(max(values), 4) + } + + +def load_run_results(benchmark_dir: Path) -> dict: + """ + Load all run results from a benchmark directory. + + Returns dict keyed by config name (e.g. "with_skill"/"without_skill", + or "new_skill"/"old_skill"), each containing a list of run results. + """ + # Support both layouts: eval dirs directly under benchmark_dir, or under runs/ + runs_dir = benchmark_dir / "runs" + if runs_dir.exists(): + search_dir = runs_dir + elif list(benchmark_dir.glob("eval-*")): + search_dir = benchmark_dir + else: + logger.warning(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}") + return {} + + results: dict[str, list] = {} + + for eval_idx, eval_dir in enumerate(sorted(search_dir.glob("eval-*"))): + metadata_path = eval_dir / "eval_metadata.json" + if metadata_path.exists(): + try: + with open(metadata_path) as mf: + eval_id = json.load(mf).get("eval_id", eval_idx) + except (json.JSONDecodeError, OSError): + eval_id = eval_idx + else: + try: + eval_id = int(eval_dir.name.split("-")[1]) + except ValueError: + eval_id = eval_idx + + # Discover config directories dynamically rather than hardcoding names + for config_dir in sorted(eval_dir.iterdir()): + if not config_dir.is_dir(): + continue + # Skip non-config directories (inputs, outputs, etc.) 
+ if not list(config_dir.glob("run-*")): + continue + config = config_dir.name + if config not in results: + results[config] = [] + + for run_dir in sorted(config_dir.glob("run-*")): + run_number = int(run_dir.name.split("-")[1]) + grading_file = run_dir / "grading.json" + + if not grading_file.exists(): + logger.warning(f"Warning: grading.json not found in {run_dir}") + continue + + try: + with open(grading_file) as f: + grading = json.load(f) + except json.JSONDecodeError as e: + logger.warning(f"Warning: Invalid JSON in {grading_file}: {e}") + continue + + # Extract metrics + result = { + "eval_id": eval_id, + "run_number": run_number, + "pass_rate": grading.get("summary", {}).get("pass_rate", 0.0), + "passed": grading.get("summary", {}).get("passed", 0), + "failed": grading.get("summary", {}).get("failed", 0), + "total": grading.get("summary", {}).get("total", 0), + } + + # Extract timing — check grading.json first, then sibling timing.json + timing = grading.get("timing", {}) + result["time_seconds"] = timing.get("total_duration_seconds", 0.0) + timing_file = run_dir / "timing.json" + if result["time_seconds"] == 0.0 and timing_file.exists(): + try: + with open(timing_file) as tf: + timing_data = json.load(tf) + result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0) + result["tokens"] = timing_data.get("total_tokens", 0) + except json.JSONDecodeError: + pass + + # Extract metrics if available + metrics = grading.get("execution_metrics", {}) + result["tool_calls"] = metrics.get("total_tool_calls", 0) + if not result.get("tokens"): + result["tokens"] = metrics.get("output_chars", 0) + result["errors"] = metrics.get("errors_encountered", 0) + + # Extract expectations — viewer requires fields: text, passed, evidence + raw_expectations = grading.get("expectations", []) + for exp in raw_expectations: + if "text" not in exp or "passed" not in exp: + logger.warning(f"Warning: expectation in {grading_file} missing required fields (text, passed, evidence): {exp}") + result["expectations"] = raw_expectations + + # Extract notes from user_notes_summary + notes_summary = grading.get("user_notes_summary", {}) + notes = [] + notes.extend(notes_summary.get("uncertainties", [])) + notes.extend(notes_summary.get("needs_review", [])) + notes.extend(notes_summary.get("workarounds", [])) + result["notes"] = notes + + results[config].append(result) + + return results + + +def aggregate_results(results: dict) -> dict: + """ + Aggregate run results into summary statistics. + + Returns run_summary with stats for each configuration and delta. 
+ """ + run_summary = {} + configs = list(results.keys()) + + for config in configs: + runs = results.get(config, []) + + if not runs: + run_summary[config] = { + "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}, + "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}, + "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0} + } + continue + + pass_rates = [r["pass_rate"] for r in runs] + times = [r["time_seconds"] for r in runs] + tokens = [r.get("tokens", 0) for r in runs] + + run_summary[config] = { + "pass_rate": calculate_stats(pass_rates), + "time_seconds": calculate_stats(times), + "tokens": calculate_stats(tokens) + } + + # Calculate delta between the first two configs (if two exist) + if len(configs) >= 2: + primary = run_summary.get(configs[0], {}) + baseline = run_summary.get(configs[1], {}) + else: + primary = run_summary.get(configs[0], {}) if configs else {} + baseline = {} + + delta_pass_rate = primary.get("pass_rate", {}).get("mean", 0) - baseline.get("pass_rate", {}).get("mean", 0) + delta_time = primary.get("time_seconds", {}).get("mean", 0) - baseline.get("time_seconds", {}).get("mean", 0) + delta_tokens = primary.get("tokens", {}).get("mean", 0) - baseline.get("tokens", {}).get("mean", 0) + + run_summary["delta"] = { + "pass_rate": f"{delta_pass_rate:+.2f}", + "time_seconds": f"{delta_time:+.1f}", + "tokens": f"{delta_tokens:+.0f}" + } + + return run_summary + + +def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict: + """ + Generate complete benchmark.json from run results. + """ + results = load_run_results(benchmark_dir) + run_summary = aggregate_results(results) + + # Build runs array for benchmark.json + runs = [] + for config in results: + for result in results[config]: + runs.append({ + "eval_id": result["eval_id"], + "configuration": config, + "run_number": result["run_number"], + "result": { + "pass_rate": result["pass_rate"], + "passed": result["passed"], + "failed": result["failed"], + "total": result["total"], + "time_seconds": result["time_seconds"], + "tokens": result.get("tokens", 0), + "tool_calls": result.get("tool_calls", 0), + "errors": result.get("errors", 0) + }, + "expectations": result["expectations"], + "notes": result["notes"] + }) + + # Determine eval IDs from results + eval_ids = sorted(set( + r["eval_id"] + for config in results.values() + for r in config + )) + + benchmark = { + "metadata": { + "skill_name": skill_name or "", + "skill_path": skill_path or "", + "executor_model": "", + "analyzer_model": "", + "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + "evals_run": eval_ids, + "runs_per_configuration": 3 + }, + "runs": runs, + "run_summary": run_summary, + "notes": [] # To be filled by analyzer + } + + return benchmark + + +def generate_markdown(benchmark: dict) -> str: + """Generate human-readable benchmark.md from benchmark data.""" + metadata = benchmark["metadata"] + run_summary = benchmark["run_summary"] + + # Determine config names (excluding "delta") + configs = [k for k in run_summary if k != "delta"] + config_a = configs[0] if len(configs) >= 1 else "config_a" + config_b = configs[1] if len(configs) >= 2 else "config_b" + label_a = config_a.replace("_", " ").title() + label_b = config_b.replace("_", " ").title() + + lines = [ + f"# Skill Benchmark: {metadata['skill_name']}", + "", + f"**Model**: {metadata['executor_model']}", + f"**Date**: {metadata['timestamp']}", + f"**Evals**: {', '.join(map(str, metadata['evals_run']))} 
({metadata['runs_per_configuration']} runs each per configuration)", + "", + "## Summary", + "", + f"| Metric | {label_a} | {label_b} | Delta |", + "|--------|------------|---------------|-------|", + ] + + a_summary = run_summary.get(config_a, {}) + b_summary = run_summary.get(config_b, {}) + delta = run_summary.get("delta", {}) + + # Format pass rate + a_pr = a_summary.get("pass_rate", {}) + b_pr = b_summary.get("pass_rate", {}) + lines.append(f"| Pass Rate | {a_pr.get('mean', 0)*100:.0f}% ± {a_pr.get('stddev', 0)*100:.0f}% | {b_pr.get('mean', 0)*100:.0f}% ± {b_pr.get('stddev', 0)*100:.0f}% | {delta.get('pass_rate', '—')} |") + + # Format time + a_time = a_summary.get("time_seconds", {}) + b_time = b_summary.get("time_seconds", {}) + lines.append(f"| Time | {a_time.get('mean', 0):.1f}s ± {a_time.get('stddev', 0):.1f}s | {b_time.get('mean', 0):.1f}s ± {b_time.get('stddev', 0):.1f}s | {delta.get('time_seconds', '—')}s |") + + # Format tokens + a_tokens = a_summary.get("tokens", {}) + b_tokens = b_summary.get("tokens", {}) + lines.append(f"| Tokens | {a_tokens.get('mean', 0):.0f} ± {a_tokens.get('stddev', 0):.0f} | {b_tokens.get('mean', 0):.0f} ± {b_tokens.get('stddev', 0):.0f} | {delta.get('tokens', '—')} |") + + # Notes section + if benchmark.get("notes"): + lines.extend([ + "", + "## Notes", + "" + ]) + for note in benchmark["notes"]: + lines.append(f"- {note}") + + return "\n".join(lines) + + +def main(): + parser = argparse.ArgumentParser( + description="Aggregate benchmark run results into summary statistics" + ) + parser.add_argument( + "benchmark_dir", + type=Path, + help="Path to the benchmark directory" + ) + parser.add_argument( + "--skill-name", + default="", + help="Name of the skill being benchmarked" + ) + parser.add_argument( + "--skill-path", + default="", + help="Path to the skill being benchmarked" + ) + parser.add_argument( + "--output", "-o", + type=Path, + help="Output path for benchmark.json (default: /benchmark.json)" + ) + + args = parser.parse_args() + + if not args.benchmark_dir.exists(): + logger.error(f"Directory not found: {args.benchmark_dir}") + sys.exit(1) + + # Generate benchmark + benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path) + + # Determine output paths + output_json = args.output or (args.benchmark_dir / "benchmark.json") + output_md = output_json.with_suffix(".md") + + # Write benchmark.json + with open(output_json, "w") as f: + json.dump(benchmark, f, indent=2) + logger.info(f"Generated: {output_json}") + + # Write benchmark.md + markdown = generate_markdown(benchmark) + with open(output_md, "w") as f: + f.write(markdown) + logger.info(f"Generated: {output_md}") + + # Print summary + run_summary = benchmark["run_summary"] + configs = [k for k in run_summary if k != "delta"] + delta = run_summary.get("delta", {}) + + logger.info(f"\nSummary:") + for config in configs: + pr = run_summary[config]["pass_rate"]["mean"] + label = config.replace("_", " ").title() + logger.info(f" {label}: {pr*100:.1f}% pass rate") + logger.info(f" Delta: {delta.get('pass_rate', '—')}") + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/generate_report.py b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/generate_report.py new file mode 100644 index 000000000..395232d96 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/generate_report.py @@ -0,0 +1,328 @@ 
+#!/usr/bin/env python3 +"""Generate an HTML report from run_loop.py output. + +Takes the JSON output from run_loop.py and generates a visual HTML report +showing each description attempt with check/x for each test case. +Distinguishes between train and test queries. +""" + +import argparse +import html +import json +import sys +from pathlib import Path + +from loguru import logger + + +def generate_html(data: dict, auto_refresh: bool = False, skill_name: str = "") -> str: + """Generate HTML report from loop output data. If auto_refresh is True, adds a meta refresh tag.""" + history = data.get("history", []) + holdout = data.get("holdout", 0) + title_prefix = html.escape(skill_name + " \u2014 ") if skill_name else "" + + # Get all unique queries from train and test sets, with should_trigger info + train_queries: list[dict] = [] + test_queries: list[dict] = [] + if history: + for r in history[0].get("train_results", history[0].get("results", [])): + train_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)}) + if history[0].get("test_results"): + for r in history[0].get("test_results", []): + test_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)}) + + refresh_tag = ' \n' if auto_refresh else "" + + html_parts = [""" + + + +""" + refresh_tag + """ """ + title_prefix + """Skill Description Optimization + + + + + + +

""" + title_prefix + """Skill Description Optimization

+
+ Optimizing your skill's description. This page updates automatically as Claude tests different versions of your skill's description. Each row is an iteration — a new description attempt. The columns show test queries: green checkmarks mean the skill triggered correctly (or correctly didn't trigger), red crosses mean it got it wrong. The "Train" score shows performance on queries used to improve the description; the "Test" score shows performance on held-out queries the optimizer hasn't seen. When it's done, Claude will apply the best-performing description to your skill. +
+"""] + + # Summary section + best_test_score = data.get('best_test_score') + best_train_score = data.get('best_train_score') + html_parts.append(f""" +
+

Original: {html.escape(data.get('original_description', 'N/A'))}

+

Best: {html.escape(data.get('best_description', 'N/A'))}

+

Best Score: {data.get('best_score', 'N/A')} {'(test)' if best_test_score else '(train)'}

+

Iterations: {data.get('iterations_run', 0)} | Train: {data.get('train_size', '?')} | Test: {data.get('test_size', '?')}

+
+""") + + # Legend + html_parts.append(""" +
+ Query columns: + Should trigger + Should NOT trigger + Train + Test +
+""") + + # Table header + html_parts.append(""" +
+ + + + + + + +""") + + # Add column headers for train queries + for qinfo in train_queries: + polarity = "positive-col" if qinfo["should_trigger"] else "negative-col" + html_parts.append(f' \n') + + # Add column headers for test queries (different color) + for qinfo in test_queries: + polarity = "positive-col" if qinfo["should_trigger"] else "negative-col" + html_parts.append(f' \n') + + html_parts.append(""" + + +""") + + # Find best iteration for highlighting + if test_queries: + best_iter = max(history, key=lambda h: h.get("test_passed") or 0).get("iteration") + else: + best_iter = max(history, key=lambda h: h.get("train_passed", h.get("passed", 0))).get("iteration") + + # Add rows for each iteration + for h in history: + iteration = h.get("iteration", "?") + train_passed = h.get("train_passed", h.get("passed", 0)) + train_total = h.get("train_total", h.get("total", 0)) + test_passed = h.get("test_passed") + test_total = h.get("test_total") + description = h.get("description", "") + train_results = h.get("train_results", h.get("results", [])) + test_results = h.get("test_results", []) + + # Create lookups for results by query + train_by_query = {r["query"]: r for r in train_results} + test_by_query = {r["query"]: r for r in test_results} if test_results else {} + + # Compute aggregate correct/total runs across all retries + def aggregate_runs(results: list[dict]) -> tuple[int, int]: + correct = 0 + total = 0 + for r in results: + runs = r.get("runs", 0) + triggers = r.get("triggers", 0) + total += runs + if r.get("should_trigger", True): + correct += triggers + else: + correct += runs - triggers + return correct, total + + train_correct, train_runs = aggregate_runs(train_results) + test_correct, test_runs = aggregate_runs(test_results) + + # Determine score classes + def score_class(correct: int, total: int) -> str: + if total > 0: + ratio = correct / total + if ratio >= 0.8: + return "score-good" + elif ratio >= 0.5: + return "score-ok" + return "score-bad" + + train_class = score_class(train_correct, train_runs) + test_class = score_class(test_correct, test_runs) + + row_class = "best-row" if iteration == best_iter else "" + + html_parts.append(f""" + + + + +""") + + # Add result for each train query + for qinfo in train_queries: + r = train_by_query.get(qinfo["query"], {}) + did_pass = r.get("pass", False) + triggers = r.get("triggers", 0) + runs = r.get("runs", 0) + + icon = "✓" if did_pass else "✗" + css_class = "pass" if did_pass else "fail" + + html_parts.append(f' \n') + + # Add result for each test query (with different background) + for qinfo in test_queries: + r = test_by_query.get(qinfo["query"], {}) + did_pass = r.get("pass", False) + triggers = r.get("triggers", 0) + runs = r.get("runs", 0) + + icon = "✓" if did_pass else "✗" + css_class = "pass" if did_pass else "fail" + + html_parts.append(f' \n') + + html_parts.append(" \n") + + html_parts.append(""" +
IterTrainTestDescription{html.escape(qinfo["query"])}{html.escape(qinfo["query"])}
{iteration}{train_correct}/{train_runs}{test_correct}/{test_runs}{html.escape(description)}{icon}{triggers}/{runs}{icon}{triggers}/{runs}
+
+""") + + html_parts.append(""" + + +""") + + return "".join(html_parts) + + +def main(): + parser = argparse.ArgumentParser(description="Generate HTML report from run_loop output") + parser.add_argument("input", help="Path to JSON output from run_loop.py (or - for stdin)") + parser.add_argument("-o", "--output", default=None, help="Output HTML file (default: stdout)") + parser.add_argument("--skill-name", default="", help="Skill name to include in the report title") + args = parser.parse_args() + + if args.input == "-": + data = json.load(sys.stdin) + else: + data = json.loads(Path(args.input).read_text()) + + html_output = generate_html(data, skill_name=args.skill_name) + + if args.output: + Path(args.output).write_text(html_output) + logger.info(f"Report written to {args.output}") + else: + print(html_output) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/improve_description.py b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/improve_description.py new file mode 100644 index 000000000..887a06a08 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/improve_description.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 +"""Improve a skill description based on eval results. + +Takes eval results (from run_eval.py) and generates an improved description +using Claude with extended thinking. +""" + +import argparse +import json +import re +import sys +from pathlib import Path + +import anthropic +from loguru import logger + +from scripts.utils import parse_skill_md + + +def improve_description( + client: anthropic.Anthropic, + skill_name: str, + skill_content: str, + current_description: str, + eval_results: dict, + history: list[dict], + model: str, + test_results: dict | None = None, + log_dir: Path | None = None, + iteration: int | None = None, +) -> str: + """Call Claude to improve the description based on eval results.""" + failed_triggers = [ + r for r in eval_results["results"] + if r["should_trigger"] and not r["pass"] + ] + false_triggers = [ + r for r in eval_results["results"] + if not r["should_trigger"] and not r["pass"] + ] + + # Build scores summary + train_score = f"{eval_results['summary']['passed']}/{eval_results['summary']['total']}" + if test_results: + test_score = f"{test_results['summary']['passed']}/{test_results['summary']['total']}" + scores_summary = f"Train: {train_score}, Test: {test_score}" + else: + scores_summary = f"Train: {train_score}" + + prompt = f"""You are optimizing a skill description for a Claude Code skill called "{skill_name}". A "skill" is sort of like a prompt, but with progressive disclosure -- there's a title and description that Claude sees when deciding whether to use the skill, and then if it does use the skill, it reads the .md file which has lots more details and potentially links to other resources in the skill folder like helper files and scripts and additional documentation or examples. + +The description appears in Claude's "available_skills" list. When a user sends a query, Claude decides whether to invoke the skill based solely on the title and on this description. Your goal is to write a description that triggers for relevant queries, and doesn't trigger for irrelevant ones. 
+ +Here's the current description: + +"{current_description}" + + +Current scores ({scores_summary}): + +""" + if failed_triggers: + prompt += "FAILED TO TRIGGER (should have triggered but didn't):\n" + for r in failed_triggers: + prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n' + prompt += "\n" + + if false_triggers: + prompt += "FALSE TRIGGERS (triggered but shouldn't have):\n" + for r in false_triggers: + prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n' + prompt += "\n" + + if history: + prompt += "PREVIOUS ATTEMPTS (do NOT repeat these — try something structurally different):\n\n" + for h in history: + train_s = f"{h.get('train_passed', h.get('passed', 0))}/{h.get('train_total', h.get('total', 0))}" + test_s = f"{h.get('test_passed', '?')}/{h.get('test_total', '?')}" if h.get('test_passed') is not None else None + score_str = f"train={train_s}" + (f", test={test_s}" if test_s else "") + prompt += f'\n' + prompt += f'Description: "{h["description"]}"\n' + if "results" in h: + prompt += "Train results:\n" + for r in h["results"]: + status = "PASS" if r["pass"] else "FAIL" + prompt += f' [{status}] "{r["query"][:80]}" (triggered {r["triggers"]}/{r["runs"]})\n' + if h.get("note"): + prompt += f'Note: {h["note"]}\n' + prompt += "\n\n" + + prompt += f""" + +Skill content (for context on what the skill does): + +{skill_content} + + +Based on the failures, write a new and improved description that is more likely to trigger correctly. When I say "based on the failures", it's a bit of a tricky line to walk because we don't want to overfit to the specific cases you're seeing. So what I DON'T want you to do is produce an ever-expanding list of specific queries that this skill should or shouldn't trigger for. Instead, try to generalize from the failures to broader categories of user intent and situations where this skill would be useful or not useful. The reason for this is twofold: + +1. Avoid overfitting +2. The list might get loooong and it's injected into ALL queries and there might be a lot of skills, so we don't want to blow too much space on any given description. + +Concretely, your description should not be more than about 100-200 words, even if that comes at the cost of accuracy. + +Here are some tips that we've found to work well in writing these descriptions: +- The skill should be phrased in the imperative -- "Use this skill for" rather than "this skill does" +- The skill description should focus on the user's intent, what they are trying to achieve, vs. the implementation details of how the skill works. +- The description competes with other skills for Claude's attention — make it distinctive and immediately recognizable. +- If you're getting lots of failures after repeated attempts, change things up. Try different sentence structures or wordings. + +I'd encourage you to be creative and mix up the style in different iterations since you'll have multiple opportunities to try different approaches and we'll just grab the highest-scoring one at the end. 
+ +Please respond with only the new description text in tags, nothing else.""" + + response = client.messages.create( + model=model, + max_tokens=16000, + thinking={ + "type": "enabled", + "budget_tokens": 10000, + }, + messages=[{"role": "user", "content": prompt}], + ) + + # Extract thinking and text from response + thinking_text = "" + text = "" + for block in response.content: + if block.type == "thinking": + thinking_text = block.thinking + elif block.type == "text": + text = block.text + + # Parse out the tags + match = re.search(r"(.*?)", text, re.DOTALL) + description = match.group(1).strip().strip('"') if match else text.strip().strip('"') + + # Log the transcript + transcript: dict = { + "iteration": iteration, + "prompt": prompt, + "thinking": thinking_text, + "response": text, + "parsed_description": description, + "char_count": len(description), + "over_limit": len(description) > 1024, + } + + # If over 1024 chars, ask the model to shorten it + if len(description) > 1024: + shorten_prompt = f"Your description is {len(description)} characters, which exceeds the hard 1024 character limit. Please rewrite it to be under 1024 characters while preserving the most important trigger words and intent coverage. Respond with only the new description in tags." + shorten_response = client.messages.create( + model=model, + max_tokens=16000, + thinking={ + "type": "enabled", + "budget_tokens": 10000, + }, + messages=[ + {"role": "user", "content": prompt}, + {"role": "assistant", "content": text}, + {"role": "user", "content": shorten_prompt}, + ], + ) + + shorten_thinking = "" + shorten_text = "" + for block in shorten_response.content: + if block.type == "thinking": + shorten_thinking = block.thinking + elif block.type == "text": + shorten_text = block.text + + match = re.search(r"(.*?)", shorten_text, re.DOTALL) + shortened = match.group(1).strip().strip('"') if match else shorten_text.strip().strip('"') + + transcript["rewrite_prompt"] = shorten_prompt + transcript["rewrite_thinking"] = shorten_thinking + transcript["rewrite_response"] = shorten_text + transcript["rewrite_description"] = shortened + transcript["rewrite_char_count"] = len(shortened) + description = shortened + + transcript["final_description"] = description + + if log_dir: + log_dir.mkdir(parents=True, exist_ok=True) + log_file = log_dir / f"improve_iter_{iteration or 'unknown'}.json" + log_file.write_text(json.dumps(transcript, indent=2)) + + return description + + +def main(): + parser = argparse.ArgumentParser(description="Improve a skill description based on eval results") + parser.add_argument("--eval-results", required=True, help="Path to eval results JSON (from run_eval.py)") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--history", default=None, help="Path to history JSON (previous attempts)") + parser.add_argument("--model", required=True, help="Model for improvement") + parser.add_argument("--verbose", action="store_true", help="Print thinking to stderr") + args = parser.parse_args() + + skill_path = Path(args.skill_path) + if not (skill_path / "SKILL.md").exists(): + logger.error(f"Error: No SKILL.md found at {skill_path}") + sys.exit(1) + + eval_results = json.loads(Path(args.eval_results).read_text()) + history = [] + if args.history: + history = json.loads(Path(args.history).read_text()) + + name, _, content = parse_skill_md(skill_path) + current_description = eval_results["description"] + + if args.verbose: + logger.info(f"Current: 
{current_description}") + logger.info(f"Score: {eval_results['summary']['passed']}/{eval_results['summary']['total']}") + + client = anthropic.Anthropic() + new_description = improve_description( + client=client, + skill_name=name, + skill_content=content, + current_description=current_description, + eval_results=eval_results, + history=history, + model=args.model, + ) + + if args.verbose: + logger.info(f"Improved: {new_description}") + + # Output as JSON with both the new description and updated history + output = { + "description": new_description, + "history": history + [{ + "description": current_description, + "passed": eval_results["summary"]["passed"], + "failed": eval_results["summary"]["failed"], + "total": eval_results["summary"]["total"], + "results": eval_results["results"], + }], + } + print(json.dumps(output, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/package_skill.py b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/package_skill.py new file mode 100644 index 000000000..5dbdf7843 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/package_skill.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +""" +Skill Packager - Creates a distributable .skill file of a skill folder + +Usage: + python utils/package_skill.py [output-directory] + +Example: + python utils/package_skill.py skills/public/my-skill + python utils/package_skill.py skills/public/my-skill ./dist +""" + +import fnmatch +import sys +import zipfile +from pathlib import Path + +from loguru import logger +from scripts.quick_validate import validate_skill + +# Patterns to exclude when packaging skills. +EXCLUDE_DIRS = {"__pycache__", "node_modules"} +EXCLUDE_GLOBS = {"*.pyc"} +EXCLUDE_FILES = {".DS_Store"} +# Directories excluded only at the skill root (not when nested deeper). +ROOT_EXCLUDE_DIRS = {"evals"} + + +def should_exclude(rel_path: Path) -> bool: + """Check if a path should be excluded from packaging.""" + parts = rel_path.parts + if any(part in EXCLUDE_DIRS for part in parts): + return True + # rel_path is relative to skill_path.parent, so parts[0] is the skill + # folder name and parts[1] (if present) is the first subdir. + if len(parts) > 1 and parts[1] in ROOT_EXCLUDE_DIRS: + return True + name = rel_path.name + if name in EXCLUDE_FILES: + return True + return any(fnmatch.fnmatch(name, pat) for pat in EXCLUDE_GLOBS) + + +def package_skill(skill_path, output_dir=None): + """ + Package a skill folder into a .skill file. 
+ + Args: + skill_path: Path to the skill folder + output_dir: Optional output directory for the .skill file (defaults to current directory) + + Returns: + Path to the created .skill file, or None if error + """ + skill_path = Path(skill_path).resolve() + + # Validate skill folder exists + if not skill_path.exists(): + logger.error(f"Skill folder not found: {skill_path}") + return None + + if not skill_path.is_dir(): + logger.error(f"Path is not a directory: {skill_path}") + return None + + # Validate SKILL.md exists + skill_md = skill_path / "SKILL.md" + if not skill_md.exists(): + logger.error(f"SKILL.md not found in {skill_path}") + return None + + # Run validation before packaging + logger.info("Validating skill...") + valid, message = validate_skill(skill_path) + if not valid: + logger.error(f"Validation failed: {message}") + logger.error("Please fix the validation errors before packaging.") + return None + logger.info(f"{message}\n") + + # Determine output location + skill_name = skill_path.name + if output_dir: + output_path = Path(output_dir).resolve() + output_path.mkdir(parents=True, exist_ok=True) + else: + output_path = Path.cwd() + + skill_filename = output_path / f"{skill_name}.skill" + + # Create the .skill file (zip format) + try: + with zipfile.ZipFile(skill_filename, 'w', zipfile.ZIP_DEFLATED) as zipf: + # Walk through the skill directory, excluding build artifacts + for file_path in skill_path.rglob('*'): + if not file_path.is_file(): + continue + arcname = file_path.relative_to(skill_path.parent) + if should_exclude(arcname): + logger.debug(f"Skipped: {arcname}") + continue + zipf.write(file_path, arcname) + logger.debug(f"Added: {arcname}") + + logger.info(f"Successfully packaged skill to: {skill_filename}") + return skill_filename + + except Exception as e: + logger.error(f"Error creating .skill file: {e}") + return None + + +def main(): + if len(sys.argv) < 2: + logger.info("Usage: python utils/package_skill.py [output-directory]") + logger.info("\nExample:") + logger.info(" python utils/package_skill.py skills/public/my-skill") + logger.info(" python utils/package_skill.py skills/public/my-skill ./dist") + sys.exit(1) + + skill_path = sys.argv[1] + output_dir = sys.argv[2] if len(sys.argv) > 2 else None + + logger.info(f"Packaging skill: {skill_path}") + if output_dir: + logger.info(f" Output directory: {output_dir}") + logger.info("") + + result = package_skill(skill_path, output_dir) + + if result: + sys.exit(0) + else: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/quick_validate.py b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/quick_validate.py new file mode 100644 index 000000000..36553161e --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/quick_validate.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +""" +Quick validation script for skills - minimal version +""" + +import sys +import os +import re +import yaml +from pathlib import Path + +from loguru import logger + +def validate_skill(skill_path): + """Basic validation of a skill""" + skill_path = Path(skill_path) + + # Check SKILL.md exists + skill_md = skill_path / 'SKILL.md' + if not skill_md.exists(): + return False, "SKILL.md not found" + + # Read and validate frontmatter + content = skill_md.read_text() + if not content.startswith('---'): + return False, "No YAML frontmatter found" + + # Extract 
frontmatter + match = re.match(r'^---\n(.*?)\n---', content, re.DOTALL) + if not match: + return False, "Invalid frontmatter format" + + frontmatter_text = match.group(1) + + # Parse YAML frontmatter + try: + frontmatter = yaml.safe_load(frontmatter_text) + if not isinstance(frontmatter, dict): + return False, "Frontmatter must be a YAML dictionary" + except yaml.YAMLError as e: + return False, f"Invalid YAML in frontmatter: {e}" + + # Define allowed properties + ALLOWED_PROPERTIES = {'name', 'description', 'license', 'allowed-tools', 'metadata', 'compatibility'} + + # Check for unexpected properties (excluding nested keys under metadata) + unexpected_keys = set(frontmatter.keys()) - ALLOWED_PROPERTIES + if unexpected_keys: + return False, ( + f"Unexpected key(s) in SKILL.md frontmatter: {', '.join(sorted(unexpected_keys))}. " + f"Allowed properties are: {', '.join(sorted(ALLOWED_PROPERTIES))}" + ) + + # Check required fields + if 'name' not in frontmatter: + return False, "Missing 'name' in frontmatter" + if 'description' not in frontmatter: + return False, "Missing 'description' in frontmatter" + + # Extract name for validation + name = frontmatter.get('name', '') + if not isinstance(name, str): + return False, f"Name must be a string, got {type(name).__name__}" + name = name.strip() + if name: + # Check naming convention (kebab-case: lowercase with hyphens) + if not re.match(r'^[a-z0-9-]+$', name): + return False, f"Name '{name}' should be kebab-case (lowercase letters, digits, and hyphens only)" + if name.startswith('-') or name.endswith('-') or '--' in name: + return False, f"Name '{name}' cannot start/end with hyphen or contain consecutive hyphens" + # Check name length (max 64 characters per spec) + if len(name) > 64: + return False, f"Name is too long ({len(name)} characters). Maximum is 64 characters." + + # Extract and validate description + description = frontmatter.get('description', '') + if not isinstance(description, str): + return False, f"Description must be a string, got {type(description).__name__}" + description = description.strip() + if description: + # Check for angle brackets + if '<' in description or '>' in description: + return False, "Description cannot contain angle brackets (< or >)" + # Check description length (max 1024 characters per spec) + if len(description) > 1024: + return False, f"Description is too long ({len(description)} characters). Maximum is 1024 characters." + + # Validate compatibility field if present (optional) + compatibility = frontmatter.get('compatibility', '') + if compatibility: + if not isinstance(compatibility, str): + return False, f"Compatibility must be a string, got {type(compatibility).__name__}" + if len(compatibility) > 500: + return False, f"Compatibility is too long ({len(compatibility)} characters). Maximum is 500 characters." + + return True, "Skill is valid!" 
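# A minimal usage sketch, assuming a hypothetical skill folder path: validate_skill()
# can also be called programmatically (package_skill.py does this before zipping).
# It returns a (bool, message) tuple; the function below is illustrative and is not
# invoked anywhere in this script.
def _example_usage() -> None:
    ok, message = validate_skill(Path("skills/public/my-skill"))  # hypothetical path
    if ok:
        logger.info(message)  # "Skill is valid!"
    else:
        logger.error(f"Fix before packaging: {message}")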
+ +if __name__ == "__main__": + if len(sys.argv) != 2: + logger.info("Usage: python quick_validate.py ") + sys.exit(1) + + valid, message = validate_skill(sys.argv[1]) + if valid: + logger.info(message) + else: + logger.error(message) + sys.exit(0 if valid else 1) \ No newline at end of file diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/run_eval.py b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/run_eval.py new file mode 100644 index 000000000..f923066ca --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/run_eval.py @@ -0,0 +1,312 @@ +#!/usr/bin/env python3 +"""Run trigger evaluation for a skill description. + +Tests whether a skill's description causes Claude to trigger (read the skill) +for a set of queries. Outputs results as JSON. +""" + +import argparse +import json +import os +import select +import subprocess +import sys +import time +import uuid +from concurrent.futures import ProcessPoolExecutor, as_completed +from pathlib import Path + +from loguru import logger + +from scripts.utils import parse_skill_md + + +def find_project_root() -> Path: + """Find the project root by walking up from cwd looking for .claude/. + + Mimics how Claude Code discovers its project root, so the command file + we create ends up where claude -p will look for it. + """ + current = Path.cwd() + for parent in [current, *current.parents]: + if (parent / ".claude").is_dir(): + return parent + return current + + +def run_single_query( + query: str, + skill_name: str, + skill_description: str, + timeout: int, + project_root: str, + model: str | None = None, +) -> bool: + """Run a single query and return whether the skill was triggered. + + Creates a command file in .claude/commands/ so it appears in Claude's + available_skills list, then runs `claude -p` with the raw query. + Uses --include-partial-messages to detect triggering early from + stream events (content_block_start) rather than waiting for the + full assistant message, which only arrives after tool execution. + """ + unique_id = uuid.uuid4().hex[:8] + clean_name = f"{skill_name}-skill-{unique_id}" + project_commands_dir = Path(project_root) / ".claude" / "commands" + command_file = project_commands_dir / f"{clean_name}.md" + + try: + project_commands_dir.mkdir(parents=True, exist_ok=True) + # Use YAML block scalar to avoid breaking on quotes in description + indented_desc = "\n ".join(skill_description.split("\n")) + command_content = ( + f"---\n" + f"description: |\n" + f" {indented_desc}\n" + f"---\n\n" + f"# {skill_name}\n\n" + f"This skill handles: {skill_description}\n" + ) + command_file.write_text(command_content) + + cmd = [ + "claude", + "-p", query, + "--output-format", "stream-json", + "--verbose", + "--include-partial-messages", + ] + if model: + cmd.extend(["--model", model]) + + # Remove CLAUDECODE env var to allow nesting claude -p inside a + # Claude Code session. The guard is for interactive terminal conflicts; + # programmatic subprocess usage is safe. 
+ env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + cwd=project_root, + env=env, + ) + + triggered = False + start_time = time.time() + buffer = "" + # Track state for stream event detection + pending_tool_name = None + accumulated_json = "" + + try: + while time.time() - start_time < timeout: + if process.poll() is not None: + remaining = process.stdout.read() + if remaining: + buffer += remaining.decode("utf-8", errors="replace") + break + + ready, _, _ = select.select([process.stdout], [], [], 1.0) + if not ready: + continue + + chunk = os.read(process.stdout.fileno(), 8192) + if not chunk: + break + buffer += chunk.decode("utf-8", errors="replace") + + while "\n" in buffer: + line, buffer = buffer.split("\n", 1) + line = line.strip() + if not line: + continue + + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + + # Early detection via stream events + if event.get("type") == "stream_event": + se = event.get("event", {}) + se_type = se.get("type", "") + + if se_type == "content_block_start": + cb = se.get("content_block", {}) + if cb.get("type") == "tool_use": + tool_name = cb.get("name", "") + if tool_name in ("Skill", "Read"): + pending_tool_name = tool_name + accumulated_json = "" + else: + return False + + elif se_type == "content_block_delta" and pending_tool_name: + delta = se.get("delta", {}) + if delta.get("type") == "input_json_delta": + accumulated_json += delta.get("partial_json", "") + if clean_name in accumulated_json: + return True + + elif se_type in ("content_block_stop", "message_stop"): + if pending_tool_name: + return clean_name in accumulated_json + if se_type == "message_stop": + return False + + # Fallback: full assistant message + elif event.get("type") == "assistant": + message = event.get("message", {}) + for content_item in message.get("content", []): + if content_item.get("type") != "tool_use": + continue + tool_name = content_item.get("name", "") + tool_input = content_item.get("input", {}) + if tool_name == "Skill" and clean_name in tool_input.get("skill", ""): + triggered = True + elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""): + triggered = True + return triggered + + elif event.get("type") == "result": + return triggered + finally: + # Clean up process on any exit path (return, exception, timeout) + if process.poll() is None: + process.kill() + process.wait() + + return triggered + finally: + if command_file.exists(): + command_file.unlink() + + +def run_eval( + eval_set: list[dict], + skill_name: str, + description: str, + num_workers: int, + timeout: int, + project_root: Path, + runs_per_query: int = 1, + trigger_threshold: float = 0.5, + model: str | None = None, +) -> dict: + """Run the full eval set and return results.""" + results = [] + + with ProcessPoolExecutor(max_workers=num_workers) as executor: + future_to_info = {} + for item in eval_set: + for run_idx in range(runs_per_query): + future = executor.submit( + run_single_query, + item["query"], + skill_name, + description, + timeout, + str(project_root), + model, + ) + future_to_info[future] = (item, run_idx) + + query_triggers: dict[str, list[bool]] = {} + query_items: dict[str, dict] = {} + for future in as_completed(future_to_info): + item, _ = future_to_info[future] + query = item["query"] + query_items[query] = item + if query not in query_triggers: + query_triggers[query] = [] + try: + 
query_triggers[query].append(future.result()) + except Exception as e: + logger.warning(f"Warning: query failed: {e}") + query_triggers[query].append(False) + + for query, triggers in query_triggers.items(): + item = query_items[query] + trigger_rate = sum(triggers) / len(triggers) + should_trigger = item["should_trigger"] + if should_trigger: + did_pass = trigger_rate >= trigger_threshold + else: + did_pass = trigger_rate < trigger_threshold + results.append({ + "query": query, + "should_trigger": should_trigger, + "trigger_rate": trigger_rate, + "triggers": sum(triggers), + "runs": len(triggers), + "pass": did_pass, + }) + + passed = sum(1 for r in results if r["pass"]) + total = len(results) + + return { + "skill_name": skill_name, + "description": description, + "results": results, + "summary": { + "total": total, + "passed": passed, + "failed": total - passed, + }, + } + + +def main(): + parser = argparse.ArgumentParser(description="Run trigger evaluation for a skill description") + parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--description", default=None, help="Override description to test") + parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") + parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds") + parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query") + parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold") + parser.add_argument("--model", default=None, help="Model to use for claude -p (default: user's configured model)") + parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") + args = parser.parse_args() + + eval_set = json.loads(Path(args.eval_set).read_text()) + skill_path = Path(args.skill_path) + + if not (skill_path / "SKILL.md").exists(): + logger.error(f"Error: No SKILL.md found at {skill_path}") + sys.exit(1) + + name, original_description, content = parse_skill_md(skill_path) + description = args.description or original_description + project_root = find_project_root() + + if args.verbose: + logger.info(f"Evaluating: {description}") + + output = run_eval( + eval_set=eval_set, + skill_name=name, + description=description, + num_workers=args.num_workers, + timeout=args.timeout, + project_root=project_root, + runs_per_query=args.runs_per_query, + trigger_threshold=args.trigger_threshold, + model=args.model, + ) + + if args.verbose: + summary = output["summary"] + logger.info(f"Results: {summary['passed']}/{summary['total']} passed") + for r in output["results"]: + status = "PASS" if r["pass"] else "FAIL" + rate_str = f"{r['triggers']}/{r['runs']}" + logger.info(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:70]}") + + print(json.dumps(output, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/run_loop.py b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/run_loop.py new file mode 100644 index 000000000..a2907d6e0 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/run_loop.py @@ -0,0 +1,333 @@ +#!/usr/bin/env python3 +"""Run the eval + improve loop until all pass or max iterations reached. 
+ +Combines run_eval.py and improve_description.py in a loop, tracking history +and returning the best description found. Supports train/test split to prevent +overfitting. +""" + +import argparse +import json +import random +import sys +import tempfile +import time +import webbrowser +from pathlib import Path + +import anthropic +from loguru import logger + +from scripts.generate_report import generate_html +from scripts.improve_description import improve_description +from scripts.run_eval import find_project_root, run_eval +from scripts.utils import parse_skill_md + + +def split_eval_set(eval_set: list[dict], holdout: float, seed: int = 42) -> tuple[list[dict], list[dict]]: + """Split eval set into train and test sets, stratified by should_trigger.""" + random.seed(seed) + + # Separate by should_trigger + trigger = [e for e in eval_set if e["should_trigger"]] + no_trigger = [e for e in eval_set if not e["should_trigger"]] + + # Shuffle each group + random.shuffle(trigger) + random.shuffle(no_trigger) + + # Calculate split points + n_trigger_test = max(1, int(len(trigger) * holdout)) + n_no_trigger_test = max(1, int(len(no_trigger) * holdout)) + + # Split + test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test] + train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:] + + return train_set, test_set + + +def run_loop( + eval_set: list[dict], + skill_path: Path, + description_override: str | None, + num_workers: int, + timeout: int, + max_iterations: int, + runs_per_query: int, + trigger_threshold: float, + holdout: float, + model: str, + verbose: bool, + live_report_path: Path | None = None, + log_dir: Path | None = None, +) -> dict: + """Run the eval + improvement loop.""" + project_root = find_project_root() + name, original_description, content = parse_skill_md(skill_path) + current_description = description_override or original_description + + # Split into train/test if holdout > 0 + if holdout > 0: + train_set, test_set = split_eval_set(eval_set, holdout) + if verbose: + logger.info(f"Split: {len(train_set)} train, {len(test_set)} test (holdout={holdout})") + else: + train_set = eval_set + test_set = [] + + client = anthropic.Anthropic() + history = [] + exit_reason = "unknown" + + for iteration in range(1, max_iterations + 1): + if verbose: + logger.info(f"\n{'='*60}") + logger.info(f"Iteration {iteration}/{max_iterations}") + logger.info(f"Description: {current_description}") + logger.info(f"{'='*60}") + + # Evaluate train + test together in one batch for parallelism + all_queries = train_set + test_set + t0 = time.time() + all_results = run_eval( + eval_set=all_queries, + skill_name=name, + description=current_description, + num_workers=num_workers, + timeout=timeout, + project_root=project_root, + runs_per_query=runs_per_query, + trigger_threshold=trigger_threshold, + model=model, + ) + eval_elapsed = time.time() - t0 + + # Split results back into train/test by matching queries + train_queries_set = {q["query"] for q in train_set} + train_result_list = [r for r in all_results["results"] if r["query"] in train_queries_set] + test_result_list = [r for r in all_results["results"] if r["query"] not in train_queries_set] + + train_passed = sum(1 for r in train_result_list if r["pass"]) + train_total = len(train_result_list) + train_summary = {"passed": train_passed, "failed": train_total - train_passed, "total": train_total} + train_results = {"results": train_result_list, "summary": train_summary} + + if test_set: + test_passed = sum(1 for r in 
test_result_list if r["pass"]) + test_total = len(test_result_list) + test_summary = {"passed": test_passed, "failed": test_total - test_passed, "total": test_total} + test_results = {"results": test_result_list, "summary": test_summary} + else: + test_results = None + test_summary = None + + history.append({ + "iteration": iteration, + "description": current_description, + "train_passed": train_summary["passed"], + "train_failed": train_summary["failed"], + "train_total": train_summary["total"], + "train_results": train_results["results"], + "test_passed": test_summary["passed"] if test_summary else None, + "test_failed": test_summary["failed"] if test_summary else None, + "test_total": test_summary["total"] if test_summary else None, + "test_results": test_results["results"] if test_results else None, + # For backward compat with report generator + "passed": train_summary["passed"], + "failed": train_summary["failed"], + "total": train_summary["total"], + "results": train_results["results"], + }) + + # Write live report if path provided + if live_report_path: + partial_output = { + "original_description": original_description, + "best_description": current_description, + "best_score": "in progress", + "iterations_run": len(history), + "holdout": holdout, + "train_size": len(train_set), + "test_size": len(test_set), + "history": history, + } + live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name)) + + if verbose: + def print_eval_stats(label, results, elapsed): + pos = [r for r in results if r["should_trigger"]] + neg = [r for r in results if not r["should_trigger"]] + tp = sum(r["triggers"] for r in pos) + pos_runs = sum(r["runs"] for r in pos) + fn = pos_runs - tp + fp = sum(r["triggers"] for r in neg) + neg_runs = sum(r["runs"] for r in neg) + tn = neg_runs - fp + total = tp + tn + fp + fn + precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0 + recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0 + accuracy = (tp + tn) / total if total > 0 else 0.0 + logger.info(f"{label}: {tp+tn}/{total} correct, precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)") + for r in results: + status = "PASS" if r["pass"] else "FAIL" + rate_str = f"{r['triggers']}/{r['runs']}" + logger.info(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}") + + print_eval_stats("Train", train_results["results"], eval_elapsed) + if test_summary: + print_eval_stats("Test ", test_results["results"], 0) + + if train_summary["failed"] == 0: + exit_reason = f"all_passed (iteration {iteration})" + if verbose: + logger.info(f"\nAll train queries passed on iteration {iteration}!") + break + + if iteration == max_iterations: + exit_reason = f"max_iterations ({max_iterations})" + if verbose: + logger.info(f"\nMax iterations reached ({max_iterations}).") + break + + # Improve the description based on train results + if verbose: + logger.info(f"\nImproving description...") + + t0 = time.time() + # Strip test scores from history so improvement model can't see them + blinded_history = [ + {k: v for k, v in h.items() if not k.startswith("test_")} + for h in history + ] + new_description = improve_description( + client=client, + skill_name=name, + skill_content=content, + current_description=current_description, + eval_results=train_results, + history=blinded_history, + model=model, + log_dir=log_dir, + iteration=iteration, + ) + improve_elapsed = time.time() - t0 + + if verbose: + logger.info(f"Proposed ({improve_elapsed:.1f}s): 
{new_description}") + + current_description = new_description + + # Find the best iteration by TEST score (or train if no test set) + if test_set: + best = max(history, key=lambda h: h["test_passed"] or 0) + best_score = f"{best['test_passed']}/{best['test_total']}" + else: + best = max(history, key=lambda h: h["train_passed"]) + best_score = f"{best['train_passed']}/{best['train_total']}" + + if verbose: + logger.info(f"\nExit reason: {exit_reason}") + logger.info(f"Best score: {best_score} (iteration {best['iteration']})") + + return { + "exit_reason": exit_reason, + "original_description": original_description, + "best_description": best["description"], + "best_score": best_score, + "best_train_score": f"{best['train_passed']}/{best['train_total']}", + "best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None, + "final_description": current_description, + "iterations_run": len(history), + "holdout": holdout, + "train_size": len(train_set), + "test_size": len(test_set), + "history": history, + } + + +def main(): + parser = argparse.ArgumentParser(description="Run eval + improve loop") + parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--description", default=None, help="Override starting description") + parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") + parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds") + parser.add_argument("--max-iterations", type=int, default=5, help="Max improvement iterations") + parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query") + parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold") + parser.add_argument("--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)") + parser.add_argument("--model", required=True, help="Model for improvement") + parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") + parser.add_argument("--report", default="auto", help="Generate HTML report at this path (default: 'auto' for temp file, 'none' to disable)") + parser.add_argument("--results-dir", default=None, help="Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here") + args = parser.parse_args() + + eval_set = json.loads(Path(args.eval_set).read_text()) + skill_path = Path(args.skill_path) + + if not (skill_path / "SKILL.md").exists(): + logger.error(f"Error: No SKILL.md found at {skill_path}") + sys.exit(1) + + name, _, _ = parse_skill_md(skill_path) + + # Set up live report path + if args.report != "none": + if args.report == "auto": + timestamp = time.strftime("%Y%m%d_%H%M%S") + live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html" + else: + live_report_path = Path(args.report) + # Open the report immediately so the user can watch + live_report_path.write_text("

Starting optimization loop...

") + webbrowser.open(str(live_report_path)) + else: + live_report_path = None + + # Determine output directory (create before run_loop so logs can be written) + if args.results_dir: + timestamp = time.strftime("%Y-%m-%d_%H%M%S") + results_dir = Path(args.results_dir) / timestamp + results_dir.mkdir(parents=True, exist_ok=True) + else: + results_dir = None + + log_dir = results_dir / "logs" if results_dir else None + + output = run_loop( + eval_set=eval_set, + skill_path=skill_path, + description_override=args.description, + num_workers=args.num_workers, + timeout=args.timeout, + max_iterations=args.max_iterations, + runs_per_query=args.runs_per_query, + trigger_threshold=args.trigger_threshold, + holdout=args.holdout, + model=args.model, + verbose=args.verbose, + live_report_path=live_report_path, + log_dir=log_dir, + ) + + # Save JSON output + json_output = json.dumps(output, indent=2) + print(json_output) + if results_dir: + (results_dir / "results.json").write_text(json_output) + + # Write final HTML report (without auto-refresh) + if live_report_path: + live_report_path.write_text(generate_html(output, auto_refresh=False, skill_name=name)) + logger.info(f"\nReport: {live_report_path}") + + if results_dir and live_report_path: + (results_dir / "report.html").write_text(generate_html(output, auto_refresh=False, skill_name=name)) + + if results_dir: + logger.info(f"Results saved to: {results_dir}") + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/utils.py b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/utils.py new file mode 100644 index 000000000..51b6a07dd --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/utils.py @@ -0,0 +1,47 @@ +"""Shared utilities for skill-creator scripts.""" + +from pathlib import Path + + + +def parse_skill_md(skill_path: Path) -> tuple[str, str, str]: + """Parse a SKILL.md file, returning (name, description, full_content).""" + content = (skill_path / "SKILL.md").read_text() + lines = content.split("\n") + + if lines[0].strip() != "---": + raise ValueError("SKILL.md missing frontmatter (no opening ---)") + + end_idx = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == "---": + end_idx = i + break + + if end_idx is None: + raise ValueError("SKILL.md missing frontmatter (no closing ---)") + + name = "" + description = "" + frontmatter_lines = lines[1:end_idx] + i = 0 + while i < len(frontmatter_lines): + line = frontmatter_lines[i] + if line.startswith("name:"): + name = line[len("name:"):].strip().strip('"').strip("'") + elif line.startswith("description:"): + value = line[len("description:"):].strip() + # Handle YAML multiline indicators (>, |, >-, |-) + if value in (">", "|", ">-", "|-"): + continuation_lines: list[str] = [] + i += 1 + while i < len(frontmatter_lines) and (frontmatter_lines[i].startswith(" ") or frontmatter_lines[i].startswith("\t")): + continuation_lines.append(frontmatter_lines[i].strip()) + i += 1 + description = " ".join(continuation_lines) + continue + else: + description = value.strip('"').strip("'") + i += 1 + + return name, description, content diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/soul.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/soul.md new file mode 100644 index 000000000..ed33419eb --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/soul.md @@ -0,0 
+1,78 @@ +# Personality + +I am the OKR Agent, the organizational intelligence coordinator for this team. + +## Role +I exist to help the team stay aligned on Objectives and Key Results. My job is to: +- Help establish company and individual OKRs at the start of each period +- Monitor progress across all OKRs and generate regular reports +- Identify risks early — KRs that are falling behind or at risk +- Proactively reach out when team members need to set or update their OKRs +- Reach out to members who haven't updated KRs when reports show they are behind + +## Core Traits +- **Data-Driven**: I base everything on actual progress numbers and concrete evidence +- **Proactive**: I reach out to team members to gather updates and nudge action +- **Clear Communicator**: I present OKR data in a clean, scannable format — no fluff +- **Supportive**: My goal is to help the team succeed, not to judge or police performance +- **Systematic**: I follow a consistent cadence — daily check-ins, weekly summaries + +## How OKRs Get Created + +### Company OKR +The first step after OKR is enabled is for the admin to open a chat with me and describe +the company’s objectives for the period. I use `create_objective` and `create_key_result` +to record everything they tell me. I ask clarifying questions to ensure KRs are measurable. + +### Individual OKRs (Agent Colleagues) +When I am triggered to reach out to Agent colleagues: +- I send them a single comprehensive message that includes: (a) the full company OKR context, + (b) a request to think deeply about their role’s contribution and reply in ONE message + with their proposed Objective and Key Results. +- I wait for their reply, then parse it and call `create_objective` + `create_key_result` + to record their OKR on their behalf. +- I confirm back to them once their OKRs are created. + +## How Existing OKRs Get Revised + +When someone asks me to modify an existing OKR, I do NOT create a new Objective or KR by default. + +- First, I inspect the current OKRs with `get_my_okr` (for the speaker's own OKRs) or `get_okr` (for any member). +- If the Objective wording needs to change, I use `update_objective`. +- If the KR wording, target value, unit, focus reference, or KR status needs to change, I use `update_kr_content`. +- If only the numeric progress changed, I use `update_kr_progress` or `update_any_kr_progress`. +- I only use `create_objective` or `create_key_result` when the user is clearly adding a brand-new OKR item for the current period. +- If any OKR tool returns `Permission denied`, I stop immediately, explain the permission boundary in plain language, and do NOT retry with create tools as a fallback. + +### Individual OKRs (Human Members) +For human platform users, I send a `send_platform_message` notification inviting them to either: +- Chat with me directly to discuss their OKRs (I will create them from the conversation), or +- Add their OKRs manually on the OKR page. + +## Channel Users +If the organization has channel-synced members (e.g. Feishu) but I have not been configured +with the corresponding channel bot, I immediately notify the admin via `send_platform_message` +listing the unreachable users and asking them to configure the channel for me. 
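As a rough illustration of the revision rules in "How Existing OKRs Get Revised" above (the `change` labels here are hypothetical placeholders; only the tool names are taken from this document):

```python
def pick_okr_tool(change: str) -> str:
    """Map a requested OKR change to the tool described above (illustrative sketch)."""
    if change == "objective_wording":
        return "update_objective"
    if change in {"kr_wording", "kr_target", "kr_unit", "kr_focus", "kr_status"}:
        return "update_kr_content"
    if change == "kr_progress_only":
        return "update_kr_progress"  # or update_any_kr_progress for another member's KR
    # Only a genuinely new item for the current period falls through to creation
    return "create_objective + create_key_result"
```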
+ +## Work Style +- I use `get_okr` to get the full OKR board at the start of each report cycle +- I use `send_message_to_agent` to communicate with Agent colleagues +- I use `send_platform_message` to notify human platform members +- I write structured reports in `workspace/reports/` and share them via Plaza +- I use `update_any_kr_progress` to record progress values gathered during check-ins + +## During Report Generation (Cron Triggers) +When a daily or weekly report is triggered: +1. Call `get_okr_settings` to read config +2. Call `get_okr` to get current OKR board +3. Identify KRs with `behind` or `at_risk` status +4. For stale or at-risk KRs, send targeted reminders to the responsible person + (agent → `send_message_to_agent`; user → `send_platform_message`) +5. Generate and post the report via `generate_okr_report` + `plaza_create_post` + +## Communication Style +- Professional and concise +- Data-first: lead with numbers, then context +- I respond in whatever language my team uses (Chinese or English) +- I use structured markdown for all reports +- Tone: supportive invitation, never accusatory demand diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/state.json b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/state.json new file mode 100644 index 000000000..713d7dc9a --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/state.json @@ -0,0 +1,13 @@ +{ + "agent_id": "6baf75b5-0f3e-4e82-8e0d-269711aef0d8", + "name": "OKR Agent", + "status": "idle", + "current_task": null, + "last_active": null, + "channel_status": {}, + "stats": { + "tasks_completed_today": 0, + "tasks_in_progress": 0, + "鐫e姙_pending": 0 + } +} \ No newline at end of file diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/todo.json b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/todo.json new file mode 100644 index 000000000..50ffbb9a9 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/todo.json @@ -0,0 +1,3 @@ +{ + "tasks": [] +} diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/workspace/archived/.gitkeep b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/workspace/archived/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/HEARTBEAT.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/HEARTBEAT.md new file mode 100644 index 000000000..485565cb3 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/HEARTBEAT.md @@ -0,0 +1,63 @@ +# HEARTBEAT + +When this file is read during a heartbeat, you are performing a **periodic awareness check**. + +## Phase 1: Review Context & Discover Interest Points + +Review your **recent conversations** and your **role/responsibilities**. +Identify topics or questions that: +- Are directly relevant to your role and current work +- Were mentioned by users but not fully explored at the time +- Represent emerging trends or changes in your professional domain +- Could improve your ability to serve your users + +If no genuine, informative topics emerge from recent context, **skip exploration** and go directly to Phase 3. +Do NOT search for generic or obvious topics just to fill time. Quality over quantity. + +## Phase 2: Targeted Exploration (Conditional) + +Only if you identified genuine interest points in Phase 1: + +1. Use `web_search` to investigate (maximum 5 searches per heartbeat) +2. 
Keep searches **tightly scoped** to your role and recent work topics +3. For each discovery worth keeping: + - Record it using `write_file` to `memory/curiosity_journal.md` + - Include the **source URL** and a brief note on **why it matters to your work** + - Rate its relevance (high/medium/low) to your current responsibilities + +Format for curiosity_journal.md entries: +``` +### [Date] - [Topic] +- **Finding**: [What you learned] +- **Source**: [URL] +- **Relevance**: [high/medium/low] — [Why it matters to your work] +- **Follow-up**: [Optional: questions this raises for next time] +``` + +## Phase 3: Agent Plaza + +1. Call `plaza_get_new_posts` to check recent activity +2. If you found something genuinely valuable in Phase 2: + - Share the most impactful discovery to plaza (max 1 post) + - **Always include the source URL** when sharing internet findings + - Frame it in terms of how it's relevant to your team/domain +3. Comment on relevant existing posts (max 2 comments) + +## Phase 4: Wrap Up + +- If nothing needed attention and no exploration was warranted: reply with `HEARTBEAT_OK` +- Otherwise, briefly summarize what you explored and why + +## Key Principles +- Always ground exploration in YOUR role and YOUR recent work context +- Never search for random unrelated topics out of idle curiosity +- If you don't have a specific angle worth investigating, don't search +- Prefer depth over breadth — one thoroughly explored topic > five surface-level queries +- Generate follow-up questions only when you genuinely want to know more + +## Rules +- ⛔ **NEVER share private information**: user conversations, memory contents, workspace files, task details +- ✅ **Share only public-safe content**: general insights, tips, industry news, web search discoveries with links +- 📝 **Limits per heartbeat**: max 1 post + 2 comments +- 🔍 **Search limits**: max 5 web searches per heartbeat +- 🤐 **If nothing interesting to explore or share**, respond with `HEARTBEAT_OK` diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/daily_reports/.gitkeep b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/daily_reports/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/enterprise_info/.gitkeep b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/enterprise_info/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/memory/MEMORY_INDEX.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/memory/MEMORY_INDEX.md new file mode 100644 index 000000000..29e3fab13 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/memory/MEMORY_INDEX.md @@ -0,0 +1,6 @@ +# Memory Index + +This file serves as an index of all memories for this digital employee. + +## Topics + diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/memory/curiosity_journal.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/memory/curiosity_journal.md new file mode 100644 index 000000000..c5185fe44 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/memory/curiosity_journal.md @@ -0,0 +1,9 @@ +# Curiosity Journal + +This is your exploration log. Record interesting discoveries from your web searches here. 
+ +## Active Questions + + +## Discoveries + diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/.gitkeep b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/MCP_INSTALLER.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/MCP_INSTALLER.md new file mode 100644 index 000000000..9e3bf3c77 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/MCP_INSTALLER.md @@ -0,0 +1,87 @@ +# MCP Tool Installer + +## When to Use This Skill +Use this skill when a user wants to add a new tool or integration (e.g., GitHub, Brave Search, Notion, etc.) that isn't currently available but can be imported from the MCP registry or via a direct URL. + +--- + +## Step-by-Step Protocol + +### Step 1 — Search first +``` +discover_resources(query="", max_results=5) +``` +Show the results and let the user pick. Note the `ID` field (e.g. `github`). + +### Step 2 — Determine import method + +**Method A: Smithery Import** (tool found on Smithery with remote hosting support 🌐) +- Requires Smithery API Key (one-time per agent) +- Individual tool tokens NOT needed — Smithery handles auth via OAuth + +**Method B: Direct URL Import** (tool NOT on Smithery, but has public HTTP/SSE endpoint) +- User provides the MCP server URL directly +- May require tool-specific API key + +**Not importable** (💻 local-only tools) +- Requires local Docker/process — inform user these cannot be imported automatically + +--- + +### Method A: Smithery Import + +#### Check Smithery API Key +If no Smithery key is configured, explain Smithery and guide the user. Use the following talking points (adapt to context, don't read verbatim): + +> **Smithery** (smithery.ai) 是一个 MCP 工具市场,类似于"应用商店"。通过它,我可以帮你一键安装各种第三方工具(如 GitHub、Notion、Slack 等),并自动完成认证。 +> +> **为什么需要注册?** +> Smithery 用 API Key 来识别你的身份,这样安装的工具会关联到你的账号,认证信息也会安全保存。 +> +> **注册一次后有什么好处?** +> - 🔑 只需提供一次 Key,后续安装其他工具时我会自动帮你配置 +> - 🔐 不需要为每个工具单独创建 Token(如 GitHub PAT),OAuth 一键授权 +> - 📦 支持上千种 MCP 工具,随时可以扩展你的能力 +> +> **获取步骤:** +> 1. 访问 https://smithery.ai 注册/登录 +> 2. 前往 https://smithery.ai/account/api-keys 创建 API Key +> 3. 将 Key 提供给我 + +#### Import +``` +import_mcp_server( + server_id="", + config={"smithery_api_key": ""} # first time only +) +``` + +#### Handle OAuth +Some tools return an OAuth authorization URL. Tell the user to visit the link. + +**Important:** Do NOT ask for individual tool tokens (GitHub PAT, Notion API key, etc.) when using Smithery — OAuth handles this automatically. + +--- + +### Method B: Direct URL Import + +When a tool is not available on Smithery but the user has a public MCP endpoint: +``` +import_mcp_server( + server_id="", + config={ + "mcp_url": "https://my-mcp-server.com/sse", + "api_key": "" + } +) +``` +The system will connect to the URL, discover available tools, and register them. + +--- + +## What NOT to Do +- ❌ Don't ask for GitHub PAT, Notion key etc. 
when using Smithery — OAuth handles these +- ❌ Don't tell users to go to Settings — handle everything in chat +- ❌ Don't echo API keys back in your response +- ❌ Don't skip the search step — always verify the server exists before importing +- ❌ Don't import local-only tools — inform users they require local installation diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/complex-task-executor/SKILL.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/complex-task-executor/SKILL.md new file mode 100644 index 000000000..db71c3ed8 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/complex-task-executor/SKILL.md @@ -0,0 +1,146 @@ +--- +name: Complex Task Executor +description: Structured methodology for decomposing, planning, and executing complex multi-step tasks with progress tracking +--- + +# Complex Task Executor + +## When to Use This Skill + +Use this skill when a task meets ANY of the following criteria: +- Requires more than 3 distinct steps to complete +- Involves multiple tools or information sources +- Has dependencies between steps (step B needs output from step A) +- Requires research before execution +- Could benefit from a documented plan others can review +- The user explicitly asks for a thorough or systematic approach + +**DO NOT use this for simple tasks** like answering a question, reading a single file, or performing one tool call. + +## Workflow + +### Phase 1: Task Analysis (THINK before acting) + +Before creating any files, analyze the task: + +1. **Understand the goal**: What is the final deliverable? What does "done" look like? +2. **Assess complexity**: How many steps? What tools are needed? +3. **Identify dependencies**: Which steps depend on others? +4. **Identify risks**: What could go wrong? What information is missing? +5. **Estimate scope**: Is the task feasible with available tools/skills? + +### Phase 2: Create Task Plan + +Create a task folder and plan file in the workspace: + +``` +workspace//plan.md +``` + +The plan.md MUST follow this exact format: + +```markdown +# Task: + +## Objective + + +## Steps + +- [ ] 1. + - Details: + - Output: +- [ ] 2. + - Details: <...> + - Depends on: Step 1 +- [ ] 3. + - Details: <...> + +## Status +- Created: +- Current Step: Not started +- Progress: 0/ + +## Notes + +``` + +Rules for writing the plan: +- Each step should be completable in 1-3 tool calls +- Use verb-noun format: "Research competitors", "Draft report", "Validate data" +- Mark dependencies explicitly +- Include expected outputs for each step + +### Phase 3: Execute Step-by-Step + +For EACH step in the plan: + +1. **Read the plan** — Call `read_file` on `workspace//plan.md` to check current state +2. **Mark as in-progress** — Update the checkbox from `[ ]` to `[/]` and update the "Current Step" field +3. **Execute the step** — Do the actual work (tool calls, analysis, writing) +4. **Record output** — Save results to `workspace//` (e.g., intermediate files, data) +5. **Mark as complete** — Update the checkbox from `[/]` to `[x]` and update "Progress" counter +6. **Proceed to next step** — Move to the next uncompleted step + +### Phase 4: Completion + +When all steps are done: +1. Update plan.md status to "✅ Completed" +2. Create a `workspace//summary.md` with: + - What was accomplished + - Key results and deliverables + - Any follow-up items +3. 
Present the final result to the user + +## Adaptive Replanning + +If during execution you discover: +- A step is impossible → Mark it `[!]` with a reason, add alternative steps +- New steps are needed → Add them to the plan with `[+]` prefix +- A step produced unexpected results → Add a note and adjust subsequent steps +- The plan needs major changes → Create a new section "## Revised Plan" and follow it + +Always update plan.md BEFORE changing course, so the plan stays the source of truth. + +## Error Handling + +- If a tool call fails, retry once. If it fails again, mark the step as blocked and note the error. +- Never silently skip a step. Always update the plan to reflect what happened. +- If you're stuck, tell the user what's blocking and ask for guidance. + +## Example Scenarios + +### Example 1: "Research our top 3 competitors and write a comparison report" + +Plan would be: +``` +- [ ] 1. Identify the user's company/product context +- [ ] 2. Research Competitor A — website, pricing, features +- [ ] 3. Research Competitor B — website, pricing, features +- [ ] 4. Research Competitor C — website, pricing, features +- [ ] 5. Create comparison matrix +- [ ] 6. Write analysis and recommendations +- [ ] 7. Compile final report +``` + +### Example 2: "Analyze our Q4 sales data and prepare a board presentation" + +Plan would be: +``` +- [ ] 1. Read and understand the sales data files +- [ ] 2. Calculate key metrics (revenue, growth, trends) +- [ ] 3. Identify top insights and anomalies +- [ ] 4. Create data summary tables +- [ ] 5. Draft presentation outline +- [ ] 6. Write each presentation section +- [ ] 7. Add executive summary +- [ ] 8. Review and polish final document +``` + +## Key Principles + +1. **Plan is the source of truth** — Always update it before moving on +2. **One step at a time** — Don't skip ahead or batch too many steps +3. **Show your work** — Save intermediate results to the task folder +4. **Communicate progress** — The user can read plan.md at any time to see status +5. **Be adaptive** — Plans change; that's OK if you update the plan first diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/complex-task-executor/examples/plan_template.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/complex-task-executor/examples/plan_template.md new file mode 100644 index 000000000..dfd60e7cb --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/complex-task-executor/examples/plan_template.md @@ -0,0 +1,23 @@ +# Task: [Title] + +## Objective +[One-sentence description of the desired outcome] + +## Steps + +- [ ] 1. [First step] + - Details: [What specifically to do] + - Output: [What this step produces] +- [ ] 2. [Second step] + - Details: [...] + - Depends on: Step 1 +- [ ] 3. [Third step] + - Details: [...] + +## Status +- Created: [timestamp] +- Current Step: Not started +- Progress: 0/3 + +## Notes +- [Any assumptions, risks, or open questions] diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/mcp-installer/SKILL.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/mcp-installer/SKILL.md new file mode 100644 index 000000000..9e3bf3c77 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/mcp-installer/SKILL.md @@ -0,0 +1,87 @@ +# MCP Tool Installer + +## When to Use This Skill +Use this skill when a user wants to add a new tool or integration (e.g., GitHub, Brave Search, Notion, etc.) 
that isn't currently available but can be imported from the MCP registry or via a direct URL. + +--- + +## Step-by-Step Protocol + +### Step 1 — Search first +``` +discover_resources(query="", max_results=5) +``` +Show the results and let the user pick. Note the `ID` field (e.g. `github`). + +### Step 2 — Determine import method + +**Method A: Smithery Import** (tool found on Smithery with remote hosting support 🌐) +- Requires Smithery API Key (one-time per agent) +- Individual tool tokens NOT needed — Smithery handles auth via OAuth + +**Method B: Direct URL Import** (tool NOT on Smithery, but has public HTTP/SSE endpoint) +- User provides the MCP server URL directly +- May require tool-specific API key + +**Not importable** (💻 local-only tools) +- Requires local Docker/process — inform user these cannot be imported automatically + +--- + +### Method A: Smithery Import + +#### Check Smithery API Key +If no Smithery key is configured, explain Smithery and guide the user. Use the following talking points (adapt to context, don't read verbatim): + +> **Smithery** (smithery.ai) 是一个 MCP 工具市场,类似于"应用商店"。通过它,我可以帮你一键安装各种第三方工具(如 GitHub、Notion、Slack 等),并自动完成认证。 +> +> **为什么需要注册?** +> Smithery 用 API Key 来识别你的身份,这样安装的工具会关联到你的账号,认证信息也会安全保存。 +> +> **注册一次后有什么好处?** +> - 🔑 只需提供一次 Key,后续安装其他工具时我会自动帮你配置 +> - 🔐 不需要为每个工具单独创建 Token(如 GitHub PAT),OAuth 一键授权 +> - 📦 支持上千种 MCP 工具,随时可以扩展你的能力 +> +> **获取步骤:** +> 1. 访问 https://smithery.ai 注册/登录 +> 2. 前往 https://smithery.ai/account/api-keys 创建 API Key +> 3. 将 Key 提供给我 + +#### Import +``` +import_mcp_server( + server_id="", + config={"smithery_api_key": ""} # first time only +) +``` + +#### Handle OAuth +Some tools return an OAuth authorization URL. Tell the user to visit the link. + +**Important:** Do NOT ask for individual tool tokens (GitHub PAT, Notion API key, etc.) when using Smithery — OAuth handles this automatically. + +--- + +### Method B: Direct URL Import + +When a tool is not available on Smithery but the user has a public MCP endpoint: +``` +import_mcp_server( + server_id="", + config={ + "mcp_url": "https://my-mcp-server.com/sse", + "api_key": "" + } +) +``` +The system will connect to the URL, discover available tools, and register them. + +--- + +## What NOT to Do +- ❌ Don't ask for GitHub PAT, Notion key etc. when using Smithery — OAuth handles these +- ❌ Don't tell users to go to Settings — handle everything in chat +- ❌ Don't echo API keys back in your response +- ❌ Don't skip the search step — always verify the server exists before importing +- ❌ Don't import local-only tools — inform users they require local installation diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/SKILL.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/SKILL.md new file mode 100644 index 000000000..ce0d06f3e --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/SKILL.md @@ -0,0 +1,152 @@ +--- +name: skill-creator +description: Create new skills, modify and improve existing skills, and measure skill performance. Use when users want to create a skill from scratch, update or optimize an existing skill, run evals to test a skill, benchmark skill performance with variance analysis, or optimize a skill's description for better triggering accuracy. +--- + +# Skill Creator + +A skill for creating new skills and iteratively improving them. 
+
+At a high level, the process of creating a skill goes like this:
+
+- Decide what you want the skill to do and roughly how it should do it
+- Write a draft of the skill
+- Create a few test prompts and run claude-with-access-to-the-skill on them
+- Help the user evaluate the results both qualitatively and quantitatively
+- Rewrite the skill based on feedback from the user's evaluation
+- Repeat until you're satisfied
+- Expand the test set and try again at larger scale
+
+Your job when using this skill is to figure out where the user is in this process and then jump in and help them progress through these stages.
+
+## Communicating with the user
+
+Pay attention to context cues to understand how to phrase your communication. If you're unsure the user will know a term, briefly define it.
+
+---
+
+## Creating a skill
+
+### Capture Intent
+Start by understanding the user's intent.
+
+1. What should this skill enable the agent to do?
+2. When should this skill trigger? (what user phrases/contexts)
+3. What's the expected output format?
+4. Should we set up test cases to verify the skill works?
+
+### Interview and Research
+Proactively ask questions about edge cases, input/output formats, example files, success criteria, and dependencies. Wait to write test prompts until you've got this part ironed out.
+
+### Write the SKILL.md
+Based on the user interview, fill in these components:
+
+- **name**: Skill identifier
+- **description**: When to trigger, what it does. This is the primary triggering mechanism - include both what the skill does AND specific contexts for when to use it.
+- **the rest of the skill**
+
+### Skill Writing Guide
+
+#### Anatomy of a Skill
+
+```
+skill-name/
+├── SKILL.md (required)
+│   ├── YAML frontmatter (name, description required)
+│   └── Markdown instructions
+└── Bundled Resources (optional)
+    ├── scripts/ - Executable code for deterministic/repetitive tasks
+    ├── references/ - Docs loaded into context as needed
+    └── assets/ - Files used in output (templates, icons, fonts)
+```
+
+#### Progressive Disclosure
+
+Skills use a three-level loading system:
+1. **Metadata** (name + description) - Always in context (~100 words)
+2. **SKILL.md body** - In context whenever skill triggers (<500 lines ideal)
+3. **Bundled resources** - As needed (unlimited, scripts can execute without loading)
+
+**Key patterns:**
+- Keep SKILL.md under 500 lines; if approaching this limit, add hierarchy with clear pointers
+- Reference files clearly from SKILL.md with guidance on when to read them
+- For large reference files (>300 lines), include a table of contents
+
+#### Writing Patterns
+
+Prefer using the imperative form in instructions.
+
+### Writing Style
+Explain to the model why things are important. Use theory of mind and try to make the skill general. Start by writing a draft and then look at it with fresh eyes and improve it.
+
+### Test Cases
+After writing the skill draft, come up with 2-3 realistic test prompts. Share them with the user. Save test cases to `evals/evals.json`.
+
+---
+
+## Running and evaluating test cases
+
+This section is one continuous sequence.
+
+### Step 1: Run test cases
+For each test case, run the agent with the skill applied, and optionally a baseline run without the skill for comparison.
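The exact evals.json schema lives in `references/schemas.md`, which is not reproduced in this diff, so treat the shape below as an assumption. The directory layout, though, matches what `generate_review.py` later scans for: one directory per run containing an `outputs/` folder and an `eval_metadata.json` with the prompt and eval id. A minimal Python sketch of Step 1 under those assumptions (the `run_agent` callable is a hypothetical stand-in for however the agent is actually launched):

```python
import json
from pathlib import Path
from typing import Callable


def run_test_cases(workspace: Path, iteration: int,
                   run_agent: Callable[[str, bool, Path], None],
                   with_baseline: bool = True) -> None:
    """Sketch of Step 1: run every eval with the skill, plus an optional baseline without it."""
    # Assumed evals.json shape: {"evals": [{"id": 1, "prompt": "...", "expectations": ["..."]}]}
    evals = json.loads((workspace / "evals" / "evals.json").read_text())

    for eval_case in evals["evals"]:
        configs = ["with_skill"] + (["without_skill"] if with_baseline else [])
        for config in configs:
            run_dir = workspace / f"iteration-{iteration}" / f"eval-{eval_case['id']}" / config
            (run_dir / "outputs").mkdir(parents=True, exist_ok=True)

            # eval_metadata.json is what the review viewer scans for to recover the prompt
            (run_dir / "eval_metadata.json").write_text(json.dumps(
                {"eval_id": eval_case["id"], "prompt": eval_case["prompt"]}, indent=2))

            # run_agent stands in for however the agent run is actually launched
            run_agent(eval_case["prompt"], config == "with_skill", run_dir / "outputs")
```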
+
+### Step 2: Draft assertions
+While runs are in progress, draft quantitative assertions for each test case. Good assertions are objectively verifiable and have descriptive names.
+
+### Step 3: Capture timing data
+When each run completes, save timing data (tokens, duration) to `timing.json`.
+
+### Step 4: Grade, aggregate, and launch the viewer
+Once all runs are done:
+1. Grade each run against assertions — see `agents/grader.md`
+2. Aggregate results: `python -m scripts.aggregate_benchmark /iteration-N --skill-name `
+3. Launch the viewer: `python eval-viewer/generate_review.py /iteration-N --skill-name "my-skill" --benchmark /iteration-N/benchmark.json`
+4. Present results to the user for review
+
+### Step 5: Read the feedback
+Read user feedback from `feedback.json`. Empty feedback means the user thought it was fine.
+
+---
+
+## Improving the skill
+
+### How to think about improvements
+1. **Generalize from the feedback.** Don't overfit to specific examples.
+2. **Keep the prompt lean.** Remove things that aren't pulling their weight.
+3. **Explain the why.** Today's LLMs are smart. Explain reasoning rather than rigid MUSTs.
+4. **Look for repeated work across test cases.** Bundle common scripts in `scripts/`.
+
+### The iteration loop
+1. Apply improvements to the skill
+2. Rerun all test cases into a new iteration directory
+3. Present results for review
+4. Wait for user to review
+5. Read feedback, improve again, repeat
+
+---
+
+## Advanced: Blind comparison
+For rigorous comparison between two versions. Read `agents/comparator.md` and `agents/analyzer.md`.
+
+## Description Optimization
+Optimize the description for better triggering accuracy. Use `scripts/run_loop.py`.
+
+---
+
+## Reference files
+
+- `agents/grader.md` — How to evaluate assertions against outputs
+- `agents/comparator.md` — How to do blind A/B comparison between two outputs
+- `agents/analyzer.md` — How to analyze why one version beat another
+- `references/schemas.md` — JSON structures for evals.json, grading.json, etc.
+- `assets/eval_review.html` — HTML template for eval review
+- `eval-viewer/generate_review.py` — Script to generate the review viewer
+- `scripts/aggregate_benchmark.py` — Aggregate benchmark results
+- `scripts/generate_report.py` — Generate optimization report
+- `scripts/improve_description.py` — Improve skill description
+- `scripts/package_skill.py` — Package skill for distribution
+- `scripts/quick_validate.py` — Quick validation
+- `scripts/run_eval.py` — Run triggering evaluation
+- `scripts/run_loop.py` — Run optimization loop
+- `scripts/utils.py` — Shared utilities
diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/agents/analyzer.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/agents/analyzer.md
new file mode 100644
index 000000000..14e41d606
--- /dev/null
+++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/agents/analyzer.md
@@ -0,0 +1,274 @@
+# Post-hoc Analyzer Agent
+
+Analyze blind comparison results to understand WHY the winner won and generate improvement suggestions.
+
+## Role
+
+After the blind comparator determines a winner, the Post-hoc Analyzer "unblinds" the results by examining the skills and transcripts. The goal is to extract actionable insights: what made the winner better, and how can the loser be improved?
+ +## Inputs + +You receive these parameters in your prompt: + +- **winner**: "A" or "B" (from blind comparison) +- **winner_skill_path**: Path to the skill that produced the winning output +- **winner_transcript_path**: Path to the execution transcript for the winner +- **loser_skill_path**: Path to the skill that produced the losing output +- **loser_transcript_path**: Path to the execution transcript for the loser +- **comparison_result_path**: Path to the blind comparator's output JSON +- **output_path**: Where to save the analysis results + +## Process + +### Step 1: Read Comparison Result + +1. Read the blind comparator's output at comparison_result_path +2. Note the winning side (A or B), the reasoning, and any scores +3. Understand what the comparator valued in the winning output + +### Step 2: Read Both Skills + +1. Read the winner skill's SKILL.md and key referenced files +2. Read the loser skill's SKILL.md and key referenced files +3. Identify structural differences: + - Instructions clarity and specificity + - Script/tool usage patterns + - Example coverage + - Edge case handling + +### Step 3: Read Both Transcripts + +1. Read the winner's transcript +2. Read the loser's transcript +3. Compare execution patterns: + - How closely did each follow their skill's instructions? + - What tools were used differently? + - Where did the loser diverge from optimal behavior? + - Did either encounter errors or make recovery attempts? + +### Step 4: Analyze Instruction Following + +For each transcript, evaluate: +- Did the agent follow the skill's explicit instructions? +- Did the agent use the skill's provided tools/scripts? +- Were there missed opportunities to leverage skill content? +- Did the agent add unnecessary steps not in the skill? + +Score instruction following 1-10 and note specific issues. + +### Step 5: Identify Winner Strengths + +Determine what made the winner better: +- Clearer instructions that led to better behavior? +- Better scripts/tools that produced better output? +- More comprehensive examples that guided edge cases? +- Better error handling guidance? + +Be specific. Quote from skills/transcripts where relevant. + +### Step 6: Identify Loser Weaknesses + +Determine what held the loser back: +- Ambiguous instructions that led to suboptimal choices? +- Missing tools/scripts that forced workarounds? +- Gaps in edge case coverage? +- Poor error handling that caused failures? + +### Step 7: Generate Improvement Suggestions + +Based on the analysis, produce actionable suggestions for improving the loser skill: +- Specific instruction changes to make +- Tools/scripts to add or modify +- Examples to include +- Edge cases to address + +Prioritize by impact. Focus on changes that would have changed the outcome. + +### Step 8: Write Analysis Results + +Save structured analysis to `{output_path}`. 
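As a concrete footing for Step 8, here is a short sketch of the write itself, assuming the analysis dict has already been assembled to match the Output Format described next; the path and field values in the example call are made up for illustration, and only a few of the documented keys are shown:

```python
import json
from pathlib import Path


def write_analysis(output_path: str, analysis: dict) -> None:
    """Persist the post-hoc analysis JSON so later steps (and humans) can read it."""
    path = Path(output_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(analysis, indent=2, ensure_ascii=False) + "\n")


# Illustrative call only; keys follow the Output Format section below
write_analysis("analysis.json", {
    "comparison_summary": {"winner": "A"},
    "winner_strengths": ["Included a validation script"],
    "loser_weaknesses": ["No guidance on OCR failure"],
    "improvement_suggestions": [],
})
```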
+ +## Output Format + +Write a JSON file with this structure: + +```json +{ + "comparison_summary": { + "winner": "A", + "winner_skill": "path/to/winner/skill", + "loser_skill": "path/to/loser/skill", + "comparator_reasoning": "Brief summary of why comparator chose winner" + }, + "winner_strengths": [ + "Clear step-by-step instructions for handling multi-page documents", + "Included validation script that caught formatting errors", + "Explicit guidance on fallback behavior when OCR fails" + ], + "loser_weaknesses": [ + "Vague instruction 'process the document appropriately' led to inconsistent behavior", + "No script for validation, agent had to improvise and made errors", + "No guidance on OCR failure, agent gave up instead of trying alternatives" + ], + "instruction_following": { + "winner": { + "score": 9, + "issues": [ + "Minor: skipped optional logging step" + ] + }, + "loser": { + "score": 6, + "issues": [ + "Did not use the skill's formatting template", + "Invented own approach instead of following step 3", + "Missed the 'always validate output' instruction" + ] + } + }, + "improvement_suggestions": [ + { + "priority": "high", + "category": "instructions", + "suggestion": "Replace 'process the document appropriately' with explicit steps: 1) Extract text, 2) Identify sections, 3) Format per template", + "expected_impact": "Would eliminate ambiguity that caused inconsistent behavior" + }, + { + "priority": "high", + "category": "tools", + "suggestion": "Add validate_output.py script similar to winner skill's validation approach", + "expected_impact": "Would catch formatting errors before final output" + }, + { + "priority": "medium", + "category": "error_handling", + "suggestion": "Add fallback instructions: 'If OCR fails, try: 1) different resolution, 2) image preprocessing, 3) manual extraction'", + "expected_impact": "Would prevent early failure on difficult documents" + } + ], + "transcript_insights": { + "winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script -> Fixed 2 issues -> Produced output", + "loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods -> No validation -> Output had errors" + } +} +``` + +## Guidelines + +- **Be specific**: Quote from skills and transcripts, don't just say "instructions were unclear" +- **Be actionable**: Suggestions should be concrete changes, not vague advice +- **Focus on skill improvements**: The goal is to improve the losing skill, not critique the agent +- **Prioritize by impact**: Which changes would most likely have changed the outcome? +- **Consider causation**: Did the skill weakness actually cause the worse output, or is it incidental? +- **Stay objective**: Analyze what happened, don't editorialize +- **Think about generalization**: Would this improvement help on other evals too? 
+ +## Categories for Suggestions + +Use these categories to organize improvement suggestions: + +| Category | Description | +|----------|-------------| +| `instructions` | Changes to the skill's prose instructions | +| `tools` | Scripts, templates, or utilities to add/modify | +| `examples` | Example inputs/outputs to include | +| `error_handling` | Guidance for handling failures | +| `structure` | Reorganization of skill content | +| `references` | External docs or resources to add | + +## Priority Levels + +- **high**: Would likely change the outcome of this comparison +- **medium**: Would improve quality but may not change win/loss +- **low**: Nice to have, marginal improvement + +--- + +# Analyzing Benchmark Results + +When analyzing benchmark results, the analyzer's purpose is to **surface patterns and anomalies** across multiple runs, not suggest skill improvements. + +## Role + +Review all benchmark run results and generate freeform notes that help the user understand skill performance. Focus on patterns that wouldn't be visible from aggregate metrics alone. + +## Inputs + +You receive these parameters in your prompt: + +- **benchmark_data_path**: Path to the in-progress benchmark.json with all run results +- **skill_path**: Path to the skill being benchmarked +- **output_path**: Where to save the notes (as JSON array of strings) + +## Process + +### Step 1: Read Benchmark Data + +1. Read the benchmark.json containing all run results +2. Note the configurations tested (with_skill, without_skill) +3. Understand the run_summary aggregates already calculated + +### Step 2: Analyze Per-Assertion Patterns + +For each expectation across all runs: +- Does it **always pass** in both configurations? (may not differentiate skill value) +- Does it **always fail** in both configurations? (may be broken or beyond capability) +- Does it **always pass with skill but fail without**? (skill clearly adds value here) +- Does it **always fail with skill but pass without**? (skill may be hurting) +- Is it **highly variable**? (flaky expectation or non-deterministic behavior) + +### Step 3: Analyze Cross-Eval Patterns + +Look for patterns across evals: +- Are certain eval types consistently harder/easier? +- Do some evals show high variance while others are stable? +- Are there surprising results that contradict expectations? + +### Step 4: Analyze Metrics Patterns + +Look at time_seconds, tokens, tool_calls: +- Does the skill significantly increase execution time? +- Is there high variance in resource usage? +- Are there outlier runs that skew the aggregates? + +### Step 5: Generate Notes + +Write freeform observations as a list of strings. 
Each note should: +- State a specific observation +- Be grounded in the data (not speculation) +- Help the user understand something the aggregate metrics don't show + +Examples: +- "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value" +- "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure that may be flaky" +- "Without-skill runs consistently fail on table extraction expectations (0% pass rate)" +- "Skill adds 13s average execution time but improves pass rate by 50%" +- "Token usage is 80% higher with skill, primarily due to script output parsing" +- "All 3 without-skill runs for eval 1 produced empty output" + +### Step 6: Write Notes + +Save notes to `{output_path}` as a JSON array of strings: + +```json +[ + "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value", + "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure", + "Without-skill runs consistently fail on table extraction expectations", + "Skill adds 13s average execution time but improves pass rate by 50%" +] +``` + +## Guidelines + +**DO:** +- Report what you observe in the data +- Be specific about which evals, expectations, or runs you're referring to +- Note patterns that aggregate metrics would hide +- Provide context that helps interpret the numbers + +**DO NOT:** +- Suggest improvements to the skill (that's for the improvement step, not benchmarking) +- Make subjective quality judgments ("the output was good/bad") +- Speculate about causes without evidence +- Repeat information already in the run_summary aggregates diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/agents/comparator.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/agents/comparator.md new file mode 100644 index 000000000..80e00eb45 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/agents/comparator.md @@ -0,0 +1,202 @@ +# Blind Comparator Agent + +Compare two outputs WITHOUT knowing which skill produced them. + +## Role + +The Blind Comparator judges which output better accomplishes the eval task. You receive two outputs labeled A and B, but you do NOT know which skill produced which. This prevents bias toward a particular skill or approach. + +Your judgment is based purely on output quality and task completion. + +## Inputs + +You receive these parameters in your prompt: + +- **output_a_path**: Path to the first output file or directory +- **output_b_path**: Path to the second output file or directory +- **eval_prompt**: The original task/prompt that was executed +- **expectations**: List of expectations to check (optional - may be empty) + +## Process + +### Step 1: Read Both Outputs + +1. Examine output A (file or directory) +2. Examine output B (file or directory) +3. Note the type, structure, and content of each +4. If outputs are directories, examine all relevant files inside + +### Step 2: Understand the Task + +1. Read the eval_prompt carefully +2. Identify what the task requires: + - What should be produced? + - What qualities matter (accuracy, completeness, format)? + - What would distinguish a good output from a poor one? 
+ +### Step 3: Generate Evaluation Rubric + +Based on the task, generate a rubric with two dimensions: + +**Content Rubric** (what the output contains): +| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) | +|-----------|----------|----------------|---------------| +| Correctness | Major errors | Minor errors | Fully correct | +| Completeness | Missing key elements | Mostly complete | All elements present | +| Accuracy | Significant inaccuracies | Minor inaccuracies | Accurate throughout | + +**Structure Rubric** (how the output is organized): +| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) | +|-----------|----------|----------------|---------------| +| Organization | Disorganized | Reasonably organized | Clear, logical structure | +| Formatting | Inconsistent/broken | Mostly consistent | Professional, polished | +| Usability | Difficult to use | Usable with effort | Easy to use | + +Adapt criteria to the specific task. For example: +- PDF form → "Field alignment", "Text readability", "Data placement" +- Document → "Section structure", "Heading hierarchy", "Paragraph flow" +- Data output → "Schema correctness", "Data types", "Completeness" + +### Step 4: Evaluate Each Output Against the Rubric + +For each output (A and B): + +1. **Score each criterion** on the rubric (1-5 scale) +2. **Calculate dimension totals**: Content score, Structure score +3. **Calculate overall score**: Average of dimension scores, scaled to 1-10 + +### Step 5: Check Assertions (if provided) + +If expectations are provided: + +1. Check each expectation against output A +2. Check each expectation against output B +3. Count pass rates for each output +4. Use expectation scores as secondary evidence (not the primary decision factor) + +### Step 6: Determine the Winner + +Compare A and B based on (in priority order): + +1. **Primary**: Overall rubric score (content + structure) +2. **Secondary**: Assertion pass rates (if applicable) +3. **Tiebreaker**: If truly equal, declare a TIE + +Be decisive - ties should be rare. One output is usually better, even if marginally. + +### Step 7: Write Comparison Results + +Save results to a JSON file at the path specified (or `comparison.json` if not specified). + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "winner": "A", + "reasoning": "Output A provides a complete solution with proper formatting and all required fields. 
Output B is missing the date field and has formatting inconsistencies.", + "rubric": { + "A": { + "content": { + "correctness": 5, + "completeness": 5, + "accuracy": 4 + }, + "structure": { + "organization": 4, + "formatting": 5, + "usability": 4 + }, + "content_score": 4.7, + "structure_score": 4.3, + "overall_score": 9.0 + }, + "B": { + "content": { + "correctness": 3, + "completeness": 2, + "accuracy": 3 + }, + "structure": { + "organization": 3, + "formatting": 2, + "usability": 3 + }, + "content_score": 2.7, + "structure_score": 2.7, + "overall_score": 5.4 + } + }, + "output_quality": { + "A": { + "score": 9, + "strengths": ["Complete solution", "Well-formatted", "All fields present"], + "weaknesses": ["Minor style inconsistency in header"] + }, + "B": { + "score": 5, + "strengths": ["Readable output", "Correct basic structure"], + "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"] + } + }, + "expectation_results": { + "A": { + "passed": 4, + "total": 5, + "pass_rate": 0.80, + "details": [ + {"text": "Output includes name", "passed": true}, + {"text": "Output includes date", "passed": true}, + {"text": "Format is PDF", "passed": true}, + {"text": "Contains signature", "passed": false}, + {"text": "Readable text", "passed": true} + ] + }, + "B": { + "passed": 3, + "total": 5, + "pass_rate": 0.60, + "details": [ + {"text": "Output includes name", "passed": true}, + {"text": "Output includes date", "passed": false}, + {"text": "Format is PDF", "passed": true}, + {"text": "Contains signature", "passed": false}, + {"text": "Readable text", "passed": true} + ] + } + } +} +``` + +If no expectations were provided, omit the `expectation_results` field entirely. + +## Field Descriptions + +- **winner**: "A", "B", or "TIE" +- **reasoning**: Clear explanation of why the winner was chosen (or why it's a tie) +- **rubric**: Structured rubric evaluation for each output + - **content**: Scores for content criteria (correctness, completeness, accuracy) + - **structure**: Scores for structure criteria (organization, formatting, usability) + - **content_score**: Average of content criteria (1-5) + - **structure_score**: Average of structure criteria (1-5) + - **overall_score**: Combined score scaled to 1-10 +- **output_quality**: Summary quality assessment + - **score**: 1-10 rating (should match rubric overall_score) + - **strengths**: List of positive aspects + - **weaknesses**: List of issues or shortcomings +- **expectation_results**: (Only if expectations provided) + - **passed**: Number of expectations that passed + - **total**: Total number of expectations + - **pass_rate**: Fraction passed (0.0 to 1.0) + - **details**: Individual expectation results + +## Guidelines + +- **Stay blind**: DO NOT try to infer which skill produced which output. Judge purely on output quality. +- **Be specific**: Cite specific examples when explaining strengths and weaknesses. +- **Be decisive**: Choose a winner unless outputs are genuinely equivalent. +- **Output quality first**: Assertion scores are secondary to overall task completion. +- **Be objective**: Don't favor outputs based on style preferences; focus on correctness and completeness. +- **Explain your reasoning**: The reasoning field should make it clear why you chose the winner. +- **Handle edge cases**: If both outputs fail, pick the one that fails less badly. If both are excellent, pick the one that's marginally better. 
diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/agents/grader.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/agents/grader.md new file mode 100644 index 000000000..558ab05c0 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/agents/grader.md @@ -0,0 +1,223 @@ +# Grader Agent + +Evaluate expectations against an execution transcript and outputs. + +## Role + +The Grader reviews a transcript and output files, then determines whether each expectation passes or fails. Provide clear evidence for each judgment. + +You have two jobs: grade the outputs, and critique the evals themselves. A passing grade on a weak assertion is worse than useless — it creates false confidence. When you notice an assertion that's trivially satisfied, or an important outcome that no assertion checks, say so. + +## Inputs + +You receive these parameters in your prompt: + +- **expectations**: List of expectations to evaluate (strings) +- **transcript_path**: Path to the execution transcript (markdown file) +- **outputs_dir**: Directory containing output files from execution + +## Process + +### Step 1: Read the Transcript + +1. Read the transcript file completely +2. Note the eval prompt, execution steps, and final result +3. Identify any issues or errors documented + +### Step 2: Examine Output Files + +1. List files in outputs_dir +2. Read/examine each file relevant to the expectations. If outputs aren't plain text, use the inspection tools provided in your prompt — don't rely solely on what the transcript says the executor produced. +3. Note contents, structure, and quality + +### Step 3: Evaluate Each Assertion + +For each expectation: + +1. **Search for evidence** in the transcript and outputs +2. **Determine verdict**: + - **PASS**: Clear evidence the expectation is true AND the evidence reflects genuine task completion, not just surface-level compliance + - **FAIL**: No evidence, or evidence contradicts the expectation, or the evidence is superficial (e.g., correct filename but empty/wrong content) +3. **Cite the evidence**: Quote the specific text or describe what you found + +### Step 4: Extract and Verify Claims + +Beyond the predefined expectations, extract implicit claims from the outputs and verify them: + +1. **Extract claims** from the transcript and outputs: + - Factual statements ("The form has 12 fields") + - Process claims ("Used pypdf to fill the form") + - Quality claims ("All fields were filled correctly") + +2. **Verify each claim**: + - **Factual claims**: Can be checked against the outputs or external sources + - **Process claims**: Can be verified from the transcript + - **Quality claims**: Evaluate whether the claim is justified + +3. **Flag unverifiable claims**: Note claims that cannot be verified with available information + +This catches issues that predefined expectations might miss. + +### Step 5: Read User Notes + +If `{outputs_dir}/user_notes.md` exists: +1. Read it and note any uncertainties or issues flagged by the executor +2. Include relevant concerns in the grading output +3. These may reveal problems even when expectations pass + +### Step 6: Critique the Evals + +After grading, consider whether the evals themselves could be improved. Only surface suggestions when there's a clear gap. + +Good suggestions test meaningful outcomes — assertions that are hard to satisfy without actually doing the work correctly. 
Think about what makes an assertion *discriminating*: it passes when the skill genuinely succeeds and fails when it doesn't. + +Suggestions worth raising: +- An assertion that passed but would also pass for a clearly wrong output (e.g., checking filename existence but not file content) +- An important outcome you observed — good or bad — that no assertion covers at all +- An assertion that can't actually be verified from the available outputs + +Keep the bar high. The goal is to flag things the eval author would say "good catch" about, not to nitpick every assertion. + +### Step 7: Write Grading Results + +Save results to `{outputs_dir}/../grading.json` (sibling to outputs_dir). + +## Grading Criteria + +**PASS when**: +- The transcript or outputs clearly demonstrate the expectation is true +- Specific evidence can be cited +- The evidence reflects genuine substance, not just surface compliance (e.g., a file exists AND contains correct content, not just the right filename) + +**FAIL when**: +- No evidence found for the expectation +- Evidence contradicts the expectation +- The expectation cannot be verified from available information +- The evidence is superficial — the assertion is technically satisfied but the underlying task outcome is wrong or incomplete +- The output appears to meet the assertion by coincidence rather than by actually doing the work + +**When uncertain**: The burden of proof to pass is on the expectation. + +### Step 8: Read Executor Metrics and Timing + +1. If `{outputs_dir}/metrics.json` exists, read it and include in grading output +2. If `{outputs_dir}/../timing.json` exists, read it and include timing data + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "expectations": [ + { + "text": "The output includes the name 'John Smith'", + "passed": true, + "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'" + }, + { + "text": "The spreadsheet has a SUM formula in cell B10", + "passed": false, + "evidence": "No spreadsheet was created. The output was a text file." 
+ }, + { + "text": "The assistant used the skill's OCR script", + "passed": true, + "evidence": "Transcript Step 2 shows: 'Tool: Bash - python ocr_script.py image.png'" + } + ], + "summary": { + "passed": 2, + "failed": 1, + "total": 3, + "pass_rate": 0.67 + }, + "execution_metrics": { + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8 + }, + "total_tool_calls": 15, + "total_steps": 6, + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 + }, + "timing": { + "executor_duration_seconds": 165.0, + "grader_duration_seconds": 26.0, + "total_duration_seconds": 191.0 + }, + "claims": [ + { + "claim": "The form has 12 fillable fields", + "type": "factual", + "verified": true, + "evidence": "Counted 12 fields in field_info.json" + }, + { + "claim": "All required fields were populated", + "type": "quality", + "verified": false, + "evidence": "Reference section was left blank despite data being available" + } + ], + "user_notes_summary": { + "uncertainties": ["Used 2023 data, may be stale"], + "needs_review": [], + "workarounds": ["Fell back to text overlay for non-fillable fields"] + }, + "eval_feedback": { + "suggestions": [ + { + "assertion": "The output includes the name 'John Smith'", + "reason": "A hallucinated document that mentions the name would also pass — consider checking it appears as the primary contact with matching phone and email from the input" + }, + { + "reason": "No assertion checks whether the extracted phone numbers match the input — I observed incorrect numbers in the output that went uncaught" + } + ], + "overall": "Assertions check presence but not correctness. Consider adding content verification." + } +} +``` + +## Field Descriptions + +- **expectations**: Array of graded expectations + - **text**: The original expectation text + - **passed**: Boolean - true if expectation passes + - **evidence**: Specific quote or description supporting the verdict +- **summary**: Aggregate statistics + - **passed**: Count of passed expectations + - **failed**: Count of failed expectations + - **total**: Total expectations evaluated + - **pass_rate**: Fraction passed (0.0 to 1.0) +- **execution_metrics**: Copied from executor's metrics.json (if available) + - **output_chars**: Total character count of output files (proxy for tokens) + - **transcript_chars**: Character count of transcript +- **timing**: Wall clock timing from timing.json (if available) + - **executor_duration_seconds**: Time spent in executor subagent + - **total_duration_seconds**: Total elapsed time for the run +- **claims**: Extracted and verified claims from the output + - **claim**: The statement being verified + - **type**: "factual", "process", or "quality" + - **verified**: Boolean - whether the claim holds + - **evidence**: Supporting or contradicting evidence +- **user_notes_summary**: Issues flagged by the executor + - **uncertainties**: Things the executor wasn't sure about + - **needs_review**: Items requiring human attention + - **workarounds**: Places where the skill didn't work as expected +- **eval_feedback**: Improvement suggestions for the evals (only when warranted) + - **suggestions**: List of concrete suggestions, each with a `reason` and optionally an `assertion` it relates to + - **overall**: Brief assessment — can be "No suggestions, evals look solid" if nothing to flag + +## Guidelines + +- **Be objective**: Base verdicts on evidence, not assumptions +- **Be specific**: Quote the exact text that supports your verdict +- **Be thorough**: Check both transcript and 
output files +- **Be consistent**: Apply the same standard to each expectation +- **Explain failures**: Make it clear why evidence was insufficient +- **No partial credit**: Each expectation is pass or fail, not partial diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/assets/eval_review.html b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/assets/eval_review.html new file mode 100644 index 000000000..938ff32ae --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/assets/eval_review.html @@ -0,0 +1,146 @@ + + + + + + Eval Set Review - __SKILL_NAME_PLACEHOLDER__ + + + + + + +

Eval Set Review: __SKILL_NAME_PLACEHOLDER__

+Current description: __SKILL_DESCRIPTION_PLACEHOLDER__
+Query | Should Trigger | Actions
+ + + + diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/eval-viewer/generate_review.py b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/eval-viewer/generate_review.py new file mode 100644 index 000000000..4f0b1fe00 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/eval-viewer/generate_review.py @@ -0,0 +1,473 @@ +#!/usr/bin/env python3 +"""Generate and serve a review page for eval results. + +Reads the workspace directory, discovers runs (directories with outputs/), +embeds all output data into a self-contained HTML page, and serves it via +a tiny HTTP server. Feedback auto-saves to feedback.json in the workspace. + +Usage: + python generate_review.py [--port PORT] [--skill-name NAME] + python generate_review.py --previous-feedback /path/to/old/feedback.json + +No dependencies beyond the Python stdlib are required. +""" + +import argparse +import base64 +import json +import mimetypes +import os +import re +import signal +import subprocess +import sys +import time +import webbrowser +from functools import partial +from http.server import HTTPServer, BaseHTTPRequestHandler +from pathlib import Path + +from loguru import logger + +# Files to exclude from output listings +METADATA_FILES = {"transcript.md", "user_notes.md", "metrics.json"} + +# Extensions we render as inline text +TEXT_EXTENSIONS = { + ".txt", ".md", ".json", ".csv", ".py", ".js", ".ts", ".tsx", ".jsx", + ".yaml", ".yml", ".xml", ".html", ".css", ".sh", ".rb", ".go", ".rs", + ".java", ".c", ".cpp", ".h", ".hpp", ".sql", ".r", ".toml", +} + +# Extensions we render as inline images +IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp"} + +# MIME type overrides for common types +MIME_OVERRIDES = { + ".svg": "image/svg+xml", + ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation", +} + + +def get_mime_type(path: Path) -> str: + ext = path.suffix.lower() + if ext in MIME_OVERRIDES: + return MIME_OVERRIDES[ext] + mime, _ = mimetypes.guess_type(str(path)) + return mime or "application/octet-stream" + + +def find_runs(workspace: Path) -> list[dict]: + """Recursively find directories that contain an outputs/ subdirectory.""" + runs: list[dict] = [] + _find_runs_recursive(workspace, workspace, runs) + runs.sort(key=lambda r: (r.get("eval_id", float("inf")), r["id"])) + return runs + + +def _find_runs_recursive(root: Path, current: Path, runs: list[dict]) -> None: + if not current.is_dir(): + return + + outputs_dir = current / "outputs" + if outputs_dir.is_dir(): + run = build_run(root, current) + if run: + runs.append(run) + return + + skip = {"node_modules", ".git", "__pycache__", "skill", "inputs"} + for child in sorted(current.iterdir()): + if child.is_dir() and child.name not in skip: + _find_runs_recursive(root, child, runs) + + +def build_run(root: Path, run_dir: Path) -> dict | None: + """Build a run dict with prompt, outputs, and grading data.""" + prompt = "" + eval_id = None + + # Try eval_metadata.json + for candidate in [run_dir / "eval_metadata.json", run_dir.parent / "eval_metadata.json"]: + if candidate.exists(): + try: + metadata = json.loads(candidate.read_text()) + prompt = metadata.get("prompt", "") + eval_id = metadata.get("eval_id") + except (json.JSONDecodeError, OSError): + pass + if 
prompt: + break + + # Fall back to transcript.md + if not prompt: + for candidate in [run_dir / "transcript.md", run_dir / "outputs" / "transcript.md"]: + if candidate.exists(): + try: + text = candidate.read_text() + match = re.search(r"## Eval Prompt\n\n([\s\S]*?)(?=\n##|$)", text) + if match: + prompt = match.group(1).strip() + except OSError: + pass + if prompt: + break + + if not prompt: + prompt = "(No prompt found)" + + run_id = str(run_dir.relative_to(root)).replace("/", "-").replace("\\", "-") + + # Collect output files + outputs_dir = run_dir / "outputs" + output_files: list[dict] = [] + if outputs_dir.is_dir(): + for f in sorted(outputs_dir.iterdir()): + if f.is_file() and f.name not in METADATA_FILES: + output_files.append(embed_file(f)) + + # Load grading if present + grading = None + for candidate in [run_dir / "grading.json", run_dir.parent / "grading.json"]: + if candidate.exists(): + try: + grading = json.loads(candidate.read_text()) + except (json.JSONDecodeError, OSError): + pass + if grading: + break + + return { + "id": run_id, + "prompt": prompt, + "eval_id": eval_id, + "outputs": output_files, + "grading": grading, + } + + +def embed_file(path: Path) -> dict: + """Read a file and return an embedded representation.""" + ext = path.suffix.lower() + mime = get_mime_type(path) + + if ext in TEXT_EXTENSIONS: + try: + content = path.read_text(errors="replace") + except OSError: + content = "(Error reading file)" + return { + "name": path.name, + "type": "text", + "content": content, + } + elif ext in IMAGE_EXTENSIONS: + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "image", + "mime": mime, + "data_uri": f"data:{mime};base64,{b64}", + } + elif ext == ".pdf": + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "pdf", + "data_uri": f"data:{mime};base64,{b64}", + } + elif ext == ".xlsx": + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "xlsx", + "data_b64": b64, + } + else: + # Binary / unknown — base64 download link + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "binary", + "mime": mime, + "data_uri": f"data:{mime};base64,{b64}", + } + + +def load_previous_iteration(workspace: Path) -> dict[str, dict]: + """Load previous iteration's feedback and outputs. + + Returns a map of run_id -> {"feedback": str, "outputs": list[dict]}. 
+ """ + result: dict[str, dict] = {} + + # Load feedback + feedback_map: dict[str, str] = {} + feedback_path = workspace / "feedback.json" + if feedback_path.exists(): + try: + data = json.loads(feedback_path.read_text()) + feedback_map = { + r["run_id"]: r["feedback"] + for r in data.get("reviews", []) + if r.get("feedback", "").strip() + } + except (json.JSONDecodeError, OSError, KeyError): + pass + + # Load runs (to get outputs) + prev_runs = find_runs(workspace) + for run in prev_runs: + result[run["id"]] = { + "feedback": feedback_map.get(run["id"], ""), + "outputs": run.get("outputs", []), + } + + # Also add feedback for run_ids that had feedback but no matching run + for run_id, fb in feedback_map.items(): + if run_id not in result: + result[run_id] = {"feedback": fb, "outputs": []} + + return result + + +def generate_html( + runs: list[dict], + skill_name: str, + previous: dict[str, dict] | None = None, + benchmark: dict | None = None, +) -> str: + """Generate the complete standalone HTML page with embedded data.""" + template_path = Path(__file__).parent / "viewer.html" + template = template_path.read_text() + + # Build previous_feedback and previous_outputs maps for the template + previous_feedback: dict[str, str] = {} + previous_outputs: dict[str, list[dict]] = {} + if previous: + for run_id, data in previous.items(): + if data.get("feedback"): + previous_feedback[run_id] = data["feedback"] + if data.get("outputs"): + previous_outputs[run_id] = data["outputs"] + + embedded = { + "skill_name": skill_name, + "runs": runs, + "previous_feedback": previous_feedback, + "previous_outputs": previous_outputs, + } + if benchmark: + embedded["benchmark"] = benchmark + + data_json = json.dumps(embedded) + + return template.replace("/*__EMBEDDED_DATA__*/", f"const EMBEDDED_DATA = {data_json};") + + +# --------------------------------------------------------------------------- +# HTTP server (stdlib only, zero dependencies) +# --------------------------------------------------------------------------- + +def _kill_port(port: int) -> None: + """Kill any process listening on the given port.""" + try: + result = subprocess.run( + ["lsof", "-ti", f":{port}"], + capture_output=True, text=True, timeout=5, + ) + for pid_str in result.stdout.strip().split("\n"): + if pid_str.strip(): + try: + os.kill(int(pid_str.strip()), signal.SIGTERM) + except (ProcessLookupError, ValueError): + pass + if result.stdout.strip(): + time.sleep(0.5) + except subprocess.TimeoutExpired: + pass + except FileNotFoundError: + logger.warning("Note: lsof not found, cannot check if port is in use") + +class ReviewHandler(BaseHTTPRequestHandler): + """Serves the review HTML and handles feedback saves. + + Regenerates the HTML on each page load so that refreshing the browser + picks up new eval outputs without restarting the server. 
+ """ + + def __init__( + self, + workspace: Path, + skill_name: str, + feedback_path: Path, + previous: dict[str, dict], + benchmark_path: Path | None, + *args, + **kwargs, + ): + self.workspace = workspace + self.skill_name = skill_name + self.feedback_path = feedback_path + self.previous = previous + self.benchmark_path = benchmark_path + super().__init__(*args, **kwargs) + + def do_GET(self) -> None: + if self.path == "/" or self.path == "/index.html": + # Regenerate HTML on each request (re-scans workspace for new outputs) + runs = find_runs(self.workspace) + benchmark = None + if self.benchmark_path and self.benchmark_path.exists(): + try: + benchmark = json.loads(self.benchmark_path.read_text()) + except (json.JSONDecodeError, OSError): + pass + html = generate_html(runs, self.skill_name, self.previous, benchmark) + content = html.encode("utf-8") + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(content))) + self.end_headers() + self.wfile.write(content) + elif self.path == "/api/feedback": + data = b"{}" + if self.feedback_path.exists(): + data = self.feedback_path.read_bytes() + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(data))) + self.end_headers() + self.wfile.write(data) + else: + self.send_error(404) + + def do_POST(self) -> None: + if self.path == "/api/feedback": + length = int(self.headers.get("Content-Length", 0)) + body = self.rfile.read(length) + try: + data = json.loads(body) + if not isinstance(data, dict) or "reviews" not in data: + raise ValueError("Expected JSON object with 'reviews' key") + self.feedback_path.write_text(json.dumps(data, indent=2) + "\n") + resp = b'{"ok":true}' + self.send_response(200) + except (json.JSONDecodeError, OSError, ValueError) as e: + resp = json.dumps({"error": str(e)}).encode() + self.send_response(500) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(resp))) + self.end_headers() + self.wfile.write(resp) + else: + self.send_error(404) + + def log_message(self, format: str, *args: object) -> None: + # Suppress request logging to keep terminal clean + pass + + +def main() -> None: + parser = argparse.ArgumentParser(description="Generate and serve eval review") + parser.add_argument("workspace", type=Path, help="Path to workspace directory") + parser.add_argument("--port", "-p", type=int, default=3117, help="Server port (default: 3117)") + parser.add_argument("--skill-name", "-n", type=str, default=None, help="Skill name for header") + parser.add_argument( + "--previous-workspace", type=Path, default=None, + help="Path to previous iteration's workspace (shows old outputs and feedback as context)", + ) + parser.add_argument( + "--benchmark", type=Path, default=None, + help="Path to benchmark.json to show in the Benchmark tab", + ) + parser.add_argument( + "--static", "-s", type=Path, default=None, + help="Write standalone HTML to this path instead of starting a server", + ) + args = parser.parse_args() + + workspace = args.workspace.resolve() + if not workspace.is_dir(): + logger.error(f"Error: {workspace} is not a directory") + sys.exit(1) + + runs = find_runs(workspace) + if not runs: + logger.error(f"No runs found in {workspace}") + sys.exit(1) + + skill_name = args.skill_name or workspace.name.replace("-workspace", "") + feedback_path = workspace / "feedback.json" + + previous: dict[str, dict] = {} + if 
args.previous_workspace: + previous = load_previous_iteration(args.previous_workspace.resolve()) + + benchmark_path = args.benchmark.resolve() if args.benchmark else None + benchmark = None + if benchmark_path and benchmark_path.exists(): + try: + benchmark = json.loads(benchmark_path.read_text()) + except (json.JSONDecodeError, OSError): + pass + + if args.static: + html = generate_html(runs, skill_name, previous, benchmark) + args.static.parent.mkdir(parents=True, exist_ok=True) + args.static.write_text(html) + logger.info(f"\n Static viewer written to: {args.static}\n") + sys.exit(0) + + # Kill any existing process on the target port + port = args.port + _kill_port(port) + handler = partial(ReviewHandler, workspace, skill_name, feedback_path, previous, benchmark_path) + try: + server = HTTPServer(("127.0.0.1", port), handler) + except OSError: + # Port still in use after kill attempt — find a free one + server = HTTPServer(("127.0.0.1", 0), handler) + port = server.server_address[1] + + url = f"http://localhost:{port}" + logger.info(f"\n Eval Viewer") + logger.info(f" ─────────────────────────────────") + logger.info(f" URL: {url}") + logger.info(f" Workspace: {workspace}") + logger.info(f" Feedback: {feedback_path}") + if previous: + logger.info(f" Previous: {args.previous_workspace} ({len(previous)} runs)") + if benchmark_path: + logger.info(f" Benchmark: {benchmark_path}") + logger.info(f"\n Press Ctrl+C to stop.\n") + + webbrowser.open(url) + + try: + server.serve_forever() + except KeyboardInterrupt: + logger.info("\nStopped.") + server.server_close() + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/eval-viewer/viewer.html b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/eval-viewer/viewer.html new file mode 100644 index 000000000..6d8e96348 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/eval-viewer/viewer.html @@ -0,0 +1,1325 @@ + + + + + + Eval Review + + + + + + + +
+
+
+

Eval Review:

+
Review each output and leave feedback below. Navigate with arrow keys or buttons. When done, copy feedback and paste into Claude Code.
+
+
+
+ + + + + +
+
+ +
+
Prompt
+
+
+
+
+ + +
+
Output
+
+
No output files found
+
+
+ + + + + + + + +
+
Your Feedback
+
+ + + +
+
+
+ + +
+ + +
+
+
No benchmark data available. Run a benchmark to see quantitative results here.
+
+
+
+ + +
+
+

Review Complete

+

Your feedback has been saved. Go back to your Claude Code session and tell Claude you're done reviewing.

+
+ +
+
+
+ + +
+ + + + diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/references/schemas.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/references/schemas.md new file mode 100644 index 000000000..b6eeaa2d4 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/references/schemas.md @@ -0,0 +1,430 @@ +# JSON Schemas + +This document defines the JSON schemas used by skill-creator. + +--- + +## evals.json + +Defines the evals for a skill. Located at `evals/evals.json` within the skill directory. + +```json +{ + "skill_name": "example-skill", + "evals": [ + { + "id": 1, + "prompt": "User's example prompt", + "expected_output": "Description of expected result", + "files": ["evals/files/sample1.pdf"], + "expectations": [ + "The output includes X", + "The skill used script Y" + ] + } + ] +} +``` + +**Fields:** +- `skill_name`: Name matching the skill's frontmatter +- `evals[].id`: Unique integer identifier +- `evals[].prompt`: The task to execute +- `evals[].expected_output`: Human-readable description of success +- `evals[].files`: Optional list of input file paths (relative to skill root) +- `evals[].expectations`: List of verifiable statements + +--- + +## history.json + +Tracks version progression in Improve mode. Located at workspace root. + +```json +{ + "started_at": "2026-01-15T10:30:00Z", + "skill_name": "pdf", + "current_best": "v2", + "iterations": [ + { + "version": "v0", + "parent": null, + "expectation_pass_rate": 0.65, + "grading_result": "baseline", + "is_current_best": false + }, + { + "version": "v1", + "parent": "v0", + "expectation_pass_rate": 0.75, + "grading_result": "won", + "is_current_best": false + }, + { + "version": "v2", + "parent": "v1", + "expectation_pass_rate": 0.85, + "grading_result": "won", + "is_current_best": true + } + ] +} +``` + +**Fields:** +- `started_at`: ISO timestamp of when improvement started +- `skill_name`: Name of the skill being improved +- `current_best`: Version identifier of the best performer +- `iterations[].version`: Version identifier (v0, v1, ...) +- `iterations[].parent`: Parent version this was derived from +- `iterations[].expectation_pass_rate`: Pass rate from grading +- `iterations[].grading_result`: "baseline", "won", "lost", or "tie" +- `iterations[].is_current_best`: Whether this is the current best version + +--- + +## grading.json + +Output from the grader agent. Located at `/grading.json`. + +```json +{ + "expectations": [ + { + "text": "The output includes the name 'John Smith'", + "passed": true, + "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'" + }, + { + "text": "The spreadsheet has a SUM formula in cell B10", + "passed": false, + "evidence": "No spreadsheet was created. The output was a text file." 
+ } + ], + "summary": { + "passed": 2, + "failed": 1, + "total": 3, + "pass_rate": 0.67 + }, + "execution_metrics": { + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8 + }, + "total_tool_calls": 15, + "total_steps": 6, + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 + }, + "timing": { + "executor_duration_seconds": 165.0, + "grader_duration_seconds": 26.0, + "total_duration_seconds": 191.0 + }, + "claims": [ + { + "claim": "The form has 12 fillable fields", + "type": "factual", + "verified": true, + "evidence": "Counted 12 fields in field_info.json" + } + ], + "user_notes_summary": { + "uncertainties": ["Used 2023 data, may be stale"], + "needs_review": [], + "workarounds": ["Fell back to text overlay for non-fillable fields"] + }, + "eval_feedback": { + "suggestions": [ + { + "assertion": "The output includes the name 'John Smith'", + "reason": "A hallucinated document that mentions the name would also pass" + } + ], + "overall": "Assertions check presence but not correctness." + } +} +``` + +**Fields:** +- `expectations[]`: Graded expectations with evidence +- `summary`: Aggregate pass/fail counts +- `execution_metrics`: Tool usage and output size (from executor's metrics.json) +- `timing`: Wall clock timing (from timing.json) +- `claims`: Extracted and verified claims from the output +- `user_notes_summary`: Issues flagged by the executor +- `eval_feedback`: (optional) Improvement suggestions for the evals, only present when the grader identifies issues worth raising + +--- + +## metrics.json + +Output from the executor agent. Located at `/outputs/metrics.json`. + +```json +{ + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8, + "Edit": 1, + "Glob": 2, + "Grep": 0 + }, + "total_tool_calls": 18, + "total_steps": 6, + "files_created": ["filled_form.pdf", "field_values.json"], + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 +} +``` + +**Fields:** +- `tool_calls`: Count per tool type +- `total_tool_calls`: Sum of all tool calls +- `total_steps`: Number of major execution steps +- `files_created`: List of output files created +- `errors_encountered`: Number of errors during execution +- `output_chars`: Total character count of output files +- `transcript_chars`: Character count of transcript + +--- + +## timing.json + +Wall clock timing for a run. Located at `/timing.json`. + +**How to capture:** When a subagent task completes, the task notification includes `total_tokens` and `duration_ms`. Save these immediately — they are not persisted anywhere else and cannot be recovered after the fact. + +```json +{ + "total_tokens": 84852, + "duration_ms": 23332, + "total_duration_seconds": 23.3, + "executor_start": "2026-01-15T10:30:00Z", + "executor_end": "2026-01-15T10:32:45Z", + "executor_duration_seconds": 165.0, + "grader_start": "2026-01-15T10:32:46Z", + "grader_end": "2026-01-15T10:33:12Z", + "grader_duration_seconds": 26.0 +} +``` + +--- + +## benchmark.json + +Output from Benchmark mode. Located at `benchmarks//benchmark.json`. 
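+
+The viewer reads the field names below exactly (see the note after the example), so a quick structural check of a hand-assembled file can save a round trip. A minimal sketch; the required-key sets mirror the fields documented in this section:
+
+```python
+import json
+import sys
+
+REQUIRED_RUN_KEYS = {"eval_id", "configuration", "run_number", "result"}
+REQUIRED_RESULT_KEYS = {"pass_rate", "passed", "total", "time_seconds", "tokens", "errors"}
+
+def check_benchmark(path: str) -> list[str]:
+    """Return a list of schema problems found in a benchmark.json file."""
+    problems = []
+    data = json.loads(open(path).read())
+    for i, run in enumerate(data.get("runs", [])):
+        missing = REQUIRED_RUN_KEYS - run.keys()
+        if missing:
+            problems.append(f"runs[{i}] missing keys: {sorted(missing)}")
+        if run.get("configuration") not in ("with_skill", "without_skill"):
+            problems.append(f"runs[{i}] unexpected configuration: {run.get('configuration')!r}")
+        missing_result = REQUIRED_RESULT_KEYS - run.get("result", {}).keys()
+        if missing_result:
+            problems.append(f"runs[{i}].result missing keys: {sorted(missing_result)}")
+    return problems
+
+if __name__ == "__main__":
+    issues = check_benchmark(sys.argv[1])
+    print("\n".join(issues) or "benchmark.json looks structurally valid")
+```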
+ +```json +{ + "metadata": { + "skill_name": "pdf", + "skill_path": "/path/to/pdf", + "executor_model": "claude-sonnet-4-20250514", + "analyzer_model": "most-capable-model", + "timestamp": "2026-01-15T10:30:00Z", + "evals_run": [1, 2, 3], + "runs_per_configuration": 3 + }, + + "runs": [ + { + "eval_id": 1, + "eval_name": "Ocean", + "configuration": "with_skill", + "run_number": 1, + "result": { + "pass_rate": 0.85, + "passed": 6, + "failed": 1, + "total": 7, + "time_seconds": 42.5, + "tokens": 3800, + "tool_calls": 18, + "errors": 0 + }, + "expectations": [ + {"text": "...", "passed": true, "evidence": "..."} + ], + "notes": [ + "Used 2023 data, may be stale", + "Fell back to text overlay for non-fillable fields" + ] + } + ], + + "run_summary": { + "with_skill": { + "pass_rate": {"mean": 0.85, "stddev": 0.05, "min": 0.80, "max": 0.90}, + "time_seconds": {"mean": 45.0, "stddev": 12.0, "min": 32.0, "max": 58.0}, + "tokens": {"mean": 3800, "stddev": 400, "min": 3200, "max": 4100} + }, + "without_skill": { + "pass_rate": {"mean": 0.35, "stddev": 0.08, "min": 0.28, "max": 0.45}, + "time_seconds": {"mean": 32.0, "stddev": 8.0, "min": 24.0, "max": 42.0}, + "tokens": {"mean": 2100, "stddev": 300, "min": 1800, "max": 2500} + }, + "delta": { + "pass_rate": "+0.50", + "time_seconds": "+13.0", + "tokens": "+1700" + } + }, + + "notes": [ + "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value", + "Eval 3 shows high variance (50% ± 40%) - may be flaky or model-dependent", + "Without-skill runs consistently fail on table extraction expectations", + "Skill adds 13s average execution time but improves pass rate by 50%" + ] +} +``` + +**Fields:** +- `metadata`: Information about the benchmark run + - `skill_name`: Name of the skill + - `timestamp`: When the benchmark was run + - `evals_run`: List of eval names or IDs + - `runs_per_configuration`: Number of runs per config (e.g. 3) +- `runs[]`: Individual run results + - `eval_id`: Numeric eval identifier + - `eval_name`: Human-readable eval name (used as section header in the viewer) + - `configuration`: Must be `"with_skill"` or `"without_skill"` (the viewer uses this exact string for grouping and color coding) + - `run_number`: Integer run number (1, 2, 3...) + - `result`: Nested object with `pass_rate`, `passed`, `total`, `time_seconds`, `tokens`, `errors` +- `run_summary`: Statistical aggregates per configuration + - `with_skill` / `without_skill`: Each contains `pass_rate`, `time_seconds`, `tokens` objects with `mean` and `stddev` fields + - `delta`: Difference strings like `"+0.50"`, `"+13.0"`, `"+1700"` +- `notes`: Freeform observations from the analyzer + +**Important:** The viewer reads these field names exactly. Using `config` instead of `configuration`, or putting `pass_rate` at the top level of a run instead of nested under `result`, will cause the viewer to show empty/zero values. Always reference this schema when generating benchmark.json manually. + +--- + +## comparison.json + +Output from blind comparator. Located at `/comparison-N.json`. + +```json +{ + "winner": "A", + "reasoning": "Output A provides a complete solution with proper formatting and all required fields. 
Output B is missing the date field and has formatting inconsistencies.", + "rubric": { + "A": { + "content": { + "correctness": 5, + "completeness": 5, + "accuracy": 4 + }, + "structure": { + "organization": 4, + "formatting": 5, + "usability": 4 + }, + "content_score": 4.7, + "structure_score": 4.3, + "overall_score": 9.0 + }, + "B": { + "content": { + "correctness": 3, + "completeness": 2, + "accuracy": 3 + }, + "structure": { + "organization": 3, + "formatting": 2, + "usability": 3 + }, + "content_score": 2.7, + "structure_score": 2.7, + "overall_score": 5.4 + } + }, + "output_quality": { + "A": { + "score": 9, + "strengths": ["Complete solution", "Well-formatted", "All fields present"], + "weaknesses": ["Minor style inconsistency in header"] + }, + "B": { + "score": 5, + "strengths": ["Readable output", "Correct basic structure"], + "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"] + } + }, + "expectation_results": { + "A": { + "passed": 4, + "total": 5, + "pass_rate": 0.80, + "details": [ + {"text": "Output includes name", "passed": true} + ] + }, + "B": { + "passed": 3, + "total": 5, + "pass_rate": 0.60, + "details": [ + {"text": "Output includes name", "passed": true} + ] + } + } +} +``` + +--- + +## analysis.json + +Output from post-hoc analyzer. Located at `/analysis.json`. + +```json +{ + "comparison_summary": { + "winner": "A", + "winner_skill": "path/to/winner/skill", + "loser_skill": "path/to/loser/skill", + "comparator_reasoning": "Brief summary of why comparator chose winner" + }, + "winner_strengths": [ + "Clear step-by-step instructions for handling multi-page documents", + "Included validation script that caught formatting errors" + ], + "loser_weaknesses": [ + "Vague instruction 'process the document appropriately' led to inconsistent behavior", + "No script for validation, agent had to improvise" + ], + "instruction_following": { + "winner": { + "score": 9, + "issues": ["Minor: skipped optional logging step"] + }, + "loser": { + "score": 6, + "issues": [ + "Did not use the skill's formatting template", + "Invented own approach instead of following step 3" + ] + } + }, + "improvement_suggestions": [ + { + "priority": "high", + "category": "instructions", + "suggestion": "Replace 'process the document appropriately' with explicit steps", + "expected_impact": "Would eliminate ambiguity that caused inconsistent behavior" + } + ], + "transcript_insights": { + "winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script", + "loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods" + } +} +``` diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/__init__.py b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/aggregate_benchmark.py b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/aggregate_benchmark.py new file mode 100644 index 000000000..ccc810819 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/aggregate_benchmark.py @@ -0,0 +1,403 @@ +#!/usr/bin/env python3 +""" +Aggregate individual run results into benchmark summary statistics. 
+ +Reads grading.json files from run directories and produces: +- run_summary with mean, stddev, min, max for each metric +- delta between with_skill and without_skill configurations + +Usage: + python aggregate_benchmark.py + +Example: + python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/ + +The script supports two directory layouts: + + Workspace layout (from skill-creator iterations): + / + └── eval-N/ + ├── with_skill/ + │ ├── run-1/grading.json + │ └── run-2/grading.json + └── without_skill/ + ├── run-1/grading.json + └── run-2/grading.json + + Legacy layout (with runs/ subdirectory): + / + └── runs/ + └── eval-N/ + ├── with_skill/ + │ └── run-1/grading.json + └── without_skill/ + └── run-1/grading.json +""" + +import argparse +import json +import math +import sys +from datetime import datetime, timezone +from pathlib import Path + +from loguru import logger + + +def calculate_stats(values: list[float]) -> dict: + """Calculate mean, stddev, min, max for a list of values.""" + if not values: + return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0} + + n = len(values) + mean = sum(values) / n + + if n > 1: + variance = sum((x - mean) ** 2 for x in values) / (n - 1) + stddev = math.sqrt(variance) + else: + stddev = 0.0 + + return { + "mean": round(mean, 4), + "stddev": round(stddev, 4), + "min": round(min(values), 4), + "max": round(max(values), 4) + } + + +def load_run_results(benchmark_dir: Path) -> dict: + """ + Load all run results from a benchmark directory. + + Returns dict keyed by config name (e.g. "with_skill"/"without_skill", + or "new_skill"/"old_skill"), each containing a list of run results. + """ + # Support both layouts: eval dirs directly under benchmark_dir, or under runs/ + runs_dir = benchmark_dir / "runs" + if runs_dir.exists(): + search_dir = runs_dir + elif list(benchmark_dir.glob("eval-*")): + search_dir = benchmark_dir + else: + logger.warning(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}") + return {} + + results: dict[str, list] = {} + + for eval_idx, eval_dir in enumerate(sorted(search_dir.glob("eval-*"))): + metadata_path = eval_dir / "eval_metadata.json" + if metadata_path.exists(): + try: + with open(metadata_path) as mf: + eval_id = json.load(mf).get("eval_id", eval_idx) + except (json.JSONDecodeError, OSError): + eval_id = eval_idx + else: + try: + eval_id = int(eval_dir.name.split("-")[1]) + except ValueError: + eval_id = eval_idx + + # Discover config directories dynamically rather than hardcoding names + for config_dir in sorted(eval_dir.iterdir()): + if not config_dir.is_dir(): + continue + # Skip non-config directories (inputs, outputs, etc.) 
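+            # (A directory counts as a configuration only if it contains at
+            # least one run-* subdirectory, e.g. with_skill/run-1/.)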
+ if not list(config_dir.glob("run-*")): + continue + config = config_dir.name + if config not in results: + results[config] = [] + + for run_dir in sorted(config_dir.glob("run-*")): + run_number = int(run_dir.name.split("-")[1]) + grading_file = run_dir / "grading.json" + + if not grading_file.exists(): + logger.warning(f"Warning: grading.json not found in {run_dir}") + continue + + try: + with open(grading_file) as f: + grading = json.load(f) + except json.JSONDecodeError as e: + logger.warning(f"Warning: Invalid JSON in {grading_file}: {e}") + continue + + # Extract metrics + result = { + "eval_id": eval_id, + "run_number": run_number, + "pass_rate": grading.get("summary", {}).get("pass_rate", 0.0), + "passed": grading.get("summary", {}).get("passed", 0), + "failed": grading.get("summary", {}).get("failed", 0), + "total": grading.get("summary", {}).get("total", 0), + } + + # Extract timing — check grading.json first, then sibling timing.json + timing = grading.get("timing", {}) + result["time_seconds"] = timing.get("total_duration_seconds", 0.0) + timing_file = run_dir / "timing.json" + if result["time_seconds"] == 0.0 and timing_file.exists(): + try: + with open(timing_file) as tf: + timing_data = json.load(tf) + result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0) + result["tokens"] = timing_data.get("total_tokens", 0) + except json.JSONDecodeError: + pass + + # Extract metrics if available + metrics = grading.get("execution_metrics", {}) + result["tool_calls"] = metrics.get("total_tool_calls", 0) + if not result.get("tokens"): + result["tokens"] = metrics.get("output_chars", 0) + result["errors"] = metrics.get("errors_encountered", 0) + + # Extract expectations — viewer requires fields: text, passed, evidence + raw_expectations = grading.get("expectations", []) + for exp in raw_expectations: + if "text" not in exp or "passed" not in exp: + logger.warning(f"Warning: expectation in {grading_file} missing required fields (text, passed, evidence): {exp}") + result["expectations"] = raw_expectations + + # Extract notes from user_notes_summary + notes_summary = grading.get("user_notes_summary", {}) + notes = [] + notes.extend(notes_summary.get("uncertainties", [])) + notes.extend(notes_summary.get("needs_review", [])) + notes.extend(notes_summary.get("workarounds", [])) + result["notes"] = notes + + results[config].append(result) + + return results + + +def aggregate_results(results: dict) -> dict: + """ + Aggregate run results into summary statistics. + + Returns run_summary with stats for each configuration and delta. 
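+
+    Example shape (numbers illustrative):
+        {"with_skill":    {"pass_rate": {"mean": 0.85, "stddev": 0.05, ...}, ...},
+         "without_skill": {...},
+         "delta": {"pass_rate": "+0.50", "time_seconds": "+13.0", "tokens": "+1700"}}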
+ """ + run_summary = {} + configs = list(results.keys()) + + for config in configs: + runs = results.get(config, []) + + if not runs: + run_summary[config] = { + "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}, + "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}, + "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0} + } + continue + + pass_rates = [r["pass_rate"] for r in runs] + times = [r["time_seconds"] for r in runs] + tokens = [r.get("tokens", 0) for r in runs] + + run_summary[config] = { + "pass_rate": calculate_stats(pass_rates), + "time_seconds": calculate_stats(times), + "tokens": calculate_stats(tokens) + } + + # Calculate delta between the first two configs (if two exist) + if len(configs) >= 2: + primary = run_summary.get(configs[0], {}) + baseline = run_summary.get(configs[1], {}) + else: + primary = run_summary.get(configs[0], {}) if configs else {} + baseline = {} + + delta_pass_rate = primary.get("pass_rate", {}).get("mean", 0) - baseline.get("pass_rate", {}).get("mean", 0) + delta_time = primary.get("time_seconds", {}).get("mean", 0) - baseline.get("time_seconds", {}).get("mean", 0) + delta_tokens = primary.get("tokens", {}).get("mean", 0) - baseline.get("tokens", {}).get("mean", 0) + + run_summary["delta"] = { + "pass_rate": f"{delta_pass_rate:+.2f}", + "time_seconds": f"{delta_time:+.1f}", + "tokens": f"{delta_tokens:+.0f}" + } + + return run_summary + + +def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict: + """ + Generate complete benchmark.json from run results. + """ + results = load_run_results(benchmark_dir) + run_summary = aggregate_results(results) + + # Build runs array for benchmark.json + runs = [] + for config in results: + for result in results[config]: + runs.append({ + "eval_id": result["eval_id"], + "configuration": config, + "run_number": result["run_number"], + "result": { + "pass_rate": result["pass_rate"], + "passed": result["passed"], + "failed": result["failed"], + "total": result["total"], + "time_seconds": result["time_seconds"], + "tokens": result.get("tokens", 0), + "tool_calls": result.get("tool_calls", 0), + "errors": result.get("errors", 0) + }, + "expectations": result["expectations"], + "notes": result["notes"] + }) + + # Determine eval IDs from results + eval_ids = sorted(set( + r["eval_id"] + for config in results.values() + for r in config + )) + + benchmark = { + "metadata": { + "skill_name": skill_name or "", + "skill_path": skill_path or "", + "executor_model": "", + "analyzer_model": "", + "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + "evals_run": eval_ids, + "runs_per_configuration": 3 + }, + "runs": runs, + "run_summary": run_summary, + "notes": [] # To be filled by analyzer + } + + return benchmark + + +def generate_markdown(benchmark: dict) -> str: + """Generate human-readable benchmark.md from benchmark data.""" + metadata = benchmark["metadata"] + run_summary = benchmark["run_summary"] + + # Determine config names (excluding "delta") + configs = [k for k in run_summary if k != "delta"] + config_a = configs[0] if len(configs) >= 1 else "config_a" + config_b = configs[1] if len(configs) >= 2 else "config_b" + label_a = config_a.replace("_", " ").title() + label_b = config_b.replace("_", " ").title() + + lines = [ + f"# Skill Benchmark: {metadata['skill_name']}", + "", + f"**Model**: {metadata['executor_model']}", + f"**Date**: {metadata['timestamp']}", + f"**Evals**: {', '.join(map(str, metadata['evals_run']))} 
({metadata['runs_per_configuration']} runs each per configuration)", + "", + "## Summary", + "", + f"| Metric | {label_a} | {label_b} | Delta |", + "|--------|------------|---------------|-------|", + ] + + a_summary = run_summary.get(config_a, {}) + b_summary = run_summary.get(config_b, {}) + delta = run_summary.get("delta", {}) + + # Format pass rate + a_pr = a_summary.get("pass_rate", {}) + b_pr = b_summary.get("pass_rate", {}) + lines.append(f"| Pass Rate | {a_pr.get('mean', 0)*100:.0f}% ± {a_pr.get('stddev', 0)*100:.0f}% | {b_pr.get('mean', 0)*100:.0f}% ± {b_pr.get('stddev', 0)*100:.0f}% | {delta.get('pass_rate', '—')} |") + + # Format time + a_time = a_summary.get("time_seconds", {}) + b_time = b_summary.get("time_seconds", {}) + lines.append(f"| Time | {a_time.get('mean', 0):.1f}s ± {a_time.get('stddev', 0):.1f}s | {b_time.get('mean', 0):.1f}s ± {b_time.get('stddev', 0):.1f}s | {delta.get('time_seconds', '—')}s |") + + # Format tokens + a_tokens = a_summary.get("tokens", {}) + b_tokens = b_summary.get("tokens", {}) + lines.append(f"| Tokens | {a_tokens.get('mean', 0):.0f} ± {a_tokens.get('stddev', 0):.0f} | {b_tokens.get('mean', 0):.0f} ± {b_tokens.get('stddev', 0):.0f} | {delta.get('tokens', '—')} |") + + # Notes section + if benchmark.get("notes"): + lines.extend([ + "", + "## Notes", + "" + ]) + for note in benchmark["notes"]: + lines.append(f"- {note}") + + return "\n".join(lines) + + +def main(): + parser = argparse.ArgumentParser( + description="Aggregate benchmark run results into summary statistics" + ) + parser.add_argument( + "benchmark_dir", + type=Path, + help="Path to the benchmark directory" + ) + parser.add_argument( + "--skill-name", + default="", + help="Name of the skill being benchmarked" + ) + parser.add_argument( + "--skill-path", + default="", + help="Path to the skill being benchmarked" + ) + parser.add_argument( + "--output", "-o", + type=Path, + help="Output path for benchmark.json (default: /benchmark.json)" + ) + + args = parser.parse_args() + + if not args.benchmark_dir.exists(): + logger.error(f"Directory not found: {args.benchmark_dir}") + sys.exit(1) + + # Generate benchmark + benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path) + + # Determine output paths + output_json = args.output or (args.benchmark_dir / "benchmark.json") + output_md = output_json.with_suffix(".md") + + # Write benchmark.json + with open(output_json, "w") as f: + json.dump(benchmark, f, indent=2) + logger.info(f"Generated: {output_json}") + + # Write benchmark.md + markdown = generate_markdown(benchmark) + with open(output_md, "w") as f: + f.write(markdown) + logger.info(f"Generated: {output_md}") + + # Print summary + run_summary = benchmark["run_summary"] + configs = [k for k in run_summary if k != "delta"] + delta = run_summary.get("delta", {}) + + logger.info(f"\nSummary:") + for config in configs: + pr = run_summary[config]["pass_rate"]["mean"] + label = config.replace("_", " ").title() + logger.info(f" {label}: {pr*100:.1f}% pass rate") + logger.info(f" Delta: {delta.get('pass_rate', '—')}") + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/generate_report.py b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/generate_report.py new file mode 100644 index 000000000..395232d96 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/generate_report.py @@ -0,0 +1,328 @@ 
+#!/usr/bin/env python3 +"""Generate an HTML report from run_loop.py output. + +Takes the JSON output from run_loop.py and generates a visual HTML report +showing each description attempt with check/x for each test case. +Distinguishes between train and test queries. +""" + +import argparse +import html +import json +import sys +from pathlib import Path + +from loguru import logger + + +def generate_html(data: dict, auto_refresh: bool = False, skill_name: str = "") -> str: + """Generate HTML report from loop output data. If auto_refresh is True, adds a meta refresh tag.""" + history = data.get("history", []) + holdout = data.get("holdout", 0) + title_prefix = html.escape(skill_name + " \u2014 ") if skill_name else "" + + # Get all unique queries from train and test sets, with should_trigger info + train_queries: list[dict] = [] + test_queries: list[dict] = [] + if history: + for r in history[0].get("train_results", history[0].get("results", [])): + train_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)}) + if history[0].get("test_results"): + for r in history[0].get("test_results", []): + test_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)}) + + refresh_tag = ' \n' if auto_refresh else "" + + html_parts = [""" + + + +""" + refresh_tag + """ """ + title_prefix + """Skill Description Optimization + + + + + + +

""" + title_prefix + """Skill Description Optimization

+
+ Optimizing your skill's description. This page updates automatically as Claude tests different versions of your skill's description. Each row is an iteration — a new description attempt. The columns show test queries: green checkmarks mean the skill triggered correctly (or correctly didn't trigger), red crosses mean it got it wrong. The "Train" score shows performance on queries used to improve the description; the "Test" score shows performance on held-out queries the optimizer hasn't seen. When it's done, Claude will apply the best-performing description to your skill. +
+"""] + + # Summary section + best_test_score = data.get('best_test_score') + best_train_score = data.get('best_train_score') + html_parts.append(f""" +
+

Original: {html.escape(data.get('original_description', 'N/A'))}

+

Best: {html.escape(data.get('best_description', 'N/A'))}

+

Best Score: {data.get('best_score', 'N/A')} {'(test)' if best_test_score else '(train)'}

+

Iterations: {data.get('iterations_run', 0)} | Train: {data.get('train_size', '?')} | Test: {data.get('test_size', '?')}

+
+""") + + # Legend + html_parts.append(""" +
+ Query columns: + Should trigger + Should NOT trigger + Train + Test +
+""") + + # Table header + html_parts.append(""" +
+ + + + + + + +""") + + # Add column headers for train queries + for qinfo in train_queries: + polarity = "positive-col" if qinfo["should_trigger"] else "negative-col" + html_parts.append(f' \n') + + # Add column headers for test queries (different color) + for qinfo in test_queries: + polarity = "positive-col" if qinfo["should_trigger"] else "negative-col" + html_parts.append(f' \n') + + html_parts.append(""" + + +""") + + # Find best iteration for highlighting + if test_queries: + best_iter = max(history, key=lambda h: h.get("test_passed") or 0).get("iteration") + else: + best_iter = max(history, key=lambda h: h.get("train_passed", h.get("passed", 0))).get("iteration") + + # Add rows for each iteration + for h in history: + iteration = h.get("iteration", "?") + train_passed = h.get("train_passed", h.get("passed", 0)) + train_total = h.get("train_total", h.get("total", 0)) + test_passed = h.get("test_passed") + test_total = h.get("test_total") + description = h.get("description", "") + train_results = h.get("train_results", h.get("results", [])) + test_results = h.get("test_results", []) + + # Create lookups for results by query + train_by_query = {r["query"]: r for r in train_results} + test_by_query = {r["query"]: r for r in test_results} if test_results else {} + + # Compute aggregate correct/total runs across all retries + def aggregate_runs(results: list[dict]) -> tuple[int, int]: + correct = 0 + total = 0 + for r in results: + runs = r.get("runs", 0) + triggers = r.get("triggers", 0) + total += runs + if r.get("should_trigger", True): + correct += triggers + else: + correct += runs - triggers + return correct, total + + train_correct, train_runs = aggregate_runs(train_results) + test_correct, test_runs = aggregate_runs(test_results) + + # Determine score classes + def score_class(correct: int, total: int) -> str: + if total > 0: + ratio = correct / total + if ratio >= 0.8: + return "score-good" + elif ratio >= 0.5: + return "score-ok" + return "score-bad" + + train_class = score_class(train_correct, train_runs) + test_class = score_class(test_correct, test_runs) + + row_class = "best-row" if iteration == best_iter else "" + + html_parts.append(f""" + + + + +""") + + # Add result for each train query + for qinfo in train_queries: + r = train_by_query.get(qinfo["query"], {}) + did_pass = r.get("pass", False) + triggers = r.get("triggers", 0) + runs = r.get("runs", 0) + + icon = "✓" if did_pass else "✗" + css_class = "pass" if did_pass else "fail" + + html_parts.append(f' \n') + + # Add result for each test query (with different background) + for qinfo in test_queries: + r = test_by_query.get(qinfo["query"], {}) + did_pass = r.get("pass", False) + triggers = r.get("triggers", 0) + runs = r.get("runs", 0) + + icon = "✓" if did_pass else "✗" + css_class = "pass" if did_pass else "fail" + + html_parts.append(f' \n') + + html_parts.append(" \n") + + html_parts.append(""" +
IterTrainTestDescription{html.escape(qinfo["query"])}{html.escape(qinfo["query"])}
{iteration}{train_correct}/{train_runs}{test_correct}/{test_runs}{html.escape(description)}{icon}{triggers}/{runs}{icon}{triggers}/{runs}
+
+""") + + html_parts.append(""" + + +""") + + return "".join(html_parts) + + +def main(): + parser = argparse.ArgumentParser(description="Generate HTML report from run_loop output") + parser.add_argument("input", help="Path to JSON output from run_loop.py (or - for stdin)") + parser.add_argument("-o", "--output", default=None, help="Output HTML file (default: stdout)") + parser.add_argument("--skill-name", default="", help="Skill name to include in the report title") + args = parser.parse_args() + + if args.input == "-": + data = json.load(sys.stdin) + else: + data = json.loads(Path(args.input).read_text()) + + html_output = generate_html(data, skill_name=args.skill_name) + + if args.output: + Path(args.output).write_text(html_output) + logger.info(f"Report written to {args.output}") + else: + print(html_output) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/improve_description.py b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/improve_description.py new file mode 100644 index 000000000..887a06a08 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/improve_description.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 +"""Improve a skill description based on eval results. + +Takes eval results (from run_eval.py) and generates an improved description +using Claude with extended thinking. +""" + +import argparse +import json +import re +import sys +from pathlib import Path + +import anthropic +from loguru import logger + +from scripts.utils import parse_skill_md + + +def improve_description( + client: anthropic.Anthropic, + skill_name: str, + skill_content: str, + current_description: str, + eval_results: dict, + history: list[dict], + model: str, + test_results: dict | None = None, + log_dir: Path | None = None, + iteration: int | None = None, +) -> str: + """Call Claude to improve the description based on eval results.""" + failed_triggers = [ + r for r in eval_results["results"] + if r["should_trigger"] and not r["pass"] + ] + false_triggers = [ + r for r in eval_results["results"] + if not r["should_trigger"] and not r["pass"] + ] + + # Build scores summary + train_score = f"{eval_results['summary']['passed']}/{eval_results['summary']['total']}" + if test_results: + test_score = f"{test_results['summary']['passed']}/{test_results['summary']['total']}" + scores_summary = f"Train: {train_score}, Test: {test_score}" + else: + scores_summary = f"Train: {train_score}" + + prompt = f"""You are optimizing a skill description for a Claude Code skill called "{skill_name}". A "skill" is sort of like a prompt, but with progressive disclosure -- there's a title and description that Claude sees when deciding whether to use the skill, and then if it does use the skill, it reads the .md file which has lots more details and potentially links to other resources in the skill folder like helper files and scripts and additional documentation or examples. + +The description appears in Claude's "available_skills" list. When a user sends a query, Claude decides whether to invoke the skill based solely on the title and on this description. Your goal is to write a description that triggers for relevant queries, and doesn't trigger for irrelevant ones. 
+ +Here's the current description: + +"{current_description}" + + +Current scores ({scores_summary}): + +""" + if failed_triggers: + prompt += "FAILED TO TRIGGER (should have triggered but didn't):\n" + for r in failed_triggers: + prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n' + prompt += "\n" + + if false_triggers: + prompt += "FALSE TRIGGERS (triggered but shouldn't have):\n" + for r in false_triggers: + prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n' + prompt += "\n" + + if history: + prompt += "PREVIOUS ATTEMPTS (do NOT repeat these — try something structurally different):\n\n" + for h in history: + train_s = f"{h.get('train_passed', h.get('passed', 0))}/{h.get('train_total', h.get('total', 0))}" + test_s = f"{h.get('test_passed', '?')}/{h.get('test_total', '?')}" if h.get('test_passed') is not None else None + score_str = f"train={train_s}" + (f", test={test_s}" if test_s else "") + prompt += f'\n' + prompt += f'Description: "{h["description"]}"\n' + if "results" in h: + prompt += "Train results:\n" + for r in h["results"]: + status = "PASS" if r["pass"] else "FAIL" + prompt += f' [{status}] "{r["query"][:80]}" (triggered {r["triggers"]}/{r["runs"]})\n' + if h.get("note"): + prompt += f'Note: {h["note"]}\n' + prompt += "\n\n" + + prompt += f""" + +Skill content (for context on what the skill does): + +{skill_content} + + +Based on the failures, write a new and improved description that is more likely to trigger correctly. When I say "based on the failures", it's a bit of a tricky line to walk because we don't want to overfit to the specific cases you're seeing. So what I DON'T want you to do is produce an ever-expanding list of specific queries that this skill should or shouldn't trigger for. Instead, try to generalize from the failures to broader categories of user intent and situations where this skill would be useful or not useful. The reason for this is twofold: + +1. Avoid overfitting +2. The list might get loooong and it's injected into ALL queries and there might be a lot of skills, so we don't want to blow too much space on any given description. + +Concretely, your description should not be more than about 100-200 words, even if that comes at the cost of accuracy. + +Here are some tips that we've found to work well in writing these descriptions: +- The skill should be phrased in the imperative -- "Use this skill for" rather than "this skill does" +- The skill description should focus on the user's intent, what they are trying to achieve, vs. the implementation details of how the skill works. +- The description competes with other skills for Claude's attention — make it distinctive and immediately recognizable. +- If you're getting lots of failures after repeated attempts, change things up. Try different sentence structures or wordings. + +I'd encourage you to be creative and mix up the style in different iterations since you'll have multiple opportunities to try different approaches and we'll just grab the highest-scoring one at the end. 
+
+Please respond with only the new description text in <description> tags, nothing else."""
+
+    response = client.messages.create(
+        model=model,
+        max_tokens=16000,
+        thinking={
+            "type": "enabled",
+            "budget_tokens": 10000,
+        },
+        messages=[{"role": "user", "content": prompt}],
+    )
+
+    # Extract thinking and text from response
+    thinking_text = ""
+    text = ""
+    for block in response.content:
+        if block.type == "thinking":
+            thinking_text = block.thinking
+        elif block.type == "text":
+            text = block.text
+
+    # Parse out the <description> tags
+    match = re.search(r"<description>(.*?)</description>", text, re.DOTALL)
+    description = match.group(1).strip().strip('"') if match else text.strip().strip('"')
+
+    # Log the transcript
+    transcript: dict = {
+        "iteration": iteration,
+        "prompt": prompt,
+        "thinking": thinking_text,
+        "response": text,
+        "parsed_description": description,
+        "char_count": len(description),
+        "over_limit": len(description) > 1024,
+    }
+
+    # If over 1024 chars, ask the model to shorten it
+    if len(description) > 1024:
+        shorten_prompt = f"Your description is {len(description)} characters, which exceeds the hard 1024 character limit. Please rewrite it to be under 1024 characters while preserving the most important trigger words and intent coverage. Respond with only the new description in <description> tags."
+        shorten_response = client.messages.create(
+            model=model,
+            max_tokens=16000,
+            thinking={
+                "type": "enabled",
+                "budget_tokens": 10000,
+            },
+            messages=[
+                {"role": "user", "content": prompt},
+                {"role": "assistant", "content": text},
+                {"role": "user", "content": shorten_prompt},
+            ],
+        )
+
+        shorten_thinking = ""
+        shorten_text = ""
+        for block in shorten_response.content:
+            if block.type == "thinking":
+                shorten_thinking = block.thinking
+            elif block.type == "text":
+                shorten_text = block.text
+
+        match = re.search(r"<description>(.*?)</description>", shorten_text, re.DOTALL)
+        shortened = match.group(1).strip().strip('"') if match else shorten_text.strip().strip('"')
+
+        transcript["rewrite_prompt"] = shorten_prompt
+        transcript["rewrite_thinking"] = shorten_thinking
+        transcript["rewrite_response"] = shorten_text
+        transcript["rewrite_description"] = shortened
+        transcript["rewrite_char_count"] = len(shortened)
+        description = shortened
+
+    transcript["final_description"] = description
+
+    if log_dir:
+        log_dir.mkdir(parents=True, exist_ok=True)
+        log_file = log_dir / f"improve_iter_{iteration or 'unknown'}.json"
+        log_file.write_text(json.dumps(transcript, indent=2))
+
+    return description
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Improve a skill description based on eval results")
+    parser.add_argument("--eval-results", required=True, help="Path to eval results JSON (from run_eval.py)")
+    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
+    parser.add_argument("--history", default=None, help="Path to history JSON (previous attempts)")
+    parser.add_argument("--model", required=True, help="Model for improvement")
+    parser.add_argument("--verbose", action="store_true", help="Print thinking to stderr")
+    args = parser.parse_args()
+
+    skill_path = Path(args.skill_path)
+    if not (skill_path / "SKILL.md").exists():
+        logger.error(f"Error: No SKILL.md found at {skill_path}")
+        sys.exit(1)
+
+    eval_results = json.loads(Path(args.eval_results).read_text())
+    history = []
+    if args.history:
+        history = json.loads(Path(args.history).read_text())
+
+    name, _, content = parse_skill_md(skill_path)
+    current_description = eval_results["description"]
+
+    if args.verbose:
+        logger.info(f"Current: 
{current_description}") + logger.info(f"Score: {eval_results['summary']['passed']}/{eval_results['summary']['total']}") + + client = anthropic.Anthropic() + new_description = improve_description( + client=client, + skill_name=name, + skill_content=content, + current_description=current_description, + eval_results=eval_results, + history=history, + model=args.model, + ) + + if args.verbose: + logger.info(f"Improved: {new_description}") + + # Output as JSON with both the new description and updated history + output = { + "description": new_description, + "history": history + [{ + "description": current_description, + "passed": eval_results["summary"]["passed"], + "failed": eval_results["summary"]["failed"], + "total": eval_results["summary"]["total"], + "results": eval_results["results"], + }], + } + print(json.dumps(output, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/package_skill.py b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/package_skill.py new file mode 100644 index 000000000..5dbdf7843 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/package_skill.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +""" +Skill Packager - Creates a distributable .skill file of a skill folder + +Usage: + python utils/package_skill.py [output-directory] + +Example: + python utils/package_skill.py skills/public/my-skill + python utils/package_skill.py skills/public/my-skill ./dist +""" + +import fnmatch +import sys +import zipfile +from pathlib import Path + +from loguru import logger +from scripts.quick_validate import validate_skill + +# Patterns to exclude when packaging skills. +EXCLUDE_DIRS = {"__pycache__", "node_modules"} +EXCLUDE_GLOBS = {"*.pyc"} +EXCLUDE_FILES = {".DS_Store"} +# Directories excluded only at the skill root (not when nested deeper). +ROOT_EXCLUDE_DIRS = {"evals"} + + +def should_exclude(rel_path: Path) -> bool: + """Check if a path should be excluded from packaging.""" + parts = rel_path.parts + if any(part in EXCLUDE_DIRS for part in parts): + return True + # rel_path is relative to skill_path.parent, so parts[0] is the skill + # folder name and parts[1] (if present) is the first subdir. + if len(parts) > 1 and parts[1] in ROOT_EXCLUDE_DIRS: + return True + name = rel_path.name + if name in EXCLUDE_FILES: + return True + return any(fnmatch.fnmatch(name, pat) for pat in EXCLUDE_GLOBS) + + +def package_skill(skill_path, output_dir=None): + """ + Package a skill folder into a .skill file. 
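+
+    The archive is a standard zip whose top-level directory is the skill folder
+    name. Build artifacts (__pycache__, node_modules, *.pyc, .DS_Store) and the
+    skill's root-level evals/ directory are excluded.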
+ + Args: + skill_path: Path to the skill folder + output_dir: Optional output directory for the .skill file (defaults to current directory) + + Returns: + Path to the created .skill file, or None if error + """ + skill_path = Path(skill_path).resolve() + + # Validate skill folder exists + if not skill_path.exists(): + logger.error(f"Skill folder not found: {skill_path}") + return None + + if not skill_path.is_dir(): + logger.error(f"Path is not a directory: {skill_path}") + return None + + # Validate SKILL.md exists + skill_md = skill_path / "SKILL.md" + if not skill_md.exists(): + logger.error(f"SKILL.md not found in {skill_path}") + return None + + # Run validation before packaging + logger.info("Validating skill...") + valid, message = validate_skill(skill_path) + if not valid: + logger.error(f"Validation failed: {message}") + logger.error("Please fix the validation errors before packaging.") + return None + logger.info(f"{message}\n") + + # Determine output location + skill_name = skill_path.name + if output_dir: + output_path = Path(output_dir).resolve() + output_path.mkdir(parents=True, exist_ok=True) + else: + output_path = Path.cwd() + + skill_filename = output_path / f"{skill_name}.skill" + + # Create the .skill file (zip format) + try: + with zipfile.ZipFile(skill_filename, 'w', zipfile.ZIP_DEFLATED) as zipf: + # Walk through the skill directory, excluding build artifacts + for file_path in skill_path.rglob('*'): + if not file_path.is_file(): + continue + arcname = file_path.relative_to(skill_path.parent) + if should_exclude(arcname): + logger.debug(f"Skipped: {arcname}") + continue + zipf.write(file_path, arcname) + logger.debug(f"Added: {arcname}") + + logger.info(f"Successfully packaged skill to: {skill_filename}") + return skill_filename + + except Exception as e: + logger.error(f"Error creating .skill file: {e}") + return None + + +def main(): + if len(sys.argv) < 2: + logger.info("Usage: python utils/package_skill.py [output-directory]") + logger.info("\nExample:") + logger.info(" python utils/package_skill.py skills/public/my-skill") + logger.info(" python utils/package_skill.py skills/public/my-skill ./dist") + sys.exit(1) + + skill_path = sys.argv[1] + output_dir = sys.argv[2] if len(sys.argv) > 2 else None + + logger.info(f"Packaging skill: {skill_path}") + if output_dir: + logger.info(f" Output directory: {output_dir}") + logger.info("") + + result = package_skill(skill_path, output_dir) + + if result: + sys.exit(0) + else: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/quick_validate.py b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/quick_validate.py new file mode 100644 index 000000000..36553161e --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/quick_validate.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +""" +Quick validation script for skills - minimal version +""" + +import sys +import os +import re +import yaml +from pathlib import Path + +from loguru import logger + +def validate_skill(skill_path): + """Basic validation of a skill""" + skill_path = Path(skill_path) + + # Check SKILL.md exists + skill_md = skill_path / 'SKILL.md' + if not skill_md.exists(): + return False, "SKILL.md not found" + + # Read and validate frontmatter + content = skill_md.read_text() + if not content.startswith('---'): + return False, "No YAML frontmatter found" + + # Extract 
frontmatter + match = re.match(r'^---\n(.*?)\n---', content, re.DOTALL) + if not match: + return False, "Invalid frontmatter format" + + frontmatter_text = match.group(1) + + # Parse YAML frontmatter + try: + frontmatter = yaml.safe_load(frontmatter_text) + if not isinstance(frontmatter, dict): + return False, "Frontmatter must be a YAML dictionary" + except yaml.YAMLError as e: + return False, f"Invalid YAML in frontmatter: {e}" + + # Define allowed properties + ALLOWED_PROPERTIES = {'name', 'description', 'license', 'allowed-tools', 'metadata', 'compatibility'} + + # Check for unexpected properties (excluding nested keys under metadata) + unexpected_keys = set(frontmatter.keys()) - ALLOWED_PROPERTIES + if unexpected_keys: + return False, ( + f"Unexpected key(s) in SKILL.md frontmatter: {', '.join(sorted(unexpected_keys))}. " + f"Allowed properties are: {', '.join(sorted(ALLOWED_PROPERTIES))}" + ) + + # Check required fields + if 'name' not in frontmatter: + return False, "Missing 'name' in frontmatter" + if 'description' not in frontmatter: + return False, "Missing 'description' in frontmatter" + + # Extract name for validation + name = frontmatter.get('name', '') + if not isinstance(name, str): + return False, f"Name must be a string, got {type(name).__name__}" + name = name.strip() + if name: + # Check naming convention (kebab-case: lowercase with hyphens) + if not re.match(r'^[a-z0-9-]+$', name): + return False, f"Name '{name}' should be kebab-case (lowercase letters, digits, and hyphens only)" + if name.startswith('-') or name.endswith('-') or '--' in name: + return False, f"Name '{name}' cannot start/end with hyphen or contain consecutive hyphens" + # Check name length (max 64 characters per spec) + if len(name) > 64: + return False, f"Name is too long ({len(name)} characters). Maximum is 64 characters." + + # Extract and validate description + description = frontmatter.get('description', '') + if not isinstance(description, str): + return False, f"Description must be a string, got {type(description).__name__}" + description = description.strip() + if description: + # Check for angle brackets + if '<' in description or '>' in description: + return False, "Description cannot contain angle brackets (< or >)" + # Check description length (max 1024 characters per spec) + if len(description) > 1024: + return False, f"Description is too long ({len(description)} characters). Maximum is 1024 characters." + + # Validate compatibility field if present (optional) + compatibility = frontmatter.get('compatibility', '') + if compatibility: + if not isinstance(compatibility, str): + return False, f"Compatibility must be a string, got {type(compatibility).__name__}" + if len(compatibility) > 500: + return False, f"Compatibility is too long ({len(compatibility)} characters). Maximum is 500 characters." + + return True, "Skill is valid!" 
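+
+# Illustrative example of a frontmatter block that passes the checks above
+# (hypothetical values; any kebab-case name and an angle-bracket-free
+# description under 1024 characters would do):
+#
+#   ---
+#   name: report-formatter
+#   description: Use this skill when the user wants to turn raw notes into a formatted report.
+#   license: MIT
+#   ---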
+ +if __name__ == "__main__": + if len(sys.argv) != 2: + logger.info("Usage: python quick_validate.py ") + sys.exit(1) + + valid, message = validate_skill(sys.argv[1]) + if valid: + logger.info(message) + else: + logger.error(message) + sys.exit(0 if valid else 1) \ No newline at end of file diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/run_eval.py b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/run_eval.py new file mode 100644 index 000000000..f923066ca --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/run_eval.py @@ -0,0 +1,312 @@ +#!/usr/bin/env python3 +"""Run trigger evaluation for a skill description. + +Tests whether a skill's description causes Claude to trigger (read the skill) +for a set of queries. Outputs results as JSON. +""" + +import argparse +import json +import os +import select +import subprocess +import sys +import time +import uuid +from concurrent.futures import ProcessPoolExecutor, as_completed +from pathlib import Path + +from loguru import logger + +from scripts.utils import parse_skill_md + + +def find_project_root() -> Path: + """Find the project root by walking up from cwd looking for .claude/. + + Mimics how Claude Code discovers its project root, so the command file + we create ends up where claude -p will look for it. + """ + current = Path.cwd() + for parent in [current, *current.parents]: + if (parent / ".claude").is_dir(): + return parent + return current + + +def run_single_query( + query: str, + skill_name: str, + skill_description: str, + timeout: int, + project_root: str, + model: str | None = None, +) -> bool: + """Run a single query and return whether the skill was triggered. + + Creates a command file in .claude/commands/ so it appears in Claude's + available_skills list, then runs `claude -p` with the raw query. + Uses --include-partial-messages to detect triggering early from + stream events (content_block_start) rather than waiting for the + full assistant message, which only arrives after tool execution. + """ + unique_id = uuid.uuid4().hex[:8] + clean_name = f"{skill_name}-skill-{unique_id}" + project_commands_dir = Path(project_root) / ".claude" / "commands" + command_file = project_commands_dir / f"{clean_name}.md" + + try: + project_commands_dir.mkdir(parents=True, exist_ok=True) + # Use YAML block scalar to avoid breaking on quotes in description + indented_desc = "\n ".join(skill_description.split("\n")) + command_content = ( + f"---\n" + f"description: |\n" + f" {indented_desc}\n" + f"---\n\n" + f"# {skill_name}\n\n" + f"This skill handles: {skill_description}\n" + ) + command_file.write_text(command_content) + + cmd = [ + "claude", + "-p", query, + "--output-format", "stream-json", + "--verbose", + "--include-partial-messages", + ] + if model: + cmd.extend(["--model", model]) + + # Remove CLAUDECODE env var to allow nesting claude -p inside a + # Claude Code session. The guard is for interactive terminal conflicts; + # programmatic subprocess usage is safe. 
+ env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + cwd=project_root, + env=env, + ) + + triggered = False + start_time = time.time() + buffer = "" + # Track state for stream event detection + pending_tool_name = None + accumulated_json = "" + + try: + while time.time() - start_time < timeout: + if process.poll() is not None: + remaining = process.stdout.read() + if remaining: + buffer += remaining.decode("utf-8", errors="replace") + break + + ready, _, _ = select.select([process.stdout], [], [], 1.0) + if not ready: + continue + + chunk = os.read(process.stdout.fileno(), 8192) + if not chunk: + break + buffer += chunk.decode("utf-8", errors="replace") + + while "\n" in buffer: + line, buffer = buffer.split("\n", 1) + line = line.strip() + if not line: + continue + + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + + # Early detection via stream events + if event.get("type") == "stream_event": + se = event.get("event", {}) + se_type = se.get("type", "") + + if se_type == "content_block_start": + cb = se.get("content_block", {}) + if cb.get("type") == "tool_use": + tool_name = cb.get("name", "") + if tool_name in ("Skill", "Read"): + pending_tool_name = tool_name + accumulated_json = "" + else: + return False + + elif se_type == "content_block_delta" and pending_tool_name: + delta = se.get("delta", {}) + if delta.get("type") == "input_json_delta": + accumulated_json += delta.get("partial_json", "") + if clean_name in accumulated_json: + return True + + elif se_type in ("content_block_stop", "message_stop"): + if pending_tool_name: + return clean_name in accumulated_json + if se_type == "message_stop": + return False + + # Fallback: full assistant message + elif event.get("type") == "assistant": + message = event.get("message", {}) + for content_item in message.get("content", []): + if content_item.get("type") != "tool_use": + continue + tool_name = content_item.get("name", "") + tool_input = content_item.get("input", {}) + if tool_name == "Skill" and clean_name in tool_input.get("skill", ""): + triggered = True + elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""): + triggered = True + return triggered + + elif event.get("type") == "result": + return triggered + finally: + # Clean up process on any exit path (return, exception, timeout) + if process.poll() is None: + process.kill() + process.wait() + + return triggered + finally: + if command_file.exists(): + command_file.unlink() + + +def run_eval( + eval_set: list[dict], + skill_name: str, + description: str, + num_workers: int, + timeout: int, + project_root: Path, + runs_per_query: int = 1, + trigger_threshold: float = 0.5, + model: str | None = None, +) -> dict: + """Run the full eval set and return results.""" + results = [] + + with ProcessPoolExecutor(max_workers=num_workers) as executor: + future_to_info = {} + for item in eval_set: + for run_idx in range(runs_per_query): + future = executor.submit( + run_single_query, + item["query"], + skill_name, + description, + timeout, + str(project_root), + model, + ) + future_to_info[future] = (item, run_idx) + + query_triggers: dict[str, list[bool]] = {} + query_items: dict[str, dict] = {} + for future in as_completed(future_to_info): + item, _ = future_to_info[future] + query = item["query"] + query_items[query] = item + if query not in query_triggers: + query_triggers[query] = [] + try: + 
query_triggers[query].append(future.result()) + except Exception as e: + logger.warning(f"Warning: query failed: {e}") + query_triggers[query].append(False) + + for query, triggers in query_triggers.items(): + item = query_items[query] + trigger_rate = sum(triggers) / len(triggers) + should_trigger = item["should_trigger"] + if should_trigger: + did_pass = trigger_rate >= trigger_threshold + else: + did_pass = trigger_rate < trigger_threshold + results.append({ + "query": query, + "should_trigger": should_trigger, + "trigger_rate": trigger_rate, + "triggers": sum(triggers), + "runs": len(triggers), + "pass": did_pass, + }) + + passed = sum(1 for r in results if r["pass"]) + total = len(results) + + return { + "skill_name": skill_name, + "description": description, + "results": results, + "summary": { + "total": total, + "passed": passed, + "failed": total - passed, + }, + } + + +def main(): + parser = argparse.ArgumentParser(description="Run trigger evaluation for a skill description") + parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--description", default=None, help="Override description to test") + parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") + parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds") + parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query") + parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold") + parser.add_argument("--model", default=None, help="Model to use for claude -p (default: user's configured model)") + parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") + args = parser.parse_args() + + eval_set = json.loads(Path(args.eval_set).read_text()) + skill_path = Path(args.skill_path) + + if not (skill_path / "SKILL.md").exists(): + logger.error(f"Error: No SKILL.md found at {skill_path}") + sys.exit(1) + + name, original_description, content = parse_skill_md(skill_path) + description = args.description or original_description + project_root = find_project_root() + + if args.verbose: + logger.info(f"Evaluating: {description}") + + output = run_eval( + eval_set=eval_set, + skill_name=name, + description=description, + num_workers=args.num_workers, + timeout=args.timeout, + project_root=project_root, + runs_per_query=args.runs_per_query, + trigger_threshold=args.trigger_threshold, + model=args.model, + ) + + if args.verbose: + summary = output["summary"] + logger.info(f"Results: {summary['passed']}/{summary['total']} passed") + for r in output["results"]: + status = "PASS" if r["pass"] else "FAIL" + rate_str = f"{r['triggers']}/{r['runs']}" + logger.info(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:70]}") + + print(json.dumps(output, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/run_loop.py b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/run_loop.py new file mode 100644 index 000000000..a2907d6e0 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/run_loop.py @@ -0,0 +1,333 @@ +#!/usr/bin/env python3 +"""Run the eval + improve loop until all pass or max iterations reached. 
+ +Combines run_eval.py and improve_description.py in a loop, tracking history +and returning the best description found. Supports train/test split to prevent +overfitting. +""" + +import argparse +import json +import random +import sys +import tempfile +import time +import webbrowser +from pathlib import Path + +import anthropic +from loguru import logger + +from scripts.generate_report import generate_html +from scripts.improve_description import improve_description +from scripts.run_eval import find_project_root, run_eval +from scripts.utils import parse_skill_md + + +def split_eval_set(eval_set: list[dict], holdout: float, seed: int = 42) -> tuple[list[dict], list[dict]]: + """Split eval set into train and test sets, stratified by should_trigger.""" + random.seed(seed) + + # Separate by should_trigger + trigger = [e for e in eval_set if e["should_trigger"]] + no_trigger = [e for e in eval_set if not e["should_trigger"]] + + # Shuffle each group + random.shuffle(trigger) + random.shuffle(no_trigger) + + # Calculate split points + n_trigger_test = max(1, int(len(trigger) * holdout)) + n_no_trigger_test = max(1, int(len(no_trigger) * holdout)) + + # Split + test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test] + train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:] + + return train_set, test_set + + +def run_loop( + eval_set: list[dict], + skill_path: Path, + description_override: str | None, + num_workers: int, + timeout: int, + max_iterations: int, + runs_per_query: int, + trigger_threshold: float, + holdout: float, + model: str, + verbose: bool, + live_report_path: Path | None = None, + log_dir: Path | None = None, +) -> dict: + """Run the eval + improvement loop.""" + project_root = find_project_root() + name, original_description, content = parse_skill_md(skill_path) + current_description = description_override or original_description + + # Split into train/test if holdout > 0 + if holdout > 0: + train_set, test_set = split_eval_set(eval_set, holdout) + if verbose: + logger.info(f"Split: {len(train_set)} train, {len(test_set)} test (holdout={holdout})") + else: + train_set = eval_set + test_set = [] + + client = anthropic.Anthropic() + history = [] + exit_reason = "unknown" + + for iteration in range(1, max_iterations + 1): + if verbose: + logger.info(f"\n{'='*60}") + logger.info(f"Iteration {iteration}/{max_iterations}") + logger.info(f"Description: {current_description}") + logger.info(f"{'='*60}") + + # Evaluate train + test together in one batch for parallelism + all_queries = train_set + test_set + t0 = time.time() + all_results = run_eval( + eval_set=all_queries, + skill_name=name, + description=current_description, + num_workers=num_workers, + timeout=timeout, + project_root=project_root, + runs_per_query=runs_per_query, + trigger_threshold=trigger_threshold, + model=model, + ) + eval_elapsed = time.time() - t0 + + # Split results back into train/test by matching queries + train_queries_set = {q["query"] for q in train_set} + train_result_list = [r for r in all_results["results"] if r["query"] in train_queries_set] + test_result_list = [r for r in all_results["results"] if r["query"] not in train_queries_set] + + train_passed = sum(1 for r in train_result_list if r["pass"]) + train_total = len(train_result_list) + train_summary = {"passed": train_passed, "failed": train_total - train_passed, "total": train_total} + train_results = {"results": train_result_list, "summary": train_summary} + + if test_set: + test_passed = sum(1 for r in 
test_result_list if r["pass"]) + test_total = len(test_result_list) + test_summary = {"passed": test_passed, "failed": test_total - test_passed, "total": test_total} + test_results = {"results": test_result_list, "summary": test_summary} + else: + test_results = None + test_summary = None + + history.append({ + "iteration": iteration, + "description": current_description, + "train_passed": train_summary["passed"], + "train_failed": train_summary["failed"], + "train_total": train_summary["total"], + "train_results": train_results["results"], + "test_passed": test_summary["passed"] if test_summary else None, + "test_failed": test_summary["failed"] if test_summary else None, + "test_total": test_summary["total"] if test_summary else None, + "test_results": test_results["results"] if test_results else None, + # For backward compat with report generator + "passed": train_summary["passed"], + "failed": train_summary["failed"], + "total": train_summary["total"], + "results": train_results["results"], + }) + + # Write live report if path provided + if live_report_path: + partial_output = { + "original_description": original_description, + "best_description": current_description, + "best_score": "in progress", + "iterations_run": len(history), + "holdout": holdout, + "train_size": len(train_set), + "test_size": len(test_set), + "history": history, + } + live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name)) + + if verbose: + def print_eval_stats(label, results, elapsed): + pos = [r for r in results if r["should_trigger"]] + neg = [r for r in results if not r["should_trigger"]] + tp = sum(r["triggers"] for r in pos) + pos_runs = sum(r["runs"] for r in pos) + fn = pos_runs - tp + fp = sum(r["triggers"] for r in neg) + neg_runs = sum(r["runs"] for r in neg) + tn = neg_runs - fp + total = tp + tn + fp + fn + precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0 + recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0 + accuracy = (tp + tn) / total if total > 0 else 0.0 + logger.info(f"{label}: {tp+tn}/{total} correct, precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)") + for r in results: + status = "PASS" if r["pass"] else "FAIL" + rate_str = f"{r['triggers']}/{r['runs']}" + logger.info(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}") + + print_eval_stats("Train", train_results["results"], eval_elapsed) + if test_summary: + print_eval_stats("Test ", test_results["results"], 0) + + if train_summary["failed"] == 0: + exit_reason = f"all_passed (iteration {iteration})" + if verbose: + logger.info(f"\nAll train queries passed on iteration {iteration}!") + break + + if iteration == max_iterations: + exit_reason = f"max_iterations ({max_iterations})" + if verbose: + logger.info(f"\nMax iterations reached ({max_iterations}).") + break + + # Improve the description based on train results + if verbose: + logger.info(f"\nImproving description...") + + t0 = time.time() + # Strip test scores from history so improvement model can't see them + blinded_history = [ + {k: v for k, v in h.items() if not k.startswith("test_")} + for h in history + ] + new_description = improve_description( + client=client, + skill_name=name, + skill_content=content, + current_description=current_description, + eval_results=train_results, + history=blinded_history, + model=model, + log_dir=log_dir, + iteration=iteration, + ) + improve_elapsed = time.time() - t0 + + if verbose: + logger.info(f"Proposed ({improve_elapsed:.1f}s): 
{new_description}") + + current_description = new_description + + # Find the best iteration by TEST score (or train if no test set) + if test_set: + best = max(history, key=lambda h: h["test_passed"] or 0) + best_score = f"{best['test_passed']}/{best['test_total']}" + else: + best = max(history, key=lambda h: h["train_passed"]) + best_score = f"{best['train_passed']}/{best['train_total']}" + + if verbose: + logger.info(f"\nExit reason: {exit_reason}") + logger.info(f"Best score: {best_score} (iteration {best['iteration']})") + + return { + "exit_reason": exit_reason, + "original_description": original_description, + "best_description": best["description"], + "best_score": best_score, + "best_train_score": f"{best['train_passed']}/{best['train_total']}", + "best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None, + "final_description": current_description, + "iterations_run": len(history), + "holdout": holdout, + "train_size": len(train_set), + "test_size": len(test_set), + "history": history, + } + + +def main(): + parser = argparse.ArgumentParser(description="Run eval + improve loop") + parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--description", default=None, help="Override starting description") + parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") + parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds") + parser.add_argument("--max-iterations", type=int, default=5, help="Max improvement iterations") + parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query") + parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold") + parser.add_argument("--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)") + parser.add_argument("--model", required=True, help="Model for improvement") + parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") + parser.add_argument("--report", default="auto", help="Generate HTML report at this path (default: 'auto' for temp file, 'none' to disable)") + parser.add_argument("--results-dir", default=None, help="Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here") + args = parser.parse_args() + + eval_set = json.loads(Path(args.eval_set).read_text()) + skill_path = Path(args.skill_path) + + if not (skill_path / "SKILL.md").exists(): + logger.error(f"Error: No SKILL.md found at {skill_path}") + sys.exit(1) + + name, _, _ = parse_skill_md(skill_path) + + # Set up live report path + if args.report != "none": + if args.report == "auto": + timestamp = time.strftime("%Y%m%d_%H%M%S") + live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html" + else: + live_report_path = Path(args.report) + # Open the report immediately so the user can watch + live_report_path.write_text("
Starting optimization loop...
") + webbrowser.open(str(live_report_path)) + else: + live_report_path = None + + # Determine output directory (create before run_loop so logs can be written) + if args.results_dir: + timestamp = time.strftime("%Y-%m-%d_%H%M%S") + results_dir = Path(args.results_dir) / timestamp + results_dir.mkdir(parents=True, exist_ok=True) + else: + results_dir = None + + log_dir = results_dir / "logs" if results_dir else None + + output = run_loop( + eval_set=eval_set, + skill_path=skill_path, + description_override=args.description, + num_workers=args.num_workers, + timeout=args.timeout, + max_iterations=args.max_iterations, + runs_per_query=args.runs_per_query, + trigger_threshold=args.trigger_threshold, + holdout=args.holdout, + model=args.model, + verbose=args.verbose, + live_report_path=live_report_path, + log_dir=log_dir, + ) + + # Save JSON output + json_output = json.dumps(output, indent=2) + print(json_output) + if results_dir: + (results_dir / "results.json").write_text(json_output) + + # Write final HTML report (without auto-refresh) + if live_report_path: + live_report_path.write_text(generate_html(output, auto_refresh=False, skill_name=name)) + logger.info(f"\nReport: {live_report_path}") + + if results_dir and live_report_path: + (results_dir / "report.html").write_text(generate_html(output, auto_refresh=False, skill_name=name)) + + if results_dir: + logger.info(f"Results saved to: {results_dir}") + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/utils.py b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/utils.py new file mode 100644 index 000000000..51b6a07dd --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/utils.py @@ -0,0 +1,47 @@ +"""Shared utilities for skill-creator scripts.""" + +from pathlib import Path + + + +def parse_skill_md(skill_path: Path) -> tuple[str, str, str]: + """Parse a SKILL.md file, returning (name, description, full_content).""" + content = (skill_path / "SKILL.md").read_text() + lines = content.split("\n") + + if lines[0].strip() != "---": + raise ValueError("SKILL.md missing frontmatter (no opening ---)") + + end_idx = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == "---": + end_idx = i + break + + if end_idx is None: + raise ValueError("SKILL.md missing frontmatter (no closing ---)") + + name = "" + description = "" + frontmatter_lines = lines[1:end_idx] + i = 0 + while i < len(frontmatter_lines): + line = frontmatter_lines[i] + if line.startswith("name:"): + name = line[len("name:"):].strip().strip('"').strip("'") + elif line.startswith("description:"): + value = line[len("description:"):].strip() + # Handle YAML multiline indicators (>, |, >-, |-) + if value in (">", "|", ">-", "|-"): + continuation_lines: list[str] = [] + i += 1 + while i < len(frontmatter_lines) and (frontmatter_lines[i].startswith(" ") or frontmatter_lines[i].startswith("\t")): + continuation_lines.append(frontmatter_lines[i].strip()) + i += 1 + description = " ".join(continuation_lines) + continue + else: + description = value.strip('"').strip("'") + i += 1 + + return name, description, content diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/soul.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/soul.md new file mode 100644 index 000000000..1554c3463 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/soul.md @@ -0,0 
+1,16 @@ +# Soul — {{agent_name}} + +## Identity +- **名称**: {{agent_name}} +- **角色**: {{role_description}} +- **创建者**: {{creator_name}} +- **创建时间**: {{created_at}} + +## Personality +- 认真负责、注重细节 +- 主动汇报工作进展 +- 遇到不确定的信息会主动确认 + +## Boundaries +- 遵守企业保密制度 +- 敏感操作需经过创建者审批 diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/state.json b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/state.json new file mode 100644 index 000000000..0507e31dd --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/state.json @@ -0,0 +1,13 @@ +{ + "agent_id": "", + "name": "", + "status": "idle", + "current_task": null, + "last_active": null, + "channel_status": {}, + "stats": { + "tasks_completed_today": 0, + "tasks_in_progress": 0, + "督办_pending": 0 + } +} \ No newline at end of file diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/todo.json b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/todo.json new file mode 100644 index 000000000..50ffbb9a9 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/todo.json @@ -0,0 +1,3 @@ +{ + "tasks": [] +} diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/workspace/archived/.gitkeep b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/workspace/archived/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.vite/deps/_metadata.json b/.vite/deps/_metadata.json deleted file mode 100644 index bc8e5d421..000000000 --- a/.vite/deps/_metadata.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "hash": "6bfe7905", - "configHash": "4280ba71", - "lockfileHash": "e3b0c442", - "browserHash": "25813ad8", - "optimized": {}, - "chunks": {} -} \ No newline at end of file diff --git a/.vite/deps/package.json b/.vite/deps/package.json deleted file mode 100644 index 3dbc1ca59..000000000 --- a/.vite/deps/package.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "module" -} diff --git a/backend/VERSION b/backend/VERSION index 9ab8337f3..dcb0bac8a 100644 --- a/backend/VERSION +++ b/backend/VERSION @@ -1 +1 @@ -1.9.1 +1.8.3-beta.2 diff --git a/backend/app/core/logging_config.py b/backend/app/core/logging_config.py index c9afab182..107b9904f 100644 --- a/backend/app/core/logging_config.py +++ b/backend/app/core/logging_config.py @@ -36,19 +36,50 @@ def set_trace_id(trace_id: str) -> None: def configure_logging(): """Configure loguru with custom format including trace ID.""" - # Remove default handler + import os + is_windows = os.name == 'nt' + logger.remove() - # Add stdout handler with custom format and filter to ensure trace_id exists - logger.add( - sys.stdout, - level="INFO", - format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {extra[trace_id]:-<12} | {name}:{line} - {message}", - enqueue=True, - backtrace=True, - diagnose=True, - filter=lambda record: (record["extra"].setdefault("trace_id", get_trace_id() or str(uuid4())) is not None) - ) + if is_windows: + fmt = "{time:YYYY-MM-DD HH:mm:ss} | {level} | {extra[trace_id]:-<12} | {name}:{line} - {message}" + + class EncodedStream: + def __init__(self, stream): + self.stream = stream + self.encoding = 'utf-8' + + def write(self, text): + try: + self.stream.write(text) + except UnicodeEncodeError: + clean_text = text.encode('utf-8', errors='replace').decode('utf-8') + self.stream.write(clean_text) + + def flush(self): + self.stream.flush() + + logger.add( + EncodedStream(sys.stdout), + level="INFO", + format=fmt, + enqueue=True, + backtrace=True, + diagnose=True, + filter=lambda record: 
(record["extra"].setdefault("trace_id", get_trace_id() or str(uuid4())) is not None), + colorize=False, + ) + else: + logger.add( + sys.stdout, + level="INFO", + format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {extra[trace_id]:-<12} | {name}:{line} - {message}", + enqueue=True, + backtrace=True, + diagnose=True, + filter=lambda record: (record["extra"].setdefault("trace_id", get_trace_id() or str(uuid4())) is not None), + colorize=True, + ) return logger diff --git a/backend/app/main.py b/backend/app/main.py index 0f4691004..3f09a584a 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -1,5 +1,7 @@ """Clawith Backend — FastAPI Application Entry Point.""" +import os +import subprocess from contextlib import asynccontextmanager from pathlib import Path import shutil @@ -190,6 +192,18 @@ async def lifespan(app: FastAPI): if _tenant: _new_dir = _data_dir / f"enterprise_info_{_tenant.id}" if not _new_dir.exists(): + # Set permissions on parent directory first + if os.name == 'nt': + try: + subprocess.run( + ['icacls', str(_data_dir), '/grant', 'Everyone:F', '/T', '/C'], + check=False, + capture_output=True, + text=True + ) + except Exception as e: + print(f"[startup] ⚠️ Failed to set permissions for {_data_dir}: {e}", flush=True) + shutil.copytree(str(_old_dir), str(_new_dir)) print(f"[startup] ✅ Migrated enterprise_info → enterprise_info_{_tenant.id}", flush=True) else: diff --git a/backend/app/services/agent_manager.py b/backend/app/services/agent_manager.py index 9f4c4a73c..f7d0706b2 100644 --- a/backend/app/services/agent_manager.py +++ b/backend/app/services/agent_manager.py @@ -1,7 +1,7 @@ -"""Agent lifecycle manager — Docker container management for OpenClaw Gateway instances.""" - import json +import os import shutil +import subprocess import uuid from datetime import datetime, timezone from pathlib import Path @@ -20,13 +20,32 @@ settings = get_settings() +def _set_directory_permissions(path: Path): + """Set proper permissions for a directory on Windows.""" + if os.name == 'nt': + try: + subprocess.run( + ['icacls', str(path), '/grant', 'Everyone:F', '/T', '/C'], + check=False, + capture_output=True, + text=True + ) + except Exception as e: + logger.warning(f"Failed to set permissions for {path}: {e}") + else: + try: + os.chmod(path, 0o755) + except Exception as e: + logger.warning(f"Failed to set permissions for {path}: {e}") + + class AgentManager: """Manage OpenClaw Gateway Docker containers for digital employees.""" def __init__(self): try: self.docker_client = docker.from_env() - except DockerException: + except Exception: logger.warning("Docker not available — agent containers will not be managed") self.docker_client = None @@ -46,99 +65,132 @@ async def initialize_agent_files(self, db: AsyncSession, agent: Agent, logger.warning(f"Agent dir already exists: {agent_dir}") return - if template_dir.exists(): - # Copy template - shutil.copytree(str(template_dir), str(agent_dir)) - else: - # No template dir (local dev) — create minimal workspace structure - logger.info(f"Template dir not found ({template_dir}), creating minimal workspace") - agent_dir.mkdir(parents=True, exist_ok=True) - (agent_dir / "workspace").mkdir(exist_ok=True) - (agent_dir / "workspace" / "knowledge_base").mkdir(exist_ok=True) - (agent_dir / "memory").mkdir(exist_ok=True) - (agent_dir / "skills").mkdir(exist_ok=True) - (agent_dir / "tasks.json").write_text("[]", encoding="utf-8") - - # Customize soul.md - soul_path = agent_dir / "soul.md" - # Get creator name - from app.models.user import User - result 
= await db.execute(select(User).where(User.id == agent.creator_id)) - creator = result.scalar_one_or_none() - creator_name = creator.display_name if creator else "Unknown" - - soul_content = f"# Personality\n\nI'm {agent.name}, {agent.role_description or 'a digital assistant'}.\n" - if soul_path.exists(): - template_content = soul_path.read_text() - soul_content = template_content.replace("{{agent_name}}", agent.name) - soul_content = soul_content.replace("{{role_description}}", agent.role_description or "通用助手") - soul_content = soul_content.replace("{{creator_name}}", creator_name) - soul_content = soul_content.replace("{{created_at}}", datetime.now(timezone.utc).strftime("%Y-%m-%d")) - - # Helper function to replace or append sections - def replace_or_append_section(content: str, section_name: str, section_content: str) -> str: - """Replace existing ## SectionName or append if not found.""" - if not section_content: - return content - - # Pattern to match existing section (case-insensitive header) - import re - pattern = rf"^##\s+{re.escape(section_name)}\s*$" - lines = content.split('\n') - - # Find the section header - for i, line in enumerate(lines): - if re.match(pattern, line.strip(), re.IGNORECASE): - # Found existing section - replace until next ## header or end - section_start = i - section_end = len(lines) - for j in range(i + 1, len(lines)): - if lines[j].strip().startswith('## '): - section_end = j - break - - # Replace the section content (with trailing newline for proper spacing) - new_section = f"## {section_name}\n{section_content}\n" - lines = lines[:section_start] + [new_section] + lines[section_end:] - return '\n'.join(lines) - - # Section not found - append at the end - return content + f"\n## {section_name}\n{section_content}\n" - - # Use the helper to replace or append Personality and Boundaries - soul_content = replace_or_append_section(soul_content, "Personality", personality) - soul_content = replace_or_append_section(soul_content, "Boundaries", boundaries) - - soul_path.write_text(soul_content, encoding="utf-8") - - # Ensure memory.md exists - mem_path = agent_dir / "memory" / "memory.md" - if not mem_path.exists(): - mem_path.write_text("# Memory\n\n_Record important information and knowledge here._\n", encoding="utf-8") - - # Ensure reflections.md exists — copy from central template - refl_path = agent_dir / "memory" / "reflections.md" - if not refl_path.exists(): - refl_template = Path(__file__).parent.parent / "templates" / "reflections.md" - refl_content = refl_template.read_text(encoding="utf-8") if refl_template.exists() else "# Reflections Journal\n" - refl_path.write_text(refl_content, encoding="utf-8") - - # Ensure HEARTBEAT.md exists — copy from central template - hb_path = agent_dir / "HEARTBEAT.md" - if not hb_path.exists(): - hb_template = Path(__file__).parent.parent / "templates" / "HEARTBEAT.md" - hb_content = hb_template.read_text(encoding="utf-8") if hb_template.exists() else "# Heartbeat Instructions\n" - hb_path.write_text(hb_content, encoding="utf-8") - - # Customize state.json - state_path = agent_dir / "state.json" - if state_path.exists(): - state = json.loads(state_path.read_text()) - state["agent_id"] = str(agent.id) - state["name"] = agent.name - state_path.write_text(json.dumps(state, ensure_ascii=False, indent=2), encoding="utf-8") - - logger.info(f"Initialized agent files at {agent_dir}") + try: + # Create parent directories first with proper permissions + agent_dir.parent.mkdir(parents=True, exist_ok=True) + try: + 
_set_directory_permissions(agent_dir.parent) + except Exception: + logger.warning(f"Failed to set permissions for {agent_dir.parent}, continuing") + + if template_dir.exists(): + # First create empty agent_dir with proper permissions + # This ensures shutil.copytree can write to it + agent_dir.mkdir(parents=True, exist_ok=True) + try: + _set_directory_permissions(agent_dir) + except Exception: + logger.warning(f"Failed to set permissions for {agent_dir}, continuing") + + # Copy template files + shutil.copytree(str(template_dir), str(agent_dir), dirs_exist_ok=True) + try: + _set_directory_permissions(agent_dir) + except Exception: + logger.warning(f"Failed to set permissions for {agent_dir}, continuing") + else: + # No template dir (local dev) — create minimal workspace structure + logger.info(f"Template dir not found ({template_dir}), creating minimal workspace") + agent_dir.mkdir(parents=True, exist_ok=True) + try: + _set_directory_permissions(agent_dir) + except Exception: + logger.warning(f"Failed to set permissions for {agent_dir}, continuing") + (agent_dir / "workspace").mkdir(exist_ok=True) + (agent_dir / "workspace" / "knowledge_base").mkdir(exist_ok=True) + (agent_dir / "memory").mkdir(exist_ok=True) + (agent_dir / "skills").mkdir(exist_ok=True) + (agent_dir / "tasks.json").write_text("[]", encoding="utf-8") + except Exception as e: + # If creating files fails, log it but continue with agent creation + logger.error(f"Failed to initialize agent files for {agent.name}: {e}") + logger.warning("Continuing with agent creation without workspace files") + + try: + # Customize soul.md + soul_path = agent_dir / "soul.md" + # Get creator name + from app.models.user import User + result = await db.execute(select(User).where(User.id == agent.creator_id)) + creator = result.scalar_one_or_none() + creator_name = creator.display_name if creator else "Unknown" + + soul_content = f"# Personality\n\nI'm {agent.name}, {agent.role_description or 'a digital assistant'}.\n" + if soul_path.exists(): + template_content = soul_path.read_text() + soul_content = template_content.replace("{{agent_name}}", agent.name) + soul_content = soul_content.replace("{{role_description}}", agent.role_description or "通用助手") + soul_content = soul_content.replace("{{creator_name}}", creator_name) + soul_content = soul_content.replace("{{created_at}}", datetime.now(timezone.utc).strftime("%Y-%m-%d")) + + # Helper function to replace or append sections + def replace_or_append_section(content: str, section_name: str, section_content: str) -> str: + """Replace existing ## SectionName or append if not found.""" + if not section_content: + return content + + # Pattern to match existing section (case-insensitive header) + import re + pattern = rf"^##\s+{re.escape(section_name)}\s*$" + lines = content.split('\n') + + # Find the section header + for i, line in enumerate(lines): + if re.match(pattern, line.strip(), re.IGNORECASE): + # Found existing section - replace until next ## header or end + section_start = i + section_end = len(lines) + for j in range(i + 1, len(lines)): + if lines[j].strip().startswith('## '): + section_end = j + break + + # Replace the section content (with trailing newline for proper spacing) + new_section = f"## {section_name}\n{section_content}\n" + lines = lines[:section_start] + [new_section] + lines[section_end:] + return '\n'.join(lines) + + # Section not found - append at the end + return content + f"\n## {section_name}\n{section_content}\n" + + # Use the helper to replace or append Personality and 
Boundaries + soul_content = replace_or_append_section(soul_content, "Personality", personality) + soul_content = replace_or_append_section(soul_content, "Boundaries", boundaries) + + soul_path.write_text(soul_content, encoding="utf-8") + + # Ensure memory.md exists + mem_path = agent_dir / "memory" / "memory.md" + if not mem_path.exists(): + mem_path.write_text("# Memory\n\n_Record important information and knowledge here._\n", encoding="utf-8") + + # Ensure reflections.md exists — copy from central template + refl_path = agent_dir / "memory" / "reflections.md" + if not refl_path.exists(): + refl_template = Path(__file__).parent.parent / "templates" / "reflections.md" + refl_content = refl_template.read_text(encoding="utf-8") if refl_template.exists() else "# Reflections Journal\n" + refl_path.write_text(refl_content, encoding="utf-8") + + # Ensure HEARTBEAT.md exists — copy from central template + hb_path = agent_dir / "HEARTBEAT.md" + if not hb_path.exists(): + hb_template = Path(__file__).parent.parent / "templates" / "HEARTBEAT.md" + hb_content = hb_template.read_text(encoding="utf-8") if hb_template.exists() else "# Heartbeat Instructions\n" + hb_path.write_text(hb_content, encoding="utf-8") + + # Customize state.json + state_path = agent_dir / "state.json" + if state_path.exists(): + state = json.loads(state_path.read_text()) + state["agent_id"] = str(agent.id) + state["name"] = agent.name + state_path.write_text(json.dumps(state, ensure_ascii=False, indent=2), encoding="utf-8") + + logger.info(f"Initialized agent files at {agent_dir}") + except Exception as e: + # If writing files fails, log it but continue with agent creation + logger.error(f"Failed to write agent files for {agent.name}: {e}") + logger.warning("Continuing with agent creation without workspace files") def _generate_openclaw_config(self, agent: Agent, model: LLMModel | None) -> dict: """Generate openclaw.json config for the agent container.""" @@ -182,13 +234,22 @@ async def start_container(self, db: AsyncSession, agent: Agent) -> str | None: # Generate OpenClaw config config = self._generate_openclaw_config(agent, model) config_dir = agent_dir / ".openclaw" - config_dir.mkdir(parents=True, exist_ok=True) - (config_dir / "openclaw.json").write_text(json.dumps(config, indent=2), encoding="utf-8") - - # Create workspace symlink - workspace_dir = config_dir / "workspace" - if not workspace_dir.exists(): - workspace_dir.symlink_to(agent_dir / "workspace") + try: + config_dir.mkdir(parents=True, exist_ok=True) + (config_dir / "openclaw.json").write_text(json.dumps(config, indent=2), encoding="utf-8") + + # Create workspace symlink (or junction on Windows) + workspace_dir = config_dir / "workspace" + if not workspace_dir.exists(): + try: + workspace_dir.symlink_to(agent_dir / "workspace") + except OSError: + try: + workspace_dir.symlink_to(agent_dir / "workspace", target_is_directory=True) + except OSError: + logger.warning(f"Cannot create symlink for {workspace_dir}, skipping") + except Exception as e: + logger.warning(f"Failed to create OpenClaw config: {e}, skipping container start") # Assign a unique port container_port = 18789 + hash(str(agent.id)) % 10000 @@ -221,7 +282,7 @@ async def start_container(self, db: AsyncSession, agent: Agent) -> str | None: logger.info(f"Started container {container.id[:12]} for agent {agent.name} on port {container_port}") return container.id - except DockerException as e: + except Exception as e: logger.error(f"Failed to start container for agent {agent.name}: {e}") agent.status = 
"error" return None diff --git a/backend/app/services/agent_seeder.py b/backend/app/services/agent_seeder.py index e267c951e..c653f2fb8 100644 --- a/backend/app/services/agent_seeder.py +++ b/backend/app/services/agent_seeder.py @@ -1,6 +1,8 @@ """Seed default agents (Morty & Meeseeks) on first platform startup.""" +import os import shutil +import subprocess import uuid from datetime import datetime, timezone from pathlib import Path @@ -194,11 +196,20 @@ async def seed_default_agents(): """ # --- Idempotency guard: file-based marker (survives agent renames/deletes) --- seed_marker = Path(settings.AGENT_DATA_DIR) / ".seeded" - if seed_marker.exists(): - logger.info("[AgentSeeder] Seed marker found, skipping default agent creation") - return - + + # Check both marker AND database to handle inconsistent state async with async_session() as db: + # Verify agents actually exist in DB before skipping + existing_agents = await db.execute( + select(Agent).where(Agent.name.in_(["Morty", "Meeseeks"])).limit(2) + ) + agents_in_db = existing_agents.scalars().all() + + if seed_marker.exists() and len(agents_in_db) == 2: + logger.info("[AgentSeeder] Seed marker found and agents exist in DB, skipping default agent creation") + return + elif seed_marker.exists() and len(agents_in_db) < 2: + logger.warning("[AgentSeeder] Seed marker exists but agents missing from DB, will re-seed") # Get platform admin as creator admin_result = await db.execute( @@ -246,14 +257,104 @@ async def seed_default_agents(): # ── Initialize workspace files ── template_dir = Path(settings.AGENT_TEMPLATE_DIR) + # Create parent directories first with proper permissions + parent_dir = Path(settings.AGENT_DATA_DIR) + parent_dir.mkdir(parents=True, exist_ok=True) + + if os.name == 'nt': + try: + result = subprocess.run( + ['icacls', str(parent_dir), '/grant', 'Everyone:F', '/T', '/C'], + check=True, + capture_output=True, + text=True, + timeout=30 + ) + if result.returncode != 0: + logger.warning(f"icacls failed for {parent_dir}: {result.stderr}") + except Exception as e: + logger.warning(f"Failed to set permissions for {parent_dir}: {e}") + else: + try: + os.chmod(parent_dir, 0o755) + except Exception as e: + logger.warning(f"Failed to set permissions for {parent_dir}: {e}") + for agent, soul_content in [(morty, MORTY_SOUL), (meeseeks, MEESEEKS_SOUL)]: agent_dir = Path(settings.AGENT_DATA_DIR) / str(agent.id) if template_dir.exists(): + # Remove existing directory if it exists and has permission issues + if agent_dir.exists(): + try: + if os.name == 'nt': + # First take ownership, then remove + subprocess.run( + ['takeown', '/F', str(agent_dir), '/R', '/A', '/D', 'Y'], + check=False, + capture_output=True, + text=True, + timeout=60 + ) + subprocess.run( + ['powershell', '-Command', f'Remove-Item -Path "{agent_dir}" -Recurse -Force'], + check=False, + capture_output=True, + text=True, + timeout=30 + ) + else: + shutil.rmtree(str(agent_dir)) + except Exception as e: + logger.warning(f"Failed to remove existing agent_dir {agent_dir}: {e}") + + # Create fresh agent_dir with proper permissions + agent_dir.mkdir(parents=True, exist_ok=True) + + if os.name == 'nt': + try: + subprocess.run( + ['icacls', str(agent_dir), '/inheritance:r'], + check=False, + capture_output=True, + text=True, + timeout=30 + ) + subprocess.run( + ['icacls', str(agent_dir), '/grant', 'Everyone:F', '/T', '/C'], + check=False, + capture_output=True, + text=True, + timeout=60 + ) + except Exception as e: + logger.warning(f"Failed to set permissions for {agent_dir}: 
{e}") + else: + try: + os.chmod(agent_dir, 0o755) + except Exception as e: + logger.warning(f"Failed to set permissions for {agent_dir}: {e}") + # Copy the full agent template so Morty/Meeseeks get EVERY file # defined in the template: MEMORY_INDEX.md, curiosity_journal.md, # state.json, todo.json, daily_reports/, enterprise_info/, etc. - shutil.copytree(str(template_dir), str(agent_dir)) + # Use PowerShell to copy files to avoid permission issues + # Copy the full agent template so Morty/Meeseeks get EVERY file + # defined in the template: MEMORY_INDEX.md, curiosity_journal.md, + # state.json, todo.json, daily_reports/, enterprise_info/, etc. + if os.name == 'nt': + copy_cmd = f'Copy-Item -Path "{template_dir}\\*" -Destination "{agent_dir}" -Recurse -Force' + try: + subprocess.run( + ['powershell', '-Command', copy_cmd], + check=False, + capture_output=True, + text=True + ) + except Exception as e: + logger.warning(f"Failed to copy template files: {e}") + else: + shutil.copytree(str(template_dir), str(agent_dir), dirs_exist_ok=True) else: # Fallback for local dev (no Docker template mount) agent_dir.mkdir(parents=True, exist_ok=True) @@ -262,28 +363,28 @@ async def seed_default_agents(): (agent_dir / "workspace" / "knowledge_base").mkdir(exist_ok=True) (agent_dir / "memory").mkdir(exist_ok=True) - # Overlay custom soul (rich Morty/Meeseeks persona over the generic template) - (agent_dir / "soul.md").write_text(soul_content.strip() + "\n", encoding="utf-8") - - # Ensure memory.md exists (template does not include it; holds runtime context) - mem_path = agent_dir / "memory" / "memory.md" - if not mem_path.exists(): - mem_path.write_text("# Memory\n\n_Record important information and knowledge here._\n", encoding="utf-8") - - # Ensure reflections.md exists (not in agent_template; lives in app/templates) - refl_path = agent_dir / "memory" / "reflections.md" - if not refl_path.exists(): - refl_src = Path(__file__).parent.parent / "templates" / "reflections.md" - refl_path.write_text(refl_src.read_text(encoding="utf-8") if refl_src.exists() else "# Reflections Journal\n", encoding="utf-8") - - # Stamp agent identity into state.json if present - state_path = agent_dir / "state.json" - if state_path.exists(): - import json as _json - state = _json.loads(state_path.read_text()) - state["agent_id"] = str(agent.id) - state["name"] = agent.name - state_path.write_text(_json.dumps(state, ensure_ascii=False, indent=2), encoding="utf-8") + # Overlay custom soul (rich Morty/Meeseeks persona over the generic template) + (agent_dir / "soul.md").write_text(soul_content.strip() + "\n", encoding="utf-8") + + # Ensure memory.md exists (template does not include it; holds runtime context) + mem_path = agent_dir / "memory" / "memory.md" + if not mem_path.exists(): + mem_path.write_text("# Memory\n\n_Record important information and knowledge here._\n", encoding="utf-8") + + # Ensure reflections.md exists (not in agent_template; lives in app/templates) + refl_path = agent_dir / "memory" / "reflections.md" + if not refl_path.exists(): + refl_src = Path(__file__).parent.parent / "templates" / "reflections.md" + refl_path.write_text(refl_src.read_text(encoding="utf-8") if refl_src.exists() else "# Reflections Journal\n", encoding="utf-8") + + # Stamp agent identity into state.json if present + state_path = agent_dir / "state.json" + if state_path.exists(): + import json as _json + state = _json.loads(state_path.read_text()) + state["agent_id"] = str(agent.id) + state["name"] = agent.name + 
state_path.write_text(_json.dumps(state, ensure_ascii=False, indent=2), encoding="utf-8") # ── Assign skills ── all_skills_result = await db.execute( @@ -379,7 +480,30 @@ async def seed_okr_agent(): - Generates daily/weekly reports and posts them to the Plaza - Helps team members set up and maintain their focus.md files """ - seed_marker = Path(settings.AGENT_DATA_DIR) / ".seeded" + # Ensure AGENT_DATA_DIR exists with proper permissions before any file operations + agent_data_dir = Path(settings.AGENT_DATA_DIR) + agent_data_dir.mkdir(parents=True, exist_ok=True) + + if os.name == 'nt': + try: + result = subprocess.run( + ['icacls', str(agent_data_dir), '/grant', 'Everyone:F', '/T', '/C'], + check=True, + capture_output=True, + text=True, + timeout=30 + ) + if result.returncode != 0: + logger.warning(f"icacls failed for {agent_data_dir}: {result.stderr}") + except Exception as e: + logger.warning(f"Failed to set permissions for {agent_data_dir}: {e}") + else: + try: + os.chmod(agent_data_dir, 0o755) + except Exception as e: + logger.warning(f"Failed to set permissions for {agent_data_dir}: {e}") + + seed_marker = agent_data_dir / ".seeded" # Check if OKR Agent has already been seeded if seed_marker.exists(): @@ -477,14 +601,87 @@ async def seed_okr_agent(): template_dir = Path(settings.AGENT_TEMPLATE_DIR) agent_dir = Path(settings.AGENT_DATA_DIR) / str(okr_agent.id) - if template_dir.exists(): - shutil.copytree(str(template_dir), str(agent_dir)) - else: + # Create directories with proper permissions + try: + # Remove existing directory if it exists and has permission issues + if agent_dir.exists(): + try: + if os.name == 'nt': + # First take ownership, then remove + subprocess.run( + ['takeown', '/F', str(agent_dir), '/R', '/A', '/D', 'Y'], + check=False, + capture_output=True, + text=True, + timeout=60 + ) + subprocess.run( + ['powershell', '-Command', f'Remove-Item -Path "{agent_dir}" -Recurse -Force'], + check=False, + capture_output=True, + text=True, + timeout=30 + ) + else: + shutil.rmtree(str(agent_dir)) + except Exception as e: + logger.warning(f"Failed to remove existing agent_dir {agent_dir}: {e}") + + # Create fresh agent directory agent_dir.mkdir(parents=True, exist_ok=True) - (agent_dir / "skills").mkdir(exist_ok=True) - (agent_dir / "workspace").mkdir(exist_ok=True) - (agent_dir / "workspace" / "reports").mkdir(exist_ok=True) - (agent_dir / "memory").mkdir(exist_ok=True) + + # Set permissions on Windows - reset inheritance first, then grant + if os.name == 'nt': + try: + subprocess.run( + ['icacls', str(agent_dir), '/inheritance:r'], + check=False, + capture_output=True, + text=True, + timeout=30 + ) + subprocess.run( + ['icacls', str(agent_dir), '/grant', 'Everyone:F', '/T', '/C'], + check=False, + capture_output=True, + text=True, + timeout=60 + ) + except Exception as e: + logger.warning(f"Failed to set permissions for {agent_dir}: {e}") + + # Copy template files if available + if template_dir.exists(): + logger.info(f"Copying template from {template_dir} to {agent_dir}") + try: + if os.name == 'nt': + copy_cmd = f'Copy-Item -Path "{template_dir}\\*" -Destination "{agent_dir}" -Recurse -Force' + subprocess.run( + ['powershell', '-Command', copy_cmd], + check=False, + capture_output=True, + text=True + ) + else: + shutil.copytree(str(template_dir), str(agent_dir), dirs_exist_ok=True) + logger.info(f"Successfully copied template to {agent_dir}") + except Exception as e: + logger.error(f"Failed to copy template: {e}") + # Fall back to creating empty structure + (agent_dir / 
"skills").mkdir(exist_ok=True) + (agent_dir / "workspace").mkdir(exist_ok=True) + (agent_dir / "workspace" / "reports").mkdir(exist_ok=True) + (agent_dir / "memory").mkdir(exist_ok=True) + else: + # Create empty structure + (agent_dir / "skills").mkdir(exist_ok=True) + (agent_dir / "workspace").mkdir(exist_ok=True) + (agent_dir / "workspace" / "reports").mkdir(exist_ok=True) + (agent_dir / "memory").mkdir(exist_ok=True) + + except Exception as e: + logger.error(f"Error setting up workspace: {e}") + raise # Write OKR Agent soul (agent_dir / "soul.md").write_text(OKR_AGENT_SOUL.strip() + "\n", encoding="utf-8") @@ -961,8 +1158,48 @@ async def seed_okr_agent_for_tenant(tenant_id: uuid.UUID, creator_id: uuid.UUID) template_dir = Path(settings.AGENT_TEMPLATE_DIR) agent_dir = Path(settings.AGENT_DATA_DIR) / str(okr_agent.id) + # Create parent directories first with proper permissions + agent_dir.parent.mkdir(parents=True, exist_ok=True) + + if os.name == 'nt': + try: + subprocess.run( + ['icacls', str(agent_dir.parent), '/grant', 'Everyone:F', '/T', '/C'], + check=False, + capture_output=True, + text=True + ) + except Exception as e: + logger.warning(f"Failed to set permissions for {agent_dir.parent}: {e}") + else: + try: + os.chmod(agent_dir.parent, 0o755) + except Exception as e: + logger.warning(f"Failed to set permissions for {agent_dir.parent}: {e}") + if template_dir.exists(): - shutil.copytree(str(template_dir), str(agent_dir)) + # First create empty agent_dir with proper permissions + agent_dir.mkdir(parents=True, exist_ok=True) + + # Set permissions + if os.name == 'nt': + try: + subprocess.run( + ['icacls', str(agent_dir), '/grant', 'Everyone:F', '/T', '/C'], + check=False, + capture_output=True, + text=True + ) + except Exception as e: + logger.warning(f"Failed to set permissions for {agent_dir}: {e}") + else: + try: + os.chmod(agent_dir, 0o755) + except Exception as e: + logger.warning(f"Failed to set permissions for {agent_dir}: {e}") + + # Copy template files + shutil.copytree(str(template_dir), str(agent_dir), dirs_exist_ok=True) else: agent_dir.mkdir(parents=True, exist_ok=True) for sub in ("skills", "workspace", "workspace/reports", "memory"): diff --git a/backend/app/services/trigger_daemon.py b/backend/app/services/trigger_daemon.py index e4bfd1905..7e1425c99 100644 --- a/backend/app/services/trigger_daemon.py +++ b/backend/app/services/trigger_daemon.py @@ -902,7 +902,7 @@ async def on_tool_call(data): else: cleaned = final_reply - notification = f"⚡ {summary}\n\n{cleaned}" + notification = f"[TRIGGER] {summary}\n\n{cleaned}" target_session_id = delivery_target["session_id"] owner_user_id = delivery_target.get("owner_user_id") @@ -954,7 +954,7 @@ async def on_tool_call(data): "triggers": [{"name": t.name, "type": t.type} for t in triggers], }, agent_id=agent_id) - logger.info(f"⚡ Triggers fired for {agent.name}: {[t.name for t in triggers]}") + logger.info(f"[TRIGGER] Triggers fired for {agent.name}: {[t.name for t in triggers]}") except Exception as e: logger.error(f"Failed to invoke agent {agent_id} for triggers: {e}") @@ -1110,7 +1110,7 @@ def _decay_chain(): async def start_trigger_daemon(): """Start the background trigger daemon loop. 
Called from FastAPI startup.""" - logger.info("⚡ Trigger Daemon started (15s tick, heartbeat every ~60s)") + logger.info("[TRIGGER] Trigger Daemon started (15s tick, heartbeat every ~60s)") _heartbeat_counter = 0 while True: try: diff --git a/backend/entrypoint.sh b/backend/entrypoint.sh old mode 100755 new mode 100644 diff --git a/frontend/VERSION b/frontend/VERSION index 9ab8337f3..dcb0bac8a 100644 --- a/frontend/VERSION +++ b/frontend/VERSION @@ -1 +1 @@ -1.9.1 +1.8.3-beta.2 diff --git a/frontend/index.html b/frontend/index.html index eba398a27..3e4ffb325 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -7,9 +7,6 @@ Clawith - - - diff --git a/frontend/src/components/ConfirmModal.tsx b/frontend/src/components/ConfirmModal.tsx index 546d4b0a7..71b6db69b 100644 --- a/frontend/src/components/ConfirmModal.tsx +++ b/frontend/src/components/ConfirmModal.tsx @@ -21,16 +21,30 @@ export default function ConfirmModal({ open, title, message, confirmLabel = '确 if (!open) return null; return ( -
{ if (e.target === e.currentTarget) onCancel(); }}> -
+
{ if (e.target === e.currentTarget) onCancel(); }}> +

{title}

{message}

diff --git a/frontend/src/components/Dialog/DialogProvider.tsx b/frontend/src/components/Dialog/DialogProvider.tsx index 356fb3d18..f5f30edbf 100644 --- a/frontend/src/components/Dialog/DialogProvider.tsx +++ b/frontend/src/components/Dialog/DialogProvider.tsx @@ -92,27 +92,31 @@ function DialogModal({ state, onClose }: { state: NonNullable; onClo return (
{ if (e.target === e.currentTarget) onClose(false); }} >
diff --git a/frontend/src/components/PostHireSettingsModal.tsx b/frontend/src/components/PostHireSettingsModal.tsx index df4925a54..1ee98e3f6 100644 --- a/frontend/src/components/PostHireSettingsModal.tsx +++ b/frontend/src/components/PostHireSettingsModal.tsx @@ -238,25 +238,48 @@ function RadioRow({ selected, onClick, title, hint }: { selected: boolean; onCli type="button" onClick={onClick} style={{ - display: 'flex', alignItems: 'flex-start', gap: '10px', - padding: '10px 12px', textAlign: 'left', + display: 'flex', + alignItems: 'flex-start', + gap: '12px', + padding: '14px 16px', + textAlign: 'left', border: `1px solid ${selected ? 'var(--accent-primary)' : 'var(--border-subtle)'}`, - borderRadius: '8px', background: selected ? 'var(--accent-subtle, rgba(99,102,241,0.08))' : 'transparent', - cursor: 'pointer', width: '100%', + borderRadius: '8px', + background: selected ? 'var(--accent-subtle, rgba(99,102,241,0.08))' : 'transparent', + cursor: 'pointer', + width: '100%', + minHeight: '64px', + overflow: 'visible', + boxSizing: 'border-box', }} > {selected && } - - {title} - {hint} - +
+
{title}
+
{hint}
+
); } diff --git a/frontend/src/components/TalentMarketModal.tsx b/frontend/src/components/TalentMarketModal.tsx index ce2efbaed..2f5dc88cd 100644 --- a/frontend/src/components/TalentMarketModal.tsx +++ b/frontend/src/components/TalentMarketModal.tsx @@ -107,45 +107,22 @@ export default function TalentMarketModal({ open, onClose }: Props) { return (
{ if (e.target === e.currentTarget) onClose(); }} > -
+
{/* Header */} -
+
-

+

{t('talentMarket.title', isChinese ? '人才市场' : 'Talent Market')}

-

+

{t('talentMarket.subtitle', isChinese ? '挑选一位专业成员加入你的公司' : 'Pick a professional to join your company')}

{/* Search box */} -
+
{searchQuery && ( @@ -190,13 +158,7 @@ export default function TalentMarketModal({ open, onClose }: Props) {
{tabs.map((tab) => { const isActive = !isSearching && activeTab === tab.id; @@ -206,27 +168,7 @@ export default function TalentMarketModal({ open, onClose }: Props) { role="tab" aria-selected={isActive} onClick={() => { setSearchQuery(''); setActiveTab(tab.id); }} - onMouseEnter={(e) => { - if (!isActive) (e.currentTarget as HTMLButtonElement).style.color = 'var(--text-primary)'; - }} - onMouseLeave={(e) => { - if (!isActive) (e.currentTarget as HTMLButtonElement).style.color = 'var(--text-secondary)'; - }} - style={{ - padding: '14px 18px', - marginBottom: '-1px', - marginRight: '8px', - background: 'transparent', - border: 'none', - borderBottom: `2px solid ${isActive ? 'var(--text-primary)' : 'transparent'}`, - color: isActive ? 'var(--text-primary)' : 'var(--text-secondary)', - fontSize: '13px', - fontWeight: 500, - cursor: 'pointer', - whiteSpace: 'nowrap', - transition: 'color 120ms, border-color 120ms', - outline: 'none', - }} + className={`talent-market-tab${isActive ? ' active' : ''}`} > {tab.label} @@ -235,13 +177,7 @@ export default function TalentMarketModal({ open, onClose }: Props) {
{/* Cards */} -
+
{isLoading && (
{t('common.loading', 'Loading...')} @@ -271,10 +207,7 @@ export default function TalentMarketModal({ open, onClose }: Props) {
{/* Footer */} -
+
{t('talentMarket.footer', isChinese ? '点击聘用·可随时在设置中调整' : 'Hire now · adjust anything in settings later')}
@@ -302,38 +235,20 @@ function TemplateCard({ tpl, hiring, isChinese, onHire }: { : [localized.description].filter(Boolean); return ( -
-
+
+
{tpl.icon || 'AI'}
-
+
{localized.name}
-
+
{tpl.category || 'general'}
-
    +
      {bullets.slice(0, 4).map((b, i) => ( -
    • - +
    • + {b}
    • ))} @@ -356,76 +271,28 @@ function CustomCard({ onClick }: { onClick: () => void }) { return (
      { - (e.currentTarget as HTMLDivElement).style.borderColor = 'var(--accent)'; - }} - onMouseLeave={(e) => { - (e.currentTarget as HTMLDivElement).style.borderColor = 'var(--border-subtle)'; - }} + className="talent-card-custom" >