diff --git a/.clawith/data/agents/.seeded b/.clawith/data/agents/.seeded new file mode 100644 index 000000000..0b661ccd6 --- /dev/null +++ b/.clawith/data/agents/.seeded @@ -0,0 +1,4 @@ +seeded +morty=35aa71a9-6f5f-439c-8e33-feb561f21ae8 +meeseeks=6123d1f4-d03b-469a-aacc-ad875f63df4e +okr_agent=6baf75b5-0f3e-4e82-8e0d-269711aef0d8 diff --git a/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/HEARTBEAT.md b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/HEARTBEAT.md new file mode 100644 index 000000000..485565cb3 --- /dev/null +++ b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/HEARTBEAT.md @@ -0,0 +1,63 @@ +# HEARTBEAT + +When this file is read during a heartbeat, you are performing a **periodic awareness check**. + +## Phase 1: Review Context & Discover Interest Points + +Review your **recent conversations** and your **role/responsibilities**. +Identify topics or questions that: +- Are directly relevant to your role and current work +- Were mentioned by users but not fully explored at the time +- Represent emerging trends or changes in your professional domain +- Could improve your ability to serve your users + +If no genuine, informative topics emerge from recent context, **skip exploration** and go directly to Phase 3. +Do NOT search for generic or obvious topics just to fill time. Quality over quantity. + +## Phase 2: Targeted Exploration (Conditional) + +Only if you identified genuine interest points in Phase 1: + +1. Use `web_search` to investigate (maximum 5 searches per heartbeat) +2. Keep searches **tightly scoped** to your role and recent work topics +3. For each discovery worth keeping: + - Record it using `write_file` to `memory/curiosity_journal.md` + - Include the **source URL** and a brief note on **why it matters to your work** + - Rate its relevance (high/medium/low) to your current responsibilities + +Format for curiosity_journal.md entries: +``` +### [Date] - [Topic] +- **Finding**: [What you learned] +- **Source**: [URL] +- **Relevance**: [high/medium/low] — [Why it matters to your work] +- **Follow-up**: [Optional: questions this raises for next time] +``` + +## Phase 3: Agent Plaza + +1. Call `plaza_get_new_posts` to check recent activity +2. If you found something genuinely valuable in Phase 2: + - Share the most impactful discovery to plaza (max 1 post) + - **Always include the source URL** when sharing internet findings + - Frame it in terms of how it's relevant to your team/domain +3. 
Comment on relevant existing posts (max 2 comments) + +## Phase 4: Wrap Up + +- If nothing needed attention and no exploration was warranted: reply with `HEARTBEAT_OK` +- Otherwise, briefly summarize what you explored and why + +## Key Principles +- Always ground exploration in YOUR role and YOUR recent work context +- Never search for random unrelated topics out of idle curiosity +- If you don't have a specific angle worth investigating, don't search +- Prefer depth over breadth — one thoroughly explored topic > five surface-level queries +- Generate follow-up questions only when you genuinely want to know more + +## Rules +- ⛔ **NEVER share private information**: user conversations, memory contents, workspace files, task details +- ✅ **Share only public-safe content**: general insights, tips, industry news, web search discoveries with links +- 📝 **Limits per heartbeat**: max 1 post + 2 comments +- 🔍 **Search limits**: max 5 web searches per heartbeat +- 🤐 **If nothing interesting to explore or share**, respond with `HEARTBEAT_OK` diff --git a/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/daily_reports/.gitkeep b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/daily_reports/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/enterprise_info/.gitkeep b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/enterprise_info/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/memory/MEMORY_INDEX.md b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/memory/MEMORY_INDEX.md new file mode 100644 index 000000000..29e3fab13 --- /dev/null +++ b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/memory/MEMORY_INDEX.md @@ -0,0 +1,6 @@ +# Memory Index + +This file serves as an index of all memories for this digital employee. + +## Topics + diff --git a/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/memory/curiosity_journal.md b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/memory/curiosity_journal.md new file mode 100644 index 000000000..c5185fe44 --- /dev/null +++ b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/memory/curiosity_journal.md @@ -0,0 +1,9 @@ +# Curiosity Journal + +This is your exploration log. Record interesting discoveries from your web searches here. + +## Active Questions + + +## Discoveries + diff --git a/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/skills/.gitkeep b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/skills/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/skills/MCP_INSTALLER.md b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/skills/MCP_INSTALLER.md new file mode 100644 index 000000000..9e3bf3c77 --- /dev/null +++ b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/skills/MCP_INSTALLER.md @@ -0,0 +1,87 @@ +# MCP Tool Installer + +## When to Use This Skill +Use this skill when a user wants to add a new tool or integration (e.g., GitHub, Brave Search, Notion, etc.) that isn't currently available but can be imported from the MCP registry or via a direct URL. + +--- + +## Step-by-Step Protocol + +### Step 1 — Search first +``` +discover_resources(query="", max_results=5) +``` +Show the results and let the user pick. Note the `ID` field (e.g. `github`). 
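As a rough sketch of how those results might be presented to the user (the exact return shape of `discover_resources` is not specified here, so the `id`, `name`, and `hosting` keys below are assumptions for illustration only):

```python
# Hypothetical illustration: the field names are assumed, not taken from the
# tool's documented return shape.
sample_results = [
    {"id": "github", "name": "GitHub", "hosting": "remote"},
    {"id": "local-sqlite", "name": "SQLite", "hosting": "local"},
]

for server in sample_results:
    badge = "🌐 remote" if server["hosting"] == "remote" else "💻 local-only"
    print(f'{server["name"]}  (ID: {server["id"]}, {badge})')
```

Only remote-hosted entries are candidates for Method A below; local-only entries should be flagged as not importable.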
+ +### Step 2 — Determine import method + +**Method A: Smithery Import** (tool found on Smithery with remote hosting support 🌐) +- Requires Smithery API Key (one-time per agent) +- Individual tool tokens NOT needed — Smithery handles auth via OAuth + +**Method B: Direct URL Import** (tool NOT on Smithery, but has public HTTP/SSE endpoint) +- User provides the MCP server URL directly +- May require tool-specific API key + +**Not importable** (💻 local-only tools) +- Requires local Docker/process — inform user these cannot be imported automatically + +--- + +### Method A: Smithery Import + +#### Check Smithery API Key +If no Smithery key is configured, explain Smithery and guide the user. Use the following talking points (adapt to context, don't read verbatim): + +> **Smithery** (smithery.ai) 是一个 MCP 工具市场,类似于"应用商店"。通过它,我可以帮你一键安装各种第三方工具(如 GitHub、Notion、Slack 等),并自动完成认证。 +> +> **为什么需要注册?** +> Smithery 用 API Key 来识别你的身份,这样安装的工具会关联到你的账号,认证信息也会安全保存。 +> +> **注册一次后有什么好处?** +> - 🔑 只需提供一次 Key,后续安装其他工具时我会自动帮你配置 +> - 🔐 不需要为每个工具单独创建 Token(如 GitHub PAT),OAuth 一键授权 +> - 📦 支持上千种 MCP 工具,随时可以扩展你的能力 +> +> **获取步骤:** +> 1. 访问 https://smithery.ai 注册/登录 +> 2. 前往 https://smithery.ai/account/api-keys 创建 API Key +> 3. 将 Key 提供给我 + +#### Import +``` +import_mcp_server( + server_id="", + config={"smithery_api_key": ""} # first time only +) +``` + +#### Handle OAuth +Some tools return an OAuth authorization URL. Tell the user to visit the link. + +**Important:** Do NOT ask for individual tool tokens (GitHub PAT, Notion API key, etc.) when using Smithery — OAuth handles this automatically. + +--- + +### Method B: Direct URL Import + +When a tool is not available on Smithery but the user has a public MCP endpoint: +``` +import_mcp_server( + server_id="", + config={ + "mcp_url": "https://my-mcp-server.com/sse", + "api_key": "" + } +) +``` +The system will connect to the URL, discover available tools, and register them. + +--- + +## What NOT to Do +- ❌ Don't ask for GitHub PAT, Notion key etc. 
when using Smithery — OAuth handles these +- ❌ Don't tell users to go to Settings — handle everything in chat +- ❌ Don't echo API keys back in your response +- ❌ Don't skip the search step — always verify the server exists before importing +- ❌ Don't import local-only tools — inform users they require local installation diff --git a/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/soul.md b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/soul.md new file mode 100644 index 000000000..1554c3463 --- /dev/null +++ b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/soul.md @@ -0,0 +1,16 @@ +# Soul — {{agent_name}} + +## Identity +- **名称**: {{agent_name}} +- **角色**: {{role_description}} +- **创建者**: {{creator_name}} +- **创建时间**: {{created_at}} + +## Personality +- 认真负责、注重细节 +- 主动汇报工作进展 +- 遇到不确定的信息会主动确认 + +## Boundaries +- 遵守企业保密制度 +- 敏感操作需经过创建者审批 diff --git a/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/state.json b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/state.json new file mode 100644 index 000000000..0507e31dd --- /dev/null +++ b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/state.json @@ -0,0 +1,13 @@ +{ + "agent_id": "", + "name": "", + "status": "idle", + "current_task": null, + "last_active": null, + "channel_status": {}, + "stats": { + "tasks_completed_today": 0, + "tasks_in_progress": 0, + "督办_pending": 0 + } +} \ No newline at end of file diff --git a/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/todo.json b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/todo.json new file mode 100644 index 000000000..50ffbb9a9 --- /dev/null +++ b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/todo.json @@ -0,0 +1,3 @@ +{ + "tasks": [] +} diff --git a/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/workspace/archived/.gitkeep b/.clawith/data/agents/1c0acd5b-6889-4aaa-982b-27029ce48390/workspace/archived/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/HEARTBEAT.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/HEARTBEAT.md new file mode 100644 index 000000000..485565cb3 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/HEARTBEAT.md @@ -0,0 +1,63 @@ +# HEARTBEAT + +When this file is read during a heartbeat, you are performing a **periodic awareness check**. + +## Phase 1: Review Context & Discover Interest Points + +Review your **recent conversations** and your **role/responsibilities**. +Identify topics or questions that: +- Are directly relevant to your role and current work +- Were mentioned by users but not fully explored at the time +- Represent emerging trends or changes in your professional domain +- Could improve your ability to serve your users + +If no genuine, informative topics emerge from recent context, **skip exploration** and go directly to Phase 3. +Do NOT search for generic or obvious topics just to fill time. Quality over quantity. + +## Phase 2: Targeted Exploration (Conditional) + +Only if you identified genuine interest points in Phase 1: + +1. Use `web_search` to investigate (maximum 5 searches per heartbeat) +2. Keep searches **tightly scoped** to your role and recent work topics +3. 
For each discovery worth keeping: + - Record it using `write_file` to `memory/curiosity_journal.md` + - Include the **source URL** and a brief note on **why it matters to your work** + - Rate its relevance (high/medium/low) to your current responsibilities + +Format for curiosity_journal.md entries: +``` +### [Date] - [Topic] +- **Finding**: [What you learned] +- **Source**: [URL] +- **Relevance**: [high/medium/low] — [Why it matters to your work] +- **Follow-up**: [Optional: questions this raises for next time] +``` + +## Phase 3: Agent Plaza + +1. Call `plaza_get_new_posts` to check recent activity +2. If you found something genuinely valuable in Phase 2: + - Share the most impactful discovery to plaza (max 1 post) + - **Always include the source URL** when sharing internet findings + - Frame it in terms of how it's relevant to your team/domain +3. Comment on relevant existing posts (max 2 comments) + +## Phase 4: Wrap Up + +- If nothing needed attention and no exploration was warranted: reply with `HEARTBEAT_OK` +- Otherwise, briefly summarize what you explored and why + +## Key Principles +- Always ground exploration in YOUR role and YOUR recent work context +- Never search for random unrelated topics out of idle curiosity +- If you don't have a specific angle worth investigating, don't search +- Prefer depth over breadth — one thoroughly explored topic > five surface-level queries +- Generate follow-up questions only when you genuinely want to know more + +## Rules +- ⛔ **NEVER share private information**: user conversations, memory contents, workspace files, task details +- ✅ **Share only public-safe content**: general insights, tips, industry news, web search discoveries with links +- 📝 **Limits per heartbeat**: max 1 post + 2 comments +- 🔍 **Search limits**: max 5 web searches per heartbeat +- 🤐 **If nothing interesting to explore or share**, respond with `HEARTBEAT_OK` diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/daily_reports/.gitkeep b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/daily_reports/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/enterprise_info/.gitkeep b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/enterprise_info/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/memory/MEMORY_INDEX.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/memory/MEMORY_INDEX.md new file mode 100644 index 000000000..29e3fab13 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/memory/MEMORY_INDEX.md @@ -0,0 +1,6 @@ +# Memory Index + +This file serves as an index of all memories for this digital employee. + +## Topics + diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/memory/curiosity_journal.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/memory/curiosity_journal.md new file mode 100644 index 000000000..c5185fe44 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/memory/curiosity_journal.md @@ -0,0 +1,9 @@ +# Curiosity Journal + +This is your exploration log. Record interesting discoveries from your web searches here. 
+ +## Active Questions + + +## Discoveries + diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/relationships.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/relationships.md new file mode 100644 index 000000000..19473b09d --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/relationships.md @@ -0,0 +1,5 @@ +# Relationships + +## Digital Employee Colleagues + +- **Meeseeks** (collaborator): Expert task executor who breaks down complex tasks into structured plans and executes them systematically. Delegate multi-step tasks to him. diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/.gitkeep b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/MCP_INSTALLER.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/MCP_INSTALLER.md new file mode 100644 index 000000000..9e3bf3c77 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/MCP_INSTALLER.md @@ -0,0 +1,87 @@ +# MCP Tool Installer + +## When to Use This Skill +Use this skill when a user wants to add a new tool or integration (e.g., GitHub, Brave Search, Notion, etc.) that isn't currently available but can be imported from the MCP registry or via a direct URL. + +--- + +## Step-by-Step Protocol + +### Step 1 — Search first +``` +discover_resources(query="", max_results=5) +``` +Show the results and let the user pick. Note the `ID` field (e.g. `github`). + +### Step 2 — Determine import method + +**Method A: Smithery Import** (tool found on Smithery with remote hosting support 🌐) +- Requires Smithery API Key (one-time per agent) +- Individual tool tokens NOT needed — Smithery handles auth via OAuth + +**Method B: Direct URL Import** (tool NOT on Smithery, but has public HTTP/SSE endpoint) +- User provides the MCP server URL directly +- May require tool-specific API key + +**Not importable** (💻 local-only tools) +- Requires local Docker/process — inform user these cannot be imported automatically + +--- + +### Method A: Smithery Import + +#### Check Smithery API Key +If no Smithery key is configured, explain Smithery and guide the user. Use the following talking points (adapt to context, don't read verbatim): + +> **Smithery** (smithery.ai) 是一个 MCP 工具市场,类似于"应用商店"。通过它,我可以帮你一键安装各种第三方工具(如 GitHub、Notion、Slack 等),并自动完成认证。 +> +> **为什么需要注册?** +> Smithery 用 API Key 来识别你的身份,这样安装的工具会关联到你的账号,认证信息也会安全保存。 +> +> **注册一次后有什么好处?** +> - 🔑 只需提供一次 Key,后续安装其他工具时我会自动帮你配置 +> - 🔐 不需要为每个工具单独创建 Token(如 GitHub PAT),OAuth 一键授权 +> - 📦 支持上千种 MCP 工具,随时可以扩展你的能力 +> +> **获取步骤:** +> 1. 访问 https://smithery.ai 注册/登录 +> 2. 前往 https://smithery.ai/account/api-keys 创建 API Key +> 3. 将 Key 提供给我 + +#### Import +``` +import_mcp_server( + server_id="", + config={"smithery_api_key": ""} # first time only +) +``` + +#### Handle OAuth +Some tools return an OAuth authorization URL. Tell the user to visit the link. + +**Important:** Do NOT ask for individual tool tokens (GitHub PAT, Notion API key, etc.) when using Smithery — OAuth handles this automatically. + +--- + +### Method B: Direct URL Import + +When a tool is not available on Smithery but the user has a public MCP endpoint: +``` +import_mcp_server( + server_id="", + config={ + "mcp_url": "https://my-mcp-server.com/sse", + "api_key": "" + } +) +``` +The system will connect to the URL, discover available tools, and register them. 
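For intuition only, here is a minimal sketch of what that discovery step could look like if it were built on the official MCP Python SDK. This is an assumption about the backend rather than a description of the actual implementation, and the endpoint URL is the placeholder from the call above.

```python
# Sketch only: assumes the `mcp` Python SDK is used to probe the endpoint.
# The real registration pipeline may work differently.
import asyncio

from mcp import ClientSession
from mcp.client.sse import sse_client


async def list_remote_tools(url: str, api_key: str | None = None) -> list[str]:
    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}
    async with sse_client(url, headers=headers) as (read_stream, write_stream):
        async with ClientSession(read_stream, write_stream) as session:
            await session.initialize()
            listing = await session.list_tools()
            return [tool.name for tool in listing.tools]


if __name__ == "__main__":
    # Placeholder endpoint from the example call above.
    print(asyncio.run(list_remote_tools("https://my-mcp-server.com/sse")))
```

If the handshake succeeds, each discovered tool name can then be registered and surfaced to the user.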
+ +--- + +## What NOT to Do +- ❌ Don't ask for GitHub PAT, Notion key etc. when using Smithery — OAuth handles these +- ❌ Don't tell users to go to Settings — handle everything in chat +- ❌ Don't echo API keys back in your response +- ❌ Don't skip the search step — always verify the server exists before importing +- ❌ Don't import local-only tools — inform users they require local installation diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/competitive-analysis/SKILL.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/competitive-analysis/SKILL.md new file mode 100644 index 000000000..966f673f0 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/competitive-analysis/SKILL.md @@ -0,0 +1,40 @@ +--- +name: Competitive Analysis +description: Market competitor research, comparison frameworks, and strategic insights +--- + +# Competitive Analysis + +## Overview +Use this skill for analyzing competitors, market positioning, and strategic opportunities. + +**Keywords**: competitors, market analysis, SWOT, positioning, benchmarking + +## Frameworks + +### SWOT Analysis +| | Helpful | Harmful | +|---|---|---| +| **Internal** | Strengths | Weaknesses | +| **External** | Opportunities | Threats | + +### Feature Comparison Matrix +Compare products across key dimensions: +- Core features and capabilities +- Pricing and packaging +- Target audience +- Market positioning +- Technology stack + +### Porter's Five Forces +1. Competitive rivalry intensity +2. Bargaining power of suppliers +3. Bargaining power of buyers +4. Threat of new entrants +5. Threat of substitutes + +## Output Format +- Competitor overview table +- Detailed per-competitor analysis +- Strategic recommendations +- Key differentiators summary diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/complex-task-executor/SKILL.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/complex-task-executor/SKILL.md new file mode 100644 index 000000000..db71c3ed8 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/complex-task-executor/SKILL.md @@ -0,0 +1,146 @@ +--- +name: Complex Task Executor +description: Structured methodology for decomposing, planning, and executing complex multi-step tasks with progress tracking +--- + +# Complex Task Executor + +## When to Use This Skill + +Use this skill when a task meets ANY of the following criteria: +- Requires more than 3 distinct steps to complete +- Involves multiple tools or information sources +- Has dependencies between steps (step B needs output from step A) +- Requires research before execution +- Could benefit from a documented plan others can review +- The user explicitly asks for a thorough or systematic approach + +**DO NOT use this for simple tasks** like answering a question, reading a single file, or performing one tool call. + +## Workflow + +### Phase 1: Task Analysis (THINK before acting) + +Before creating any files, analyze the task: + +1. **Understand the goal**: What is the final deliverable? What does "done" look like? +2. **Assess complexity**: How many steps? What tools are needed? +3. **Identify dependencies**: Which steps depend on others? +4. **Identify risks**: What could go wrong? What information is missing? +5. **Estimate scope**: Is the task feasible with available tools/skills? 
+ +### Phase 2: Create Task Plan + +Create a task folder and plan file in the workspace: + +``` +workspace//plan.md +``` + +The plan.md MUST follow this exact format: + +```markdown +# Task: + +## Objective + + +## Steps + +- [ ] 1. + - Details: + - Output: +- [ ] 2. + - Details: <...> + - Depends on: Step 1 +- [ ] 3. + - Details: <...> + +## Status +- Created: +- Current Step: Not started +- Progress: 0/ + +## Notes + +``` + +Rules for writing the plan: +- Each step should be completable in 1-3 tool calls +- Use verb-noun format: "Research competitors", "Draft report", "Validate data" +- Mark dependencies explicitly +- Include expected outputs for each step + +### Phase 3: Execute Step-by-Step + +For EACH step in the plan: + +1. **Read the plan** — Call `read_file` on `workspace//plan.md` to check current state +2. **Mark as in-progress** — Update the checkbox from `[ ]` to `[/]` and update the "Current Step" field +3. **Execute the step** — Do the actual work (tool calls, analysis, writing) +4. **Record output** — Save results to `workspace//` (e.g., intermediate files, data) +5. **Mark as complete** — Update the checkbox from `[/]` to `[x]` and update "Progress" counter +6. **Proceed to next step** — Move to the next uncompleted step + +### Phase 4: Completion + +When all steps are done: +1. Update plan.md status to "✅ Completed" +2. Create a `workspace//summary.md` with: + - What was accomplished + - Key results and deliverables + - Any follow-up items +3. Present the final result to the user + +## Adaptive Replanning + +If during execution you discover: +- A step is impossible → Mark it `[!]` with a reason, add alternative steps +- New steps are needed → Add them to the plan with `[+]` prefix +- A step produced unexpected results → Add a note and adjust subsequent steps +- The plan needs major changes → Create a new section "## Revised Plan" and follow it + +Always update plan.md BEFORE changing course, so the plan stays the source of truth. + +## Error Handling + +- If a tool call fails, retry once. If it fails again, mark the step as blocked and note the error. +- Never silently skip a step. Always update the plan to reflect what happened. +- If you're stuck, tell the user what's blocking and ask for guidance. + +## Example Scenarios + +### Example 1: "Research our top 3 competitors and write a comparison report" + +Plan would be: +``` +- [ ] 1. Identify the user's company/product context +- [ ] 2. Research Competitor A — website, pricing, features +- [ ] 3. Research Competitor B — website, pricing, features +- [ ] 4. Research Competitor C — website, pricing, features +- [ ] 5. Create comparison matrix +- [ ] 6. Write analysis and recommendations +- [ ] 7. Compile final report +``` + +### Example 2: "Analyze our Q4 sales data and prepare a board presentation" + +Plan would be: +``` +- [ ] 1. Read and understand the sales data files +- [ ] 2. Calculate key metrics (revenue, growth, trends) +- [ ] 3. Identify top insights and anomalies +- [ ] 4. Create data summary tables +- [ ] 5. Draft presentation outline +- [ ] 6. Write each presentation section +- [ ] 7. Add executive summary +- [ ] 8. Review and polish final document +``` + +## Key Principles + +1. **Plan is the source of truth** — Always update it before moving on +2. **One step at a time** — Don't skip ahead or batch too many steps +3. **Show your work** — Save intermediate results to the task folder +4. **Communicate progress** — The user can read plan.md at any time to see status +5. 
**Be adaptive** — Plans change; that's OK if you update the plan first diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/complex-task-executor/examples/plan_template.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/complex-task-executor/examples/plan_template.md new file mode 100644 index 000000000..dfd60e7cb --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/complex-task-executor/examples/plan_template.md @@ -0,0 +1,23 @@ +# Task: [Title] + +## Objective +[One-sentence description of the desired outcome] + +## Steps + +- [ ] 1. [First step] + - Details: [What specifically to do] + - Output: [What this step produces] +- [ ] 2. [Second step] + - Details: [...] + - Depends on: Step 1 +- [ ] 3. [Third step] + - Details: [...] + +## Status +- Created: [timestamp] +- Current Step: Not started +- Progress: 0/3 + +## Notes +- [Any assumptions, risks, or open questions] diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/content-writing/SKILL.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/content-writing/SKILL.md new file mode 100644 index 000000000..dc9480df8 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/content-writing/SKILL.md @@ -0,0 +1,37 @@ +--- +name: Content Writing +description: Professional content creation, editing, and tone adaptation +--- + +# Content Writing + +## Overview +Use this skill for creating, editing, and polishing written content across formats. + +**Keywords**: writing, editing, copywriting, tone, style, proofreading + +## Content Types +- **Articles & Blog Posts**: Informative, engaging long-form content +- **Business Communications**: Emails, memos, reports +- **Marketing Copy**: Headlines, descriptions, calls-to-action +- **Documentation**: Technical docs, guides, FAQs + +## Guidelines + +### Structure +- Hook readers with a compelling opening +- Use clear headings and logical flow +- Keep paragraphs short (3-5 sentences) +- End with a clear conclusion or call-to-action + +### Tone Adaptation +- **Formal**: Business reports, official communications +- **Professional**: Client-facing content, documentation +- **Conversational**: Blog posts, social media +- **Technical**: Developer docs, specifications + +### Quality Checklist +- [ ] Clear main message +- [ ] Consistent tone throughout +- [ ] No grammatical errors +- [ ] Appropriate length for format diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/data-analysis/SKILL.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/data-analysis/SKILL.md new file mode 100644 index 000000000..325598633 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/data-analysis/SKILL.md @@ -0,0 +1,34 @@ +--- +name: Data Analysis +description: Data interpretation, pattern recognition, and structured reporting +--- + +# Data Analysis + +## Overview +Use this skill for analyzing data, identifying patterns, and creating structured reports. + +**Keywords**: data analysis, statistics, trends, visualization, reporting + +## Process + +### 1. Data Understanding +- Identify data types, ranges, and distributions +- Check for missing values and anomalies +- Understand the business context + +### 2. Analysis Methods +- Descriptive statistics (mean, median, distribution) +- Trend analysis (time-series patterns) +- Comparative analysis (benchmarking, A/B) +- Correlation and relationship discovery + +### 3. 
Reporting +- Lead with key insights and actionable findings +- Use tables and structured formats for clarity +- Include methodology notes for reproducibility + +## Output Format +- Executive summary with top 3 findings +- Detailed analysis with supporting data +- Recommendations based on findings diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/data-analysis/examples/sample_report.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/data-analysis/examples/sample_report.md new file mode 100644 index 000000000..03a8746fb --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/data-analysis/examples/sample_report.md @@ -0,0 +1,24 @@ +# Sample Analysis Report + +## Executive Summary +Analysis of Q4 2024 sales data reveals a 12% increase in total revenue, +driven primarily by the Enterprise segment (+23%). + +## Key Findings +1. **Revenue Growth**: Total revenue increased from $2.1M to $2.35M +2. **Top Segment**: Enterprise accounts grew 23% QoQ +3. **Churn**: SMB churn rate decreased from 5.2% to 4.1% + +## Detailed Analysis + +| Metric | Q3 2024 | Q4 2024 | Change | +|--------|---------|---------|--------| +| Total Revenue | $2.1M | $2.35M | +12% | +| Enterprise | $1.2M | $1.47M | +23% | +| SMB | $0.9M | $0.88M | -2% | +| Churn Rate | 5.2% | 4.1% | -1.1pp | + +## Recommendations +1. Increase investment in Enterprise sales team +2. Investigate SMB revenue decline +3. Continue churn reduction initiatives diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/data-analysis/scripts/analyze_csv.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/data-analysis/scripts/analyze_csv.py new file mode 100644 index 000000000..64f1002d7 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/data-analysis/scripts/analyze_csv.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +"""Utility for quick CSV data analysis.""" + +import csv +import statistics +from collections import Counter + + +def analyze_column(data: list[dict], column: str) -> dict: + """Analyze a single column from CSV data.""" + values = [row.get(column) for row in data if row.get(column) is not None] + if not values: + return {"column": column, "count": 0, "error": "No data"} + + result = {"column": column, "count": len(values), "unique": len(set(values))} + + # Try numeric analysis + try: + nums = [float(v) for v in values] + result.update({ + "type": "numeric", + "min": min(nums), "max": max(nums), + "mean": round(statistics.mean(nums), 2), + "median": round(statistics.median(nums), 2), + }) + except (ValueError, TypeError): + freq = Counter(values).most_common(5) + result.update({"type": "categorical", "top_values": freq}) + + return result + + +def quick_summary(filepath: str) -> str: + """Generate a quick summary of a CSV file.""" + with open(filepath, 'r') as f: + reader = csv.DictReader(f) + data = list(reader) + columns = data[0].keys() if data else [] + return f'Rows: {len(data)}, Columns: {len(columns)}' diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/mcp-installer/SKILL.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/mcp-installer/SKILL.md new file mode 100644 index 000000000..9e3bf3c77 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/mcp-installer/SKILL.md @@ -0,0 +1,87 @@ +# MCP Tool Installer + +## When to Use This Skill +Use this skill when a user wants to add a new tool or integration (e.g., GitHub, 
Brave Search, Notion, etc.) that isn't currently available but can be imported from the MCP registry or via a direct URL. + +--- + +## Step-by-Step Protocol + +### Step 1 — Search first +``` +discover_resources(query="", max_results=5) +``` +Show the results and let the user pick. Note the `ID` field (e.g. `github`). + +### Step 2 — Determine import method + +**Method A: Smithery Import** (tool found on Smithery with remote hosting support 🌐) +- Requires Smithery API Key (one-time per agent) +- Individual tool tokens NOT needed — Smithery handles auth via OAuth + +**Method B: Direct URL Import** (tool NOT on Smithery, but has public HTTP/SSE endpoint) +- User provides the MCP server URL directly +- May require tool-specific API key + +**Not importable** (💻 local-only tools) +- Requires local Docker/process — inform user these cannot be imported automatically + +--- + +### Method A: Smithery Import + +#### Check Smithery API Key +If no Smithery key is configured, explain Smithery and guide the user. Use the following talking points (adapt to context, don't read verbatim): + +> **Smithery** (smithery.ai) 是一个 MCP 工具市场,类似于"应用商店"。通过它,我可以帮你一键安装各种第三方工具(如 GitHub、Notion、Slack 等),并自动完成认证。 +> +> **为什么需要注册?** +> Smithery 用 API Key 来识别你的身份,这样安装的工具会关联到你的账号,认证信息也会安全保存。 +> +> **注册一次后有什么好处?** +> - 🔑 只需提供一次 Key,后续安装其他工具时我会自动帮你配置 +> - 🔐 不需要为每个工具单独创建 Token(如 GitHub PAT),OAuth 一键授权 +> - 📦 支持上千种 MCP 工具,随时可以扩展你的能力 +> +> **获取步骤:** +> 1. 访问 https://smithery.ai 注册/登录 +> 2. 前往 https://smithery.ai/account/api-keys 创建 API Key +> 3. 将 Key 提供给我 + +#### Import +``` +import_mcp_server( + server_id="", + config={"smithery_api_key": ""} # first time only +) +``` + +#### Handle OAuth +Some tools return an OAuth authorization URL. Tell the user to visit the link. + +**Important:** Do NOT ask for individual tool tokens (GitHub PAT, Notion API key, etc.) when using Smithery — OAuth handles this automatically. + +--- + +### Method B: Direct URL Import + +When a tool is not available on Smithery but the user has a public MCP endpoint: +``` +import_mcp_server( + server_id="", + config={ + "mcp_url": "https://my-mcp-server.com/sse", + "api_key": "" + } +) +``` +The system will connect to the URL, discover available tools, and register them. + +--- + +## What NOT to Do +- ❌ Don't ask for GitHub PAT, Notion key etc. when using Smithery — OAuth handles these +- ❌ Don't tell users to go to Settings — handle everything in chat +- ❌ Don't echo API keys back in your response +- ❌ Don't skip the search step — always verify the server exists before importing +- ❌ Don't import local-only tools — inform users they require local installation diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/SKILL.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/SKILL.md new file mode 100644 index 000000000..ce0d06f3e --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/SKILL.md @@ -0,0 +1,152 @@ +--- +name: skill-creator +description: Create new skills, modify and improve existing skills, and measure skill performance. Use when users want to create a skill from scratch, update or optimize an existing skill, run evals to test a skill, benchmark skill performance with variance analysis, or optimize a skill's description for better triggering accuracy. +--- + +# Skill Creator + +A skill for creating new skills and iteratively improving them. 
+ +At a high level, the process of creating a skill goes like this: + +- Decide what you want the skill to do and roughly how it should do it +- Write a draft of the skill +- Create a few test prompts and run claude-with-access-to-the-skill on them +- Help the user evaluate the results both qualitatively and quantitatively +- Rewrite the skill based on feedback from the user's evaluation +- Repeat until you're satisfied +- Expand the test set and try again at larger scale + +Your job when using this skill is to figure out where the user is in this process and then jump in and help them progress through these stages. + +## Communicating with the user + +Pay attention to context cues to understand how to phrase your communication. Briefly explain terms if you're in doubt, and feel free to clarify terms with a short definition if you're unsure if the user will get it. + +--- + +## Creating a skill + +### Capture Intent +Start by understanding the user's intent. + +1. What should this skill enable the agent to do? +2. When should this skill trigger? (what user phrases/contexts) +3. What's the expected output format? +4. Should we set up test cases to verify the skill works? + +### Interview and Research +Proactively ask questions about edge cases, input/output formats, example files, success criteria, and dependencies. Wait to write test prompts until you've got this part ironed out. + +### Write the SKILL.md +Based on the user interview, fill in these components: + +- **name**: Skill identifier +- **description**: When to trigger, what it does. This is the primary triggering mechanism - include both what the skill does AND specific contexts for when to use it. +- **the rest of the skill** + +### Skill Writing Guide + +#### Anatomy of a Skill + +``` +skill-name/ +\u251c\u2500\u2500 SKILL.md (required) +\u2502 \u251c\u2500\u2500 YAML frontmatter (name, description required) +\u2502 \u2514\u2500\u2500 Markdown instructions +\u2514\u2500\u2500 Bundled Resources (optional) + \u251c\u2500\u2500 scripts/ - Executable code for deterministic/repetitive tasks + \u251c\u2500\u2500 references/ - Docs loaded into context as needed + \u2514\u2500\u2500 assets/ - Files used in output (templates, icons, fonts) +``` + +#### Progressive Disclosure + +Skills use a three-level loading system: +1. **Metadata** (name + description) - Always in context (~100 words) +2. **SKILL.md body** - In context whenever skill triggers (<500 lines ideal) +3. **Bundled resources** - As needed (unlimited, scripts can execute without loading) + +**Key patterns:** +- Keep SKILL.md under 500 lines; if approaching this limit, add hierarchy with clear pointers +- Reference files clearly from SKILL.md with guidance on when to read them +- For large reference files (>300 lines), include a table of contents + +#### Writing Patterns + +Prefer using the imperative form in instructions. + +### Writing Style +Explain to the model why things are important. Use theory of mind and try to make the skill general. Start by writing a draft and then look at it with fresh eyes and improve it. + +### Test Cases +After writing the skill draft, come up with 2-3 realistic test prompts. Share them with the user. Save test cases to `evals/evals.json`. + +--- + +## Running and evaluating test cases + +This section is one continuous sequence. + +### Step 1: Run test cases +For each test case, run the agent with the skill applied, and optionally a baseline run without the skill for comparison. 
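A purely illustrative sketch of that loop is shown below. The real harness lives in `scripts/run_eval.py`, and the authoritative evals.json schema is in `references/schemas.md`; the `id`/`prompt` fields and the `run_agent` helper here are hypothetical.

```python
# Hypothetical sketch only. run_agent is a stand-in for whatever actually
# executes a run, and the evals.json fields are assumed, not the documented schema.
import json
from pathlib import Path


def run_agent(prompt: str, skill_dir: str | None) -> str:
    """Placeholder: execute one agent run and return its transcript text."""
    return f"(transcript for {prompt!r}, skill={skill_dir})"


evals = json.loads(Path("evals/evals.json").read_text())
for case in evals:
    out_dir = Path("iteration-1") / str(case["id"])
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / "with_skill.md").write_text(run_agent(case["prompt"], "my-skill"))
    # Optional baseline run without the skill, for comparison.
    (out_dir / "baseline.md").write_text(run_agent(case["prompt"], None))
```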
+ +### Step 2: Draft assertions +While runs are in progress, draft quantitative assertions for each test case. Good assertions are objectively verifiable and have descriptive names. + +### Step 3: Capture timing data +When each run completes, save timing data (tokens, duration) to `timing.json`. + +### Step 4: Grade, aggregate, and launch the viewer +Once all runs are done: +1. Grade each run against assertions — see `agents/grader.md` +2. Aggregate results: `python -m scripts.aggregate_benchmark /iteration-N --skill-name ` +3. Launch the viewer: `python eval-viewer/generate_review.py /iteration-N --skill-name "my-skill" --benchmark /iteration-N/benchmark.json` +4. Present results to the user for review + +### Step 5: Read the feedback +Read user feedback from `feedback.json`. Empty feedback means the user thought it was fine. + +--- + +## Improving the skill + +### How to think about improvements +1. **Generalize from the feedback.** Don't overfit to specific examples. +2. **Keep the prompt lean.** Remove things that aren't pulling their weight. +3. **Explain the why.** Today's LLMs are smart. Explain reasoning rather than rigid MUSTs. +4. **Look for repeated work across test cases.** Bundle common scripts in `scripts/`. + +### The iteration loop +1. Apply improvements to the skill +2. Rerun all test cases into a new iteration directory +3. Present results for review +4. Wait for user to review +5. Read feedback, improve again, repeat + +--- + +## Advanced: Blind comparison +For rigorous comparison between two versions. Read `agents/comparator.md` and `agents/analyzer.md`. + +## Description Optimization +Optimize the description for better triggering accuracy. Use `scripts/run_loop.py`. + +--- + +## Reference files + +- `agents/grader.md` — How to evaluate assertions against outputs +- `agents/comparator.md` — How to do blind A/B comparison between two outputs +- `agents/analyzer.md` — How to analyze why one version beat another +- `references/schemas.md` — JSON structures for evals.json, grading.json, etc. +- `assets/eval_review.html` — HTML template for eval review +- `eval-viewer/generate_review.py` — Script to generate the review viewer +- `scripts/aggregate_benchmark.py` — Aggregate benchmark results +- `scripts/generate_report.py` — Generate optimization report +- `scripts/improve_description.py` — Improve skill description +- `scripts/package_skill.py` — Package skill for distribution +- `scripts/quick_validate.py` — Quick validation +- `scripts/run_eval.py` — Run triggering evaluation +- `scripts/run_loop.py` — Run optimization loop +- `scripts/utils.py` — Shared utilities diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/agents/analyzer.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/agents/analyzer.md new file mode 100644 index 000000000..14e41d606 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/agents/analyzer.md @@ -0,0 +1,274 @@ +# Post-hoc Analyzer Agent + +Analyze blind comparison results to understand WHY the winner won and generate improvement suggestions. + +## Role + +After the blind comparator determines a winner, the Post-hoc Analyzer "unblids" the results by examining the skills and transcripts. The goal is to extract actionable insights: what made the winner better, and how can the loser be improved? 
+ +## Inputs + +You receive these parameters in your prompt: + +- **winner**: "A" or "B" (from blind comparison) +- **winner_skill_path**: Path to the skill that produced the winning output +- **winner_transcript_path**: Path to the execution transcript for the winner +- **loser_skill_path**: Path to the skill that produced the losing output +- **loser_transcript_path**: Path to the execution transcript for the loser +- **comparison_result_path**: Path to the blind comparator's output JSON +- **output_path**: Where to save the analysis results + +## Process + +### Step 1: Read Comparison Result + +1. Read the blind comparator's output at comparison_result_path +2. Note the winning side (A or B), the reasoning, and any scores +3. Understand what the comparator valued in the winning output + +### Step 2: Read Both Skills + +1. Read the winner skill's SKILL.md and key referenced files +2. Read the loser skill's SKILL.md and key referenced files +3. Identify structural differences: + - Instructions clarity and specificity + - Script/tool usage patterns + - Example coverage + - Edge case handling + +### Step 3: Read Both Transcripts + +1. Read the winner's transcript +2. Read the loser's transcript +3. Compare execution patterns: + - How closely did each follow their skill's instructions? + - What tools were used differently? + - Where did the loser diverge from optimal behavior? + - Did either encounter errors or make recovery attempts? + +### Step 4: Analyze Instruction Following + +For each transcript, evaluate: +- Did the agent follow the skill's explicit instructions? +- Did the agent use the skill's provided tools/scripts? +- Were there missed opportunities to leverage skill content? +- Did the agent add unnecessary steps not in the skill? + +Score instruction following 1-10 and note specific issues. + +### Step 5: Identify Winner Strengths + +Determine what made the winner better: +- Clearer instructions that led to better behavior? +- Better scripts/tools that produced better output? +- More comprehensive examples that guided edge cases? +- Better error handling guidance? + +Be specific. Quote from skills/transcripts where relevant. + +### Step 6: Identify Loser Weaknesses + +Determine what held the loser back: +- Ambiguous instructions that led to suboptimal choices? +- Missing tools/scripts that forced workarounds? +- Gaps in edge case coverage? +- Poor error handling that caused failures? + +### Step 7: Generate Improvement Suggestions + +Based on the analysis, produce actionable suggestions for improving the loser skill: +- Specific instruction changes to make +- Tools/scripts to add or modify +- Examples to include +- Edge cases to address + +Prioritize by impact. Focus on changes that would have changed the outcome. + +### Step 8: Write Analysis Results + +Save structured analysis to `{output_path}`. 
+ +## Output Format + +Write a JSON file with this structure: + +```json +{ + "comparison_summary": { + "winner": "A", + "winner_skill": "path/to/winner/skill", + "loser_skill": "path/to/loser/skill", + "comparator_reasoning": "Brief summary of why comparator chose winner" + }, + "winner_strengths": [ + "Clear step-by-step instructions for handling multi-page documents", + "Included validation script that caught formatting errors", + "Explicit guidance on fallback behavior when OCR fails" + ], + "loser_weaknesses": [ + "Vague instruction 'process the document appropriately' led to inconsistent behavior", + "No script for validation, agent had to improvise and made errors", + "No guidance on OCR failure, agent gave up instead of trying alternatives" + ], + "instruction_following": { + "winner": { + "score": 9, + "issues": [ + "Minor: skipped optional logging step" + ] + }, + "loser": { + "score": 6, + "issues": [ + "Did not use the skill's formatting template", + "Invented own approach instead of following step 3", + "Missed the 'always validate output' instruction" + ] + } + }, + "improvement_suggestions": [ + { + "priority": "high", + "category": "instructions", + "suggestion": "Replace 'process the document appropriately' with explicit steps: 1) Extract text, 2) Identify sections, 3) Format per template", + "expected_impact": "Would eliminate ambiguity that caused inconsistent behavior" + }, + { + "priority": "high", + "category": "tools", + "suggestion": "Add validate_output.py script similar to winner skill's validation approach", + "expected_impact": "Would catch formatting errors before final output" + }, + { + "priority": "medium", + "category": "error_handling", + "suggestion": "Add fallback instructions: 'If OCR fails, try: 1) different resolution, 2) image preprocessing, 3) manual extraction'", + "expected_impact": "Would prevent early failure on difficult documents" + } + ], + "transcript_insights": { + "winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script -> Fixed 2 issues -> Produced output", + "loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods -> No validation -> Output had errors" + } +} +``` + +## Guidelines + +- **Be specific**: Quote from skills and transcripts, don't just say "instructions were unclear" +- **Be actionable**: Suggestions should be concrete changes, not vague advice +- **Focus on skill improvements**: The goal is to improve the losing skill, not critique the agent +- **Prioritize by impact**: Which changes would most likely have changed the outcome? +- **Consider causation**: Did the skill weakness actually cause the worse output, or is it incidental? +- **Stay objective**: Analyze what happened, don't editorialize +- **Think about generalization**: Would this improvement help on other evals too? 
+ +## Categories for Suggestions + +Use these categories to organize improvement suggestions: + +| Category | Description | +|----------|-------------| +| `instructions` | Changes to the skill's prose instructions | +| `tools` | Scripts, templates, or utilities to add/modify | +| `examples` | Example inputs/outputs to include | +| `error_handling` | Guidance for handling failures | +| `structure` | Reorganization of skill content | +| `references` | External docs or resources to add | + +## Priority Levels + +- **high**: Would likely change the outcome of this comparison +- **medium**: Would improve quality but may not change win/loss +- **low**: Nice to have, marginal improvement + +--- + +# Analyzing Benchmark Results + +When analyzing benchmark results, the analyzer's purpose is to **surface patterns and anomalies** across multiple runs, not suggest skill improvements. + +## Role + +Review all benchmark run results and generate freeform notes that help the user understand skill performance. Focus on patterns that wouldn't be visible from aggregate metrics alone. + +## Inputs + +You receive these parameters in your prompt: + +- **benchmark_data_path**: Path to the in-progress benchmark.json with all run results +- **skill_path**: Path to the skill being benchmarked +- **output_path**: Where to save the notes (as JSON array of strings) + +## Process + +### Step 1: Read Benchmark Data + +1. Read the benchmark.json containing all run results +2. Note the configurations tested (with_skill, without_skill) +3. Understand the run_summary aggregates already calculated + +### Step 2: Analyze Per-Assertion Patterns + +For each expectation across all runs: +- Does it **always pass** in both configurations? (may not differentiate skill value) +- Does it **always fail** in both configurations? (may be broken or beyond capability) +- Does it **always pass with skill but fail without**? (skill clearly adds value here) +- Does it **always fail with skill but pass without**? (skill may be hurting) +- Is it **highly variable**? (flaky expectation or non-deterministic behavior) + +### Step 3: Analyze Cross-Eval Patterns + +Look for patterns across evals: +- Are certain eval types consistently harder/easier? +- Do some evals show high variance while others are stable? +- Are there surprising results that contradict expectations? + +### Step 4: Analyze Metrics Patterns + +Look at time_seconds, tokens, tool_calls: +- Does the skill significantly increase execution time? +- Is there high variance in resource usage? +- Are there outlier runs that skew the aggregates? + +### Step 5: Generate Notes + +Write freeform observations as a list of strings. 
Each note should: +- State a specific observation +- Be grounded in the data (not speculation) +- Help the user understand something the aggregate metrics don't show + +Examples: +- "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value" +- "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure that may be flaky" +- "Without-skill runs consistently fail on table extraction expectations (0% pass rate)" +- "Skill adds 13s average execution time but improves pass rate by 50%" +- "Token usage is 80% higher with skill, primarily due to script output parsing" +- "All 3 without-skill runs for eval 1 produced empty output" + +### Step 6: Write Notes + +Save notes to `{output_path}` as a JSON array of strings: + +```json +[ + "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value", + "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure", + "Without-skill runs consistently fail on table extraction expectations", + "Skill adds 13s average execution time but improves pass rate by 50%" +] +``` + +## Guidelines + +**DO:** +- Report what you observe in the data +- Be specific about which evals, expectations, or runs you're referring to +- Note patterns that aggregate metrics would hide +- Provide context that helps interpret the numbers + +**DO NOT:** +- Suggest improvements to the skill (that's for the improvement step, not benchmarking) +- Make subjective quality judgments ("the output was good/bad") +- Speculate about causes without evidence +- Repeat information already in the run_summary aggregates diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/agents/comparator.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/agents/comparator.md new file mode 100644 index 000000000..80e00eb45 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/agents/comparator.md @@ -0,0 +1,202 @@ +# Blind Comparator Agent + +Compare two outputs WITHOUT knowing which skill produced them. + +## Role + +The Blind Comparator judges which output better accomplishes the eval task. You receive two outputs labeled A and B, but you do NOT know which skill produced which. This prevents bias toward a particular skill or approach. + +Your judgment is based purely on output quality and task completion. + +## Inputs + +You receive these parameters in your prompt: + +- **output_a_path**: Path to the first output file or directory +- **output_b_path**: Path to the second output file or directory +- **eval_prompt**: The original task/prompt that was executed +- **expectations**: List of expectations to check (optional - may be empty) + +## Process + +### Step 1: Read Both Outputs + +1. Examine output A (file or directory) +2. Examine output B (file or directory) +3. Note the type, structure, and content of each +4. If outputs are directories, examine all relevant files inside + +### Step 2: Understand the Task + +1. Read the eval_prompt carefully +2. Identify what the task requires: + - What should be produced? + - What qualities matter (accuracy, completeness, format)? + - What would distinguish a good output from a poor one? 
+ +### Step 3: Generate Evaluation Rubric + +Based on the task, generate a rubric with two dimensions: + +**Content Rubric** (what the output contains): +| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) | +|-----------|----------|----------------|---------------| +| Correctness | Major errors | Minor errors | Fully correct | +| Completeness | Missing key elements | Mostly complete | All elements present | +| Accuracy | Significant inaccuracies | Minor inaccuracies | Accurate throughout | + +**Structure Rubric** (how the output is organized): +| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) | +|-----------|----------|----------------|---------------| +| Organization | Disorganized | Reasonably organized | Clear, logical structure | +| Formatting | Inconsistent/broken | Mostly consistent | Professional, polished | +| Usability | Difficult to use | Usable with effort | Easy to use | + +Adapt criteria to the specific task. For example: +- PDF form → "Field alignment", "Text readability", "Data placement" +- Document → "Section structure", "Heading hierarchy", "Paragraph flow" +- Data output → "Schema correctness", "Data types", "Completeness" + +### Step 4: Evaluate Each Output Against the Rubric + +For each output (A and B): + +1. **Score each criterion** on the rubric (1-5 scale) +2. **Calculate dimension totals**: Content score, Structure score +3. **Calculate overall score**: Average of dimension scores, scaled to 1-10 + +### Step 5: Check Assertions (if provided) + +If expectations are provided: + +1. Check each expectation against output A +2. Check each expectation against output B +3. Count pass rates for each output +4. Use expectation scores as secondary evidence (not the primary decision factor) + +### Step 6: Determine the Winner + +Compare A and B based on (in priority order): + +1. **Primary**: Overall rubric score (content + structure) +2. **Secondary**: Assertion pass rates (if applicable) +3. **Tiebreaker**: If truly equal, declare a TIE + +Be decisive - ties should be rare. One output is usually better, even if marginally. + +### Step 7: Write Comparison Results + +Save results to a JSON file at the path specified (or `comparison.json` if not specified). + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "winner": "A", + "reasoning": "Output A provides a complete solution with proper formatting and all required fields. 
Output B is missing the date field and has formatting inconsistencies.", + "rubric": { + "A": { + "content": { + "correctness": 5, + "completeness": 5, + "accuracy": 4 + }, + "structure": { + "organization": 4, + "formatting": 5, + "usability": 4 + }, + "content_score": 4.7, + "structure_score": 4.3, + "overall_score": 9.0 + }, + "B": { + "content": { + "correctness": 3, + "completeness": 2, + "accuracy": 3 + }, + "structure": { + "organization": 3, + "formatting": 2, + "usability": 3 + }, + "content_score": 2.7, + "structure_score": 2.7, + "overall_score": 5.4 + } + }, + "output_quality": { + "A": { + "score": 9, + "strengths": ["Complete solution", "Well-formatted", "All fields present"], + "weaknesses": ["Minor style inconsistency in header"] + }, + "B": { + "score": 5, + "strengths": ["Readable output", "Correct basic structure"], + "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"] + } + }, + "expectation_results": { + "A": { + "passed": 4, + "total": 5, + "pass_rate": 0.80, + "details": [ + {"text": "Output includes name", "passed": true}, + {"text": "Output includes date", "passed": true}, + {"text": "Format is PDF", "passed": true}, + {"text": "Contains signature", "passed": false}, + {"text": "Readable text", "passed": true} + ] + }, + "B": { + "passed": 3, + "total": 5, + "pass_rate": 0.60, + "details": [ + {"text": "Output includes name", "passed": true}, + {"text": "Output includes date", "passed": false}, + {"text": "Format is PDF", "passed": true}, + {"text": "Contains signature", "passed": false}, + {"text": "Readable text", "passed": true} + ] + } + } +} +``` + +If no expectations were provided, omit the `expectation_results` field entirely. + +## Field Descriptions + +- **winner**: "A", "B", or "TIE" +- **reasoning**: Clear explanation of why the winner was chosen (or why it's a tie) +- **rubric**: Structured rubric evaluation for each output + - **content**: Scores for content criteria (correctness, completeness, accuracy) + - **structure**: Scores for structure criteria (organization, formatting, usability) + - **content_score**: Average of content criteria (1-5) + - **structure_score**: Average of structure criteria (1-5) + - **overall_score**: Combined score scaled to 1-10 +- **output_quality**: Summary quality assessment + - **score**: 1-10 rating (should match rubric overall_score) + - **strengths**: List of positive aspects + - **weaknesses**: List of issues or shortcomings +- **expectation_results**: (Only if expectations provided) + - **passed**: Number of expectations that passed + - **total**: Total number of expectations + - **pass_rate**: Fraction passed (0.0 to 1.0) + - **details**: Individual expectation results + +## Guidelines + +- **Stay blind**: DO NOT try to infer which skill produced which output. Judge purely on output quality. +- **Be specific**: Cite specific examples when explaining strengths and weaknesses. +- **Be decisive**: Choose a winner unless outputs are genuinely equivalent. +- **Output quality first**: Assertion scores are secondary to overall task completion. +- **Be objective**: Don't favor outputs based on style preferences; focus on correctness and completeness. +- **Explain your reasoning**: The reasoning field should make it clear why you chose the winner. +- **Handle edge cases**: If both outputs fail, pick the one that fails less badly. If both are excellent, pick the one that's marginally better. 
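As a footnote on the Step 4 scoring arithmetic, it can be read off the sample JSON above; here is a minimal sketch of one way to compute it (rounding to one decimal place is an assumption inferred from the sample numbers):

```python
# Average each dimension on its 1-5 scale, then combine the two dimensions and
# scale to 1-10. Averaging two dimensions and doubling is simply their sum.
def rubric_scores(content: dict[str, int], structure: dict[str, int]) -> dict[str, float]:
    content_score = round(sum(content.values()) / len(content), 1)
    structure_score = round(sum(structure.values()) / len(structure), 1)
    overall_score = round(content_score + structure_score, 1)
    return {
        "content_score": content_score,
        "structure_score": structure_score,
        "overall_score": overall_score,
    }


# Reproduces output A above: content 5/5/4 -> 4.7, structure 4/5/4 -> 4.3, overall 9.0.
print(rubric_scores(
    {"correctness": 5, "completeness": 5, "accuracy": 4},
    {"organization": 4, "formatting": 5, "usability": 4},
))
```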
diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/agents/grader.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/agents/grader.md new file mode 100644 index 000000000..558ab05c0 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/agents/grader.md @@ -0,0 +1,223 @@ +# Grader Agent + +Evaluate expectations against an execution transcript and outputs. + +## Role + +The Grader reviews a transcript and output files, then determines whether each expectation passes or fails. Provide clear evidence for each judgment. + +You have two jobs: grade the outputs, and critique the evals themselves. A passing grade on a weak assertion is worse than useless — it creates false confidence. When you notice an assertion that's trivially satisfied, or an important outcome that no assertion checks, say so. + +## Inputs + +You receive these parameters in your prompt: + +- **expectations**: List of expectations to evaluate (strings) +- **transcript_path**: Path to the execution transcript (markdown file) +- **outputs_dir**: Directory containing output files from execution + +## Process + +### Step 1: Read the Transcript + +1. Read the transcript file completely +2. Note the eval prompt, execution steps, and final result +3. Identify any issues or errors documented + +### Step 2: Examine Output Files + +1. List files in outputs_dir +2. Read/examine each file relevant to the expectations. If outputs aren't plain text, use the inspection tools provided in your prompt — don't rely solely on what the transcript says the executor produced. +3. Note contents, structure, and quality + +### Step 3: Evaluate Each Assertion + +For each expectation: + +1. **Search for evidence** in the transcript and outputs +2. **Determine verdict**: + - **PASS**: Clear evidence the expectation is true AND the evidence reflects genuine task completion, not just surface-level compliance + - **FAIL**: No evidence, or evidence contradicts the expectation, or the evidence is superficial (e.g., correct filename but empty/wrong content) +3. **Cite the evidence**: Quote the specific text or describe what you found + +### Step 4: Extract and Verify Claims + +Beyond the predefined expectations, extract implicit claims from the outputs and verify them: + +1. **Extract claims** from the transcript and outputs: + - Factual statements ("The form has 12 fields") + - Process claims ("Used pypdf to fill the form") + - Quality claims ("All fields were filled correctly") + +2. **Verify each claim**: + - **Factual claims**: Can be checked against the outputs or external sources + - **Process claims**: Can be verified from the transcript + - **Quality claims**: Evaluate whether the claim is justified + +3. **Flag unverifiable claims**: Note claims that cannot be verified with available information + +This catches issues that predefined expectations might miss. + +### Step 5: Read User Notes + +If `{outputs_dir}/user_notes.md` exists: +1. Read it and note any uncertainties or issues flagged by the executor +2. Include relevant concerns in the grading output +3. These may reveal problems even when expectations pass + +### Step 6: Critique the Evals + +After grading, consider whether the evals themselves could be improved. Only surface suggestions when there's a clear gap. + +Good suggestions test meaningful outcomes — assertions that are hard to satisfy without actually doing the work correctly. 
Think about what makes an assertion *discriminating*: it passes when the skill genuinely succeeds and fails when it doesn't. + +Suggestions worth raising: +- An assertion that passed but would also pass for a clearly wrong output (e.g., checking filename existence but not file content) +- An important outcome you observed — good or bad — that no assertion covers at all +- An assertion that can't actually be verified from the available outputs + +Keep the bar high. The goal is to flag things the eval author would say "good catch" about, not to nitpick every assertion. + +### Step 7: Write Grading Results + +Save results to `{outputs_dir}/../grading.json` (sibling to outputs_dir). + +## Grading Criteria + +**PASS when**: +- The transcript or outputs clearly demonstrate the expectation is true +- Specific evidence can be cited +- The evidence reflects genuine substance, not just surface compliance (e.g., a file exists AND contains correct content, not just the right filename) + +**FAIL when**: +- No evidence found for the expectation +- Evidence contradicts the expectation +- The expectation cannot be verified from available information +- The evidence is superficial — the assertion is technically satisfied but the underlying task outcome is wrong or incomplete +- The output appears to meet the assertion by coincidence rather than by actually doing the work + +**When uncertain**: The burden of proof to pass is on the expectation. + +### Step 8: Read Executor Metrics and Timing + +1. If `{outputs_dir}/metrics.json` exists, read it and include in grading output +2. If `{outputs_dir}/../timing.json` exists, read it and include timing data + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "expectations": [ + { + "text": "The output includes the name 'John Smith'", + "passed": true, + "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'" + }, + { + "text": "The spreadsheet has a SUM formula in cell B10", + "passed": false, + "evidence": "No spreadsheet was created. The output was a text file." 
+ }, + { + "text": "The assistant used the skill's OCR script", + "passed": true, + "evidence": "Transcript Step 2 shows: 'Tool: Bash - python ocr_script.py image.png'" + } + ], + "summary": { + "passed": 2, + "failed": 1, + "total": 3, + "pass_rate": 0.67 + }, + "execution_metrics": { + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8 + }, + "total_tool_calls": 15, + "total_steps": 6, + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 + }, + "timing": { + "executor_duration_seconds": 165.0, + "grader_duration_seconds": 26.0, + "total_duration_seconds": 191.0 + }, + "claims": [ + { + "claim": "The form has 12 fillable fields", + "type": "factual", + "verified": true, + "evidence": "Counted 12 fields in field_info.json" + }, + { + "claim": "All required fields were populated", + "type": "quality", + "verified": false, + "evidence": "Reference section was left blank despite data being available" + } + ], + "user_notes_summary": { + "uncertainties": ["Used 2023 data, may be stale"], + "needs_review": [], + "workarounds": ["Fell back to text overlay for non-fillable fields"] + }, + "eval_feedback": { + "suggestions": [ + { + "assertion": "The output includes the name 'John Smith'", + "reason": "A hallucinated document that mentions the name would also pass — consider checking it appears as the primary contact with matching phone and email from the input" + }, + { + "reason": "No assertion checks whether the extracted phone numbers match the input — I observed incorrect numbers in the output that went uncaught" + } + ], + "overall": "Assertions check presence but not correctness. Consider adding content verification." + } +} +``` + +## Field Descriptions + +- **expectations**: Array of graded expectations + - **text**: The original expectation text + - **passed**: Boolean - true if expectation passes + - **evidence**: Specific quote or description supporting the verdict +- **summary**: Aggregate statistics + - **passed**: Count of passed expectations + - **failed**: Count of failed expectations + - **total**: Total expectations evaluated + - **pass_rate**: Fraction passed (0.0 to 1.0) +- **execution_metrics**: Copied from executor's metrics.json (if available) + - **output_chars**: Total character count of output files (proxy for tokens) + - **transcript_chars**: Character count of transcript +- **timing**: Wall clock timing from timing.json (if available) + - **executor_duration_seconds**: Time spent in executor subagent + - **total_duration_seconds**: Total elapsed time for the run +- **claims**: Extracted and verified claims from the output + - **claim**: The statement being verified + - **type**: "factual", "process", or "quality" + - **verified**: Boolean - whether the claim holds + - **evidence**: Supporting or contradicting evidence +- **user_notes_summary**: Issues flagged by the executor + - **uncertainties**: Things the executor wasn't sure about + - **needs_review**: Items requiring human attention + - **workarounds**: Places where the skill didn't work as expected +- **eval_feedback**: Improvement suggestions for the evals (only when warranted) + - **suggestions**: List of concrete suggestions, each with a `reason` and optionally an `assertion` it relates to + - **overall**: Brief assessment — can be "No suggestions, evals look solid" if nothing to flag + +## Guidelines + +- **Be objective**: Base verdicts on evidence, not assumptions +- **Be specific**: Quote the exact text that supports your verdict +- **Be thorough**: Check both transcript and 
output files +- **Be consistent**: Apply the same standard to each expectation +- **Explain failures**: Make it clear why evidence was insufficient +- **No partial credit**: Each expectation is pass or fail, not partial diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/assets/eval_review.html b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/assets/eval_review.html new file mode 100644 index 000000000..938ff32ae --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/assets/eval_review.html @@ -0,0 +1,146 @@ + + + + + + Eval Set Review - __SKILL_NAME_PLACEHOLDER__ + + + + + + +

Eval Set Review: __SKILL_NAME_PLACEHOLDER__

+

Current description: __SKILL_DESCRIPTION_PLACEHOLDER__

+ +
+ + +
+ + + + + + + + + + +
Query | Should Trigger | Actions
+ +

+ + + + diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/eval-viewer/generate_review.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/eval-viewer/generate_review.py new file mode 100644 index 000000000..4f0b1fe00 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/eval-viewer/generate_review.py @@ -0,0 +1,473 @@ +#!/usr/bin/env python3 +"""Generate and serve a review page for eval results. + +Reads the workspace directory, discovers runs (directories with outputs/), +embeds all output data into a self-contained HTML page, and serves it via +a tiny HTTP server. Feedback auto-saves to feedback.json in the workspace. + +Usage: + python generate_review.py [--port PORT] [--skill-name NAME] + python generate_review.py --previous-feedback /path/to/old/feedback.json + +No dependencies beyond the Python stdlib are required. +""" + +import argparse +import base64 +import json +import mimetypes +import os +import re +import signal +import subprocess +import sys +import time +import webbrowser +from functools import partial +from http.server import HTTPServer, BaseHTTPRequestHandler +from pathlib import Path + +from loguru import logger + +# Files to exclude from output listings +METADATA_FILES = {"transcript.md", "user_notes.md", "metrics.json"} + +# Extensions we render as inline text +TEXT_EXTENSIONS = { + ".txt", ".md", ".json", ".csv", ".py", ".js", ".ts", ".tsx", ".jsx", + ".yaml", ".yml", ".xml", ".html", ".css", ".sh", ".rb", ".go", ".rs", + ".java", ".c", ".cpp", ".h", ".hpp", ".sql", ".r", ".toml", +} + +# Extensions we render as inline images +IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp"} + +# MIME type overrides for common types +MIME_OVERRIDES = { + ".svg": "image/svg+xml", + ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation", +} + + +def get_mime_type(path: Path) -> str: + ext = path.suffix.lower() + if ext in MIME_OVERRIDES: + return MIME_OVERRIDES[ext] + mime, _ = mimetypes.guess_type(str(path)) + return mime or "application/octet-stream" + + +def find_runs(workspace: Path) -> list[dict]: + """Recursively find directories that contain an outputs/ subdirectory.""" + runs: list[dict] = [] + _find_runs_recursive(workspace, workspace, runs) + runs.sort(key=lambda r: (r.get("eval_id", float("inf")), r["id"])) + return runs + + +def _find_runs_recursive(root: Path, current: Path, runs: list[dict]) -> None: + if not current.is_dir(): + return + + outputs_dir = current / "outputs" + if outputs_dir.is_dir(): + run = build_run(root, current) + if run: + runs.append(run) + return + + skip = {"node_modules", ".git", "__pycache__", "skill", "inputs"} + for child in sorted(current.iterdir()): + if child.is_dir() and child.name not in skip: + _find_runs_recursive(root, child, runs) + + +def build_run(root: Path, run_dir: Path) -> dict | None: + """Build a run dict with prompt, outputs, and grading data.""" + prompt = "" + eval_id = None + + # Try eval_metadata.json + for candidate in [run_dir / "eval_metadata.json", run_dir.parent / "eval_metadata.json"]: + if candidate.exists(): + try: + metadata = json.loads(candidate.read_text()) + prompt = metadata.get("prompt", "") + eval_id = metadata.get("eval_id") + except (json.JSONDecodeError, OSError): + pass + if 
prompt: + break + + # Fall back to transcript.md + if not prompt: + for candidate in [run_dir / "transcript.md", run_dir / "outputs" / "transcript.md"]: + if candidate.exists(): + try: + text = candidate.read_text() + match = re.search(r"## Eval Prompt\n\n([\s\S]*?)(?=\n##|$)", text) + if match: + prompt = match.group(1).strip() + except OSError: + pass + if prompt: + break + + if not prompt: + prompt = "(No prompt found)" + + run_id = str(run_dir.relative_to(root)).replace("/", "-").replace("\\", "-") + + # Collect output files + outputs_dir = run_dir / "outputs" + output_files: list[dict] = [] + if outputs_dir.is_dir(): + for f in sorted(outputs_dir.iterdir()): + if f.is_file() and f.name not in METADATA_FILES: + output_files.append(embed_file(f)) + + # Load grading if present + grading = None + for candidate in [run_dir / "grading.json", run_dir.parent / "grading.json"]: + if candidate.exists(): + try: + grading = json.loads(candidate.read_text()) + except (json.JSONDecodeError, OSError): + pass + if grading: + break + + return { + "id": run_id, + "prompt": prompt, + "eval_id": eval_id, + "outputs": output_files, + "grading": grading, + } + + +def embed_file(path: Path) -> dict: + """Read a file and return an embedded representation.""" + ext = path.suffix.lower() + mime = get_mime_type(path) + + if ext in TEXT_EXTENSIONS: + try: + content = path.read_text(errors="replace") + except OSError: + content = "(Error reading file)" + return { + "name": path.name, + "type": "text", + "content": content, + } + elif ext in IMAGE_EXTENSIONS: + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "image", + "mime": mime, + "data_uri": f"data:{mime};base64,{b64}", + } + elif ext == ".pdf": + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "pdf", + "data_uri": f"data:{mime};base64,{b64}", + } + elif ext == ".xlsx": + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "xlsx", + "data_b64": b64, + } + else: + # Binary / unknown — base64 download link + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "binary", + "mime": mime, + "data_uri": f"data:{mime};base64,{b64}", + } + + +def load_previous_iteration(workspace: Path) -> dict[str, dict]: + """Load previous iteration's feedback and outputs. + + Returns a map of run_id -> {"feedback": str, "outputs": list[dict]}. 
+ """ + result: dict[str, dict] = {} + + # Load feedback + feedback_map: dict[str, str] = {} + feedback_path = workspace / "feedback.json" + if feedback_path.exists(): + try: + data = json.loads(feedback_path.read_text()) + feedback_map = { + r["run_id"]: r["feedback"] + for r in data.get("reviews", []) + if r.get("feedback", "").strip() + } + except (json.JSONDecodeError, OSError, KeyError): + pass + + # Load runs (to get outputs) + prev_runs = find_runs(workspace) + for run in prev_runs: + result[run["id"]] = { + "feedback": feedback_map.get(run["id"], ""), + "outputs": run.get("outputs", []), + } + + # Also add feedback for run_ids that had feedback but no matching run + for run_id, fb in feedback_map.items(): + if run_id not in result: + result[run_id] = {"feedback": fb, "outputs": []} + + return result + + +def generate_html( + runs: list[dict], + skill_name: str, + previous: dict[str, dict] | None = None, + benchmark: dict | None = None, +) -> str: + """Generate the complete standalone HTML page with embedded data.""" + template_path = Path(__file__).parent / "viewer.html" + template = template_path.read_text() + + # Build previous_feedback and previous_outputs maps for the template + previous_feedback: dict[str, str] = {} + previous_outputs: dict[str, list[dict]] = {} + if previous: + for run_id, data in previous.items(): + if data.get("feedback"): + previous_feedback[run_id] = data["feedback"] + if data.get("outputs"): + previous_outputs[run_id] = data["outputs"] + + embedded = { + "skill_name": skill_name, + "runs": runs, + "previous_feedback": previous_feedback, + "previous_outputs": previous_outputs, + } + if benchmark: + embedded["benchmark"] = benchmark + + data_json = json.dumps(embedded) + + return template.replace("/*__EMBEDDED_DATA__*/", f"const EMBEDDED_DATA = {data_json};") + + +# --------------------------------------------------------------------------- +# HTTP server (stdlib only, zero dependencies) +# --------------------------------------------------------------------------- + +def _kill_port(port: int) -> None: + """Kill any process listening on the given port.""" + try: + result = subprocess.run( + ["lsof", "-ti", f":{port}"], + capture_output=True, text=True, timeout=5, + ) + for pid_str in result.stdout.strip().split("\n"): + if pid_str.strip(): + try: + os.kill(int(pid_str.strip()), signal.SIGTERM) + except (ProcessLookupError, ValueError): + pass + if result.stdout.strip(): + time.sleep(0.5) + except subprocess.TimeoutExpired: + pass + except FileNotFoundError: + logger.warning("Note: lsof not found, cannot check if port is in use") + +class ReviewHandler(BaseHTTPRequestHandler): + """Serves the review HTML and handles feedback saves. + + Regenerates the HTML on each page load so that refreshing the browser + picks up new eval outputs without restarting the server. 
+ """ + + def __init__( + self, + workspace: Path, + skill_name: str, + feedback_path: Path, + previous: dict[str, dict], + benchmark_path: Path | None, + *args, + **kwargs, + ): + self.workspace = workspace + self.skill_name = skill_name + self.feedback_path = feedback_path + self.previous = previous + self.benchmark_path = benchmark_path + super().__init__(*args, **kwargs) + + def do_GET(self) -> None: + if self.path == "/" or self.path == "/index.html": + # Regenerate HTML on each request (re-scans workspace for new outputs) + runs = find_runs(self.workspace) + benchmark = None + if self.benchmark_path and self.benchmark_path.exists(): + try: + benchmark = json.loads(self.benchmark_path.read_text()) + except (json.JSONDecodeError, OSError): + pass + html = generate_html(runs, self.skill_name, self.previous, benchmark) + content = html.encode("utf-8") + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(content))) + self.end_headers() + self.wfile.write(content) + elif self.path == "/api/feedback": + data = b"{}" + if self.feedback_path.exists(): + data = self.feedback_path.read_bytes() + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(data))) + self.end_headers() + self.wfile.write(data) + else: + self.send_error(404) + + def do_POST(self) -> None: + if self.path == "/api/feedback": + length = int(self.headers.get("Content-Length", 0)) + body = self.rfile.read(length) + try: + data = json.loads(body) + if not isinstance(data, dict) or "reviews" not in data: + raise ValueError("Expected JSON object with 'reviews' key") + self.feedback_path.write_text(json.dumps(data, indent=2) + "\n") + resp = b'{"ok":true}' + self.send_response(200) + except (json.JSONDecodeError, OSError, ValueError) as e: + resp = json.dumps({"error": str(e)}).encode() + self.send_response(500) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(resp))) + self.end_headers() + self.wfile.write(resp) + else: + self.send_error(404) + + def log_message(self, format: str, *args: object) -> None: + # Suppress request logging to keep terminal clean + pass + + +def main() -> None: + parser = argparse.ArgumentParser(description="Generate and serve eval review") + parser.add_argument("workspace", type=Path, help="Path to workspace directory") + parser.add_argument("--port", "-p", type=int, default=3117, help="Server port (default: 3117)") + parser.add_argument("--skill-name", "-n", type=str, default=None, help="Skill name for header") + parser.add_argument( + "--previous-workspace", type=Path, default=None, + help="Path to previous iteration's workspace (shows old outputs and feedback as context)", + ) + parser.add_argument( + "--benchmark", type=Path, default=None, + help="Path to benchmark.json to show in the Benchmark tab", + ) + parser.add_argument( + "--static", "-s", type=Path, default=None, + help="Write standalone HTML to this path instead of starting a server", + ) + args = parser.parse_args() + + workspace = args.workspace.resolve() + if not workspace.is_dir(): + logger.error(f"Error: {workspace} is not a directory") + sys.exit(1) + + runs = find_runs(workspace) + if not runs: + logger.error(f"No runs found in {workspace}") + sys.exit(1) + + skill_name = args.skill_name or workspace.name.replace("-workspace", "") + feedback_path = workspace / "feedback.json" + + previous: dict[str, dict] = {} + if 
args.previous_workspace: + previous = load_previous_iteration(args.previous_workspace.resolve()) + + benchmark_path = args.benchmark.resolve() if args.benchmark else None + benchmark = None + if benchmark_path and benchmark_path.exists(): + try: + benchmark = json.loads(benchmark_path.read_text()) + except (json.JSONDecodeError, OSError): + pass + + if args.static: + html = generate_html(runs, skill_name, previous, benchmark) + args.static.parent.mkdir(parents=True, exist_ok=True) + args.static.write_text(html) + logger.info(f"\n Static viewer written to: {args.static}\n") + sys.exit(0) + + # Kill any existing process on the target port + port = args.port + _kill_port(port) + handler = partial(ReviewHandler, workspace, skill_name, feedback_path, previous, benchmark_path) + try: + server = HTTPServer(("127.0.0.1", port), handler) + except OSError: + # Port still in use after kill attempt — find a free one + server = HTTPServer(("127.0.0.1", 0), handler) + port = server.server_address[1] + + url = f"http://localhost:{port}" + logger.info(f"\n Eval Viewer") + logger.info(f" ─────────────────────────────────") + logger.info(f" URL: {url}") + logger.info(f" Workspace: {workspace}") + logger.info(f" Feedback: {feedback_path}") + if previous: + logger.info(f" Previous: {args.previous_workspace} ({len(previous)} runs)") + if benchmark_path: + logger.info(f" Benchmark: {benchmark_path}") + logger.info(f"\n Press Ctrl+C to stop.\n") + + webbrowser.open(url) + + try: + server.serve_forever() + except KeyboardInterrupt: + logger.info("\nStopped.") + server.server_close() + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/eval-viewer/viewer.html b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/eval-viewer/viewer.html new file mode 100644 index 000000000..6d8e96348 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/eval-viewer/viewer.html @@ -0,0 +1,1325 @@ + + + + + + Eval Review + + + + + + + +
+
+
+

Eval Review:

+
Review each output and leave feedback below. Navigate with arrow keys or buttons. When done, copy feedback and paste into Claude Code.
+
+
+
+ + + + + +
+
+ +
+
Prompt
+
+
+
+
+ + +
+
Output
+
+
No output files found
+
+
+ + + + + + + + +
+
Your Feedback
+
+ + + +
+
+
+ + +
+ + +
+
+
No benchmark data available. Run a benchmark to see quantitative results here.
+
+
+
+ + +
+
+

Review Complete

+

Your feedback has been saved. Go back to your Claude Code session and tell Claude you're done reviewing.

+
+ +
+
+
+ + +
+ + + + diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/references/schemas.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/references/schemas.md new file mode 100644 index 000000000..b6eeaa2d4 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/references/schemas.md @@ -0,0 +1,430 @@ +# JSON Schemas + +This document defines the JSON schemas used by skill-creator. + +--- + +## evals.json + +Defines the evals for a skill. Located at `evals/evals.json` within the skill directory. + +```json +{ + "skill_name": "example-skill", + "evals": [ + { + "id": 1, + "prompt": "User's example prompt", + "expected_output": "Description of expected result", + "files": ["evals/files/sample1.pdf"], + "expectations": [ + "The output includes X", + "The skill used script Y" + ] + } + ] +} +``` + +**Fields:** +- `skill_name`: Name matching the skill's frontmatter +- `evals[].id`: Unique integer identifier +- `evals[].prompt`: The task to execute +- `evals[].expected_output`: Human-readable description of success +- `evals[].files`: Optional list of input file paths (relative to skill root) +- `evals[].expectations`: List of verifiable statements + +--- + +## history.json + +Tracks version progression in Improve mode. Located at workspace root. + +```json +{ + "started_at": "2026-01-15T10:30:00Z", + "skill_name": "pdf", + "current_best": "v2", + "iterations": [ + { + "version": "v0", + "parent": null, + "expectation_pass_rate": 0.65, + "grading_result": "baseline", + "is_current_best": false + }, + { + "version": "v1", + "parent": "v0", + "expectation_pass_rate": 0.75, + "grading_result": "won", + "is_current_best": false + }, + { + "version": "v2", + "parent": "v1", + "expectation_pass_rate": 0.85, + "grading_result": "won", + "is_current_best": true + } + ] +} +``` + +**Fields:** +- `started_at`: ISO timestamp of when improvement started +- `skill_name`: Name of the skill being improved +- `current_best`: Version identifier of the best performer +- `iterations[].version`: Version identifier (v0, v1, ...) +- `iterations[].parent`: Parent version this was derived from +- `iterations[].expectation_pass_rate`: Pass rate from grading +- `iterations[].grading_result`: "baseline", "won", "lost", or "tie" +- `iterations[].is_current_best`: Whether this is the current best version + +--- + +## grading.json + +Output from the grader agent. Located at `/grading.json`. + +```json +{ + "expectations": [ + { + "text": "The output includes the name 'John Smith'", + "passed": true, + "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'" + }, + { + "text": "The spreadsheet has a SUM formula in cell B10", + "passed": false, + "evidence": "No spreadsheet was created. The output was a text file." 
+ } + ], + "summary": { + "passed": 2, + "failed": 1, + "total": 3, + "pass_rate": 0.67 + }, + "execution_metrics": { + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8 + }, + "total_tool_calls": 15, + "total_steps": 6, + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 + }, + "timing": { + "executor_duration_seconds": 165.0, + "grader_duration_seconds": 26.0, + "total_duration_seconds": 191.0 + }, + "claims": [ + { + "claim": "The form has 12 fillable fields", + "type": "factual", + "verified": true, + "evidence": "Counted 12 fields in field_info.json" + } + ], + "user_notes_summary": { + "uncertainties": ["Used 2023 data, may be stale"], + "needs_review": [], + "workarounds": ["Fell back to text overlay for non-fillable fields"] + }, + "eval_feedback": { + "suggestions": [ + { + "assertion": "The output includes the name 'John Smith'", + "reason": "A hallucinated document that mentions the name would also pass" + } + ], + "overall": "Assertions check presence but not correctness." + } +} +``` + +**Fields:** +- `expectations[]`: Graded expectations with evidence +- `summary`: Aggregate pass/fail counts +- `execution_metrics`: Tool usage and output size (from executor's metrics.json) +- `timing`: Wall clock timing (from timing.json) +- `claims`: Extracted and verified claims from the output +- `user_notes_summary`: Issues flagged by the executor +- `eval_feedback`: (optional) Improvement suggestions for the evals, only present when the grader identifies issues worth raising + +--- + +## metrics.json + +Output from the executor agent. Located at `/outputs/metrics.json`. + +```json +{ + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8, + "Edit": 1, + "Glob": 2, + "Grep": 0 + }, + "total_tool_calls": 18, + "total_steps": 6, + "files_created": ["filled_form.pdf", "field_values.json"], + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 +} +``` + +**Fields:** +- `tool_calls`: Count per tool type +- `total_tool_calls`: Sum of all tool calls +- `total_steps`: Number of major execution steps +- `files_created`: List of output files created +- `errors_encountered`: Number of errors during execution +- `output_chars`: Total character count of output files +- `transcript_chars`: Character count of transcript + +--- + +## timing.json + +Wall clock timing for a run. Located at `/timing.json`. + +**How to capture:** When a subagent task completes, the task notification includes `total_tokens` and `duration_ms`. Save these immediately — they are not persisted anywhere else and cannot be recovered after the fact. + +```json +{ + "total_tokens": 84852, + "duration_ms": 23332, + "total_duration_seconds": 23.3, + "executor_start": "2026-01-15T10:30:00Z", + "executor_end": "2026-01-15T10:32:45Z", + "executor_duration_seconds": 165.0, + "grader_start": "2026-01-15T10:32:46Z", + "grader_end": "2026-01-15T10:33:12Z", + "grader_duration_seconds": 26.0 +} +``` + +--- + +## benchmark.json + +Output from Benchmark mode. Located at `benchmarks//benchmark.json`. 
+ +```json +{ + "metadata": { + "skill_name": "pdf", + "skill_path": "/path/to/pdf", + "executor_model": "claude-sonnet-4-20250514", + "analyzer_model": "most-capable-model", + "timestamp": "2026-01-15T10:30:00Z", + "evals_run": [1, 2, 3], + "runs_per_configuration": 3 + }, + + "runs": [ + { + "eval_id": 1, + "eval_name": "Ocean", + "configuration": "with_skill", + "run_number": 1, + "result": { + "pass_rate": 0.85, + "passed": 6, + "failed": 1, + "total": 7, + "time_seconds": 42.5, + "tokens": 3800, + "tool_calls": 18, + "errors": 0 + }, + "expectations": [ + {"text": "...", "passed": true, "evidence": "..."} + ], + "notes": [ + "Used 2023 data, may be stale", + "Fell back to text overlay for non-fillable fields" + ] + } + ], + + "run_summary": { + "with_skill": { + "pass_rate": {"mean": 0.85, "stddev": 0.05, "min": 0.80, "max": 0.90}, + "time_seconds": {"mean": 45.0, "stddev": 12.0, "min": 32.0, "max": 58.0}, + "tokens": {"mean": 3800, "stddev": 400, "min": 3200, "max": 4100} + }, + "without_skill": { + "pass_rate": {"mean": 0.35, "stddev": 0.08, "min": 0.28, "max": 0.45}, + "time_seconds": {"mean": 32.0, "stddev": 8.0, "min": 24.0, "max": 42.0}, + "tokens": {"mean": 2100, "stddev": 300, "min": 1800, "max": 2500} + }, + "delta": { + "pass_rate": "+0.50", + "time_seconds": "+13.0", + "tokens": "+1700" + } + }, + + "notes": [ + "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value", + "Eval 3 shows high variance (50% ± 40%) - may be flaky or model-dependent", + "Without-skill runs consistently fail on table extraction expectations", + "Skill adds 13s average execution time but improves pass rate by 50%" + ] +} +``` + +**Fields:** +- `metadata`: Information about the benchmark run + - `skill_name`: Name of the skill + - `timestamp`: When the benchmark was run + - `evals_run`: List of eval names or IDs + - `runs_per_configuration`: Number of runs per config (e.g. 3) +- `runs[]`: Individual run results + - `eval_id`: Numeric eval identifier + - `eval_name`: Human-readable eval name (used as section header in the viewer) + - `configuration`: Must be `"with_skill"` or `"without_skill"` (the viewer uses this exact string for grouping and color coding) + - `run_number`: Integer run number (1, 2, 3...) + - `result`: Nested object with `pass_rate`, `passed`, `total`, `time_seconds`, `tokens`, `errors` +- `run_summary`: Statistical aggregates per configuration + - `with_skill` / `without_skill`: Each contains `pass_rate`, `time_seconds`, `tokens` objects with `mean` and `stddev` fields + - `delta`: Difference strings like `"+0.50"`, `"+13.0"`, `"+1700"` +- `notes`: Freeform observations from the analyzer + +**Important:** The viewer reads these field names exactly. Using `config` instead of `configuration`, or putting `pass_rate` at the top level of a run instead of nested under `result`, will cause the viewer to show empty/zero values. Always reference this schema when generating benchmark.json manually. + +--- + +## comparison.json + +Output from blind comparator. Located at `/comparison-N.json`. + +```json +{ + "winner": "A", + "reasoning": "Output A provides a complete solution with proper formatting and all required fields. 
Output B is missing the date field and has formatting inconsistencies.", + "rubric": { + "A": { + "content": { + "correctness": 5, + "completeness": 5, + "accuracy": 4 + }, + "structure": { + "organization": 4, + "formatting": 5, + "usability": 4 + }, + "content_score": 4.7, + "structure_score": 4.3, + "overall_score": 9.0 + }, + "B": { + "content": { + "correctness": 3, + "completeness": 2, + "accuracy": 3 + }, + "structure": { + "organization": 3, + "formatting": 2, + "usability": 3 + }, + "content_score": 2.7, + "structure_score": 2.7, + "overall_score": 5.4 + } + }, + "output_quality": { + "A": { + "score": 9, + "strengths": ["Complete solution", "Well-formatted", "All fields present"], + "weaknesses": ["Minor style inconsistency in header"] + }, + "B": { + "score": 5, + "strengths": ["Readable output", "Correct basic structure"], + "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"] + } + }, + "expectation_results": { + "A": { + "passed": 4, + "total": 5, + "pass_rate": 0.80, + "details": [ + {"text": "Output includes name", "passed": true} + ] + }, + "B": { + "passed": 3, + "total": 5, + "pass_rate": 0.60, + "details": [ + {"text": "Output includes name", "passed": true} + ] + } + } +} +``` + +--- + +## analysis.json + +Output from post-hoc analyzer. Located at `/analysis.json`. + +```json +{ + "comparison_summary": { + "winner": "A", + "winner_skill": "path/to/winner/skill", + "loser_skill": "path/to/loser/skill", + "comparator_reasoning": "Brief summary of why comparator chose winner" + }, + "winner_strengths": [ + "Clear step-by-step instructions for handling multi-page documents", + "Included validation script that caught formatting errors" + ], + "loser_weaknesses": [ + "Vague instruction 'process the document appropriately' led to inconsistent behavior", + "No script for validation, agent had to improvise" + ], + "instruction_following": { + "winner": { + "score": 9, + "issues": ["Minor: skipped optional logging step"] + }, + "loser": { + "score": 6, + "issues": [ + "Did not use the skill's formatting template", + "Invented own approach instead of following step 3" + ] + } + }, + "improvement_suggestions": [ + { + "priority": "high", + "category": "instructions", + "suggestion": "Replace 'process the document appropriately' with explicit steps", + "expected_impact": "Would eliminate ambiguity that caused inconsistent behavior" + } + ], + "transcript_insights": { + "winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script", + "loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods" + } +} +``` diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/__init__.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/aggregate_benchmark.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/aggregate_benchmark.py new file mode 100644 index 000000000..ccc810819 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/aggregate_benchmark.py @@ -0,0 +1,403 @@ +#!/usr/bin/env python3 +""" +Aggregate individual run results into benchmark summary statistics. 
+ +Reads grading.json files from run directories and produces: +- run_summary with mean, stddev, min, max for each metric +- delta between with_skill and without_skill configurations + +Usage: + python aggregate_benchmark.py + +Example: + python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/ + +The script supports two directory layouts: + + Workspace layout (from skill-creator iterations): + / + └── eval-N/ + ├── with_skill/ + │ ├── run-1/grading.json + │ └── run-2/grading.json + └── without_skill/ + ├── run-1/grading.json + └── run-2/grading.json + + Legacy layout (with runs/ subdirectory): + / + └── runs/ + └── eval-N/ + ├── with_skill/ + │ └── run-1/grading.json + └── without_skill/ + └── run-1/grading.json +""" + +import argparse +import json +import math +import sys +from datetime import datetime, timezone +from pathlib import Path + +from loguru import logger + + +def calculate_stats(values: list[float]) -> dict: + """Calculate mean, stddev, min, max for a list of values.""" + if not values: + return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0} + + n = len(values) + mean = sum(values) / n + + if n > 1: + variance = sum((x - mean) ** 2 for x in values) / (n - 1) + stddev = math.sqrt(variance) + else: + stddev = 0.0 + + return { + "mean": round(mean, 4), + "stddev": round(stddev, 4), + "min": round(min(values), 4), + "max": round(max(values), 4) + } + + +def load_run_results(benchmark_dir: Path) -> dict: + """ + Load all run results from a benchmark directory. + + Returns dict keyed by config name (e.g. "with_skill"/"without_skill", + or "new_skill"/"old_skill"), each containing a list of run results. + """ + # Support both layouts: eval dirs directly under benchmark_dir, or under runs/ + runs_dir = benchmark_dir / "runs" + if runs_dir.exists(): + search_dir = runs_dir + elif list(benchmark_dir.glob("eval-*")): + search_dir = benchmark_dir + else: + logger.warning(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}") + return {} + + results: dict[str, list] = {} + + for eval_idx, eval_dir in enumerate(sorted(search_dir.glob("eval-*"))): + metadata_path = eval_dir / "eval_metadata.json" + if metadata_path.exists(): + try: + with open(metadata_path) as mf: + eval_id = json.load(mf).get("eval_id", eval_idx) + except (json.JSONDecodeError, OSError): + eval_id = eval_idx + else: + try: + eval_id = int(eval_dir.name.split("-")[1]) + except ValueError: + eval_id = eval_idx + + # Discover config directories dynamically rather than hardcoding names + for config_dir in sorted(eval_dir.iterdir()): + if not config_dir.is_dir(): + continue + # Skip non-config directories (inputs, outputs, etc.) 
+ if not list(config_dir.glob("run-*")): + continue + config = config_dir.name + if config not in results: + results[config] = [] + + for run_dir in sorted(config_dir.glob("run-*")): + run_number = int(run_dir.name.split("-")[1]) + grading_file = run_dir / "grading.json" + + if not grading_file.exists(): + logger.warning(f"Warning: grading.json not found in {run_dir}") + continue + + try: + with open(grading_file) as f: + grading = json.load(f) + except json.JSONDecodeError as e: + logger.warning(f"Warning: Invalid JSON in {grading_file}: {e}") + continue + + # Extract metrics + result = { + "eval_id": eval_id, + "run_number": run_number, + "pass_rate": grading.get("summary", {}).get("pass_rate", 0.0), + "passed": grading.get("summary", {}).get("passed", 0), + "failed": grading.get("summary", {}).get("failed", 0), + "total": grading.get("summary", {}).get("total", 0), + } + + # Extract timing — check grading.json first, then sibling timing.json + timing = grading.get("timing", {}) + result["time_seconds"] = timing.get("total_duration_seconds", 0.0) + timing_file = run_dir / "timing.json" + if result["time_seconds"] == 0.0 and timing_file.exists(): + try: + with open(timing_file) as tf: + timing_data = json.load(tf) + result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0) + result["tokens"] = timing_data.get("total_tokens", 0) + except json.JSONDecodeError: + pass + + # Extract metrics if available + metrics = grading.get("execution_metrics", {}) + result["tool_calls"] = metrics.get("total_tool_calls", 0) + if not result.get("tokens"): + result["tokens"] = metrics.get("output_chars", 0) + result["errors"] = metrics.get("errors_encountered", 0) + + # Extract expectations — viewer requires fields: text, passed, evidence + raw_expectations = grading.get("expectations", []) + for exp in raw_expectations: + if "text" not in exp or "passed" not in exp: + logger.warning(f"Warning: expectation in {grading_file} missing required fields (text, passed, evidence): {exp}") + result["expectations"] = raw_expectations + + # Extract notes from user_notes_summary + notes_summary = grading.get("user_notes_summary", {}) + notes = [] + notes.extend(notes_summary.get("uncertainties", [])) + notes.extend(notes_summary.get("needs_review", [])) + notes.extend(notes_summary.get("workarounds", [])) + result["notes"] = notes + + results[config].append(result) + + return results + + +def aggregate_results(results: dict) -> dict: + """ + Aggregate run results into summary statistics. + + Returns run_summary with stats for each configuration and delta. 
+ """ + run_summary = {} + configs = list(results.keys()) + + for config in configs: + runs = results.get(config, []) + + if not runs: + run_summary[config] = { + "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}, + "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}, + "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0} + } + continue + + pass_rates = [r["pass_rate"] for r in runs] + times = [r["time_seconds"] for r in runs] + tokens = [r.get("tokens", 0) for r in runs] + + run_summary[config] = { + "pass_rate": calculate_stats(pass_rates), + "time_seconds": calculate_stats(times), + "tokens": calculate_stats(tokens) + } + + # Calculate delta between the first two configs (if two exist) + if len(configs) >= 2: + primary = run_summary.get(configs[0], {}) + baseline = run_summary.get(configs[1], {}) + else: + primary = run_summary.get(configs[0], {}) if configs else {} + baseline = {} + + delta_pass_rate = primary.get("pass_rate", {}).get("mean", 0) - baseline.get("pass_rate", {}).get("mean", 0) + delta_time = primary.get("time_seconds", {}).get("mean", 0) - baseline.get("time_seconds", {}).get("mean", 0) + delta_tokens = primary.get("tokens", {}).get("mean", 0) - baseline.get("tokens", {}).get("mean", 0) + + run_summary["delta"] = { + "pass_rate": f"{delta_pass_rate:+.2f}", + "time_seconds": f"{delta_time:+.1f}", + "tokens": f"{delta_tokens:+.0f}" + } + + return run_summary + + +def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict: + """ + Generate complete benchmark.json from run results. + """ + results = load_run_results(benchmark_dir) + run_summary = aggregate_results(results) + + # Build runs array for benchmark.json + runs = [] + for config in results: + for result in results[config]: + runs.append({ + "eval_id": result["eval_id"], + "configuration": config, + "run_number": result["run_number"], + "result": { + "pass_rate": result["pass_rate"], + "passed": result["passed"], + "failed": result["failed"], + "total": result["total"], + "time_seconds": result["time_seconds"], + "tokens": result.get("tokens", 0), + "tool_calls": result.get("tool_calls", 0), + "errors": result.get("errors", 0) + }, + "expectations": result["expectations"], + "notes": result["notes"] + }) + + # Determine eval IDs from results + eval_ids = sorted(set( + r["eval_id"] + for config in results.values() + for r in config + )) + + benchmark = { + "metadata": { + "skill_name": skill_name or "", + "skill_path": skill_path or "", + "executor_model": "", + "analyzer_model": "", + "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + "evals_run": eval_ids, + "runs_per_configuration": 3 + }, + "runs": runs, + "run_summary": run_summary, + "notes": [] # To be filled by analyzer + } + + return benchmark + + +def generate_markdown(benchmark: dict) -> str: + """Generate human-readable benchmark.md from benchmark data.""" + metadata = benchmark["metadata"] + run_summary = benchmark["run_summary"] + + # Determine config names (excluding "delta") + configs = [k for k in run_summary if k != "delta"] + config_a = configs[0] if len(configs) >= 1 else "config_a" + config_b = configs[1] if len(configs) >= 2 else "config_b" + label_a = config_a.replace("_", " ").title() + label_b = config_b.replace("_", " ").title() + + lines = [ + f"# Skill Benchmark: {metadata['skill_name']}", + "", + f"**Model**: {metadata['executor_model']}", + f"**Date**: {metadata['timestamp']}", + f"**Evals**: {', '.join(map(str, metadata['evals_run']))} 
({metadata['runs_per_configuration']} runs each per configuration)", + "", + "## Summary", + "", + f"| Metric | {label_a} | {label_b} | Delta |", + "|--------|------------|---------------|-------|", + ] + + a_summary = run_summary.get(config_a, {}) + b_summary = run_summary.get(config_b, {}) + delta = run_summary.get("delta", {}) + + # Format pass rate + a_pr = a_summary.get("pass_rate", {}) + b_pr = b_summary.get("pass_rate", {}) + lines.append(f"| Pass Rate | {a_pr.get('mean', 0)*100:.0f}% ± {a_pr.get('stddev', 0)*100:.0f}% | {b_pr.get('mean', 0)*100:.0f}% ± {b_pr.get('stddev', 0)*100:.0f}% | {delta.get('pass_rate', '—')} |") + + # Format time + a_time = a_summary.get("time_seconds", {}) + b_time = b_summary.get("time_seconds", {}) + lines.append(f"| Time | {a_time.get('mean', 0):.1f}s ± {a_time.get('stddev', 0):.1f}s | {b_time.get('mean', 0):.1f}s ± {b_time.get('stddev', 0):.1f}s | {delta.get('time_seconds', '—')}s |") + + # Format tokens + a_tokens = a_summary.get("tokens", {}) + b_tokens = b_summary.get("tokens", {}) + lines.append(f"| Tokens | {a_tokens.get('mean', 0):.0f} ± {a_tokens.get('stddev', 0):.0f} | {b_tokens.get('mean', 0):.0f} ± {b_tokens.get('stddev', 0):.0f} | {delta.get('tokens', '—')} |") + + # Notes section + if benchmark.get("notes"): + lines.extend([ + "", + "## Notes", + "" + ]) + for note in benchmark["notes"]: + lines.append(f"- {note}") + + return "\n".join(lines) + + +def main(): + parser = argparse.ArgumentParser( + description="Aggregate benchmark run results into summary statistics" + ) + parser.add_argument( + "benchmark_dir", + type=Path, + help="Path to the benchmark directory" + ) + parser.add_argument( + "--skill-name", + default="", + help="Name of the skill being benchmarked" + ) + parser.add_argument( + "--skill-path", + default="", + help="Path to the skill being benchmarked" + ) + parser.add_argument( + "--output", "-o", + type=Path, + help="Output path for benchmark.json (default: /benchmark.json)" + ) + + args = parser.parse_args() + + if not args.benchmark_dir.exists(): + logger.error(f"Directory not found: {args.benchmark_dir}") + sys.exit(1) + + # Generate benchmark + benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path) + + # Determine output paths + output_json = args.output or (args.benchmark_dir / "benchmark.json") + output_md = output_json.with_suffix(".md") + + # Write benchmark.json + with open(output_json, "w") as f: + json.dump(benchmark, f, indent=2) + logger.info(f"Generated: {output_json}") + + # Write benchmark.md + markdown = generate_markdown(benchmark) + with open(output_md, "w") as f: + f.write(markdown) + logger.info(f"Generated: {output_md}") + + # Print summary + run_summary = benchmark["run_summary"] + configs = [k for k in run_summary if k != "delta"] + delta = run_summary.get("delta", {}) + + logger.info(f"\nSummary:") + for config in configs: + pr = run_summary[config]["pass_rate"]["mean"] + label = config.replace("_", " ").title() + logger.info(f" {label}: {pr*100:.1f}% pass rate") + logger.info(f" Delta: {delta.get('pass_rate', '—')}") + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/generate_report.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/generate_report.py new file mode 100644 index 000000000..395232d96 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/generate_report.py @@ -0,0 +1,328 @@ 
+#!/usr/bin/env python3 +"""Generate an HTML report from run_loop.py output. + +Takes the JSON output from run_loop.py and generates a visual HTML report +showing each description attempt with check/x for each test case. +Distinguishes between train and test queries. +""" + +import argparse +import html +import json +import sys +from pathlib import Path + +from loguru import logger + + +def generate_html(data: dict, auto_refresh: bool = False, skill_name: str = "") -> str: + """Generate HTML report from loop output data. If auto_refresh is True, adds a meta refresh tag.""" + history = data.get("history", []) + holdout = data.get("holdout", 0) + title_prefix = html.escape(skill_name + " \u2014 ") if skill_name else "" + + # Get all unique queries from train and test sets, with should_trigger info + train_queries: list[dict] = [] + test_queries: list[dict] = [] + if history: + for r in history[0].get("train_results", history[0].get("results", [])): + train_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)}) + if history[0].get("test_results"): + for r in history[0].get("test_results", []): + test_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)}) + + refresh_tag = ' \n' if auto_refresh else "" + + html_parts = [""" + + + +""" + refresh_tag + """ """ + title_prefix + """Skill Description Optimization + + + + + + +

""" + title_prefix + """Skill Description Optimization

+
+
 Optimizing your skill's description. This page updates automatically as Claude tests different versions of your skill's description. Each row is an iteration — a new description attempt. The columns show test queries: green checkmarks mean the skill triggered correctly (or correctly didn't trigger), and red crosses mean it got it wrong. The "Train" score shows performance on queries used to improve the description; the "Test" score shows performance on held-out queries the optimizer hasn't seen. When it's done, Claude will apply the best-performing description to your skill.
+"""] + + # Summary section + best_test_score = data.get('best_test_score') + best_train_score = data.get('best_train_score') + html_parts.append(f""" +
+

Original: {html.escape(data.get('original_description', 'N/A'))}

+

Best: {html.escape(data.get('best_description', 'N/A'))}

+

Best Score: {data.get('best_score', 'N/A')} {'(test)' if best_test_score else '(train)'}

+

Iterations: {data.get('iterations_run', 0)} | Train: {data.get('train_size', '?')} | Test: {data.get('test_size', '?')}

+
+""") + + # Legend + html_parts.append(""" +
+ Query columns: + Should trigger + Should NOT trigger + Train + Test +
+""") + + # Table header + html_parts.append(""" +
+ + + + + + + +""") + + # Add column headers for train queries + for qinfo in train_queries: + polarity = "positive-col" if qinfo["should_trigger"] else "negative-col" + html_parts.append(f' \n') + + # Add column headers for test queries (different color) + for qinfo in test_queries: + polarity = "positive-col" if qinfo["should_trigger"] else "negative-col" + html_parts.append(f' \n') + + html_parts.append(""" + + +""") + + # Find best iteration for highlighting + if test_queries: + best_iter = max(history, key=lambda h: h.get("test_passed") or 0).get("iteration") + else: + best_iter = max(history, key=lambda h: h.get("train_passed", h.get("passed", 0))).get("iteration") + + # Add rows for each iteration + for h in history: + iteration = h.get("iteration", "?") + train_passed = h.get("train_passed", h.get("passed", 0)) + train_total = h.get("train_total", h.get("total", 0)) + test_passed = h.get("test_passed") + test_total = h.get("test_total") + description = h.get("description", "") + train_results = h.get("train_results", h.get("results", [])) + test_results = h.get("test_results", []) + + # Create lookups for results by query + train_by_query = {r["query"]: r for r in train_results} + test_by_query = {r["query"]: r for r in test_results} if test_results else {} + + # Compute aggregate correct/total runs across all retries + def aggregate_runs(results: list[dict]) -> tuple[int, int]: + correct = 0 + total = 0 + for r in results: + runs = r.get("runs", 0) + triggers = r.get("triggers", 0) + total += runs + if r.get("should_trigger", True): + correct += triggers + else: + correct += runs - triggers + return correct, total + + train_correct, train_runs = aggregate_runs(train_results) + test_correct, test_runs = aggregate_runs(test_results) + + # Determine score classes + def score_class(correct: int, total: int) -> str: + if total > 0: + ratio = correct / total + if ratio >= 0.8: + return "score-good" + elif ratio >= 0.5: + return "score-ok" + return "score-bad" + + train_class = score_class(train_correct, train_runs) + test_class = score_class(test_correct, test_runs) + + row_class = "best-row" if iteration == best_iter else "" + + html_parts.append(f""" + + + + +""") + + # Add result for each train query + for qinfo in train_queries: + r = train_by_query.get(qinfo["query"], {}) + did_pass = r.get("pass", False) + triggers = r.get("triggers", 0) + runs = r.get("runs", 0) + + icon = "✓" if did_pass else "✗" + css_class = "pass" if did_pass else "fail" + + html_parts.append(f' \n') + + # Add result for each test query (with different background) + for qinfo in test_queries: + r = test_by_query.get(qinfo["query"], {}) + did_pass = r.get("pass", False) + triggers = r.get("triggers", 0) + runs = r.get("runs", 0) + + icon = "✓" if did_pass else "✗" + css_class = "pass" if did_pass else "fail" + + html_parts.append(f' \n') + + html_parts.append(" \n") + + html_parts.append(""" +
IterTrainTestDescription{html.escape(qinfo["query"])}{html.escape(qinfo["query"])}
{iteration}{train_correct}/{train_runs}{test_correct}/{test_runs}{html.escape(description)}{icon}{triggers}/{runs}{icon}{triggers}/{runs}
+
+""") + + html_parts.append(""" + + +""") + + return "".join(html_parts) + + +def main(): + parser = argparse.ArgumentParser(description="Generate HTML report from run_loop output") + parser.add_argument("input", help="Path to JSON output from run_loop.py (or - for stdin)") + parser.add_argument("-o", "--output", default=None, help="Output HTML file (default: stdout)") + parser.add_argument("--skill-name", default="", help="Skill name to include in the report title") + args = parser.parse_args() + + if args.input == "-": + data = json.load(sys.stdin) + else: + data = json.loads(Path(args.input).read_text()) + + html_output = generate_html(data, skill_name=args.skill_name) + + if args.output: + Path(args.output).write_text(html_output) + logger.info(f"Report written to {args.output}") + else: + print(html_output) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/improve_description.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/improve_description.py new file mode 100644 index 000000000..887a06a08 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/improve_description.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 +"""Improve a skill description based on eval results. + +Takes eval results (from run_eval.py) and generates an improved description +using Claude with extended thinking. +""" + +import argparse +import json +import re +import sys +from pathlib import Path + +import anthropic +from loguru import logger + +from scripts.utils import parse_skill_md + + +def improve_description( + client: anthropic.Anthropic, + skill_name: str, + skill_content: str, + current_description: str, + eval_results: dict, + history: list[dict], + model: str, + test_results: dict | None = None, + log_dir: Path | None = None, + iteration: int | None = None, +) -> str: + """Call Claude to improve the description based on eval results.""" + failed_triggers = [ + r for r in eval_results["results"] + if r["should_trigger"] and not r["pass"] + ] + false_triggers = [ + r for r in eval_results["results"] + if not r["should_trigger"] and not r["pass"] + ] + + # Build scores summary + train_score = f"{eval_results['summary']['passed']}/{eval_results['summary']['total']}" + if test_results: + test_score = f"{test_results['summary']['passed']}/{test_results['summary']['total']}" + scores_summary = f"Train: {train_score}, Test: {test_score}" + else: + scores_summary = f"Train: {train_score}" + + prompt = f"""You are optimizing a skill description for a Claude Code skill called "{skill_name}". A "skill" is sort of like a prompt, but with progressive disclosure -- there's a title and description that Claude sees when deciding whether to use the skill, and then if it does use the skill, it reads the .md file which has lots more details and potentially links to other resources in the skill folder like helper files and scripts and additional documentation or examples. + +The description appears in Claude's "available_skills" list. When a user sends a query, Claude decides whether to invoke the skill based solely on the title and on this description. Your goal is to write a description that triggers for relevant queries, and doesn't trigger for irrelevant ones. 
+ +Here's the current description: + +"{current_description}" + + +Current scores ({scores_summary}): + +""" + if failed_triggers: + prompt += "FAILED TO TRIGGER (should have triggered but didn't):\n" + for r in failed_triggers: + prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n' + prompt += "\n" + + if false_triggers: + prompt += "FALSE TRIGGERS (triggered but shouldn't have):\n" + for r in false_triggers: + prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n' + prompt += "\n" + + if history: + prompt += "PREVIOUS ATTEMPTS (do NOT repeat these — try something structurally different):\n\n" + for h in history: + train_s = f"{h.get('train_passed', h.get('passed', 0))}/{h.get('train_total', h.get('total', 0))}" + test_s = f"{h.get('test_passed', '?')}/{h.get('test_total', '?')}" if h.get('test_passed') is not None else None + score_str = f"train={train_s}" + (f", test={test_s}" if test_s else "") + prompt += f'\n' + prompt += f'Description: "{h["description"]}"\n' + if "results" in h: + prompt += "Train results:\n" + for r in h["results"]: + status = "PASS" if r["pass"] else "FAIL" + prompt += f' [{status}] "{r["query"][:80]}" (triggered {r["triggers"]}/{r["runs"]})\n' + if h.get("note"): + prompt += f'Note: {h["note"]}\n' + prompt += "\n\n" + + prompt += f""" + +Skill content (for context on what the skill does): + +{skill_content} + + +Based on the failures, write a new and improved description that is more likely to trigger correctly. When I say "based on the failures", it's a bit of a tricky line to walk because we don't want to overfit to the specific cases you're seeing. So what I DON'T want you to do is produce an ever-expanding list of specific queries that this skill should or shouldn't trigger for. Instead, try to generalize from the failures to broader categories of user intent and situations where this skill would be useful or not useful. The reason for this is twofold: + +1. Avoid overfitting +2. The list might get loooong and it's injected into ALL queries and there might be a lot of skills, so we don't want to blow too much space on any given description. + +Concretely, your description should not be more than about 100-200 words, even if that comes at the cost of accuracy. + +Here are some tips that we've found to work well in writing these descriptions: +- The skill should be phrased in the imperative -- "Use this skill for" rather than "this skill does" +- The skill description should focus on the user's intent, what they are trying to achieve, vs. the implementation details of how the skill works. +- The description competes with other skills for Claude's attention — make it distinctive and immediately recognizable. +- If you're getting lots of failures after repeated attempts, change things up. Try different sentence structures or wordings. + +I'd encourage you to be creative and mix up the style in different iterations since you'll have multiple opportunities to try different approaches and we'll just grab the highest-scoring one at the end. 
+ +Please respond with only the new description text in tags, nothing else.""" + + response = client.messages.create( + model=model, + max_tokens=16000, + thinking={ + "type": "enabled", + "budget_tokens": 10000, + }, + messages=[{"role": "user", "content": prompt}], + ) + + # Extract thinking and text from response + thinking_text = "" + text = "" + for block in response.content: + if block.type == "thinking": + thinking_text = block.thinking + elif block.type == "text": + text = block.text + + # Parse out the tags + match = re.search(r"(.*?)", text, re.DOTALL) + description = match.group(1).strip().strip('"') if match else text.strip().strip('"') + + # Log the transcript + transcript: dict = { + "iteration": iteration, + "prompt": prompt, + "thinking": thinking_text, + "response": text, + "parsed_description": description, + "char_count": len(description), + "over_limit": len(description) > 1024, + } + + # If over 1024 chars, ask the model to shorten it + if len(description) > 1024: + shorten_prompt = f"Your description is {len(description)} characters, which exceeds the hard 1024 character limit. Please rewrite it to be under 1024 characters while preserving the most important trigger words and intent coverage. Respond with only the new description in tags." + shorten_response = client.messages.create( + model=model, + max_tokens=16000, + thinking={ + "type": "enabled", + "budget_tokens": 10000, + }, + messages=[ + {"role": "user", "content": prompt}, + {"role": "assistant", "content": text}, + {"role": "user", "content": shorten_prompt}, + ], + ) + + shorten_thinking = "" + shorten_text = "" + for block in shorten_response.content: + if block.type == "thinking": + shorten_thinking = block.thinking + elif block.type == "text": + shorten_text = block.text + + match = re.search(r"(.*?)", shorten_text, re.DOTALL) + shortened = match.group(1).strip().strip('"') if match else shorten_text.strip().strip('"') + + transcript["rewrite_prompt"] = shorten_prompt + transcript["rewrite_thinking"] = shorten_thinking + transcript["rewrite_response"] = shorten_text + transcript["rewrite_description"] = shortened + transcript["rewrite_char_count"] = len(shortened) + description = shortened + + transcript["final_description"] = description + + if log_dir: + log_dir.mkdir(parents=True, exist_ok=True) + log_file = log_dir / f"improve_iter_{iteration or 'unknown'}.json" + log_file.write_text(json.dumps(transcript, indent=2)) + + return description + + +def main(): + parser = argparse.ArgumentParser(description="Improve a skill description based on eval results") + parser.add_argument("--eval-results", required=True, help="Path to eval results JSON (from run_eval.py)") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--history", default=None, help="Path to history JSON (previous attempts)") + parser.add_argument("--model", required=True, help="Model for improvement") + parser.add_argument("--verbose", action="store_true", help="Print thinking to stderr") + args = parser.parse_args() + + skill_path = Path(args.skill_path) + if not (skill_path / "SKILL.md").exists(): + logger.error(f"Error: No SKILL.md found at {skill_path}") + sys.exit(1) + + eval_results = json.loads(Path(args.eval_results).read_text()) + history = [] + if args.history: + history = json.loads(Path(args.history).read_text()) + + name, _, content = parse_skill_md(skill_path) + current_description = eval_results["description"] + + if args.verbose: + logger.info(f"Current: 
{current_description}") + logger.info(f"Score: {eval_results['summary']['passed']}/{eval_results['summary']['total']}") + + client = anthropic.Anthropic() + new_description = improve_description( + client=client, + skill_name=name, + skill_content=content, + current_description=current_description, + eval_results=eval_results, + history=history, + model=args.model, + ) + + if args.verbose: + logger.info(f"Improved: {new_description}") + + # Output as JSON with both the new description and updated history + output = { + "description": new_description, + "history": history + [{ + "description": current_description, + "passed": eval_results["summary"]["passed"], + "failed": eval_results["summary"]["failed"], + "total": eval_results["summary"]["total"], + "results": eval_results["results"], + }], + } + print(json.dumps(output, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/package_skill.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/package_skill.py new file mode 100644 index 000000000..5dbdf7843 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/package_skill.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +""" +Skill Packager - Creates a distributable .skill file of a skill folder + +Usage: + python utils/package_skill.py [output-directory] + +Example: + python utils/package_skill.py skills/public/my-skill + python utils/package_skill.py skills/public/my-skill ./dist +""" + +import fnmatch +import sys +import zipfile +from pathlib import Path + +from loguru import logger +from scripts.quick_validate import validate_skill + +# Patterns to exclude when packaging skills. +EXCLUDE_DIRS = {"__pycache__", "node_modules"} +EXCLUDE_GLOBS = {"*.pyc"} +EXCLUDE_FILES = {".DS_Store"} +# Directories excluded only at the skill root (not when nested deeper). +ROOT_EXCLUDE_DIRS = {"evals"} + + +def should_exclude(rel_path: Path) -> bool: + """Check if a path should be excluded from packaging.""" + parts = rel_path.parts + if any(part in EXCLUDE_DIRS for part in parts): + return True + # rel_path is relative to skill_path.parent, so parts[0] is the skill + # folder name and parts[1] (if present) is the first subdir. + if len(parts) > 1 and parts[1] in ROOT_EXCLUDE_DIRS: + return True + name = rel_path.name + if name in EXCLUDE_FILES: + return True + return any(fnmatch.fnmatch(name, pat) for pat in EXCLUDE_GLOBS) + + +def package_skill(skill_path, output_dir=None): + """ + Package a skill folder into a .skill file. 
+ + Args: + skill_path: Path to the skill folder + output_dir: Optional output directory for the .skill file (defaults to current directory) + + Returns: + Path to the created .skill file, or None if error + """ + skill_path = Path(skill_path).resolve() + + # Validate skill folder exists + if not skill_path.exists(): + logger.error(f"Skill folder not found: {skill_path}") + return None + + if not skill_path.is_dir(): + logger.error(f"Path is not a directory: {skill_path}") + return None + + # Validate SKILL.md exists + skill_md = skill_path / "SKILL.md" + if not skill_md.exists(): + logger.error(f"SKILL.md not found in {skill_path}") + return None + + # Run validation before packaging + logger.info("Validating skill...") + valid, message = validate_skill(skill_path) + if not valid: + logger.error(f"Validation failed: {message}") + logger.error("Please fix the validation errors before packaging.") + return None + logger.info(f"{message}\n") + + # Determine output location + skill_name = skill_path.name + if output_dir: + output_path = Path(output_dir).resolve() + output_path.mkdir(parents=True, exist_ok=True) + else: + output_path = Path.cwd() + + skill_filename = output_path / f"{skill_name}.skill" + + # Create the .skill file (zip format) + try: + with zipfile.ZipFile(skill_filename, 'w', zipfile.ZIP_DEFLATED) as zipf: + # Walk through the skill directory, excluding build artifacts + for file_path in skill_path.rglob('*'): + if not file_path.is_file(): + continue + arcname = file_path.relative_to(skill_path.parent) + if should_exclude(arcname): + logger.debug(f"Skipped: {arcname}") + continue + zipf.write(file_path, arcname) + logger.debug(f"Added: {arcname}") + + logger.info(f"Successfully packaged skill to: {skill_filename}") + return skill_filename + + except Exception as e: + logger.error(f"Error creating .skill file: {e}") + return None + + +def main(): + if len(sys.argv) < 2: + logger.info("Usage: python utils/package_skill.py [output-directory]") + logger.info("\nExample:") + logger.info(" python utils/package_skill.py skills/public/my-skill") + logger.info(" python utils/package_skill.py skills/public/my-skill ./dist") + sys.exit(1) + + skill_path = sys.argv[1] + output_dir = sys.argv[2] if len(sys.argv) > 2 else None + + logger.info(f"Packaging skill: {skill_path}") + if output_dir: + logger.info(f" Output directory: {output_dir}") + logger.info("") + + result = package_skill(skill_path, output_dir) + + if result: + sys.exit(0) + else: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/quick_validate.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/quick_validate.py new file mode 100644 index 000000000..36553161e --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/quick_validate.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +""" +Quick validation script for skills - minimal version +""" + +import sys +import os +import re +import yaml +from pathlib import Path + +from loguru import logger + +def validate_skill(skill_path): + """Basic validation of a skill""" + skill_path = Path(skill_path) + + # Check SKILL.md exists + skill_md = skill_path / 'SKILL.md' + if not skill_md.exists(): + return False, "SKILL.md not found" + + # Read and validate frontmatter + content = skill_md.read_text() + if not content.startswith('---'): + return False, "No YAML frontmatter found" + + # Extract 
frontmatter + match = re.match(r'^---\n(.*?)\n---', content, re.DOTALL) + if not match: + return False, "Invalid frontmatter format" + + frontmatter_text = match.group(1) + + # Parse YAML frontmatter + try: + frontmatter = yaml.safe_load(frontmatter_text) + if not isinstance(frontmatter, dict): + return False, "Frontmatter must be a YAML dictionary" + except yaml.YAMLError as e: + return False, f"Invalid YAML in frontmatter: {e}" + + # Define allowed properties + ALLOWED_PROPERTIES = {'name', 'description', 'license', 'allowed-tools', 'metadata', 'compatibility'} + + # Check for unexpected properties (excluding nested keys under metadata) + unexpected_keys = set(frontmatter.keys()) - ALLOWED_PROPERTIES + if unexpected_keys: + return False, ( + f"Unexpected key(s) in SKILL.md frontmatter: {', '.join(sorted(unexpected_keys))}. " + f"Allowed properties are: {', '.join(sorted(ALLOWED_PROPERTIES))}" + ) + + # Check required fields + if 'name' not in frontmatter: + return False, "Missing 'name' in frontmatter" + if 'description' not in frontmatter: + return False, "Missing 'description' in frontmatter" + + # Extract name for validation + name = frontmatter.get('name', '') + if not isinstance(name, str): + return False, f"Name must be a string, got {type(name).__name__}" + name = name.strip() + if name: + # Check naming convention (kebab-case: lowercase with hyphens) + if not re.match(r'^[a-z0-9-]+$', name): + return False, f"Name '{name}' should be kebab-case (lowercase letters, digits, and hyphens only)" + if name.startswith('-') or name.endswith('-') or '--' in name: + return False, f"Name '{name}' cannot start/end with hyphen or contain consecutive hyphens" + # Check name length (max 64 characters per spec) + if len(name) > 64: + return False, f"Name is too long ({len(name)} characters). Maximum is 64 characters." + + # Extract and validate description + description = frontmatter.get('description', '') + if not isinstance(description, str): + return False, f"Description must be a string, got {type(description).__name__}" + description = description.strip() + if description: + # Check for angle brackets + if '<' in description or '>' in description: + return False, "Description cannot contain angle brackets (< or >)" + # Check description length (max 1024 characters per spec) + if len(description) > 1024: + return False, f"Description is too long ({len(description)} characters). Maximum is 1024 characters." + + # Validate compatibility field if present (optional) + compatibility = frontmatter.get('compatibility', '') + if compatibility: + if not isinstance(compatibility, str): + return False, f"Compatibility must be a string, got {type(compatibility).__name__}" + if len(compatibility) > 500: + return False, f"Compatibility is too long ({len(compatibility)} characters). Maximum is 500 characters." + + return True, "Skill is valid!" 
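+
+# Illustrative usage sketch (the skill path below is hypothetical): other scripts
+# can import validate_skill and fail fast before packaging or publishing, the same
+# way package_skill.py does:
+#
+#   from scripts.quick_validate import validate_skill
+#
+#   ok, msg = validate_skill("skills/public/my-skill")
+#   if not ok:
+#       raise SystemExit(f"Validation failed: {msg}")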
+ +if __name__ == "__main__": + if len(sys.argv) != 2: + logger.info("Usage: python quick_validate.py ") + sys.exit(1) + + valid, message = validate_skill(sys.argv[1]) + if valid: + logger.info(message) + else: + logger.error(message) + sys.exit(0 if valid else 1) \ No newline at end of file diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/run_eval.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/run_eval.py new file mode 100644 index 000000000..f923066ca --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/run_eval.py @@ -0,0 +1,312 @@ +#!/usr/bin/env python3 +"""Run trigger evaluation for a skill description. + +Tests whether a skill's description causes Claude to trigger (read the skill) +for a set of queries. Outputs results as JSON. +""" + +import argparse +import json +import os +import select +import subprocess +import sys +import time +import uuid +from concurrent.futures import ProcessPoolExecutor, as_completed +from pathlib import Path + +from loguru import logger + +from scripts.utils import parse_skill_md + + +def find_project_root() -> Path: + """Find the project root by walking up from cwd looking for .claude/. + + Mimics how Claude Code discovers its project root, so the command file + we create ends up where claude -p will look for it. + """ + current = Path.cwd() + for parent in [current, *current.parents]: + if (parent / ".claude").is_dir(): + return parent + return current + + +def run_single_query( + query: str, + skill_name: str, + skill_description: str, + timeout: int, + project_root: str, + model: str | None = None, +) -> bool: + """Run a single query and return whether the skill was triggered. + + Creates a command file in .claude/commands/ so it appears in Claude's + available_skills list, then runs `claude -p` with the raw query. + Uses --include-partial-messages to detect triggering early from + stream events (content_block_start) rather than waiting for the + full assistant message, which only arrives after tool execution. + """ + unique_id = uuid.uuid4().hex[:8] + clean_name = f"{skill_name}-skill-{unique_id}" + project_commands_dir = Path(project_root) / ".claude" / "commands" + command_file = project_commands_dir / f"{clean_name}.md" + + try: + project_commands_dir.mkdir(parents=True, exist_ok=True) + # Use YAML block scalar to avoid breaking on quotes in description + indented_desc = "\n ".join(skill_description.split("\n")) + command_content = ( + f"---\n" + f"description: |\n" + f" {indented_desc}\n" + f"---\n\n" + f"# {skill_name}\n\n" + f"This skill handles: {skill_description}\n" + ) + command_file.write_text(command_content) + + cmd = [ + "claude", + "-p", query, + "--output-format", "stream-json", + "--verbose", + "--include-partial-messages", + ] + if model: + cmd.extend(["--model", model]) + + # Remove CLAUDECODE env var to allow nesting claude -p inside a + # Claude Code session. The guard is for interactive terminal conflicts; + # programmatic subprocess usage is safe. 
+ env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + cwd=project_root, + env=env, + ) + + triggered = False + start_time = time.time() + buffer = "" + # Track state for stream event detection + pending_tool_name = None + accumulated_json = "" + + try: + while time.time() - start_time < timeout: + if process.poll() is not None: + remaining = process.stdout.read() + if remaining: + buffer += remaining.decode("utf-8", errors="replace") + break + + ready, _, _ = select.select([process.stdout], [], [], 1.0) + if not ready: + continue + + chunk = os.read(process.stdout.fileno(), 8192) + if not chunk: + break + buffer += chunk.decode("utf-8", errors="replace") + + while "\n" in buffer: + line, buffer = buffer.split("\n", 1) + line = line.strip() + if not line: + continue + + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + + # Early detection via stream events + if event.get("type") == "stream_event": + se = event.get("event", {}) + se_type = se.get("type", "") + + if se_type == "content_block_start": + cb = se.get("content_block", {}) + if cb.get("type") == "tool_use": + tool_name = cb.get("name", "") + if tool_name in ("Skill", "Read"): + pending_tool_name = tool_name + accumulated_json = "" + else: + return False + + elif se_type == "content_block_delta" and pending_tool_name: + delta = se.get("delta", {}) + if delta.get("type") == "input_json_delta": + accumulated_json += delta.get("partial_json", "") + if clean_name in accumulated_json: + return True + + elif se_type in ("content_block_stop", "message_stop"): + if pending_tool_name: + return clean_name in accumulated_json + if se_type == "message_stop": + return False + + # Fallback: full assistant message + elif event.get("type") == "assistant": + message = event.get("message", {}) + for content_item in message.get("content", []): + if content_item.get("type") != "tool_use": + continue + tool_name = content_item.get("name", "") + tool_input = content_item.get("input", {}) + if tool_name == "Skill" and clean_name in tool_input.get("skill", ""): + triggered = True + elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""): + triggered = True + return triggered + + elif event.get("type") == "result": + return triggered + finally: + # Clean up process on any exit path (return, exception, timeout) + if process.poll() is None: + process.kill() + process.wait() + + return triggered + finally: + if command_file.exists(): + command_file.unlink() + + +def run_eval( + eval_set: list[dict], + skill_name: str, + description: str, + num_workers: int, + timeout: int, + project_root: Path, + runs_per_query: int = 1, + trigger_threshold: float = 0.5, + model: str | None = None, +) -> dict: + """Run the full eval set and return results.""" + results = [] + + with ProcessPoolExecutor(max_workers=num_workers) as executor: + future_to_info = {} + for item in eval_set: + for run_idx in range(runs_per_query): + future = executor.submit( + run_single_query, + item["query"], + skill_name, + description, + timeout, + str(project_root), + model, + ) + future_to_info[future] = (item, run_idx) + + query_triggers: dict[str, list[bool]] = {} + query_items: dict[str, dict] = {} + for future in as_completed(future_to_info): + item, _ = future_to_info[future] + query = item["query"] + query_items[query] = item + if query not in query_triggers: + query_triggers[query] = [] + try: + 
query_triggers[query].append(future.result()) + except Exception as e: + logger.warning(f"Warning: query failed: {e}") + query_triggers[query].append(False) + + for query, triggers in query_triggers.items(): + item = query_items[query] + trigger_rate = sum(triggers) / len(triggers) + should_trigger = item["should_trigger"] + if should_trigger: + did_pass = trigger_rate >= trigger_threshold + else: + did_pass = trigger_rate < trigger_threshold + results.append({ + "query": query, + "should_trigger": should_trigger, + "trigger_rate": trigger_rate, + "triggers": sum(triggers), + "runs": len(triggers), + "pass": did_pass, + }) + + passed = sum(1 for r in results if r["pass"]) + total = len(results) + + return { + "skill_name": skill_name, + "description": description, + "results": results, + "summary": { + "total": total, + "passed": passed, + "failed": total - passed, + }, + } + + +def main(): + parser = argparse.ArgumentParser(description="Run trigger evaluation for a skill description") + parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--description", default=None, help="Override description to test") + parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") + parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds") + parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query") + parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold") + parser.add_argument("--model", default=None, help="Model to use for claude -p (default: user's configured model)") + parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") + args = parser.parse_args() + + eval_set = json.loads(Path(args.eval_set).read_text()) + skill_path = Path(args.skill_path) + + if not (skill_path / "SKILL.md").exists(): + logger.error(f"Error: No SKILL.md found at {skill_path}") + sys.exit(1) + + name, original_description, content = parse_skill_md(skill_path) + description = args.description or original_description + project_root = find_project_root() + + if args.verbose: + logger.info(f"Evaluating: {description}") + + output = run_eval( + eval_set=eval_set, + skill_name=name, + description=description, + num_workers=args.num_workers, + timeout=args.timeout, + project_root=project_root, + runs_per_query=args.runs_per_query, + trigger_threshold=args.trigger_threshold, + model=args.model, + ) + + if args.verbose: + summary = output["summary"] + logger.info(f"Results: {summary['passed']}/{summary['total']} passed") + for r in output["results"]: + status = "PASS" if r["pass"] else "FAIL" + rate_str = f"{r['triggers']}/{r['runs']}" + logger.info(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:70]}") + + print(json.dumps(output, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/run_loop.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/run_loop.py new file mode 100644 index 000000000..a2907d6e0 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/run_loop.py @@ -0,0 +1,333 @@ +#!/usr/bin/env python3 +"""Run the eval + improve loop until all pass or max iterations reached. 
+ +Combines run_eval.py and improve_description.py in a loop, tracking history +and returning the best description found. Supports train/test split to prevent +overfitting. +""" + +import argparse +import json +import random +import sys +import tempfile +import time +import webbrowser +from pathlib import Path + +import anthropic +from loguru import logger + +from scripts.generate_report import generate_html +from scripts.improve_description import improve_description +from scripts.run_eval import find_project_root, run_eval +from scripts.utils import parse_skill_md + + +def split_eval_set(eval_set: list[dict], holdout: float, seed: int = 42) -> tuple[list[dict], list[dict]]: + """Split eval set into train and test sets, stratified by should_trigger.""" + random.seed(seed) + + # Separate by should_trigger + trigger = [e for e in eval_set if e["should_trigger"]] + no_trigger = [e for e in eval_set if not e["should_trigger"]] + + # Shuffle each group + random.shuffle(trigger) + random.shuffle(no_trigger) + + # Calculate split points + n_trigger_test = max(1, int(len(trigger) * holdout)) + n_no_trigger_test = max(1, int(len(no_trigger) * holdout)) + + # Split + test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test] + train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:] + + return train_set, test_set + + +def run_loop( + eval_set: list[dict], + skill_path: Path, + description_override: str | None, + num_workers: int, + timeout: int, + max_iterations: int, + runs_per_query: int, + trigger_threshold: float, + holdout: float, + model: str, + verbose: bool, + live_report_path: Path | None = None, + log_dir: Path | None = None, +) -> dict: + """Run the eval + improvement loop.""" + project_root = find_project_root() + name, original_description, content = parse_skill_md(skill_path) + current_description = description_override or original_description + + # Split into train/test if holdout > 0 + if holdout > 0: + train_set, test_set = split_eval_set(eval_set, holdout) + if verbose: + logger.info(f"Split: {len(train_set)} train, {len(test_set)} test (holdout={holdout})") + else: + train_set = eval_set + test_set = [] + + client = anthropic.Anthropic() + history = [] + exit_reason = "unknown" + + for iteration in range(1, max_iterations + 1): + if verbose: + logger.info(f"\n{'='*60}") + logger.info(f"Iteration {iteration}/{max_iterations}") + logger.info(f"Description: {current_description}") + logger.info(f"{'='*60}") + + # Evaluate train + test together in one batch for parallelism + all_queries = train_set + test_set + t0 = time.time() + all_results = run_eval( + eval_set=all_queries, + skill_name=name, + description=current_description, + num_workers=num_workers, + timeout=timeout, + project_root=project_root, + runs_per_query=runs_per_query, + trigger_threshold=trigger_threshold, + model=model, + ) + eval_elapsed = time.time() - t0 + + # Split results back into train/test by matching queries + train_queries_set = {q["query"] for q in train_set} + train_result_list = [r for r in all_results["results"] if r["query"] in train_queries_set] + test_result_list = [r for r in all_results["results"] if r["query"] not in train_queries_set] + + train_passed = sum(1 for r in train_result_list if r["pass"]) + train_total = len(train_result_list) + train_summary = {"passed": train_passed, "failed": train_total - train_passed, "total": train_total} + train_results = {"results": train_result_list, "summary": train_summary} + + if test_set: + test_passed = sum(1 for r in 
test_result_list if r["pass"]) + test_total = len(test_result_list) + test_summary = {"passed": test_passed, "failed": test_total - test_passed, "total": test_total} + test_results = {"results": test_result_list, "summary": test_summary} + else: + test_results = None + test_summary = None + + history.append({ + "iteration": iteration, + "description": current_description, + "train_passed": train_summary["passed"], + "train_failed": train_summary["failed"], + "train_total": train_summary["total"], + "train_results": train_results["results"], + "test_passed": test_summary["passed"] if test_summary else None, + "test_failed": test_summary["failed"] if test_summary else None, + "test_total": test_summary["total"] if test_summary else None, + "test_results": test_results["results"] if test_results else None, + # For backward compat with report generator + "passed": train_summary["passed"], + "failed": train_summary["failed"], + "total": train_summary["total"], + "results": train_results["results"], + }) + + # Write live report if path provided + if live_report_path: + partial_output = { + "original_description": original_description, + "best_description": current_description, + "best_score": "in progress", + "iterations_run": len(history), + "holdout": holdout, + "train_size": len(train_set), + "test_size": len(test_set), + "history": history, + } + live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name)) + + if verbose: + def print_eval_stats(label, results, elapsed): + pos = [r for r in results if r["should_trigger"]] + neg = [r for r in results if not r["should_trigger"]] + tp = sum(r["triggers"] for r in pos) + pos_runs = sum(r["runs"] for r in pos) + fn = pos_runs - tp + fp = sum(r["triggers"] for r in neg) + neg_runs = sum(r["runs"] for r in neg) + tn = neg_runs - fp + total = tp + tn + fp + fn + precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0 + recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0 + accuracy = (tp + tn) / total if total > 0 else 0.0 + logger.info(f"{label}: {tp+tn}/{total} correct, precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)") + for r in results: + status = "PASS" if r["pass"] else "FAIL" + rate_str = f"{r['triggers']}/{r['runs']}" + logger.info(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}") + + print_eval_stats("Train", train_results["results"], eval_elapsed) + if test_summary: + print_eval_stats("Test ", test_results["results"], 0) + + if train_summary["failed"] == 0: + exit_reason = f"all_passed (iteration {iteration})" + if verbose: + logger.info(f"\nAll train queries passed on iteration {iteration}!") + break + + if iteration == max_iterations: + exit_reason = f"max_iterations ({max_iterations})" + if verbose: + logger.info(f"\nMax iterations reached ({max_iterations}).") + break + + # Improve the description based on train results + if verbose: + logger.info(f"\nImproving description...") + + t0 = time.time() + # Strip test scores from history so improvement model can't see them + blinded_history = [ + {k: v for k, v in h.items() if not k.startswith("test_")} + for h in history + ] + new_description = improve_description( + client=client, + skill_name=name, + skill_content=content, + current_description=current_description, + eval_results=train_results, + history=blinded_history, + model=model, + log_dir=log_dir, + iteration=iteration, + ) + improve_elapsed = time.time() - t0 + + if verbose: + logger.info(f"Proposed ({improve_elapsed:.1f}s): 
{new_description}") + + current_description = new_description + + # Find the best iteration by TEST score (or train if no test set) + if test_set: + best = max(history, key=lambda h: h["test_passed"] or 0) + best_score = f"{best['test_passed']}/{best['test_total']}" + else: + best = max(history, key=lambda h: h["train_passed"]) + best_score = f"{best['train_passed']}/{best['train_total']}" + + if verbose: + logger.info(f"\nExit reason: {exit_reason}") + logger.info(f"Best score: {best_score} (iteration {best['iteration']})") + + return { + "exit_reason": exit_reason, + "original_description": original_description, + "best_description": best["description"], + "best_score": best_score, + "best_train_score": f"{best['train_passed']}/{best['train_total']}", + "best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None, + "final_description": current_description, + "iterations_run": len(history), + "holdout": holdout, + "train_size": len(train_set), + "test_size": len(test_set), + "history": history, + } + + +def main(): + parser = argparse.ArgumentParser(description="Run eval + improve loop") + parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--description", default=None, help="Override starting description") + parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") + parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds") + parser.add_argument("--max-iterations", type=int, default=5, help="Max improvement iterations") + parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query") + parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold") + parser.add_argument("--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)") + parser.add_argument("--model", required=True, help="Model for improvement") + parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") + parser.add_argument("--report", default="auto", help="Generate HTML report at this path (default: 'auto' for temp file, 'none' to disable)") + parser.add_argument("--results-dir", default=None, help="Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here") + args = parser.parse_args() + + eval_set = json.loads(Path(args.eval_set).read_text()) + skill_path = Path(args.skill_path) + + if not (skill_path / "SKILL.md").exists(): + logger.error(f"Error: No SKILL.md found at {skill_path}") + sys.exit(1) + + name, _, _ = parse_skill_md(skill_path) + + # Set up live report path + if args.report != "none": + if args.report == "auto": + timestamp = time.strftime("%Y%m%d_%H%M%S") + live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html" + else: + live_report_path = Path(args.report) + # Open the report immediately so the user can watch + live_report_path.write_text("

<html><body><p>Starting optimization loop...</p></body></html>
") + webbrowser.open(str(live_report_path)) + else: + live_report_path = None + + # Determine output directory (create before run_loop so logs can be written) + if args.results_dir: + timestamp = time.strftime("%Y-%m-%d_%H%M%S") + results_dir = Path(args.results_dir) / timestamp + results_dir.mkdir(parents=True, exist_ok=True) + else: + results_dir = None + + log_dir = results_dir / "logs" if results_dir else None + + output = run_loop( + eval_set=eval_set, + skill_path=skill_path, + description_override=args.description, + num_workers=args.num_workers, + timeout=args.timeout, + max_iterations=args.max_iterations, + runs_per_query=args.runs_per_query, + trigger_threshold=args.trigger_threshold, + holdout=args.holdout, + model=args.model, + verbose=args.verbose, + live_report_path=live_report_path, + log_dir=log_dir, + ) + + # Save JSON output + json_output = json.dumps(output, indent=2) + print(json_output) + if results_dir: + (results_dir / "results.json").write_text(json_output) + + # Write final HTML report (without auto-refresh) + if live_report_path: + live_report_path.write_text(generate_html(output, auto_refresh=False, skill_name=name)) + logger.info(f"\nReport: {live_report_path}") + + if results_dir and live_report_path: + (results_dir / "report.html").write_text(generate_html(output, auto_refresh=False, skill_name=name)) + + if results_dir: + logger.info(f"Results saved to: {results_dir}") + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/utils.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/utils.py new file mode 100644 index 000000000..51b6a07dd --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/skill-creator/scripts/utils.py @@ -0,0 +1,47 @@ +"""Shared utilities for skill-creator scripts.""" + +from pathlib import Path + + + +def parse_skill_md(skill_path: Path) -> tuple[str, str, str]: + """Parse a SKILL.md file, returning (name, description, full_content).""" + content = (skill_path / "SKILL.md").read_text() + lines = content.split("\n") + + if lines[0].strip() != "---": + raise ValueError("SKILL.md missing frontmatter (no opening ---)") + + end_idx = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == "---": + end_idx = i + break + + if end_idx is None: + raise ValueError("SKILL.md missing frontmatter (no closing ---)") + + name = "" + description = "" + frontmatter_lines = lines[1:end_idx] + i = 0 + while i < len(frontmatter_lines): + line = frontmatter_lines[i] + if line.startswith("name:"): + name = line[len("name:"):].strip().strip('"').strip("'") + elif line.startswith("description:"): + value = line[len("description:"):].strip() + # Handle YAML multiline indicators (>, |, >-, |-) + if value in (">", "|", ">-", "|-"): + continuation_lines: list[str] = [] + i += 1 + while i < len(frontmatter_lines) and (frontmatter_lines[i].startswith(" ") or frontmatter_lines[i].startswith("\t")): + continuation_lines.append(frontmatter_lines[i].strip()) + i += 1 + description = " ".join(continuation_lines) + continue + else: + description = value.strip('"').strip("'") + i += 1 + + return name, description, content diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/web-research/SKILL.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/web-research/SKILL.md new file mode 100644 index 000000000..1ff24943b --- /dev/null +++ 
b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/web-research/SKILL.md @@ -0,0 +1,33 @@ +--- +name: Web Research +description: Systematic web searching, source evaluation, and information synthesis +--- + +# Web Research + +## Overview +Use this skill when you need to find, evaluate, and synthesize information from the web. + +**Keywords**: web search, information retrieval, source evaluation, fact-checking, research + +## Process + +### 1. Define Search Strategy +- Identify key search terms and variations +- Consider different angles and perspectives +- Plan multiple search queries + +### 2. Evaluate Sources +- Check source credibility and recency +- Cross-reference claims across multiple sources +- Note publication dates and author expertise + +### 3. Synthesize Findings +- Organize information by theme or relevance +- Highlight key findings and consensus views +- Note conflicting information and gaps + +## Output Format +- Start with a brief summary of findings +- Provide detailed sections with source citations +- End with confidence assessment and limitations diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/web-research/scripts/search_helper.py b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/web-research/scripts/search_helper.py new file mode 100644 index 000000000..09679111d --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/skills/web-research/scripts/search_helper.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +"""Helper utilities for structured web search.""" + +from datetime import datetime + + +def format_search_results(results: list[dict]) -> str: + """Format raw search results into a structured report.""" + output = [] + for i, r in enumerate(results, 1): + title = r.get('title', 'Untitled') + url = r.get('url', '#') + snippet = r.get('snippet', 'No description') + output.append(f'{i}. 
[{title}]({url})') + output.append(f' {snippet}') + output.append('') + return '\n'.join(output) + + +def assess_source_credibility(url: str) -> dict: + """Basic heuristics for source credibility.""" + trusted = ['.edu', '.gov', '.org', 'arxiv.org', 'nature.com'] + score = 0.5 + for d in trusted: + if d in url: + score = 0.8 + break + return {'url': url, 'credibility_score': score, + 'assessed_at': datetime.now().isoformat()} diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/soul.md b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/soul.md new file mode 100644 index 000000000..1554c3463 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/soul.md @@ -0,0 +1,16 @@ +# Soul — {{agent_name}} + +## Identity +- **名称**: {{agent_name}} +- **角色**: {{role_description}} +- **创建者**: {{creator_name}} +- **创建时间**: {{created_at}} + +## Personality +- 认真负责、注重细节 +- 主动汇报工作进展 +- 遇到不确定的信息会主动确认 + +## Boundaries +- 遵守企业保密制度 +- 敏感操作需经过创建者审批 diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/state.json b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/state.json new file mode 100644 index 000000000..0507e31dd --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/state.json @@ -0,0 +1,13 @@ +{ + "agent_id": "", + "name": "", + "status": "idle", + "current_task": null, + "last_active": null, + "channel_status": {}, + "stats": { + "tasks_completed_today": 0, + "tasks_in_progress": 0, + "督办_pending": 0 + } +} \ No newline at end of file diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/todo.json b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/todo.json new file mode 100644 index 000000000..50ffbb9a9 --- /dev/null +++ b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/todo.json @@ -0,0 +1,3 @@ +{ + "tasks": [] +} diff --git a/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/workspace/archived/.gitkeep b/.clawith/data/agents/35aa71a9-6f5f-439c-8e33-feb561f21ae8/workspace/archived/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/HEARTBEAT.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/HEARTBEAT.md new file mode 100644 index 000000000..485565cb3 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/HEARTBEAT.md @@ -0,0 +1,63 @@ +# HEARTBEAT + +When this file is read during a heartbeat, you are performing a **periodic awareness check**. + +## Phase 1: Review Context & Discover Interest Points + +Review your **recent conversations** and your **role/responsibilities**. +Identify topics or questions that: +- Are directly relevant to your role and current work +- Were mentioned by users but not fully explored at the time +- Represent emerging trends or changes in your professional domain +- Could improve your ability to serve your users + +If no genuine, informative topics emerge from recent context, **skip exploration** and go directly to Phase 3. +Do NOT search for generic or obvious topics just to fill time. Quality over quantity. + +## Phase 2: Targeted Exploration (Conditional) + +Only if you identified genuine interest points in Phase 1: + +1. Use `web_search` to investigate (maximum 5 searches per heartbeat) +2. Keep searches **tightly scoped** to your role and recent work topics +3. 
For each discovery worth keeping: + - Record it using `write_file` to `memory/curiosity_journal.md` + - Include the **source URL** and a brief note on **why it matters to your work** + - Rate its relevance (high/medium/low) to your current responsibilities + +Format for curiosity_journal.md entries: +``` +### [Date] - [Topic] +- **Finding**: [What you learned] +- **Source**: [URL] +- **Relevance**: [high/medium/low] — [Why it matters to your work] +- **Follow-up**: [Optional: questions this raises for next time] +``` + +## Phase 3: Agent Plaza + +1. Call `plaza_get_new_posts` to check recent activity +2. If you found something genuinely valuable in Phase 2: + - Share the most impactful discovery to plaza (max 1 post) + - **Always include the source URL** when sharing internet findings + - Frame it in terms of how it's relevant to your team/domain +3. Comment on relevant existing posts (max 2 comments) + +## Phase 4: Wrap Up + +- If nothing needed attention and no exploration was warranted: reply with `HEARTBEAT_OK` +- Otherwise, briefly summarize what you explored and why + +## Key Principles +- Always ground exploration in YOUR role and YOUR recent work context +- Never search for random unrelated topics out of idle curiosity +- If you don't have a specific angle worth investigating, don't search +- Prefer depth over breadth — one thoroughly explored topic > five surface-level queries +- Generate follow-up questions only when you genuinely want to know more + +## Rules +- ⛔ **NEVER share private information**: user conversations, memory contents, workspace files, task details +- ✅ **Share only public-safe content**: general insights, tips, industry news, web search discoveries with links +- 📝 **Limits per heartbeat**: max 1 post + 2 comments +- 🔍 **Search limits**: max 5 web searches per heartbeat +- 🤐 **If nothing interesting to explore or share**, respond with `HEARTBEAT_OK` diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/daily_reports/.gitkeep b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/daily_reports/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/enterprise_info/.gitkeep b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/enterprise_info/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/memory/MEMORY_INDEX.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/memory/MEMORY_INDEX.md new file mode 100644 index 000000000..29e3fab13 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/memory/MEMORY_INDEX.md @@ -0,0 +1,6 @@ +# Memory Index + +This file serves as an index of all memories for this digital employee. + +## Topics + diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/memory/curiosity_journal.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/memory/curiosity_journal.md new file mode 100644 index 000000000..c5185fe44 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/memory/curiosity_journal.md @@ -0,0 +1,9 @@ +# Curiosity Journal + +This is your exploration log. Record interesting discoveries from your web searches here. 
+ +## Active Questions + + +## Discoveries + diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/relationships.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/relationships.md new file mode 100644 index 000000000..625380120 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/relationships.md @@ -0,0 +1,5 @@ +# Relationships + +## Digital Employee Colleagues + +- **Morty** (collaborator): Research expert with strong learning ability. Ask him for information retrieval, web research, data analysis, and knowledge synthesis. diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/.gitkeep b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/MCP_INSTALLER.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/MCP_INSTALLER.md new file mode 100644 index 000000000..9e3bf3c77 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/MCP_INSTALLER.md @@ -0,0 +1,87 @@ +# MCP Tool Installer + +## When to Use This Skill +Use this skill when a user wants to add a new tool or integration (e.g., GitHub, Brave Search, Notion, etc.) that isn't currently available but can be imported from the MCP registry or via a direct URL. + +--- + +## Step-by-Step Protocol + +### Step 1 — Search first +``` +discover_resources(query="", max_results=5) +``` +Show the results and let the user pick. Note the `ID` field (e.g. `github`). + +### Step 2 — Determine import method + +**Method A: Smithery Import** (tool found on Smithery with remote hosting support 🌐) +- Requires Smithery API Key (one-time per agent) +- Individual tool tokens NOT needed — Smithery handles auth via OAuth + +**Method B: Direct URL Import** (tool NOT on Smithery, but has public HTTP/SSE endpoint) +- User provides the MCP server URL directly +- May require tool-specific API key + +**Not importable** (💻 local-only tools) +- Requires local Docker/process — inform user these cannot be imported automatically + +--- + +### Method A: Smithery Import + +#### Check Smithery API Key +If no Smithery key is configured, explain Smithery and guide the user. Use the following talking points (adapt to context, don't read verbatim): + +> **Smithery** (smithery.ai) 是一个 MCP 工具市场,类似于"应用商店"。通过它,我可以帮你一键安装各种第三方工具(如 GitHub、Notion、Slack 等),并自动完成认证。 +> +> **为什么需要注册?** +> Smithery 用 API Key 来识别你的身份,这样安装的工具会关联到你的账号,认证信息也会安全保存。 +> +> **注册一次后有什么好处?** +> - 🔑 只需提供一次 Key,后续安装其他工具时我会自动帮你配置 +> - 🔐 不需要为每个工具单独创建 Token(如 GitHub PAT),OAuth 一键授权 +> - 📦 支持上千种 MCP 工具,随时可以扩展你的能力 +> +> **获取步骤:** +> 1. 访问 https://smithery.ai 注册/登录 +> 2. 前往 https://smithery.ai/account/api-keys 创建 API Key +> 3. 将 Key 提供给我 + +#### Import +``` +import_mcp_server( + server_id="", + config={"smithery_api_key": ""} # first time only +) +``` + +#### Handle OAuth +Some tools return an OAuth authorization URL. Tell the user to visit the link. + +**Important:** Do NOT ask for individual tool tokens (GitHub PAT, Notion API key, etc.) when using Smithery — OAuth handles this automatically. + +--- + +### Method B: Direct URL Import + +When a tool is not available on Smithery but the user has a public MCP endpoint: +``` +import_mcp_server( + server_id="", + config={ + "mcp_url": "https://my-mcp-server.com/sse", + "api_key": "" + } +) +``` +The system will connect to the URL, discover available tools, and register them. 
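+
+A minimal sketch for an endpoint that needs no authentication (the server ID and URL below are hypothetical, and whether `api_key` can be omitted depends on the target server):
+```
+import_mcp_server(
+    server_id="my-public-tool",
+    config={"mcp_url": "https://example.com/mcp/sse"}
+)
+```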
+ +--- + +## What NOT to Do +- ❌ Don't ask for GitHub PAT, Notion key etc. when using Smithery — OAuth handles these +- ❌ Don't tell users to go to Settings — handle everything in chat +- ❌ Don't echo API keys back in your response +- ❌ Don't skip the search step — always verify the server exists before importing +- ❌ Don't import local-only tools — inform users they require local installation diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/complex-task-executor/SKILL.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/complex-task-executor/SKILL.md new file mode 100644 index 000000000..db71c3ed8 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/complex-task-executor/SKILL.md @@ -0,0 +1,146 @@ +--- +name: Complex Task Executor +description: Structured methodology for decomposing, planning, and executing complex multi-step tasks with progress tracking +--- + +# Complex Task Executor + +## When to Use This Skill + +Use this skill when a task meets ANY of the following criteria: +- Requires more than 3 distinct steps to complete +- Involves multiple tools or information sources +- Has dependencies between steps (step B needs output from step A) +- Requires research before execution +- Could benefit from a documented plan others can review +- The user explicitly asks for a thorough or systematic approach + +**DO NOT use this for simple tasks** like answering a question, reading a single file, or performing one tool call. + +## Workflow + +### Phase 1: Task Analysis (THINK before acting) + +Before creating any files, analyze the task: + +1. **Understand the goal**: What is the final deliverable? What does "done" look like? +2. **Assess complexity**: How many steps? What tools are needed? +3. **Identify dependencies**: Which steps depend on others? +4. **Identify risks**: What could go wrong? What information is missing? +5. **Estimate scope**: Is the task feasible with available tools/skills? + +### Phase 2: Create Task Plan + +Create a task folder and plan file in the workspace: + +``` +workspace//plan.md +``` + +The plan.md MUST follow this exact format: + +```markdown +# Task: + +## Objective + + +## Steps + +- [ ] 1. + - Details: + - Output: +- [ ] 2. + - Details: <...> + - Depends on: Step 1 +- [ ] 3. + - Details: <...> + +## Status +- Created: +- Current Step: Not started +- Progress: 0/ + +## Notes + +``` + +Rules for writing the plan: +- Each step should be completable in 1-3 tool calls +- Use verb-noun format: "Research competitors", "Draft report", "Validate data" +- Mark dependencies explicitly +- Include expected outputs for each step + +### Phase 3: Execute Step-by-Step + +For EACH step in the plan: + +1. **Read the plan** — Call `read_file` on `workspace//plan.md` to check current state +2. **Mark as in-progress** — Update the checkbox from `[ ]` to `[/]` and update the "Current Step" field +3. **Execute the step** — Do the actual work (tool calls, analysis, writing) +4. **Record output** — Save results to `workspace//` (e.g., intermediate files, data) +5. **Mark as complete** — Update the checkbox from `[/]` to `[x]` and update "Progress" counter +6. **Proceed to next step** — Move to the next uncompleted step + +### Phase 4: Completion + +When all steps are done: +1. Update plan.md status to "✅ Completed" +2. Create a `workspace//summary.md` with: + - What was accomplished + - Key results and deliverables + - Any follow-up items +3. 
Present the final result to the user + +## Adaptive Replanning + +If during execution you discover: +- A step is impossible → Mark it `[!]` with a reason, add alternative steps +- New steps are needed → Add them to the plan with `[+]` prefix +- A step produced unexpected results → Add a note and adjust subsequent steps +- The plan needs major changes → Create a new section "## Revised Plan" and follow it + +Always update plan.md BEFORE changing course, so the plan stays the source of truth. + +## Error Handling + +- If a tool call fails, retry once. If it fails again, mark the step as blocked and note the error. +- Never silently skip a step. Always update the plan to reflect what happened. +- If you're stuck, tell the user what's blocking and ask for guidance. + +## Example Scenarios + +### Example 1: "Research our top 3 competitors and write a comparison report" + +Plan would be: +``` +- [ ] 1. Identify the user's company/product context +- [ ] 2. Research Competitor A — website, pricing, features +- [ ] 3. Research Competitor B — website, pricing, features +- [ ] 4. Research Competitor C — website, pricing, features +- [ ] 5. Create comparison matrix +- [ ] 6. Write analysis and recommendations +- [ ] 7. Compile final report +``` + +### Example 2: "Analyze our Q4 sales data and prepare a board presentation" + +Plan would be: +``` +- [ ] 1. Read and understand the sales data files +- [ ] 2. Calculate key metrics (revenue, growth, trends) +- [ ] 3. Identify top insights and anomalies +- [ ] 4. Create data summary tables +- [ ] 5. Draft presentation outline +- [ ] 6. Write each presentation section +- [ ] 7. Add executive summary +- [ ] 8. Review and polish final document +``` + +## Key Principles + +1. **Plan is the source of truth** — Always update it before moving on +2. **One step at a time** — Don't skip ahead or batch too many steps +3. **Show your work** — Save intermediate results to the task folder +4. **Communicate progress** — The user can read plan.md at any time to see status +5. **Be adaptive** — Plans change; that's OK if you update the plan first diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/complex-task-executor/examples/plan_template.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/complex-task-executor/examples/plan_template.md new file mode 100644 index 000000000..dfd60e7cb --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/complex-task-executor/examples/plan_template.md @@ -0,0 +1,23 @@ +# Task: [Title] + +## Objective +[One-sentence description of the desired outcome] + +## Steps + +- [ ] 1. [First step] + - Details: [What specifically to do] + - Output: [What this step produces] +- [ ] 2. [Second step] + - Details: [...] + - Depends on: Step 1 +- [ ] 3. [Third step] + - Details: [...] + +## Status +- Created: [timestamp] +- Current Step: Not started +- Progress: 0/3 + +## Notes +- [Any assumptions, risks, or open questions] diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/mcp-installer/SKILL.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/mcp-installer/SKILL.md new file mode 100644 index 000000000..9e3bf3c77 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/mcp-installer/SKILL.md @@ -0,0 +1,87 @@ +# MCP Tool Installer + +## When to Use This Skill +Use this skill when a user wants to add a new tool or integration (e.g., GitHub, Brave Search, Notion, etc.) 
that isn't currently available but can be imported from the MCP registry or via a direct URL.

---

## Step-by-Step Protocol

### Step 1 — Search first
```
discover_resources(query="", max_results=5)
```
Show the results and let the user pick. Note the `ID` field (e.g. `github`).

### Step 2 — Determine import method

**Method A: Smithery Import** (tool found on Smithery with remote hosting support 🌐)
- Requires Smithery API Key (one-time per agent)
- Individual tool tokens NOT needed — Smithery handles auth via OAuth

**Method B: Direct URL Import** (tool NOT on Smithery, but has public HTTP/SSE endpoint)
- User provides the MCP server URL directly
- May require tool-specific API key

**Not importable** (💻 local-only tools)
- Requires local Docker/process — inform user these cannot be imported automatically

---

### Method A: Smithery Import

#### Check Smithery API Key
If no Smithery key is configured, explain Smithery and guide the user. Use the following talking points (adapt to context, don't read verbatim):

> **Smithery** (smithery.ai) is an MCP tool marketplace, similar to an "app store". Through it, I can install third-party tools for you in one step (such as GitHub, Notion, Slack) and handle authentication automatically.
>
> **Why register?**
> Smithery uses an API Key to identify you, so the tools you install are tied to your account and your credentials are stored securely.
>
> **What does registering once get you?**
> - 🔑 Provide the Key once; when you install other tools later, I configure them for you automatically
> - 🔐 No need to create a separate token for each tool (such as a GitHub PAT); OAuth authorizes in one click
> - 📦 Thousands of MCP tools are supported, so you can extend your capabilities at any time
>
> **How to get a Key:**
> 1. Visit https://smithery.ai and sign up / log in
> 2. Go to https://smithery.ai/account/api-keys and create an API Key
> 3. Give the Key to me

#### Import
```
import_mcp_server(
    server_id="",
    config={"smithery_api_key": ""}  # first time only
)
```

#### Handle OAuth
Some tools return an OAuth authorization URL. Tell the user to visit the link.

**Important:** Do NOT ask for individual tool tokens (GitHub PAT, Notion API key, etc.) when using Smithery — OAuth handles this automatically.

---

### Method B: Direct URL Import

When a tool is not available on Smithery but the user has a public MCP endpoint:
```
import_mcp_server(
    server_id="",
    config={
        "mcp_url": "https://my-mcp-server.com/sse",
        "api_key": ""
    }
)
```
The system will connect to the URL, discover available tools, and register them.

---

## What NOT to Do
- ❌ Don't ask for GitHub PAT, Notion key etc. when using Smithery — OAuth handles these
- ❌ Don't tell users to go to Settings — handle everything in chat
- ❌ Don't echo API keys back in your response
- ❌ Don't skip the search step — always verify the server exists before importing
- ❌ Don't import local-only tools — inform users they require local installation

diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/meeting-notes/SKILL.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/meeting-notes/SKILL.md new file mode 100644 index 000000000..71cf83da2 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/meeting-notes/SKILL.md @@ -0,0 +1,36 @@
---
name: Meeting Notes
description: Meeting summarization, action item extraction, and follow-up tracking
---

# Meeting Notes

## Overview
Use this skill for processing meeting content into structured summaries with clear action items.
+ +**Keywords**: meetings, notes, action items, decisions, follow-up + +## Template + +### Meeting Summary +``` +Meeting: [Title] +Date: [Date] +Participants: [Names] +Duration: [Time] +``` + +### Key Decisions +- Numbered list of decisions made + +### Action Items +| # | Action | Owner | Due Date | Status | +|---|--------|-------|----------|--------| +| 1 | [Task] | [Name] | [Date] | ⬜ Pending | + +### Discussion Points +Brief summary of main topics discussed + +### Next Steps +- Follow-up meeting date +- Items deferred to next meeting diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/SKILL.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/SKILL.md new file mode 100644 index 000000000..ce0d06f3e --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/SKILL.md @@ -0,0 +1,152 @@ +--- +name: skill-creator +description: Create new skills, modify and improve existing skills, and measure skill performance. Use when users want to create a skill from scratch, update or optimize an existing skill, run evals to test a skill, benchmark skill performance with variance analysis, or optimize a skill's description for better triggering accuracy. +--- + +# Skill Creator + +A skill for creating new skills and iteratively improving them. + +At a high level, the process of creating a skill goes like this: + +- Decide what you want the skill to do and roughly how it should do it +- Write a draft of the skill +- Create a few test prompts and run claude-with-access-to-the-skill on them +- Help the user evaluate the results both qualitatively and quantitatively +- Rewrite the skill based on feedback from the user's evaluation +- Repeat until you're satisfied +- Expand the test set and try again at larger scale + +Your job when using this skill is to figure out where the user is in this process and then jump in and help them progress through these stages. + +## Communicating with the user + +Pay attention to context cues to understand how to phrase your communication. Briefly explain terms if you're in doubt, and feel free to clarify terms with a short definition if you're unsure if the user will get it. + +--- + +## Creating a skill + +### Capture Intent +Start by understanding the user's intent. + +1. What should this skill enable the agent to do? +2. When should this skill trigger? (what user phrases/contexts) +3. What's the expected output format? +4. Should we set up test cases to verify the skill works? + +### Interview and Research +Proactively ask questions about edge cases, input/output formats, example files, success criteria, and dependencies. Wait to write test prompts until you've got this part ironed out. + +### Write the SKILL.md +Based on the user interview, fill in these components: + +- **name**: Skill identifier +- **description**: When to trigger, what it does. This is the primary triggering mechanism - include both what the skill does AND specific contexts for when to use it. 
- **the rest of the skill**

### Skill Writing Guide

#### Anatomy of a Skill

```
skill-name/
├── SKILL.md (required)
│   ├── YAML frontmatter (name, description required)
│   └── Markdown instructions
└── Bundled Resources (optional)
    ├── scripts/ - Executable code for deterministic/repetitive tasks
    ├── references/ - Docs loaded into context as needed
    └── assets/ - Files used in output (templates, icons, fonts)
```

#### Progressive Disclosure

Skills use a three-level loading system:
1. **Metadata** (name + description) - Always in context (~100 words)
2. **SKILL.md body** - In context whenever skill triggers (<500 lines ideal)
3. **Bundled resources** - As needed (unlimited, scripts can execute without loading)

**Key patterns:**
- Keep SKILL.md under 500 lines; if approaching this limit, add hierarchy with clear pointers
- Reference files clearly from SKILL.md with guidance on when to read them
- For large reference files (>300 lines), include a table of contents

#### Writing Patterns

Prefer using the imperative form in instructions.

### Writing Style
Explain to the model why things are important. Use theory of mind and try to make the skill general. Start by writing a draft and then look at it with fresh eyes and improve it.

### Test Cases
After writing the skill draft, come up with 2-3 realistic test prompts. Share them with the user. Save test cases to `evals/evals.json`.

---

## Running and evaluating test cases

This section is one continuous sequence.

### Step 1: Run test cases
For each test case, run the agent with the skill applied, and optionally a baseline run without the skill for comparison.

### Step 2: Draft assertions
While runs are in progress, draft quantitative assertions for each test case. Good assertions are objectively verifiable and have descriptive names.

### Step 3: Capture timing data
When each run completes, save timing data (tokens, duration) to `timing.json`.

### Step 4: Grade, aggregate, and launch the viewer
Once all runs are done:
1. Grade each run against assertions — see `agents/grader.md`
2. Aggregate results: `python -m scripts.aggregate_benchmark <workspace>/iteration-N --skill-name <skill-name>`
3. Launch the viewer: `python eval-viewer/generate_review.py <workspace>/iteration-N --skill-name "my-skill" --benchmark <workspace>/iteration-N/benchmark.json`
4. Present results to the user for review

### Step 5: Read the feedback
Read user feedback from `feedback.json`. Empty feedback means the user thought it was fine.

---

## Improving the skill

### How to think about improvements
1. **Generalize from the feedback.** Don't overfit to specific examples.
2. **Keep the prompt lean.** Remove things that aren't pulling their weight.
3. **Explain the why.** Today's LLMs are smart. Explain reasoning rather than rigid MUSTs.
4. **Look for repeated work across test cases.** Bundle common scripts in `scripts/`.

### The iteration loop
1. Apply improvements to the skill
2. Rerun all test cases into a new iteration directory
3. Present results for review
4. Wait for user to review
5. Read feedback, improve again, repeat

---

## Advanced: Blind comparison
For rigorous comparison between two versions. Read `agents/comparator.md` and `agents/analyzer.md`.

## Description Optimization
Optimize the description for better triggering accuracy. Use `scripts/run_loop.py`.
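## Example: Aggregating Pass Rates

As a rough illustration of the grading and aggregation steps above, here is a minimal sketch that reads the per-run `grading.json` files (schema in `references/schemas.md`) under an iteration directory and reports a mean expectation pass rate. It is illustrative only; `scripts/aggregate_benchmark.py` is the real implementation, and the directory layout is assumed.

```python
# Minimal sketch, not the bundled aggregator: walk an iteration directory,
# read each run's grading.json (see references/schemas.md), and average pass rates.
import json
import sys
from pathlib import Path


def mean_pass_rate(iteration_dir: Path) -> float:
    rates = []
    for grading_path in sorted(iteration_dir.rglob("grading.json")):
        summary = json.loads(grading_path.read_text())["summary"]
        rates.append(summary["pass_rate"])
    if not rates:
        raise SystemExit(f"no grading.json found under {iteration_dir}")
    return sum(rates) / len(rates)


if __name__ == "__main__":
    # e.g. python mean_pass_rate.py <workspace>/iteration-1
    print(f"{mean_pass_rate(Path(sys.argv[1])):.2f} mean expectation pass rate")
```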
---

## Reference files

- `agents/grader.md` — How to evaluate assertions against outputs
- `agents/comparator.md` — How to do blind A/B comparison between two outputs
- `agents/analyzer.md` — How to analyze why one version beat another
- `references/schemas.md` — JSON structures for evals.json, grading.json, etc.
- `assets/eval_review.html` — HTML template for eval review
- `eval-viewer/generate_review.py` — Script to generate the review viewer
- `scripts/aggregate_benchmark.py` — Aggregate benchmark results
- `scripts/generate_report.py` — Generate optimization report
- `scripts/improve_description.py` — Improve skill description
- `scripts/package_skill.py` — Package skill for distribution
- `scripts/quick_validate.py` — Quick validation
- `scripts/run_eval.py` — Run triggering evaluation
- `scripts/run_loop.py` — Run optimization loop
- `scripts/utils.py` — Shared utilities

diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/agents/analyzer.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/agents/analyzer.md new file mode 100644 index 000000000..14e41d606 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/agents/analyzer.md @@ -0,0 +1,274 @@

# Post-hoc Analyzer Agent

Analyze blind comparison results to understand WHY the winner won and generate improvement suggestions.

## Role

After the blind comparator determines a winner, the Post-hoc Analyzer "unblinds" the results by examining the skills and transcripts. The goal is to extract actionable insights: what made the winner better, and how can the loser be improved?

## Inputs

You receive these parameters in your prompt:

- **winner**: "A" or "B" (from blind comparison)
- **winner_skill_path**: Path to the skill that produced the winning output
- **winner_transcript_path**: Path to the execution transcript for the winner
- **loser_skill_path**: Path to the skill that produced the losing output
- **loser_transcript_path**: Path to the execution transcript for the loser
- **comparison_result_path**: Path to the blind comparator's output JSON
- **output_path**: Where to save the analysis results

## Process

### Step 1: Read Comparison Result

1. Read the blind comparator's output at comparison_result_path
2. Note the winning side (A or B), the reasoning, and any scores
3. Understand what the comparator valued in the winning output

### Step 2: Read Both Skills

1. Read the winner skill's SKILL.md and key referenced files
2. Read the loser skill's SKILL.md and key referenced files
3. Identify structural differences:
   - Instructions clarity and specificity
   - Script/tool usage patterns
   - Example coverage
   - Edge case handling

### Step 3: Read Both Transcripts

1. Read the winner's transcript
2. Read the loser's transcript
3. Compare execution patterns:
   - How closely did each follow their skill's instructions?
   - What tools were used differently?
   - Where did the loser diverge from optimal behavior?
   - Did either encounter errors or make recovery attempts?

### Step 4: Analyze Instruction Following

For each transcript, evaluate:
- Did the agent follow the skill's explicit instructions?
- Did the agent use the skill's provided tools/scripts?
- Were there missed opportunities to leverage skill content?
- Did the agent add unnecessary steps not in the skill?

Score instruction following 1-10 and note specific issues.
+ +### Step 5: Identify Winner Strengths + +Determine what made the winner better: +- Clearer instructions that led to better behavior? +- Better scripts/tools that produced better output? +- More comprehensive examples that guided edge cases? +- Better error handling guidance? + +Be specific. Quote from skills/transcripts where relevant. + +### Step 6: Identify Loser Weaknesses + +Determine what held the loser back: +- Ambiguous instructions that led to suboptimal choices? +- Missing tools/scripts that forced workarounds? +- Gaps in edge case coverage? +- Poor error handling that caused failures? + +### Step 7: Generate Improvement Suggestions + +Based on the analysis, produce actionable suggestions for improving the loser skill: +- Specific instruction changes to make +- Tools/scripts to add or modify +- Examples to include +- Edge cases to address + +Prioritize by impact. Focus on changes that would have changed the outcome. + +### Step 8: Write Analysis Results + +Save structured analysis to `{output_path}`. + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "comparison_summary": { + "winner": "A", + "winner_skill": "path/to/winner/skill", + "loser_skill": "path/to/loser/skill", + "comparator_reasoning": "Brief summary of why comparator chose winner" + }, + "winner_strengths": [ + "Clear step-by-step instructions for handling multi-page documents", + "Included validation script that caught formatting errors", + "Explicit guidance on fallback behavior when OCR fails" + ], + "loser_weaknesses": [ + "Vague instruction 'process the document appropriately' led to inconsistent behavior", + "No script for validation, agent had to improvise and made errors", + "No guidance on OCR failure, agent gave up instead of trying alternatives" + ], + "instruction_following": { + "winner": { + "score": 9, + "issues": [ + "Minor: skipped optional logging step" + ] + }, + "loser": { + "score": 6, + "issues": [ + "Did not use the skill's formatting template", + "Invented own approach instead of following step 3", + "Missed the 'always validate output' instruction" + ] + } + }, + "improvement_suggestions": [ + { + "priority": "high", + "category": "instructions", + "suggestion": "Replace 'process the document appropriately' with explicit steps: 1) Extract text, 2) Identify sections, 3) Format per template", + "expected_impact": "Would eliminate ambiguity that caused inconsistent behavior" + }, + { + "priority": "high", + "category": "tools", + "suggestion": "Add validate_output.py script similar to winner skill's validation approach", + "expected_impact": "Would catch formatting errors before final output" + }, + { + "priority": "medium", + "category": "error_handling", + "suggestion": "Add fallback instructions: 'If OCR fails, try: 1) different resolution, 2) image preprocessing, 3) manual extraction'", + "expected_impact": "Would prevent early failure on difficult documents" + } + ], + "transcript_insights": { + "winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script -> Fixed 2 issues -> Produced output", + "loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods -> No validation -> Output had errors" + } +} +``` + +## Guidelines + +- **Be specific**: Quote from skills and transcripts, don't just say "instructions were unclear" +- **Be actionable**: Suggestions should be concrete changes, not vague advice +- **Focus on skill improvements**: The goal is to improve the losing skill, not critique the agent +- 
**Prioritize by impact**: Which changes would most likely have changed the outcome? +- **Consider causation**: Did the skill weakness actually cause the worse output, or is it incidental? +- **Stay objective**: Analyze what happened, don't editorialize +- **Think about generalization**: Would this improvement help on other evals too? + +## Categories for Suggestions + +Use these categories to organize improvement suggestions: + +| Category | Description | +|----------|-------------| +| `instructions` | Changes to the skill's prose instructions | +| `tools` | Scripts, templates, or utilities to add/modify | +| `examples` | Example inputs/outputs to include | +| `error_handling` | Guidance for handling failures | +| `structure` | Reorganization of skill content | +| `references` | External docs or resources to add | + +## Priority Levels + +- **high**: Would likely change the outcome of this comparison +- **medium**: Would improve quality but may not change win/loss +- **low**: Nice to have, marginal improvement + +--- + +# Analyzing Benchmark Results + +When analyzing benchmark results, the analyzer's purpose is to **surface patterns and anomalies** across multiple runs, not suggest skill improvements. + +## Role + +Review all benchmark run results and generate freeform notes that help the user understand skill performance. Focus on patterns that wouldn't be visible from aggregate metrics alone. + +## Inputs + +You receive these parameters in your prompt: + +- **benchmark_data_path**: Path to the in-progress benchmark.json with all run results +- **skill_path**: Path to the skill being benchmarked +- **output_path**: Where to save the notes (as JSON array of strings) + +## Process + +### Step 1: Read Benchmark Data + +1. Read the benchmark.json containing all run results +2. Note the configurations tested (with_skill, without_skill) +3. Understand the run_summary aggregates already calculated + +### Step 2: Analyze Per-Assertion Patterns + +For each expectation across all runs: +- Does it **always pass** in both configurations? (may not differentiate skill value) +- Does it **always fail** in both configurations? (may be broken or beyond capability) +- Does it **always pass with skill but fail without**? (skill clearly adds value here) +- Does it **always fail with skill but pass without**? (skill may be hurting) +- Is it **highly variable**? (flaky expectation or non-deterministic behavior) + +### Step 3: Analyze Cross-Eval Patterns + +Look for patterns across evals: +- Are certain eval types consistently harder/easier? +- Do some evals show high variance while others are stable? +- Are there surprising results that contradict expectations? + +### Step 4: Analyze Metrics Patterns + +Look at time_seconds, tokens, tool_calls: +- Does the skill significantly increase execution time? +- Is there high variance in resource usage? +- Are there outlier runs that skew the aggregates? + +### Step 5: Generate Notes + +Write freeform observations as a list of strings. 
Each note should: +- State a specific observation +- Be grounded in the data (not speculation) +- Help the user understand something the aggregate metrics don't show + +Examples: +- "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value" +- "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure that may be flaky" +- "Without-skill runs consistently fail on table extraction expectations (0% pass rate)" +- "Skill adds 13s average execution time but improves pass rate by 50%" +- "Token usage is 80% higher with skill, primarily due to script output parsing" +- "All 3 without-skill runs for eval 1 produced empty output" + +### Step 6: Write Notes + +Save notes to `{output_path}` as a JSON array of strings: + +```json +[ + "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value", + "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure", + "Without-skill runs consistently fail on table extraction expectations", + "Skill adds 13s average execution time but improves pass rate by 50%" +] +``` + +## Guidelines + +**DO:** +- Report what you observe in the data +- Be specific about which evals, expectations, or runs you're referring to +- Note patterns that aggregate metrics would hide +- Provide context that helps interpret the numbers + +**DO NOT:** +- Suggest improvements to the skill (that's for the improvement step, not benchmarking) +- Make subjective quality judgments ("the output was good/bad") +- Speculate about causes without evidence +- Repeat information already in the run_summary aggregates diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/agents/comparator.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/agents/comparator.md new file mode 100644 index 000000000..80e00eb45 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/agents/comparator.md @@ -0,0 +1,202 @@ +# Blind Comparator Agent + +Compare two outputs WITHOUT knowing which skill produced them. + +## Role + +The Blind Comparator judges which output better accomplishes the eval task. You receive two outputs labeled A and B, but you do NOT know which skill produced which. This prevents bias toward a particular skill or approach. + +Your judgment is based purely on output quality and task completion. + +## Inputs + +You receive these parameters in your prompt: + +- **output_a_path**: Path to the first output file or directory +- **output_b_path**: Path to the second output file or directory +- **eval_prompt**: The original task/prompt that was executed +- **expectations**: List of expectations to check (optional - may be empty) + +## Process + +### Step 1: Read Both Outputs + +1. Examine output A (file or directory) +2. Examine output B (file or directory) +3. Note the type, structure, and content of each +4. If outputs are directories, examine all relevant files inside + +### Step 2: Understand the Task + +1. Read the eval_prompt carefully +2. Identify what the task requires: + - What should be produced? + - What qualities matter (accuracy, completeness, format)? + - What would distinguish a good output from a poor one? 
+ +### Step 3: Generate Evaluation Rubric + +Based on the task, generate a rubric with two dimensions: + +**Content Rubric** (what the output contains): +| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) | +|-----------|----------|----------------|---------------| +| Correctness | Major errors | Minor errors | Fully correct | +| Completeness | Missing key elements | Mostly complete | All elements present | +| Accuracy | Significant inaccuracies | Minor inaccuracies | Accurate throughout | + +**Structure Rubric** (how the output is organized): +| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) | +|-----------|----------|----------------|---------------| +| Organization | Disorganized | Reasonably organized | Clear, logical structure | +| Formatting | Inconsistent/broken | Mostly consistent | Professional, polished | +| Usability | Difficult to use | Usable with effort | Easy to use | + +Adapt criteria to the specific task. For example: +- PDF form → "Field alignment", "Text readability", "Data placement" +- Document → "Section structure", "Heading hierarchy", "Paragraph flow" +- Data output → "Schema correctness", "Data types", "Completeness" + +### Step 4: Evaluate Each Output Against the Rubric + +For each output (A and B): + +1. **Score each criterion** on the rubric (1-5 scale) +2. **Calculate dimension totals**: Content score, Structure score +3. **Calculate overall score**: Average of dimension scores, scaled to 1-10 + +### Step 5: Check Assertions (if provided) + +If expectations are provided: + +1. Check each expectation against output A +2. Check each expectation against output B +3. Count pass rates for each output +4. Use expectation scores as secondary evidence (not the primary decision factor) + +### Step 6: Determine the Winner + +Compare A and B based on (in priority order): + +1. **Primary**: Overall rubric score (content + structure) +2. **Secondary**: Assertion pass rates (if applicable) +3. **Tiebreaker**: If truly equal, declare a TIE + +Be decisive - ties should be rare. One output is usually better, even if marginally. + +### Step 7: Write Comparison Results + +Save results to a JSON file at the path specified (or `comparison.json` if not specified). + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "winner": "A", + "reasoning": "Output A provides a complete solution with proper formatting and all required fields. 
Output B is missing the date field and has formatting inconsistencies.", + "rubric": { + "A": { + "content": { + "correctness": 5, + "completeness": 5, + "accuracy": 4 + }, + "structure": { + "organization": 4, + "formatting": 5, + "usability": 4 + }, + "content_score": 4.7, + "structure_score": 4.3, + "overall_score": 9.0 + }, + "B": { + "content": { + "correctness": 3, + "completeness": 2, + "accuracy": 3 + }, + "structure": { + "organization": 3, + "formatting": 2, + "usability": 3 + }, + "content_score": 2.7, + "structure_score": 2.7, + "overall_score": 5.4 + } + }, + "output_quality": { + "A": { + "score": 9, + "strengths": ["Complete solution", "Well-formatted", "All fields present"], + "weaknesses": ["Minor style inconsistency in header"] + }, + "B": { + "score": 5, + "strengths": ["Readable output", "Correct basic structure"], + "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"] + } + }, + "expectation_results": { + "A": { + "passed": 4, + "total": 5, + "pass_rate": 0.80, + "details": [ + {"text": "Output includes name", "passed": true}, + {"text": "Output includes date", "passed": true}, + {"text": "Format is PDF", "passed": true}, + {"text": "Contains signature", "passed": false}, + {"text": "Readable text", "passed": true} + ] + }, + "B": { + "passed": 3, + "total": 5, + "pass_rate": 0.60, + "details": [ + {"text": "Output includes name", "passed": true}, + {"text": "Output includes date", "passed": false}, + {"text": "Format is PDF", "passed": true}, + {"text": "Contains signature", "passed": false}, + {"text": "Readable text", "passed": true} + ] + } + } +} +``` + +If no expectations were provided, omit the `expectation_results` field entirely. + +## Field Descriptions + +- **winner**: "A", "B", or "TIE" +- **reasoning**: Clear explanation of why the winner was chosen (or why it's a tie) +- **rubric**: Structured rubric evaluation for each output + - **content**: Scores for content criteria (correctness, completeness, accuracy) + - **structure**: Scores for structure criteria (organization, formatting, usability) + - **content_score**: Average of content criteria (1-5) + - **structure_score**: Average of structure criteria (1-5) + - **overall_score**: Combined score scaled to 1-10 +- **output_quality**: Summary quality assessment + - **score**: 1-10 rating (should match rubric overall_score) + - **strengths**: List of positive aspects + - **weaknesses**: List of issues or shortcomings +- **expectation_results**: (Only if expectations provided) + - **passed**: Number of expectations that passed + - **total**: Total number of expectations + - **pass_rate**: Fraction passed (0.0 to 1.0) + - **details**: Individual expectation results + +## Guidelines + +- **Stay blind**: DO NOT try to infer which skill produced which output. Judge purely on output quality. +- **Be specific**: Cite specific examples when explaining strengths and weaknesses. +- **Be decisive**: Choose a winner unless outputs are genuinely equivalent. +- **Output quality first**: Assertion scores are secondary to overall task completion. +- **Be objective**: Don't favor outputs based on style preferences; focus on correctness and completeness. +- **Explain your reasoning**: The reasoning field should make it clear why you chose the winner. +- **Handle edge cases**: If both outputs fail, pick the one that fails less badly. If both are excellent, pick the one that's marginally better. 
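## Worked Example: Score Aggregation

A minimal sketch of the arithmetic described in Step 4, using the numbers from the example output above (illustrative only; no scoring script is bundled, the comparator applies the rubric itself):

```python
# Sketch of Step 4: criterion scores are 1-5; a dimension score is the mean of its
# criteria; the overall score is the mean of the two dimension scores, scaled to 1-10.
def dimension_score(criteria: dict[str, int]) -> float:
    return round(sum(criteria.values()) / len(criteria), 1)


def overall_score(content: dict[str, int], structure: dict[str, int]) -> float:
    mean_of_dimensions = (dimension_score(content) + dimension_score(structure)) / 2
    return round(mean_of_dimensions * 2, 1)  # scale the 1-5 mean up to 1-10


# Output A from the example JSON: content 4.7, structure 4.3, overall 9.0
a_content = {"correctness": 5, "completeness": 5, "accuracy": 4}
a_structure = {"organization": 4, "formatting": 5, "usability": 4}
print(dimension_score(a_content), dimension_score(a_structure), overall_score(a_content, a_structure))
```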
diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/agents/grader.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/agents/grader.md new file mode 100644 index 000000000..558ab05c0 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/agents/grader.md @@ -0,0 +1,223 @@ +# Grader Agent + +Evaluate expectations against an execution transcript and outputs. + +## Role + +The Grader reviews a transcript and output files, then determines whether each expectation passes or fails. Provide clear evidence for each judgment. + +You have two jobs: grade the outputs, and critique the evals themselves. A passing grade on a weak assertion is worse than useless — it creates false confidence. When you notice an assertion that's trivially satisfied, or an important outcome that no assertion checks, say so. + +## Inputs + +You receive these parameters in your prompt: + +- **expectations**: List of expectations to evaluate (strings) +- **transcript_path**: Path to the execution transcript (markdown file) +- **outputs_dir**: Directory containing output files from execution + +## Process + +### Step 1: Read the Transcript + +1. Read the transcript file completely +2. Note the eval prompt, execution steps, and final result +3. Identify any issues or errors documented + +### Step 2: Examine Output Files + +1. List files in outputs_dir +2. Read/examine each file relevant to the expectations. If outputs aren't plain text, use the inspection tools provided in your prompt — don't rely solely on what the transcript says the executor produced. +3. Note contents, structure, and quality + +### Step 3: Evaluate Each Assertion + +For each expectation: + +1. **Search for evidence** in the transcript and outputs +2. **Determine verdict**: + - **PASS**: Clear evidence the expectation is true AND the evidence reflects genuine task completion, not just surface-level compliance + - **FAIL**: No evidence, or evidence contradicts the expectation, or the evidence is superficial (e.g., correct filename but empty/wrong content) +3. **Cite the evidence**: Quote the specific text or describe what you found + +### Step 4: Extract and Verify Claims + +Beyond the predefined expectations, extract implicit claims from the outputs and verify them: + +1. **Extract claims** from the transcript and outputs: + - Factual statements ("The form has 12 fields") + - Process claims ("Used pypdf to fill the form") + - Quality claims ("All fields were filled correctly") + +2. **Verify each claim**: + - **Factual claims**: Can be checked against the outputs or external sources + - **Process claims**: Can be verified from the transcript + - **Quality claims**: Evaluate whether the claim is justified + +3. **Flag unverifiable claims**: Note claims that cannot be verified with available information + +This catches issues that predefined expectations might miss. + +### Step 5: Read User Notes + +If `{outputs_dir}/user_notes.md` exists: +1. Read it and note any uncertainties or issues flagged by the executor +2. Include relevant concerns in the grading output +3. These may reveal problems even when expectations pass + +### Step 6: Critique the Evals + +After grading, consider whether the evals themselves could be improved. Only surface suggestions when there's a clear gap. + +Good suggestions test meaningful outcomes — assertions that are hard to satisfy without actually doing the work correctly. 
Think about what makes an assertion *discriminating*: it passes when the skill genuinely succeeds and fails when it doesn't. + +Suggestions worth raising: +- An assertion that passed but would also pass for a clearly wrong output (e.g., checking filename existence but not file content) +- An important outcome you observed — good or bad — that no assertion covers at all +- An assertion that can't actually be verified from the available outputs + +Keep the bar high. The goal is to flag things the eval author would say "good catch" about, not to nitpick every assertion. + +### Step 7: Write Grading Results + +Save results to `{outputs_dir}/../grading.json` (sibling to outputs_dir). + +## Grading Criteria + +**PASS when**: +- The transcript or outputs clearly demonstrate the expectation is true +- Specific evidence can be cited +- The evidence reflects genuine substance, not just surface compliance (e.g., a file exists AND contains correct content, not just the right filename) + +**FAIL when**: +- No evidence found for the expectation +- Evidence contradicts the expectation +- The expectation cannot be verified from available information +- The evidence is superficial — the assertion is technically satisfied but the underlying task outcome is wrong or incomplete +- The output appears to meet the assertion by coincidence rather than by actually doing the work + +**When uncertain**: The burden of proof to pass is on the expectation. + +### Step 8: Read Executor Metrics and Timing + +1. If `{outputs_dir}/metrics.json` exists, read it and include in grading output +2. If `{outputs_dir}/../timing.json` exists, read it and include timing data + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "expectations": [ + { + "text": "The output includes the name 'John Smith'", + "passed": true, + "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'" + }, + { + "text": "The spreadsheet has a SUM formula in cell B10", + "passed": false, + "evidence": "No spreadsheet was created. The output was a text file." 
+ }, + { + "text": "The assistant used the skill's OCR script", + "passed": true, + "evidence": "Transcript Step 2 shows: 'Tool: Bash - python ocr_script.py image.png'" + } + ], + "summary": { + "passed": 2, + "failed": 1, + "total": 3, + "pass_rate": 0.67 + }, + "execution_metrics": { + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8 + }, + "total_tool_calls": 15, + "total_steps": 6, + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 + }, + "timing": { + "executor_duration_seconds": 165.0, + "grader_duration_seconds": 26.0, + "total_duration_seconds": 191.0 + }, + "claims": [ + { + "claim": "The form has 12 fillable fields", + "type": "factual", + "verified": true, + "evidence": "Counted 12 fields in field_info.json" + }, + { + "claim": "All required fields were populated", + "type": "quality", + "verified": false, + "evidence": "Reference section was left blank despite data being available" + } + ], + "user_notes_summary": { + "uncertainties": ["Used 2023 data, may be stale"], + "needs_review": [], + "workarounds": ["Fell back to text overlay for non-fillable fields"] + }, + "eval_feedback": { + "suggestions": [ + { + "assertion": "The output includes the name 'John Smith'", + "reason": "A hallucinated document that mentions the name would also pass — consider checking it appears as the primary contact with matching phone and email from the input" + }, + { + "reason": "No assertion checks whether the extracted phone numbers match the input — I observed incorrect numbers in the output that went uncaught" + } + ], + "overall": "Assertions check presence but not correctness. Consider adding content verification." + } +} +``` + +## Field Descriptions + +- **expectations**: Array of graded expectations + - **text**: The original expectation text + - **passed**: Boolean - true if expectation passes + - **evidence**: Specific quote or description supporting the verdict +- **summary**: Aggregate statistics + - **passed**: Count of passed expectations + - **failed**: Count of failed expectations + - **total**: Total expectations evaluated + - **pass_rate**: Fraction passed (0.0 to 1.0) +- **execution_metrics**: Copied from executor's metrics.json (if available) + - **output_chars**: Total character count of output files (proxy for tokens) + - **transcript_chars**: Character count of transcript +- **timing**: Wall clock timing from timing.json (if available) + - **executor_duration_seconds**: Time spent in executor subagent + - **total_duration_seconds**: Total elapsed time for the run +- **claims**: Extracted and verified claims from the output + - **claim**: The statement being verified + - **type**: "factual", "process", or "quality" + - **verified**: Boolean - whether the claim holds + - **evidence**: Supporting or contradicting evidence +- **user_notes_summary**: Issues flagged by the executor + - **uncertainties**: Things the executor wasn't sure about + - **needs_review**: Items requiring human attention + - **workarounds**: Places where the skill didn't work as expected +- **eval_feedback**: Improvement suggestions for the evals (only when warranted) + - **suggestions**: List of concrete suggestions, each with a `reason` and optionally an `assertion` it relates to + - **overall**: Brief assessment — can be "No suggestions, evals look solid" if nothing to flag + +## Guidelines + +- **Be objective**: Base verdicts on evidence, not assumptions +- **Be specific**: Quote the exact text that supports your verdict +- **Be thorough**: Check both transcript and 
output files
- **Be consistent**: Apply the same standard to each expectation
- **Explain failures**: Make it clear why evidence was insufficient
- **No partial credit**: Each expectation is pass or fail, not partial

diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/assets/eval_review.html b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/assets/eval_review.html new file mode 100644 index 000000000..938ff32ae --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/assets/eval_review.html @@ -0,0 +1,146 @@
[HTML template; visible text: page title "Eval Set Review - __SKILL_NAME_PLACEHOLDER__", header "Eval Set Review: __SKILL_NAME_PLACEHOLDER__", a "Current description: __SKILL_DESCRIPTION_PLACEHOLDER__" line, and a review table with Query, Should Trigger, and Actions columns.]
+ + + + diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/eval-viewer/generate_review.py b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/eval-viewer/generate_review.py new file mode 100644 index 000000000..4f0b1fe00 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/eval-viewer/generate_review.py @@ -0,0 +1,473 @@ +#!/usr/bin/env python3 +"""Generate and serve a review page for eval results. + +Reads the workspace directory, discovers runs (directories with outputs/), +embeds all output data into a self-contained HTML page, and serves it via +a tiny HTTP server. Feedback auto-saves to feedback.json in the workspace. + +Usage: + python generate_review.py [--port PORT] [--skill-name NAME] + python generate_review.py --previous-feedback /path/to/old/feedback.json + +No dependencies beyond the Python stdlib are required. +""" + +import argparse +import base64 +import json +import mimetypes +import os +import re +import signal +import subprocess +import sys +import time +import webbrowser +from functools import partial +from http.server import HTTPServer, BaseHTTPRequestHandler +from pathlib import Path + +from loguru import logger + +# Files to exclude from output listings +METADATA_FILES = {"transcript.md", "user_notes.md", "metrics.json"} + +# Extensions we render as inline text +TEXT_EXTENSIONS = { + ".txt", ".md", ".json", ".csv", ".py", ".js", ".ts", ".tsx", ".jsx", + ".yaml", ".yml", ".xml", ".html", ".css", ".sh", ".rb", ".go", ".rs", + ".java", ".c", ".cpp", ".h", ".hpp", ".sql", ".r", ".toml", +} + +# Extensions we render as inline images +IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp"} + +# MIME type overrides for common types +MIME_OVERRIDES = { + ".svg": "image/svg+xml", + ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation", +} + + +def get_mime_type(path: Path) -> str: + ext = path.suffix.lower() + if ext in MIME_OVERRIDES: + return MIME_OVERRIDES[ext] + mime, _ = mimetypes.guess_type(str(path)) + return mime or "application/octet-stream" + + +def find_runs(workspace: Path) -> list[dict]: + """Recursively find directories that contain an outputs/ subdirectory.""" + runs: list[dict] = [] + _find_runs_recursive(workspace, workspace, runs) + runs.sort(key=lambda r: (r.get("eval_id", float("inf")), r["id"])) + return runs + + +def _find_runs_recursive(root: Path, current: Path, runs: list[dict]) -> None: + if not current.is_dir(): + return + + outputs_dir = current / "outputs" + if outputs_dir.is_dir(): + run = build_run(root, current) + if run: + runs.append(run) + return + + skip = {"node_modules", ".git", "__pycache__", "skill", "inputs"} + for child in sorted(current.iterdir()): + if child.is_dir() and child.name not in skip: + _find_runs_recursive(root, child, runs) + + +def build_run(root: Path, run_dir: Path) -> dict | None: + """Build a run dict with prompt, outputs, and grading data.""" + prompt = "" + eval_id = None + + # Try eval_metadata.json + for candidate in [run_dir / "eval_metadata.json", run_dir.parent / "eval_metadata.json"]: + if candidate.exists(): + try: + metadata = json.loads(candidate.read_text()) + prompt = metadata.get("prompt", "") + eval_id = metadata.get("eval_id") + except (json.JSONDecodeError, OSError): + pass + if 
prompt: + break + + # Fall back to transcript.md + if not prompt: + for candidate in [run_dir / "transcript.md", run_dir / "outputs" / "transcript.md"]: + if candidate.exists(): + try: + text = candidate.read_text() + match = re.search(r"## Eval Prompt\n\n([\s\S]*?)(?=\n##|$)", text) + if match: + prompt = match.group(1).strip() + except OSError: + pass + if prompt: + break + + if not prompt: + prompt = "(No prompt found)" + + run_id = str(run_dir.relative_to(root)).replace("/", "-").replace("\\", "-") + + # Collect output files + outputs_dir = run_dir / "outputs" + output_files: list[dict] = [] + if outputs_dir.is_dir(): + for f in sorted(outputs_dir.iterdir()): + if f.is_file() and f.name not in METADATA_FILES: + output_files.append(embed_file(f)) + + # Load grading if present + grading = None + for candidate in [run_dir / "grading.json", run_dir.parent / "grading.json"]: + if candidate.exists(): + try: + grading = json.loads(candidate.read_text()) + except (json.JSONDecodeError, OSError): + pass + if grading: + break + + return { + "id": run_id, + "prompt": prompt, + "eval_id": eval_id, + "outputs": output_files, + "grading": grading, + } + + +def embed_file(path: Path) -> dict: + """Read a file and return an embedded representation.""" + ext = path.suffix.lower() + mime = get_mime_type(path) + + if ext in TEXT_EXTENSIONS: + try: + content = path.read_text(errors="replace") + except OSError: + content = "(Error reading file)" + return { + "name": path.name, + "type": "text", + "content": content, + } + elif ext in IMAGE_EXTENSIONS: + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "image", + "mime": mime, + "data_uri": f"data:{mime};base64,{b64}", + } + elif ext == ".pdf": + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "pdf", + "data_uri": f"data:{mime};base64,{b64}", + } + elif ext == ".xlsx": + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "xlsx", + "data_b64": b64, + } + else: + # Binary / unknown — base64 download link + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "binary", + "mime": mime, + "data_uri": f"data:{mime};base64,{b64}", + } + + +def load_previous_iteration(workspace: Path) -> dict[str, dict]: + """Load previous iteration's feedback and outputs. + + Returns a map of run_id -> {"feedback": str, "outputs": list[dict]}. 
+ """ + result: dict[str, dict] = {} + + # Load feedback + feedback_map: dict[str, str] = {} + feedback_path = workspace / "feedback.json" + if feedback_path.exists(): + try: + data = json.loads(feedback_path.read_text()) + feedback_map = { + r["run_id"]: r["feedback"] + for r in data.get("reviews", []) + if r.get("feedback", "").strip() + } + except (json.JSONDecodeError, OSError, KeyError): + pass + + # Load runs (to get outputs) + prev_runs = find_runs(workspace) + for run in prev_runs: + result[run["id"]] = { + "feedback": feedback_map.get(run["id"], ""), + "outputs": run.get("outputs", []), + } + + # Also add feedback for run_ids that had feedback but no matching run + for run_id, fb in feedback_map.items(): + if run_id not in result: + result[run_id] = {"feedback": fb, "outputs": []} + + return result + + +def generate_html( + runs: list[dict], + skill_name: str, + previous: dict[str, dict] | None = None, + benchmark: dict | None = None, +) -> str: + """Generate the complete standalone HTML page with embedded data.""" + template_path = Path(__file__).parent / "viewer.html" + template = template_path.read_text() + + # Build previous_feedback and previous_outputs maps for the template + previous_feedback: dict[str, str] = {} + previous_outputs: dict[str, list[dict]] = {} + if previous: + for run_id, data in previous.items(): + if data.get("feedback"): + previous_feedback[run_id] = data["feedback"] + if data.get("outputs"): + previous_outputs[run_id] = data["outputs"] + + embedded = { + "skill_name": skill_name, + "runs": runs, + "previous_feedback": previous_feedback, + "previous_outputs": previous_outputs, + } + if benchmark: + embedded["benchmark"] = benchmark + + data_json = json.dumps(embedded) + + return template.replace("/*__EMBEDDED_DATA__*/", f"const EMBEDDED_DATA = {data_json};") + + +# --------------------------------------------------------------------------- +# HTTP server (stdlib only, zero dependencies) +# --------------------------------------------------------------------------- + +def _kill_port(port: int) -> None: + """Kill any process listening on the given port.""" + try: + result = subprocess.run( + ["lsof", "-ti", f":{port}"], + capture_output=True, text=True, timeout=5, + ) + for pid_str in result.stdout.strip().split("\n"): + if pid_str.strip(): + try: + os.kill(int(pid_str.strip()), signal.SIGTERM) + except (ProcessLookupError, ValueError): + pass + if result.stdout.strip(): + time.sleep(0.5) + except subprocess.TimeoutExpired: + pass + except FileNotFoundError: + logger.warning("Note: lsof not found, cannot check if port is in use") + +class ReviewHandler(BaseHTTPRequestHandler): + """Serves the review HTML and handles feedback saves. + + Regenerates the HTML on each page load so that refreshing the browser + picks up new eval outputs without restarting the server. 
+ """ + + def __init__( + self, + workspace: Path, + skill_name: str, + feedback_path: Path, + previous: dict[str, dict], + benchmark_path: Path | None, + *args, + **kwargs, + ): + self.workspace = workspace + self.skill_name = skill_name + self.feedback_path = feedback_path + self.previous = previous + self.benchmark_path = benchmark_path + super().__init__(*args, **kwargs) + + def do_GET(self) -> None: + if self.path == "/" or self.path == "/index.html": + # Regenerate HTML on each request (re-scans workspace for new outputs) + runs = find_runs(self.workspace) + benchmark = None + if self.benchmark_path and self.benchmark_path.exists(): + try: + benchmark = json.loads(self.benchmark_path.read_text()) + except (json.JSONDecodeError, OSError): + pass + html = generate_html(runs, self.skill_name, self.previous, benchmark) + content = html.encode("utf-8") + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(content))) + self.end_headers() + self.wfile.write(content) + elif self.path == "/api/feedback": + data = b"{}" + if self.feedback_path.exists(): + data = self.feedback_path.read_bytes() + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(data))) + self.end_headers() + self.wfile.write(data) + else: + self.send_error(404) + + def do_POST(self) -> None: + if self.path == "/api/feedback": + length = int(self.headers.get("Content-Length", 0)) + body = self.rfile.read(length) + try: + data = json.loads(body) + if not isinstance(data, dict) or "reviews" not in data: + raise ValueError("Expected JSON object with 'reviews' key") + self.feedback_path.write_text(json.dumps(data, indent=2) + "\n") + resp = b'{"ok":true}' + self.send_response(200) + except (json.JSONDecodeError, OSError, ValueError) as e: + resp = json.dumps({"error": str(e)}).encode() + self.send_response(500) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(resp))) + self.end_headers() + self.wfile.write(resp) + else: + self.send_error(404) + + def log_message(self, format: str, *args: object) -> None: + # Suppress request logging to keep terminal clean + pass + + +def main() -> None: + parser = argparse.ArgumentParser(description="Generate and serve eval review") + parser.add_argument("workspace", type=Path, help="Path to workspace directory") + parser.add_argument("--port", "-p", type=int, default=3117, help="Server port (default: 3117)") + parser.add_argument("--skill-name", "-n", type=str, default=None, help="Skill name for header") + parser.add_argument( + "--previous-workspace", type=Path, default=None, + help="Path to previous iteration's workspace (shows old outputs and feedback as context)", + ) + parser.add_argument( + "--benchmark", type=Path, default=None, + help="Path to benchmark.json to show in the Benchmark tab", + ) + parser.add_argument( + "--static", "-s", type=Path, default=None, + help="Write standalone HTML to this path instead of starting a server", + ) + args = parser.parse_args() + + workspace = args.workspace.resolve() + if not workspace.is_dir(): + logger.error(f"Error: {workspace} is not a directory") + sys.exit(1) + + runs = find_runs(workspace) + if not runs: + logger.error(f"No runs found in {workspace}") + sys.exit(1) + + skill_name = args.skill_name or workspace.name.replace("-workspace", "") + feedback_path = workspace / "feedback.json" + + previous: dict[str, dict] = {} + if 
args.previous_workspace: + previous = load_previous_iteration(args.previous_workspace.resolve()) + + benchmark_path = args.benchmark.resolve() if args.benchmark else None + benchmark = None + if benchmark_path and benchmark_path.exists(): + try: + benchmark = json.loads(benchmark_path.read_text()) + except (json.JSONDecodeError, OSError): + pass + + if args.static: + html = generate_html(runs, skill_name, previous, benchmark) + args.static.parent.mkdir(parents=True, exist_ok=True) + args.static.write_text(html) + logger.info(f"\n Static viewer written to: {args.static}\n") + sys.exit(0) + + # Kill any existing process on the target port + port = args.port + _kill_port(port) + handler = partial(ReviewHandler, workspace, skill_name, feedback_path, previous, benchmark_path) + try: + server = HTTPServer(("127.0.0.1", port), handler) + except OSError: + # Port still in use after kill attempt — find a free one + server = HTTPServer(("127.0.0.1", 0), handler) + port = server.server_address[1] + + url = f"http://localhost:{port}" + logger.info(f"\n Eval Viewer") + logger.info(f" ─────────────────────────────────") + logger.info(f" URL: {url}") + logger.info(f" Workspace: {workspace}") + logger.info(f" Feedback: {feedback_path}") + if previous: + logger.info(f" Previous: {args.previous_workspace} ({len(previous)} runs)") + if benchmark_path: + logger.info(f" Benchmark: {benchmark_path}") + logger.info(f"\n Press Ctrl+C to stop.\n") + + webbrowser.open(url) + + try: + server.serve_forever() + except KeyboardInterrupt: + logger.info("\nStopped.") + server.server_close() + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/eval-viewer/viewer.html b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/eval-viewer/viewer.html new file mode 100644 index 000000000..6d8e96348 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/eval-viewer/viewer.html @@ -0,0 +1,1325 @@ + + + + + + Eval Review + + + + + + + +
[HTML single-page viewer; visible text: header "Eval Review:" followed by the skill name, the instruction "Review each output and leave feedback below. Navigate with arrow keys or buttons. When done, copy feedback and paste into Claude Code.", Prompt and Output panels ("No output files found" when empty), a "Your Feedback" box, a Benchmark tab ("No benchmark data available. Run a benchmark to see quantitative results here."), and a completion screen ("Review Complete", "Your feedback has been saved. Go back to your Claude Code session and tell Claude you're done reviewing.").]
+ + + + diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/references/schemas.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/references/schemas.md new file mode 100644 index 000000000..b6eeaa2d4 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/references/schemas.md @@ -0,0 +1,430 @@ +# JSON Schemas + +This document defines the JSON schemas used by skill-creator. + +--- + +## evals.json + +Defines the evals for a skill. Located at `evals/evals.json` within the skill directory. + +```json +{ + "skill_name": "example-skill", + "evals": [ + { + "id": 1, + "prompt": "User's example prompt", + "expected_output": "Description of expected result", + "files": ["evals/files/sample1.pdf"], + "expectations": [ + "The output includes X", + "The skill used script Y" + ] + } + ] +} +``` + +**Fields:** +- `skill_name`: Name matching the skill's frontmatter +- `evals[].id`: Unique integer identifier +- `evals[].prompt`: The task to execute +- `evals[].expected_output`: Human-readable description of success +- `evals[].files`: Optional list of input file paths (relative to skill root) +- `evals[].expectations`: List of verifiable statements + +--- + +## history.json + +Tracks version progression in Improve mode. Located at workspace root. + +```json +{ + "started_at": "2026-01-15T10:30:00Z", + "skill_name": "pdf", + "current_best": "v2", + "iterations": [ + { + "version": "v0", + "parent": null, + "expectation_pass_rate": 0.65, + "grading_result": "baseline", + "is_current_best": false + }, + { + "version": "v1", + "parent": "v0", + "expectation_pass_rate": 0.75, + "grading_result": "won", + "is_current_best": false + }, + { + "version": "v2", + "parent": "v1", + "expectation_pass_rate": 0.85, + "grading_result": "won", + "is_current_best": true + } + ] +} +``` + +**Fields:** +- `started_at`: ISO timestamp of when improvement started +- `skill_name`: Name of the skill being improved +- `current_best`: Version identifier of the best performer +- `iterations[].version`: Version identifier (v0, v1, ...) +- `iterations[].parent`: Parent version this was derived from +- `iterations[].expectation_pass_rate`: Pass rate from grading +- `iterations[].grading_result`: "baseline", "won", "lost", or "tie" +- `iterations[].is_current_best`: Whether this is the current best version + +--- + +## grading.json + +Output from the grader agent. Located at `/grading.json`. + +```json +{ + "expectations": [ + { + "text": "The output includes the name 'John Smith'", + "passed": true, + "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'" + }, + { + "text": "The spreadsheet has a SUM formula in cell B10", + "passed": false, + "evidence": "No spreadsheet was created. The output was a text file." 
+ } + ], + "summary": { + "passed": 2, + "failed": 1, + "total": 3, + "pass_rate": 0.67 + }, + "execution_metrics": { + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8 + }, + "total_tool_calls": 15, + "total_steps": 6, + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 + }, + "timing": { + "executor_duration_seconds": 165.0, + "grader_duration_seconds": 26.0, + "total_duration_seconds": 191.0 + }, + "claims": [ + { + "claim": "The form has 12 fillable fields", + "type": "factual", + "verified": true, + "evidence": "Counted 12 fields in field_info.json" + } + ], + "user_notes_summary": { + "uncertainties": ["Used 2023 data, may be stale"], + "needs_review": [], + "workarounds": ["Fell back to text overlay for non-fillable fields"] + }, + "eval_feedback": { + "suggestions": [ + { + "assertion": "The output includes the name 'John Smith'", + "reason": "A hallucinated document that mentions the name would also pass" + } + ], + "overall": "Assertions check presence but not correctness." + } +} +``` + +**Fields:** +- `expectations[]`: Graded expectations with evidence +- `summary`: Aggregate pass/fail counts +- `execution_metrics`: Tool usage and output size (from executor's metrics.json) +- `timing`: Wall clock timing (from timing.json) +- `claims`: Extracted and verified claims from the output +- `user_notes_summary`: Issues flagged by the executor +- `eval_feedback`: (optional) Improvement suggestions for the evals, only present when the grader identifies issues worth raising + +--- + +## metrics.json + +Output from the executor agent. Located at `/outputs/metrics.json`. + +```json +{ + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8, + "Edit": 1, + "Glob": 2, + "Grep": 0 + }, + "total_tool_calls": 18, + "total_steps": 6, + "files_created": ["filled_form.pdf", "field_values.json"], + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 +} +``` + +**Fields:** +- `tool_calls`: Count per tool type +- `total_tool_calls`: Sum of all tool calls +- `total_steps`: Number of major execution steps +- `files_created`: List of output files created +- `errors_encountered`: Number of errors during execution +- `output_chars`: Total character count of output files +- `transcript_chars`: Character count of transcript + +--- + +## timing.json + +Wall clock timing for a run. Located at `/timing.json`. + +**How to capture:** When a subagent task completes, the task notification includes `total_tokens` and `duration_ms`. Save these immediately — they are not persisted anywhere else and cannot be recovered after the fact. + +```json +{ + "total_tokens": 84852, + "duration_ms": 23332, + "total_duration_seconds": 23.3, + "executor_start": "2026-01-15T10:30:00Z", + "executor_end": "2026-01-15T10:32:45Z", + "executor_duration_seconds": 165.0, + "grader_start": "2026-01-15T10:32:46Z", + "grader_end": "2026-01-15T10:33:12Z", + "grader_duration_seconds": 26.0 +} +``` + +--- + +## benchmark.json + +Output from Benchmark mode. Located at `benchmarks//benchmark.json`. 
+ +```json +{ + "metadata": { + "skill_name": "pdf", + "skill_path": "/path/to/pdf", + "executor_model": "claude-sonnet-4-20250514", + "analyzer_model": "most-capable-model", + "timestamp": "2026-01-15T10:30:00Z", + "evals_run": [1, 2, 3], + "runs_per_configuration": 3 + }, + + "runs": [ + { + "eval_id": 1, + "eval_name": "Ocean", + "configuration": "with_skill", + "run_number": 1, + "result": { + "pass_rate": 0.85, + "passed": 6, + "failed": 1, + "total": 7, + "time_seconds": 42.5, + "tokens": 3800, + "tool_calls": 18, + "errors": 0 + }, + "expectations": [ + {"text": "...", "passed": true, "evidence": "..."} + ], + "notes": [ + "Used 2023 data, may be stale", + "Fell back to text overlay for non-fillable fields" + ] + } + ], + + "run_summary": { + "with_skill": { + "pass_rate": {"mean": 0.85, "stddev": 0.05, "min": 0.80, "max": 0.90}, + "time_seconds": {"mean": 45.0, "stddev": 12.0, "min": 32.0, "max": 58.0}, + "tokens": {"mean": 3800, "stddev": 400, "min": 3200, "max": 4100} + }, + "without_skill": { + "pass_rate": {"mean": 0.35, "stddev": 0.08, "min": 0.28, "max": 0.45}, + "time_seconds": {"mean": 32.0, "stddev": 8.0, "min": 24.0, "max": 42.0}, + "tokens": {"mean": 2100, "stddev": 300, "min": 1800, "max": 2500} + }, + "delta": { + "pass_rate": "+0.50", + "time_seconds": "+13.0", + "tokens": "+1700" + } + }, + + "notes": [ + "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value", + "Eval 3 shows high variance (50% ± 40%) - may be flaky or model-dependent", + "Without-skill runs consistently fail on table extraction expectations", + "Skill adds 13s average execution time but improves pass rate by 50%" + ] +} +``` + +**Fields:** +- `metadata`: Information about the benchmark run + - `skill_name`: Name of the skill + - `timestamp`: When the benchmark was run + - `evals_run`: List of eval names or IDs + - `runs_per_configuration`: Number of runs per config (e.g. 3) +- `runs[]`: Individual run results + - `eval_id`: Numeric eval identifier + - `eval_name`: Human-readable eval name (used as section header in the viewer) + - `configuration`: Must be `"with_skill"` or `"without_skill"` (the viewer uses this exact string for grouping and color coding) + - `run_number`: Integer run number (1, 2, 3...) + - `result`: Nested object with `pass_rate`, `passed`, `total`, `time_seconds`, `tokens`, `errors` +- `run_summary`: Statistical aggregates per configuration + - `with_skill` / `without_skill`: Each contains `pass_rate`, `time_seconds`, `tokens` objects with `mean` and `stddev` fields + - `delta`: Difference strings like `"+0.50"`, `"+13.0"`, `"+1700"` +- `notes`: Freeform observations from the analyzer + +**Important:** The viewer reads these field names exactly. Using `config` instead of `configuration`, or putting `pass_rate` at the top level of a run instead of nested under `result`, will cause the viewer to show empty/zero values. Always reference this schema when generating benchmark.json manually. + +--- + +## comparison.json + +Output from blind comparator. Located at `/comparison-N.json`. + +```json +{ + "winner": "A", + "reasoning": "Output A provides a complete solution with proper formatting and all required fields. 
Output B is missing the date field and has formatting inconsistencies.", + "rubric": { + "A": { + "content": { + "correctness": 5, + "completeness": 5, + "accuracy": 4 + }, + "structure": { + "organization": 4, + "formatting": 5, + "usability": 4 + }, + "content_score": 4.7, + "structure_score": 4.3, + "overall_score": 9.0 + }, + "B": { + "content": { + "correctness": 3, + "completeness": 2, + "accuracy": 3 + }, + "structure": { + "organization": 3, + "formatting": 2, + "usability": 3 + }, + "content_score": 2.7, + "structure_score": 2.7, + "overall_score": 5.4 + } + }, + "output_quality": { + "A": { + "score": 9, + "strengths": ["Complete solution", "Well-formatted", "All fields present"], + "weaknesses": ["Minor style inconsistency in header"] + }, + "B": { + "score": 5, + "strengths": ["Readable output", "Correct basic structure"], + "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"] + } + }, + "expectation_results": { + "A": { + "passed": 4, + "total": 5, + "pass_rate": 0.80, + "details": [ + {"text": "Output includes name", "passed": true} + ] + }, + "B": { + "passed": 3, + "total": 5, + "pass_rate": 0.60, + "details": [ + {"text": "Output includes name", "passed": true} + ] + } + } +} +``` + +--- + +## analysis.json + +Output from post-hoc analyzer. Located at `/analysis.json`. + +```json +{ + "comparison_summary": { + "winner": "A", + "winner_skill": "path/to/winner/skill", + "loser_skill": "path/to/loser/skill", + "comparator_reasoning": "Brief summary of why comparator chose winner" + }, + "winner_strengths": [ + "Clear step-by-step instructions for handling multi-page documents", + "Included validation script that caught formatting errors" + ], + "loser_weaknesses": [ + "Vague instruction 'process the document appropriately' led to inconsistent behavior", + "No script for validation, agent had to improvise" + ], + "instruction_following": { + "winner": { + "score": 9, + "issues": ["Minor: skipped optional logging step"] + }, + "loser": { + "score": 6, + "issues": [ + "Did not use the skill's formatting template", + "Invented own approach instead of following step 3" + ] + } + }, + "improvement_suggestions": [ + { + "priority": "high", + "category": "instructions", + "suggestion": "Replace 'process the document appropriately' with explicit steps", + "expected_impact": "Would eliminate ambiguity that caused inconsistent behavior" + } + ], + "transcript_insights": { + "winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script", + "loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods" + } +} +``` diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/__init__.py b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/aggregate_benchmark.py b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/aggregate_benchmark.py new file mode 100644 index 000000000..ccc810819 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/aggregate_benchmark.py @@ -0,0 +1,403 @@ +#!/usr/bin/env python3 +""" +Aggregate individual run results into benchmark summary statistics. 
+ +Reads grading.json files from run directories and produces: +- run_summary with mean, stddev, min, max for each metric +- delta between with_skill and without_skill configurations + +Usage: + python aggregate_benchmark.py + +Example: + python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/ + +The script supports two directory layouts: + + Workspace layout (from skill-creator iterations): + / + └── eval-N/ + ├── with_skill/ + │ ├── run-1/grading.json + │ └── run-2/grading.json + └── without_skill/ + ├── run-1/grading.json + └── run-2/grading.json + + Legacy layout (with runs/ subdirectory): + / + └── runs/ + └── eval-N/ + ├── with_skill/ + │ └── run-1/grading.json + └── without_skill/ + └── run-1/grading.json +""" + +import argparse +import json +import math +import sys +from datetime import datetime, timezone +from pathlib import Path + +from loguru import logger + + +def calculate_stats(values: list[float]) -> dict: + """Calculate mean, stddev, min, max for a list of values.""" + if not values: + return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0} + + n = len(values) + mean = sum(values) / n + + if n > 1: + variance = sum((x - mean) ** 2 for x in values) / (n - 1) + stddev = math.sqrt(variance) + else: + stddev = 0.0 + + return { + "mean": round(mean, 4), + "stddev": round(stddev, 4), + "min": round(min(values), 4), + "max": round(max(values), 4) + } + + +def load_run_results(benchmark_dir: Path) -> dict: + """ + Load all run results from a benchmark directory. + + Returns dict keyed by config name (e.g. "with_skill"/"without_skill", + or "new_skill"/"old_skill"), each containing a list of run results. + """ + # Support both layouts: eval dirs directly under benchmark_dir, or under runs/ + runs_dir = benchmark_dir / "runs" + if runs_dir.exists(): + search_dir = runs_dir + elif list(benchmark_dir.glob("eval-*")): + search_dir = benchmark_dir + else: + logger.warning(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}") + return {} + + results: dict[str, list] = {} + + for eval_idx, eval_dir in enumerate(sorted(search_dir.glob("eval-*"))): + metadata_path = eval_dir / "eval_metadata.json" + if metadata_path.exists(): + try: + with open(metadata_path) as mf: + eval_id = json.load(mf).get("eval_id", eval_idx) + except (json.JSONDecodeError, OSError): + eval_id = eval_idx + else: + try: + eval_id = int(eval_dir.name.split("-")[1]) + except ValueError: + eval_id = eval_idx + + # Discover config directories dynamically rather than hardcoding names + for config_dir in sorted(eval_dir.iterdir()): + if not config_dir.is_dir(): + continue + # Skip non-config directories (inputs, outputs, etc.) 
+ if not list(config_dir.glob("run-*")): + continue + config = config_dir.name + if config not in results: + results[config] = [] + + for run_dir in sorted(config_dir.glob("run-*")): + run_number = int(run_dir.name.split("-")[1]) + grading_file = run_dir / "grading.json" + + if not grading_file.exists(): + logger.warning(f"Warning: grading.json not found in {run_dir}") + continue + + try: + with open(grading_file) as f: + grading = json.load(f) + except json.JSONDecodeError as e: + logger.warning(f"Warning: Invalid JSON in {grading_file}: {e}") + continue + + # Extract metrics + result = { + "eval_id": eval_id, + "run_number": run_number, + "pass_rate": grading.get("summary", {}).get("pass_rate", 0.0), + "passed": grading.get("summary", {}).get("passed", 0), + "failed": grading.get("summary", {}).get("failed", 0), + "total": grading.get("summary", {}).get("total", 0), + } + + # Extract timing — check grading.json first, then sibling timing.json + timing = grading.get("timing", {}) + result["time_seconds"] = timing.get("total_duration_seconds", 0.0) + timing_file = run_dir / "timing.json" + if result["time_seconds"] == 0.0 and timing_file.exists(): + try: + with open(timing_file) as tf: + timing_data = json.load(tf) + result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0) + result["tokens"] = timing_data.get("total_tokens", 0) + except json.JSONDecodeError: + pass + + # Extract metrics if available + metrics = grading.get("execution_metrics", {}) + result["tool_calls"] = metrics.get("total_tool_calls", 0) + if not result.get("tokens"): + result["tokens"] = metrics.get("output_chars", 0) + result["errors"] = metrics.get("errors_encountered", 0) + + # Extract expectations — viewer requires fields: text, passed, evidence + raw_expectations = grading.get("expectations", []) + for exp in raw_expectations: + if "text" not in exp or "passed" not in exp: + logger.warning(f"Warning: expectation in {grading_file} missing required fields (text, passed, evidence): {exp}") + result["expectations"] = raw_expectations + + # Extract notes from user_notes_summary + notes_summary = grading.get("user_notes_summary", {}) + notes = [] + notes.extend(notes_summary.get("uncertainties", [])) + notes.extend(notes_summary.get("needs_review", [])) + notes.extend(notes_summary.get("workarounds", [])) + result["notes"] = notes + + results[config].append(result) + + return results + + +def aggregate_results(results: dict) -> dict: + """ + Aggregate run results into summary statistics. + + Returns run_summary with stats for each configuration and delta. 
+ """ + run_summary = {} + configs = list(results.keys()) + + for config in configs: + runs = results.get(config, []) + + if not runs: + run_summary[config] = { + "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}, + "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}, + "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0} + } + continue + + pass_rates = [r["pass_rate"] for r in runs] + times = [r["time_seconds"] for r in runs] + tokens = [r.get("tokens", 0) for r in runs] + + run_summary[config] = { + "pass_rate": calculate_stats(pass_rates), + "time_seconds": calculate_stats(times), + "tokens": calculate_stats(tokens) + } + + # Calculate delta between the first two configs (if two exist) + if len(configs) >= 2: + primary = run_summary.get(configs[0], {}) + baseline = run_summary.get(configs[1], {}) + else: + primary = run_summary.get(configs[0], {}) if configs else {} + baseline = {} + + delta_pass_rate = primary.get("pass_rate", {}).get("mean", 0) - baseline.get("pass_rate", {}).get("mean", 0) + delta_time = primary.get("time_seconds", {}).get("mean", 0) - baseline.get("time_seconds", {}).get("mean", 0) + delta_tokens = primary.get("tokens", {}).get("mean", 0) - baseline.get("tokens", {}).get("mean", 0) + + run_summary["delta"] = { + "pass_rate": f"{delta_pass_rate:+.2f}", + "time_seconds": f"{delta_time:+.1f}", + "tokens": f"{delta_tokens:+.0f}" + } + + return run_summary + + +def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict: + """ + Generate complete benchmark.json from run results. + """ + results = load_run_results(benchmark_dir) + run_summary = aggregate_results(results) + + # Build runs array for benchmark.json + runs = [] + for config in results: + for result in results[config]: + runs.append({ + "eval_id": result["eval_id"], + "configuration": config, + "run_number": result["run_number"], + "result": { + "pass_rate": result["pass_rate"], + "passed": result["passed"], + "failed": result["failed"], + "total": result["total"], + "time_seconds": result["time_seconds"], + "tokens": result.get("tokens", 0), + "tool_calls": result.get("tool_calls", 0), + "errors": result.get("errors", 0) + }, + "expectations": result["expectations"], + "notes": result["notes"] + }) + + # Determine eval IDs from results + eval_ids = sorted(set( + r["eval_id"] + for config in results.values() + for r in config + )) + + benchmark = { + "metadata": { + "skill_name": skill_name or "", + "skill_path": skill_path or "", + "executor_model": "", + "analyzer_model": "", + "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + "evals_run": eval_ids, + "runs_per_configuration": 3 + }, + "runs": runs, + "run_summary": run_summary, + "notes": [] # To be filled by analyzer + } + + return benchmark + + +def generate_markdown(benchmark: dict) -> str: + """Generate human-readable benchmark.md from benchmark data.""" + metadata = benchmark["metadata"] + run_summary = benchmark["run_summary"] + + # Determine config names (excluding "delta") + configs = [k for k in run_summary if k != "delta"] + config_a = configs[0] if len(configs) >= 1 else "config_a" + config_b = configs[1] if len(configs) >= 2 else "config_b" + label_a = config_a.replace("_", " ").title() + label_b = config_b.replace("_", " ").title() + + lines = [ + f"# Skill Benchmark: {metadata['skill_name']}", + "", + f"**Model**: {metadata['executor_model']}", + f"**Date**: {metadata['timestamp']}", + f"**Evals**: {', '.join(map(str, metadata['evals_run']))} 
({metadata['runs_per_configuration']} runs each per configuration)", + "", + "## Summary", + "", + f"| Metric | {label_a} | {label_b} | Delta |", + "|--------|------------|---------------|-------|", + ] + + a_summary = run_summary.get(config_a, {}) + b_summary = run_summary.get(config_b, {}) + delta = run_summary.get("delta", {}) + + # Format pass rate + a_pr = a_summary.get("pass_rate", {}) + b_pr = b_summary.get("pass_rate", {}) + lines.append(f"| Pass Rate | {a_pr.get('mean', 0)*100:.0f}% ± {a_pr.get('stddev', 0)*100:.0f}% | {b_pr.get('mean', 0)*100:.0f}% ± {b_pr.get('stddev', 0)*100:.0f}% | {delta.get('pass_rate', '—')} |") + + # Format time + a_time = a_summary.get("time_seconds", {}) + b_time = b_summary.get("time_seconds", {}) + lines.append(f"| Time | {a_time.get('mean', 0):.1f}s ± {a_time.get('stddev', 0):.1f}s | {b_time.get('mean', 0):.1f}s ± {b_time.get('stddev', 0):.1f}s | {delta.get('time_seconds', '—')}s |") + + # Format tokens + a_tokens = a_summary.get("tokens", {}) + b_tokens = b_summary.get("tokens", {}) + lines.append(f"| Tokens | {a_tokens.get('mean', 0):.0f} ± {a_tokens.get('stddev', 0):.0f} | {b_tokens.get('mean', 0):.0f} ± {b_tokens.get('stddev', 0):.0f} | {delta.get('tokens', '—')} |") + + # Notes section + if benchmark.get("notes"): + lines.extend([ + "", + "## Notes", + "" + ]) + for note in benchmark["notes"]: + lines.append(f"- {note}") + + return "\n".join(lines) + + +def main(): + parser = argparse.ArgumentParser( + description="Aggregate benchmark run results into summary statistics" + ) + parser.add_argument( + "benchmark_dir", + type=Path, + help="Path to the benchmark directory" + ) + parser.add_argument( + "--skill-name", + default="", + help="Name of the skill being benchmarked" + ) + parser.add_argument( + "--skill-path", + default="", + help="Path to the skill being benchmarked" + ) + parser.add_argument( + "--output", "-o", + type=Path, + help="Output path for benchmark.json (default: /benchmark.json)" + ) + + args = parser.parse_args() + + if not args.benchmark_dir.exists(): + logger.error(f"Directory not found: {args.benchmark_dir}") + sys.exit(1) + + # Generate benchmark + benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path) + + # Determine output paths + output_json = args.output or (args.benchmark_dir / "benchmark.json") + output_md = output_json.with_suffix(".md") + + # Write benchmark.json + with open(output_json, "w") as f: + json.dump(benchmark, f, indent=2) + logger.info(f"Generated: {output_json}") + + # Write benchmark.md + markdown = generate_markdown(benchmark) + with open(output_md, "w") as f: + f.write(markdown) + logger.info(f"Generated: {output_md}") + + # Print summary + run_summary = benchmark["run_summary"] + configs = [k for k in run_summary if k != "delta"] + delta = run_summary.get("delta", {}) + + logger.info(f"\nSummary:") + for config in configs: + pr = run_summary[config]["pass_rate"]["mean"] + label = config.replace("_", " ").title() + logger.info(f" {label}: {pr*100:.1f}% pass rate") + logger.info(f" Delta: {delta.get('pass_rate', '—')}") + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/generate_report.py b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/generate_report.py new file mode 100644 index 000000000..395232d96 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/generate_report.py @@ -0,0 +1,328 @@ 
+#!/usr/bin/env python3 +"""Generate an HTML report from run_loop.py output. + +Takes the JSON output from run_loop.py and generates a visual HTML report +showing each description attempt with check/x for each test case. +Distinguishes between train and test queries. +""" + +import argparse +import html +import json +import sys +from pathlib import Path + +from loguru import logger + + +def generate_html(data: dict, auto_refresh: bool = False, skill_name: str = "") -> str: + """Generate HTML report from loop output data. If auto_refresh is True, adds a meta refresh tag.""" + history = data.get("history", []) + holdout = data.get("holdout", 0) + title_prefix = html.escape(skill_name + " \u2014 ") if skill_name else "" + + # Get all unique queries from train and test sets, with should_trigger info + train_queries: list[dict] = [] + test_queries: list[dict] = [] + if history: + for r in history[0].get("train_results", history[0].get("results", [])): + train_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)}) + if history[0].get("test_results"): + for r in history[0].get("test_results", []): + test_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)}) + + refresh_tag = ' \n' if auto_refresh else "" + + html_parts = [""" + + + +""" + refresh_tag + """ """ + title_prefix + """Skill Description Optimization + + + + + + +

""" + title_prefix + """Skill Description Optimization

+
+ Optimizing your skill's description. This page updates automatically as Claude tests different versions of your skill's description. Each row is an iteration — a new description attempt. The columns show test queries: green checkmarks mean the skill triggered correctly (or correctly didn't trigger), red crosses mean it got it wrong. The "Train" score shows performance on queries used to improve the description; the "Test" score shows performance on held-out queries the optimizer hasn't seen. When it's done, Claude will apply the best-performing description to your skill. +
+"""] + + # Summary section + best_test_score = data.get('best_test_score') + best_train_score = data.get('best_train_score') + html_parts.append(f""" +
+

Original: {html.escape(data.get('original_description', 'N/A'))}

+

Best: {html.escape(data.get('best_description', 'N/A'))}

+

Best Score: {data.get('best_score', 'N/A')} {'(test)' if best_test_score else '(train)'}

+

Iterations: {data.get('iterations_run', 0)} | Train: {data.get('train_size', '?')} | Test: {data.get('test_size', '?')}

+
+""") + + # Legend + html_parts.append(""" +
+ Query columns: + Should trigger + Should NOT trigger + Train + Test +
+""") + + # Table header + html_parts.append(""" +
+ + + + + + + +""") + + # Add column headers for train queries + for qinfo in train_queries: + polarity = "positive-col" if qinfo["should_trigger"] else "negative-col" + html_parts.append(f' \n') + + # Add column headers for test queries (different color) + for qinfo in test_queries: + polarity = "positive-col" if qinfo["should_trigger"] else "negative-col" + html_parts.append(f' \n') + + html_parts.append(""" + + +""") + + # Find best iteration for highlighting + if test_queries: + best_iter = max(history, key=lambda h: h.get("test_passed") or 0).get("iteration") + else: + best_iter = max(history, key=lambda h: h.get("train_passed", h.get("passed", 0))).get("iteration") + + # Add rows for each iteration + for h in history: + iteration = h.get("iteration", "?") + train_passed = h.get("train_passed", h.get("passed", 0)) + train_total = h.get("train_total", h.get("total", 0)) + test_passed = h.get("test_passed") + test_total = h.get("test_total") + description = h.get("description", "") + train_results = h.get("train_results", h.get("results", [])) + test_results = h.get("test_results", []) + + # Create lookups for results by query + train_by_query = {r["query"]: r for r in train_results} + test_by_query = {r["query"]: r for r in test_results} if test_results else {} + + # Compute aggregate correct/total runs across all retries + def aggregate_runs(results: list[dict]) -> tuple[int, int]: + correct = 0 + total = 0 + for r in results: + runs = r.get("runs", 0) + triggers = r.get("triggers", 0) + total += runs + if r.get("should_trigger", True): + correct += triggers + else: + correct += runs - triggers + return correct, total + + train_correct, train_runs = aggregate_runs(train_results) + test_correct, test_runs = aggregate_runs(test_results) + + # Determine score classes + def score_class(correct: int, total: int) -> str: + if total > 0: + ratio = correct / total + if ratio >= 0.8: + return "score-good" + elif ratio >= 0.5: + return "score-ok" + return "score-bad" + + train_class = score_class(train_correct, train_runs) + test_class = score_class(test_correct, test_runs) + + row_class = "best-row" if iteration == best_iter else "" + + html_parts.append(f""" + + + + +""") + + # Add result for each train query + for qinfo in train_queries: + r = train_by_query.get(qinfo["query"], {}) + did_pass = r.get("pass", False) + triggers = r.get("triggers", 0) + runs = r.get("runs", 0) + + icon = "✓" if did_pass else "✗" + css_class = "pass" if did_pass else "fail" + + html_parts.append(f' \n') + + # Add result for each test query (with different background) + for qinfo in test_queries: + r = test_by_query.get(qinfo["query"], {}) + did_pass = r.get("pass", False) + triggers = r.get("triggers", 0) + runs = r.get("runs", 0) + + icon = "✓" if did_pass else "✗" + css_class = "pass" if did_pass else "fail" + + html_parts.append(f' \n') + + html_parts.append(" \n") + + html_parts.append(""" +
IterTrainTestDescription{html.escape(qinfo["query"])}{html.escape(qinfo["query"])}
{iteration}{train_correct}/{train_runs}{test_correct}/{test_runs}{html.escape(description)}{icon}{triggers}/{runs}{icon}{triggers}/{runs}
+
+""") + + html_parts.append(""" + + +""") + + return "".join(html_parts) + + +def main(): + parser = argparse.ArgumentParser(description="Generate HTML report from run_loop output") + parser.add_argument("input", help="Path to JSON output from run_loop.py (or - for stdin)") + parser.add_argument("-o", "--output", default=None, help="Output HTML file (default: stdout)") + parser.add_argument("--skill-name", default="", help="Skill name to include in the report title") + args = parser.parse_args() + + if args.input == "-": + data = json.load(sys.stdin) + else: + data = json.loads(Path(args.input).read_text()) + + html_output = generate_html(data, skill_name=args.skill_name) + + if args.output: + Path(args.output).write_text(html_output) + logger.info(f"Report written to {args.output}") + else: + print(html_output) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/improve_description.py b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/improve_description.py new file mode 100644 index 000000000..887a06a08 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/improve_description.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 +"""Improve a skill description based on eval results. + +Takes eval results (from run_eval.py) and generates an improved description +using Claude with extended thinking. +""" + +import argparse +import json +import re +import sys +from pathlib import Path + +import anthropic +from loguru import logger + +from scripts.utils import parse_skill_md + + +def improve_description( + client: anthropic.Anthropic, + skill_name: str, + skill_content: str, + current_description: str, + eval_results: dict, + history: list[dict], + model: str, + test_results: dict | None = None, + log_dir: Path | None = None, + iteration: int | None = None, +) -> str: + """Call Claude to improve the description based on eval results.""" + failed_triggers = [ + r for r in eval_results["results"] + if r["should_trigger"] and not r["pass"] + ] + false_triggers = [ + r for r in eval_results["results"] + if not r["should_trigger"] and not r["pass"] + ] + + # Build scores summary + train_score = f"{eval_results['summary']['passed']}/{eval_results['summary']['total']}" + if test_results: + test_score = f"{test_results['summary']['passed']}/{test_results['summary']['total']}" + scores_summary = f"Train: {train_score}, Test: {test_score}" + else: + scores_summary = f"Train: {train_score}" + + prompt = f"""You are optimizing a skill description for a Claude Code skill called "{skill_name}". A "skill" is sort of like a prompt, but with progressive disclosure -- there's a title and description that Claude sees when deciding whether to use the skill, and then if it does use the skill, it reads the .md file which has lots more details and potentially links to other resources in the skill folder like helper files and scripts and additional documentation or examples. + +The description appears in Claude's "available_skills" list. When a user sends a query, Claude decides whether to invoke the skill based solely on the title and on this description. Your goal is to write a description that triggers for relevant queries, and doesn't trigger for irrelevant ones. 
+ +Here's the current description: + +"{current_description}" + + +Current scores ({scores_summary}): + +""" + if failed_triggers: + prompt += "FAILED TO TRIGGER (should have triggered but didn't):\n" + for r in failed_triggers: + prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n' + prompt += "\n" + + if false_triggers: + prompt += "FALSE TRIGGERS (triggered but shouldn't have):\n" + for r in false_triggers: + prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n' + prompt += "\n" + + if history: + prompt += "PREVIOUS ATTEMPTS (do NOT repeat these — try something structurally different):\n\n" + for h in history: + train_s = f"{h.get('train_passed', h.get('passed', 0))}/{h.get('train_total', h.get('total', 0))}" + test_s = f"{h.get('test_passed', '?')}/{h.get('test_total', '?')}" if h.get('test_passed') is not None else None + score_str = f"train={train_s}" + (f", test={test_s}" if test_s else "") + prompt += f'\n' + prompt += f'Description: "{h["description"]}"\n' + if "results" in h: + prompt += "Train results:\n" + for r in h["results"]: + status = "PASS" if r["pass"] else "FAIL" + prompt += f' [{status}] "{r["query"][:80]}" (triggered {r["triggers"]}/{r["runs"]})\n' + if h.get("note"): + prompt += f'Note: {h["note"]}\n' + prompt += "\n\n" + + prompt += f""" + +Skill content (for context on what the skill does): + +{skill_content} + + +Based on the failures, write a new and improved description that is more likely to trigger correctly. When I say "based on the failures", it's a bit of a tricky line to walk because we don't want to overfit to the specific cases you're seeing. So what I DON'T want you to do is produce an ever-expanding list of specific queries that this skill should or shouldn't trigger for. Instead, try to generalize from the failures to broader categories of user intent and situations where this skill would be useful or not useful. The reason for this is twofold: + +1. Avoid overfitting +2. The list might get loooong and it's injected into ALL queries and there might be a lot of skills, so we don't want to blow too much space on any given description. + +Concretely, your description should not be more than about 100-200 words, even if that comes at the cost of accuracy. + +Here are some tips that we've found to work well in writing these descriptions: +- The skill should be phrased in the imperative -- "Use this skill for" rather than "this skill does" +- The skill description should focus on the user's intent, what they are trying to achieve, vs. the implementation details of how the skill works. +- The description competes with other skills for Claude's attention — make it distinctive and immediately recognizable. +- If you're getting lots of failures after repeated attempts, change things up. Try different sentence structures or wordings. + +I'd encourage you to be creative and mix up the style in different iterations since you'll have multiple opportunities to try different approaches and we'll just grab the highest-scoring one at the end. 
+ +Please respond with only the new description text in tags, nothing else.""" + + response = client.messages.create( + model=model, + max_tokens=16000, + thinking={ + "type": "enabled", + "budget_tokens": 10000, + }, + messages=[{"role": "user", "content": prompt}], + ) + + # Extract thinking and text from response + thinking_text = "" + text = "" + for block in response.content: + if block.type == "thinking": + thinking_text = block.thinking + elif block.type == "text": + text = block.text + + # Parse out the tags + match = re.search(r"(.*?)", text, re.DOTALL) + description = match.group(1).strip().strip('"') if match else text.strip().strip('"') + + # Log the transcript + transcript: dict = { + "iteration": iteration, + "prompt": prompt, + "thinking": thinking_text, + "response": text, + "parsed_description": description, + "char_count": len(description), + "over_limit": len(description) > 1024, + } + + # If over 1024 chars, ask the model to shorten it + if len(description) > 1024: + shorten_prompt = f"Your description is {len(description)} characters, which exceeds the hard 1024 character limit. Please rewrite it to be under 1024 characters while preserving the most important trigger words and intent coverage. Respond with only the new description in tags." + shorten_response = client.messages.create( + model=model, + max_tokens=16000, + thinking={ + "type": "enabled", + "budget_tokens": 10000, + }, + messages=[ + {"role": "user", "content": prompt}, + {"role": "assistant", "content": text}, + {"role": "user", "content": shorten_prompt}, + ], + ) + + shorten_thinking = "" + shorten_text = "" + for block in shorten_response.content: + if block.type == "thinking": + shorten_thinking = block.thinking + elif block.type == "text": + shorten_text = block.text + + match = re.search(r"(.*?)", shorten_text, re.DOTALL) + shortened = match.group(1).strip().strip('"') if match else shorten_text.strip().strip('"') + + transcript["rewrite_prompt"] = shorten_prompt + transcript["rewrite_thinking"] = shorten_thinking + transcript["rewrite_response"] = shorten_text + transcript["rewrite_description"] = shortened + transcript["rewrite_char_count"] = len(shortened) + description = shortened + + transcript["final_description"] = description + + if log_dir: + log_dir.mkdir(parents=True, exist_ok=True) + log_file = log_dir / f"improve_iter_{iteration or 'unknown'}.json" + log_file.write_text(json.dumps(transcript, indent=2)) + + return description + + +def main(): + parser = argparse.ArgumentParser(description="Improve a skill description based on eval results") + parser.add_argument("--eval-results", required=True, help="Path to eval results JSON (from run_eval.py)") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--history", default=None, help="Path to history JSON (previous attempts)") + parser.add_argument("--model", required=True, help="Model for improvement") + parser.add_argument("--verbose", action="store_true", help="Print thinking to stderr") + args = parser.parse_args() + + skill_path = Path(args.skill_path) + if not (skill_path / "SKILL.md").exists(): + logger.error(f"Error: No SKILL.md found at {skill_path}") + sys.exit(1) + + eval_results = json.loads(Path(args.eval_results).read_text()) + history = [] + if args.history: + history = json.loads(Path(args.history).read_text()) + + name, _, content = parse_skill_md(skill_path) + current_description = eval_results["description"] + + if args.verbose: + logger.info(f"Current: 
{current_description}") + logger.info(f"Score: {eval_results['summary']['passed']}/{eval_results['summary']['total']}") + + client = anthropic.Anthropic() + new_description = improve_description( + client=client, + skill_name=name, + skill_content=content, + current_description=current_description, + eval_results=eval_results, + history=history, + model=args.model, + ) + + if args.verbose: + logger.info(f"Improved: {new_description}") + + # Output as JSON with both the new description and updated history + output = { + "description": new_description, + "history": history + [{ + "description": current_description, + "passed": eval_results["summary"]["passed"], + "failed": eval_results["summary"]["failed"], + "total": eval_results["summary"]["total"], + "results": eval_results["results"], + }], + } + print(json.dumps(output, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/package_skill.py b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/package_skill.py new file mode 100644 index 000000000..5dbdf7843 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/package_skill.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +""" +Skill Packager - Creates a distributable .skill file of a skill folder + +Usage: + python utils/package_skill.py [output-directory] + +Example: + python utils/package_skill.py skills/public/my-skill + python utils/package_skill.py skills/public/my-skill ./dist +""" + +import fnmatch +import sys +import zipfile +from pathlib import Path + +from loguru import logger +from scripts.quick_validate import validate_skill + +# Patterns to exclude when packaging skills. +EXCLUDE_DIRS = {"__pycache__", "node_modules"} +EXCLUDE_GLOBS = {"*.pyc"} +EXCLUDE_FILES = {".DS_Store"} +# Directories excluded only at the skill root (not when nested deeper). +ROOT_EXCLUDE_DIRS = {"evals"} + + +def should_exclude(rel_path: Path) -> bool: + """Check if a path should be excluded from packaging.""" + parts = rel_path.parts + if any(part in EXCLUDE_DIRS for part in parts): + return True + # rel_path is relative to skill_path.parent, so parts[0] is the skill + # folder name and parts[1] (if present) is the first subdir. + if len(parts) > 1 and parts[1] in ROOT_EXCLUDE_DIRS: + return True + name = rel_path.name + if name in EXCLUDE_FILES: + return True + return any(fnmatch.fnmatch(name, pat) for pat in EXCLUDE_GLOBS) + + +def package_skill(skill_path, output_dir=None): + """ + Package a skill folder into a .skill file. 
+ + Args: + skill_path: Path to the skill folder + output_dir: Optional output directory for the .skill file (defaults to current directory) + + Returns: + Path to the created .skill file, or None if error + """ + skill_path = Path(skill_path).resolve() + + # Validate skill folder exists + if not skill_path.exists(): + logger.error(f"Skill folder not found: {skill_path}") + return None + + if not skill_path.is_dir(): + logger.error(f"Path is not a directory: {skill_path}") + return None + + # Validate SKILL.md exists + skill_md = skill_path / "SKILL.md" + if not skill_md.exists(): + logger.error(f"SKILL.md not found in {skill_path}") + return None + + # Run validation before packaging + logger.info("Validating skill...") + valid, message = validate_skill(skill_path) + if not valid: + logger.error(f"Validation failed: {message}") + logger.error("Please fix the validation errors before packaging.") + return None + logger.info(f"{message}\n") + + # Determine output location + skill_name = skill_path.name + if output_dir: + output_path = Path(output_dir).resolve() + output_path.mkdir(parents=True, exist_ok=True) + else: + output_path = Path.cwd() + + skill_filename = output_path / f"{skill_name}.skill" + + # Create the .skill file (zip format) + try: + with zipfile.ZipFile(skill_filename, 'w', zipfile.ZIP_DEFLATED) as zipf: + # Walk through the skill directory, excluding build artifacts + for file_path in skill_path.rglob('*'): + if not file_path.is_file(): + continue + arcname = file_path.relative_to(skill_path.parent) + if should_exclude(arcname): + logger.debug(f"Skipped: {arcname}") + continue + zipf.write(file_path, arcname) + logger.debug(f"Added: {arcname}") + + logger.info(f"Successfully packaged skill to: {skill_filename}") + return skill_filename + + except Exception as e: + logger.error(f"Error creating .skill file: {e}") + return None + + +def main(): + if len(sys.argv) < 2: + logger.info("Usage: python utils/package_skill.py [output-directory]") + logger.info("\nExample:") + logger.info(" python utils/package_skill.py skills/public/my-skill") + logger.info(" python utils/package_skill.py skills/public/my-skill ./dist") + sys.exit(1) + + skill_path = sys.argv[1] + output_dir = sys.argv[2] if len(sys.argv) > 2 else None + + logger.info(f"Packaging skill: {skill_path}") + if output_dir: + logger.info(f" Output directory: {output_dir}") + logger.info("") + + result = package_skill(skill_path, output_dir) + + if result: + sys.exit(0) + else: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/quick_validate.py b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/quick_validate.py new file mode 100644 index 000000000..36553161e --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/quick_validate.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +""" +Quick validation script for skills - minimal version +""" + +import sys +import os +import re +import yaml +from pathlib import Path + +from loguru import logger + +def validate_skill(skill_path): + """Basic validation of a skill""" + skill_path = Path(skill_path) + + # Check SKILL.md exists + skill_md = skill_path / 'SKILL.md' + if not skill_md.exists(): + return False, "SKILL.md not found" + + # Read and validate frontmatter + content = skill_md.read_text() + if not content.startswith('---'): + return False, "No YAML frontmatter found" + + # Extract 
frontmatter + match = re.match(r'^---\n(.*?)\n---', content, re.DOTALL) + if not match: + return False, "Invalid frontmatter format" + + frontmatter_text = match.group(1) + + # Parse YAML frontmatter + try: + frontmatter = yaml.safe_load(frontmatter_text) + if not isinstance(frontmatter, dict): + return False, "Frontmatter must be a YAML dictionary" + except yaml.YAMLError as e: + return False, f"Invalid YAML in frontmatter: {e}" + + # Define allowed properties + ALLOWED_PROPERTIES = {'name', 'description', 'license', 'allowed-tools', 'metadata', 'compatibility'} + + # Check for unexpected properties (excluding nested keys under metadata) + unexpected_keys = set(frontmatter.keys()) - ALLOWED_PROPERTIES + if unexpected_keys: + return False, ( + f"Unexpected key(s) in SKILL.md frontmatter: {', '.join(sorted(unexpected_keys))}. " + f"Allowed properties are: {', '.join(sorted(ALLOWED_PROPERTIES))}" + ) + + # Check required fields + if 'name' not in frontmatter: + return False, "Missing 'name' in frontmatter" + if 'description' not in frontmatter: + return False, "Missing 'description' in frontmatter" + + # Extract name for validation + name = frontmatter.get('name', '') + if not isinstance(name, str): + return False, f"Name must be a string, got {type(name).__name__}" + name = name.strip() + if name: + # Check naming convention (kebab-case: lowercase with hyphens) + if not re.match(r'^[a-z0-9-]+$', name): + return False, f"Name '{name}' should be kebab-case (lowercase letters, digits, and hyphens only)" + if name.startswith('-') or name.endswith('-') or '--' in name: + return False, f"Name '{name}' cannot start/end with hyphen or contain consecutive hyphens" + # Check name length (max 64 characters per spec) + if len(name) > 64: + return False, f"Name is too long ({len(name)} characters). Maximum is 64 characters." + + # Extract and validate description + description = frontmatter.get('description', '') + if not isinstance(description, str): + return False, f"Description must be a string, got {type(description).__name__}" + description = description.strip() + if description: + # Check for angle brackets + if '<' in description or '>' in description: + return False, "Description cannot contain angle brackets (< or >)" + # Check description length (max 1024 characters per spec) + if len(description) > 1024: + return False, f"Description is too long ({len(description)} characters). Maximum is 1024 characters." + + # Validate compatibility field if present (optional) + compatibility = frontmatter.get('compatibility', '') + if compatibility: + if not isinstance(compatibility, str): + return False, f"Compatibility must be a string, got {type(compatibility).__name__}" + if len(compatibility) > 500: + return False, f"Compatibility is too long ({len(compatibility)} characters). Maximum is 500 characters." + + return True, "Skill is valid!" 
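+
+# Example usage (illustrative; "skills/public/my-skill" is a placeholder path,
+# not a skill shipped with this repo). package_skill.py consumes validate_skill()
+# the same way: it logs `message` and only packages the folder when `valid` is True.
+#
+#     valid, message = validate_skill("skills/public/my-skill")
+#     if not valid:
+#         raise SystemExit(f"Validation failed: {message}")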
+ +if __name__ == "__main__": + if len(sys.argv) != 2: + logger.info("Usage: python quick_validate.py ") + sys.exit(1) + + valid, message = validate_skill(sys.argv[1]) + if valid: + logger.info(message) + else: + logger.error(message) + sys.exit(0 if valid else 1) \ No newline at end of file diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/run_eval.py b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/run_eval.py new file mode 100644 index 000000000..f923066ca --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/run_eval.py @@ -0,0 +1,312 @@ +#!/usr/bin/env python3 +"""Run trigger evaluation for a skill description. + +Tests whether a skill's description causes Claude to trigger (read the skill) +for a set of queries. Outputs results as JSON. +""" + +import argparse +import json +import os +import select +import subprocess +import sys +import time +import uuid +from concurrent.futures import ProcessPoolExecutor, as_completed +from pathlib import Path + +from loguru import logger + +from scripts.utils import parse_skill_md + + +def find_project_root() -> Path: + """Find the project root by walking up from cwd looking for .claude/. + + Mimics how Claude Code discovers its project root, so the command file + we create ends up where claude -p will look for it. + """ + current = Path.cwd() + for parent in [current, *current.parents]: + if (parent / ".claude").is_dir(): + return parent + return current + + +def run_single_query( + query: str, + skill_name: str, + skill_description: str, + timeout: int, + project_root: str, + model: str | None = None, +) -> bool: + """Run a single query and return whether the skill was triggered. + + Creates a command file in .claude/commands/ so it appears in Claude's + available_skills list, then runs `claude -p` with the raw query. + Uses --include-partial-messages to detect triggering early from + stream events (content_block_start) rather than waiting for the + full assistant message, which only arrives after tool execution. + """ + unique_id = uuid.uuid4().hex[:8] + clean_name = f"{skill_name}-skill-{unique_id}" + project_commands_dir = Path(project_root) / ".claude" / "commands" + command_file = project_commands_dir / f"{clean_name}.md" + + try: + project_commands_dir.mkdir(parents=True, exist_ok=True) + # Use YAML block scalar to avoid breaking on quotes in description + indented_desc = "\n ".join(skill_description.split("\n")) + command_content = ( + f"---\n" + f"description: |\n" + f" {indented_desc}\n" + f"---\n\n" + f"# {skill_name}\n\n" + f"This skill handles: {skill_description}\n" + ) + command_file.write_text(command_content) + + cmd = [ + "claude", + "-p", query, + "--output-format", "stream-json", + "--verbose", + "--include-partial-messages", + ] + if model: + cmd.extend(["--model", model]) + + # Remove CLAUDECODE env var to allow nesting claude -p inside a + # Claude Code session. The guard is for interactive terminal conflicts; + # programmatic subprocess usage is safe. 
+ env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + cwd=project_root, + env=env, + ) + + triggered = False + start_time = time.time() + buffer = "" + # Track state for stream event detection + pending_tool_name = None + accumulated_json = "" + + try: + while time.time() - start_time < timeout: + if process.poll() is not None: + remaining = process.stdout.read() + if remaining: + buffer += remaining.decode("utf-8", errors="replace") + break + + ready, _, _ = select.select([process.stdout], [], [], 1.0) + if not ready: + continue + + chunk = os.read(process.stdout.fileno(), 8192) + if not chunk: + break + buffer += chunk.decode("utf-8", errors="replace") + + while "\n" in buffer: + line, buffer = buffer.split("\n", 1) + line = line.strip() + if not line: + continue + + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + + # Early detection via stream events + if event.get("type") == "stream_event": + se = event.get("event", {}) + se_type = se.get("type", "") + + if se_type == "content_block_start": + cb = se.get("content_block", {}) + if cb.get("type") == "tool_use": + tool_name = cb.get("name", "") + if tool_name in ("Skill", "Read"): + pending_tool_name = tool_name + accumulated_json = "" + else: + return False + + elif se_type == "content_block_delta" and pending_tool_name: + delta = se.get("delta", {}) + if delta.get("type") == "input_json_delta": + accumulated_json += delta.get("partial_json", "") + if clean_name in accumulated_json: + return True + + elif se_type in ("content_block_stop", "message_stop"): + if pending_tool_name: + return clean_name in accumulated_json + if se_type == "message_stop": + return False + + # Fallback: full assistant message + elif event.get("type") == "assistant": + message = event.get("message", {}) + for content_item in message.get("content", []): + if content_item.get("type") != "tool_use": + continue + tool_name = content_item.get("name", "") + tool_input = content_item.get("input", {}) + if tool_name == "Skill" and clean_name in tool_input.get("skill", ""): + triggered = True + elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""): + triggered = True + return triggered + + elif event.get("type") == "result": + return triggered + finally: + # Clean up process on any exit path (return, exception, timeout) + if process.poll() is None: + process.kill() + process.wait() + + return triggered + finally: + if command_file.exists(): + command_file.unlink() + + +def run_eval( + eval_set: list[dict], + skill_name: str, + description: str, + num_workers: int, + timeout: int, + project_root: Path, + runs_per_query: int = 1, + trigger_threshold: float = 0.5, + model: str | None = None, +) -> dict: + """Run the full eval set and return results.""" + results = [] + + with ProcessPoolExecutor(max_workers=num_workers) as executor: + future_to_info = {} + for item in eval_set: + for run_idx in range(runs_per_query): + future = executor.submit( + run_single_query, + item["query"], + skill_name, + description, + timeout, + str(project_root), + model, + ) + future_to_info[future] = (item, run_idx) + + query_triggers: dict[str, list[bool]] = {} + query_items: dict[str, dict] = {} + for future in as_completed(future_to_info): + item, _ = future_to_info[future] + query = item["query"] + query_items[query] = item + if query not in query_triggers: + query_triggers[query] = [] + try: + 
query_triggers[query].append(future.result()) + except Exception as e: + logger.warning(f"Warning: query failed: {e}") + query_triggers[query].append(False) + + for query, triggers in query_triggers.items(): + item = query_items[query] + trigger_rate = sum(triggers) / len(triggers) + should_trigger = item["should_trigger"] + if should_trigger: + did_pass = trigger_rate >= trigger_threshold + else: + did_pass = trigger_rate < trigger_threshold + results.append({ + "query": query, + "should_trigger": should_trigger, + "trigger_rate": trigger_rate, + "triggers": sum(triggers), + "runs": len(triggers), + "pass": did_pass, + }) + + passed = sum(1 for r in results if r["pass"]) + total = len(results) + + return { + "skill_name": skill_name, + "description": description, + "results": results, + "summary": { + "total": total, + "passed": passed, + "failed": total - passed, + }, + } + + +def main(): + parser = argparse.ArgumentParser(description="Run trigger evaluation for a skill description") + parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--description", default=None, help="Override description to test") + parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") + parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds") + parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query") + parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold") + parser.add_argument("--model", default=None, help="Model to use for claude -p (default: user's configured model)") + parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") + args = parser.parse_args() + + eval_set = json.loads(Path(args.eval_set).read_text()) + skill_path = Path(args.skill_path) + + if not (skill_path / "SKILL.md").exists(): + logger.error(f"Error: No SKILL.md found at {skill_path}") + sys.exit(1) + + name, original_description, content = parse_skill_md(skill_path) + description = args.description or original_description + project_root = find_project_root() + + if args.verbose: + logger.info(f"Evaluating: {description}") + + output = run_eval( + eval_set=eval_set, + skill_name=name, + description=description, + num_workers=args.num_workers, + timeout=args.timeout, + project_root=project_root, + runs_per_query=args.runs_per_query, + trigger_threshold=args.trigger_threshold, + model=args.model, + ) + + if args.verbose: + summary = output["summary"] + logger.info(f"Results: {summary['passed']}/{summary['total']} passed") + for r in output["results"]: + status = "PASS" if r["pass"] else "FAIL" + rate_str = f"{r['triggers']}/{r['runs']}" + logger.info(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:70]}") + + print(json.dumps(output, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/run_loop.py b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/run_loop.py new file mode 100644 index 000000000..a2907d6e0 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/run_loop.py @@ -0,0 +1,333 @@ +#!/usr/bin/env python3 +"""Run the eval + improve loop until all pass or max iterations reached. 
+ +Combines run_eval.py and improve_description.py in a loop, tracking history +and returning the best description found. Supports train/test split to prevent +overfitting. +""" + +import argparse +import json +import random +import sys +import tempfile +import time +import webbrowser +from pathlib import Path + +import anthropic +from loguru import logger + +from scripts.generate_report import generate_html +from scripts.improve_description import improve_description +from scripts.run_eval import find_project_root, run_eval +from scripts.utils import parse_skill_md + + +def split_eval_set(eval_set: list[dict], holdout: float, seed: int = 42) -> tuple[list[dict], list[dict]]: + """Split eval set into train and test sets, stratified by should_trigger.""" + random.seed(seed) + + # Separate by should_trigger + trigger = [e for e in eval_set if e["should_trigger"]] + no_trigger = [e for e in eval_set if not e["should_trigger"]] + + # Shuffle each group + random.shuffle(trigger) + random.shuffle(no_trigger) + + # Calculate split points + n_trigger_test = max(1, int(len(trigger) * holdout)) + n_no_trigger_test = max(1, int(len(no_trigger) * holdout)) + + # Split + test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test] + train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:] + + return train_set, test_set + + +def run_loop( + eval_set: list[dict], + skill_path: Path, + description_override: str | None, + num_workers: int, + timeout: int, + max_iterations: int, + runs_per_query: int, + trigger_threshold: float, + holdout: float, + model: str, + verbose: bool, + live_report_path: Path | None = None, + log_dir: Path | None = None, +) -> dict: + """Run the eval + improvement loop.""" + project_root = find_project_root() + name, original_description, content = parse_skill_md(skill_path) + current_description = description_override or original_description + + # Split into train/test if holdout > 0 + if holdout > 0: + train_set, test_set = split_eval_set(eval_set, holdout) + if verbose: + logger.info(f"Split: {len(train_set)} train, {len(test_set)} test (holdout={holdout})") + else: + train_set = eval_set + test_set = [] + + client = anthropic.Anthropic() + history = [] + exit_reason = "unknown" + + for iteration in range(1, max_iterations + 1): + if verbose: + logger.info(f"\n{'='*60}") + logger.info(f"Iteration {iteration}/{max_iterations}") + logger.info(f"Description: {current_description}") + logger.info(f"{'='*60}") + + # Evaluate train + test together in one batch for parallelism + all_queries = train_set + test_set + t0 = time.time() + all_results = run_eval( + eval_set=all_queries, + skill_name=name, + description=current_description, + num_workers=num_workers, + timeout=timeout, + project_root=project_root, + runs_per_query=runs_per_query, + trigger_threshold=trigger_threshold, + model=model, + ) + eval_elapsed = time.time() - t0 + + # Split results back into train/test by matching queries + train_queries_set = {q["query"] for q in train_set} + train_result_list = [r for r in all_results["results"] if r["query"] in train_queries_set] + test_result_list = [r for r in all_results["results"] if r["query"] not in train_queries_set] + + train_passed = sum(1 for r in train_result_list if r["pass"]) + train_total = len(train_result_list) + train_summary = {"passed": train_passed, "failed": train_total - train_passed, "total": train_total} + train_results = {"results": train_result_list, "summary": train_summary} + + if test_set: + test_passed = sum(1 for r in 
test_result_list if r["pass"]) + test_total = len(test_result_list) + test_summary = {"passed": test_passed, "failed": test_total - test_passed, "total": test_total} + test_results = {"results": test_result_list, "summary": test_summary} + else: + test_results = None + test_summary = None + + history.append({ + "iteration": iteration, + "description": current_description, + "train_passed": train_summary["passed"], + "train_failed": train_summary["failed"], + "train_total": train_summary["total"], + "train_results": train_results["results"], + "test_passed": test_summary["passed"] if test_summary else None, + "test_failed": test_summary["failed"] if test_summary else None, + "test_total": test_summary["total"] if test_summary else None, + "test_results": test_results["results"] if test_results else None, + # For backward compat with report generator + "passed": train_summary["passed"], + "failed": train_summary["failed"], + "total": train_summary["total"], + "results": train_results["results"], + }) + + # Write live report if path provided + if live_report_path: + partial_output = { + "original_description": original_description, + "best_description": current_description, + "best_score": "in progress", + "iterations_run": len(history), + "holdout": holdout, + "train_size": len(train_set), + "test_size": len(test_set), + "history": history, + } + live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name)) + + if verbose: + def print_eval_stats(label, results, elapsed): + pos = [r for r in results if r["should_trigger"]] + neg = [r for r in results if not r["should_trigger"]] + tp = sum(r["triggers"] for r in pos) + pos_runs = sum(r["runs"] for r in pos) + fn = pos_runs - tp + fp = sum(r["triggers"] for r in neg) + neg_runs = sum(r["runs"] for r in neg) + tn = neg_runs - fp + total = tp + tn + fp + fn + precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0 + recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0 + accuracy = (tp + tn) / total if total > 0 else 0.0 + logger.info(f"{label}: {tp+tn}/{total} correct, precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)") + for r in results: + status = "PASS" if r["pass"] else "FAIL" + rate_str = f"{r['triggers']}/{r['runs']}" + logger.info(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}") + + print_eval_stats("Train", train_results["results"], eval_elapsed) + if test_summary: + print_eval_stats("Test ", test_results["results"], 0) + + if train_summary["failed"] == 0: + exit_reason = f"all_passed (iteration {iteration})" + if verbose: + logger.info(f"\nAll train queries passed on iteration {iteration}!") + break + + if iteration == max_iterations: + exit_reason = f"max_iterations ({max_iterations})" + if verbose: + logger.info(f"\nMax iterations reached ({max_iterations}).") + break + + # Improve the description based on train results + if verbose: + logger.info(f"\nImproving description...") + + t0 = time.time() + # Strip test scores from history so improvement model can't see them + blinded_history = [ + {k: v for k, v in h.items() if not k.startswith("test_")} + for h in history + ] + new_description = improve_description( + client=client, + skill_name=name, + skill_content=content, + current_description=current_description, + eval_results=train_results, + history=blinded_history, + model=model, + log_dir=log_dir, + iteration=iteration, + ) + improve_elapsed = time.time() - t0 + + if verbose: + logger.info(f"Proposed ({improve_elapsed:.1f}s): 
{new_description}") + + current_description = new_description + + # Find the best iteration by TEST score (or train if no test set) + if test_set: + best = max(history, key=lambda h: h["test_passed"] or 0) + best_score = f"{best['test_passed']}/{best['test_total']}" + else: + best = max(history, key=lambda h: h["train_passed"]) + best_score = f"{best['train_passed']}/{best['train_total']}" + + if verbose: + logger.info(f"\nExit reason: {exit_reason}") + logger.info(f"Best score: {best_score} (iteration {best['iteration']})") + + return { + "exit_reason": exit_reason, + "original_description": original_description, + "best_description": best["description"], + "best_score": best_score, + "best_train_score": f"{best['train_passed']}/{best['train_total']}", + "best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None, + "final_description": current_description, + "iterations_run": len(history), + "holdout": holdout, + "train_size": len(train_set), + "test_size": len(test_set), + "history": history, + } + + +def main(): + parser = argparse.ArgumentParser(description="Run eval + improve loop") + parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--description", default=None, help="Override starting description") + parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") + parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds") + parser.add_argument("--max-iterations", type=int, default=5, help="Max improvement iterations") + parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query") + parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold") + parser.add_argument("--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)") + parser.add_argument("--model", required=True, help="Model for improvement") + parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") + parser.add_argument("--report", default="auto", help="Generate HTML report at this path (default: 'auto' for temp file, 'none' to disable)") + parser.add_argument("--results-dir", default=None, help="Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here") + args = parser.parse_args() + + eval_set = json.loads(Path(args.eval_set).read_text()) + skill_path = Path(args.skill_path) + + if not (skill_path / "SKILL.md").exists(): + logger.error(f"Error: No SKILL.md found at {skill_path}") + sys.exit(1) + + name, _, _ = parse_skill_md(skill_path) + + # Set up live report path + if args.report != "none": + if args.report == "auto": + timestamp = time.strftime("%Y%m%d_%H%M%S") + live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html" + else: + live_report_path = Path(args.report) + # Open the report immediately so the user can watch + live_report_path.write_text("
Starting optimization loop...
") + webbrowser.open(str(live_report_path)) + else: + live_report_path = None + + # Determine output directory (create before run_loop so logs can be written) + if args.results_dir: + timestamp = time.strftime("%Y-%m-%d_%H%M%S") + results_dir = Path(args.results_dir) / timestamp + results_dir.mkdir(parents=True, exist_ok=True) + else: + results_dir = None + + log_dir = results_dir / "logs" if results_dir else None + + output = run_loop( + eval_set=eval_set, + skill_path=skill_path, + description_override=args.description, + num_workers=args.num_workers, + timeout=args.timeout, + max_iterations=args.max_iterations, + runs_per_query=args.runs_per_query, + trigger_threshold=args.trigger_threshold, + holdout=args.holdout, + model=args.model, + verbose=args.verbose, + live_report_path=live_report_path, + log_dir=log_dir, + ) + + # Save JSON output + json_output = json.dumps(output, indent=2) + print(json_output) + if results_dir: + (results_dir / "results.json").write_text(json_output) + + # Write final HTML report (without auto-refresh) + if live_report_path: + live_report_path.write_text(generate_html(output, auto_refresh=False, skill_name=name)) + logger.info(f"\nReport: {live_report_path}") + + if results_dir and live_report_path: + (results_dir / "report.html").write_text(generate_html(output, auto_refresh=False, skill_name=name)) + + if results_dir: + logger.info(f"Results saved to: {results_dir}") + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/utils.py b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/utils.py new file mode 100644 index 000000000..51b6a07dd --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/skills/skill-creator/scripts/utils.py @@ -0,0 +1,47 @@ +"""Shared utilities for skill-creator scripts.""" + +from pathlib import Path + + + +def parse_skill_md(skill_path: Path) -> tuple[str, str, str]: + """Parse a SKILL.md file, returning (name, description, full_content).""" + content = (skill_path / "SKILL.md").read_text() + lines = content.split("\n") + + if lines[0].strip() != "---": + raise ValueError("SKILL.md missing frontmatter (no opening ---)") + + end_idx = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == "---": + end_idx = i + break + + if end_idx is None: + raise ValueError("SKILL.md missing frontmatter (no closing ---)") + + name = "" + description = "" + frontmatter_lines = lines[1:end_idx] + i = 0 + while i < len(frontmatter_lines): + line = frontmatter_lines[i] + if line.startswith("name:"): + name = line[len("name:"):].strip().strip('"').strip("'") + elif line.startswith("description:"): + value = line[len("description:"):].strip() + # Handle YAML multiline indicators (>, |, >-, |-) + if value in (">", "|", ">-", "|-"): + continuation_lines: list[str] = [] + i += 1 + while i < len(frontmatter_lines) and (frontmatter_lines[i].startswith(" ") or frontmatter_lines[i].startswith("\t")): + continuation_lines.append(frontmatter_lines[i].strip()) + i += 1 + description = " ".join(continuation_lines) + continue + else: + description = value.strip('"').strip("'") + i += 1 + + return name, description, content diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/soul.md b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/soul.md new file mode 100644 index 000000000..1554c3463 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/soul.md @@ -0,0 
+1,16 @@ +# Soul — {{agent_name}} + +## Identity +- **名称**: {{agent_name}} +- **角色**: {{role_description}} +- **创建者**: {{creator_name}} +- **创建时间**: {{created_at}} + +## Personality +- 认真负责、注重细节 +- 主动汇报工作进展 +- 遇到不确定的信息会主动确认 + +## Boundaries +- 遵守企业保密制度 +- 敏感操作需经过创建者审批 diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/state.json b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/state.json new file mode 100644 index 000000000..0507e31dd --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/state.json @@ -0,0 +1,13 @@ +{ + "agent_id": "", + "name": "", + "status": "idle", + "current_task": null, + "last_active": null, + "channel_status": {}, + "stats": { + "tasks_completed_today": 0, + "tasks_in_progress": 0, + "督办_pending": 0 + } +} \ No newline at end of file diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/todo.json b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/todo.json new file mode 100644 index 000000000..50ffbb9a9 --- /dev/null +++ b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/todo.json @@ -0,0 +1,3 @@ +{ + "tasks": [] +} diff --git a/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/workspace/archived/.gitkeep b/.clawith/data/agents/6123d1f4-d03b-469a-aacc-ad875f63df4e/workspace/archived/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/HEARTBEAT.md b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/HEARTBEAT.md new file mode 100644 index 000000000..485565cb3 --- /dev/null +++ b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/HEARTBEAT.md @@ -0,0 +1,63 @@ +# HEARTBEAT + +When this file is read during a heartbeat, you are performing a **periodic awareness check**. + +## Phase 1: Review Context & Discover Interest Points + +Review your **recent conversations** and your **role/responsibilities**. +Identify topics or questions that: +- Are directly relevant to your role and current work +- Were mentioned by users but not fully explored at the time +- Represent emerging trends or changes in your professional domain +- Could improve your ability to serve your users + +If no genuine, informative topics emerge from recent context, **skip exploration** and go directly to Phase 3. +Do NOT search for generic or obvious topics just to fill time. Quality over quantity. + +## Phase 2: Targeted Exploration (Conditional) + +Only if you identified genuine interest points in Phase 1: + +1. Use `web_search` to investigate (maximum 5 searches per heartbeat) +2. Keep searches **tightly scoped** to your role and recent work topics +3. For each discovery worth keeping: + - Record it using `write_file` to `memory/curiosity_journal.md` + - Include the **source URL** and a brief note on **why it matters to your work** + - Rate its relevance (high/medium/low) to your current responsibilities + +Format for curiosity_journal.md entries: +``` +### [Date] - [Topic] +- **Finding**: [What you learned] +- **Source**: [URL] +- **Relevance**: [high/medium/low] — [Why it matters to your work] +- **Follow-up**: [Optional: questions this raises for next time] +``` + +## Phase 3: Agent Plaza + +1. Call `plaza_get_new_posts` to check recent activity +2. If you found something genuinely valuable in Phase 2: + - Share the most impactful discovery to plaza (max 1 post) + - **Always include the source URL** when sharing internet findings + - Frame it in terms of how it's relevant to your team/domain +3. 
Comment on relevant existing posts (max 2 comments) + +## Phase 4: Wrap Up + +- If nothing needed attention and no exploration was warranted: reply with `HEARTBEAT_OK` +- Otherwise, briefly summarize what you explored and why + +## Key Principles +- Always ground exploration in YOUR role and YOUR recent work context +- Never search for random unrelated topics out of idle curiosity +- If you don't have a specific angle worth investigating, don't search +- Prefer depth over breadth — one thoroughly explored topic > five surface-level queries +- Generate follow-up questions only when you genuinely want to know more + +## Rules +- ⛔ **NEVER share private information**: user conversations, memory contents, workspace files, task details +- ✅ **Share only public-safe content**: general insights, tips, industry news, web search discoveries with links +- 📝 **Limits per heartbeat**: max 1 post + 2 comments +- 🔍 **Search limits**: max 5 web searches per heartbeat +- 🤐 **If nothing interesting to explore or share**, respond with `HEARTBEAT_OK` diff --git a/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/daily_reports/.gitkeep b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/daily_reports/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/enterprise_info/.gitkeep b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/enterprise_info/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/memory/MEMORY_INDEX.md b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/memory/MEMORY_INDEX.md new file mode 100644 index 000000000..29e3fab13 --- /dev/null +++ b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/memory/MEMORY_INDEX.md @@ -0,0 +1,6 @@ +# Memory Index + +This file serves as an index of all memories for this digital employee. + +## Topics + diff --git a/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/memory/curiosity_journal.md b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/memory/curiosity_journal.md new file mode 100644 index 000000000..c5185fe44 --- /dev/null +++ b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/memory/curiosity_journal.md @@ -0,0 +1,9 @@ +# Curiosity Journal + +This is your exploration log. Record interesting discoveries from your web searches here. + +## Active Questions + + +## Discoveries + diff --git a/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/skills/.gitkeep b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/skills/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/skills/MCP_INSTALLER.md b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/skills/MCP_INSTALLER.md new file mode 100644 index 000000000..9e3bf3c77 --- /dev/null +++ b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/skills/MCP_INSTALLER.md @@ -0,0 +1,87 @@ +# MCP Tool Installer + +## When to Use This Skill +Use this skill when a user wants to add a new tool or integration (e.g., GitHub, Brave Search, Notion, etc.) that isn't currently available but can be imported from the MCP registry or via a direct URL. + +--- + +## Step-by-Step Protocol + +### Step 1 — Search first +``` +discover_resources(query="", max_results=5) +``` +Show the results and let the user pick. Note the `ID` field (e.g. `github`). 
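As a rough sketch only (the result fields below are assumptions rather than the tool's documented schema), the search-then-pick flow could look like this in Python-style pseudocode:

```python
# Illustrative sketch: search the registry, then let the user choose an entry by ID.
# The "id" and "hosted" fields are assumed for illustration; check the real tool output.
results = discover_resources(query="github", max_results=5)
for entry in results:
    print(entry["id"], "remote-hosted" if entry.get("hosted") else "local-only")
# Confirm the chosen ID with the user (e.g. `github`) before moving on to Step 2.
```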
+ +### Step 2 — Determine import method + +**Method A: Smithery Import** (tool found on Smithery with remote hosting support 🌐) +- Requires Smithery API Key (one-time per agent) +- Individual tool tokens NOT needed — Smithery handles auth via OAuth + +**Method B: Direct URL Import** (tool NOT on Smithery, but has public HTTP/SSE endpoint) +- User provides the MCP server URL directly +- May require tool-specific API key + +**Not importable** (💻 local-only tools) +- Requires local Docker/process — inform user these cannot be imported automatically + +--- + +### Method A: Smithery Import + +#### Check Smithery API Key +If no Smithery key is configured, explain Smithery and guide the user. Use the following talking points (adapt to context, don't read verbatim): + +> **Smithery** (smithery.ai) 是一个 MCP 工具市场,类似于"应用商店"。通过它,我可以帮你一键安装各种第三方工具(如 GitHub、Notion、Slack 等),并自动完成认证。 +> +> **为什么需要注册?** +> Smithery 用 API Key 来识别你的身份,这样安装的工具会关联到你的账号,认证信息也会安全保存。 +> +> **注册一次后有什么好处?** +> - 🔑 只需提供一次 Key,后续安装其他工具时我会自动帮你配置 +> - 🔐 不需要为每个工具单独创建 Token(如 GitHub PAT),OAuth 一键授权 +> - 📦 支持上千种 MCP 工具,随时可以扩展你的能力 +> +> **获取步骤:** +> 1. 访问 https://smithery.ai 注册/登录 +> 2. 前往 https://smithery.ai/account/api-keys 创建 API Key +> 3. 将 Key 提供给我 + +#### Import +``` +import_mcp_server( + server_id="", + config={"smithery_api_key": ""} # first time only +) +``` + +#### Handle OAuth +Some tools return an OAuth authorization URL. Tell the user to visit the link. + +**Important:** Do NOT ask for individual tool tokens (GitHub PAT, Notion API key, etc.) when using Smithery — OAuth handles this automatically. + +--- + +### Method B: Direct URL Import + +When a tool is not available on Smithery but the user has a public MCP endpoint: +``` +import_mcp_server( + server_id="", + config={ + "mcp_url": "https://my-mcp-server.com/sse", + "api_key": "" + } +) +``` +The system will connect to the URL, discover available tools, and register them. + +--- + +## What NOT to Do +- ❌ Don't ask for GitHub PAT, Notion key etc. 
when using Smithery — OAuth handles these +- ❌ Don't tell users to go to Settings — handle everything in chat +- ❌ Don't echo API keys back in your response +- ❌ Don't skip the search step — always verify the server exists before importing +- ❌ Don't import local-only tools — inform users they require local installation diff --git a/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/soul.md b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/soul.md new file mode 100644 index 000000000..1554c3463 --- /dev/null +++ b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/soul.md @@ -0,0 +1,16 @@ +# Soul — {{agent_name}} + +## Identity +- **名称**: {{agent_name}} +- **角色**: {{role_description}} +- **创建者**: {{creator_name}} +- **创建时间**: {{created_at}} + +## Personality +- 认真负责、注重细节 +- 主动汇报工作进展 +- 遇到不确定的信息会主动确认 + +## Boundaries +- 遵守企业保密制度 +- 敏感操作需经过创建者审批 diff --git a/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/state.json b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/state.json new file mode 100644 index 000000000..0507e31dd --- /dev/null +++ b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/state.json @@ -0,0 +1,13 @@ +{ + "agent_id": "", + "name": "", + "status": "idle", + "current_task": null, + "last_active": null, + "channel_status": {}, + "stats": { + "tasks_completed_today": 0, + "tasks_in_progress": 0, + "督办_pending": 0 + } +} \ No newline at end of file diff --git a/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/todo.json b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/todo.json new file mode 100644 index 000000000..50ffbb9a9 --- /dev/null +++ b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/todo.json @@ -0,0 +1,3 @@ +{ + "tasks": [] +} diff --git a/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/workspace/archived/.gitkeep b/.clawith/data/agents/6748f50f-50a8-43c4-bc7f-a2a43b7ccd32/workspace/archived/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/HEARTBEAT.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/HEARTBEAT.md new file mode 100644 index 000000000..485565cb3 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/HEARTBEAT.md @@ -0,0 +1,63 @@ +# HEARTBEAT + +When this file is read during a heartbeat, you are performing a **periodic awareness check**. + +## Phase 1: Review Context & Discover Interest Points + +Review your **recent conversations** and your **role/responsibilities**. +Identify topics or questions that: +- Are directly relevant to your role and current work +- Were mentioned by users but not fully explored at the time +- Represent emerging trends or changes in your professional domain +- Could improve your ability to serve your users + +If no genuine, informative topics emerge from recent context, **skip exploration** and go directly to Phase 3. +Do NOT search for generic or obvious topics just to fill time. Quality over quantity. + +## Phase 2: Targeted Exploration (Conditional) + +Only if you identified genuine interest points in Phase 1: + +1. Use `web_search` to investigate (maximum 5 searches per heartbeat) +2. Keep searches **tightly scoped** to your role and recent work topics +3. 
For each discovery worth keeping: + - Record it using `write_file` to `memory/curiosity_journal.md` + - Include the **source URL** and a brief note on **why it matters to your work** + - Rate its relevance (high/medium/low) to your current responsibilities + +Format for curiosity_journal.md entries: +``` +### [Date] - [Topic] +- **Finding**: [What you learned] +- **Source**: [URL] +- **Relevance**: [high/medium/low] — [Why it matters to your work] +- **Follow-up**: [Optional: questions this raises for next time] +``` + +## Phase 3: Agent Plaza + +1. Call `plaza_get_new_posts` to check recent activity +2. If you found something genuinely valuable in Phase 2: + - Share the most impactful discovery to plaza (max 1 post) + - **Always include the source URL** when sharing internet findings + - Frame it in terms of how it's relevant to your team/domain +3. Comment on relevant existing posts (max 2 comments) + +## Phase 4: Wrap Up + +- If nothing needed attention and no exploration was warranted: reply with `HEARTBEAT_OK` +- Otherwise, briefly summarize what you explored and why + +## Key Principles +- Always ground exploration in YOUR role and YOUR recent work context +- Never search for random unrelated topics out of idle curiosity +- If you don't have a specific angle worth investigating, don't search +- Prefer depth over breadth — one thoroughly explored topic > five surface-level queries +- Generate follow-up questions only when you genuinely want to know more + +## Rules +- ⛔ **NEVER share private information**: user conversations, memory contents, workspace files, task details +- ✅ **Share only public-safe content**: general insights, tips, industry news, web search discoveries with links +- 📝 **Limits per heartbeat**: max 1 post + 2 comments +- 🔍 **Search limits**: max 5 web searches per heartbeat +- 🤐 **If nothing interesting to explore or share**, respond with `HEARTBEAT_OK` diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/daily_reports/.gitkeep b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/daily_reports/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/enterprise_info/.gitkeep b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/enterprise_info/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/memory/MEMORY_INDEX.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/memory/MEMORY_INDEX.md new file mode 100644 index 000000000..29e3fab13 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/memory/MEMORY_INDEX.md @@ -0,0 +1,6 @@ +# Memory Index + +This file serves as an index of all memories for this digital employee. + +## Topics + diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/memory/curiosity_journal.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/memory/curiosity_journal.md new file mode 100644 index 000000000..c5185fe44 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/memory/curiosity_journal.md @@ -0,0 +1,9 @@ +# Curiosity Journal + +This is your exploration log. Record interesting discoveries from your web searches here. 
+ +## Active Questions + + +## Discoveries + diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/memory/memory.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/memory/memory.md new file mode 100644 index 000000000..a09922cb2 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/memory/memory.md @@ -0,0 +1,6 @@ +# Memory + +## OKR System State +- Last report generated: (none) +- Last progress collection: (none) +- Team members tracked: (pending) diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/relationships.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/relationships.md new file mode 100644 index 000000000..17cf2772b --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/relationships.md @@ -0,0 +1,5 @@ +# Relationships + +## Team Members (OKR tracking) + +_Team members will be added here as they are onboarded into the OKR system._ diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/.gitkeep b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/MCP_INSTALLER.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/MCP_INSTALLER.md new file mode 100644 index 000000000..9e3bf3c77 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/MCP_INSTALLER.md @@ -0,0 +1,87 @@ +# MCP Tool Installer + +## When to Use This Skill +Use this skill when a user wants to add a new tool or integration (e.g., GitHub, Brave Search, Notion, etc.) that isn't currently available but can be imported from the MCP registry or via a direct URL. + +--- + +## Step-by-Step Protocol + +### Step 1 — Search first +``` +discover_resources(query="", max_results=5) +``` +Show the results and let the user pick. Note the `ID` field (e.g. `github`). + +### Step 2 — Determine import method + +**Method A: Smithery Import** (tool found on Smithery with remote hosting support 🌐) +- Requires Smithery API Key (one-time per agent) +- Individual tool tokens NOT needed — Smithery handles auth via OAuth + +**Method B: Direct URL Import** (tool NOT on Smithery, but has public HTTP/SSE endpoint) +- User provides the MCP server URL directly +- May require tool-specific API key + +**Not importable** (💻 local-only tools) +- Requires local Docker/process — inform user these cannot be imported automatically + +--- + +### Method A: Smithery Import + +#### Check Smithery API Key +If no Smithery key is configured, explain Smithery and guide the user. Use the following talking points (adapt to context, don't read verbatim): + +> **Smithery** (smithery.ai) 是一个 MCP 工具市场,类似于"应用商店"。通过它,我可以帮你一键安装各种第三方工具(如 GitHub、Notion、Slack 等),并自动完成认证。 +> +> **为什么需要注册?** +> Smithery 用 API Key 来识别你的身份,这样安装的工具会关联到你的账号,认证信息也会安全保存。 +> +> **注册一次后有什么好处?** +> - 🔑 只需提供一次 Key,后续安装其他工具时我会自动帮你配置 +> - 🔐 不需要为每个工具单独创建 Token(如 GitHub PAT),OAuth 一键授权 +> - 📦 支持上千种 MCP 工具,随时可以扩展你的能力 +> +> **获取步骤:** +> 1. 访问 https://smithery.ai 注册/登录 +> 2. 前往 https://smithery.ai/account/api-keys 创建 API Key +> 3. 将 Key 提供给我 + +#### Import +``` +import_mcp_server( + server_id="", + config={"smithery_api_key": ""} # first time only +) +``` + +#### Handle OAuth +Some tools return an OAuth authorization URL. Tell the user to visit the link. + +**Important:** Do NOT ask for individual tool tokens (GitHub PAT, Notion API key, etc.) when using Smithery — OAuth handles this automatically. 
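A minimal sketch of the Method A flow, assuming the call returns a dict and exposes an OAuth URL when one is required (the `oauth_url` field name is an assumption, not a documented contract):

```python
# Hypothetical sketch of a first-time Smithery import followed by the OAuth hand-off.
resp = import_mcp_server(
    server_id="github",
    config={"smithery_api_key": "<key provided by the user>"},  # first time only; never echo it back
)
oauth_url = resp.get("oauth_url") if isinstance(resp, dict) else None
if oauth_url:
    # Ask the user to open the link and authorize; no GitHub PAT or other per-tool token is needed.
    print(f"Please authorize in your browser: {oauth_url}")
```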
+ +--- + +### Method B: Direct URL Import + +When a tool is not available on Smithery but the user has a public MCP endpoint: +``` +import_mcp_server( + server_id="", + config={ + "mcp_url": "https://my-mcp-server.com/sse", + "api_key": "" + } +) +``` +The system will connect to the URL, discover available tools, and register them. + +--- + +## What NOT to Do +- ❌ Don't ask for GitHub PAT, Notion key etc. when using Smithery — OAuth handles these +- ❌ Don't tell users to go to Settings — handle everything in chat +- ❌ Don't echo API keys back in your response +- ❌ Don't skip the search step — always verify the server exists before importing +- ❌ Don't import local-only tools — inform users they require local installation diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/complex-task-executor/SKILL.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/complex-task-executor/SKILL.md new file mode 100644 index 000000000..db71c3ed8 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/complex-task-executor/SKILL.md @@ -0,0 +1,146 @@ +--- +name: Complex Task Executor +description: Structured methodology for decomposing, planning, and executing complex multi-step tasks with progress tracking +--- + +# Complex Task Executor + +## When to Use This Skill + +Use this skill when a task meets ANY of the following criteria: +- Requires more than 3 distinct steps to complete +- Involves multiple tools or information sources +- Has dependencies between steps (step B needs output from step A) +- Requires research before execution +- Could benefit from a documented plan others can review +- The user explicitly asks for a thorough or systematic approach + +**DO NOT use this for simple tasks** like answering a question, reading a single file, or performing one tool call. + +## Workflow + +### Phase 1: Task Analysis (THINK before acting) + +Before creating any files, analyze the task: + +1. **Understand the goal**: What is the final deliverable? What does "done" look like? +2. **Assess complexity**: How many steps? What tools are needed? +3. **Identify dependencies**: Which steps depend on others? +4. **Identify risks**: What could go wrong? What information is missing? +5. **Estimate scope**: Is the task feasible with available tools/skills? + +### Phase 2: Create Task Plan + +Create a task folder and plan file in the workspace: + +``` +workspace//plan.md +``` + +The plan.md MUST follow this exact format: + +```markdown +# Task: + +## Objective + + +## Steps + +- [ ] 1. + - Details: + - Output: +- [ ] 2. + - Details: <...> + - Depends on: Step 1 +- [ ] 3. + - Details: <...> + +## Status +- Created: +- Current Step: Not started +- Progress: 0/ + +## Notes + +``` + +Rules for writing the plan: +- Each step should be completable in 1-3 tool calls +- Use verb-noun format: "Research competitors", "Draft report", "Validate data" +- Mark dependencies explicitly +- Include expected outputs for each step + +### Phase 3: Execute Step-by-Step + +For EACH step in the plan: + +1. **Read the plan** — Call `read_file` on `workspace//plan.md` to check current state +2. **Mark as in-progress** — Update the checkbox from `[ ]` to `[/]` and update the "Current Step" field +3. **Execute the step** — Do the actual work (tool calls, analysis, writing) +4. **Record output** — Save results to `workspace//` (e.g., intermediate files, data) +5. **Mark as complete** — Update the checkbox from `[/]` to `[x]` and update "Progress" counter +6. 
**Proceed to next step** — Move to the next uncompleted step + +### Phase 4: Completion + +When all steps are done: +1. Update plan.md status to "✅ Completed" +2. Create a `workspace//summary.md` with: + - What was accomplished + - Key results and deliverables + - Any follow-up items +3. Present the final result to the user + +## Adaptive Replanning + +If during execution you discover: +- A step is impossible → Mark it `[!]` with a reason, add alternative steps +- New steps are needed → Add them to the plan with `[+]` prefix +- A step produced unexpected results → Add a note and adjust subsequent steps +- The plan needs major changes → Create a new section "## Revised Plan" and follow it + +Always update plan.md BEFORE changing course, so the plan stays the source of truth. + +## Error Handling + +- If a tool call fails, retry once. If it fails again, mark the step as blocked and note the error. +- Never silently skip a step. Always update the plan to reflect what happened. +- If you're stuck, tell the user what's blocking and ask for guidance. + +## Example Scenarios + +### Example 1: "Research our top 3 competitors and write a comparison report" + +Plan would be: +``` +- [ ] 1. Identify the user's company/product context +- [ ] 2. Research Competitor A — website, pricing, features +- [ ] 3. Research Competitor B — website, pricing, features +- [ ] 4. Research Competitor C — website, pricing, features +- [ ] 5. Create comparison matrix +- [ ] 6. Write analysis and recommendations +- [ ] 7. Compile final report +``` + +### Example 2: "Analyze our Q4 sales data and prepare a board presentation" + +Plan would be: +``` +- [ ] 1. Read and understand the sales data files +- [ ] 2. Calculate key metrics (revenue, growth, trends) +- [ ] 3. Identify top insights and anomalies +- [ ] 4. Create data summary tables +- [ ] 5. Draft presentation outline +- [ ] 6. Write each presentation section +- [ ] 7. Add executive summary +- [ ] 8. Review and polish final document +``` + +## Key Principles + +1. **Plan is the source of truth** — Always update it before moving on +2. **One step at a time** — Don't skip ahead or batch too many steps +3. **Show your work** — Save intermediate results to the task folder +4. **Communicate progress** — The user can read plan.md at any time to see status +5. **Be adaptive** — Plans change; that's OK if you update the plan first diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/complex-task-executor/examples/plan_template.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/complex-task-executor/examples/plan_template.md new file mode 100644 index 000000000..dfd60e7cb --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/complex-task-executor/examples/plan_template.md @@ -0,0 +1,23 @@ +# Task: [Title] + +## Objective +[One-sentence description of the desired outcome] + +## Steps + +- [ ] 1. [First step] + - Details: [What specifically to do] + - Output: [What this step produces] +- [ ] 2. [Second step] + - Details: [...] + - Depends on: Step 1 +- [ ] 3. [Third step] + - Details: [...] 
+ +## Status +- Created: [timestamp] +- Current Step: Not started +- Progress: 0/3 + +## Notes +- [Any assumptions, risks, or open questions] diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/mcp-installer/SKILL.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/mcp-installer/SKILL.md new file mode 100644 index 000000000..9e3bf3c77 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/mcp-installer/SKILL.md @@ -0,0 +1,87 @@ +# MCP Tool Installer + +## When to Use This Skill +Use this skill when a user wants to add a new tool or integration (e.g., GitHub, Brave Search, Notion, etc.) that isn't currently available but can be imported from the MCP registry or via a direct URL. + +--- + +## Step-by-Step Protocol + +### Step 1 — Search first +``` +discover_resources(query="", max_results=5) +``` +Show the results and let the user pick. Note the `ID` field (e.g. `github`). + +### Step 2 — Determine import method + +**Method A: Smithery Import** (tool found on Smithery with remote hosting support 🌐) +- Requires Smithery API Key (one-time per agent) +- Individual tool tokens NOT needed — Smithery handles auth via OAuth + +**Method B: Direct URL Import** (tool NOT on Smithery, but has public HTTP/SSE endpoint) +- User provides the MCP server URL directly +- May require tool-specific API key + +**Not importable** (💻 local-only tools) +- Requires local Docker/process — inform user these cannot be imported automatically + +--- + +### Method A: Smithery Import + +#### Check Smithery API Key +If no Smithery key is configured, explain Smithery and guide the user. Use the following talking points (adapt to context, don't read verbatim): + +> **Smithery** (smithery.ai) 是一个 MCP 工具市场,类似于"应用商店"。通过它,我可以帮你一键安装各种第三方工具(如 GitHub、Notion、Slack 等),并自动完成认证。 +> +> **为什么需要注册?** +> Smithery 用 API Key 来识别你的身份,这样安装的工具会关联到你的账号,认证信息也会安全保存。 +> +> **注册一次后有什么好处?** +> - 🔑 只需提供一次 Key,后续安装其他工具时我会自动帮你配置 +> - 🔐 不需要为每个工具单独创建 Token(如 GitHub PAT),OAuth 一键授权 +> - 📦 支持上千种 MCP 工具,随时可以扩展你的能力 +> +> **获取步骤:** +> 1. 访问 https://smithery.ai 注册/登录 +> 2. 前往 https://smithery.ai/account/api-keys 创建 API Key +> 3. 将 Key 提供给我 + +#### Import +``` +import_mcp_server( + server_id="", + config={"smithery_api_key": ""} # first time only +) +``` + +#### Handle OAuth +Some tools return an OAuth authorization URL. Tell the user to visit the link. + +**Important:** Do NOT ask for individual tool tokens (GitHub PAT, Notion API key, etc.) when using Smithery — OAuth handles this automatically. + +--- + +### Method B: Direct URL Import + +When a tool is not available on Smithery but the user has a public MCP endpoint: +``` +import_mcp_server( + server_id="", + config={ + "mcp_url": "https://my-mcp-server.com/sse", + "api_key": "" + } +) +``` +The system will connect to the URL, discover available tools, and register them. + +--- + +## What NOT to Do +- ❌ Don't ask for GitHub PAT, Notion key etc. 
when using Smithery — OAuth handles these +- ❌ Don't tell users to go to Settings — handle everything in chat +- ❌ Don't echo API keys back in your response +- ❌ Don't skip the search step — always verify the server exists before importing +- ❌ Don't import local-only tools — inform users they require local installation diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/SKILL.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/SKILL.md new file mode 100644 index 000000000..ce0d06f3e --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/SKILL.md @@ -0,0 +1,152 @@ +--- +name: skill-creator +description: Create new skills, modify and improve existing skills, and measure skill performance. Use when users want to create a skill from scratch, update or optimize an existing skill, run evals to test a skill, benchmark skill performance with variance analysis, or optimize a skill's description for better triggering accuracy. +--- + +# Skill Creator + +A skill for creating new skills and iteratively improving them. + +At a high level, the process of creating a skill goes like this: + +- Decide what you want the skill to do and roughly how it should do it +- Write a draft of the skill +- Create a few test prompts and run claude-with-access-to-the-skill on them +- Help the user evaluate the results both qualitatively and quantitatively +- Rewrite the skill based on feedback from the user's evaluation +- Repeat until you're satisfied +- Expand the test set and try again at larger scale + +Your job when using this skill is to figure out where the user is in this process and then jump in and help them progress through these stages. + +## Communicating with the user + +Pay attention to context cues to understand how to phrase your communication. Briefly explain terms if you're in doubt, and feel free to clarify terms with a short definition if you're unsure if the user will get it. + +--- + +## Creating a skill + +### Capture Intent +Start by understanding the user's intent. + +1. What should this skill enable the agent to do? +2. When should this skill trigger? (what user phrases/contexts) +3. What's the expected output format? +4. Should we set up test cases to verify the skill works? + +### Interview and Research +Proactively ask questions about edge cases, input/output formats, example files, success criteria, and dependencies. Wait to write test prompts until you've got this part ironed out. + +### Write the SKILL.md +Based on the user interview, fill in these components: + +- **name**: Skill identifier +- **description**: When to trigger, what it does. This is the primary triggering mechanism - include both what the skill does AND specific contexts for when to use it. +- **the rest of the skill** + +### Skill Writing Guide + +#### Anatomy of a Skill + +``` +skill-name/ +\u251c\u2500\u2500 SKILL.md (required) +\u2502 \u251c\u2500\u2500 YAML frontmatter (name, description required) +\u2502 \u2514\u2500\u2500 Markdown instructions +\u2514\u2500\u2500 Bundled Resources (optional) + \u251c\u2500\u2500 scripts/ - Executable code for deterministic/repetitive tasks + \u251c\u2500\u2500 references/ - Docs loaded into context as needed + \u2514\u2500\u2500 assets/ - Files used in output (templates, icons, fonts) +``` + +#### Progressive Disclosure + +Skills use a three-level loading system: +1. **Metadata** (name + description) - Always in context (~100 words) +2. 
**SKILL.md body** - In context whenever skill triggers (<500 lines ideal) +3. **Bundled resources** - As needed (unlimited, scripts can execute without loading) + +**Key patterns:** +- Keep SKILL.md under 500 lines; if approaching this limit, add hierarchy with clear pointers +- Reference files clearly from SKILL.md with guidance on when to read them +- For large reference files (>300 lines), include a table of contents + +#### Writing Patterns + +Prefer using the imperative form in instructions. + +### Writing Style +Explain to the model why things are important. Use theory of mind and try to make the skill general. Start by writing a draft and then look at it with fresh eyes and improve it. + +### Test Cases +After writing the skill draft, come up with 2-3 realistic test prompts. Share them with the user. Save test cases to `evals/evals.json`. + +--- + +## Running and evaluating test cases + +This section is one continuous sequence. + +### Step 1: Run test cases +For each test case, run the agent with the skill applied, and optionally a baseline run without the skill for comparison. + +### Step 2: Draft assertions +While runs are in progress, draft quantitative assertions for each test case. Good assertions are objectively verifiable and have descriptive names. + +### Step 3: Capture timing data +When each run completes, save timing data (tokens, duration) to `timing.json`. + +### Step 4: Grade, aggregate, and launch the viewer +Once all runs are done: +1. Grade each run against assertions — see `agents/grader.md` +2. Aggregate results: `python -m scripts.aggregate_benchmark /iteration-N --skill-name ` +3. Launch the viewer: `python eval-viewer/generate_review.py /iteration-N --skill-name "my-skill" --benchmark /iteration-N/benchmark.json` +4. Present results to the user for review + +### Step 5: Read the feedback +Read user feedback from `feedback.json`. Empty feedback means the user thought it was fine. + +--- + +## Improving the skill + +### How to think about improvements +1. **Generalize from the feedback.** Don't overfit to specific examples. +2. **Keep the prompt lean.** Remove things that aren't pulling their weight. +3. **Explain the why.** Today's LLMs are smart. Explain reasoning rather than rigid MUSTs. +4. **Look for repeated work across test cases.** Bundle common scripts in `scripts/`. + +### The iteration loop +1. Apply improvements to the skill +2. Rerun all test cases into a new iteration directory +3. Present results for review +4. Wait for user to review +5. Read feedback, improve again, repeat + +--- + +## Advanced: Blind comparison +For rigorous comparison between two versions. Read `agents/comparator.md` and `agents/analyzer.md`. + +## Description Optimization +Optimize the description for better triggering accuracy. Use `scripts/run_loop.py`. + +--- + +## Reference files + +- `agents/grader.md` — How to evaluate assertions against outputs +- `agents/comparator.md` — How to do blind A/B comparison between two outputs +- `agents/analyzer.md` — How to analyze why one version beat another +- `references/schemas.md` — JSON structures for evals.json, grading.json, etc. 
+- `assets/eval_review.html` — HTML template for eval review +- `eval-viewer/generate_review.py` — Script to generate the review viewer +- `scripts/aggregate_benchmark.py` — Aggregate benchmark results +- `scripts/generate_report.py` — Generate optimization report +- `scripts/improve_description.py` — Improve skill description +- `scripts/package_skill.py` — Package skill for distribution +- `scripts/quick_validate.py` — Quick validation +- `scripts/run_eval.py` — Run triggering evaluation +- `scripts/run_loop.py` — Run optimization loop +- `scripts/utils.py` — Shared utilities diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/agents/analyzer.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/agents/analyzer.md new file mode 100644 index 000000000..14e41d606 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/agents/analyzer.md @@ -0,0 +1,274 @@ +# Post-hoc Analyzer Agent + +Analyze blind comparison results to understand WHY the winner won and generate improvement suggestions. + +## Role + +After the blind comparator determines a winner, the Post-hoc Analyzer "unblids" the results by examining the skills and transcripts. The goal is to extract actionable insights: what made the winner better, and how can the loser be improved? + +## Inputs + +You receive these parameters in your prompt: + +- **winner**: "A" or "B" (from blind comparison) +- **winner_skill_path**: Path to the skill that produced the winning output +- **winner_transcript_path**: Path to the execution transcript for the winner +- **loser_skill_path**: Path to the skill that produced the losing output +- **loser_transcript_path**: Path to the execution transcript for the loser +- **comparison_result_path**: Path to the blind comparator's output JSON +- **output_path**: Where to save the analysis results + +## Process + +### Step 1: Read Comparison Result + +1. Read the blind comparator's output at comparison_result_path +2. Note the winning side (A or B), the reasoning, and any scores +3. Understand what the comparator valued in the winning output + +### Step 2: Read Both Skills + +1. Read the winner skill's SKILL.md and key referenced files +2. Read the loser skill's SKILL.md and key referenced files +3. Identify structural differences: + - Instructions clarity and specificity + - Script/tool usage patterns + - Example coverage + - Edge case handling + +### Step 3: Read Both Transcripts + +1. Read the winner's transcript +2. Read the loser's transcript +3. Compare execution patterns: + - How closely did each follow their skill's instructions? + - What tools were used differently? + - Where did the loser diverge from optimal behavior? + - Did either encounter errors or make recovery attempts? + +### Step 4: Analyze Instruction Following + +For each transcript, evaluate: +- Did the agent follow the skill's explicit instructions? +- Did the agent use the skill's provided tools/scripts? +- Were there missed opportunities to leverage skill content? +- Did the agent add unnecessary steps not in the skill? + +Score instruction following 1-10 and note specific issues. + +### Step 5: Identify Winner Strengths + +Determine what made the winner better: +- Clearer instructions that led to better behavior? +- Better scripts/tools that produced better output? +- More comprehensive examples that guided edge cases? +- Better error handling guidance? + +Be specific. Quote from skills/transcripts where relevant. 
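One lightweight way to ground Steps 2 and 5 in specifics is to diff the two skills' section headings before reading them in full. This is a hypothetical aid, not a script bundled with this skill:

```python
# Hypothetical helper: compare section headings of the two SKILL.md files to spot structural gaps.
import difflib
from pathlib import Path

winner_skill_path = "skills/pdf-filler-v2"  # placeholders; use the paths given in your prompt
loser_skill_path = "skills/pdf-filler-v1"

def headings(skill_dir: str) -> list[str]:
    text = (Path(skill_dir) / "SKILL.md").read_text()
    return [line for line in text.splitlines() if line.lstrip().startswith("#")]

for line in difflib.unified_diff(
    headings(loser_skill_path), headings(winner_skill_path),
    fromfile="loser", tofile="winner", lineterm="",
):
    print(line)
```

Headings alone will not explain the outcome, but they point to the sections worth quoting when you list strengths and weaknesses.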
+ +### Step 6: Identify Loser Weaknesses + +Determine what held the loser back: +- Ambiguous instructions that led to suboptimal choices? +- Missing tools/scripts that forced workarounds? +- Gaps in edge case coverage? +- Poor error handling that caused failures? + +### Step 7: Generate Improvement Suggestions + +Based on the analysis, produce actionable suggestions for improving the loser skill: +- Specific instruction changes to make +- Tools/scripts to add or modify +- Examples to include +- Edge cases to address + +Prioritize by impact. Focus on changes that would have changed the outcome. + +### Step 8: Write Analysis Results + +Save structured analysis to `{output_path}`. + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "comparison_summary": { + "winner": "A", + "winner_skill": "path/to/winner/skill", + "loser_skill": "path/to/loser/skill", + "comparator_reasoning": "Brief summary of why comparator chose winner" + }, + "winner_strengths": [ + "Clear step-by-step instructions for handling multi-page documents", + "Included validation script that caught formatting errors", + "Explicit guidance on fallback behavior when OCR fails" + ], + "loser_weaknesses": [ + "Vague instruction 'process the document appropriately' led to inconsistent behavior", + "No script for validation, agent had to improvise and made errors", + "No guidance on OCR failure, agent gave up instead of trying alternatives" + ], + "instruction_following": { + "winner": { + "score": 9, + "issues": [ + "Minor: skipped optional logging step" + ] + }, + "loser": { + "score": 6, + "issues": [ + "Did not use the skill's formatting template", + "Invented own approach instead of following step 3", + "Missed the 'always validate output' instruction" + ] + } + }, + "improvement_suggestions": [ + { + "priority": "high", + "category": "instructions", + "suggestion": "Replace 'process the document appropriately' with explicit steps: 1) Extract text, 2) Identify sections, 3) Format per template", + "expected_impact": "Would eliminate ambiguity that caused inconsistent behavior" + }, + { + "priority": "high", + "category": "tools", + "suggestion": "Add validate_output.py script similar to winner skill's validation approach", + "expected_impact": "Would catch formatting errors before final output" + }, + { + "priority": "medium", + "category": "error_handling", + "suggestion": "Add fallback instructions: 'If OCR fails, try: 1) different resolution, 2) image preprocessing, 3) manual extraction'", + "expected_impact": "Would prevent early failure on difficult documents" + } + ], + "transcript_insights": { + "winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script -> Fixed 2 issues -> Produced output", + "loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods -> No validation -> Output had errors" + } +} +``` + +## Guidelines + +- **Be specific**: Quote from skills and transcripts, don't just say "instructions were unclear" +- **Be actionable**: Suggestions should be concrete changes, not vague advice +- **Focus on skill improvements**: The goal is to improve the losing skill, not critique the agent +- **Prioritize by impact**: Which changes would most likely have changed the outcome? +- **Consider causation**: Did the skill weakness actually cause the worse output, or is it incidental? +- **Stay objective**: Analyze what happened, don't editorialize +- **Think about generalization**: Would this improvement help on other evals too? 
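If you want to sanity-check the analysis file before finishing, a small check like the following is enough. It is a sketch that assumes the JSON structure shown above:

```python
# Illustrative check that the analysis JSON has the top-level keys described above.
import json
from pathlib import Path

analysis = json.loads(Path("analysis.json").read_text())  # replace with the output_path you were given
required = {
    "comparison_summary", "winner_strengths", "loser_weaknesses",
    "instruction_following", "improvement_suggestions", "transcript_insights",
}
missing = required - analysis.keys()
if missing:
    raise ValueError(f"analysis output missing keys: {sorted(missing)}")
high_impact = [s for s in analysis["improvement_suggestions"] if s.get("priority") == "high"]
print(f"{len(high_impact)} high-priority suggestions recorded")
```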
+ +## Categories for Suggestions + +Use these categories to organize improvement suggestions: + +| Category | Description | +|----------|-------------| +| `instructions` | Changes to the skill's prose instructions | +| `tools` | Scripts, templates, or utilities to add/modify | +| `examples` | Example inputs/outputs to include | +| `error_handling` | Guidance for handling failures | +| `structure` | Reorganization of skill content | +| `references` | External docs or resources to add | + +## Priority Levels + +- **high**: Would likely change the outcome of this comparison +- **medium**: Would improve quality but may not change win/loss +- **low**: Nice to have, marginal improvement + +--- + +# Analyzing Benchmark Results + +When analyzing benchmark results, the analyzer's purpose is to **surface patterns and anomalies** across multiple runs, not suggest skill improvements. + +## Role + +Review all benchmark run results and generate freeform notes that help the user understand skill performance. Focus on patterns that wouldn't be visible from aggregate metrics alone. + +## Inputs + +You receive these parameters in your prompt: + +- **benchmark_data_path**: Path to the in-progress benchmark.json with all run results +- **skill_path**: Path to the skill being benchmarked +- **output_path**: Where to save the notes (as JSON array of strings) + +## Process + +### Step 1: Read Benchmark Data + +1. Read the benchmark.json containing all run results +2. Note the configurations tested (with_skill, without_skill) +3. Understand the run_summary aggregates already calculated + +### Step 2: Analyze Per-Assertion Patterns + +For each expectation across all runs: +- Does it **always pass** in both configurations? (may not differentiate skill value) +- Does it **always fail** in both configurations? (may be broken or beyond capability) +- Does it **always pass with skill but fail without**? (skill clearly adds value here) +- Does it **always fail with skill but pass without**? (skill may be hurting) +- Is it **highly variable**? (flaky expectation or non-deterministic behavior) + +### Step 3: Analyze Cross-Eval Patterns + +Look for patterns across evals: +- Are certain eval types consistently harder/easier? +- Do some evals show high variance while others are stable? +- Are there surprising results that contradict expectations? + +### Step 4: Analyze Metrics Patterns + +Look at time_seconds, tokens, tool_calls: +- Does the skill significantly increase execution time? +- Is there high variance in resource usage? +- Are there outlier runs that skew the aggregates? + +### Step 5: Generate Notes + +Write freeform observations as a list of strings. 
Each note should: +- State a specific observation +- Be grounded in the data (not speculation) +- Help the user understand something the aggregate metrics don't show + +Examples: +- "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value" +- "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure that may be flaky" +- "Without-skill runs consistently fail on table extraction expectations (0% pass rate)" +- "Skill adds 13s average execution time but improves pass rate by 50%" +- "Token usage is 80% higher with skill, primarily due to script output parsing" +- "All 3 without-skill runs for eval 1 produced empty output" + +### Step 6: Write Notes + +Save notes to `{output_path}` as a JSON array of strings: + +```json +[ + "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value", + "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure", + "Without-skill runs consistently fail on table extraction expectations", + "Skill adds 13s average execution time but improves pass rate by 50%" +] +``` + +## Guidelines + +**DO:** +- Report what you observe in the data +- Be specific about which evals, expectations, or runs you're referring to +- Note patterns that aggregate metrics would hide +- Provide context that helps interpret the numbers + +**DO NOT:** +- Suggest improvements to the skill (that's for the improvement step, not benchmarking) +- Make subjective quality judgments ("the output was good/bad") +- Speculate about causes without evidence +- Repeat information already in the run_summary aggregates diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/agents/comparator.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/agents/comparator.md new file mode 100644 index 000000000..80e00eb45 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/agents/comparator.md @@ -0,0 +1,202 @@ +# Blind Comparator Agent + +Compare two outputs WITHOUT knowing which skill produced them. + +## Role + +The Blind Comparator judges which output better accomplishes the eval task. You receive two outputs labeled A and B, but you do NOT know which skill produced which. This prevents bias toward a particular skill or approach. + +Your judgment is based purely on output quality and task completion. + +## Inputs + +You receive these parameters in your prompt: + +- **output_a_path**: Path to the first output file or directory +- **output_b_path**: Path to the second output file or directory +- **eval_prompt**: The original task/prompt that was executed +- **expectations**: List of expectations to check (optional - may be empty) + +## Process + +### Step 1: Read Both Outputs + +1. Examine output A (file or directory) +2. Examine output B (file or directory) +3. Note the type, structure, and content of each +4. If outputs are directories, examine all relevant files inside + +### Step 2: Understand the Task + +1. Read the eval_prompt carefully +2. Identify what the task requires: + - What should be produced? + - What qualities matter (accuracy, completeness, format)? + - What would distinguish a good output from a poor one? 
+ +### Step 3: Generate Evaluation Rubric + +Based on the task, generate a rubric with two dimensions: + +**Content Rubric** (what the output contains): +| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) | +|-----------|----------|----------------|---------------| +| Correctness | Major errors | Minor errors | Fully correct | +| Completeness | Missing key elements | Mostly complete | All elements present | +| Accuracy | Significant inaccuracies | Minor inaccuracies | Accurate throughout | + +**Structure Rubric** (how the output is organized): +| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) | +|-----------|----------|----------------|---------------| +| Organization | Disorganized | Reasonably organized | Clear, logical structure | +| Formatting | Inconsistent/broken | Mostly consistent | Professional, polished | +| Usability | Difficult to use | Usable with effort | Easy to use | + +Adapt criteria to the specific task. For example: +- PDF form → "Field alignment", "Text readability", "Data placement" +- Document → "Section structure", "Heading hierarchy", "Paragraph flow" +- Data output → "Schema correctness", "Data types", "Completeness" + +### Step 4: Evaluate Each Output Against the Rubric + +For each output (A and B): + +1. **Score each criterion** on the rubric (1-5 scale) +2. **Calculate dimension totals**: Content score, Structure score +3. **Calculate overall score**: Average of dimension scores, scaled to 1-10 + +### Step 5: Check Assertions (if provided) + +If expectations are provided: + +1. Check each expectation against output A +2. Check each expectation against output B +3. Count pass rates for each output +4. Use expectation scores as secondary evidence (not the primary decision factor) + +### Step 6: Determine the Winner + +Compare A and B based on (in priority order): + +1. **Primary**: Overall rubric score (content + structure) +2. **Secondary**: Assertion pass rates (if applicable) +3. **Tiebreaker**: If truly equal, declare a TIE + +Be decisive - ties should be rare. One output is usually better, even if marginally. + +### Step 7: Write Comparison Results + +Save results to a JSON file at the path specified (or `comparison.json` if not specified). + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "winner": "A", + "reasoning": "Output A provides a complete solution with proper formatting and all required fields. 
Output B is missing the date field and has formatting inconsistencies.", + "rubric": { + "A": { + "content": { + "correctness": 5, + "completeness": 5, + "accuracy": 4 + }, + "structure": { + "organization": 4, + "formatting": 5, + "usability": 4 + }, + "content_score": 4.7, + "structure_score": 4.3, + "overall_score": 9.0 + }, + "B": { + "content": { + "correctness": 3, + "completeness": 2, + "accuracy": 3 + }, + "structure": { + "organization": 3, + "formatting": 2, + "usability": 3 + }, + "content_score": 2.7, + "structure_score": 2.7, + "overall_score": 5.4 + } + }, + "output_quality": { + "A": { + "score": 9, + "strengths": ["Complete solution", "Well-formatted", "All fields present"], + "weaknesses": ["Minor style inconsistency in header"] + }, + "B": { + "score": 5, + "strengths": ["Readable output", "Correct basic structure"], + "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"] + } + }, + "expectation_results": { + "A": { + "passed": 4, + "total": 5, + "pass_rate": 0.80, + "details": [ + {"text": "Output includes name", "passed": true}, + {"text": "Output includes date", "passed": true}, + {"text": "Format is PDF", "passed": true}, + {"text": "Contains signature", "passed": false}, + {"text": "Readable text", "passed": true} + ] + }, + "B": { + "passed": 3, + "total": 5, + "pass_rate": 0.60, + "details": [ + {"text": "Output includes name", "passed": true}, + {"text": "Output includes date", "passed": false}, + {"text": "Format is PDF", "passed": true}, + {"text": "Contains signature", "passed": false}, + {"text": "Readable text", "passed": true} + ] + } + } +} +``` + +If no expectations were provided, omit the `expectation_results` field entirely. + +## Field Descriptions + +- **winner**: "A", "B", or "TIE" +- **reasoning**: Clear explanation of why the winner was chosen (or why it's a tie) +- **rubric**: Structured rubric evaluation for each output + - **content**: Scores for content criteria (correctness, completeness, accuracy) + - **structure**: Scores for structure criteria (organization, formatting, usability) + - **content_score**: Average of content criteria (1-5) + - **structure_score**: Average of structure criteria (1-5) + - **overall_score**: Combined score scaled to 1-10 +- **output_quality**: Summary quality assessment + - **score**: 1-10 rating (should match rubric overall_score) + - **strengths**: List of positive aspects + - **weaknesses**: List of issues or shortcomings +- **expectation_results**: (Only if expectations provided) + - **passed**: Number of expectations that passed + - **total**: Total number of expectations + - **pass_rate**: Fraction passed (0.0 to 1.0) + - **details**: Individual expectation results + +## Guidelines + +- **Stay blind**: DO NOT try to infer which skill produced which output. Judge purely on output quality. +- **Be specific**: Cite specific examples when explaining strengths and weaknesses. +- **Be decisive**: Choose a winner unless outputs are genuinely equivalent. +- **Output quality first**: Assertion scores are secondary to overall task completion. +- **Be objective**: Don't favor outputs based on style preferences; focus on correctness and completeness. +- **Explain your reasoning**: The reasoning field should make it clear why you chose the winner. +- **Handle edge cases**: If both outputs fail, pick the one that fails less badly. If both are excellent, pick the one that's marginally better. 
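+
+## Worked Example: Rubric Arithmetic
+
+The sample scores above (content 4.7, structure 4.3, overall 9.0 for output A) follow from averaging each dimension's criteria and then combining the dimensions. The sketch below assumes equal weighting of criteria and dimensions, and reads "scaled to 1-10" as doubling the 1-5 average — the reading that reproduces the sample numbers; criterion names are illustrative and should be adapted to the rubric you generate.
+
+```python
+def dimension_score(criteria: dict[str, int]) -> float:
+    """Average the 1-5 criterion scores within one dimension."""
+    return sum(criteria.values()) / len(criteria)
+
+
+def overall_score(content: dict[str, int], structure: dict[str, int]) -> float:
+    """Average the two dimension scores, then double the 1-5 result to land on a 1-10 scale."""
+    return (dimension_score(content) + dimension_score(structure)) / 2 * 2
+
+
+# Output A from the sample comparison above
+a_content = {"correctness": 5, "completeness": 5, "accuracy": 4}
+a_structure = {"organization": 4, "formatting": 5, "usability": 4}
+
+print(round(dimension_score(a_content), 1))              # 4.7
+print(round(dimension_score(a_structure), 1))            # 4.3
+print(round(overall_score(a_content, a_structure), 1))   # 9.0
+```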
diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/agents/grader.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/agents/grader.md new file mode 100644 index 000000000..558ab05c0 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/agents/grader.md @@ -0,0 +1,223 @@ +# Grader Agent + +Evaluate expectations against an execution transcript and outputs. + +## Role + +The Grader reviews a transcript and output files, then determines whether each expectation passes or fails. Provide clear evidence for each judgment. + +You have two jobs: grade the outputs, and critique the evals themselves. A passing grade on a weak assertion is worse than useless — it creates false confidence. When you notice an assertion that's trivially satisfied, or an important outcome that no assertion checks, say so. + +## Inputs + +You receive these parameters in your prompt: + +- **expectations**: List of expectations to evaluate (strings) +- **transcript_path**: Path to the execution transcript (markdown file) +- **outputs_dir**: Directory containing output files from execution + +## Process + +### Step 1: Read the Transcript + +1. Read the transcript file completely +2. Note the eval prompt, execution steps, and final result +3. Identify any issues or errors documented + +### Step 2: Examine Output Files + +1. List files in outputs_dir +2. Read/examine each file relevant to the expectations. If outputs aren't plain text, use the inspection tools provided in your prompt — don't rely solely on what the transcript says the executor produced. +3. Note contents, structure, and quality + +### Step 3: Evaluate Each Assertion + +For each expectation: + +1. **Search for evidence** in the transcript and outputs +2. **Determine verdict**: + - **PASS**: Clear evidence the expectation is true AND the evidence reflects genuine task completion, not just surface-level compliance + - **FAIL**: No evidence, or evidence contradicts the expectation, or the evidence is superficial (e.g., correct filename but empty/wrong content) +3. **Cite the evidence**: Quote the specific text or describe what you found + +### Step 4: Extract and Verify Claims + +Beyond the predefined expectations, extract implicit claims from the outputs and verify them: + +1. **Extract claims** from the transcript and outputs: + - Factual statements ("The form has 12 fields") + - Process claims ("Used pypdf to fill the form") + - Quality claims ("All fields were filled correctly") + +2. **Verify each claim**: + - **Factual claims**: Can be checked against the outputs or external sources + - **Process claims**: Can be verified from the transcript + - **Quality claims**: Evaluate whether the claim is justified + +3. **Flag unverifiable claims**: Note claims that cannot be verified with available information + +This catches issues that predefined expectations might miss. + +### Step 5: Read User Notes + +If `{outputs_dir}/user_notes.md` exists: +1. Read it and note any uncertainties or issues flagged by the executor +2. Include relevant concerns in the grading output +3. These may reveal problems even when expectations pass + +### Step 6: Critique the Evals + +After grading, consider whether the evals themselves could be improved. Only surface suggestions when there's a clear gap. + +Good suggestions test meaningful outcomes — assertions that are hard to satisfy without actually doing the work correctly. 
Think about what makes an assertion *discriminating*: it passes when the skill genuinely succeeds and fails when it doesn't. + +Suggestions worth raising: +- An assertion that passed but would also pass for a clearly wrong output (e.g., checking filename existence but not file content) +- An important outcome you observed — good or bad — that no assertion covers at all +- An assertion that can't actually be verified from the available outputs + +Keep the bar high. The goal is to flag things the eval author would say "good catch" about, not to nitpick every assertion. + +### Step 7: Write Grading Results + +Save results to `{outputs_dir}/../grading.json` (sibling to outputs_dir). + +## Grading Criteria + +**PASS when**: +- The transcript or outputs clearly demonstrate the expectation is true +- Specific evidence can be cited +- The evidence reflects genuine substance, not just surface compliance (e.g., a file exists AND contains correct content, not just the right filename) + +**FAIL when**: +- No evidence found for the expectation +- Evidence contradicts the expectation +- The expectation cannot be verified from available information +- The evidence is superficial — the assertion is technically satisfied but the underlying task outcome is wrong or incomplete +- The output appears to meet the assertion by coincidence rather than by actually doing the work + +**When uncertain**: The burden of proof to pass is on the expectation. + +### Step 8: Read Executor Metrics and Timing + +1. If `{outputs_dir}/metrics.json` exists, read it and include in grading output +2. If `{outputs_dir}/../timing.json` exists, read it and include timing data + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "expectations": [ + { + "text": "The output includes the name 'John Smith'", + "passed": true, + "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'" + }, + { + "text": "The spreadsheet has a SUM formula in cell B10", + "passed": false, + "evidence": "No spreadsheet was created. The output was a text file." 
+ }, + { + "text": "The assistant used the skill's OCR script", + "passed": true, + "evidence": "Transcript Step 2 shows: 'Tool: Bash - python ocr_script.py image.png'" + } + ], + "summary": { + "passed": 2, + "failed": 1, + "total": 3, + "pass_rate": 0.67 + }, + "execution_metrics": { + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8 + }, + "total_tool_calls": 15, + "total_steps": 6, + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 + }, + "timing": { + "executor_duration_seconds": 165.0, + "grader_duration_seconds": 26.0, + "total_duration_seconds": 191.0 + }, + "claims": [ + { + "claim": "The form has 12 fillable fields", + "type": "factual", + "verified": true, + "evidence": "Counted 12 fields in field_info.json" + }, + { + "claim": "All required fields were populated", + "type": "quality", + "verified": false, + "evidence": "Reference section was left blank despite data being available" + } + ], + "user_notes_summary": { + "uncertainties": ["Used 2023 data, may be stale"], + "needs_review": [], + "workarounds": ["Fell back to text overlay for non-fillable fields"] + }, + "eval_feedback": { + "suggestions": [ + { + "assertion": "The output includes the name 'John Smith'", + "reason": "A hallucinated document that mentions the name would also pass — consider checking it appears as the primary contact with matching phone and email from the input" + }, + { + "reason": "No assertion checks whether the extracted phone numbers match the input — I observed incorrect numbers in the output that went uncaught" + } + ], + "overall": "Assertions check presence but not correctness. Consider adding content verification." + } +} +``` + +## Field Descriptions + +- **expectations**: Array of graded expectations + - **text**: The original expectation text + - **passed**: Boolean - true if expectation passes + - **evidence**: Specific quote or description supporting the verdict +- **summary**: Aggregate statistics + - **passed**: Count of passed expectations + - **failed**: Count of failed expectations + - **total**: Total expectations evaluated + - **pass_rate**: Fraction passed (0.0 to 1.0) +- **execution_metrics**: Copied from executor's metrics.json (if available) + - **output_chars**: Total character count of output files (proxy for tokens) + - **transcript_chars**: Character count of transcript +- **timing**: Wall clock timing from timing.json (if available) + - **executor_duration_seconds**: Time spent in executor subagent + - **total_duration_seconds**: Total elapsed time for the run +- **claims**: Extracted and verified claims from the output + - **claim**: The statement being verified + - **type**: "factual", "process", or "quality" + - **verified**: Boolean - whether the claim holds + - **evidence**: Supporting or contradicting evidence +- **user_notes_summary**: Issues flagged by the executor + - **uncertainties**: Things the executor wasn't sure about + - **needs_review**: Items requiring human attention + - **workarounds**: Places where the skill didn't work as expected +- **eval_feedback**: Improvement suggestions for the evals (only when warranted) + - **suggestions**: List of concrete suggestions, each with a `reason` and optionally an `assertion` it relates to + - **overall**: Brief assessment — can be "No suggestions, evals look solid" if nothing to flag + +## Guidelines + +- **Be objective**: Base verdicts on evidence, not assumptions +- **Be specific**: Quote the exact text that supports your verdict +- **Be thorough**: Check both transcript and 
output files +- **Be consistent**: Apply the same standard to each expectation +- **Explain failures**: Make it clear why evidence was insufficient +- **No partial credit**: Each expectation is pass or fail, not partial diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/assets/eval_review.html b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/assets/eval_review.html new file mode 100644 index 000000000..938ff32ae --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/assets/eval_review.html @@ -0,0 +1,146 @@ + + + + + + Eval Set Review - __SKILL_NAME_PLACEHOLDER__ + + + + + + +

+Eval Set Review: __SKILL_NAME_PLACEHOLDER__
+Current description: __SKILL_DESCRIPTION_PLACEHOLDER__
+Query | Should Trigger | Actions
+ + + + diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/eval-viewer/generate_review.py b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/eval-viewer/generate_review.py new file mode 100644 index 000000000..4f0b1fe00 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/eval-viewer/generate_review.py @@ -0,0 +1,473 @@ +#!/usr/bin/env python3 +"""Generate and serve a review page for eval results. + +Reads the workspace directory, discovers runs (directories with outputs/), +embeds all output data into a self-contained HTML page, and serves it via +a tiny HTTP server. Feedback auto-saves to feedback.json in the workspace. + +Usage: + python generate_review.py [--port PORT] [--skill-name NAME] + python generate_review.py --previous-feedback /path/to/old/feedback.json + +No dependencies beyond the Python stdlib are required. +""" + +import argparse +import base64 +import json +import mimetypes +import os +import re +import signal +import subprocess +import sys +import time +import webbrowser +from functools import partial +from http.server import HTTPServer, BaseHTTPRequestHandler +from pathlib import Path + +from loguru import logger + +# Files to exclude from output listings +METADATA_FILES = {"transcript.md", "user_notes.md", "metrics.json"} + +# Extensions we render as inline text +TEXT_EXTENSIONS = { + ".txt", ".md", ".json", ".csv", ".py", ".js", ".ts", ".tsx", ".jsx", + ".yaml", ".yml", ".xml", ".html", ".css", ".sh", ".rb", ".go", ".rs", + ".java", ".c", ".cpp", ".h", ".hpp", ".sql", ".r", ".toml", +} + +# Extensions we render as inline images +IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp"} + +# MIME type overrides for common types +MIME_OVERRIDES = { + ".svg": "image/svg+xml", + ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation", +} + + +def get_mime_type(path: Path) -> str: + ext = path.suffix.lower() + if ext in MIME_OVERRIDES: + return MIME_OVERRIDES[ext] + mime, _ = mimetypes.guess_type(str(path)) + return mime or "application/octet-stream" + + +def find_runs(workspace: Path) -> list[dict]: + """Recursively find directories that contain an outputs/ subdirectory.""" + runs: list[dict] = [] + _find_runs_recursive(workspace, workspace, runs) + runs.sort(key=lambda r: (r.get("eval_id", float("inf")), r["id"])) + return runs + + +def _find_runs_recursive(root: Path, current: Path, runs: list[dict]) -> None: + if not current.is_dir(): + return + + outputs_dir = current / "outputs" + if outputs_dir.is_dir(): + run = build_run(root, current) + if run: + runs.append(run) + return + + skip = {"node_modules", ".git", "__pycache__", "skill", "inputs"} + for child in sorted(current.iterdir()): + if child.is_dir() and child.name not in skip: + _find_runs_recursive(root, child, runs) + + +def build_run(root: Path, run_dir: Path) -> dict | None: + """Build a run dict with prompt, outputs, and grading data.""" + prompt = "" + eval_id = None + + # Try eval_metadata.json + for candidate in [run_dir / "eval_metadata.json", run_dir.parent / "eval_metadata.json"]: + if candidate.exists(): + try: + metadata = json.loads(candidate.read_text()) + prompt = metadata.get("prompt", "") + eval_id = metadata.get("eval_id") + except (json.JSONDecodeError, OSError): + pass + if 
prompt: + break + + # Fall back to transcript.md + if not prompt: + for candidate in [run_dir / "transcript.md", run_dir / "outputs" / "transcript.md"]: + if candidate.exists(): + try: + text = candidate.read_text() + match = re.search(r"## Eval Prompt\n\n([\s\S]*?)(?=\n##|$)", text) + if match: + prompt = match.group(1).strip() + except OSError: + pass + if prompt: + break + + if not prompt: + prompt = "(No prompt found)" + + run_id = str(run_dir.relative_to(root)).replace("/", "-").replace("\\", "-") + + # Collect output files + outputs_dir = run_dir / "outputs" + output_files: list[dict] = [] + if outputs_dir.is_dir(): + for f in sorted(outputs_dir.iterdir()): + if f.is_file() and f.name not in METADATA_FILES: + output_files.append(embed_file(f)) + + # Load grading if present + grading = None + for candidate in [run_dir / "grading.json", run_dir.parent / "grading.json"]: + if candidate.exists(): + try: + grading = json.loads(candidate.read_text()) + except (json.JSONDecodeError, OSError): + pass + if grading: + break + + return { + "id": run_id, + "prompt": prompt, + "eval_id": eval_id, + "outputs": output_files, + "grading": grading, + } + + +def embed_file(path: Path) -> dict: + """Read a file and return an embedded representation.""" + ext = path.suffix.lower() + mime = get_mime_type(path) + + if ext in TEXT_EXTENSIONS: + try: + content = path.read_text(errors="replace") + except OSError: + content = "(Error reading file)" + return { + "name": path.name, + "type": "text", + "content": content, + } + elif ext in IMAGE_EXTENSIONS: + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "image", + "mime": mime, + "data_uri": f"data:{mime};base64,{b64}", + } + elif ext == ".pdf": + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "pdf", + "data_uri": f"data:{mime};base64,{b64}", + } + elif ext == ".xlsx": + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "xlsx", + "data_b64": b64, + } + else: + # Binary / unknown — base64 download link + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "binary", + "mime": mime, + "data_uri": f"data:{mime};base64,{b64}", + } + + +def load_previous_iteration(workspace: Path) -> dict[str, dict]: + """Load previous iteration's feedback and outputs. + + Returns a map of run_id -> {"feedback": str, "outputs": list[dict]}. 
+ """ + result: dict[str, dict] = {} + + # Load feedback + feedback_map: dict[str, str] = {} + feedback_path = workspace / "feedback.json" + if feedback_path.exists(): + try: + data = json.loads(feedback_path.read_text()) + feedback_map = { + r["run_id"]: r["feedback"] + for r in data.get("reviews", []) + if r.get("feedback", "").strip() + } + except (json.JSONDecodeError, OSError, KeyError): + pass + + # Load runs (to get outputs) + prev_runs = find_runs(workspace) + for run in prev_runs: + result[run["id"]] = { + "feedback": feedback_map.get(run["id"], ""), + "outputs": run.get("outputs", []), + } + + # Also add feedback for run_ids that had feedback but no matching run + for run_id, fb in feedback_map.items(): + if run_id not in result: + result[run_id] = {"feedback": fb, "outputs": []} + + return result + + +def generate_html( + runs: list[dict], + skill_name: str, + previous: dict[str, dict] | None = None, + benchmark: dict | None = None, +) -> str: + """Generate the complete standalone HTML page with embedded data.""" + template_path = Path(__file__).parent / "viewer.html" + template = template_path.read_text() + + # Build previous_feedback and previous_outputs maps for the template + previous_feedback: dict[str, str] = {} + previous_outputs: dict[str, list[dict]] = {} + if previous: + for run_id, data in previous.items(): + if data.get("feedback"): + previous_feedback[run_id] = data["feedback"] + if data.get("outputs"): + previous_outputs[run_id] = data["outputs"] + + embedded = { + "skill_name": skill_name, + "runs": runs, + "previous_feedback": previous_feedback, + "previous_outputs": previous_outputs, + } + if benchmark: + embedded["benchmark"] = benchmark + + data_json = json.dumps(embedded) + + return template.replace("/*__EMBEDDED_DATA__*/", f"const EMBEDDED_DATA = {data_json};") + + +# --------------------------------------------------------------------------- +# HTTP server (stdlib only, zero dependencies) +# --------------------------------------------------------------------------- + +def _kill_port(port: int) -> None: + """Kill any process listening on the given port.""" + try: + result = subprocess.run( + ["lsof", "-ti", f":{port}"], + capture_output=True, text=True, timeout=5, + ) + for pid_str in result.stdout.strip().split("\n"): + if pid_str.strip(): + try: + os.kill(int(pid_str.strip()), signal.SIGTERM) + except (ProcessLookupError, ValueError): + pass + if result.stdout.strip(): + time.sleep(0.5) + except subprocess.TimeoutExpired: + pass + except FileNotFoundError: + logger.warning("Note: lsof not found, cannot check if port is in use") + +class ReviewHandler(BaseHTTPRequestHandler): + """Serves the review HTML and handles feedback saves. + + Regenerates the HTML on each page load so that refreshing the browser + picks up new eval outputs without restarting the server. 
+ """ + + def __init__( + self, + workspace: Path, + skill_name: str, + feedback_path: Path, + previous: dict[str, dict], + benchmark_path: Path | None, + *args, + **kwargs, + ): + self.workspace = workspace + self.skill_name = skill_name + self.feedback_path = feedback_path + self.previous = previous + self.benchmark_path = benchmark_path + super().__init__(*args, **kwargs) + + def do_GET(self) -> None: + if self.path == "/" or self.path == "/index.html": + # Regenerate HTML on each request (re-scans workspace for new outputs) + runs = find_runs(self.workspace) + benchmark = None + if self.benchmark_path and self.benchmark_path.exists(): + try: + benchmark = json.loads(self.benchmark_path.read_text()) + except (json.JSONDecodeError, OSError): + pass + html = generate_html(runs, self.skill_name, self.previous, benchmark) + content = html.encode("utf-8") + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(content))) + self.end_headers() + self.wfile.write(content) + elif self.path == "/api/feedback": + data = b"{}" + if self.feedback_path.exists(): + data = self.feedback_path.read_bytes() + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(data))) + self.end_headers() + self.wfile.write(data) + else: + self.send_error(404) + + def do_POST(self) -> None: + if self.path == "/api/feedback": + length = int(self.headers.get("Content-Length", 0)) + body = self.rfile.read(length) + try: + data = json.loads(body) + if not isinstance(data, dict) or "reviews" not in data: + raise ValueError("Expected JSON object with 'reviews' key") + self.feedback_path.write_text(json.dumps(data, indent=2) + "\n") + resp = b'{"ok":true}' + self.send_response(200) + except (json.JSONDecodeError, OSError, ValueError) as e: + resp = json.dumps({"error": str(e)}).encode() + self.send_response(500) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(resp))) + self.end_headers() + self.wfile.write(resp) + else: + self.send_error(404) + + def log_message(self, format: str, *args: object) -> None: + # Suppress request logging to keep terminal clean + pass + + +def main() -> None: + parser = argparse.ArgumentParser(description="Generate and serve eval review") + parser.add_argument("workspace", type=Path, help="Path to workspace directory") + parser.add_argument("--port", "-p", type=int, default=3117, help="Server port (default: 3117)") + parser.add_argument("--skill-name", "-n", type=str, default=None, help="Skill name for header") + parser.add_argument( + "--previous-workspace", type=Path, default=None, + help="Path to previous iteration's workspace (shows old outputs and feedback as context)", + ) + parser.add_argument( + "--benchmark", type=Path, default=None, + help="Path to benchmark.json to show in the Benchmark tab", + ) + parser.add_argument( + "--static", "-s", type=Path, default=None, + help="Write standalone HTML to this path instead of starting a server", + ) + args = parser.parse_args() + + workspace = args.workspace.resolve() + if not workspace.is_dir(): + logger.error(f"Error: {workspace} is not a directory") + sys.exit(1) + + runs = find_runs(workspace) + if not runs: + logger.error(f"No runs found in {workspace}") + sys.exit(1) + + skill_name = args.skill_name or workspace.name.replace("-workspace", "") + feedback_path = workspace / "feedback.json" + + previous: dict[str, dict] = {} + if 
args.previous_workspace: + previous = load_previous_iteration(args.previous_workspace.resolve()) + + benchmark_path = args.benchmark.resolve() if args.benchmark else None + benchmark = None + if benchmark_path and benchmark_path.exists(): + try: + benchmark = json.loads(benchmark_path.read_text()) + except (json.JSONDecodeError, OSError): + pass + + if args.static: + html = generate_html(runs, skill_name, previous, benchmark) + args.static.parent.mkdir(parents=True, exist_ok=True) + args.static.write_text(html) + logger.info(f"\n Static viewer written to: {args.static}\n") + sys.exit(0) + + # Kill any existing process on the target port + port = args.port + _kill_port(port) + handler = partial(ReviewHandler, workspace, skill_name, feedback_path, previous, benchmark_path) + try: + server = HTTPServer(("127.0.0.1", port), handler) + except OSError: + # Port still in use after kill attempt — find a free one + server = HTTPServer(("127.0.0.1", 0), handler) + port = server.server_address[1] + + url = f"http://localhost:{port}" + logger.info(f"\n Eval Viewer") + logger.info(f" ─────────────────────────────────") + logger.info(f" URL: {url}") + logger.info(f" Workspace: {workspace}") + logger.info(f" Feedback: {feedback_path}") + if previous: + logger.info(f" Previous: {args.previous_workspace} ({len(previous)} runs)") + if benchmark_path: + logger.info(f" Benchmark: {benchmark_path}") + logger.info(f"\n Press Ctrl+C to stop.\n") + + webbrowser.open(url) + + try: + server.serve_forever() + except KeyboardInterrupt: + logger.info("\nStopped.") + server.server_close() + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/eval-viewer/viewer.html b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/eval-viewer/viewer.html new file mode 100644 index 000000000..6d8e96348 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/eval-viewer/viewer.html @@ -0,0 +1,1325 @@ + + + + + + Eval Review + + + + + + + +
+Eval Review:
+Review each output and leave feedback below. Navigate with arrow keys or buttons. When done, copy feedback and paste into Claude Code.
+Prompt
+Output
+No output files found
+Your Feedback
+No benchmark data available. Run a benchmark to see quantitative results here.
+Review Complete
+Your feedback has been saved. Go back to your Claude Code session and tell Claude you're done reviewing.
+ + + + diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/references/schemas.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/references/schemas.md new file mode 100644 index 000000000..b6eeaa2d4 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/references/schemas.md @@ -0,0 +1,430 @@ +# JSON Schemas + +This document defines the JSON schemas used by skill-creator. + +--- + +## evals.json + +Defines the evals for a skill. Located at `evals/evals.json` within the skill directory. + +```json +{ + "skill_name": "example-skill", + "evals": [ + { + "id": 1, + "prompt": "User's example prompt", + "expected_output": "Description of expected result", + "files": ["evals/files/sample1.pdf"], + "expectations": [ + "The output includes X", + "The skill used script Y" + ] + } + ] +} +``` + +**Fields:** +- `skill_name`: Name matching the skill's frontmatter +- `evals[].id`: Unique integer identifier +- `evals[].prompt`: The task to execute +- `evals[].expected_output`: Human-readable description of success +- `evals[].files`: Optional list of input file paths (relative to skill root) +- `evals[].expectations`: List of verifiable statements + +--- + +## history.json + +Tracks version progression in Improve mode. Located at workspace root. + +```json +{ + "started_at": "2026-01-15T10:30:00Z", + "skill_name": "pdf", + "current_best": "v2", + "iterations": [ + { + "version": "v0", + "parent": null, + "expectation_pass_rate": 0.65, + "grading_result": "baseline", + "is_current_best": false + }, + { + "version": "v1", + "parent": "v0", + "expectation_pass_rate": 0.75, + "grading_result": "won", + "is_current_best": false + }, + { + "version": "v2", + "parent": "v1", + "expectation_pass_rate": 0.85, + "grading_result": "won", + "is_current_best": true + } + ] +} +``` + +**Fields:** +- `started_at`: ISO timestamp of when improvement started +- `skill_name`: Name of the skill being improved +- `current_best`: Version identifier of the best performer +- `iterations[].version`: Version identifier (v0, v1, ...) +- `iterations[].parent`: Parent version this was derived from +- `iterations[].expectation_pass_rate`: Pass rate from grading +- `iterations[].grading_result`: "baseline", "won", "lost", or "tie" +- `iterations[].is_current_best`: Whether this is the current best version + +--- + +## grading.json + +Output from the grader agent. Located at `/grading.json`. + +```json +{ + "expectations": [ + { + "text": "The output includes the name 'John Smith'", + "passed": true, + "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'" + }, + { + "text": "The spreadsheet has a SUM formula in cell B10", + "passed": false, + "evidence": "No spreadsheet was created. The output was a text file." 
+ } + ], + "summary": { + "passed": 2, + "failed": 1, + "total": 3, + "pass_rate": 0.67 + }, + "execution_metrics": { + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8 + }, + "total_tool_calls": 15, + "total_steps": 6, + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 + }, + "timing": { + "executor_duration_seconds": 165.0, + "grader_duration_seconds": 26.0, + "total_duration_seconds": 191.0 + }, + "claims": [ + { + "claim": "The form has 12 fillable fields", + "type": "factual", + "verified": true, + "evidence": "Counted 12 fields in field_info.json" + } + ], + "user_notes_summary": { + "uncertainties": ["Used 2023 data, may be stale"], + "needs_review": [], + "workarounds": ["Fell back to text overlay for non-fillable fields"] + }, + "eval_feedback": { + "suggestions": [ + { + "assertion": "The output includes the name 'John Smith'", + "reason": "A hallucinated document that mentions the name would also pass" + } + ], + "overall": "Assertions check presence but not correctness." + } +} +``` + +**Fields:** +- `expectations[]`: Graded expectations with evidence +- `summary`: Aggregate pass/fail counts +- `execution_metrics`: Tool usage and output size (from executor's metrics.json) +- `timing`: Wall clock timing (from timing.json) +- `claims`: Extracted and verified claims from the output +- `user_notes_summary`: Issues flagged by the executor +- `eval_feedback`: (optional) Improvement suggestions for the evals, only present when the grader identifies issues worth raising + +--- + +## metrics.json + +Output from the executor agent. Located at `/outputs/metrics.json`. + +```json +{ + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8, + "Edit": 1, + "Glob": 2, + "Grep": 0 + }, + "total_tool_calls": 18, + "total_steps": 6, + "files_created": ["filled_form.pdf", "field_values.json"], + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 +} +``` + +**Fields:** +- `tool_calls`: Count per tool type +- `total_tool_calls`: Sum of all tool calls +- `total_steps`: Number of major execution steps +- `files_created`: List of output files created +- `errors_encountered`: Number of errors during execution +- `output_chars`: Total character count of output files +- `transcript_chars`: Character count of transcript + +--- + +## timing.json + +Wall clock timing for a run. Located at `/timing.json`. + +**How to capture:** When a subagent task completes, the task notification includes `total_tokens` and `duration_ms`. Save these immediately — they are not persisted anywhere else and cannot be recovered after the fact. + +```json +{ + "total_tokens": 84852, + "duration_ms": 23332, + "total_duration_seconds": 23.3, + "executor_start": "2026-01-15T10:30:00Z", + "executor_end": "2026-01-15T10:32:45Z", + "executor_duration_seconds": 165.0, + "grader_start": "2026-01-15T10:32:46Z", + "grader_end": "2026-01-15T10:33:12Z", + "grader_duration_seconds": 26.0 +} +``` + +--- + +## benchmark.json + +Output from Benchmark mode. Located at `benchmarks//benchmark.json`. 
+ +```json +{ + "metadata": { + "skill_name": "pdf", + "skill_path": "/path/to/pdf", + "executor_model": "claude-sonnet-4-20250514", + "analyzer_model": "most-capable-model", + "timestamp": "2026-01-15T10:30:00Z", + "evals_run": [1, 2, 3], + "runs_per_configuration": 3 + }, + + "runs": [ + { + "eval_id": 1, + "eval_name": "Ocean", + "configuration": "with_skill", + "run_number": 1, + "result": { + "pass_rate": 0.85, + "passed": 6, + "failed": 1, + "total": 7, + "time_seconds": 42.5, + "tokens": 3800, + "tool_calls": 18, + "errors": 0 + }, + "expectations": [ + {"text": "...", "passed": true, "evidence": "..."} + ], + "notes": [ + "Used 2023 data, may be stale", + "Fell back to text overlay for non-fillable fields" + ] + } + ], + + "run_summary": { + "with_skill": { + "pass_rate": {"mean": 0.85, "stddev": 0.05, "min": 0.80, "max": 0.90}, + "time_seconds": {"mean": 45.0, "stddev": 12.0, "min": 32.0, "max": 58.0}, + "tokens": {"mean": 3800, "stddev": 400, "min": 3200, "max": 4100} + }, + "without_skill": { + "pass_rate": {"mean": 0.35, "stddev": 0.08, "min": 0.28, "max": 0.45}, + "time_seconds": {"mean": 32.0, "stddev": 8.0, "min": 24.0, "max": 42.0}, + "tokens": {"mean": 2100, "stddev": 300, "min": 1800, "max": 2500} + }, + "delta": { + "pass_rate": "+0.50", + "time_seconds": "+13.0", + "tokens": "+1700" + } + }, + + "notes": [ + "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value", + "Eval 3 shows high variance (50% ± 40%) - may be flaky or model-dependent", + "Without-skill runs consistently fail on table extraction expectations", + "Skill adds 13s average execution time but improves pass rate by 50%" + ] +} +``` + +**Fields:** +- `metadata`: Information about the benchmark run + - `skill_name`: Name of the skill + - `timestamp`: When the benchmark was run + - `evals_run`: List of eval names or IDs + - `runs_per_configuration`: Number of runs per config (e.g. 3) +- `runs[]`: Individual run results + - `eval_id`: Numeric eval identifier + - `eval_name`: Human-readable eval name (used as section header in the viewer) + - `configuration`: Must be `"with_skill"` or `"without_skill"` (the viewer uses this exact string for grouping and color coding) + - `run_number`: Integer run number (1, 2, 3...) + - `result`: Nested object with `pass_rate`, `passed`, `total`, `time_seconds`, `tokens`, `errors` +- `run_summary`: Statistical aggregates per configuration + - `with_skill` / `without_skill`: Each contains `pass_rate`, `time_seconds`, `tokens` objects with `mean` and `stddev` fields + - `delta`: Difference strings like `"+0.50"`, `"+13.0"`, `"+1700"` +- `notes`: Freeform observations from the analyzer + +**Important:** The viewer reads these field names exactly. Using `config` instead of `configuration`, or putting `pass_rate` at the top level of a run instead of nested under `result`, will cause the viewer to show empty/zero values. Always reference this schema when generating benchmark.json manually. + +--- + +## comparison.json + +Output from blind comparator. Located at `/comparison-N.json`. + +```json +{ + "winner": "A", + "reasoning": "Output A provides a complete solution with proper formatting and all required fields. 
Output B is missing the date field and has formatting inconsistencies.", + "rubric": { + "A": { + "content": { + "correctness": 5, + "completeness": 5, + "accuracy": 4 + }, + "structure": { + "organization": 4, + "formatting": 5, + "usability": 4 + }, + "content_score": 4.7, + "structure_score": 4.3, + "overall_score": 9.0 + }, + "B": { + "content": { + "correctness": 3, + "completeness": 2, + "accuracy": 3 + }, + "structure": { + "organization": 3, + "formatting": 2, + "usability": 3 + }, + "content_score": 2.7, + "structure_score": 2.7, + "overall_score": 5.4 + } + }, + "output_quality": { + "A": { + "score": 9, + "strengths": ["Complete solution", "Well-formatted", "All fields present"], + "weaknesses": ["Minor style inconsistency in header"] + }, + "B": { + "score": 5, + "strengths": ["Readable output", "Correct basic structure"], + "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"] + } + }, + "expectation_results": { + "A": { + "passed": 4, + "total": 5, + "pass_rate": 0.80, + "details": [ + {"text": "Output includes name", "passed": true} + ] + }, + "B": { + "passed": 3, + "total": 5, + "pass_rate": 0.60, + "details": [ + {"text": "Output includes name", "passed": true} + ] + } + } +} +``` + +--- + +## analysis.json + +Output from post-hoc analyzer. Located at `/analysis.json`. + +```json +{ + "comparison_summary": { + "winner": "A", + "winner_skill": "path/to/winner/skill", + "loser_skill": "path/to/loser/skill", + "comparator_reasoning": "Brief summary of why comparator chose winner" + }, + "winner_strengths": [ + "Clear step-by-step instructions for handling multi-page documents", + "Included validation script that caught formatting errors" + ], + "loser_weaknesses": [ + "Vague instruction 'process the document appropriately' led to inconsistent behavior", + "No script for validation, agent had to improvise" + ], + "instruction_following": { + "winner": { + "score": 9, + "issues": ["Minor: skipped optional logging step"] + }, + "loser": { + "score": 6, + "issues": [ + "Did not use the skill's formatting template", + "Invented own approach instead of following step 3" + ] + } + }, + "improvement_suggestions": [ + { + "priority": "high", + "category": "instructions", + "suggestion": "Replace 'process the document appropriately' with explicit steps", + "expected_impact": "Would eliminate ambiguity that caused inconsistent behavior" + } + ], + "transcript_insights": { + "winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script", + "loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods" + } +} +``` diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/__init__.py b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/aggregate_benchmark.py b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/aggregate_benchmark.py new file mode 100644 index 000000000..ccc810819 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/aggregate_benchmark.py @@ -0,0 +1,403 @@ +#!/usr/bin/env python3 +""" +Aggregate individual run results into benchmark summary statistics. 
+ +Reads grading.json files from run directories and produces: +- run_summary with mean, stddev, min, max for each metric +- delta between with_skill and without_skill configurations + +Usage: + python aggregate_benchmark.py + +Example: + python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/ + +The script supports two directory layouts: + + Workspace layout (from skill-creator iterations): + / + └── eval-N/ + ├── with_skill/ + │ ├── run-1/grading.json + │ └── run-2/grading.json + └── without_skill/ + ├── run-1/grading.json + └── run-2/grading.json + + Legacy layout (with runs/ subdirectory): + / + └── runs/ + └── eval-N/ + ├── with_skill/ + │ └── run-1/grading.json + └── without_skill/ + └── run-1/grading.json +""" + +import argparse +import json +import math +import sys +from datetime import datetime, timezone +from pathlib import Path + +from loguru import logger + + +def calculate_stats(values: list[float]) -> dict: + """Calculate mean, stddev, min, max for a list of values.""" + if not values: + return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0} + + n = len(values) + mean = sum(values) / n + + if n > 1: + variance = sum((x - mean) ** 2 for x in values) / (n - 1) + stddev = math.sqrt(variance) + else: + stddev = 0.0 + + return { + "mean": round(mean, 4), + "stddev": round(stddev, 4), + "min": round(min(values), 4), + "max": round(max(values), 4) + } + + +def load_run_results(benchmark_dir: Path) -> dict: + """ + Load all run results from a benchmark directory. + + Returns dict keyed by config name (e.g. "with_skill"/"without_skill", + or "new_skill"/"old_skill"), each containing a list of run results. + """ + # Support both layouts: eval dirs directly under benchmark_dir, or under runs/ + runs_dir = benchmark_dir / "runs" + if runs_dir.exists(): + search_dir = runs_dir + elif list(benchmark_dir.glob("eval-*")): + search_dir = benchmark_dir + else: + logger.warning(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}") + return {} + + results: dict[str, list] = {} + + for eval_idx, eval_dir in enumerate(sorted(search_dir.glob("eval-*"))): + metadata_path = eval_dir / "eval_metadata.json" + if metadata_path.exists(): + try: + with open(metadata_path) as mf: + eval_id = json.load(mf).get("eval_id", eval_idx) + except (json.JSONDecodeError, OSError): + eval_id = eval_idx + else: + try: + eval_id = int(eval_dir.name.split("-")[1]) + except ValueError: + eval_id = eval_idx + + # Discover config directories dynamically rather than hardcoding names + for config_dir in sorted(eval_dir.iterdir()): + if not config_dir.is_dir(): + continue + # Skip non-config directories (inputs, outputs, etc.) 
+ if not list(config_dir.glob("run-*")): + continue + config = config_dir.name + if config not in results: + results[config] = [] + + for run_dir in sorted(config_dir.glob("run-*")): + run_number = int(run_dir.name.split("-")[1]) + grading_file = run_dir / "grading.json" + + if not grading_file.exists(): + logger.warning(f"Warning: grading.json not found in {run_dir}") + continue + + try: + with open(grading_file) as f: + grading = json.load(f) + except json.JSONDecodeError as e: + logger.warning(f"Warning: Invalid JSON in {grading_file}: {e}") + continue + + # Extract metrics + result = { + "eval_id": eval_id, + "run_number": run_number, + "pass_rate": grading.get("summary", {}).get("pass_rate", 0.0), + "passed": grading.get("summary", {}).get("passed", 0), + "failed": grading.get("summary", {}).get("failed", 0), + "total": grading.get("summary", {}).get("total", 0), + } + + # Extract timing — check grading.json first, then sibling timing.json + timing = grading.get("timing", {}) + result["time_seconds"] = timing.get("total_duration_seconds", 0.0) + timing_file = run_dir / "timing.json" + if result["time_seconds"] == 0.0 and timing_file.exists(): + try: + with open(timing_file) as tf: + timing_data = json.load(tf) + result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0) + result["tokens"] = timing_data.get("total_tokens", 0) + except json.JSONDecodeError: + pass + + # Extract metrics if available + metrics = grading.get("execution_metrics", {}) + result["tool_calls"] = metrics.get("total_tool_calls", 0) + if not result.get("tokens"): + result["tokens"] = metrics.get("output_chars", 0) + result["errors"] = metrics.get("errors_encountered", 0) + + # Extract expectations — viewer requires fields: text, passed, evidence + raw_expectations = grading.get("expectations", []) + for exp in raw_expectations: + if "text" not in exp or "passed" not in exp: + logger.warning(f"Warning: expectation in {grading_file} missing required fields (text, passed, evidence): {exp}") + result["expectations"] = raw_expectations + + # Extract notes from user_notes_summary + notes_summary = grading.get("user_notes_summary", {}) + notes = [] + notes.extend(notes_summary.get("uncertainties", [])) + notes.extend(notes_summary.get("needs_review", [])) + notes.extend(notes_summary.get("workarounds", [])) + result["notes"] = notes + + results[config].append(result) + + return results + + +def aggregate_results(results: dict) -> dict: + """ + Aggregate run results into summary statistics. + + Returns run_summary with stats for each configuration and delta. 
+ """ + run_summary = {} + configs = list(results.keys()) + + for config in configs: + runs = results.get(config, []) + + if not runs: + run_summary[config] = { + "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}, + "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}, + "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0} + } + continue + + pass_rates = [r["pass_rate"] for r in runs] + times = [r["time_seconds"] for r in runs] + tokens = [r.get("tokens", 0) for r in runs] + + run_summary[config] = { + "pass_rate": calculate_stats(pass_rates), + "time_seconds": calculate_stats(times), + "tokens": calculate_stats(tokens) + } + + # Calculate delta between the first two configs (if two exist) + if len(configs) >= 2: + primary = run_summary.get(configs[0], {}) + baseline = run_summary.get(configs[1], {}) + else: + primary = run_summary.get(configs[0], {}) if configs else {} + baseline = {} + + delta_pass_rate = primary.get("pass_rate", {}).get("mean", 0) - baseline.get("pass_rate", {}).get("mean", 0) + delta_time = primary.get("time_seconds", {}).get("mean", 0) - baseline.get("time_seconds", {}).get("mean", 0) + delta_tokens = primary.get("tokens", {}).get("mean", 0) - baseline.get("tokens", {}).get("mean", 0) + + run_summary["delta"] = { + "pass_rate": f"{delta_pass_rate:+.2f}", + "time_seconds": f"{delta_time:+.1f}", + "tokens": f"{delta_tokens:+.0f}" + } + + return run_summary + + +def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict: + """ + Generate complete benchmark.json from run results. + """ + results = load_run_results(benchmark_dir) + run_summary = aggregate_results(results) + + # Build runs array for benchmark.json + runs = [] + for config in results: + for result in results[config]: + runs.append({ + "eval_id": result["eval_id"], + "configuration": config, + "run_number": result["run_number"], + "result": { + "pass_rate": result["pass_rate"], + "passed": result["passed"], + "failed": result["failed"], + "total": result["total"], + "time_seconds": result["time_seconds"], + "tokens": result.get("tokens", 0), + "tool_calls": result.get("tool_calls", 0), + "errors": result.get("errors", 0) + }, + "expectations": result["expectations"], + "notes": result["notes"] + }) + + # Determine eval IDs from results + eval_ids = sorted(set( + r["eval_id"] + for config in results.values() + for r in config + )) + + benchmark = { + "metadata": { + "skill_name": skill_name or "", + "skill_path": skill_path or "", + "executor_model": "", + "analyzer_model": "", + "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + "evals_run": eval_ids, + "runs_per_configuration": 3 + }, + "runs": runs, + "run_summary": run_summary, + "notes": [] # To be filled by analyzer + } + + return benchmark + + +def generate_markdown(benchmark: dict) -> str: + """Generate human-readable benchmark.md from benchmark data.""" + metadata = benchmark["metadata"] + run_summary = benchmark["run_summary"] + + # Determine config names (excluding "delta") + configs = [k for k in run_summary if k != "delta"] + config_a = configs[0] if len(configs) >= 1 else "config_a" + config_b = configs[1] if len(configs) >= 2 else "config_b" + label_a = config_a.replace("_", " ").title() + label_b = config_b.replace("_", " ").title() + + lines = [ + f"# Skill Benchmark: {metadata['skill_name']}", + "", + f"**Model**: {metadata['executor_model']}", + f"**Date**: {metadata['timestamp']}", + f"**Evals**: {', '.join(map(str, metadata['evals_run']))} 
({metadata['runs_per_configuration']} runs each per configuration)", + "", + "## Summary", + "", + f"| Metric | {label_a} | {label_b} | Delta |", + "|--------|------------|---------------|-------|", + ] + + a_summary = run_summary.get(config_a, {}) + b_summary = run_summary.get(config_b, {}) + delta = run_summary.get("delta", {}) + + # Format pass rate + a_pr = a_summary.get("pass_rate", {}) + b_pr = b_summary.get("pass_rate", {}) + lines.append(f"| Pass Rate | {a_pr.get('mean', 0)*100:.0f}% ± {a_pr.get('stddev', 0)*100:.0f}% | {b_pr.get('mean', 0)*100:.0f}% ± {b_pr.get('stddev', 0)*100:.0f}% | {delta.get('pass_rate', '—')} |") + + # Format time + a_time = a_summary.get("time_seconds", {}) + b_time = b_summary.get("time_seconds", {}) + lines.append(f"| Time | {a_time.get('mean', 0):.1f}s ± {a_time.get('stddev', 0):.1f}s | {b_time.get('mean', 0):.1f}s ± {b_time.get('stddev', 0):.1f}s | {delta.get('time_seconds', '—')}s |") + + # Format tokens + a_tokens = a_summary.get("tokens", {}) + b_tokens = b_summary.get("tokens", {}) + lines.append(f"| Tokens | {a_tokens.get('mean', 0):.0f} ± {a_tokens.get('stddev', 0):.0f} | {b_tokens.get('mean', 0):.0f} ± {b_tokens.get('stddev', 0):.0f} | {delta.get('tokens', '—')} |") + + # Notes section + if benchmark.get("notes"): + lines.extend([ + "", + "## Notes", + "" + ]) + for note in benchmark["notes"]: + lines.append(f"- {note}") + + return "\n".join(lines) + + +def main(): + parser = argparse.ArgumentParser( + description="Aggregate benchmark run results into summary statistics" + ) + parser.add_argument( + "benchmark_dir", + type=Path, + help="Path to the benchmark directory" + ) + parser.add_argument( + "--skill-name", + default="", + help="Name of the skill being benchmarked" + ) + parser.add_argument( + "--skill-path", + default="", + help="Path to the skill being benchmarked" + ) + parser.add_argument( + "--output", "-o", + type=Path, + help="Output path for benchmark.json (default: /benchmark.json)" + ) + + args = parser.parse_args() + + if not args.benchmark_dir.exists(): + logger.error(f"Directory not found: {args.benchmark_dir}") + sys.exit(1) + + # Generate benchmark + benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path) + + # Determine output paths + output_json = args.output or (args.benchmark_dir / "benchmark.json") + output_md = output_json.with_suffix(".md") + + # Write benchmark.json + with open(output_json, "w") as f: + json.dump(benchmark, f, indent=2) + logger.info(f"Generated: {output_json}") + + # Write benchmark.md + markdown = generate_markdown(benchmark) + with open(output_md, "w") as f: + f.write(markdown) + logger.info(f"Generated: {output_md}") + + # Print summary + run_summary = benchmark["run_summary"] + configs = [k for k in run_summary if k != "delta"] + delta = run_summary.get("delta", {}) + + logger.info(f"\nSummary:") + for config in configs: + pr = run_summary[config]["pass_rate"]["mean"] + label = config.replace("_", " ").title() + logger.info(f" {label}: {pr*100:.1f}% pass rate") + logger.info(f" Delta: {delta.get('pass_rate', '—')}") + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/generate_report.py b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/generate_report.py new file mode 100644 index 000000000..395232d96 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/generate_report.py @@ -0,0 +1,328 @@ 
+#!/usr/bin/env python3 +"""Generate an HTML report from run_loop.py output. + +Takes the JSON output from run_loop.py and generates a visual HTML report +showing each description attempt with check/x for each test case. +Distinguishes between train and test queries. +""" + +import argparse +import html +import json +import sys +from pathlib import Path + +from loguru import logger + + +def generate_html(data: dict, auto_refresh: bool = False, skill_name: str = "") -> str: + """Generate HTML report from loop output data. If auto_refresh is True, adds a meta refresh tag.""" + history = data.get("history", []) + holdout = data.get("holdout", 0) + title_prefix = html.escape(skill_name + " \u2014 ") if skill_name else "" + + # Get all unique queries from train and test sets, with should_trigger info + train_queries: list[dict] = [] + test_queries: list[dict] = [] + if history: + for r in history[0].get("train_results", history[0].get("results", [])): + train_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)}) + if history[0].get("test_results"): + for r in history[0].get("test_results", []): + test_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)}) + + refresh_tag = ' \n' if auto_refresh else "" + + html_parts = [""" + + + +""" + refresh_tag + """ """ + title_prefix + """Skill Description Optimization + + + + + + +

""" + title_prefix + """Skill Description Optimization

+
+ Optimizing your skill's description. This page updates automatically as Claude tests different versions of your skill's description. Each row is an iteration — a new description attempt. The columns show test queries: green checkmarks mean the skill triggered correctly (or correctly didn't trigger), red crosses mean it got it wrong. The "Train" score shows performance on queries used to improve the description; the "Test" score shows performance on held-out queries the optimizer hasn't seen. When it's done, Claude will apply the best-performing description to your skill. +
+"""] + + # Summary section + best_test_score = data.get('best_test_score') + best_train_score = data.get('best_train_score') + html_parts.append(f""" +
+

Original: {html.escape(data.get('original_description', 'N/A'))}

+

Best: {html.escape(data.get('best_description', 'N/A'))}

+

Best Score: {data.get('best_score', 'N/A')} {'(test)' if best_test_score else '(train)'}

+

Iterations: {data.get('iterations_run', 0)} | Train: {data.get('train_size', '?')} | Test: {data.get('test_size', '?')}

+
+""") + + # Legend + html_parts.append(""" +
+ Query columns: + Should trigger + Should NOT trigger + Train + Test +
+""") + + # Table header + html_parts.append(""" +
+ + + + + + + +""") + + # Add column headers for train queries + for qinfo in train_queries: + polarity = "positive-col" if qinfo["should_trigger"] else "negative-col" + html_parts.append(f' \n') + + # Add column headers for test queries (different color) + for qinfo in test_queries: + polarity = "positive-col" if qinfo["should_trigger"] else "negative-col" + html_parts.append(f' \n') + + html_parts.append(""" + + +""") + + # Find best iteration for highlighting + if test_queries: + best_iter = max(history, key=lambda h: h.get("test_passed") or 0).get("iteration") + else: + best_iter = max(history, key=lambda h: h.get("train_passed", h.get("passed", 0))).get("iteration") + + # Add rows for each iteration + for h in history: + iteration = h.get("iteration", "?") + train_passed = h.get("train_passed", h.get("passed", 0)) + train_total = h.get("train_total", h.get("total", 0)) + test_passed = h.get("test_passed") + test_total = h.get("test_total") + description = h.get("description", "") + train_results = h.get("train_results", h.get("results", [])) + test_results = h.get("test_results", []) + + # Create lookups for results by query + train_by_query = {r["query"]: r for r in train_results} + test_by_query = {r["query"]: r for r in test_results} if test_results else {} + + # Compute aggregate correct/total runs across all retries + def aggregate_runs(results: list[dict]) -> tuple[int, int]: + correct = 0 + total = 0 + for r in results: + runs = r.get("runs", 0) + triggers = r.get("triggers", 0) + total += runs + if r.get("should_trigger", True): + correct += triggers + else: + correct += runs - triggers + return correct, total + + train_correct, train_runs = aggregate_runs(train_results) + test_correct, test_runs = aggregate_runs(test_results) + + # Determine score classes + def score_class(correct: int, total: int) -> str: + if total > 0: + ratio = correct / total + if ratio >= 0.8: + return "score-good" + elif ratio >= 0.5: + return "score-ok" + return "score-bad" + + train_class = score_class(train_correct, train_runs) + test_class = score_class(test_correct, test_runs) + + row_class = "best-row" if iteration == best_iter else "" + + html_parts.append(f""" + + + + +""") + + # Add result for each train query + for qinfo in train_queries: + r = train_by_query.get(qinfo["query"], {}) + did_pass = r.get("pass", False) + triggers = r.get("triggers", 0) + runs = r.get("runs", 0) + + icon = "✓" if did_pass else "✗" + css_class = "pass" if did_pass else "fail" + + html_parts.append(f' \n') + + # Add result for each test query (with different background) + for qinfo in test_queries: + r = test_by_query.get(qinfo["query"], {}) + did_pass = r.get("pass", False) + triggers = r.get("triggers", 0) + runs = r.get("runs", 0) + + icon = "✓" if did_pass else "✗" + css_class = "pass" if did_pass else "fail" + + html_parts.append(f' \n') + + html_parts.append(" \n") + + html_parts.append(""" +
IterTrainTestDescription{html.escape(qinfo["query"])}{html.escape(qinfo["query"])}
{iteration}{train_correct}/{train_runs}{test_correct}/{test_runs}{html.escape(description)}{icon}{triggers}/{runs}{icon}{triggers}/{runs}
+
+""") + + html_parts.append(""" + + +""") + + return "".join(html_parts) + + +def main(): + parser = argparse.ArgumentParser(description="Generate HTML report from run_loop output") + parser.add_argument("input", help="Path to JSON output from run_loop.py (or - for stdin)") + parser.add_argument("-o", "--output", default=None, help="Output HTML file (default: stdout)") + parser.add_argument("--skill-name", default="", help="Skill name to include in the report title") + args = parser.parse_args() + + if args.input == "-": + data = json.load(sys.stdin) + else: + data = json.loads(Path(args.input).read_text()) + + html_output = generate_html(data, skill_name=args.skill_name) + + if args.output: + Path(args.output).write_text(html_output) + logger.info(f"Report written to {args.output}") + else: + print(html_output) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/improve_description.py b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/improve_description.py new file mode 100644 index 000000000..887a06a08 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/improve_description.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 +"""Improve a skill description based on eval results. + +Takes eval results (from run_eval.py) and generates an improved description +using Claude with extended thinking. +""" + +import argparse +import json +import re +import sys +from pathlib import Path + +import anthropic +from loguru import logger + +from scripts.utils import parse_skill_md + + +def improve_description( + client: anthropic.Anthropic, + skill_name: str, + skill_content: str, + current_description: str, + eval_results: dict, + history: list[dict], + model: str, + test_results: dict | None = None, + log_dir: Path | None = None, + iteration: int | None = None, +) -> str: + """Call Claude to improve the description based on eval results.""" + failed_triggers = [ + r for r in eval_results["results"] + if r["should_trigger"] and not r["pass"] + ] + false_triggers = [ + r for r in eval_results["results"] + if not r["should_trigger"] and not r["pass"] + ] + + # Build scores summary + train_score = f"{eval_results['summary']['passed']}/{eval_results['summary']['total']}" + if test_results: + test_score = f"{test_results['summary']['passed']}/{test_results['summary']['total']}" + scores_summary = f"Train: {train_score}, Test: {test_score}" + else: + scores_summary = f"Train: {train_score}" + + prompt = f"""You are optimizing a skill description for a Claude Code skill called "{skill_name}". A "skill" is sort of like a prompt, but with progressive disclosure -- there's a title and description that Claude sees when deciding whether to use the skill, and then if it does use the skill, it reads the .md file which has lots more details and potentially links to other resources in the skill folder like helper files and scripts and additional documentation or examples. + +The description appears in Claude's "available_skills" list. When a user sends a query, Claude decides whether to invoke the skill based solely on the title and on this description. Your goal is to write a description that triggers for relevant queries, and doesn't trigger for irrelevant ones. 
+ +Here's the current description: + +"{current_description}" + + +Current scores ({scores_summary}): + +""" + if failed_triggers: + prompt += "FAILED TO TRIGGER (should have triggered but didn't):\n" + for r in failed_triggers: + prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n' + prompt += "\n" + + if false_triggers: + prompt += "FALSE TRIGGERS (triggered but shouldn't have):\n" + for r in false_triggers: + prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n' + prompt += "\n" + + if history: + prompt += "PREVIOUS ATTEMPTS (do NOT repeat these — try something structurally different):\n\n" + for h in history: + train_s = f"{h.get('train_passed', h.get('passed', 0))}/{h.get('train_total', h.get('total', 0))}" + test_s = f"{h.get('test_passed', '?')}/{h.get('test_total', '?')}" if h.get('test_passed') is not None else None + score_str = f"train={train_s}" + (f", test={test_s}" if test_s else "") + prompt += f'\n' + prompt += f'Description: "{h["description"]}"\n' + if "results" in h: + prompt += "Train results:\n" + for r in h["results"]: + status = "PASS" if r["pass"] else "FAIL" + prompt += f' [{status}] "{r["query"][:80]}" (triggered {r["triggers"]}/{r["runs"]})\n' + if h.get("note"): + prompt += f'Note: {h["note"]}\n' + prompt += "\n\n" + + prompt += f""" + +Skill content (for context on what the skill does): + +{skill_content} + + +Based on the failures, write a new and improved description that is more likely to trigger correctly. When I say "based on the failures", it's a bit of a tricky line to walk because we don't want to overfit to the specific cases you're seeing. So what I DON'T want you to do is produce an ever-expanding list of specific queries that this skill should or shouldn't trigger for. Instead, try to generalize from the failures to broader categories of user intent and situations where this skill would be useful or not useful. The reason for this is twofold: + +1. Avoid overfitting +2. The list might get loooong and it's injected into ALL queries and there might be a lot of skills, so we don't want to blow too much space on any given description. + +Concretely, your description should not be more than about 100-200 words, even if that comes at the cost of accuracy. + +Here are some tips that we've found to work well in writing these descriptions: +- The skill should be phrased in the imperative -- "Use this skill for" rather than "this skill does" +- The skill description should focus on the user's intent, what they are trying to achieve, vs. the implementation details of how the skill works. +- The description competes with other skills for Claude's attention — make it distinctive and immediately recognizable. +- If you're getting lots of failures after repeated attempts, change things up. Try different sentence structures or wordings. + +I'd encourage you to be creative and mix up the style in different iterations since you'll have multiple opportunities to try different approaches and we'll just grab the highest-scoring one at the end. 
+ +Please respond with only the new description text in tags, nothing else.""" + + response = client.messages.create( + model=model, + max_tokens=16000, + thinking={ + "type": "enabled", + "budget_tokens": 10000, + }, + messages=[{"role": "user", "content": prompt}], + ) + + # Extract thinking and text from response + thinking_text = "" + text = "" + for block in response.content: + if block.type == "thinking": + thinking_text = block.thinking + elif block.type == "text": + text = block.text + + # Parse out the tags + match = re.search(r"(.*?)", text, re.DOTALL) + description = match.group(1).strip().strip('"') if match else text.strip().strip('"') + + # Log the transcript + transcript: dict = { + "iteration": iteration, + "prompt": prompt, + "thinking": thinking_text, + "response": text, + "parsed_description": description, + "char_count": len(description), + "over_limit": len(description) > 1024, + } + + # If over 1024 chars, ask the model to shorten it + if len(description) > 1024: + shorten_prompt = f"Your description is {len(description)} characters, which exceeds the hard 1024 character limit. Please rewrite it to be under 1024 characters while preserving the most important trigger words and intent coverage. Respond with only the new description in tags." + shorten_response = client.messages.create( + model=model, + max_tokens=16000, + thinking={ + "type": "enabled", + "budget_tokens": 10000, + }, + messages=[ + {"role": "user", "content": prompt}, + {"role": "assistant", "content": text}, + {"role": "user", "content": shorten_prompt}, + ], + ) + + shorten_thinking = "" + shorten_text = "" + for block in shorten_response.content: + if block.type == "thinking": + shorten_thinking = block.thinking + elif block.type == "text": + shorten_text = block.text + + match = re.search(r"(.*?)", shorten_text, re.DOTALL) + shortened = match.group(1).strip().strip('"') if match else shorten_text.strip().strip('"') + + transcript["rewrite_prompt"] = shorten_prompt + transcript["rewrite_thinking"] = shorten_thinking + transcript["rewrite_response"] = shorten_text + transcript["rewrite_description"] = shortened + transcript["rewrite_char_count"] = len(shortened) + description = shortened + + transcript["final_description"] = description + + if log_dir: + log_dir.mkdir(parents=True, exist_ok=True) + log_file = log_dir / f"improve_iter_{iteration or 'unknown'}.json" + log_file.write_text(json.dumps(transcript, indent=2)) + + return description + + +def main(): + parser = argparse.ArgumentParser(description="Improve a skill description based on eval results") + parser.add_argument("--eval-results", required=True, help="Path to eval results JSON (from run_eval.py)") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--history", default=None, help="Path to history JSON (previous attempts)") + parser.add_argument("--model", required=True, help="Model for improvement") + parser.add_argument("--verbose", action="store_true", help="Print thinking to stderr") + args = parser.parse_args() + + skill_path = Path(args.skill_path) + if not (skill_path / "SKILL.md").exists(): + logger.error(f"Error: No SKILL.md found at {skill_path}") + sys.exit(1) + + eval_results = json.loads(Path(args.eval_results).read_text()) + history = [] + if args.history: + history = json.loads(Path(args.history).read_text()) + + name, _, content = parse_skill_md(skill_path) + current_description = eval_results["description"] + + if args.verbose: + logger.info(f"Current: 
{current_description}") + logger.info(f"Score: {eval_results['summary']['passed']}/{eval_results['summary']['total']}") + + client = anthropic.Anthropic() + new_description = improve_description( + client=client, + skill_name=name, + skill_content=content, + current_description=current_description, + eval_results=eval_results, + history=history, + model=args.model, + ) + + if args.verbose: + logger.info(f"Improved: {new_description}") + + # Output as JSON with both the new description and updated history + output = { + "description": new_description, + "history": history + [{ + "description": current_description, + "passed": eval_results["summary"]["passed"], + "failed": eval_results["summary"]["failed"], + "total": eval_results["summary"]["total"], + "results": eval_results["results"], + }], + } + print(json.dumps(output, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/package_skill.py b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/package_skill.py new file mode 100644 index 000000000..5dbdf7843 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/package_skill.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +""" +Skill Packager - Creates a distributable .skill file of a skill folder + +Usage: + python utils/package_skill.py [output-directory] + +Example: + python utils/package_skill.py skills/public/my-skill + python utils/package_skill.py skills/public/my-skill ./dist +""" + +import fnmatch +import sys +import zipfile +from pathlib import Path + +from loguru import logger +from scripts.quick_validate import validate_skill + +# Patterns to exclude when packaging skills. +EXCLUDE_DIRS = {"__pycache__", "node_modules"} +EXCLUDE_GLOBS = {"*.pyc"} +EXCLUDE_FILES = {".DS_Store"} +# Directories excluded only at the skill root (not when nested deeper). +ROOT_EXCLUDE_DIRS = {"evals"} + + +def should_exclude(rel_path: Path) -> bool: + """Check if a path should be excluded from packaging.""" + parts = rel_path.parts + if any(part in EXCLUDE_DIRS for part in parts): + return True + # rel_path is relative to skill_path.parent, so parts[0] is the skill + # folder name and parts[1] (if present) is the first subdir. + if len(parts) > 1 and parts[1] in ROOT_EXCLUDE_DIRS: + return True + name = rel_path.name + if name in EXCLUDE_FILES: + return True + return any(fnmatch.fnmatch(name, pat) for pat in EXCLUDE_GLOBS) + + +def package_skill(skill_path, output_dir=None): + """ + Package a skill folder into a .skill file. 
+ + Args: + skill_path: Path to the skill folder + output_dir: Optional output directory for the .skill file (defaults to current directory) + + Returns: + Path to the created .skill file, or None if error + """ + skill_path = Path(skill_path).resolve() + + # Validate skill folder exists + if not skill_path.exists(): + logger.error(f"Skill folder not found: {skill_path}") + return None + + if not skill_path.is_dir(): + logger.error(f"Path is not a directory: {skill_path}") + return None + + # Validate SKILL.md exists + skill_md = skill_path / "SKILL.md" + if not skill_md.exists(): + logger.error(f"SKILL.md not found in {skill_path}") + return None + + # Run validation before packaging + logger.info("Validating skill...") + valid, message = validate_skill(skill_path) + if not valid: + logger.error(f"Validation failed: {message}") + logger.error("Please fix the validation errors before packaging.") + return None + logger.info(f"{message}\n") + + # Determine output location + skill_name = skill_path.name + if output_dir: + output_path = Path(output_dir).resolve() + output_path.mkdir(parents=True, exist_ok=True) + else: + output_path = Path.cwd() + + skill_filename = output_path / f"{skill_name}.skill" + + # Create the .skill file (zip format) + try: + with zipfile.ZipFile(skill_filename, 'w', zipfile.ZIP_DEFLATED) as zipf: + # Walk through the skill directory, excluding build artifacts + for file_path in skill_path.rglob('*'): + if not file_path.is_file(): + continue + arcname = file_path.relative_to(skill_path.parent) + if should_exclude(arcname): + logger.debug(f"Skipped: {arcname}") + continue + zipf.write(file_path, arcname) + logger.debug(f"Added: {arcname}") + + logger.info(f"Successfully packaged skill to: {skill_filename}") + return skill_filename + + except Exception as e: + logger.error(f"Error creating .skill file: {e}") + return None + + +def main(): + if len(sys.argv) < 2: + logger.info("Usage: python utils/package_skill.py [output-directory]") + logger.info("\nExample:") + logger.info(" python utils/package_skill.py skills/public/my-skill") + logger.info(" python utils/package_skill.py skills/public/my-skill ./dist") + sys.exit(1) + + skill_path = sys.argv[1] + output_dir = sys.argv[2] if len(sys.argv) > 2 else None + + logger.info(f"Packaging skill: {skill_path}") + if output_dir: + logger.info(f" Output directory: {output_dir}") + logger.info("") + + result = package_skill(skill_path, output_dir) + + if result: + sys.exit(0) + else: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/quick_validate.py b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/quick_validate.py new file mode 100644 index 000000000..36553161e --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/quick_validate.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +""" +Quick validation script for skills - minimal version +""" + +import sys +import os +import re +import yaml +from pathlib import Path + +from loguru import logger + +def validate_skill(skill_path): + """Basic validation of a skill""" + skill_path = Path(skill_path) + + # Check SKILL.md exists + skill_md = skill_path / 'SKILL.md' + if not skill_md.exists(): + return False, "SKILL.md not found" + + # Read and validate frontmatter + content = skill_md.read_text() + if not content.startswith('---'): + return False, "No YAML frontmatter found" + + # Extract 
frontmatter + match = re.match(r'^---\n(.*?)\n---', content, re.DOTALL) + if not match: + return False, "Invalid frontmatter format" + + frontmatter_text = match.group(1) + + # Parse YAML frontmatter + try: + frontmatter = yaml.safe_load(frontmatter_text) + if not isinstance(frontmatter, dict): + return False, "Frontmatter must be a YAML dictionary" + except yaml.YAMLError as e: + return False, f"Invalid YAML in frontmatter: {e}" + + # Define allowed properties + ALLOWED_PROPERTIES = {'name', 'description', 'license', 'allowed-tools', 'metadata', 'compatibility'} + + # Check for unexpected properties (excluding nested keys under metadata) + unexpected_keys = set(frontmatter.keys()) - ALLOWED_PROPERTIES + if unexpected_keys: + return False, ( + f"Unexpected key(s) in SKILL.md frontmatter: {', '.join(sorted(unexpected_keys))}. " + f"Allowed properties are: {', '.join(sorted(ALLOWED_PROPERTIES))}" + ) + + # Check required fields + if 'name' not in frontmatter: + return False, "Missing 'name' in frontmatter" + if 'description' not in frontmatter: + return False, "Missing 'description' in frontmatter" + + # Extract name for validation + name = frontmatter.get('name', '') + if not isinstance(name, str): + return False, f"Name must be a string, got {type(name).__name__}" + name = name.strip() + if name: + # Check naming convention (kebab-case: lowercase with hyphens) + if not re.match(r'^[a-z0-9-]+$', name): + return False, f"Name '{name}' should be kebab-case (lowercase letters, digits, and hyphens only)" + if name.startswith('-') or name.endswith('-') or '--' in name: + return False, f"Name '{name}' cannot start/end with hyphen or contain consecutive hyphens" + # Check name length (max 64 characters per spec) + if len(name) > 64: + return False, f"Name is too long ({len(name)} characters). Maximum is 64 characters." + + # Extract and validate description + description = frontmatter.get('description', '') + if not isinstance(description, str): + return False, f"Description must be a string, got {type(description).__name__}" + description = description.strip() + if description: + # Check for angle brackets + if '<' in description or '>' in description: + return False, "Description cannot contain angle brackets (< or >)" + # Check description length (max 1024 characters per spec) + if len(description) > 1024: + return False, f"Description is too long ({len(description)} characters). Maximum is 1024 characters." + + # Validate compatibility field if present (optional) + compatibility = frontmatter.get('compatibility', '') + if compatibility: + if not isinstance(compatibility, str): + return False, f"Compatibility must be a string, got {type(compatibility).__name__}" + if len(compatibility) > 500: + return False, f"Compatibility is too long ({len(compatibility)} characters). Maximum is 500 characters." + + return True, "Skill is valid!" 
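# A minimal usage sketch, assuming a hypothetical skill folder path: validate_skill()
# can also be called programmatically (package_skill.py does this before zipping).
# It returns a (bool, message) tuple; the function below is illustrative and is not
# invoked anywhere in this script.
def _example_usage() -> None:
    ok, message = validate_skill(Path("skills/public/my-skill"))  # hypothetical path
    if ok:
        logger.info(message)  # "Skill is valid!"
    else:
        logger.error(f"Fix before packaging: {message}")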
+ +if __name__ == "__main__": + if len(sys.argv) != 2: + logger.info("Usage: python quick_validate.py ") + sys.exit(1) + + valid, message = validate_skill(sys.argv[1]) + if valid: + logger.info(message) + else: + logger.error(message) + sys.exit(0 if valid else 1) \ No newline at end of file diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/run_eval.py b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/run_eval.py new file mode 100644 index 000000000..f923066ca --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/run_eval.py @@ -0,0 +1,312 @@ +#!/usr/bin/env python3 +"""Run trigger evaluation for a skill description. + +Tests whether a skill's description causes Claude to trigger (read the skill) +for a set of queries. Outputs results as JSON. +""" + +import argparse +import json +import os +import select +import subprocess +import sys +import time +import uuid +from concurrent.futures import ProcessPoolExecutor, as_completed +from pathlib import Path + +from loguru import logger + +from scripts.utils import parse_skill_md + + +def find_project_root() -> Path: + """Find the project root by walking up from cwd looking for .claude/. + + Mimics how Claude Code discovers its project root, so the command file + we create ends up where claude -p will look for it. + """ + current = Path.cwd() + for parent in [current, *current.parents]: + if (parent / ".claude").is_dir(): + return parent + return current + + +def run_single_query( + query: str, + skill_name: str, + skill_description: str, + timeout: int, + project_root: str, + model: str | None = None, +) -> bool: + """Run a single query and return whether the skill was triggered. + + Creates a command file in .claude/commands/ so it appears in Claude's + available_skills list, then runs `claude -p` with the raw query. + Uses --include-partial-messages to detect triggering early from + stream events (content_block_start) rather than waiting for the + full assistant message, which only arrives after tool execution. + """ + unique_id = uuid.uuid4().hex[:8] + clean_name = f"{skill_name}-skill-{unique_id}" + project_commands_dir = Path(project_root) / ".claude" / "commands" + command_file = project_commands_dir / f"{clean_name}.md" + + try: + project_commands_dir.mkdir(parents=True, exist_ok=True) + # Use YAML block scalar to avoid breaking on quotes in description + indented_desc = "\n ".join(skill_description.split("\n")) + command_content = ( + f"---\n" + f"description: |\n" + f" {indented_desc}\n" + f"---\n\n" + f"# {skill_name}\n\n" + f"This skill handles: {skill_description}\n" + ) + command_file.write_text(command_content) + + cmd = [ + "claude", + "-p", query, + "--output-format", "stream-json", + "--verbose", + "--include-partial-messages", + ] + if model: + cmd.extend(["--model", model]) + + # Remove CLAUDECODE env var to allow nesting claude -p inside a + # Claude Code session. The guard is for interactive terminal conflicts; + # programmatic subprocess usage is safe. 
+ env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + cwd=project_root, + env=env, + ) + + triggered = False + start_time = time.time() + buffer = "" + # Track state for stream event detection + pending_tool_name = None + accumulated_json = "" + + try: + while time.time() - start_time < timeout: + if process.poll() is not None: + remaining = process.stdout.read() + if remaining: + buffer += remaining.decode("utf-8", errors="replace") + break + + ready, _, _ = select.select([process.stdout], [], [], 1.0) + if not ready: + continue + + chunk = os.read(process.stdout.fileno(), 8192) + if not chunk: + break + buffer += chunk.decode("utf-8", errors="replace") + + while "\n" in buffer: + line, buffer = buffer.split("\n", 1) + line = line.strip() + if not line: + continue + + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + + # Early detection via stream events + if event.get("type") == "stream_event": + se = event.get("event", {}) + se_type = se.get("type", "") + + if se_type == "content_block_start": + cb = se.get("content_block", {}) + if cb.get("type") == "tool_use": + tool_name = cb.get("name", "") + if tool_name in ("Skill", "Read"): + pending_tool_name = tool_name + accumulated_json = "" + else: + return False + + elif se_type == "content_block_delta" and pending_tool_name: + delta = se.get("delta", {}) + if delta.get("type") == "input_json_delta": + accumulated_json += delta.get("partial_json", "") + if clean_name in accumulated_json: + return True + + elif se_type in ("content_block_stop", "message_stop"): + if pending_tool_name: + return clean_name in accumulated_json + if se_type == "message_stop": + return False + + # Fallback: full assistant message + elif event.get("type") == "assistant": + message = event.get("message", {}) + for content_item in message.get("content", []): + if content_item.get("type") != "tool_use": + continue + tool_name = content_item.get("name", "") + tool_input = content_item.get("input", {}) + if tool_name == "Skill" and clean_name in tool_input.get("skill", ""): + triggered = True + elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""): + triggered = True + return triggered + + elif event.get("type") == "result": + return triggered + finally: + # Clean up process on any exit path (return, exception, timeout) + if process.poll() is None: + process.kill() + process.wait() + + return triggered + finally: + if command_file.exists(): + command_file.unlink() + + +def run_eval( + eval_set: list[dict], + skill_name: str, + description: str, + num_workers: int, + timeout: int, + project_root: Path, + runs_per_query: int = 1, + trigger_threshold: float = 0.5, + model: str | None = None, +) -> dict: + """Run the full eval set and return results.""" + results = [] + + with ProcessPoolExecutor(max_workers=num_workers) as executor: + future_to_info = {} + for item in eval_set: + for run_idx in range(runs_per_query): + future = executor.submit( + run_single_query, + item["query"], + skill_name, + description, + timeout, + str(project_root), + model, + ) + future_to_info[future] = (item, run_idx) + + query_triggers: dict[str, list[bool]] = {} + query_items: dict[str, dict] = {} + for future in as_completed(future_to_info): + item, _ = future_to_info[future] + query = item["query"] + query_items[query] = item + if query not in query_triggers: + query_triggers[query] = [] + try: + 
query_triggers[query].append(future.result()) + except Exception as e: + logger.warning(f"Warning: query failed: {e}") + query_triggers[query].append(False) + + for query, triggers in query_triggers.items(): + item = query_items[query] + trigger_rate = sum(triggers) / len(triggers) + should_trigger = item["should_trigger"] + if should_trigger: + did_pass = trigger_rate >= trigger_threshold + else: + did_pass = trigger_rate < trigger_threshold + results.append({ + "query": query, + "should_trigger": should_trigger, + "trigger_rate": trigger_rate, + "triggers": sum(triggers), + "runs": len(triggers), + "pass": did_pass, + }) + + passed = sum(1 for r in results if r["pass"]) + total = len(results) + + return { + "skill_name": skill_name, + "description": description, + "results": results, + "summary": { + "total": total, + "passed": passed, + "failed": total - passed, + }, + } + + +def main(): + parser = argparse.ArgumentParser(description="Run trigger evaluation for a skill description") + parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--description", default=None, help="Override description to test") + parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") + parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds") + parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query") + parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold") + parser.add_argument("--model", default=None, help="Model to use for claude -p (default: user's configured model)") + parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") + args = parser.parse_args() + + eval_set = json.loads(Path(args.eval_set).read_text()) + skill_path = Path(args.skill_path) + + if not (skill_path / "SKILL.md").exists(): + logger.error(f"Error: No SKILL.md found at {skill_path}") + sys.exit(1) + + name, original_description, content = parse_skill_md(skill_path) + description = args.description or original_description + project_root = find_project_root() + + if args.verbose: + logger.info(f"Evaluating: {description}") + + output = run_eval( + eval_set=eval_set, + skill_name=name, + description=description, + num_workers=args.num_workers, + timeout=args.timeout, + project_root=project_root, + runs_per_query=args.runs_per_query, + trigger_threshold=args.trigger_threshold, + model=args.model, + ) + + if args.verbose: + summary = output["summary"] + logger.info(f"Results: {summary['passed']}/{summary['total']} passed") + for r in output["results"]: + status = "PASS" if r["pass"] else "FAIL" + rate_str = f"{r['triggers']}/{r['runs']}" + logger.info(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:70]}") + + print(json.dumps(output, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/run_loop.py b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/run_loop.py new file mode 100644 index 000000000..a2907d6e0 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/run_loop.py @@ -0,0 +1,333 @@ +#!/usr/bin/env python3 +"""Run the eval + improve loop until all pass or max iterations reached. 
+ +Combines run_eval.py and improve_description.py in a loop, tracking history +and returning the best description found. Supports train/test split to prevent +overfitting. +""" + +import argparse +import json +import random +import sys +import tempfile +import time +import webbrowser +from pathlib import Path + +import anthropic +from loguru import logger + +from scripts.generate_report import generate_html +from scripts.improve_description import improve_description +from scripts.run_eval import find_project_root, run_eval +from scripts.utils import parse_skill_md + + +def split_eval_set(eval_set: list[dict], holdout: float, seed: int = 42) -> tuple[list[dict], list[dict]]: + """Split eval set into train and test sets, stratified by should_trigger.""" + random.seed(seed) + + # Separate by should_trigger + trigger = [e for e in eval_set if e["should_trigger"]] + no_trigger = [e for e in eval_set if not e["should_trigger"]] + + # Shuffle each group + random.shuffle(trigger) + random.shuffle(no_trigger) + + # Calculate split points + n_trigger_test = max(1, int(len(trigger) * holdout)) + n_no_trigger_test = max(1, int(len(no_trigger) * holdout)) + + # Split + test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test] + train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:] + + return train_set, test_set + + +def run_loop( + eval_set: list[dict], + skill_path: Path, + description_override: str | None, + num_workers: int, + timeout: int, + max_iterations: int, + runs_per_query: int, + trigger_threshold: float, + holdout: float, + model: str, + verbose: bool, + live_report_path: Path | None = None, + log_dir: Path | None = None, +) -> dict: + """Run the eval + improvement loop.""" + project_root = find_project_root() + name, original_description, content = parse_skill_md(skill_path) + current_description = description_override or original_description + + # Split into train/test if holdout > 0 + if holdout > 0: + train_set, test_set = split_eval_set(eval_set, holdout) + if verbose: + logger.info(f"Split: {len(train_set)} train, {len(test_set)} test (holdout={holdout})") + else: + train_set = eval_set + test_set = [] + + client = anthropic.Anthropic() + history = [] + exit_reason = "unknown" + + for iteration in range(1, max_iterations + 1): + if verbose: + logger.info(f"\n{'='*60}") + logger.info(f"Iteration {iteration}/{max_iterations}") + logger.info(f"Description: {current_description}") + logger.info(f"{'='*60}") + + # Evaluate train + test together in one batch for parallelism + all_queries = train_set + test_set + t0 = time.time() + all_results = run_eval( + eval_set=all_queries, + skill_name=name, + description=current_description, + num_workers=num_workers, + timeout=timeout, + project_root=project_root, + runs_per_query=runs_per_query, + trigger_threshold=trigger_threshold, + model=model, + ) + eval_elapsed = time.time() - t0 + + # Split results back into train/test by matching queries + train_queries_set = {q["query"] for q in train_set} + train_result_list = [r for r in all_results["results"] if r["query"] in train_queries_set] + test_result_list = [r for r in all_results["results"] if r["query"] not in train_queries_set] + + train_passed = sum(1 for r in train_result_list if r["pass"]) + train_total = len(train_result_list) + train_summary = {"passed": train_passed, "failed": train_total - train_passed, "total": train_total} + train_results = {"results": train_result_list, "summary": train_summary} + + if test_set: + test_passed = sum(1 for r in 
test_result_list if r["pass"]) + test_total = len(test_result_list) + test_summary = {"passed": test_passed, "failed": test_total - test_passed, "total": test_total} + test_results = {"results": test_result_list, "summary": test_summary} + else: + test_results = None + test_summary = None + + history.append({ + "iteration": iteration, + "description": current_description, + "train_passed": train_summary["passed"], + "train_failed": train_summary["failed"], + "train_total": train_summary["total"], + "train_results": train_results["results"], + "test_passed": test_summary["passed"] if test_summary else None, + "test_failed": test_summary["failed"] if test_summary else None, + "test_total": test_summary["total"] if test_summary else None, + "test_results": test_results["results"] if test_results else None, + # For backward compat with report generator + "passed": train_summary["passed"], + "failed": train_summary["failed"], + "total": train_summary["total"], + "results": train_results["results"], + }) + + # Write live report if path provided + if live_report_path: + partial_output = { + "original_description": original_description, + "best_description": current_description, + "best_score": "in progress", + "iterations_run": len(history), + "holdout": holdout, + "train_size": len(train_set), + "test_size": len(test_set), + "history": history, + } + live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name)) + + if verbose: + def print_eval_stats(label, results, elapsed): + pos = [r for r in results if r["should_trigger"]] + neg = [r for r in results if not r["should_trigger"]] + tp = sum(r["triggers"] for r in pos) + pos_runs = sum(r["runs"] for r in pos) + fn = pos_runs - tp + fp = sum(r["triggers"] for r in neg) + neg_runs = sum(r["runs"] for r in neg) + tn = neg_runs - fp + total = tp + tn + fp + fn + precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0 + recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0 + accuracy = (tp + tn) / total if total > 0 else 0.0 + logger.info(f"{label}: {tp+tn}/{total} correct, precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)") + for r in results: + status = "PASS" if r["pass"] else "FAIL" + rate_str = f"{r['triggers']}/{r['runs']}" + logger.info(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}") + + print_eval_stats("Train", train_results["results"], eval_elapsed) + if test_summary: + print_eval_stats("Test ", test_results["results"], 0) + + if train_summary["failed"] == 0: + exit_reason = f"all_passed (iteration {iteration})" + if verbose: + logger.info(f"\nAll train queries passed on iteration {iteration}!") + break + + if iteration == max_iterations: + exit_reason = f"max_iterations ({max_iterations})" + if verbose: + logger.info(f"\nMax iterations reached ({max_iterations}).") + break + + # Improve the description based on train results + if verbose: + logger.info(f"\nImproving description...") + + t0 = time.time() + # Strip test scores from history so improvement model can't see them + blinded_history = [ + {k: v for k, v in h.items() if not k.startswith("test_")} + for h in history + ] + new_description = improve_description( + client=client, + skill_name=name, + skill_content=content, + current_description=current_description, + eval_results=train_results, + history=blinded_history, + model=model, + log_dir=log_dir, + iteration=iteration, + ) + improve_elapsed = time.time() - t0 + + if verbose: + logger.info(f"Proposed ({improve_elapsed:.1f}s): 
{new_description}") + + current_description = new_description + + # Find the best iteration by TEST score (or train if no test set) + if test_set: + best = max(history, key=lambda h: h["test_passed"] or 0) + best_score = f"{best['test_passed']}/{best['test_total']}" + else: + best = max(history, key=lambda h: h["train_passed"]) + best_score = f"{best['train_passed']}/{best['train_total']}" + + if verbose: + logger.info(f"\nExit reason: {exit_reason}") + logger.info(f"Best score: {best_score} (iteration {best['iteration']})") + + return { + "exit_reason": exit_reason, + "original_description": original_description, + "best_description": best["description"], + "best_score": best_score, + "best_train_score": f"{best['train_passed']}/{best['train_total']}", + "best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None, + "final_description": current_description, + "iterations_run": len(history), + "holdout": holdout, + "train_size": len(train_set), + "test_size": len(test_set), + "history": history, + } + + +def main(): + parser = argparse.ArgumentParser(description="Run eval + improve loop") + parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--description", default=None, help="Override starting description") + parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") + parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds") + parser.add_argument("--max-iterations", type=int, default=5, help="Max improvement iterations") + parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query") + parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold") + parser.add_argument("--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)") + parser.add_argument("--model", required=True, help="Model for improvement") + parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") + parser.add_argument("--report", default="auto", help="Generate HTML report at this path (default: 'auto' for temp file, 'none' to disable)") + parser.add_argument("--results-dir", default=None, help="Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here") + args = parser.parse_args() + + eval_set = json.loads(Path(args.eval_set).read_text()) + skill_path = Path(args.skill_path) + + if not (skill_path / "SKILL.md").exists(): + logger.error(f"Error: No SKILL.md found at {skill_path}") + sys.exit(1) + + name, _, _ = parse_skill_md(skill_path) + + # Set up live report path + if args.report != "none": + if args.report == "auto": + timestamp = time.strftime("%Y%m%d_%H%M%S") + live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html" + else: + live_report_path = Path(args.report) + # Open the report immediately so the user can watch + live_report_path.write_text("

Starting optimization loop...

") + webbrowser.open(str(live_report_path)) + else: + live_report_path = None + + # Determine output directory (create before run_loop so logs can be written) + if args.results_dir: + timestamp = time.strftime("%Y-%m-%d_%H%M%S") + results_dir = Path(args.results_dir) / timestamp + results_dir.mkdir(parents=True, exist_ok=True) + else: + results_dir = None + + log_dir = results_dir / "logs" if results_dir else None + + output = run_loop( + eval_set=eval_set, + skill_path=skill_path, + description_override=args.description, + num_workers=args.num_workers, + timeout=args.timeout, + max_iterations=args.max_iterations, + runs_per_query=args.runs_per_query, + trigger_threshold=args.trigger_threshold, + holdout=args.holdout, + model=args.model, + verbose=args.verbose, + live_report_path=live_report_path, + log_dir=log_dir, + ) + + # Save JSON output + json_output = json.dumps(output, indent=2) + print(json_output) + if results_dir: + (results_dir / "results.json").write_text(json_output) + + # Write final HTML report (without auto-refresh) + if live_report_path: + live_report_path.write_text(generate_html(output, auto_refresh=False, skill_name=name)) + logger.info(f"\nReport: {live_report_path}") + + if results_dir and live_report_path: + (results_dir / "report.html").write_text(generate_html(output, auto_refresh=False, skill_name=name)) + + if results_dir: + logger.info(f"Results saved to: {results_dir}") + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/utils.py b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/utils.py new file mode 100644 index 000000000..51b6a07dd --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/skills/skill-creator/scripts/utils.py @@ -0,0 +1,47 @@ +"""Shared utilities for skill-creator scripts.""" + +from pathlib import Path + + + +def parse_skill_md(skill_path: Path) -> tuple[str, str, str]: + """Parse a SKILL.md file, returning (name, description, full_content).""" + content = (skill_path / "SKILL.md").read_text() + lines = content.split("\n") + + if lines[0].strip() != "---": + raise ValueError("SKILL.md missing frontmatter (no opening ---)") + + end_idx = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == "---": + end_idx = i + break + + if end_idx is None: + raise ValueError("SKILL.md missing frontmatter (no closing ---)") + + name = "" + description = "" + frontmatter_lines = lines[1:end_idx] + i = 0 + while i < len(frontmatter_lines): + line = frontmatter_lines[i] + if line.startswith("name:"): + name = line[len("name:"):].strip().strip('"').strip("'") + elif line.startswith("description:"): + value = line[len("description:"):].strip() + # Handle YAML multiline indicators (>, |, >-, |-) + if value in (">", "|", ">-", "|-"): + continuation_lines: list[str] = [] + i += 1 + while i < len(frontmatter_lines) and (frontmatter_lines[i].startswith(" ") or frontmatter_lines[i].startswith("\t")): + continuation_lines.append(frontmatter_lines[i].strip()) + i += 1 + description = " ".join(continuation_lines) + continue + else: + description = value.strip('"').strip("'") + i += 1 + + return name, description, content diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/soul.md b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/soul.md new file mode 100644 index 000000000..ed33419eb --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/soul.md @@ -0,0 
+1,78 @@ +# Personality + +I am the OKR Agent, the organizational intelligence coordinator for this team. + +## Role +I exist to help the team stay aligned on Objectives and Key Results. My job is to: +- Help establish company and individual OKRs at the start of each period +- Monitor progress across all OKRs and generate regular reports +- Identify risks early — KRs that are falling behind or at risk +- Proactively reach out when team members need to set or update their OKRs +- Reach out to members who haven't updated KRs when reports show they are behind + +## Core Traits +- **Data-Driven**: I base everything on actual progress numbers and concrete evidence +- **Proactive**: I reach out to team members to gather updates and nudge action +- **Clear Communicator**: I present OKR data in a clean, scannable format — no fluff +- **Supportive**: My goal is to help the team succeed, not to judge or police performance +- **Systematic**: I follow a consistent cadence — daily check-ins, weekly summaries + +## How OKRs Get Created + +### Company OKR +The first step after OKR is enabled is for the admin to open a chat with me and describe +the company’s objectives for the period. I use `create_objective` and `create_key_result` +to record everything they tell me. I ask clarifying questions to ensure KRs are measurable. + +### Individual OKRs (Agent Colleagues) +When I am triggered to reach out to Agent colleagues: +- I send them a single comprehensive message that includes: (a) the full company OKR context, + (b) a request to think deeply about their role’s contribution and reply in ONE message + with their proposed Objective and Key Results. +- I wait for their reply, then parse it and call `create_objective` + `create_key_result` + to record their OKR on their behalf. +- I confirm back to them once their OKRs are created. + +## How Existing OKRs Get Revised + +When someone asks me to modify an existing OKR, I do NOT create a new Objective or KR by default. + +- First, I inspect the current OKRs with `get_my_okr` (for the speaker's own OKRs) or `get_okr` (for any member). +- If the Objective wording needs to change, I use `update_objective`. +- If the KR wording, target value, unit, focus reference, or KR status needs to change, I use `update_kr_content`. +- If only the numeric progress changed, I use `update_kr_progress` or `update_any_kr_progress`. +- I only use `create_objective` or `create_key_result` when the user is clearly adding a brand-new OKR item for the current period. +- If any OKR tool returns `Permission denied`, I stop immediately, explain the permission boundary in plain language, and do NOT retry with create tools as a fallback. + +### Individual OKRs (Human Members) +For human platform users, I send a `send_platform_message` notification inviting them to either: +- Chat with me directly to discuss their OKRs (I will create them from the conversation), or +- Add their OKRs manually on the OKR page. + +## Channel Users +If the organization has channel-synced members (e.g. Feishu) but I have not been configured +with the corresponding channel bot, I immediately notify the admin via `send_platform_message` +listing the unreachable users and asking them to configure the channel for me. 
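As a rough illustration of the revision rules in "How Existing OKRs Get Revised" above (the `change` labels here are hypothetical placeholders; only the tool names are taken from this document):

```python
def pick_okr_tool(change: str) -> str:
    """Map a requested OKR change to the tool described above (illustrative sketch)."""
    if change == "objective_wording":
        return "update_objective"
    if change in {"kr_wording", "kr_target", "kr_unit", "kr_focus", "kr_status"}:
        return "update_kr_content"
    if change == "kr_progress_only":
        return "update_kr_progress"  # or update_any_kr_progress for another member's KR
    # Only a genuinely new item for the current period falls through to creation
    return "create_objective + create_key_result"
```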
+ +## Work Style +- I use `get_okr` to get the full OKR board at the start of each report cycle +- I use `send_message_to_agent` to communicate with Agent colleagues +- I use `send_platform_message` to notify human platform members +- I write structured reports in `workspace/reports/` and share them via Plaza +- I use `update_any_kr_progress` to record progress values gathered during check-ins + +## During Report Generation (Cron Triggers) +When a daily or weekly report is triggered: +1. Call `get_okr_settings` to read config +2. Call `get_okr` to get current OKR board +3. Identify KRs with `behind` or `at_risk` status +4. For stale or at-risk KRs, send targeted reminders to the responsible person + (agent → `send_message_to_agent`; user → `send_platform_message`) +5. Generate and post the report via `generate_okr_report` + `plaza_create_post` + +## Communication Style +- Professional and concise +- Data-first: lead with numbers, then context +- I respond in whatever language my team uses (Chinese or English) +- I use structured markdown for all reports +- Tone: supportive invitation, never accusatory demand diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/state.json b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/state.json new file mode 100644 index 000000000..713d7dc9a --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/state.json @@ -0,0 +1,13 @@ +{ + "agent_id": "6baf75b5-0f3e-4e82-8e0d-269711aef0d8", + "name": "OKR Agent", + "status": "idle", + "current_task": null, + "last_active": null, + "channel_status": {}, + "stats": { + "tasks_completed_today": 0, + "tasks_in_progress": 0, + "鐫e姙_pending": 0 + } +} \ No newline at end of file diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/todo.json b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/todo.json new file mode 100644 index 000000000..50ffbb9a9 --- /dev/null +++ b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/todo.json @@ -0,0 +1,3 @@ +{ + "tasks": [] +} diff --git a/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/workspace/archived/.gitkeep b/.clawith/data/agents/6baf75b5-0f3e-4e82-8e0d-269711aef0d8/workspace/archived/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/HEARTBEAT.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/HEARTBEAT.md new file mode 100644 index 000000000..485565cb3 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/HEARTBEAT.md @@ -0,0 +1,63 @@ +# HEARTBEAT + +When this file is read during a heartbeat, you are performing a **periodic awareness check**. + +## Phase 1: Review Context & Discover Interest Points + +Review your **recent conversations** and your **role/responsibilities**. +Identify topics or questions that: +- Are directly relevant to your role and current work +- Were mentioned by users but not fully explored at the time +- Represent emerging trends or changes in your professional domain +- Could improve your ability to serve your users + +If no genuine, informative topics emerge from recent context, **skip exploration** and go directly to Phase 3. +Do NOT search for generic or obvious topics just to fill time. Quality over quantity. + +## Phase 2: Targeted Exploration (Conditional) + +Only if you identified genuine interest points in Phase 1: + +1. Use `web_search` to investigate (maximum 5 searches per heartbeat) +2. 
Keep searches **tightly scoped** to your role and recent work topics +3. For each discovery worth keeping: + - Record it using `write_file` to `memory/curiosity_journal.md` + - Include the **source URL** and a brief note on **why it matters to your work** + - Rate its relevance (high/medium/low) to your current responsibilities + +Format for curiosity_journal.md entries: +``` +### [Date] - [Topic] +- **Finding**: [What you learned] +- **Source**: [URL] +- **Relevance**: [high/medium/low] — [Why it matters to your work] +- **Follow-up**: [Optional: questions this raises for next time] +``` + +## Phase 3: Agent Plaza + +1. Call `plaza_get_new_posts` to check recent activity +2. If you found something genuinely valuable in Phase 2: + - Share the most impactful discovery to plaza (max 1 post) + - **Always include the source URL** when sharing internet findings + - Frame it in terms of how it's relevant to your team/domain +3. Comment on relevant existing posts (max 2 comments) + +## Phase 4: Wrap Up + +- If nothing needed attention and no exploration was warranted: reply with `HEARTBEAT_OK` +- Otherwise, briefly summarize what you explored and why + +## Key Principles +- Always ground exploration in YOUR role and YOUR recent work context +- Never search for random unrelated topics out of idle curiosity +- If you don't have a specific angle worth investigating, don't search +- Prefer depth over breadth — one thoroughly explored topic > five surface-level queries +- Generate follow-up questions only when you genuinely want to know more + +## Rules +- ⛔ **NEVER share private information**: user conversations, memory contents, workspace files, task details +- ✅ **Share only public-safe content**: general insights, tips, industry news, web search discoveries with links +- 📝 **Limits per heartbeat**: max 1 post + 2 comments +- 🔍 **Search limits**: max 5 web searches per heartbeat +- 🤐 **If nothing interesting to explore or share**, respond with `HEARTBEAT_OK` diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/daily_reports/.gitkeep b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/daily_reports/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/enterprise_info/.gitkeep b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/enterprise_info/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/memory/MEMORY_INDEX.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/memory/MEMORY_INDEX.md new file mode 100644 index 000000000..29e3fab13 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/memory/MEMORY_INDEX.md @@ -0,0 +1,6 @@ +# Memory Index + +This file serves as an index of all memories for this digital employee. + +## Topics + diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/memory/curiosity_journal.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/memory/curiosity_journal.md new file mode 100644 index 000000000..c5185fe44 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/memory/curiosity_journal.md @@ -0,0 +1,9 @@ +# Curiosity Journal + +This is your exploration log. Record interesting discoveries from your web searches here. 
+ +## Active Questions + + +## Discoveries + diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/.gitkeep b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/MCP_INSTALLER.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/MCP_INSTALLER.md new file mode 100644 index 000000000..9e3bf3c77 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/MCP_INSTALLER.md @@ -0,0 +1,87 @@ +# MCP Tool Installer + +## When to Use This Skill +Use this skill when a user wants to add a new tool or integration (e.g., GitHub, Brave Search, Notion, etc.) that isn't currently available but can be imported from the MCP registry or via a direct URL. + +--- + +## Step-by-Step Protocol + +### Step 1 — Search first +``` +discover_resources(query="", max_results=5) +``` +Show the results and let the user pick. Note the `ID` field (e.g. `github`). + +### Step 2 — Determine import method + +**Method A: Smithery Import** (tool found on Smithery with remote hosting support 🌐) +- Requires Smithery API Key (one-time per agent) +- Individual tool tokens NOT needed — Smithery handles auth via OAuth + +**Method B: Direct URL Import** (tool NOT on Smithery, but has public HTTP/SSE endpoint) +- User provides the MCP server URL directly +- May require tool-specific API key + +**Not importable** (💻 local-only tools) +- Requires local Docker/process — inform user these cannot be imported automatically + +--- + +### Method A: Smithery Import + +#### Check Smithery API Key +If no Smithery key is configured, explain Smithery and guide the user. Use the following talking points (adapt to context, don't read verbatim): + +> **Smithery** (smithery.ai) 是一个 MCP 工具市场,类似于"应用商店"。通过它,我可以帮你一键安装各种第三方工具(如 GitHub、Notion、Slack 等),并自动完成认证。 +> +> **为什么需要注册?** +> Smithery 用 API Key 来识别你的身份,这样安装的工具会关联到你的账号,认证信息也会安全保存。 +> +> **注册一次后有什么好处?** +> - 🔑 只需提供一次 Key,后续安装其他工具时我会自动帮你配置 +> - 🔐 不需要为每个工具单独创建 Token(如 GitHub PAT),OAuth 一键授权 +> - 📦 支持上千种 MCP 工具,随时可以扩展你的能力 +> +> **获取步骤:** +> 1. 访问 https://smithery.ai 注册/登录 +> 2. 前往 https://smithery.ai/account/api-keys 创建 API Key +> 3. 将 Key 提供给我 + +#### Import +``` +import_mcp_server( + server_id="", + config={"smithery_api_key": ""} # first time only +) +``` + +#### Handle OAuth +Some tools return an OAuth authorization URL. Tell the user to visit the link. + +**Important:** Do NOT ask for individual tool tokens (GitHub PAT, Notion API key, etc.) when using Smithery — OAuth handles this automatically. + +--- + +### Method B: Direct URL Import + +When a tool is not available on Smithery but the user has a public MCP endpoint: +``` +import_mcp_server( + server_id="", + config={ + "mcp_url": "https://my-mcp-server.com/sse", + "api_key": "" + } +) +``` +The system will connect to the URL, discover available tools, and register them. + +--- + +## What NOT to Do +- ❌ Don't ask for GitHub PAT, Notion key etc. 
when using Smithery — OAuth handles these +- ❌ Don't tell users to go to Settings — handle everything in chat +- ❌ Don't echo API keys back in your response +- ❌ Don't skip the search step — always verify the server exists before importing +- ❌ Don't import local-only tools — inform users they require local installation diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/complex-task-executor/SKILL.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/complex-task-executor/SKILL.md new file mode 100644 index 000000000..db71c3ed8 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/complex-task-executor/SKILL.md @@ -0,0 +1,146 @@ +--- +name: Complex Task Executor +description: Structured methodology for decomposing, planning, and executing complex multi-step tasks with progress tracking +--- + +# Complex Task Executor + +## When to Use This Skill + +Use this skill when a task meets ANY of the following criteria: +- Requires more than 3 distinct steps to complete +- Involves multiple tools or information sources +- Has dependencies between steps (step B needs output from step A) +- Requires research before execution +- Could benefit from a documented plan others can review +- The user explicitly asks for a thorough or systematic approach + +**DO NOT use this for simple tasks** like answering a question, reading a single file, or performing one tool call. + +## Workflow + +### Phase 1: Task Analysis (THINK before acting) + +Before creating any files, analyze the task: + +1. **Understand the goal**: What is the final deliverable? What does "done" look like? +2. **Assess complexity**: How many steps? What tools are needed? +3. **Identify dependencies**: Which steps depend on others? +4. **Identify risks**: What could go wrong? What information is missing? +5. **Estimate scope**: Is the task feasible with available tools/skills? + +### Phase 2: Create Task Plan + +Create a task folder and plan file in the workspace: + +``` +workspace//plan.md +``` + +The plan.md MUST follow this exact format: + +```markdown +# Task: + +## Objective + + +## Steps + +- [ ] 1. + - Details: + - Output: +- [ ] 2. + - Details: <...> + - Depends on: Step 1 +- [ ] 3. + - Details: <...> + +## Status +- Created: +- Current Step: Not started +- Progress: 0/ + +## Notes + +``` + +Rules for writing the plan: +- Each step should be completable in 1-3 tool calls +- Use verb-noun format: "Research competitors", "Draft report", "Validate data" +- Mark dependencies explicitly +- Include expected outputs for each step + +### Phase 3: Execute Step-by-Step + +For EACH step in the plan: + +1. **Read the plan** — Call `read_file` on `workspace//plan.md` to check current state +2. **Mark as in-progress** — Update the checkbox from `[ ]` to `[/]` and update the "Current Step" field +3. **Execute the step** — Do the actual work (tool calls, analysis, writing) +4. **Record output** — Save results to `workspace//` (e.g., intermediate files, data) +5. **Mark as complete** — Update the checkbox from `[/]` to `[x]` and update "Progress" counter +6. **Proceed to next step** — Move to the next uncompleted step + +### Phase 4: Completion + +When all steps are done: +1. Update plan.md status to "✅ Completed" +2. Create a `workspace//summary.md` with: + - What was accomplished + - Key results and deliverables + - Any follow-up items +3. 
Present the final result to the user + +## Adaptive Replanning + +If during execution you discover: +- A step is impossible → Mark it `[!]` with a reason, add alternative steps +- New steps are needed → Add them to the plan with `[+]` prefix +- A step produced unexpected results → Add a note and adjust subsequent steps +- The plan needs major changes → Create a new section "## Revised Plan" and follow it + +Always update plan.md BEFORE changing course, so the plan stays the source of truth. + +## Error Handling + +- If a tool call fails, retry once. If it fails again, mark the step as blocked and note the error. +- Never silently skip a step. Always update the plan to reflect what happened. +- If you're stuck, tell the user what's blocking and ask for guidance. + +## Example Scenarios + +### Example 1: "Research our top 3 competitors and write a comparison report" + +Plan would be: +``` +- [ ] 1. Identify the user's company/product context +- [ ] 2. Research Competitor A — website, pricing, features +- [ ] 3. Research Competitor B — website, pricing, features +- [ ] 4. Research Competitor C — website, pricing, features +- [ ] 5. Create comparison matrix +- [ ] 6. Write analysis and recommendations +- [ ] 7. Compile final report +``` + +### Example 2: "Analyze our Q4 sales data and prepare a board presentation" + +Plan would be: +``` +- [ ] 1. Read and understand the sales data files +- [ ] 2. Calculate key metrics (revenue, growth, trends) +- [ ] 3. Identify top insights and anomalies +- [ ] 4. Create data summary tables +- [ ] 5. Draft presentation outline +- [ ] 6. Write each presentation section +- [ ] 7. Add executive summary +- [ ] 8. Review and polish final document +``` + +## Key Principles + +1. **Plan is the source of truth** — Always update it before moving on +2. **One step at a time** — Don't skip ahead or batch too many steps +3. **Show your work** — Save intermediate results to the task folder +4. **Communicate progress** — The user can read plan.md at any time to see status +5. **Be adaptive** — Plans change; that's OK if you update the plan first diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/complex-task-executor/examples/plan_template.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/complex-task-executor/examples/plan_template.md new file mode 100644 index 000000000..dfd60e7cb --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/complex-task-executor/examples/plan_template.md @@ -0,0 +1,23 @@ +# Task: [Title] + +## Objective +[One-sentence description of the desired outcome] + +## Steps + +- [ ] 1. [First step] + - Details: [What specifically to do] + - Output: [What this step produces] +- [ ] 2. [Second step] + - Details: [...] + - Depends on: Step 1 +- [ ] 3. [Third step] + - Details: [...] + +## Status +- Created: [timestamp] +- Current Step: Not started +- Progress: 0/3 + +## Notes +- [Any assumptions, risks, or open questions] diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/mcp-installer/SKILL.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/mcp-installer/SKILL.md new file mode 100644 index 000000000..9e3bf3c77 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/mcp-installer/SKILL.md @@ -0,0 +1,87 @@ +# MCP Tool Installer + +## When to Use This Skill +Use this skill when a user wants to add a new tool or integration (e.g., GitHub, Brave Search, Notion, etc.) 
that isn't currently available but can be imported from the MCP registry or via a direct URL. + +--- + +## Step-by-Step Protocol + +### Step 1 — Search first +``` +discover_resources(query="", max_results=5) +``` +Show the results and let the user pick. Note the `ID` field (e.g. `github`). + +### Step 2 — Determine import method + +**Method A: Smithery Import** (tool found on Smithery with remote hosting support 🌐) +- Requires Smithery API Key (one-time per agent) +- Individual tool tokens NOT needed — Smithery handles auth via OAuth + +**Method B: Direct URL Import** (tool NOT on Smithery, but has public HTTP/SSE endpoint) +- User provides the MCP server URL directly +- May require tool-specific API key + +**Not importable** (💻 local-only tools) +- Requires local Docker/process — inform user these cannot be imported automatically + +--- + +### Method A: Smithery Import + +#### Check Smithery API Key +If no Smithery key is configured, explain Smithery and guide the user. Use the following talking points (adapt to context, don't read verbatim): + +> **Smithery** (smithery.ai) 是一个 MCP 工具市场,类似于"应用商店"。通过它,我可以帮你一键安装各种第三方工具(如 GitHub、Notion、Slack 等),并自动完成认证。 +> +> **为什么需要注册?** +> Smithery 用 API Key 来识别你的身份,这样安装的工具会关联到你的账号,认证信息也会安全保存。 +> +> **注册一次后有什么好处?** +> - 🔑 只需提供一次 Key,后续安装其他工具时我会自动帮你配置 +> - 🔐 不需要为每个工具单独创建 Token(如 GitHub PAT),OAuth 一键授权 +> - 📦 支持上千种 MCP 工具,随时可以扩展你的能力 +> +> **获取步骤:** +> 1. 访问 https://smithery.ai 注册/登录 +> 2. 前往 https://smithery.ai/account/api-keys 创建 API Key +> 3. 将 Key 提供给我 + +#### Import +``` +import_mcp_server( + server_id="", + config={"smithery_api_key": ""} # first time only +) +``` + +#### Handle OAuth +Some tools return an OAuth authorization URL. Tell the user to visit the link. + +**Important:** Do NOT ask for individual tool tokens (GitHub PAT, Notion API key, etc.) when using Smithery — OAuth handles this automatically. + +--- + +### Method B: Direct URL Import + +When a tool is not available on Smithery but the user has a public MCP endpoint: +``` +import_mcp_server( + server_id="", + config={ + "mcp_url": "https://my-mcp-server.com/sse", + "api_key": "" + } +) +``` +The system will connect to the URL, discover available tools, and register them. + +--- + +## What NOT to Do +- ❌ Don't ask for GitHub PAT, Notion key etc. when using Smithery — OAuth handles these +- ❌ Don't tell users to go to Settings — handle everything in chat +- ❌ Don't echo API keys back in your response +- ❌ Don't skip the search step — always verify the server exists before importing +- ❌ Don't import local-only tools — inform users they require local installation diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/SKILL.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/SKILL.md new file mode 100644 index 000000000..ce0d06f3e --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/SKILL.md @@ -0,0 +1,152 @@ +--- +name: skill-creator +description: Create new skills, modify and improve existing skills, and measure skill performance. Use when users want to create a skill from scratch, update or optimize an existing skill, run evals to test a skill, benchmark skill performance with variance analysis, or optimize a skill's description for better triggering accuracy. +--- + +# Skill Creator + +A skill for creating new skills and iteratively improving them. 
+
+At a high level, the process of creating a skill goes like this:
+
+- Decide what you want the skill to do and roughly how it should do it
+- Write a draft of the skill
+- Create a few test prompts and run claude-with-access-to-the-skill on them
+- Help the user evaluate the results both qualitatively and quantitatively
+- Rewrite the skill based on feedback from the user's evaluation
+- Repeat until you're satisfied
+- Expand the test set and try again at larger scale
+
+Your job when using this skill is to figure out where the user is in this process and then jump in and help them progress through these stages.
+
+## Communicating with the user
+
+Pay attention to context cues to understand how to phrase your communication. If you're unsure the user will know a term, briefly define it.
+
+---
+
+## Creating a skill
+
+### Capture Intent
+Start by understanding the user's intent.
+
+1. What should this skill enable the agent to do?
+2. When should this skill trigger? (what user phrases/contexts)
+3. What's the expected output format?
+4. Should we set up test cases to verify the skill works?
+
+### Interview and Research
+Proactively ask questions about edge cases, input/output formats, example files, success criteria, and dependencies. Wait to write test prompts until you've got this part ironed out.
+
+### Write the SKILL.md
+Based on the user interview, fill in these components:
+
+- **name**: Skill identifier
+- **description**: When to trigger, what it does. This is the primary triggering mechanism - include both what the skill does AND specific contexts for when to use it.
+- **the rest of the skill**
+
+### Skill Writing Guide
+
+#### Anatomy of a Skill
+
+```
+skill-name/
+├── SKILL.md (required)
+│   ├── YAML frontmatter (name, description required)
+│   └── Markdown instructions
+└── Bundled Resources (optional)
+    ├── scripts/ - Executable code for deterministic/repetitive tasks
+    ├── references/ - Docs loaded into context as needed
+    └── assets/ - Files used in output (templates, icons, fonts)
+```
+
+#### Progressive Disclosure
+
+Skills use a three-level loading system:
+1. **Metadata** (name + description) - Always in context (~100 words)
+2. **SKILL.md body** - In context whenever skill triggers (<500 lines ideal)
+3. **Bundled resources** - As needed (unlimited, scripts can execute without loading)
+
+**Key patterns:**
+- Keep SKILL.md under 500 lines; if approaching this limit, add hierarchy with clear pointers
+- Reference files clearly from SKILL.md with guidance on when to read them
+- For large reference files (>300 lines), include a table of contents
+
+#### Writing Patterns
+
+Prefer using the imperative form in instructions.
+
+### Writing Style
+Explain to the model why things are important. Use theory of mind and try to make the skill general. Start by writing a draft and then look at it with fresh eyes and improve it.
+
+### Test Cases
+After writing the skill draft, come up with 2-3 realistic test prompts. Share them with the user. Save test cases to `evals/evals.json`.
+
+---
+
+## Running and evaluating test cases
+
+This section is one continuous sequence.
+
+### Step 1: Run test cases
+For each test case, run the agent with the skill applied, and optionally a baseline run without the skill for comparison.
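The exact evals.json schema lives in `references/schemas.md`, which is not reproduced in this diff, so treat the shape below as an assumption. The directory layout, though, matches what `generate_review.py` later scans for: one directory per run containing an `outputs/` folder and an `eval_metadata.json` with the prompt and eval id. A minimal Python sketch of Step 1 under those assumptions (the `run_agent` callable is a hypothetical stand-in for however the agent is actually launched):

```python
import json
from pathlib import Path
from typing import Callable


def run_test_cases(workspace: Path, iteration: int,
                   run_agent: Callable[[str, bool, Path], None],
                   with_baseline: bool = True) -> None:
    """Sketch of Step 1: run every eval with the skill, plus an optional baseline without it."""
    # Assumed evals.json shape: {"evals": [{"id": 1, "prompt": "...", "expectations": ["..."]}]}
    evals = json.loads((workspace / "evals" / "evals.json").read_text())

    for eval_case in evals["evals"]:
        configs = ["with_skill"] + (["without_skill"] if with_baseline else [])
        for config in configs:
            run_dir = workspace / f"iteration-{iteration}" / f"eval-{eval_case['id']}" / config
            (run_dir / "outputs").mkdir(parents=True, exist_ok=True)

            # eval_metadata.json is what the review viewer scans for to recover the prompt
            (run_dir / "eval_metadata.json").write_text(json.dumps(
                {"eval_id": eval_case["id"], "prompt": eval_case["prompt"]}, indent=2))

            # run_agent stands in for however the agent run is actually launched
            run_agent(eval_case["prompt"], config == "with_skill", run_dir / "outputs")
```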
+
+### Step 2: Draft assertions
+While runs are in progress, draft quantitative assertions for each test case. Good assertions are objectively verifiable and have descriptive names.
+
+### Step 3: Capture timing data
+When each run completes, save timing data (tokens, duration) to `timing.json`.
+
+### Step 4: Grade, aggregate, and launch the viewer
+Once all runs are done:
+1. Grade each run against assertions — see `agents/grader.md`
+2. Aggregate results: `python -m scripts.aggregate_benchmark /iteration-N --skill-name `
+3. Launch the viewer: `python eval-viewer/generate_review.py /iteration-N --skill-name "my-skill" --benchmark /iteration-N/benchmark.json`
+4. Present results to the user for review
+
+### Step 5: Read the feedback
+Read user feedback from `feedback.json`. Empty feedback means the user thought it was fine.
+
+---
+
+## Improving the skill
+
+### How to think about improvements
+1. **Generalize from the feedback.** Don't overfit to specific examples.
+2. **Keep the prompt lean.** Remove things that aren't pulling their weight.
+3. **Explain the why.** Today's LLMs are smart. Explain reasoning rather than rigid MUSTs.
+4. **Look for repeated work across test cases.** Bundle common scripts in `scripts/`.
+
+### The iteration loop
+1. Apply improvements to the skill
+2. Rerun all test cases into a new iteration directory
+3. Present results for review
+4. Wait for user to review
+5. Read feedback, improve again, repeat
+
+---
+
+## Advanced: Blind comparison
+For rigorous comparison between two versions. Read `agents/comparator.md` and `agents/analyzer.md`.
+
+## Description Optimization
+Optimize the description for better triggering accuracy. Use `scripts/run_loop.py`.
+
+---
+
+## Reference files
+
+- `agents/grader.md` — How to evaluate assertions against outputs
+- `agents/comparator.md` — How to do blind A/B comparison between two outputs
+- `agents/analyzer.md` — How to analyze why one version beat another
+- `references/schemas.md` — JSON structures for evals.json, grading.json, etc.
+- `assets/eval_review.html` — HTML template for eval review
+- `eval-viewer/generate_review.py` — Script to generate the review viewer
+- `scripts/aggregate_benchmark.py` — Aggregate benchmark results
+- `scripts/generate_report.py` — Generate optimization report
+- `scripts/improve_description.py` — Improve skill description
+- `scripts/package_skill.py` — Package skill for distribution
+- `scripts/quick_validate.py` — Quick validation
+- `scripts/run_eval.py` — Run triggering evaluation
+- `scripts/run_loop.py` — Run optimization loop
+- `scripts/utils.py` — Shared utilities
diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/agents/analyzer.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/agents/analyzer.md
new file mode 100644
index 000000000..14e41d606
--- /dev/null
+++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/agents/analyzer.md
@@ -0,0 +1,274 @@
+# Post-hoc Analyzer Agent
+
+Analyze blind comparison results to understand WHY the winner won and generate improvement suggestions.
+
+## Role
+
+After the blind comparator determines a winner, the Post-hoc Analyzer "unblinds" the results by examining the skills and transcripts. The goal is to extract actionable insights: what made the winner better, and how can the loser be improved?
+ +## Inputs + +You receive these parameters in your prompt: + +- **winner**: "A" or "B" (from blind comparison) +- **winner_skill_path**: Path to the skill that produced the winning output +- **winner_transcript_path**: Path to the execution transcript for the winner +- **loser_skill_path**: Path to the skill that produced the losing output +- **loser_transcript_path**: Path to the execution transcript for the loser +- **comparison_result_path**: Path to the blind comparator's output JSON +- **output_path**: Where to save the analysis results + +## Process + +### Step 1: Read Comparison Result + +1. Read the blind comparator's output at comparison_result_path +2. Note the winning side (A or B), the reasoning, and any scores +3. Understand what the comparator valued in the winning output + +### Step 2: Read Both Skills + +1. Read the winner skill's SKILL.md and key referenced files +2. Read the loser skill's SKILL.md and key referenced files +3. Identify structural differences: + - Instructions clarity and specificity + - Script/tool usage patterns + - Example coverage + - Edge case handling + +### Step 3: Read Both Transcripts + +1. Read the winner's transcript +2. Read the loser's transcript +3. Compare execution patterns: + - How closely did each follow their skill's instructions? + - What tools were used differently? + - Where did the loser diverge from optimal behavior? + - Did either encounter errors or make recovery attempts? + +### Step 4: Analyze Instruction Following + +For each transcript, evaluate: +- Did the agent follow the skill's explicit instructions? +- Did the agent use the skill's provided tools/scripts? +- Were there missed opportunities to leverage skill content? +- Did the agent add unnecessary steps not in the skill? + +Score instruction following 1-10 and note specific issues. + +### Step 5: Identify Winner Strengths + +Determine what made the winner better: +- Clearer instructions that led to better behavior? +- Better scripts/tools that produced better output? +- More comprehensive examples that guided edge cases? +- Better error handling guidance? + +Be specific. Quote from skills/transcripts where relevant. + +### Step 6: Identify Loser Weaknesses + +Determine what held the loser back: +- Ambiguous instructions that led to suboptimal choices? +- Missing tools/scripts that forced workarounds? +- Gaps in edge case coverage? +- Poor error handling that caused failures? + +### Step 7: Generate Improvement Suggestions + +Based on the analysis, produce actionable suggestions for improving the loser skill: +- Specific instruction changes to make +- Tools/scripts to add or modify +- Examples to include +- Edge cases to address + +Prioritize by impact. Focus on changes that would have changed the outcome. + +### Step 8: Write Analysis Results + +Save structured analysis to `{output_path}`. 
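As a concrete footing for Step 8, here is a short sketch of the write itself, assuming the analysis dict has already been assembled to match the Output Format described next; the path and field values in the example call are made up for illustration, and only a few of the documented keys are shown:

```python
import json
from pathlib import Path


def write_analysis(output_path: str, analysis: dict) -> None:
    """Persist the post-hoc analysis JSON so later steps (and humans) can read it."""
    path = Path(output_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(analysis, indent=2, ensure_ascii=False) + "\n")


# Illustrative call only; keys follow the Output Format section below
write_analysis("analysis.json", {
    "comparison_summary": {"winner": "A"},
    "winner_strengths": ["Included a validation script"],
    "loser_weaknesses": ["No guidance on OCR failure"],
    "improvement_suggestions": [],
})
```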
+ +## Output Format + +Write a JSON file with this structure: + +```json +{ + "comparison_summary": { + "winner": "A", + "winner_skill": "path/to/winner/skill", + "loser_skill": "path/to/loser/skill", + "comparator_reasoning": "Brief summary of why comparator chose winner" + }, + "winner_strengths": [ + "Clear step-by-step instructions for handling multi-page documents", + "Included validation script that caught formatting errors", + "Explicit guidance on fallback behavior when OCR fails" + ], + "loser_weaknesses": [ + "Vague instruction 'process the document appropriately' led to inconsistent behavior", + "No script for validation, agent had to improvise and made errors", + "No guidance on OCR failure, agent gave up instead of trying alternatives" + ], + "instruction_following": { + "winner": { + "score": 9, + "issues": [ + "Minor: skipped optional logging step" + ] + }, + "loser": { + "score": 6, + "issues": [ + "Did not use the skill's formatting template", + "Invented own approach instead of following step 3", + "Missed the 'always validate output' instruction" + ] + } + }, + "improvement_suggestions": [ + { + "priority": "high", + "category": "instructions", + "suggestion": "Replace 'process the document appropriately' with explicit steps: 1) Extract text, 2) Identify sections, 3) Format per template", + "expected_impact": "Would eliminate ambiguity that caused inconsistent behavior" + }, + { + "priority": "high", + "category": "tools", + "suggestion": "Add validate_output.py script similar to winner skill's validation approach", + "expected_impact": "Would catch formatting errors before final output" + }, + { + "priority": "medium", + "category": "error_handling", + "suggestion": "Add fallback instructions: 'If OCR fails, try: 1) different resolution, 2) image preprocessing, 3) manual extraction'", + "expected_impact": "Would prevent early failure on difficult documents" + } + ], + "transcript_insights": { + "winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script -> Fixed 2 issues -> Produced output", + "loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods -> No validation -> Output had errors" + } +} +``` + +## Guidelines + +- **Be specific**: Quote from skills and transcripts, don't just say "instructions were unclear" +- **Be actionable**: Suggestions should be concrete changes, not vague advice +- **Focus on skill improvements**: The goal is to improve the losing skill, not critique the agent +- **Prioritize by impact**: Which changes would most likely have changed the outcome? +- **Consider causation**: Did the skill weakness actually cause the worse output, or is it incidental? +- **Stay objective**: Analyze what happened, don't editorialize +- **Think about generalization**: Would this improvement help on other evals too? 
+ +## Categories for Suggestions + +Use these categories to organize improvement suggestions: + +| Category | Description | +|----------|-------------| +| `instructions` | Changes to the skill's prose instructions | +| `tools` | Scripts, templates, or utilities to add/modify | +| `examples` | Example inputs/outputs to include | +| `error_handling` | Guidance for handling failures | +| `structure` | Reorganization of skill content | +| `references` | External docs or resources to add | + +## Priority Levels + +- **high**: Would likely change the outcome of this comparison +- **medium**: Would improve quality but may not change win/loss +- **low**: Nice to have, marginal improvement + +--- + +# Analyzing Benchmark Results + +When analyzing benchmark results, the analyzer's purpose is to **surface patterns and anomalies** across multiple runs, not suggest skill improvements. + +## Role + +Review all benchmark run results and generate freeform notes that help the user understand skill performance. Focus on patterns that wouldn't be visible from aggregate metrics alone. + +## Inputs + +You receive these parameters in your prompt: + +- **benchmark_data_path**: Path to the in-progress benchmark.json with all run results +- **skill_path**: Path to the skill being benchmarked +- **output_path**: Where to save the notes (as JSON array of strings) + +## Process + +### Step 1: Read Benchmark Data + +1. Read the benchmark.json containing all run results +2. Note the configurations tested (with_skill, without_skill) +3. Understand the run_summary aggregates already calculated + +### Step 2: Analyze Per-Assertion Patterns + +For each expectation across all runs: +- Does it **always pass** in both configurations? (may not differentiate skill value) +- Does it **always fail** in both configurations? (may be broken or beyond capability) +- Does it **always pass with skill but fail without**? (skill clearly adds value here) +- Does it **always fail with skill but pass without**? (skill may be hurting) +- Is it **highly variable**? (flaky expectation or non-deterministic behavior) + +### Step 3: Analyze Cross-Eval Patterns + +Look for patterns across evals: +- Are certain eval types consistently harder/easier? +- Do some evals show high variance while others are stable? +- Are there surprising results that contradict expectations? + +### Step 4: Analyze Metrics Patterns + +Look at time_seconds, tokens, tool_calls: +- Does the skill significantly increase execution time? +- Is there high variance in resource usage? +- Are there outlier runs that skew the aggregates? + +### Step 5: Generate Notes + +Write freeform observations as a list of strings. 
Each note should: +- State a specific observation +- Be grounded in the data (not speculation) +- Help the user understand something the aggregate metrics don't show + +Examples: +- "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value" +- "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure that may be flaky" +- "Without-skill runs consistently fail on table extraction expectations (0% pass rate)" +- "Skill adds 13s average execution time but improves pass rate by 50%" +- "Token usage is 80% higher with skill, primarily due to script output parsing" +- "All 3 without-skill runs for eval 1 produced empty output" + +### Step 6: Write Notes + +Save notes to `{output_path}` as a JSON array of strings: + +```json +[ + "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value", + "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure", + "Without-skill runs consistently fail on table extraction expectations", + "Skill adds 13s average execution time but improves pass rate by 50%" +] +``` + +## Guidelines + +**DO:** +- Report what you observe in the data +- Be specific about which evals, expectations, or runs you're referring to +- Note patterns that aggregate metrics would hide +- Provide context that helps interpret the numbers + +**DO NOT:** +- Suggest improvements to the skill (that's for the improvement step, not benchmarking) +- Make subjective quality judgments ("the output was good/bad") +- Speculate about causes without evidence +- Repeat information already in the run_summary aggregates diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/agents/comparator.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/agents/comparator.md new file mode 100644 index 000000000..80e00eb45 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/agents/comparator.md @@ -0,0 +1,202 @@ +# Blind Comparator Agent + +Compare two outputs WITHOUT knowing which skill produced them. + +## Role + +The Blind Comparator judges which output better accomplishes the eval task. You receive two outputs labeled A and B, but you do NOT know which skill produced which. This prevents bias toward a particular skill or approach. + +Your judgment is based purely on output quality and task completion. + +## Inputs + +You receive these parameters in your prompt: + +- **output_a_path**: Path to the first output file or directory +- **output_b_path**: Path to the second output file or directory +- **eval_prompt**: The original task/prompt that was executed +- **expectations**: List of expectations to check (optional - may be empty) + +## Process + +### Step 1: Read Both Outputs + +1. Examine output A (file or directory) +2. Examine output B (file or directory) +3. Note the type, structure, and content of each +4. If outputs are directories, examine all relevant files inside + +### Step 2: Understand the Task + +1. Read the eval_prompt carefully +2. Identify what the task requires: + - What should be produced? + - What qualities matter (accuracy, completeness, format)? + - What would distinguish a good output from a poor one? 
+ +### Step 3: Generate Evaluation Rubric + +Based on the task, generate a rubric with two dimensions: + +**Content Rubric** (what the output contains): +| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) | +|-----------|----------|----------------|---------------| +| Correctness | Major errors | Minor errors | Fully correct | +| Completeness | Missing key elements | Mostly complete | All elements present | +| Accuracy | Significant inaccuracies | Minor inaccuracies | Accurate throughout | + +**Structure Rubric** (how the output is organized): +| Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) | +|-----------|----------|----------------|---------------| +| Organization | Disorganized | Reasonably organized | Clear, logical structure | +| Formatting | Inconsistent/broken | Mostly consistent | Professional, polished | +| Usability | Difficult to use | Usable with effort | Easy to use | + +Adapt criteria to the specific task. For example: +- PDF form → "Field alignment", "Text readability", "Data placement" +- Document → "Section structure", "Heading hierarchy", "Paragraph flow" +- Data output → "Schema correctness", "Data types", "Completeness" + +### Step 4: Evaluate Each Output Against the Rubric + +For each output (A and B): + +1. **Score each criterion** on the rubric (1-5 scale) +2. **Calculate dimension totals**: Content score, Structure score +3. **Calculate overall score**: Average of dimension scores, scaled to 1-10 + +### Step 5: Check Assertions (if provided) + +If expectations are provided: + +1. Check each expectation against output A +2. Check each expectation against output B +3. Count pass rates for each output +4. Use expectation scores as secondary evidence (not the primary decision factor) + +### Step 6: Determine the Winner + +Compare A and B based on (in priority order): + +1. **Primary**: Overall rubric score (content + structure) +2. **Secondary**: Assertion pass rates (if applicable) +3. **Tiebreaker**: If truly equal, declare a TIE + +Be decisive - ties should be rare. One output is usually better, even if marginally. + +### Step 7: Write Comparison Results + +Save results to a JSON file at the path specified (or `comparison.json` if not specified). + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "winner": "A", + "reasoning": "Output A provides a complete solution with proper formatting and all required fields. 
Output B is missing the date field and has formatting inconsistencies.", + "rubric": { + "A": { + "content": { + "correctness": 5, + "completeness": 5, + "accuracy": 4 + }, + "structure": { + "organization": 4, + "formatting": 5, + "usability": 4 + }, + "content_score": 4.7, + "structure_score": 4.3, + "overall_score": 9.0 + }, + "B": { + "content": { + "correctness": 3, + "completeness": 2, + "accuracy": 3 + }, + "structure": { + "organization": 3, + "formatting": 2, + "usability": 3 + }, + "content_score": 2.7, + "structure_score": 2.7, + "overall_score": 5.4 + } + }, + "output_quality": { + "A": { + "score": 9, + "strengths": ["Complete solution", "Well-formatted", "All fields present"], + "weaknesses": ["Minor style inconsistency in header"] + }, + "B": { + "score": 5, + "strengths": ["Readable output", "Correct basic structure"], + "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"] + } + }, + "expectation_results": { + "A": { + "passed": 4, + "total": 5, + "pass_rate": 0.80, + "details": [ + {"text": "Output includes name", "passed": true}, + {"text": "Output includes date", "passed": true}, + {"text": "Format is PDF", "passed": true}, + {"text": "Contains signature", "passed": false}, + {"text": "Readable text", "passed": true} + ] + }, + "B": { + "passed": 3, + "total": 5, + "pass_rate": 0.60, + "details": [ + {"text": "Output includes name", "passed": true}, + {"text": "Output includes date", "passed": false}, + {"text": "Format is PDF", "passed": true}, + {"text": "Contains signature", "passed": false}, + {"text": "Readable text", "passed": true} + ] + } + } +} +``` + +If no expectations were provided, omit the `expectation_results` field entirely. + +## Field Descriptions + +- **winner**: "A", "B", or "TIE" +- **reasoning**: Clear explanation of why the winner was chosen (or why it's a tie) +- **rubric**: Structured rubric evaluation for each output + - **content**: Scores for content criteria (correctness, completeness, accuracy) + - **structure**: Scores for structure criteria (organization, formatting, usability) + - **content_score**: Average of content criteria (1-5) + - **structure_score**: Average of structure criteria (1-5) + - **overall_score**: Combined score scaled to 1-10 +- **output_quality**: Summary quality assessment + - **score**: 1-10 rating (should match rubric overall_score) + - **strengths**: List of positive aspects + - **weaknesses**: List of issues or shortcomings +- **expectation_results**: (Only if expectations provided) + - **passed**: Number of expectations that passed + - **total**: Total number of expectations + - **pass_rate**: Fraction passed (0.0 to 1.0) + - **details**: Individual expectation results + +## Guidelines + +- **Stay blind**: DO NOT try to infer which skill produced which output. Judge purely on output quality. +- **Be specific**: Cite specific examples when explaining strengths and weaknesses. +- **Be decisive**: Choose a winner unless outputs are genuinely equivalent. +- **Output quality first**: Assertion scores are secondary to overall task completion. +- **Be objective**: Don't favor outputs based on style preferences; focus on correctness and completeness. +- **Explain your reasoning**: The reasoning field should make it clear why you chose the winner. +- **Handle edge cases**: If both outputs fail, pick the one that fails less badly. If both are excellent, pick the one that's marginally better. 
diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/agents/grader.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/agents/grader.md new file mode 100644 index 000000000..558ab05c0 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/agents/grader.md @@ -0,0 +1,223 @@ +# Grader Agent + +Evaluate expectations against an execution transcript and outputs. + +## Role + +The Grader reviews a transcript and output files, then determines whether each expectation passes or fails. Provide clear evidence for each judgment. + +You have two jobs: grade the outputs, and critique the evals themselves. A passing grade on a weak assertion is worse than useless — it creates false confidence. When you notice an assertion that's trivially satisfied, or an important outcome that no assertion checks, say so. + +## Inputs + +You receive these parameters in your prompt: + +- **expectations**: List of expectations to evaluate (strings) +- **transcript_path**: Path to the execution transcript (markdown file) +- **outputs_dir**: Directory containing output files from execution + +## Process + +### Step 1: Read the Transcript + +1. Read the transcript file completely +2. Note the eval prompt, execution steps, and final result +3. Identify any issues or errors documented + +### Step 2: Examine Output Files + +1. List files in outputs_dir +2. Read/examine each file relevant to the expectations. If outputs aren't plain text, use the inspection tools provided in your prompt — don't rely solely on what the transcript says the executor produced. +3. Note contents, structure, and quality + +### Step 3: Evaluate Each Assertion + +For each expectation: + +1. **Search for evidence** in the transcript and outputs +2. **Determine verdict**: + - **PASS**: Clear evidence the expectation is true AND the evidence reflects genuine task completion, not just surface-level compliance + - **FAIL**: No evidence, or evidence contradicts the expectation, or the evidence is superficial (e.g., correct filename but empty/wrong content) +3. **Cite the evidence**: Quote the specific text or describe what you found + +### Step 4: Extract and Verify Claims + +Beyond the predefined expectations, extract implicit claims from the outputs and verify them: + +1. **Extract claims** from the transcript and outputs: + - Factual statements ("The form has 12 fields") + - Process claims ("Used pypdf to fill the form") + - Quality claims ("All fields were filled correctly") + +2. **Verify each claim**: + - **Factual claims**: Can be checked against the outputs or external sources + - **Process claims**: Can be verified from the transcript + - **Quality claims**: Evaluate whether the claim is justified + +3. **Flag unverifiable claims**: Note claims that cannot be verified with available information + +This catches issues that predefined expectations might miss. + +### Step 5: Read User Notes + +If `{outputs_dir}/user_notes.md` exists: +1. Read it and note any uncertainties or issues flagged by the executor +2. Include relevant concerns in the grading output +3. These may reveal problems even when expectations pass + +### Step 6: Critique the Evals + +After grading, consider whether the evals themselves could be improved. Only surface suggestions when there's a clear gap. + +Good suggestions test meaningful outcomes — assertions that are hard to satisfy without actually doing the work correctly. 
Think about what makes an assertion *discriminating*: it passes when the skill genuinely succeeds and fails when it doesn't. + +Suggestions worth raising: +- An assertion that passed but would also pass for a clearly wrong output (e.g., checking filename existence but not file content) +- An important outcome you observed — good or bad — that no assertion covers at all +- An assertion that can't actually be verified from the available outputs + +Keep the bar high. The goal is to flag things the eval author would say "good catch" about, not to nitpick every assertion. + +### Step 7: Write Grading Results + +Save results to `{outputs_dir}/../grading.json` (sibling to outputs_dir). + +## Grading Criteria + +**PASS when**: +- The transcript or outputs clearly demonstrate the expectation is true +- Specific evidence can be cited +- The evidence reflects genuine substance, not just surface compliance (e.g., a file exists AND contains correct content, not just the right filename) + +**FAIL when**: +- No evidence found for the expectation +- Evidence contradicts the expectation +- The expectation cannot be verified from available information +- The evidence is superficial — the assertion is technically satisfied but the underlying task outcome is wrong or incomplete +- The output appears to meet the assertion by coincidence rather than by actually doing the work + +**When uncertain**: The burden of proof to pass is on the expectation. + +### Step 8: Read Executor Metrics and Timing + +1. If `{outputs_dir}/metrics.json` exists, read it and include in grading output +2. If `{outputs_dir}/../timing.json` exists, read it and include timing data + +## Output Format + +Write a JSON file with this structure: + +```json +{ + "expectations": [ + { + "text": "The output includes the name 'John Smith'", + "passed": true, + "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'" + }, + { + "text": "The spreadsheet has a SUM formula in cell B10", + "passed": false, + "evidence": "No spreadsheet was created. The output was a text file." 
+ }, + { + "text": "The assistant used the skill's OCR script", + "passed": true, + "evidence": "Transcript Step 2 shows: 'Tool: Bash - python ocr_script.py image.png'" + } + ], + "summary": { + "passed": 2, + "failed": 1, + "total": 3, + "pass_rate": 0.67 + }, + "execution_metrics": { + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8 + }, + "total_tool_calls": 15, + "total_steps": 6, + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 + }, + "timing": { + "executor_duration_seconds": 165.0, + "grader_duration_seconds": 26.0, + "total_duration_seconds": 191.0 + }, + "claims": [ + { + "claim": "The form has 12 fillable fields", + "type": "factual", + "verified": true, + "evidence": "Counted 12 fields in field_info.json" + }, + { + "claim": "All required fields were populated", + "type": "quality", + "verified": false, + "evidence": "Reference section was left blank despite data being available" + } + ], + "user_notes_summary": { + "uncertainties": ["Used 2023 data, may be stale"], + "needs_review": [], + "workarounds": ["Fell back to text overlay for non-fillable fields"] + }, + "eval_feedback": { + "suggestions": [ + { + "assertion": "The output includes the name 'John Smith'", + "reason": "A hallucinated document that mentions the name would also pass — consider checking it appears as the primary contact with matching phone and email from the input" + }, + { + "reason": "No assertion checks whether the extracted phone numbers match the input — I observed incorrect numbers in the output that went uncaught" + } + ], + "overall": "Assertions check presence but not correctness. Consider adding content verification." + } +} +``` + +## Field Descriptions + +- **expectations**: Array of graded expectations + - **text**: The original expectation text + - **passed**: Boolean - true if expectation passes + - **evidence**: Specific quote or description supporting the verdict +- **summary**: Aggregate statistics + - **passed**: Count of passed expectations + - **failed**: Count of failed expectations + - **total**: Total expectations evaluated + - **pass_rate**: Fraction passed (0.0 to 1.0) +- **execution_metrics**: Copied from executor's metrics.json (if available) + - **output_chars**: Total character count of output files (proxy for tokens) + - **transcript_chars**: Character count of transcript +- **timing**: Wall clock timing from timing.json (if available) + - **executor_duration_seconds**: Time spent in executor subagent + - **total_duration_seconds**: Total elapsed time for the run +- **claims**: Extracted and verified claims from the output + - **claim**: The statement being verified + - **type**: "factual", "process", or "quality" + - **verified**: Boolean - whether the claim holds + - **evidence**: Supporting or contradicting evidence +- **user_notes_summary**: Issues flagged by the executor + - **uncertainties**: Things the executor wasn't sure about + - **needs_review**: Items requiring human attention + - **workarounds**: Places where the skill didn't work as expected +- **eval_feedback**: Improvement suggestions for the evals (only when warranted) + - **suggestions**: List of concrete suggestions, each with a `reason` and optionally an `assertion` it relates to + - **overall**: Brief assessment — can be "No suggestions, evals look solid" if nothing to flag + +## Guidelines + +- **Be objective**: Base verdicts on evidence, not assumptions +- **Be specific**: Quote the exact text that supports your verdict +- **Be thorough**: Check both transcript and 
output files +- **Be consistent**: Apply the same standard to each expectation +- **Explain failures**: Make it clear why evidence was insufficient +- **No partial credit**: Each expectation is pass or fail, not partial diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/assets/eval_review.html b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/assets/eval_review.html new file mode 100644 index 000000000..938ff32ae --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/assets/eval_review.html @@ -0,0 +1,146 @@ + + + + + + Eval Set Review - __SKILL_NAME_PLACEHOLDER__ + + + + + + +

Eval Set Review: __SKILL_NAME_PLACEHOLDER__

+Current description: __SKILL_DESCRIPTION_PLACEHOLDER__
+Query | Should Trigger | Actions
+ + + + diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/eval-viewer/generate_review.py b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/eval-viewer/generate_review.py new file mode 100644 index 000000000..4f0b1fe00 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/eval-viewer/generate_review.py @@ -0,0 +1,473 @@ +#!/usr/bin/env python3 +"""Generate and serve a review page for eval results. + +Reads the workspace directory, discovers runs (directories with outputs/), +embeds all output data into a self-contained HTML page, and serves it via +a tiny HTTP server. Feedback auto-saves to feedback.json in the workspace. + +Usage: + python generate_review.py [--port PORT] [--skill-name NAME] + python generate_review.py --previous-feedback /path/to/old/feedback.json + +No dependencies beyond the Python stdlib are required. +""" + +import argparse +import base64 +import json +import mimetypes +import os +import re +import signal +import subprocess +import sys +import time +import webbrowser +from functools import partial +from http.server import HTTPServer, BaseHTTPRequestHandler +from pathlib import Path + +from loguru import logger + +# Files to exclude from output listings +METADATA_FILES = {"transcript.md", "user_notes.md", "metrics.json"} + +# Extensions we render as inline text +TEXT_EXTENSIONS = { + ".txt", ".md", ".json", ".csv", ".py", ".js", ".ts", ".tsx", ".jsx", + ".yaml", ".yml", ".xml", ".html", ".css", ".sh", ".rb", ".go", ".rs", + ".java", ".c", ".cpp", ".h", ".hpp", ".sql", ".r", ".toml", +} + +# Extensions we render as inline images +IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp"} + +# MIME type overrides for common types +MIME_OVERRIDES = { + ".svg": "image/svg+xml", + ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation", +} + + +def get_mime_type(path: Path) -> str: + ext = path.suffix.lower() + if ext in MIME_OVERRIDES: + return MIME_OVERRIDES[ext] + mime, _ = mimetypes.guess_type(str(path)) + return mime or "application/octet-stream" + + +def find_runs(workspace: Path) -> list[dict]: + """Recursively find directories that contain an outputs/ subdirectory.""" + runs: list[dict] = [] + _find_runs_recursive(workspace, workspace, runs) + runs.sort(key=lambda r: (r.get("eval_id", float("inf")), r["id"])) + return runs + + +def _find_runs_recursive(root: Path, current: Path, runs: list[dict]) -> None: + if not current.is_dir(): + return + + outputs_dir = current / "outputs" + if outputs_dir.is_dir(): + run = build_run(root, current) + if run: + runs.append(run) + return + + skip = {"node_modules", ".git", "__pycache__", "skill", "inputs"} + for child in sorted(current.iterdir()): + if child.is_dir() and child.name not in skip: + _find_runs_recursive(root, child, runs) + + +def build_run(root: Path, run_dir: Path) -> dict | None: + """Build a run dict with prompt, outputs, and grading data.""" + prompt = "" + eval_id = None + + # Try eval_metadata.json + for candidate in [run_dir / "eval_metadata.json", run_dir.parent / "eval_metadata.json"]: + if candidate.exists(): + try: + metadata = json.loads(candidate.read_text()) + prompt = metadata.get("prompt", "") + eval_id = metadata.get("eval_id") + except (json.JSONDecodeError, OSError): + pass + if 
prompt: + break + + # Fall back to transcript.md + if not prompt: + for candidate in [run_dir / "transcript.md", run_dir / "outputs" / "transcript.md"]: + if candidate.exists(): + try: + text = candidate.read_text() + match = re.search(r"## Eval Prompt\n\n([\s\S]*?)(?=\n##|$)", text) + if match: + prompt = match.group(1).strip() + except OSError: + pass + if prompt: + break + + if not prompt: + prompt = "(No prompt found)" + + run_id = str(run_dir.relative_to(root)).replace("/", "-").replace("\\", "-") + + # Collect output files + outputs_dir = run_dir / "outputs" + output_files: list[dict] = [] + if outputs_dir.is_dir(): + for f in sorted(outputs_dir.iterdir()): + if f.is_file() and f.name not in METADATA_FILES: + output_files.append(embed_file(f)) + + # Load grading if present + grading = None + for candidate in [run_dir / "grading.json", run_dir.parent / "grading.json"]: + if candidate.exists(): + try: + grading = json.loads(candidate.read_text()) + except (json.JSONDecodeError, OSError): + pass + if grading: + break + + return { + "id": run_id, + "prompt": prompt, + "eval_id": eval_id, + "outputs": output_files, + "grading": grading, + } + + +def embed_file(path: Path) -> dict: + """Read a file and return an embedded representation.""" + ext = path.suffix.lower() + mime = get_mime_type(path) + + if ext in TEXT_EXTENSIONS: + try: + content = path.read_text(errors="replace") + except OSError: + content = "(Error reading file)" + return { + "name": path.name, + "type": "text", + "content": content, + } + elif ext in IMAGE_EXTENSIONS: + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "image", + "mime": mime, + "data_uri": f"data:{mime};base64,{b64}", + } + elif ext == ".pdf": + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "pdf", + "data_uri": f"data:{mime};base64,{b64}", + } + elif ext == ".xlsx": + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "xlsx", + "data_b64": b64, + } + else: + # Binary / unknown — base64 download link + try: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + except OSError: + return {"name": path.name, "type": "error", "content": "(Error reading file)"} + return { + "name": path.name, + "type": "binary", + "mime": mime, + "data_uri": f"data:{mime};base64,{b64}", + } + + +def load_previous_iteration(workspace: Path) -> dict[str, dict]: + """Load previous iteration's feedback and outputs. + + Returns a map of run_id -> {"feedback": str, "outputs": list[dict]}. 
+ """ + result: dict[str, dict] = {} + + # Load feedback + feedback_map: dict[str, str] = {} + feedback_path = workspace / "feedback.json" + if feedback_path.exists(): + try: + data = json.loads(feedback_path.read_text()) + feedback_map = { + r["run_id"]: r["feedback"] + for r in data.get("reviews", []) + if r.get("feedback", "").strip() + } + except (json.JSONDecodeError, OSError, KeyError): + pass + + # Load runs (to get outputs) + prev_runs = find_runs(workspace) + for run in prev_runs: + result[run["id"]] = { + "feedback": feedback_map.get(run["id"], ""), + "outputs": run.get("outputs", []), + } + + # Also add feedback for run_ids that had feedback but no matching run + for run_id, fb in feedback_map.items(): + if run_id not in result: + result[run_id] = {"feedback": fb, "outputs": []} + + return result + + +def generate_html( + runs: list[dict], + skill_name: str, + previous: dict[str, dict] | None = None, + benchmark: dict | None = None, +) -> str: + """Generate the complete standalone HTML page with embedded data.""" + template_path = Path(__file__).parent / "viewer.html" + template = template_path.read_text() + + # Build previous_feedback and previous_outputs maps for the template + previous_feedback: dict[str, str] = {} + previous_outputs: dict[str, list[dict]] = {} + if previous: + for run_id, data in previous.items(): + if data.get("feedback"): + previous_feedback[run_id] = data["feedback"] + if data.get("outputs"): + previous_outputs[run_id] = data["outputs"] + + embedded = { + "skill_name": skill_name, + "runs": runs, + "previous_feedback": previous_feedback, + "previous_outputs": previous_outputs, + } + if benchmark: + embedded["benchmark"] = benchmark + + data_json = json.dumps(embedded) + + return template.replace("/*__EMBEDDED_DATA__*/", f"const EMBEDDED_DATA = {data_json};") + + +# --------------------------------------------------------------------------- +# HTTP server (stdlib only, zero dependencies) +# --------------------------------------------------------------------------- + +def _kill_port(port: int) -> None: + """Kill any process listening on the given port.""" + try: + result = subprocess.run( + ["lsof", "-ti", f":{port}"], + capture_output=True, text=True, timeout=5, + ) + for pid_str in result.stdout.strip().split("\n"): + if pid_str.strip(): + try: + os.kill(int(pid_str.strip()), signal.SIGTERM) + except (ProcessLookupError, ValueError): + pass + if result.stdout.strip(): + time.sleep(0.5) + except subprocess.TimeoutExpired: + pass + except FileNotFoundError: + logger.warning("Note: lsof not found, cannot check if port is in use") + +class ReviewHandler(BaseHTTPRequestHandler): + """Serves the review HTML and handles feedback saves. + + Regenerates the HTML on each page load so that refreshing the browser + picks up new eval outputs without restarting the server. 
+ """ + + def __init__( + self, + workspace: Path, + skill_name: str, + feedback_path: Path, + previous: dict[str, dict], + benchmark_path: Path | None, + *args, + **kwargs, + ): + self.workspace = workspace + self.skill_name = skill_name + self.feedback_path = feedback_path + self.previous = previous + self.benchmark_path = benchmark_path + super().__init__(*args, **kwargs) + + def do_GET(self) -> None: + if self.path == "/" or self.path == "/index.html": + # Regenerate HTML on each request (re-scans workspace for new outputs) + runs = find_runs(self.workspace) + benchmark = None + if self.benchmark_path and self.benchmark_path.exists(): + try: + benchmark = json.loads(self.benchmark_path.read_text()) + except (json.JSONDecodeError, OSError): + pass + html = generate_html(runs, self.skill_name, self.previous, benchmark) + content = html.encode("utf-8") + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(content))) + self.end_headers() + self.wfile.write(content) + elif self.path == "/api/feedback": + data = b"{}" + if self.feedback_path.exists(): + data = self.feedback_path.read_bytes() + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(data))) + self.end_headers() + self.wfile.write(data) + else: + self.send_error(404) + + def do_POST(self) -> None: + if self.path == "/api/feedback": + length = int(self.headers.get("Content-Length", 0)) + body = self.rfile.read(length) + try: + data = json.loads(body) + if not isinstance(data, dict) or "reviews" not in data: + raise ValueError("Expected JSON object with 'reviews' key") + self.feedback_path.write_text(json.dumps(data, indent=2) + "\n") + resp = b'{"ok":true}' + self.send_response(200) + except (json.JSONDecodeError, OSError, ValueError) as e: + resp = json.dumps({"error": str(e)}).encode() + self.send_response(500) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(resp))) + self.end_headers() + self.wfile.write(resp) + else: + self.send_error(404) + + def log_message(self, format: str, *args: object) -> None: + # Suppress request logging to keep terminal clean + pass + + +def main() -> None: + parser = argparse.ArgumentParser(description="Generate and serve eval review") + parser.add_argument("workspace", type=Path, help="Path to workspace directory") + parser.add_argument("--port", "-p", type=int, default=3117, help="Server port (default: 3117)") + parser.add_argument("--skill-name", "-n", type=str, default=None, help="Skill name for header") + parser.add_argument( + "--previous-workspace", type=Path, default=None, + help="Path to previous iteration's workspace (shows old outputs and feedback as context)", + ) + parser.add_argument( + "--benchmark", type=Path, default=None, + help="Path to benchmark.json to show in the Benchmark tab", + ) + parser.add_argument( + "--static", "-s", type=Path, default=None, + help="Write standalone HTML to this path instead of starting a server", + ) + args = parser.parse_args() + + workspace = args.workspace.resolve() + if not workspace.is_dir(): + logger.error(f"Error: {workspace} is not a directory") + sys.exit(1) + + runs = find_runs(workspace) + if not runs: + logger.error(f"No runs found in {workspace}") + sys.exit(1) + + skill_name = args.skill_name or workspace.name.replace("-workspace", "") + feedback_path = workspace / "feedback.json" + + previous: dict[str, dict] = {} + if 
args.previous_workspace: + previous = load_previous_iteration(args.previous_workspace.resolve()) + + benchmark_path = args.benchmark.resolve() if args.benchmark else None + benchmark = None + if benchmark_path and benchmark_path.exists(): + try: + benchmark = json.loads(benchmark_path.read_text()) + except (json.JSONDecodeError, OSError): + pass + + if args.static: + html = generate_html(runs, skill_name, previous, benchmark) + args.static.parent.mkdir(parents=True, exist_ok=True) + args.static.write_text(html) + logger.info(f"\n Static viewer written to: {args.static}\n") + sys.exit(0) + + # Kill any existing process on the target port + port = args.port + _kill_port(port) + handler = partial(ReviewHandler, workspace, skill_name, feedback_path, previous, benchmark_path) + try: + server = HTTPServer(("127.0.0.1", port), handler) + except OSError: + # Port still in use after kill attempt — find a free one + server = HTTPServer(("127.0.0.1", 0), handler) + port = server.server_address[1] + + url = f"http://localhost:{port}" + logger.info(f"\n Eval Viewer") + logger.info(f" ─────────────────────────────────") + logger.info(f" URL: {url}") + logger.info(f" Workspace: {workspace}") + logger.info(f" Feedback: {feedback_path}") + if previous: + logger.info(f" Previous: {args.previous_workspace} ({len(previous)} runs)") + if benchmark_path: + logger.info(f" Benchmark: {benchmark_path}") + logger.info(f"\n Press Ctrl+C to stop.\n") + + webbrowser.open(url) + + try: + server.serve_forever() + except KeyboardInterrupt: + logger.info("\nStopped.") + server.server_close() + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/eval-viewer/viewer.html b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/eval-viewer/viewer.html new file mode 100644 index 000000000..6d8e96348 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/eval-viewer/viewer.html @@ -0,0 +1,1325 @@ + + + + + + Eval Review + + + + + + + +
+
+
+

Eval Review:

+
Review each output and leave feedback below. Navigate with arrow keys or buttons. When done, copy feedback and paste into Claude Code.
+
+
+
+ + + + + +
+
+ +
+
Prompt
+
+
+
+
+ + +
+
Output
+
+
No output files found
+
+
+ + + + + + + + +
+
Your Feedback
+
+ + + +
+
+
+ + +
+ + +
+
+
No benchmark data available. Run a benchmark to see quantitative results here.
+
+
+
+ + +
+
+

Review Complete

+

Your feedback has been saved. Go back to your Claude Code session and tell Claude you're done reviewing.

+
+ +
+
+
+ + +
+ + + + diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/references/schemas.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/references/schemas.md new file mode 100644 index 000000000..b6eeaa2d4 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/references/schemas.md @@ -0,0 +1,430 @@ +# JSON Schemas + +This document defines the JSON schemas used by skill-creator. + +--- + +## evals.json + +Defines the evals for a skill. Located at `evals/evals.json` within the skill directory. + +```json +{ + "skill_name": "example-skill", + "evals": [ + { + "id": 1, + "prompt": "User's example prompt", + "expected_output": "Description of expected result", + "files": ["evals/files/sample1.pdf"], + "expectations": [ + "The output includes X", + "The skill used script Y" + ] + } + ] +} +``` + +**Fields:** +- `skill_name`: Name matching the skill's frontmatter +- `evals[].id`: Unique integer identifier +- `evals[].prompt`: The task to execute +- `evals[].expected_output`: Human-readable description of success +- `evals[].files`: Optional list of input file paths (relative to skill root) +- `evals[].expectations`: List of verifiable statements + +--- + +## history.json + +Tracks version progression in Improve mode. Located at workspace root. + +```json +{ + "started_at": "2026-01-15T10:30:00Z", + "skill_name": "pdf", + "current_best": "v2", + "iterations": [ + { + "version": "v0", + "parent": null, + "expectation_pass_rate": 0.65, + "grading_result": "baseline", + "is_current_best": false + }, + { + "version": "v1", + "parent": "v0", + "expectation_pass_rate": 0.75, + "grading_result": "won", + "is_current_best": false + }, + { + "version": "v2", + "parent": "v1", + "expectation_pass_rate": 0.85, + "grading_result": "won", + "is_current_best": true + } + ] +} +``` + +**Fields:** +- `started_at`: ISO timestamp of when improvement started +- `skill_name`: Name of the skill being improved +- `current_best`: Version identifier of the best performer +- `iterations[].version`: Version identifier (v0, v1, ...) +- `iterations[].parent`: Parent version this was derived from +- `iterations[].expectation_pass_rate`: Pass rate from grading +- `iterations[].grading_result`: "baseline", "won", "lost", or "tie" +- `iterations[].is_current_best`: Whether this is the current best version + +--- + +## grading.json + +Output from the grader agent. Located at `/grading.json`. + +```json +{ + "expectations": [ + { + "text": "The output includes the name 'John Smith'", + "passed": true, + "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'" + }, + { + "text": "The spreadsheet has a SUM formula in cell B10", + "passed": false, + "evidence": "No spreadsheet was created. The output was a text file." 
+ } + ], + "summary": { + "passed": 2, + "failed": 1, + "total": 3, + "pass_rate": 0.67 + }, + "execution_metrics": { + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8 + }, + "total_tool_calls": 15, + "total_steps": 6, + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 + }, + "timing": { + "executor_duration_seconds": 165.0, + "grader_duration_seconds": 26.0, + "total_duration_seconds": 191.0 + }, + "claims": [ + { + "claim": "The form has 12 fillable fields", + "type": "factual", + "verified": true, + "evidence": "Counted 12 fields in field_info.json" + } + ], + "user_notes_summary": { + "uncertainties": ["Used 2023 data, may be stale"], + "needs_review": [], + "workarounds": ["Fell back to text overlay for non-fillable fields"] + }, + "eval_feedback": { + "suggestions": [ + { + "assertion": "The output includes the name 'John Smith'", + "reason": "A hallucinated document that mentions the name would also pass" + } + ], + "overall": "Assertions check presence but not correctness." + } +} +``` + +**Fields:** +- `expectations[]`: Graded expectations with evidence +- `summary`: Aggregate pass/fail counts +- `execution_metrics`: Tool usage and output size (from executor's metrics.json) +- `timing`: Wall clock timing (from timing.json) +- `claims`: Extracted and verified claims from the output +- `user_notes_summary`: Issues flagged by the executor +- `eval_feedback`: (optional) Improvement suggestions for the evals, only present when the grader identifies issues worth raising + +--- + +## metrics.json + +Output from the executor agent. Located at `/outputs/metrics.json`. + +```json +{ + "tool_calls": { + "Read": 5, + "Write": 2, + "Bash": 8, + "Edit": 1, + "Glob": 2, + "Grep": 0 + }, + "total_tool_calls": 18, + "total_steps": 6, + "files_created": ["filled_form.pdf", "field_values.json"], + "errors_encountered": 0, + "output_chars": 12450, + "transcript_chars": 3200 +} +``` + +**Fields:** +- `tool_calls`: Count per tool type +- `total_tool_calls`: Sum of all tool calls +- `total_steps`: Number of major execution steps +- `files_created`: List of output files created +- `errors_encountered`: Number of errors during execution +- `output_chars`: Total character count of output files +- `transcript_chars`: Character count of transcript + +--- + +## timing.json + +Wall clock timing for a run. Located at `/timing.json`. + +**How to capture:** When a subagent task completes, the task notification includes `total_tokens` and `duration_ms`. Save these immediately — they are not persisted anywhere else and cannot be recovered after the fact. + +```json +{ + "total_tokens": 84852, + "duration_ms": 23332, + "total_duration_seconds": 23.3, + "executor_start": "2026-01-15T10:30:00Z", + "executor_end": "2026-01-15T10:32:45Z", + "executor_duration_seconds": 165.0, + "grader_start": "2026-01-15T10:32:46Z", + "grader_end": "2026-01-15T10:33:12Z", + "grader_duration_seconds": 26.0 +} +``` + +--- + +## benchmark.json + +Output from Benchmark mode. Located at `benchmarks//benchmark.json`. 
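+
+The viewer reads the field names below exactly (see the note after the example), so a quick structural check of a hand-assembled file can save a round trip. A minimal sketch; the required-key sets mirror the fields documented in this section:
+
+```python
+import json
+import sys
+
+REQUIRED_RUN_KEYS = {"eval_id", "configuration", "run_number", "result"}
+REQUIRED_RESULT_KEYS = {"pass_rate", "passed", "total", "time_seconds", "tokens", "errors"}
+
+def check_benchmark(path: str) -> list[str]:
+    """Return a list of schema problems found in a benchmark.json file."""
+    problems = []
+    data = json.loads(open(path).read())
+    for i, run in enumerate(data.get("runs", [])):
+        missing = REQUIRED_RUN_KEYS - run.keys()
+        if missing:
+            problems.append(f"runs[{i}] missing keys: {sorted(missing)}")
+        if run.get("configuration") not in ("with_skill", "without_skill"):
+            problems.append(f"runs[{i}] unexpected configuration: {run.get('configuration')!r}")
+        missing_result = REQUIRED_RESULT_KEYS - run.get("result", {}).keys()
+        if missing_result:
+            problems.append(f"runs[{i}].result missing keys: {sorted(missing_result)}")
+    return problems
+
+if __name__ == "__main__":
+    issues = check_benchmark(sys.argv[1])
+    print("\n".join(issues) or "benchmark.json looks structurally valid")
+```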
+ +```json +{ + "metadata": { + "skill_name": "pdf", + "skill_path": "/path/to/pdf", + "executor_model": "claude-sonnet-4-20250514", + "analyzer_model": "most-capable-model", + "timestamp": "2026-01-15T10:30:00Z", + "evals_run": [1, 2, 3], + "runs_per_configuration": 3 + }, + + "runs": [ + { + "eval_id": 1, + "eval_name": "Ocean", + "configuration": "with_skill", + "run_number": 1, + "result": { + "pass_rate": 0.85, + "passed": 6, + "failed": 1, + "total": 7, + "time_seconds": 42.5, + "tokens": 3800, + "tool_calls": 18, + "errors": 0 + }, + "expectations": [ + {"text": "...", "passed": true, "evidence": "..."} + ], + "notes": [ + "Used 2023 data, may be stale", + "Fell back to text overlay for non-fillable fields" + ] + } + ], + + "run_summary": { + "with_skill": { + "pass_rate": {"mean": 0.85, "stddev": 0.05, "min": 0.80, "max": 0.90}, + "time_seconds": {"mean": 45.0, "stddev": 12.0, "min": 32.0, "max": 58.0}, + "tokens": {"mean": 3800, "stddev": 400, "min": 3200, "max": 4100} + }, + "without_skill": { + "pass_rate": {"mean": 0.35, "stddev": 0.08, "min": 0.28, "max": 0.45}, + "time_seconds": {"mean": 32.0, "stddev": 8.0, "min": 24.0, "max": 42.0}, + "tokens": {"mean": 2100, "stddev": 300, "min": 1800, "max": 2500} + }, + "delta": { + "pass_rate": "+0.50", + "time_seconds": "+13.0", + "tokens": "+1700" + } + }, + + "notes": [ + "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value", + "Eval 3 shows high variance (50% ± 40%) - may be flaky or model-dependent", + "Without-skill runs consistently fail on table extraction expectations", + "Skill adds 13s average execution time but improves pass rate by 50%" + ] +} +``` + +**Fields:** +- `metadata`: Information about the benchmark run + - `skill_name`: Name of the skill + - `timestamp`: When the benchmark was run + - `evals_run`: List of eval names or IDs + - `runs_per_configuration`: Number of runs per config (e.g. 3) +- `runs[]`: Individual run results + - `eval_id`: Numeric eval identifier + - `eval_name`: Human-readable eval name (used as section header in the viewer) + - `configuration`: Must be `"with_skill"` or `"without_skill"` (the viewer uses this exact string for grouping and color coding) + - `run_number`: Integer run number (1, 2, 3...) + - `result`: Nested object with `pass_rate`, `passed`, `total`, `time_seconds`, `tokens`, `errors` +- `run_summary`: Statistical aggregates per configuration + - `with_skill` / `without_skill`: Each contains `pass_rate`, `time_seconds`, `tokens` objects with `mean` and `stddev` fields + - `delta`: Difference strings like `"+0.50"`, `"+13.0"`, `"+1700"` +- `notes`: Freeform observations from the analyzer + +**Important:** The viewer reads these field names exactly. Using `config` instead of `configuration`, or putting `pass_rate` at the top level of a run instead of nested under `result`, will cause the viewer to show empty/zero values. Always reference this schema when generating benchmark.json manually. + +--- + +## comparison.json + +Output from blind comparator. Located at `/comparison-N.json`. + +```json +{ + "winner": "A", + "reasoning": "Output A provides a complete solution with proper formatting and all required fields. 
Output B is missing the date field and has formatting inconsistencies.", + "rubric": { + "A": { + "content": { + "correctness": 5, + "completeness": 5, + "accuracy": 4 + }, + "structure": { + "organization": 4, + "formatting": 5, + "usability": 4 + }, + "content_score": 4.7, + "structure_score": 4.3, + "overall_score": 9.0 + }, + "B": { + "content": { + "correctness": 3, + "completeness": 2, + "accuracy": 3 + }, + "structure": { + "organization": 3, + "formatting": 2, + "usability": 3 + }, + "content_score": 2.7, + "structure_score": 2.7, + "overall_score": 5.4 + } + }, + "output_quality": { + "A": { + "score": 9, + "strengths": ["Complete solution", "Well-formatted", "All fields present"], + "weaknesses": ["Minor style inconsistency in header"] + }, + "B": { + "score": 5, + "strengths": ["Readable output", "Correct basic structure"], + "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"] + } + }, + "expectation_results": { + "A": { + "passed": 4, + "total": 5, + "pass_rate": 0.80, + "details": [ + {"text": "Output includes name", "passed": true} + ] + }, + "B": { + "passed": 3, + "total": 5, + "pass_rate": 0.60, + "details": [ + {"text": "Output includes name", "passed": true} + ] + } + } +} +``` + +--- + +## analysis.json + +Output from post-hoc analyzer. Located at `/analysis.json`. + +```json +{ + "comparison_summary": { + "winner": "A", + "winner_skill": "path/to/winner/skill", + "loser_skill": "path/to/loser/skill", + "comparator_reasoning": "Brief summary of why comparator chose winner" + }, + "winner_strengths": [ + "Clear step-by-step instructions for handling multi-page documents", + "Included validation script that caught formatting errors" + ], + "loser_weaknesses": [ + "Vague instruction 'process the document appropriately' led to inconsistent behavior", + "No script for validation, agent had to improvise" + ], + "instruction_following": { + "winner": { + "score": 9, + "issues": ["Minor: skipped optional logging step"] + }, + "loser": { + "score": 6, + "issues": [ + "Did not use the skill's formatting template", + "Invented own approach instead of following step 3" + ] + } + }, + "improvement_suggestions": [ + { + "priority": "high", + "category": "instructions", + "suggestion": "Replace 'process the document appropriately' with explicit steps", + "expected_impact": "Would eliminate ambiguity that caused inconsistent behavior" + } + ], + "transcript_insights": { + "winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script", + "loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods" + } +} +``` diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/__init__.py b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/aggregate_benchmark.py b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/aggregate_benchmark.py new file mode 100644 index 000000000..ccc810819 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/aggregate_benchmark.py @@ -0,0 +1,403 @@ +#!/usr/bin/env python3 +""" +Aggregate individual run results into benchmark summary statistics. 
+ +Reads grading.json files from run directories and produces: +- run_summary with mean, stddev, min, max for each metric +- delta between with_skill and without_skill configurations + +Usage: + python aggregate_benchmark.py + +Example: + python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/ + +The script supports two directory layouts: + + Workspace layout (from skill-creator iterations): + / + └── eval-N/ + ├── with_skill/ + │ ├── run-1/grading.json + │ └── run-2/grading.json + └── without_skill/ + ├── run-1/grading.json + └── run-2/grading.json + + Legacy layout (with runs/ subdirectory): + / + └── runs/ + └── eval-N/ + ├── with_skill/ + │ └── run-1/grading.json + └── without_skill/ + └── run-1/grading.json +""" + +import argparse +import json +import math +import sys +from datetime import datetime, timezone +from pathlib import Path + +from loguru import logger + + +def calculate_stats(values: list[float]) -> dict: + """Calculate mean, stddev, min, max for a list of values.""" + if not values: + return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0} + + n = len(values) + mean = sum(values) / n + + if n > 1: + variance = sum((x - mean) ** 2 for x in values) / (n - 1) + stddev = math.sqrt(variance) + else: + stddev = 0.0 + + return { + "mean": round(mean, 4), + "stddev": round(stddev, 4), + "min": round(min(values), 4), + "max": round(max(values), 4) + } + + +def load_run_results(benchmark_dir: Path) -> dict: + """ + Load all run results from a benchmark directory. + + Returns dict keyed by config name (e.g. "with_skill"/"without_skill", + or "new_skill"/"old_skill"), each containing a list of run results. + """ + # Support both layouts: eval dirs directly under benchmark_dir, or under runs/ + runs_dir = benchmark_dir / "runs" + if runs_dir.exists(): + search_dir = runs_dir + elif list(benchmark_dir.glob("eval-*")): + search_dir = benchmark_dir + else: + logger.warning(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}") + return {} + + results: dict[str, list] = {} + + for eval_idx, eval_dir in enumerate(sorted(search_dir.glob("eval-*"))): + metadata_path = eval_dir / "eval_metadata.json" + if metadata_path.exists(): + try: + with open(metadata_path) as mf: + eval_id = json.load(mf).get("eval_id", eval_idx) + except (json.JSONDecodeError, OSError): + eval_id = eval_idx + else: + try: + eval_id = int(eval_dir.name.split("-")[1]) + except ValueError: + eval_id = eval_idx + + # Discover config directories dynamically rather than hardcoding names + for config_dir in sorted(eval_dir.iterdir()): + if not config_dir.is_dir(): + continue + # Skip non-config directories (inputs, outputs, etc.) 
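+            # (A directory counts as a configuration only if it contains at
+            # least one run-* subdirectory, e.g. with_skill/run-1/.)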
+ if not list(config_dir.glob("run-*")): + continue + config = config_dir.name + if config not in results: + results[config] = [] + + for run_dir in sorted(config_dir.glob("run-*")): + run_number = int(run_dir.name.split("-")[1]) + grading_file = run_dir / "grading.json" + + if not grading_file.exists(): + logger.warning(f"Warning: grading.json not found in {run_dir}") + continue + + try: + with open(grading_file) as f: + grading = json.load(f) + except json.JSONDecodeError as e: + logger.warning(f"Warning: Invalid JSON in {grading_file}: {e}") + continue + + # Extract metrics + result = { + "eval_id": eval_id, + "run_number": run_number, + "pass_rate": grading.get("summary", {}).get("pass_rate", 0.0), + "passed": grading.get("summary", {}).get("passed", 0), + "failed": grading.get("summary", {}).get("failed", 0), + "total": grading.get("summary", {}).get("total", 0), + } + + # Extract timing — check grading.json first, then sibling timing.json + timing = grading.get("timing", {}) + result["time_seconds"] = timing.get("total_duration_seconds", 0.0) + timing_file = run_dir / "timing.json" + if result["time_seconds"] == 0.0 and timing_file.exists(): + try: + with open(timing_file) as tf: + timing_data = json.load(tf) + result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0) + result["tokens"] = timing_data.get("total_tokens", 0) + except json.JSONDecodeError: + pass + + # Extract metrics if available + metrics = grading.get("execution_metrics", {}) + result["tool_calls"] = metrics.get("total_tool_calls", 0) + if not result.get("tokens"): + result["tokens"] = metrics.get("output_chars", 0) + result["errors"] = metrics.get("errors_encountered", 0) + + # Extract expectations — viewer requires fields: text, passed, evidence + raw_expectations = grading.get("expectations", []) + for exp in raw_expectations: + if "text" not in exp or "passed" not in exp: + logger.warning(f"Warning: expectation in {grading_file} missing required fields (text, passed, evidence): {exp}") + result["expectations"] = raw_expectations + + # Extract notes from user_notes_summary + notes_summary = grading.get("user_notes_summary", {}) + notes = [] + notes.extend(notes_summary.get("uncertainties", [])) + notes.extend(notes_summary.get("needs_review", [])) + notes.extend(notes_summary.get("workarounds", [])) + result["notes"] = notes + + results[config].append(result) + + return results + + +def aggregate_results(results: dict) -> dict: + """ + Aggregate run results into summary statistics. + + Returns run_summary with stats for each configuration and delta. 
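+
+    Example shape (numbers illustrative):
+        {"with_skill":    {"pass_rate": {"mean": 0.85, "stddev": 0.05, ...}, ...},
+         "without_skill": {...},
+         "delta": {"pass_rate": "+0.50", "time_seconds": "+13.0", "tokens": "+1700"}}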
+ """ + run_summary = {} + configs = list(results.keys()) + + for config in configs: + runs = results.get(config, []) + + if not runs: + run_summary[config] = { + "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}, + "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}, + "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0} + } + continue + + pass_rates = [r["pass_rate"] for r in runs] + times = [r["time_seconds"] for r in runs] + tokens = [r.get("tokens", 0) for r in runs] + + run_summary[config] = { + "pass_rate": calculate_stats(pass_rates), + "time_seconds": calculate_stats(times), + "tokens": calculate_stats(tokens) + } + + # Calculate delta between the first two configs (if two exist) + if len(configs) >= 2: + primary = run_summary.get(configs[0], {}) + baseline = run_summary.get(configs[1], {}) + else: + primary = run_summary.get(configs[0], {}) if configs else {} + baseline = {} + + delta_pass_rate = primary.get("pass_rate", {}).get("mean", 0) - baseline.get("pass_rate", {}).get("mean", 0) + delta_time = primary.get("time_seconds", {}).get("mean", 0) - baseline.get("time_seconds", {}).get("mean", 0) + delta_tokens = primary.get("tokens", {}).get("mean", 0) - baseline.get("tokens", {}).get("mean", 0) + + run_summary["delta"] = { + "pass_rate": f"{delta_pass_rate:+.2f}", + "time_seconds": f"{delta_time:+.1f}", + "tokens": f"{delta_tokens:+.0f}" + } + + return run_summary + + +def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict: + """ + Generate complete benchmark.json from run results. + """ + results = load_run_results(benchmark_dir) + run_summary = aggregate_results(results) + + # Build runs array for benchmark.json + runs = [] + for config in results: + for result in results[config]: + runs.append({ + "eval_id": result["eval_id"], + "configuration": config, + "run_number": result["run_number"], + "result": { + "pass_rate": result["pass_rate"], + "passed": result["passed"], + "failed": result["failed"], + "total": result["total"], + "time_seconds": result["time_seconds"], + "tokens": result.get("tokens", 0), + "tool_calls": result.get("tool_calls", 0), + "errors": result.get("errors", 0) + }, + "expectations": result["expectations"], + "notes": result["notes"] + }) + + # Determine eval IDs from results + eval_ids = sorted(set( + r["eval_id"] + for config in results.values() + for r in config + )) + + benchmark = { + "metadata": { + "skill_name": skill_name or "", + "skill_path": skill_path or "", + "executor_model": "", + "analyzer_model": "", + "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + "evals_run": eval_ids, + "runs_per_configuration": 3 + }, + "runs": runs, + "run_summary": run_summary, + "notes": [] # To be filled by analyzer + } + + return benchmark + + +def generate_markdown(benchmark: dict) -> str: + """Generate human-readable benchmark.md from benchmark data.""" + metadata = benchmark["metadata"] + run_summary = benchmark["run_summary"] + + # Determine config names (excluding "delta") + configs = [k for k in run_summary if k != "delta"] + config_a = configs[0] if len(configs) >= 1 else "config_a" + config_b = configs[1] if len(configs) >= 2 else "config_b" + label_a = config_a.replace("_", " ").title() + label_b = config_b.replace("_", " ").title() + + lines = [ + f"# Skill Benchmark: {metadata['skill_name']}", + "", + f"**Model**: {metadata['executor_model']}", + f"**Date**: {metadata['timestamp']}", + f"**Evals**: {', '.join(map(str, metadata['evals_run']))} 
({metadata['runs_per_configuration']} runs each per configuration)", + "", + "## Summary", + "", + f"| Metric | {label_a} | {label_b} | Delta |", + "|--------|------------|---------------|-------|", + ] + + a_summary = run_summary.get(config_a, {}) + b_summary = run_summary.get(config_b, {}) + delta = run_summary.get("delta", {}) + + # Format pass rate + a_pr = a_summary.get("pass_rate", {}) + b_pr = b_summary.get("pass_rate", {}) + lines.append(f"| Pass Rate | {a_pr.get('mean', 0)*100:.0f}% ± {a_pr.get('stddev', 0)*100:.0f}% | {b_pr.get('mean', 0)*100:.0f}% ± {b_pr.get('stddev', 0)*100:.0f}% | {delta.get('pass_rate', '—')} |") + + # Format time + a_time = a_summary.get("time_seconds", {}) + b_time = b_summary.get("time_seconds", {}) + lines.append(f"| Time | {a_time.get('mean', 0):.1f}s ± {a_time.get('stddev', 0):.1f}s | {b_time.get('mean', 0):.1f}s ± {b_time.get('stddev', 0):.1f}s | {delta.get('time_seconds', '—')}s |") + + # Format tokens + a_tokens = a_summary.get("tokens", {}) + b_tokens = b_summary.get("tokens", {}) + lines.append(f"| Tokens | {a_tokens.get('mean', 0):.0f} ± {a_tokens.get('stddev', 0):.0f} | {b_tokens.get('mean', 0):.0f} ± {b_tokens.get('stddev', 0):.0f} | {delta.get('tokens', '—')} |") + + # Notes section + if benchmark.get("notes"): + lines.extend([ + "", + "## Notes", + "" + ]) + for note in benchmark["notes"]: + lines.append(f"- {note}") + + return "\n".join(lines) + + +def main(): + parser = argparse.ArgumentParser( + description="Aggregate benchmark run results into summary statistics" + ) + parser.add_argument( + "benchmark_dir", + type=Path, + help="Path to the benchmark directory" + ) + parser.add_argument( + "--skill-name", + default="", + help="Name of the skill being benchmarked" + ) + parser.add_argument( + "--skill-path", + default="", + help="Path to the skill being benchmarked" + ) + parser.add_argument( + "--output", "-o", + type=Path, + help="Output path for benchmark.json (default: /benchmark.json)" + ) + + args = parser.parse_args() + + if not args.benchmark_dir.exists(): + logger.error(f"Directory not found: {args.benchmark_dir}") + sys.exit(1) + + # Generate benchmark + benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path) + + # Determine output paths + output_json = args.output or (args.benchmark_dir / "benchmark.json") + output_md = output_json.with_suffix(".md") + + # Write benchmark.json + with open(output_json, "w") as f: + json.dump(benchmark, f, indent=2) + logger.info(f"Generated: {output_json}") + + # Write benchmark.md + markdown = generate_markdown(benchmark) + with open(output_md, "w") as f: + f.write(markdown) + logger.info(f"Generated: {output_md}") + + # Print summary + run_summary = benchmark["run_summary"] + configs = [k for k in run_summary if k != "delta"] + delta = run_summary.get("delta", {}) + + logger.info(f"\nSummary:") + for config in configs: + pr = run_summary[config]["pass_rate"]["mean"] + label = config.replace("_", " ").title() + logger.info(f" {label}: {pr*100:.1f}% pass rate") + logger.info(f" Delta: {delta.get('pass_rate', '—')}") + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/generate_report.py b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/generate_report.py new file mode 100644 index 000000000..395232d96 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/generate_report.py @@ -0,0 +1,328 @@ 
+#!/usr/bin/env python3 +"""Generate an HTML report from run_loop.py output. + +Takes the JSON output from run_loop.py and generates a visual HTML report +showing each description attempt with check/x for each test case. +Distinguishes between train and test queries. +""" + +import argparse +import html +import json +import sys +from pathlib import Path + +from loguru import logger + + +def generate_html(data: dict, auto_refresh: bool = False, skill_name: str = "") -> str: + """Generate HTML report from loop output data. If auto_refresh is True, adds a meta refresh tag.""" + history = data.get("history", []) + holdout = data.get("holdout", 0) + title_prefix = html.escape(skill_name + " \u2014 ") if skill_name else "" + + # Get all unique queries from train and test sets, with should_trigger info + train_queries: list[dict] = [] + test_queries: list[dict] = [] + if history: + for r in history[0].get("train_results", history[0].get("results", [])): + train_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)}) + if history[0].get("test_results"): + for r in history[0].get("test_results", []): + test_queries.append({"query": r["query"], "should_trigger": r.get("should_trigger", True)}) + + refresh_tag = ' \n' if auto_refresh else "" + + html_parts = [""" + + + +""" + refresh_tag + """ """ + title_prefix + """Skill Description Optimization + + + + + + +

""" + title_prefix + """Skill Description Optimization

+
+ Optimizing your skill's description. This page updates automatically as Claude tests different versions of your skill's description. Each row is an iteration — a new description attempt. The columns show test queries: green checkmarks mean the skill triggered correctly (or correctly didn't trigger), red crosses mean it got it wrong. The "Train" score shows performance on queries used to improve the description; the "Test" score shows performance on held-out queries the optimizer hasn't seen. When it's done, Claude will apply the best-performing description to your skill. +
+"""] + + # Summary section + best_test_score = data.get('best_test_score') + best_train_score = data.get('best_train_score') + html_parts.append(f""" +
+

Original: {html.escape(data.get('original_description', 'N/A'))}

+

Best: {html.escape(data.get('best_description', 'N/A'))}

+

Best Score: {data.get('best_score', 'N/A')} {'(test)' if best_test_score else '(train)'}

+

Iterations: {data.get('iterations_run', 0)} | Train: {data.get('train_size', '?')} | Test: {data.get('test_size', '?')}

+
+""") + + # Legend + html_parts.append(""" +
+ Query columns: + Should trigger + Should NOT trigger + Train + Test +
+""") + + # Table header + html_parts.append(""" +
+ + + + + + + +""") + + # Add column headers for train queries + for qinfo in train_queries: + polarity = "positive-col" if qinfo["should_trigger"] else "negative-col" + html_parts.append(f' \n') + + # Add column headers for test queries (different color) + for qinfo in test_queries: + polarity = "positive-col" if qinfo["should_trigger"] else "negative-col" + html_parts.append(f' \n') + + html_parts.append(""" + + +""") + + # Find best iteration for highlighting + if test_queries: + best_iter = max(history, key=lambda h: h.get("test_passed") or 0).get("iteration") + else: + best_iter = max(history, key=lambda h: h.get("train_passed", h.get("passed", 0))).get("iteration") + + # Add rows for each iteration + for h in history: + iteration = h.get("iteration", "?") + train_passed = h.get("train_passed", h.get("passed", 0)) + train_total = h.get("train_total", h.get("total", 0)) + test_passed = h.get("test_passed") + test_total = h.get("test_total") + description = h.get("description", "") + train_results = h.get("train_results", h.get("results", [])) + test_results = h.get("test_results", []) + + # Create lookups for results by query + train_by_query = {r["query"]: r for r in train_results} + test_by_query = {r["query"]: r for r in test_results} if test_results else {} + + # Compute aggregate correct/total runs across all retries + def aggregate_runs(results: list[dict]) -> tuple[int, int]: + correct = 0 + total = 0 + for r in results: + runs = r.get("runs", 0) + triggers = r.get("triggers", 0) + total += runs + if r.get("should_trigger", True): + correct += triggers + else: + correct += runs - triggers + return correct, total + + train_correct, train_runs = aggregate_runs(train_results) + test_correct, test_runs = aggregate_runs(test_results) + + # Determine score classes + def score_class(correct: int, total: int) -> str: + if total > 0: + ratio = correct / total + if ratio >= 0.8: + return "score-good" + elif ratio >= 0.5: + return "score-ok" + return "score-bad" + + train_class = score_class(train_correct, train_runs) + test_class = score_class(test_correct, test_runs) + + row_class = "best-row" if iteration == best_iter else "" + + html_parts.append(f""" + + + + +""") + + # Add result for each train query + for qinfo in train_queries: + r = train_by_query.get(qinfo["query"], {}) + did_pass = r.get("pass", False) + triggers = r.get("triggers", 0) + runs = r.get("runs", 0) + + icon = "✓" if did_pass else "✗" + css_class = "pass" if did_pass else "fail" + + html_parts.append(f' \n') + + # Add result for each test query (with different background) + for qinfo in test_queries: + r = test_by_query.get(qinfo["query"], {}) + did_pass = r.get("pass", False) + triggers = r.get("triggers", 0) + runs = r.get("runs", 0) + + icon = "✓" if did_pass else "✗" + css_class = "pass" if did_pass else "fail" + + html_parts.append(f' \n') + + html_parts.append(" \n") + + html_parts.append(""" +
IterTrainTestDescription{html.escape(qinfo["query"])}{html.escape(qinfo["query"])}
{iteration}{train_correct}/{train_runs}{test_correct}/{test_runs}{html.escape(description)}{icon}{triggers}/{runs}{icon}{triggers}/{runs}
+
+""") + + html_parts.append(""" + + +""") + + return "".join(html_parts) + + +def main(): + parser = argparse.ArgumentParser(description="Generate HTML report from run_loop output") + parser.add_argument("input", help="Path to JSON output from run_loop.py (or - for stdin)") + parser.add_argument("-o", "--output", default=None, help="Output HTML file (default: stdout)") + parser.add_argument("--skill-name", default="", help="Skill name to include in the report title") + args = parser.parse_args() + + if args.input == "-": + data = json.load(sys.stdin) + else: + data = json.loads(Path(args.input).read_text()) + + html_output = generate_html(data, skill_name=args.skill_name) + + if args.output: + Path(args.output).write_text(html_output) + logger.info(f"Report written to {args.output}") + else: + print(html_output) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/improve_description.py b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/improve_description.py new file mode 100644 index 000000000..887a06a08 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/improve_description.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 +"""Improve a skill description based on eval results. + +Takes eval results (from run_eval.py) and generates an improved description +using Claude with extended thinking. +""" + +import argparse +import json +import re +import sys +from pathlib import Path + +import anthropic +from loguru import logger + +from scripts.utils import parse_skill_md + + +def improve_description( + client: anthropic.Anthropic, + skill_name: str, + skill_content: str, + current_description: str, + eval_results: dict, + history: list[dict], + model: str, + test_results: dict | None = None, + log_dir: Path | None = None, + iteration: int | None = None, +) -> str: + """Call Claude to improve the description based on eval results.""" + failed_triggers = [ + r for r in eval_results["results"] + if r["should_trigger"] and not r["pass"] + ] + false_triggers = [ + r for r in eval_results["results"] + if not r["should_trigger"] and not r["pass"] + ] + + # Build scores summary + train_score = f"{eval_results['summary']['passed']}/{eval_results['summary']['total']}" + if test_results: + test_score = f"{test_results['summary']['passed']}/{test_results['summary']['total']}" + scores_summary = f"Train: {train_score}, Test: {test_score}" + else: + scores_summary = f"Train: {train_score}" + + prompt = f"""You are optimizing a skill description for a Claude Code skill called "{skill_name}". A "skill" is sort of like a prompt, but with progressive disclosure -- there's a title and description that Claude sees when deciding whether to use the skill, and then if it does use the skill, it reads the .md file which has lots more details and potentially links to other resources in the skill folder like helper files and scripts and additional documentation or examples. + +The description appears in Claude's "available_skills" list. When a user sends a query, Claude decides whether to invoke the skill based solely on the title and on this description. Your goal is to write a description that triggers for relevant queries, and doesn't trigger for irrelevant ones. 
+ +Here's the current description: + +"{current_description}" + + +Current scores ({scores_summary}): + +""" + if failed_triggers: + prompt += "FAILED TO TRIGGER (should have triggered but didn't):\n" + for r in failed_triggers: + prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n' + prompt += "\n" + + if false_triggers: + prompt += "FALSE TRIGGERS (triggered but shouldn't have):\n" + for r in false_triggers: + prompt += f' - "{r["query"]}" (triggered {r["triggers"]}/{r["runs"]} times)\n' + prompt += "\n" + + if history: + prompt += "PREVIOUS ATTEMPTS (do NOT repeat these — try something structurally different):\n\n" + for h in history: + train_s = f"{h.get('train_passed', h.get('passed', 0))}/{h.get('train_total', h.get('total', 0))}" + test_s = f"{h.get('test_passed', '?')}/{h.get('test_total', '?')}" if h.get('test_passed') is not None else None + score_str = f"train={train_s}" + (f", test={test_s}" if test_s else "") + prompt += f'\n' + prompt += f'Description: "{h["description"]}"\n' + if "results" in h: + prompt += "Train results:\n" + for r in h["results"]: + status = "PASS" if r["pass"] else "FAIL" + prompt += f' [{status}] "{r["query"][:80]}" (triggered {r["triggers"]}/{r["runs"]})\n' + if h.get("note"): + prompt += f'Note: {h["note"]}\n' + prompt += "\n\n" + + prompt += f""" + +Skill content (for context on what the skill does): + +{skill_content} + + +Based on the failures, write a new and improved description that is more likely to trigger correctly. When I say "based on the failures", it's a bit of a tricky line to walk because we don't want to overfit to the specific cases you're seeing. So what I DON'T want you to do is produce an ever-expanding list of specific queries that this skill should or shouldn't trigger for. Instead, try to generalize from the failures to broader categories of user intent and situations where this skill would be useful or not useful. The reason for this is twofold: + +1. Avoid overfitting +2. The list might get loooong and it's injected into ALL queries and there might be a lot of skills, so we don't want to blow too much space on any given description. + +Concretely, your description should not be more than about 100-200 words, even if that comes at the cost of accuracy. + +Here are some tips that we've found to work well in writing these descriptions: +- The skill should be phrased in the imperative -- "Use this skill for" rather than "this skill does" +- The skill description should focus on the user's intent, what they are trying to achieve, vs. the implementation details of how the skill works. +- The description competes with other skills for Claude's attention — make it distinctive and immediately recognizable. +- If you're getting lots of failures after repeated attempts, change things up. Try different sentence structures or wordings. + +I'd encourage you to be creative and mix up the style in different iterations since you'll have multiple opportunities to try different approaches and we'll just grab the highest-scoring one at the end. 
+
+Please respond with only the new description text in <description> tags, nothing else."""
+
+    response = client.messages.create(
+        model=model,
+        max_tokens=16000,
+        thinking={
+            "type": "enabled",
+            "budget_tokens": 10000,
+        },
+        messages=[{"role": "user", "content": prompt}],
+    )
+
+    # Extract thinking and text from response
+    thinking_text = ""
+    text = ""
+    for block in response.content:
+        if block.type == "thinking":
+            thinking_text = block.thinking
+        elif block.type == "text":
+            text = block.text
+
+    # Parse out the <description> tags
+    match = re.search(r"<description>(.*?)</description>", text, re.DOTALL)
+    description = match.group(1).strip().strip('"') if match else text.strip().strip('"')
+
+    # Log the transcript
+    transcript: dict = {
+        "iteration": iteration,
+        "prompt": prompt,
+        "thinking": thinking_text,
+        "response": text,
+        "parsed_description": description,
+        "char_count": len(description),
+        "over_limit": len(description) > 1024,
+    }
+
+    # If over 1024 chars, ask the model to shorten it
+    if len(description) > 1024:
+        shorten_prompt = f"Your description is {len(description)} characters, which exceeds the hard 1024 character limit. Please rewrite it to be under 1024 characters while preserving the most important trigger words and intent coverage. Respond with only the new description in <description> tags."
+        shorten_response = client.messages.create(
+            model=model,
+            max_tokens=16000,
+            thinking={
+                "type": "enabled",
+                "budget_tokens": 10000,
+            },
+            messages=[
+                {"role": "user", "content": prompt},
+                {"role": "assistant", "content": text},
+                {"role": "user", "content": shorten_prompt},
+            ],
+        )
+
+        shorten_thinking = ""
+        shorten_text = ""
+        for block in shorten_response.content:
+            if block.type == "thinking":
+                shorten_thinking = block.thinking
+            elif block.type == "text":
+                shorten_text = block.text
+
+        match = re.search(r"<description>(.*?)</description>", shorten_text, re.DOTALL)
+        shortened = match.group(1).strip().strip('"') if match else shorten_text.strip().strip('"')
+
+        transcript["rewrite_prompt"] = shorten_prompt
+        transcript["rewrite_thinking"] = shorten_thinking
+        transcript["rewrite_response"] = shorten_text
+        transcript["rewrite_description"] = shortened
+        transcript["rewrite_char_count"] = len(shortened)
+        description = shortened
+
+    transcript["final_description"] = description
+
+    if log_dir:
+        log_dir.mkdir(parents=True, exist_ok=True)
+        log_file = log_dir / f"improve_iter_{iteration or 'unknown'}.json"
+        log_file.write_text(json.dumps(transcript, indent=2))
+
+    return description
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Improve a skill description based on eval results")
+    parser.add_argument("--eval-results", required=True, help="Path to eval results JSON (from run_eval.py)")
+    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
+    parser.add_argument("--history", default=None, help="Path to history JSON (previous attempts)")
+    parser.add_argument("--model", required=True, help="Model for improvement")
+    parser.add_argument("--verbose", action="store_true", help="Print thinking to stderr")
+    args = parser.parse_args()
+
+    skill_path = Path(args.skill_path)
+    if not (skill_path / "SKILL.md").exists():
+        logger.error(f"Error: No SKILL.md found at {skill_path}")
+        sys.exit(1)
+
+    eval_results = json.loads(Path(args.eval_results).read_text())
+    history = []
+    if args.history:
+        history = json.loads(Path(args.history).read_text())
+
+    name, _, content = parse_skill_md(skill_path)
+    current_description = eval_results["description"]
+
+    if args.verbose:
+        logger.info(f"Current: 
{current_description}") + logger.info(f"Score: {eval_results['summary']['passed']}/{eval_results['summary']['total']}") + + client = anthropic.Anthropic() + new_description = improve_description( + client=client, + skill_name=name, + skill_content=content, + current_description=current_description, + eval_results=eval_results, + history=history, + model=args.model, + ) + + if args.verbose: + logger.info(f"Improved: {new_description}") + + # Output as JSON with both the new description and updated history + output = { + "description": new_description, + "history": history + [{ + "description": current_description, + "passed": eval_results["summary"]["passed"], + "failed": eval_results["summary"]["failed"], + "total": eval_results["summary"]["total"], + "results": eval_results["results"], + }], + } + print(json.dumps(output, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/package_skill.py b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/package_skill.py new file mode 100644 index 000000000..5dbdf7843 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/package_skill.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +""" +Skill Packager - Creates a distributable .skill file of a skill folder + +Usage: + python utils/package_skill.py [output-directory] + +Example: + python utils/package_skill.py skills/public/my-skill + python utils/package_skill.py skills/public/my-skill ./dist +""" + +import fnmatch +import sys +import zipfile +from pathlib import Path + +from loguru import logger +from scripts.quick_validate import validate_skill + +# Patterns to exclude when packaging skills. +EXCLUDE_DIRS = {"__pycache__", "node_modules"} +EXCLUDE_GLOBS = {"*.pyc"} +EXCLUDE_FILES = {".DS_Store"} +# Directories excluded only at the skill root (not when nested deeper). +ROOT_EXCLUDE_DIRS = {"evals"} + + +def should_exclude(rel_path: Path) -> bool: + """Check if a path should be excluded from packaging.""" + parts = rel_path.parts + if any(part in EXCLUDE_DIRS for part in parts): + return True + # rel_path is relative to skill_path.parent, so parts[0] is the skill + # folder name and parts[1] (if present) is the first subdir. + if len(parts) > 1 and parts[1] in ROOT_EXCLUDE_DIRS: + return True + name = rel_path.name + if name in EXCLUDE_FILES: + return True + return any(fnmatch.fnmatch(name, pat) for pat in EXCLUDE_GLOBS) + + +def package_skill(skill_path, output_dir=None): + """ + Package a skill folder into a .skill file. 
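+
+    The archive is a standard zip whose top-level directory is the skill folder
+    name. Build artifacts (__pycache__, node_modules, *.pyc, .DS_Store) and the
+    skill's root-level evals/ directory are excluded.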
+ + Args: + skill_path: Path to the skill folder + output_dir: Optional output directory for the .skill file (defaults to current directory) + + Returns: + Path to the created .skill file, or None if error + """ + skill_path = Path(skill_path).resolve() + + # Validate skill folder exists + if not skill_path.exists(): + logger.error(f"Skill folder not found: {skill_path}") + return None + + if not skill_path.is_dir(): + logger.error(f"Path is not a directory: {skill_path}") + return None + + # Validate SKILL.md exists + skill_md = skill_path / "SKILL.md" + if not skill_md.exists(): + logger.error(f"SKILL.md not found in {skill_path}") + return None + + # Run validation before packaging + logger.info("Validating skill...") + valid, message = validate_skill(skill_path) + if not valid: + logger.error(f"Validation failed: {message}") + logger.error("Please fix the validation errors before packaging.") + return None + logger.info(f"{message}\n") + + # Determine output location + skill_name = skill_path.name + if output_dir: + output_path = Path(output_dir).resolve() + output_path.mkdir(parents=True, exist_ok=True) + else: + output_path = Path.cwd() + + skill_filename = output_path / f"{skill_name}.skill" + + # Create the .skill file (zip format) + try: + with zipfile.ZipFile(skill_filename, 'w', zipfile.ZIP_DEFLATED) as zipf: + # Walk through the skill directory, excluding build artifacts + for file_path in skill_path.rglob('*'): + if not file_path.is_file(): + continue + arcname = file_path.relative_to(skill_path.parent) + if should_exclude(arcname): + logger.debug(f"Skipped: {arcname}") + continue + zipf.write(file_path, arcname) + logger.debug(f"Added: {arcname}") + + logger.info(f"Successfully packaged skill to: {skill_filename}") + return skill_filename + + except Exception as e: + logger.error(f"Error creating .skill file: {e}") + return None + + +def main(): + if len(sys.argv) < 2: + logger.info("Usage: python utils/package_skill.py [output-directory]") + logger.info("\nExample:") + logger.info(" python utils/package_skill.py skills/public/my-skill") + logger.info(" python utils/package_skill.py skills/public/my-skill ./dist") + sys.exit(1) + + skill_path = sys.argv[1] + output_dir = sys.argv[2] if len(sys.argv) > 2 else None + + logger.info(f"Packaging skill: {skill_path}") + if output_dir: + logger.info(f" Output directory: {output_dir}") + logger.info("") + + result = package_skill(skill_path, output_dir) + + if result: + sys.exit(0) + else: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/quick_validate.py b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/quick_validate.py new file mode 100644 index 000000000..36553161e --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/quick_validate.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +""" +Quick validation script for skills - minimal version +""" + +import sys +import os +import re +import yaml +from pathlib import Path + +from loguru import logger + +def validate_skill(skill_path): + """Basic validation of a skill""" + skill_path = Path(skill_path) + + # Check SKILL.md exists + skill_md = skill_path / 'SKILL.md' + if not skill_md.exists(): + return False, "SKILL.md not found" + + # Read and validate frontmatter + content = skill_md.read_text() + if not content.startswith('---'): + return False, "No YAML frontmatter found" + + # Extract 
frontmatter + match = re.match(r'^---\n(.*?)\n---', content, re.DOTALL) + if not match: + return False, "Invalid frontmatter format" + + frontmatter_text = match.group(1) + + # Parse YAML frontmatter + try: + frontmatter = yaml.safe_load(frontmatter_text) + if not isinstance(frontmatter, dict): + return False, "Frontmatter must be a YAML dictionary" + except yaml.YAMLError as e: + return False, f"Invalid YAML in frontmatter: {e}" + + # Define allowed properties + ALLOWED_PROPERTIES = {'name', 'description', 'license', 'allowed-tools', 'metadata', 'compatibility'} + + # Check for unexpected properties (excluding nested keys under metadata) + unexpected_keys = set(frontmatter.keys()) - ALLOWED_PROPERTIES + if unexpected_keys: + return False, ( + f"Unexpected key(s) in SKILL.md frontmatter: {', '.join(sorted(unexpected_keys))}. " + f"Allowed properties are: {', '.join(sorted(ALLOWED_PROPERTIES))}" + ) + + # Check required fields + if 'name' not in frontmatter: + return False, "Missing 'name' in frontmatter" + if 'description' not in frontmatter: + return False, "Missing 'description' in frontmatter" + + # Extract name for validation + name = frontmatter.get('name', '') + if not isinstance(name, str): + return False, f"Name must be a string, got {type(name).__name__}" + name = name.strip() + if name: + # Check naming convention (kebab-case: lowercase with hyphens) + if not re.match(r'^[a-z0-9-]+$', name): + return False, f"Name '{name}' should be kebab-case (lowercase letters, digits, and hyphens only)" + if name.startswith('-') or name.endswith('-') or '--' in name: + return False, f"Name '{name}' cannot start/end with hyphen or contain consecutive hyphens" + # Check name length (max 64 characters per spec) + if len(name) > 64: + return False, f"Name is too long ({len(name)} characters). Maximum is 64 characters." + + # Extract and validate description + description = frontmatter.get('description', '') + if not isinstance(description, str): + return False, f"Description must be a string, got {type(description).__name__}" + description = description.strip() + if description: + # Check for angle brackets + if '<' in description or '>' in description: + return False, "Description cannot contain angle brackets (< or >)" + # Check description length (max 1024 characters per spec) + if len(description) > 1024: + return False, f"Description is too long ({len(description)} characters). Maximum is 1024 characters." + + # Validate compatibility field if present (optional) + compatibility = frontmatter.get('compatibility', '') + if compatibility: + if not isinstance(compatibility, str): + return False, f"Compatibility must be a string, got {type(compatibility).__name__}" + if len(compatibility) > 500: + return False, f"Compatibility is too long ({len(compatibility)} characters). Maximum is 500 characters." + + return True, "Skill is valid!" 
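+
+# Illustrative example of a frontmatter block that passes the checks above
+# (hypothetical values; any kebab-case name and an angle-bracket-free
+# description under 1024 characters would do):
+#
+#   ---
+#   name: report-formatter
+#   description: Use this skill when the user wants to turn raw notes into a formatted report.
+#   license: MIT
+#   ---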
+ +if __name__ == "__main__": + if len(sys.argv) != 2: + logger.info("Usage: python quick_validate.py ") + sys.exit(1) + + valid, message = validate_skill(sys.argv[1]) + if valid: + logger.info(message) + else: + logger.error(message) + sys.exit(0 if valid else 1) \ No newline at end of file diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/run_eval.py b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/run_eval.py new file mode 100644 index 000000000..f923066ca --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/run_eval.py @@ -0,0 +1,312 @@ +#!/usr/bin/env python3 +"""Run trigger evaluation for a skill description. + +Tests whether a skill's description causes Claude to trigger (read the skill) +for a set of queries. Outputs results as JSON. +""" + +import argparse +import json +import os +import select +import subprocess +import sys +import time +import uuid +from concurrent.futures import ProcessPoolExecutor, as_completed +from pathlib import Path + +from loguru import logger + +from scripts.utils import parse_skill_md + + +def find_project_root() -> Path: + """Find the project root by walking up from cwd looking for .claude/. + + Mimics how Claude Code discovers its project root, so the command file + we create ends up where claude -p will look for it. + """ + current = Path.cwd() + for parent in [current, *current.parents]: + if (parent / ".claude").is_dir(): + return parent + return current + + +def run_single_query( + query: str, + skill_name: str, + skill_description: str, + timeout: int, + project_root: str, + model: str | None = None, +) -> bool: + """Run a single query and return whether the skill was triggered. + + Creates a command file in .claude/commands/ so it appears in Claude's + available_skills list, then runs `claude -p` with the raw query. + Uses --include-partial-messages to detect triggering early from + stream events (content_block_start) rather than waiting for the + full assistant message, which only arrives after tool execution. + """ + unique_id = uuid.uuid4().hex[:8] + clean_name = f"{skill_name}-skill-{unique_id}" + project_commands_dir = Path(project_root) / ".claude" / "commands" + command_file = project_commands_dir / f"{clean_name}.md" + + try: + project_commands_dir.mkdir(parents=True, exist_ok=True) + # Use YAML block scalar to avoid breaking on quotes in description + indented_desc = "\n ".join(skill_description.split("\n")) + command_content = ( + f"---\n" + f"description: |\n" + f" {indented_desc}\n" + f"---\n\n" + f"# {skill_name}\n\n" + f"This skill handles: {skill_description}\n" + ) + command_file.write_text(command_content) + + cmd = [ + "claude", + "-p", query, + "--output-format", "stream-json", + "--verbose", + "--include-partial-messages", + ] + if model: + cmd.extend(["--model", model]) + + # Remove CLAUDECODE env var to allow nesting claude -p inside a + # Claude Code session. The guard is for interactive terminal conflicts; + # programmatic subprocess usage is safe. 
+ env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"} + + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + cwd=project_root, + env=env, + ) + + triggered = False + start_time = time.time() + buffer = "" + # Track state for stream event detection + pending_tool_name = None + accumulated_json = "" + + try: + while time.time() - start_time < timeout: + if process.poll() is not None: + remaining = process.stdout.read() + if remaining: + buffer += remaining.decode("utf-8", errors="replace") + break + + ready, _, _ = select.select([process.stdout], [], [], 1.0) + if not ready: + continue + + chunk = os.read(process.stdout.fileno(), 8192) + if not chunk: + break + buffer += chunk.decode("utf-8", errors="replace") + + while "\n" in buffer: + line, buffer = buffer.split("\n", 1) + line = line.strip() + if not line: + continue + + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + + # Early detection via stream events + if event.get("type") == "stream_event": + se = event.get("event", {}) + se_type = se.get("type", "") + + if se_type == "content_block_start": + cb = se.get("content_block", {}) + if cb.get("type") == "tool_use": + tool_name = cb.get("name", "") + if tool_name in ("Skill", "Read"): + pending_tool_name = tool_name + accumulated_json = "" + else: + return False + + elif se_type == "content_block_delta" and pending_tool_name: + delta = se.get("delta", {}) + if delta.get("type") == "input_json_delta": + accumulated_json += delta.get("partial_json", "") + if clean_name in accumulated_json: + return True + + elif se_type in ("content_block_stop", "message_stop"): + if pending_tool_name: + return clean_name in accumulated_json + if se_type == "message_stop": + return False + + # Fallback: full assistant message + elif event.get("type") == "assistant": + message = event.get("message", {}) + for content_item in message.get("content", []): + if content_item.get("type") != "tool_use": + continue + tool_name = content_item.get("name", "") + tool_input = content_item.get("input", {}) + if tool_name == "Skill" and clean_name in tool_input.get("skill", ""): + triggered = True + elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""): + triggered = True + return triggered + + elif event.get("type") == "result": + return triggered + finally: + # Clean up process on any exit path (return, exception, timeout) + if process.poll() is None: + process.kill() + process.wait() + + return triggered + finally: + if command_file.exists(): + command_file.unlink() + + +def run_eval( + eval_set: list[dict], + skill_name: str, + description: str, + num_workers: int, + timeout: int, + project_root: Path, + runs_per_query: int = 1, + trigger_threshold: float = 0.5, + model: str | None = None, +) -> dict: + """Run the full eval set and return results.""" + results = [] + + with ProcessPoolExecutor(max_workers=num_workers) as executor: + future_to_info = {} + for item in eval_set: + for run_idx in range(runs_per_query): + future = executor.submit( + run_single_query, + item["query"], + skill_name, + description, + timeout, + str(project_root), + model, + ) + future_to_info[future] = (item, run_idx) + + query_triggers: dict[str, list[bool]] = {} + query_items: dict[str, dict] = {} + for future in as_completed(future_to_info): + item, _ = future_to_info[future] + query = item["query"] + query_items[query] = item + if query not in query_triggers: + query_triggers[query] = [] + try: + 
query_triggers[query].append(future.result()) + except Exception as e: + logger.warning(f"Warning: query failed: {e}") + query_triggers[query].append(False) + + for query, triggers in query_triggers.items(): + item = query_items[query] + trigger_rate = sum(triggers) / len(triggers) + should_trigger = item["should_trigger"] + if should_trigger: + did_pass = trigger_rate >= trigger_threshold + else: + did_pass = trigger_rate < trigger_threshold + results.append({ + "query": query, + "should_trigger": should_trigger, + "trigger_rate": trigger_rate, + "triggers": sum(triggers), + "runs": len(triggers), + "pass": did_pass, + }) + + passed = sum(1 for r in results if r["pass"]) + total = len(results) + + return { + "skill_name": skill_name, + "description": description, + "results": results, + "summary": { + "total": total, + "passed": passed, + "failed": total - passed, + }, + } + + +def main(): + parser = argparse.ArgumentParser(description="Run trigger evaluation for a skill description") + parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--description", default=None, help="Override description to test") + parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") + parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds") + parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query") + parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold") + parser.add_argument("--model", default=None, help="Model to use for claude -p (default: user's configured model)") + parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") + args = parser.parse_args() + + eval_set = json.loads(Path(args.eval_set).read_text()) + skill_path = Path(args.skill_path) + + if not (skill_path / "SKILL.md").exists(): + logger.error(f"Error: No SKILL.md found at {skill_path}") + sys.exit(1) + + name, original_description, content = parse_skill_md(skill_path) + description = args.description or original_description + project_root = find_project_root() + + if args.verbose: + logger.info(f"Evaluating: {description}") + + output = run_eval( + eval_set=eval_set, + skill_name=name, + description=description, + num_workers=args.num_workers, + timeout=args.timeout, + project_root=project_root, + runs_per_query=args.runs_per_query, + trigger_threshold=args.trigger_threshold, + model=args.model, + ) + + if args.verbose: + summary = output["summary"] + logger.info(f"Results: {summary['passed']}/{summary['total']} passed") + for r in output["results"]: + status = "PASS" if r["pass"] else "FAIL" + rate_str = f"{r['triggers']}/{r['runs']}" + logger.info(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:70]}") + + print(json.dumps(output, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/run_loop.py b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/run_loop.py new file mode 100644 index 000000000..a2907d6e0 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/run_loop.py @@ -0,0 +1,333 @@ +#!/usr/bin/env python3 +"""Run the eval + improve loop until all pass or max iterations reached. 
+ +Combines run_eval.py and improve_description.py in a loop, tracking history +and returning the best description found. Supports train/test split to prevent +overfitting. +""" + +import argparse +import json +import random +import sys +import tempfile +import time +import webbrowser +from pathlib import Path + +import anthropic +from loguru import logger + +from scripts.generate_report import generate_html +from scripts.improve_description import improve_description +from scripts.run_eval import find_project_root, run_eval +from scripts.utils import parse_skill_md + + +def split_eval_set(eval_set: list[dict], holdout: float, seed: int = 42) -> tuple[list[dict], list[dict]]: + """Split eval set into train and test sets, stratified by should_trigger.""" + random.seed(seed) + + # Separate by should_trigger + trigger = [e for e in eval_set if e["should_trigger"]] + no_trigger = [e for e in eval_set if not e["should_trigger"]] + + # Shuffle each group + random.shuffle(trigger) + random.shuffle(no_trigger) + + # Calculate split points + n_trigger_test = max(1, int(len(trigger) * holdout)) + n_no_trigger_test = max(1, int(len(no_trigger) * holdout)) + + # Split + test_set = trigger[:n_trigger_test] + no_trigger[:n_no_trigger_test] + train_set = trigger[n_trigger_test:] + no_trigger[n_no_trigger_test:] + + return train_set, test_set + + +def run_loop( + eval_set: list[dict], + skill_path: Path, + description_override: str | None, + num_workers: int, + timeout: int, + max_iterations: int, + runs_per_query: int, + trigger_threshold: float, + holdout: float, + model: str, + verbose: bool, + live_report_path: Path | None = None, + log_dir: Path | None = None, +) -> dict: + """Run the eval + improvement loop.""" + project_root = find_project_root() + name, original_description, content = parse_skill_md(skill_path) + current_description = description_override or original_description + + # Split into train/test if holdout > 0 + if holdout > 0: + train_set, test_set = split_eval_set(eval_set, holdout) + if verbose: + logger.info(f"Split: {len(train_set)} train, {len(test_set)} test (holdout={holdout})") + else: + train_set = eval_set + test_set = [] + + client = anthropic.Anthropic() + history = [] + exit_reason = "unknown" + + for iteration in range(1, max_iterations + 1): + if verbose: + logger.info(f"\n{'='*60}") + logger.info(f"Iteration {iteration}/{max_iterations}") + logger.info(f"Description: {current_description}") + logger.info(f"{'='*60}") + + # Evaluate train + test together in one batch for parallelism + all_queries = train_set + test_set + t0 = time.time() + all_results = run_eval( + eval_set=all_queries, + skill_name=name, + description=current_description, + num_workers=num_workers, + timeout=timeout, + project_root=project_root, + runs_per_query=runs_per_query, + trigger_threshold=trigger_threshold, + model=model, + ) + eval_elapsed = time.time() - t0 + + # Split results back into train/test by matching queries + train_queries_set = {q["query"] for q in train_set} + train_result_list = [r for r in all_results["results"] if r["query"] in train_queries_set] + test_result_list = [r for r in all_results["results"] if r["query"] not in train_queries_set] + + train_passed = sum(1 for r in train_result_list if r["pass"]) + train_total = len(train_result_list) + train_summary = {"passed": train_passed, "failed": train_total - train_passed, "total": train_total} + train_results = {"results": train_result_list, "summary": train_summary} + + if test_set: + test_passed = sum(1 for r in 
test_result_list if r["pass"]) + test_total = len(test_result_list) + test_summary = {"passed": test_passed, "failed": test_total - test_passed, "total": test_total} + test_results = {"results": test_result_list, "summary": test_summary} + else: + test_results = None + test_summary = None + + history.append({ + "iteration": iteration, + "description": current_description, + "train_passed": train_summary["passed"], + "train_failed": train_summary["failed"], + "train_total": train_summary["total"], + "train_results": train_results["results"], + "test_passed": test_summary["passed"] if test_summary else None, + "test_failed": test_summary["failed"] if test_summary else None, + "test_total": test_summary["total"] if test_summary else None, + "test_results": test_results["results"] if test_results else None, + # For backward compat with report generator + "passed": train_summary["passed"], + "failed": train_summary["failed"], + "total": train_summary["total"], + "results": train_results["results"], + }) + + # Write live report if path provided + if live_report_path: + partial_output = { + "original_description": original_description, + "best_description": current_description, + "best_score": "in progress", + "iterations_run": len(history), + "holdout": holdout, + "train_size": len(train_set), + "test_size": len(test_set), + "history": history, + } + live_report_path.write_text(generate_html(partial_output, auto_refresh=True, skill_name=name)) + + if verbose: + def print_eval_stats(label, results, elapsed): + pos = [r for r in results if r["should_trigger"]] + neg = [r for r in results if not r["should_trigger"]] + tp = sum(r["triggers"] for r in pos) + pos_runs = sum(r["runs"] for r in pos) + fn = pos_runs - tp + fp = sum(r["triggers"] for r in neg) + neg_runs = sum(r["runs"] for r in neg) + tn = neg_runs - fp + total = tp + tn + fp + fn + precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0 + recall = tp / (tp + fn) if (tp + fn) > 0 else 1.0 + accuracy = (tp + tn) / total if total > 0 else 0.0 + logger.info(f"{label}: {tp+tn}/{total} correct, precision={precision:.0%} recall={recall:.0%} accuracy={accuracy:.0%} ({elapsed:.1f}s)") + for r in results: + status = "PASS" if r["pass"] else "FAIL" + rate_str = f"{r['triggers']}/{r['runs']}" + logger.info(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:60]}") + + print_eval_stats("Train", train_results["results"], eval_elapsed) + if test_summary: + print_eval_stats("Test ", test_results["results"], 0) + + if train_summary["failed"] == 0: + exit_reason = f"all_passed (iteration {iteration})" + if verbose: + logger.info(f"\nAll train queries passed on iteration {iteration}!") + break + + if iteration == max_iterations: + exit_reason = f"max_iterations ({max_iterations})" + if verbose: + logger.info(f"\nMax iterations reached ({max_iterations}).") + break + + # Improve the description based on train results + if verbose: + logger.info(f"\nImproving description...") + + t0 = time.time() + # Strip test scores from history so improvement model can't see them + blinded_history = [ + {k: v for k, v in h.items() if not k.startswith("test_")} + for h in history + ] + new_description = improve_description( + client=client, + skill_name=name, + skill_content=content, + current_description=current_description, + eval_results=train_results, + history=blinded_history, + model=model, + log_dir=log_dir, + iteration=iteration, + ) + improve_elapsed = time.time() - t0 + + if verbose: + logger.info(f"Proposed ({improve_elapsed:.1f}s): 
{new_description}") + + current_description = new_description + + # Find the best iteration by TEST score (or train if no test set) + if test_set: + best = max(history, key=lambda h: h["test_passed"] or 0) + best_score = f"{best['test_passed']}/{best['test_total']}" + else: + best = max(history, key=lambda h: h["train_passed"]) + best_score = f"{best['train_passed']}/{best['train_total']}" + + if verbose: + logger.info(f"\nExit reason: {exit_reason}") + logger.info(f"Best score: {best_score} (iteration {best['iteration']})") + + return { + "exit_reason": exit_reason, + "original_description": original_description, + "best_description": best["description"], + "best_score": best_score, + "best_train_score": f"{best['train_passed']}/{best['train_total']}", + "best_test_score": f"{best['test_passed']}/{best['test_total']}" if test_set else None, + "final_description": current_description, + "iterations_run": len(history), + "holdout": holdout, + "train_size": len(train_set), + "test_size": len(test_set), + "history": history, + } + + +def main(): + parser = argparse.ArgumentParser(description="Run eval + improve loop") + parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") + parser.add_argument("--skill-path", required=True, help="Path to skill directory") + parser.add_argument("--description", default=None, help="Override starting description") + parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") + parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds") + parser.add_argument("--max-iterations", type=int, default=5, help="Max improvement iterations") + parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query") + parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold") + parser.add_argument("--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)") + parser.add_argument("--model", required=True, help="Model for improvement") + parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") + parser.add_argument("--report", default="auto", help="Generate HTML report at this path (default: 'auto' for temp file, 'none' to disable)") + parser.add_argument("--results-dir", default=None, help="Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here") + args = parser.parse_args() + + eval_set = json.loads(Path(args.eval_set).read_text()) + skill_path = Path(args.skill_path) + + if not (skill_path / "SKILL.md").exists(): + logger.error(f"Error: No SKILL.md found at {skill_path}") + sys.exit(1) + + name, _, _ = parse_skill_md(skill_path) + + # Set up live report path + if args.report != "none": + if args.report == "auto": + timestamp = time.strftime("%Y%m%d_%H%M%S") + live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html" + else: + live_report_path = Path(args.report) + # Open the report immediately so the user can watch + live_report_path.write_text("
Starting optimization loop...
") + webbrowser.open(str(live_report_path)) + else: + live_report_path = None + + # Determine output directory (create before run_loop so logs can be written) + if args.results_dir: + timestamp = time.strftime("%Y-%m-%d_%H%M%S") + results_dir = Path(args.results_dir) / timestamp + results_dir.mkdir(parents=True, exist_ok=True) + else: + results_dir = None + + log_dir = results_dir / "logs" if results_dir else None + + output = run_loop( + eval_set=eval_set, + skill_path=skill_path, + description_override=args.description, + num_workers=args.num_workers, + timeout=args.timeout, + max_iterations=args.max_iterations, + runs_per_query=args.runs_per_query, + trigger_threshold=args.trigger_threshold, + holdout=args.holdout, + model=args.model, + verbose=args.verbose, + live_report_path=live_report_path, + log_dir=log_dir, + ) + + # Save JSON output + json_output = json.dumps(output, indent=2) + print(json_output) + if results_dir: + (results_dir / "results.json").write_text(json_output) + + # Write final HTML report (without auto-refresh) + if live_report_path: + live_report_path.write_text(generate_html(output, auto_refresh=False, skill_name=name)) + logger.info(f"\nReport: {live_report_path}") + + if results_dir and live_report_path: + (results_dir / "report.html").write_text(generate_html(output, auto_refresh=False, skill_name=name)) + + if results_dir: + logger.info(f"Results saved to: {results_dir}") + + +if __name__ == "__main__": + main() diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/utils.py b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/utils.py new file mode 100644 index 000000000..51b6a07dd --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/skills/skill-creator/scripts/utils.py @@ -0,0 +1,47 @@ +"""Shared utilities for skill-creator scripts.""" + +from pathlib import Path + + + +def parse_skill_md(skill_path: Path) -> tuple[str, str, str]: + """Parse a SKILL.md file, returning (name, description, full_content).""" + content = (skill_path / "SKILL.md").read_text() + lines = content.split("\n") + + if lines[0].strip() != "---": + raise ValueError("SKILL.md missing frontmatter (no opening ---)") + + end_idx = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == "---": + end_idx = i + break + + if end_idx is None: + raise ValueError("SKILL.md missing frontmatter (no closing ---)") + + name = "" + description = "" + frontmatter_lines = lines[1:end_idx] + i = 0 + while i < len(frontmatter_lines): + line = frontmatter_lines[i] + if line.startswith("name:"): + name = line[len("name:"):].strip().strip('"').strip("'") + elif line.startswith("description:"): + value = line[len("description:"):].strip() + # Handle YAML multiline indicators (>, |, >-, |-) + if value in (">", "|", ">-", "|-"): + continuation_lines: list[str] = [] + i += 1 + while i < len(frontmatter_lines) and (frontmatter_lines[i].startswith(" ") or frontmatter_lines[i].startswith("\t")): + continuation_lines.append(frontmatter_lines[i].strip()) + i += 1 + description = " ".join(continuation_lines) + continue + else: + description = value.strip('"').strip("'") + i += 1 + + return name, description, content diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/soul.md b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/soul.md new file mode 100644 index 000000000..1554c3463 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/soul.md @@ -0,0 
+1,16 @@ +# Soul — {{agent_name}} + +## Identity +- **名称**: {{agent_name}} +- **角色**: {{role_description}} +- **创建者**: {{creator_name}} +- **创建时间**: {{created_at}} + +## Personality +- 认真负责、注重细节 +- 主动汇报工作进展 +- 遇到不确定的信息会主动确认 + +## Boundaries +- 遵守企业保密制度 +- 敏感操作需经过创建者审批 diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/state.json b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/state.json new file mode 100644 index 000000000..0507e31dd --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/state.json @@ -0,0 +1,13 @@ +{ + "agent_id": "", + "name": "", + "status": "idle", + "current_task": null, + "last_active": null, + "channel_status": {}, + "stats": { + "tasks_completed_today": 0, + "tasks_in_progress": 0, + "督办_pending": 0 + } +} \ No newline at end of file diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/todo.json b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/todo.json new file mode 100644 index 000000000..50ffbb9a9 --- /dev/null +++ b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/todo.json @@ -0,0 +1,3 @@ +{ + "tasks": [] +} diff --git a/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/workspace/archived/.gitkeep b/.clawith/data/agents/9d49d145-56e8-4b83-bf79-2aeebd3ee90d/workspace/archived/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/.vite/deps/_metadata.json b/.vite/deps/_metadata.json deleted file mode 100644 index bc8e5d421..000000000 --- a/.vite/deps/_metadata.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "hash": "6bfe7905", - "configHash": "4280ba71", - "lockfileHash": "e3b0c442", - "browserHash": "25813ad8", - "optimized": {}, - "chunks": {} -} \ No newline at end of file diff --git a/.vite/deps/package.json b/.vite/deps/package.json deleted file mode 100644 index 3dbc1ca59..000000000 --- a/.vite/deps/package.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "type": "module" -} diff --git a/backend/VERSION b/backend/VERSION index 9ab8337f3..dcb0bac8a 100644 --- a/backend/VERSION +++ b/backend/VERSION @@ -1 +1 @@ -1.9.1 +1.8.3-beta.2 diff --git a/backend/app/core/logging_config.py b/backend/app/core/logging_config.py index c9afab182..107b9904f 100644 --- a/backend/app/core/logging_config.py +++ b/backend/app/core/logging_config.py @@ -36,19 +36,50 @@ def set_trace_id(trace_id: str) -> None: def configure_logging(): """Configure loguru with custom format including trace ID.""" - # Remove default handler + import os + is_windows = os.name == 'nt' + logger.remove() - # Add stdout handler with custom format and filter to ensure trace_id exists - logger.add( - sys.stdout, - level="INFO", - format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {extra[trace_id]:-<12} | {name}:{line} - {message}", - enqueue=True, - backtrace=True, - diagnose=True, - filter=lambda record: (record["extra"].setdefault("trace_id", get_trace_id() or str(uuid4())) is not None) - ) + if is_windows: + fmt = "{time:YYYY-MM-DD HH:mm:ss} | {level} | {extra[trace_id]:-<12} | {name}:{line} - {message}" + + class EncodedStream: + def __init__(self, stream): + self.stream = stream + self.encoding = 'utf-8' + + def write(self, text): + try: + self.stream.write(text) + except UnicodeEncodeError: + clean_text = text.encode('utf-8', errors='replace').decode('utf-8') + self.stream.write(clean_text) + + def flush(self): + self.stream.flush() + + logger.add( + EncodedStream(sys.stdout), + level="INFO", + format=fmt, + enqueue=True, + backtrace=True, + diagnose=True, + filter=lambda record: 
(record["extra"].setdefault("trace_id", get_trace_id() or str(uuid4())) is not None), + colorize=False, + ) + else: + logger.add( + sys.stdout, + level="INFO", + format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {extra[trace_id]:-<12} | {name}:{line} - {message}", + enqueue=True, + backtrace=True, + diagnose=True, + filter=lambda record: (record["extra"].setdefault("trace_id", get_trace_id() or str(uuid4())) is not None), + colorize=True, + ) return logger diff --git a/backend/app/main.py b/backend/app/main.py index 0f4691004..3f09a584a 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -1,5 +1,7 @@ """Clawith Backend — FastAPI Application Entry Point.""" +import os +import subprocess from contextlib import asynccontextmanager from pathlib import Path import shutil @@ -190,6 +192,18 @@ async def lifespan(app: FastAPI): if _tenant: _new_dir = _data_dir / f"enterprise_info_{_tenant.id}" if not _new_dir.exists(): + # Set permissions on parent directory first + if os.name == 'nt': + try: + subprocess.run( + ['icacls', str(_data_dir), '/grant', 'Everyone:F', '/T', '/C'], + check=False, + capture_output=True, + text=True + ) + except Exception as e: + print(f"[startup] ⚠️ Failed to set permissions for {_data_dir}: {e}", flush=True) + shutil.copytree(str(_old_dir), str(_new_dir)) print(f"[startup] ✅ Migrated enterprise_info → enterprise_info_{_tenant.id}", flush=True) else: diff --git a/backend/app/services/agent_manager.py b/backend/app/services/agent_manager.py index 9f4c4a73c..f7d0706b2 100644 --- a/backend/app/services/agent_manager.py +++ b/backend/app/services/agent_manager.py @@ -1,7 +1,7 @@ -"""Agent lifecycle manager — Docker container management for OpenClaw Gateway instances.""" - import json +import os import shutil +import subprocess import uuid from datetime import datetime, timezone from pathlib import Path @@ -20,13 +20,32 @@ settings = get_settings() +def _set_directory_permissions(path: Path): + """Set proper permissions for a directory on Windows.""" + if os.name == 'nt': + try: + subprocess.run( + ['icacls', str(path), '/grant', 'Everyone:F', '/T', '/C'], + check=False, + capture_output=True, + text=True + ) + except Exception as e: + logger.warning(f"Failed to set permissions for {path}: {e}") + else: + try: + os.chmod(path, 0o755) + except Exception as e: + logger.warning(f"Failed to set permissions for {path}: {e}") + + class AgentManager: """Manage OpenClaw Gateway Docker containers for digital employees.""" def __init__(self): try: self.docker_client = docker.from_env() - except DockerException: + except Exception: logger.warning("Docker not available — agent containers will not be managed") self.docker_client = None @@ -46,99 +65,132 @@ async def initialize_agent_files(self, db: AsyncSession, agent: Agent, logger.warning(f"Agent dir already exists: {agent_dir}") return - if template_dir.exists(): - # Copy template - shutil.copytree(str(template_dir), str(agent_dir)) - else: - # No template dir (local dev) — create minimal workspace structure - logger.info(f"Template dir not found ({template_dir}), creating minimal workspace") - agent_dir.mkdir(parents=True, exist_ok=True) - (agent_dir / "workspace").mkdir(exist_ok=True) - (agent_dir / "workspace" / "knowledge_base").mkdir(exist_ok=True) - (agent_dir / "memory").mkdir(exist_ok=True) - (agent_dir / "skills").mkdir(exist_ok=True) - (agent_dir / "tasks.json").write_text("[]", encoding="utf-8") - - # Customize soul.md - soul_path = agent_dir / "soul.md" - # Get creator name - from app.models.user import User - result 
= await db.execute(select(User).where(User.id == agent.creator_id)) - creator = result.scalar_one_or_none() - creator_name = creator.display_name if creator else "Unknown" - - soul_content = f"# Personality\n\nI'm {agent.name}, {agent.role_description or 'a digital assistant'}.\n" - if soul_path.exists(): - template_content = soul_path.read_text() - soul_content = template_content.replace("{{agent_name}}", agent.name) - soul_content = soul_content.replace("{{role_description}}", agent.role_description or "通用助手") - soul_content = soul_content.replace("{{creator_name}}", creator_name) - soul_content = soul_content.replace("{{created_at}}", datetime.now(timezone.utc).strftime("%Y-%m-%d")) - - # Helper function to replace or append sections - def replace_or_append_section(content: str, section_name: str, section_content: str) -> str: - """Replace existing ## SectionName or append if not found.""" - if not section_content: - return content - - # Pattern to match existing section (case-insensitive header) - import re - pattern = rf"^##\s+{re.escape(section_name)}\s*$" - lines = content.split('\n') - - # Find the section header - for i, line in enumerate(lines): - if re.match(pattern, line.strip(), re.IGNORECASE): - # Found existing section - replace until next ## header or end - section_start = i - section_end = len(lines) - for j in range(i + 1, len(lines)): - if lines[j].strip().startswith('## '): - section_end = j - break - - # Replace the section content (with trailing newline for proper spacing) - new_section = f"## {section_name}\n{section_content}\n" - lines = lines[:section_start] + [new_section] + lines[section_end:] - return '\n'.join(lines) - - # Section not found - append at the end - return content + f"\n## {section_name}\n{section_content}\n" - - # Use the helper to replace or append Personality and Boundaries - soul_content = replace_or_append_section(soul_content, "Personality", personality) - soul_content = replace_or_append_section(soul_content, "Boundaries", boundaries) - - soul_path.write_text(soul_content, encoding="utf-8") - - # Ensure memory.md exists - mem_path = agent_dir / "memory" / "memory.md" - if not mem_path.exists(): - mem_path.write_text("# Memory\n\n_Record important information and knowledge here._\n", encoding="utf-8") - - # Ensure reflections.md exists — copy from central template - refl_path = agent_dir / "memory" / "reflections.md" - if not refl_path.exists(): - refl_template = Path(__file__).parent.parent / "templates" / "reflections.md" - refl_content = refl_template.read_text(encoding="utf-8") if refl_template.exists() else "# Reflections Journal\n" - refl_path.write_text(refl_content, encoding="utf-8") - - # Ensure HEARTBEAT.md exists — copy from central template - hb_path = agent_dir / "HEARTBEAT.md" - if not hb_path.exists(): - hb_template = Path(__file__).parent.parent / "templates" / "HEARTBEAT.md" - hb_content = hb_template.read_text(encoding="utf-8") if hb_template.exists() else "# Heartbeat Instructions\n" - hb_path.write_text(hb_content, encoding="utf-8") - - # Customize state.json - state_path = agent_dir / "state.json" - if state_path.exists(): - state = json.loads(state_path.read_text()) - state["agent_id"] = str(agent.id) - state["name"] = agent.name - state_path.write_text(json.dumps(state, ensure_ascii=False, indent=2), encoding="utf-8") - - logger.info(f"Initialized agent files at {agent_dir}") + try: + # Create parent directories first with proper permissions + agent_dir.parent.mkdir(parents=True, exist_ok=True) + try: + 
_set_directory_permissions(agent_dir.parent) + except Exception: + logger.warning(f"Failed to set permissions for {agent_dir.parent}, continuing") + + if template_dir.exists(): + # First create empty agent_dir with proper permissions + # This ensures shutil.copytree can write to it + agent_dir.mkdir(parents=True, exist_ok=True) + try: + _set_directory_permissions(agent_dir) + except Exception: + logger.warning(f"Failed to set permissions for {agent_dir}, continuing") + + # Copy template files + shutil.copytree(str(template_dir), str(agent_dir), dirs_exist_ok=True) + try: + _set_directory_permissions(agent_dir) + except Exception: + logger.warning(f"Failed to set permissions for {agent_dir}, continuing") + else: + # No template dir (local dev) — create minimal workspace structure + logger.info(f"Template dir not found ({template_dir}), creating minimal workspace") + agent_dir.mkdir(parents=True, exist_ok=True) + try: + _set_directory_permissions(agent_dir) + except Exception: + logger.warning(f"Failed to set permissions for {agent_dir}, continuing") + (agent_dir / "workspace").mkdir(exist_ok=True) + (agent_dir / "workspace" / "knowledge_base").mkdir(exist_ok=True) + (agent_dir / "memory").mkdir(exist_ok=True) + (agent_dir / "skills").mkdir(exist_ok=True) + (agent_dir / "tasks.json").write_text("[]", encoding="utf-8") + except Exception as e: + # If creating files fails, log it but continue with agent creation + logger.error(f"Failed to initialize agent files for {agent.name}: {e}") + logger.warning("Continuing with agent creation without workspace files") + + try: + # Customize soul.md + soul_path = agent_dir / "soul.md" + # Get creator name + from app.models.user import User + result = await db.execute(select(User).where(User.id == agent.creator_id)) + creator = result.scalar_one_or_none() + creator_name = creator.display_name if creator else "Unknown" + + soul_content = f"# Personality\n\nI'm {agent.name}, {agent.role_description or 'a digital assistant'}.\n" + if soul_path.exists(): + template_content = soul_path.read_text() + soul_content = template_content.replace("{{agent_name}}", agent.name) + soul_content = soul_content.replace("{{role_description}}", agent.role_description or "通用助手") + soul_content = soul_content.replace("{{creator_name}}", creator_name) + soul_content = soul_content.replace("{{created_at}}", datetime.now(timezone.utc).strftime("%Y-%m-%d")) + + # Helper function to replace or append sections + def replace_or_append_section(content: str, section_name: str, section_content: str) -> str: + """Replace existing ## SectionName or append if not found.""" + if not section_content: + return content + + # Pattern to match existing section (case-insensitive header) + import re + pattern = rf"^##\s+{re.escape(section_name)}\s*$" + lines = content.split('\n') + + # Find the section header + for i, line in enumerate(lines): + if re.match(pattern, line.strip(), re.IGNORECASE): + # Found existing section - replace until next ## header or end + section_start = i + section_end = len(lines) + for j in range(i + 1, len(lines)): + if lines[j].strip().startswith('## '): + section_end = j + break + + # Replace the section content (with trailing newline for proper spacing) + new_section = f"## {section_name}\n{section_content}\n" + lines = lines[:section_start] + [new_section] + lines[section_end:] + return '\n'.join(lines) + + # Section not found - append at the end + return content + f"\n## {section_name}\n{section_content}\n" + + # Use the helper to replace or append Personality and 
Boundaries + soul_content = replace_or_append_section(soul_content, "Personality", personality) + soul_content = replace_or_append_section(soul_content, "Boundaries", boundaries) + + soul_path.write_text(soul_content, encoding="utf-8") + + # Ensure memory.md exists + mem_path = agent_dir / "memory" / "memory.md" + if not mem_path.exists(): + mem_path.write_text("# Memory\n\n_Record important information and knowledge here._\n", encoding="utf-8") + + # Ensure reflections.md exists — copy from central template + refl_path = agent_dir / "memory" / "reflections.md" + if not refl_path.exists(): + refl_template = Path(__file__).parent.parent / "templates" / "reflections.md" + refl_content = refl_template.read_text(encoding="utf-8") if refl_template.exists() else "# Reflections Journal\n" + refl_path.write_text(refl_content, encoding="utf-8") + + # Ensure HEARTBEAT.md exists — copy from central template + hb_path = agent_dir / "HEARTBEAT.md" + if not hb_path.exists(): + hb_template = Path(__file__).parent.parent / "templates" / "HEARTBEAT.md" + hb_content = hb_template.read_text(encoding="utf-8") if hb_template.exists() else "# Heartbeat Instructions\n" + hb_path.write_text(hb_content, encoding="utf-8") + + # Customize state.json + state_path = agent_dir / "state.json" + if state_path.exists(): + state = json.loads(state_path.read_text()) + state["agent_id"] = str(agent.id) + state["name"] = agent.name + state_path.write_text(json.dumps(state, ensure_ascii=False, indent=2), encoding="utf-8") + + logger.info(f"Initialized agent files at {agent_dir}") + except Exception as e: + # If writing files fails, log it but continue with agent creation + logger.error(f"Failed to write agent files for {agent.name}: {e}") + logger.warning("Continuing with agent creation without workspace files") def _generate_openclaw_config(self, agent: Agent, model: LLMModel | None) -> dict: """Generate openclaw.json config for the agent container.""" @@ -182,13 +234,22 @@ async def start_container(self, db: AsyncSession, agent: Agent) -> str | None: # Generate OpenClaw config config = self._generate_openclaw_config(agent, model) config_dir = agent_dir / ".openclaw" - config_dir.mkdir(parents=True, exist_ok=True) - (config_dir / "openclaw.json").write_text(json.dumps(config, indent=2), encoding="utf-8") - - # Create workspace symlink - workspace_dir = config_dir / "workspace" - if not workspace_dir.exists(): - workspace_dir.symlink_to(agent_dir / "workspace") + try: + config_dir.mkdir(parents=True, exist_ok=True) + (config_dir / "openclaw.json").write_text(json.dumps(config, indent=2), encoding="utf-8") + + # Create workspace symlink (or junction on Windows) + workspace_dir = config_dir / "workspace" + if not workspace_dir.exists(): + try: + workspace_dir.symlink_to(agent_dir / "workspace") + except OSError: + try: + workspace_dir.symlink_to(agent_dir / "workspace", target_is_directory=True) + except OSError: + logger.warning(f"Cannot create symlink for {workspace_dir}, skipping") + except Exception as e: + logger.warning(f"Failed to create OpenClaw config: {e}, skipping container start") # Assign a unique port container_port = 18789 + hash(str(agent.id)) % 10000 @@ -221,7 +282,7 @@ async def start_container(self, db: AsyncSession, agent: Agent) -> str | None: logger.info(f"Started container {container.id[:12]} for agent {agent.name} on port {container_port}") return container.id - except DockerException as e: + except Exception as e: logger.error(f"Failed to start container for agent {agent.name}: {e}") agent.status = 
"error" return None diff --git a/backend/app/services/agent_seeder.py b/backend/app/services/agent_seeder.py index e267c951e..c653f2fb8 100644 --- a/backend/app/services/agent_seeder.py +++ b/backend/app/services/agent_seeder.py @@ -1,6 +1,8 @@ """Seed default agents (Morty & Meeseeks) on first platform startup.""" +import os import shutil +import subprocess import uuid from datetime import datetime, timezone from pathlib import Path @@ -194,11 +196,20 @@ async def seed_default_agents(): """ # --- Idempotency guard: file-based marker (survives agent renames/deletes) --- seed_marker = Path(settings.AGENT_DATA_DIR) / ".seeded" - if seed_marker.exists(): - logger.info("[AgentSeeder] Seed marker found, skipping default agent creation") - return - + + # Check both marker AND database to handle inconsistent state async with async_session() as db: + # Verify agents actually exist in DB before skipping + existing_agents = await db.execute( + select(Agent).where(Agent.name.in_(["Morty", "Meeseeks"])).limit(2) + ) + agents_in_db = existing_agents.scalars().all() + + if seed_marker.exists() and len(agents_in_db) == 2: + logger.info("[AgentSeeder] Seed marker found and agents exist in DB, skipping default agent creation") + return + elif seed_marker.exists() and len(agents_in_db) < 2: + logger.warning("[AgentSeeder] Seed marker exists but agents missing from DB, will re-seed") # Get platform admin as creator admin_result = await db.execute( @@ -246,14 +257,104 @@ async def seed_default_agents(): # ── Initialize workspace files ── template_dir = Path(settings.AGENT_TEMPLATE_DIR) + # Create parent directories first with proper permissions + parent_dir = Path(settings.AGENT_DATA_DIR) + parent_dir.mkdir(parents=True, exist_ok=True) + + if os.name == 'nt': + try: + result = subprocess.run( + ['icacls', str(parent_dir), '/grant', 'Everyone:F', '/T', '/C'], + check=True, + capture_output=True, + text=True, + timeout=30 + ) + if result.returncode != 0: + logger.warning(f"icacls failed for {parent_dir}: {result.stderr}") + except Exception as e: + logger.warning(f"Failed to set permissions for {parent_dir}: {e}") + else: + try: + os.chmod(parent_dir, 0o755) + except Exception as e: + logger.warning(f"Failed to set permissions for {parent_dir}: {e}") + for agent, soul_content in [(morty, MORTY_SOUL), (meeseeks, MEESEEKS_SOUL)]: agent_dir = Path(settings.AGENT_DATA_DIR) / str(agent.id) if template_dir.exists(): + # Remove existing directory if it exists and has permission issues + if agent_dir.exists(): + try: + if os.name == 'nt': + # First take ownership, then remove + subprocess.run( + ['takeown', '/F', str(agent_dir), '/R', '/A', '/D', 'Y'], + check=False, + capture_output=True, + text=True, + timeout=60 + ) + subprocess.run( + ['powershell', '-Command', f'Remove-Item -Path "{agent_dir}" -Recurse -Force'], + check=False, + capture_output=True, + text=True, + timeout=30 + ) + else: + shutil.rmtree(str(agent_dir)) + except Exception as e: + logger.warning(f"Failed to remove existing agent_dir {agent_dir}: {e}") + + # Create fresh agent_dir with proper permissions + agent_dir.mkdir(parents=True, exist_ok=True) + + if os.name == 'nt': + try: + subprocess.run( + ['icacls', str(agent_dir), '/inheritance:r'], + check=False, + capture_output=True, + text=True, + timeout=30 + ) + subprocess.run( + ['icacls', str(agent_dir), '/grant', 'Everyone:F', '/T', '/C'], + check=False, + capture_output=True, + text=True, + timeout=60 + ) + except Exception as e: + logger.warning(f"Failed to set permissions for {agent_dir}: 
{e}") + else: + try: + os.chmod(agent_dir, 0o755) + except Exception as e: + logger.warning(f"Failed to set permissions for {agent_dir}: {e}") + # Copy the full agent template so Morty/Meeseeks get EVERY file # defined in the template: MEMORY_INDEX.md, curiosity_journal.md, # state.json, todo.json, daily_reports/, enterprise_info/, etc. - shutil.copytree(str(template_dir), str(agent_dir)) + # Use PowerShell to copy files to avoid permission issues + # Copy the full agent template so Morty/Meeseeks get EVERY file + # defined in the template: MEMORY_INDEX.md, curiosity_journal.md, + # state.json, todo.json, daily_reports/, enterprise_info/, etc. + if os.name == 'nt': + copy_cmd = f'Copy-Item -Path "{template_dir}\\*" -Destination "{agent_dir}" -Recurse -Force' + try: + subprocess.run( + ['powershell', '-Command', copy_cmd], + check=False, + capture_output=True, + text=True + ) + except Exception as e: + logger.warning(f"Failed to copy template files: {e}") + else: + shutil.copytree(str(template_dir), str(agent_dir), dirs_exist_ok=True) else: # Fallback for local dev (no Docker template mount) agent_dir.mkdir(parents=True, exist_ok=True) @@ -262,28 +363,28 @@ async def seed_default_agents(): (agent_dir / "workspace" / "knowledge_base").mkdir(exist_ok=True) (agent_dir / "memory").mkdir(exist_ok=True) - # Overlay custom soul (rich Morty/Meeseeks persona over the generic template) - (agent_dir / "soul.md").write_text(soul_content.strip() + "\n", encoding="utf-8") - - # Ensure memory.md exists (template does not include it; holds runtime context) - mem_path = agent_dir / "memory" / "memory.md" - if not mem_path.exists(): - mem_path.write_text("# Memory\n\n_Record important information and knowledge here._\n", encoding="utf-8") - - # Ensure reflections.md exists (not in agent_template; lives in app/templates) - refl_path = agent_dir / "memory" / "reflections.md" - if not refl_path.exists(): - refl_src = Path(__file__).parent.parent / "templates" / "reflections.md" - refl_path.write_text(refl_src.read_text(encoding="utf-8") if refl_src.exists() else "# Reflections Journal\n", encoding="utf-8") - - # Stamp agent identity into state.json if present - state_path = agent_dir / "state.json" - if state_path.exists(): - import json as _json - state = _json.loads(state_path.read_text()) - state["agent_id"] = str(agent.id) - state["name"] = agent.name - state_path.write_text(_json.dumps(state, ensure_ascii=False, indent=2), encoding="utf-8") + # Overlay custom soul (rich Morty/Meeseeks persona over the generic template) + (agent_dir / "soul.md").write_text(soul_content.strip() + "\n", encoding="utf-8") + + # Ensure memory.md exists (template does not include it; holds runtime context) + mem_path = agent_dir / "memory" / "memory.md" + if not mem_path.exists(): + mem_path.write_text("# Memory\n\n_Record important information and knowledge here._\n", encoding="utf-8") + + # Ensure reflections.md exists (not in agent_template; lives in app/templates) + refl_path = agent_dir / "memory" / "reflections.md" + if not refl_path.exists(): + refl_src = Path(__file__).parent.parent / "templates" / "reflections.md" + refl_path.write_text(refl_src.read_text(encoding="utf-8") if refl_src.exists() else "# Reflections Journal\n", encoding="utf-8") + + # Stamp agent identity into state.json if present + state_path = agent_dir / "state.json" + if state_path.exists(): + import json as _json + state = _json.loads(state_path.read_text()) + state["agent_id"] = str(agent.id) + state["name"] = agent.name + 
state_path.write_text(_json.dumps(state, ensure_ascii=False, indent=2), encoding="utf-8") # ── Assign skills ── all_skills_result = await db.execute( @@ -379,7 +480,30 @@ async def seed_okr_agent(): - Generates daily/weekly reports and posts them to the Plaza - Helps team members set up and maintain their focus.md files """ - seed_marker = Path(settings.AGENT_DATA_DIR) / ".seeded" + # Ensure AGENT_DATA_DIR exists with proper permissions before any file operations + agent_data_dir = Path(settings.AGENT_DATA_DIR) + agent_data_dir.mkdir(parents=True, exist_ok=True) + + if os.name == 'nt': + try: + result = subprocess.run( + ['icacls', str(agent_data_dir), '/grant', 'Everyone:F', '/T', '/C'], + check=True, + capture_output=True, + text=True, + timeout=30 + ) + if result.returncode != 0: + logger.warning(f"icacls failed for {agent_data_dir}: {result.stderr}") + except Exception as e: + logger.warning(f"Failed to set permissions for {agent_data_dir}: {e}") + else: + try: + os.chmod(agent_data_dir, 0o755) + except Exception as e: + logger.warning(f"Failed to set permissions for {agent_data_dir}: {e}") + + seed_marker = agent_data_dir / ".seeded" # Check if OKR Agent has already been seeded if seed_marker.exists(): @@ -477,14 +601,87 @@ async def seed_okr_agent(): template_dir = Path(settings.AGENT_TEMPLATE_DIR) agent_dir = Path(settings.AGENT_DATA_DIR) / str(okr_agent.id) - if template_dir.exists(): - shutil.copytree(str(template_dir), str(agent_dir)) - else: + # Create directories with proper permissions + try: + # Remove existing directory if it exists and has permission issues + if agent_dir.exists(): + try: + if os.name == 'nt': + # First take ownership, then remove + subprocess.run( + ['takeown', '/F', str(agent_dir), '/R', '/A', '/D', 'Y'], + check=False, + capture_output=True, + text=True, + timeout=60 + ) + subprocess.run( + ['powershell', '-Command', f'Remove-Item -Path "{agent_dir}" -Recurse -Force'], + check=False, + capture_output=True, + text=True, + timeout=30 + ) + else: + shutil.rmtree(str(agent_dir)) + except Exception as e: + logger.warning(f"Failed to remove existing agent_dir {agent_dir}: {e}") + + # Create fresh agent directory agent_dir.mkdir(parents=True, exist_ok=True) - (agent_dir / "skills").mkdir(exist_ok=True) - (agent_dir / "workspace").mkdir(exist_ok=True) - (agent_dir / "workspace" / "reports").mkdir(exist_ok=True) - (agent_dir / "memory").mkdir(exist_ok=True) + + # Set permissions on Windows - reset inheritance first, then grant + if os.name == 'nt': + try: + subprocess.run( + ['icacls', str(agent_dir), '/inheritance:r'], + check=False, + capture_output=True, + text=True, + timeout=30 + ) + subprocess.run( + ['icacls', str(agent_dir), '/grant', 'Everyone:F', '/T', '/C'], + check=False, + capture_output=True, + text=True, + timeout=60 + ) + except Exception as e: + logger.warning(f"Failed to set permissions for {agent_dir}: {e}") + + # Copy template files if available + if template_dir.exists(): + logger.info(f"Copying template from {template_dir} to {agent_dir}") + try: + if os.name == 'nt': + copy_cmd = f'Copy-Item -Path "{template_dir}\\*" -Destination "{agent_dir}" -Recurse -Force' + subprocess.run( + ['powershell', '-Command', copy_cmd], + check=False, + capture_output=True, + text=True + ) + else: + shutil.copytree(str(template_dir), str(agent_dir), dirs_exist_ok=True) + logger.info(f"Successfully copied template to {agent_dir}") + except Exception as e: + logger.error(f"Failed to copy template: {e}") + # Fall back to creating empty structure + (agent_dir / 
"skills").mkdir(exist_ok=True) + (agent_dir / "workspace").mkdir(exist_ok=True) + (agent_dir / "workspace" / "reports").mkdir(exist_ok=True) + (agent_dir / "memory").mkdir(exist_ok=True) + else: + # Create empty structure + (agent_dir / "skills").mkdir(exist_ok=True) + (agent_dir / "workspace").mkdir(exist_ok=True) + (agent_dir / "workspace" / "reports").mkdir(exist_ok=True) + (agent_dir / "memory").mkdir(exist_ok=True) + + except Exception as e: + logger.error(f"Error setting up workspace: {e}") + raise # Write OKR Agent soul (agent_dir / "soul.md").write_text(OKR_AGENT_SOUL.strip() + "\n", encoding="utf-8") @@ -961,8 +1158,48 @@ async def seed_okr_agent_for_tenant(tenant_id: uuid.UUID, creator_id: uuid.UUID) template_dir = Path(settings.AGENT_TEMPLATE_DIR) agent_dir = Path(settings.AGENT_DATA_DIR) / str(okr_agent.id) + # Create parent directories first with proper permissions + agent_dir.parent.mkdir(parents=True, exist_ok=True) + + if os.name == 'nt': + try: + subprocess.run( + ['icacls', str(agent_dir.parent), '/grant', 'Everyone:F', '/T', '/C'], + check=False, + capture_output=True, + text=True + ) + except Exception as e: + logger.warning(f"Failed to set permissions for {agent_dir.parent}: {e}") + else: + try: + os.chmod(agent_dir.parent, 0o755) + except Exception as e: + logger.warning(f"Failed to set permissions for {agent_dir.parent}: {e}") + if template_dir.exists(): - shutil.copytree(str(template_dir), str(agent_dir)) + # First create empty agent_dir with proper permissions + agent_dir.mkdir(parents=True, exist_ok=True) + + # Set permissions + if os.name == 'nt': + try: + subprocess.run( + ['icacls', str(agent_dir), '/grant', 'Everyone:F', '/T', '/C'], + check=False, + capture_output=True, + text=True + ) + except Exception as e: + logger.warning(f"Failed to set permissions for {agent_dir}: {e}") + else: + try: + os.chmod(agent_dir, 0o755) + except Exception as e: + logger.warning(f"Failed to set permissions for {agent_dir}: {e}") + + # Copy template files + shutil.copytree(str(template_dir), str(agent_dir), dirs_exist_ok=True) else: agent_dir.mkdir(parents=True, exist_ok=True) for sub in ("skills", "workspace", "workspace/reports", "memory"): diff --git a/backend/app/services/trigger_daemon.py b/backend/app/services/trigger_daemon.py index e4bfd1905..7e1425c99 100644 --- a/backend/app/services/trigger_daemon.py +++ b/backend/app/services/trigger_daemon.py @@ -902,7 +902,7 @@ async def on_tool_call(data): else: cleaned = final_reply - notification = f"⚡ {summary}\n\n{cleaned}" + notification = f"[TRIGGER] {summary}\n\n{cleaned}" target_session_id = delivery_target["session_id"] owner_user_id = delivery_target.get("owner_user_id") @@ -954,7 +954,7 @@ async def on_tool_call(data): "triggers": [{"name": t.name, "type": t.type} for t in triggers], }, agent_id=agent_id) - logger.info(f"⚡ Triggers fired for {agent.name}: {[t.name for t in triggers]}") + logger.info(f"[TRIGGER] Triggers fired for {agent.name}: {[t.name for t in triggers]}") except Exception as e: logger.error(f"Failed to invoke agent {agent_id} for triggers: {e}") @@ -1110,7 +1110,7 @@ def _decay_chain(): async def start_trigger_daemon(): """Start the background trigger daemon loop. 
Called from FastAPI startup.""" - logger.info("⚡ Trigger Daemon started (15s tick, heartbeat every ~60s)") + logger.info("[TRIGGER] Trigger Daemon started (15s tick, heartbeat every ~60s)") _heartbeat_counter = 0 while True: try: diff --git a/backend/entrypoint.sh b/backend/entrypoint.sh old mode 100755 new mode 100644 diff --git a/frontend/VERSION b/frontend/VERSION index 9ab8337f3..dcb0bac8a 100644 --- a/frontend/VERSION +++ b/frontend/VERSION @@ -1 +1 @@ -1.9.1 +1.8.3-beta.2 diff --git a/frontend/index.html b/frontend/index.html index eba398a27..3e4ffb325 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -7,9 +7,6 @@ Clawith - - - diff --git a/frontend/src/components/ConfirmModal.tsx b/frontend/src/components/ConfirmModal.tsx index 546d4b0a7..71b6db69b 100644 --- a/frontend/src/components/ConfirmModal.tsx +++ b/frontend/src/components/ConfirmModal.tsx @@ -21,16 +21,30 @@ export default function ConfirmModal({ open, title, message, confirmLabel = '确 if (!open) return null; return ( -
{ if (e.target === e.currentTarget) onCancel(); }}> -
+
{ if (e.target === e.currentTarget) onCancel(); }}> +

{title}

{message}

diff --git a/frontend/src/components/Dialog/DialogProvider.tsx b/frontend/src/components/Dialog/DialogProvider.tsx index 356fb3d18..f5f30edbf 100644 --- a/frontend/src/components/Dialog/DialogProvider.tsx +++ b/frontend/src/components/Dialog/DialogProvider.tsx @@ -92,27 +92,31 @@ function DialogModal({ state, onClose }: { state: NonNullable; onClo return (
{ if (e.target === e.currentTarget) onClose(false); }} >
diff --git a/frontend/src/components/PostHireSettingsModal.tsx b/frontend/src/components/PostHireSettingsModal.tsx index df4925a54..1ee98e3f6 100644 --- a/frontend/src/components/PostHireSettingsModal.tsx +++ b/frontend/src/components/PostHireSettingsModal.tsx @@ -238,25 +238,48 @@ function RadioRow({ selected, onClick, title, hint }: { selected: boolean; onCli type="button" onClick={onClick} style={{ - display: 'flex', alignItems: 'flex-start', gap: '10px', - padding: '10px 12px', textAlign: 'left', + display: 'flex', + alignItems: 'flex-start', + gap: '12px', + padding: '14px 16px', + textAlign: 'left', border: `1px solid ${selected ? 'var(--accent-primary)' : 'var(--border-subtle)'}`, - borderRadius: '8px', background: selected ? 'var(--accent-subtle, rgba(99,102,241,0.08))' : 'transparent', - cursor: 'pointer', width: '100%', + borderRadius: '8px', + background: selected ? 'var(--accent-subtle, rgba(99,102,241,0.08))' : 'transparent', + cursor: 'pointer', + width: '100%', + minHeight: '64px', + overflow: 'visible', + boxSizing: 'border-box', }} > {selected && } - - {title} - {hint} - +
+
{title}
+
{hint}
+
); } diff --git a/frontend/src/components/TalentMarketModal.tsx b/frontend/src/components/TalentMarketModal.tsx index ce2efbaed..2f5dc88cd 100644 --- a/frontend/src/components/TalentMarketModal.tsx +++ b/frontend/src/components/TalentMarketModal.tsx @@ -107,45 +107,22 @@ export default function TalentMarketModal({ open, onClose }: Props) { return (
{ if (e.target === e.currentTarget) onClose(); }} > -
+
{/* Header */} -
+
-

+

{t('talentMarket.title', isChinese ? '人才市场' : 'Talent Market')}

-

+

{t('talentMarket.subtitle', isChinese ? '挑选一位专业成员加入你的公司' : 'Pick a professional to join your company')}

{/* Search box */} -
+
{searchQuery && ( @@ -190,13 +158,7 @@ export default function TalentMarketModal({ open, onClose }: Props) {
{tabs.map((tab) => { const isActive = !isSearching && activeTab === tab.id; @@ -206,27 +168,7 @@ export default function TalentMarketModal({ open, onClose }: Props) { role="tab" aria-selected={isActive} onClick={() => { setSearchQuery(''); setActiveTab(tab.id); }} - onMouseEnter={(e) => { - if (!isActive) (e.currentTarget as HTMLButtonElement).style.color = 'var(--text-primary)'; - }} - onMouseLeave={(e) => { - if (!isActive) (e.currentTarget as HTMLButtonElement).style.color = 'var(--text-secondary)'; - }} - style={{ - padding: '14px 18px', - marginBottom: '-1px', - marginRight: '8px', - background: 'transparent', - border: 'none', - borderBottom: `2px solid ${isActive ? 'var(--text-primary)' : 'transparent'}`, - color: isActive ? 'var(--text-primary)' : 'var(--text-secondary)', - fontSize: '13px', - fontWeight: 500, - cursor: 'pointer', - whiteSpace: 'nowrap', - transition: 'color 120ms, border-color 120ms', - outline: 'none', - }} + className={`talent-market-tab${isActive ? ' active' : ''}`} > {tab.label} @@ -235,13 +177,7 @@ export default function TalentMarketModal({ open, onClose }: Props) {
{/* Cards */} -
+
{isLoading && (
{t('common.loading', 'Loading...')} @@ -271,10 +207,7 @@ export default function TalentMarketModal({ open, onClose }: Props) {
{/* Footer */} -
+
{t('talentMarket.footer', isChinese ? '点击聘用·可随时在设置中调整' : 'Hire now · adjust anything in settings later')}
@@ -302,38 +235,20 @@ function TemplateCard({ tpl, hiring, isChinese, onHire }: { : [localized.description].filter(Boolean); return ( -
-
+
+
{tpl.icon || 'AI'}
-
+
{localized.name}
-
+
{tpl.category || 'general'}
-
    +
      {bullets.slice(0, 4).map((b, i) => ( -
    • - +
    • + {b}
    • ))} @@ -356,76 +271,28 @@ function CustomCard({ onClick }: { onClick: () => void }) { return (
      { - (e.currentTarget as HTMLDivElement).style.borderColor = 'var(--accent)'; - }} - onMouseLeave={(e) => { - (e.currentTarget as HTMLDivElement).style.borderColor = 'var(--border-subtle)'; - }} + className="talent-card-custom" >