From 93b92d7af82deb62565b434150363eccd188cd1f Mon Sep 17 00:00:00 2001 From: Adam Brown Date: Fri, 8 May 2026 11:38:57 +0200 Subject: [PATCH] chore(ai): Add check-code-attribution skill (JAVA-499) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a check-code-attribution skill that validates license headers + THIRD_PARTY_NOTICES.md entries for code copied or adapted from third parties. Also verifies license compatibility against Sentry's licensing policy. Focus is limited to the branch diff. Reports any issues found via PR comments (when run on CI) or to the terminal (when run locally). To run it in Claude Code: ``` /check-code-attribution ``` Runs on CI automatically via [Warden](https://warden.sentry.dev/). - Purely advisory / does not block merge. - Generates PR comments with code suggestions for all discovered issues. - Automatically manages removing stale comments as PRs are updated. Current Warden configs: ┌─────────────────┬─────────────────────────────┬───────────────────────────────────────────────────┐ │ Setting │ Value │ Effect │ ├─────────────────┼─────────────────────────────┼───────────────────────────────────────────────────┤ │ model │ anthropic/claude-sonnet-4-6 │ Model used for analysis │ ├─────────────────┼─────────────────────────────┼───────────────────────────────────────────────────┤ │ maxTurns │ 30 │ Max tool calls per chunk │ ├─────────────────┼─────────────────────────────┼───────────────────────────────────────────────────┤ │ skill │ check-code-attribution │ Per-file vendored code attribution check │ ├─────────────────┼─────────────────────────────┼───────────────────────────────────────────────────┤ │ failOn │ off │ Do not fail workflow if attribution issues found │ ├─────────────────┼─────────────────────────────┼───────────────────────────────────────────────────┤ │ reportOn │ medium │ Show findings at >= medium severity via PR comment│ 
├─────────────────┼─────────────────────────────┼───────────────────────────────────────────────────┤ │ requestChanges │ false │ Never post REQUEST_CHANGES comments on PRs │ ├─────────────────┼─────────────────────────────┼───────────────────────────────────────────────────┤ │ failCheck │ false │ No red X on workflow in GitHub UI if it fails │ ├─────────────────┼─────────────────────────────┼───────────────────────────────────────────────────┤ │ triggers │ pull_request + local │ Runs on PR open/sync and local warden invocations │ ├─────────────────┼─────────────────────────────┼───────────────────────────────────────────────────┤ │ reportOnSuccess │ false (default) │ No comment when everything is clean │ └─────────────────┴─────────────────────────────┴───────────────────────────────────────────────────┘ Going forward, we can consider blocking PRs once we've had a chance to vet behavior in the wild. --- .claude/skills/.gitignore | 2 + .../skills/check-code-attribution/SKILL.md | 295 ++++++++++++++ .../validation-tests/EXPECTED.json | 53 +++ .../validation-tests/README.md | 41 ++ .../THIRD_PARTY_NOTICES.catalog.md | 130 ++++++ .../validation-tests/assert-scenarios.mjs | 377 ++++++++++++++++++ .../check-code-attribution-tests.sh | 246 ++++++++++++ .../HeaderCompleteAndNoticePresent.java | 21 + .../HeaderCompleteButNoticeMissing.java | 17 + .../scenarios/HeaderFullyStripped.java | 7 + .../HeaderMissingButNoticePresent.java | 8 + .../HeaderMissingNonEssentialInfo.java | 12 + .../scenarios/HeaderPartiallyStripped.java | 10 + .../scenarios/NewLicenseType.java | 10 + .../THIRD_PARTY_NOTICES.mismatch-snippet.md | 37 ++ .gitignore | 3 + AGENTS.md | 2 + agents.toml | 4 + warden.toml | 100 +++++ 19 files changed, 1375 insertions(+) create mode 100644 .claude/skills/check-code-attribution/SKILL.md create mode 100644 .claude/skills/check-code-attribution/validation-tests/EXPECTED.json create mode 100644 .claude/skills/check-code-attribution/validation-tests/README.md create 
mode 100644 .claude/skills/check-code-attribution/validation-tests/THIRD_PARTY_NOTICES.catalog.md create mode 100755 .claude/skills/check-code-attribution/validation-tests/assert-scenarios.mjs create mode 100755 .claude/skills/check-code-attribution/validation-tests/check-code-attribution-tests.sh create mode 100644 .claude/skills/check-code-attribution/validation-tests/scenarios/HeaderCompleteAndNoticePresent.java create mode 100644 .claude/skills/check-code-attribution/validation-tests/scenarios/HeaderCompleteButNoticeMissing.java create mode 100644 .claude/skills/check-code-attribution/validation-tests/scenarios/HeaderFullyStripped.java create mode 100644 .claude/skills/check-code-attribution/validation-tests/scenarios/HeaderMissingButNoticePresent.java create mode 100644 .claude/skills/check-code-attribution/validation-tests/scenarios/HeaderMissingNonEssentialInfo.java create mode 100644 .claude/skills/check-code-attribution/validation-tests/scenarios/HeaderPartiallyStripped.java create mode 100644 .claude/skills/check-code-attribution/validation-tests/scenarios/NewLicenseType.java create mode 100644 .claude/skills/check-code-attribution/validation-tests/scenarios/THIRD_PARTY_NOTICES.mismatch-snippet.md create mode 100644 warden.toml diff --git a/.claude/skills/.gitignore b/.claude/skills/.gitignore index 229f4495ee3..2dd55eba801 100644 --- a/.claude/skills/.gitignore +++ b/.claude/skills/.gitignore @@ -8,3 +8,5 @@ !test/** !btrace-perfetto/ !btrace-perfetto/** +!check-code-attribution/ +!check-code-attribution/** diff --git a/.claude/skills/check-code-attribution/SKILL.md b/.claude/skills/check-code-attribution/SKILL.md new file mode 100644 index 00000000000..82e31bc2683 --- /dev/null +++ b/.claude/skills/check-code-attribution/SKILL.md @@ -0,0 +1,295 @@ +--- +name: check-code-attribution +description: Per-file check of vendored code attribution in the current branch diff, including license headers, THIRD_PARTY_NOTICES.md entries, and compatibility with 
Sentry's licensing policy +allowed-tools: Bash Read Grep Glob +--- + +**Maintainers:** Only edit files in `.claude/skills/check-code-attribution` (the committed file) and run `npx @sentry/dotagents sync` from the command line to automatically update the matching files in `.agents/skills/check-code-attribution`. + +# Check Code Attribution + +You are reviewing changed files for third-party code attribution compliance in **sentry-java**, an MIT-licensed repository. + +## Local runs — discover changed files first + +When running locally (not via Warden), determine which files changed on this branch: + +```bash +MB=$(git merge-base HEAD origin/main 2>/dev/null || git merge-base HEAD main) +git diff --name-only "${MB}"..HEAD +``` + +Then run the Quick triage and subsequent checks on **every** file in that list. Warden's `ignorePaths` in `warden.toml` lists the paths to skip — apply the same exclusions locally. + +### Warden CLI (optional local parity check) + +Warden does **not** use Cursor auth. Before running Warden locally, configure a provider (same model family as `warden.toml`, or override with `-m`): + +```bash +# Option A: Anthropic API key (matches CI model in warden.toml) +export WARDEN_ANTHROPIC_API_KEY=sk-ant-... # or: export ANTHROPIC_API_KEY=sk-ant-... + +# Option B: Pi OAuth / API key store (~/.pi/agent/auth.json) +npx pi # then run /login and pick Anthropic (or another provider) + +# Option C: Different provider for a one-off run +export WARDEN_OPENAI_API_KEY=sk-... +npx @sentry/warden origin/main..HEAD --skill check-code-attribution -m openai/gpt-5.5 -vv +``` + +```bash +npx @sentry/warden origin/main..HEAD --skill check-code-attribution -vv +``` + +If you only need attribution review in the IDE, `/check-code-attribution` in Cursor does not require Warden credentials. + +When running via Warden, the changed file is already provided — skip branch-wide discovery, but follow **Warden execution** below. 
+ +## Warden execution + +Warden analyzes one changed file per run (whole-file mode). Complete every Quick triage step — the diff alone is not sufficient. + +**Mandatory on every run (do not skip):** + +1. `Read` the first 50 lines of the changed file. +2. `Grep` `THIRD_PARTY_NOTICES.md` for the class name (filename without extension, e.g. `ANRWatchDog` for `ANRWatchDog.java`). On renames, also grep the old basename and read Scope sections (see Quick triage). +3. When Bash is available, compare the merge-base header: + ```bash + MB=$(git merge-base HEAD origin/main 2>/dev/null || git merge-base HEAD main) + git show "${MB}:<path-to-changed-file>" | head -50 + ``` + +**Do not dismiss findings because:** + +- A `THIRD_PARTY_NOTICES.md` entry exists — file headers are still required; NOTICES does not replace them. +- The diff only removes a header comment block — if removed `-` lines include a **required field** (see below) or vendoring language ("adapted from", etc.), attribution was stripped. Removing boilerplate alone is not stripping. +- The header says "Adapted from …" but omits copyright holder or license name — flag missing header fields. +- The file header has all four required fields — a missing THIRD_PARTY_NOTICES.md entry is independently required and is ⚠️ medium regardless of header completeness. + +For `THIRD_PARTY_NOTICES.md` runs: for every **removed** entry in the diff, use `Read` or `Glob` to confirm whether Scope files still exist with attribution headers. If they do, the entry must not be removed. + +## Quick triage + +Sentry's own files carry **no** copyright headers — any copyright/license line indicates third-party code. Every file that reaches this skill is in scope — do not skip files based on extension. + +If this file is `THIRD_PARTY_NOTICES.md`, go to the THIRD_PARTY_NOTICES section below. + +For all other files, perform these checks **before** deciding whether to proceed: + +1. **Read the file header** — use the Read tool to read the first 50 lines of the file. 
Look for vendored-code signals: `Copyright`, `Licensed under`, `SPDX-License-Identifier`, or vendoring language ("adapted from", "backported from", "based on", "copied from", "derived from", "inspired by", "ported from", "translated from", "vendored"). +2. **Check THIRD_PARTY_NOTICES.md** — use Grep to search `THIRD_PARTY_NOTICES.md` for the file name without extension (e.g., search for `ANRWatchDog` when reviewing `ANRWatchDog.java`). A match means this is a known vendored file. **Renames:** if the diff is a rename (`similarity index` / `rename from` in the diff, or a delete of one path and add of another with the same content), also Grep for the **old** basename and read **Scope** sections in matching entries — NOTICES may still reference the previous class or path name. + > **A complete NOTICES entry does NOT end the check.** It confirms the file is vendored and that the NOTICES requirement is satisfied. The file header is a separate, additional requirement — continue to header verification regardless of NOTICES completeness. +3. **Scan the diff** — check for vendored-code signals on both added (`+`) and **removed (`-`)** lines. Removed lines that drop a **required field** (copyright, license name, source URL, vendoring origin) ARE signals. Removed disclaimer/boilerplate lines alone are not. + +**A signal in ANY of these three sources means this is vendored code — proceed to the vendored source file section.** + +A file referenced in THIRD_PARTY_NOTICES.md is ALWAYS vendored, even if its current header has no attribution. + +**If none of the three sources have signals, report no findings and stop.** + +--- + +## If this file is `THIRD_PARTY_NOTICES.md` + +Validate the changed entries using the diff context: + +1. For each added or modified entry, verify it has all required fields: **Source URL**, **License name**, **Copyright**, **Scope** (file paths), and **full license text** in a fenced code block. +2. 
For each Scope path, verify the file(s) exist (use Glob or Read). +3. Flag new license types using the same license-tier table as for source files: weak copyleft (LGPL, MPL, EPL) → 🚨 **high**, strong copyleft (GPL) → 🚨 **high**, AGPL → 🚨 **high** (absolute ban, must be removed). Do not use low or medium for copyleft or AGPL. +4. Flag orphaned entries whose Scope files no longer exist. +5. For **removed** entries (lines prefixed with `-` in the diff), use Read to check whether the Scope files still exist and still have attribution headers. If they do, the entry must not be removed. +6. Check **copyright consistency** — the Copyright field must match the copyright line inside the embedded license text. Flag mismatches. + +--- + +## If this is a vendored file + +### 1. Check attribution header + +Check each of the following by reading the file header — not NOTICES. Each is an independent yes/no; a "no" is ⚠️ medium regardless of NOTICES completeness: + +- [ ] **Vendoring origin phrase** — explicit wording such as `Adapted from …`, `Based on …`, `Vendored from …`, or a library name. +- [ ] **Copyright line** — e.g. `Copyright (c) 2016 …`, `Copyright 2010 Square, Inc.` +- [ ] **License name** — e.g. `Licensed under the Apache License, Version 2.0`, `The MIT License` +- [ ] **Source URL** — e.g. `https://github.com/…` + +Exact wording and comment style may vary. **Do not flag** missing or changed content that is not one of these four fields. + +**Each field must be physically present in the file header. A complete `THIRD_PARTY_NOTICES.md` entry does not satisfy any required field — both are independently required. 
Check each of the four fields by reading the file header, not by reasoning from NOTICES.** + +**Not required in the file header** (full text belongs in `THIRD_PARTY_NOTICES.md`, not in every source file): + +- Full license boilerplate (MIT permission paragraph, Apache "Unless required by applicable law…" disclaimer, ASF contributor grant preamble) +- Wording differences vs the NOTICES embedded license text (e.g. shortened Apache header vs canonical ASF phrasing) +- Comment style (`//` vs `/* */`), line wrapping, or extra Sentry modification notes + +Compare the current header against the NOTICES entry **only for the four required fields** — e.g. if NOTICES says MIT by "Salomon BRYS" but the header has no copyright or license name, flag it. If both have copyright + license name but the header omits the Apache disclaimer while NOTICES still has the full text, **do not flag**. + +When Bash is available (local runs), also compare against the merge-base version for additional context: +```bash +MB=$(git merge-base HEAD origin/main 2>/dev/null || git merge-base HEAD main) +git show "${MB}:<path-to-changed-file>" | head -50 +``` + +Flag these issues: +- **Header stripped** — file is in NOTICES but current header has none of the four required fields +- **Header truncated** — one or more **required** fields were removed (e.g. 
copyright line or `Licensed under …` removed) while the file remains vendored +- **Header inconsistent** — a **required** field contradicts NOTICES (wrong copyright holder/year, wrong license name) — not boilerplate or phrasing differences +- **Diff removes required attribution** — removed `-` lines drop a required field or vendoring origin (`Adapted from`, etc.); removing disclaimer/boilerplate lines alone is **not** this + +**Do not report** (no finding — prefer silence): + +- Apache/MIT disclaimer or permission paragraphs removed but all four required fields remain +- Header reworded to a shorter permissive-license form with the same copyright holder and license name +- Header and NOTICES differ only in full license body text (wording or boilerplate, not missing required fields) + +These exceptions apply only when an entry already exists in NOTICES and only to header-vs-NOTICES wording differences. A **missing** NOTICES entry is ⚠️ medium per section 2 — never covered by these exceptions. + +### 2. Check THIRD_PARTY_NOTICES.md entry + +**Severity: always `medium`. Do not output `severity: "low"` for a missing entry even if the attribution header is complete.** + +`THIRD_PARTY_NOTICES.md` is a mandatory legal exhibit that Sentry ships with every SDK distribution. It must enumerate all vendored code regardless of what the source file header says. A missing entry is a distribution-level compliance failure, not a nit. A complete file header does not satisfy the NOTICES requirement — both are mandatory. + +From the Grep in Quick triage: if no matching entry exists, output `severity: "medium"` and flag as ⚠️ Missing THIRD_PARTY_NOTICES.md entry. A valid entry needs: Source URL, License name, Copyright, Scope, full license text. + +### 3. 
Check license compatibility + +Classify the license per Sentry's Open Source Legal Policy (https://open.sentry.io/licensing/): + +| Tier | Examples | Finding | +|-----------------|-------------------------------------------------|---------------------------------------------| +| Permissive | MIT, BSD, Apache 2.0, ISC, CC0, Unlicense, Zlib | None — license is compatible | +| Weak copyleft | LGPL, MPL, EPL, CDDL | 🚨 **high** — requires review | +| Strong copyleft | GPL, QPL, Sleepycat, OSL | 🚨 **high** — requires legal review | +| AGPL | — | 🚨 **high** — absolute ban, must be removed | +| No license | — | 🚨 **high** — assume no permission | + +**Permissive licenses:** do not report a finding solely because the license is MIT/BSD/Apache/etc. Only flag missing or stripped **required** header fields, or missing/inconsistent `THIRD_PARTY_NOTICES.md` entry. Do not flag disclaimer/boilerplate-only diffs. Copyleft and unlicensed code still get 🚨 findings per the table. + +--- + +## If this is a deleted vendored file + +If the diff deletes a file and the removed lines contained attribution headers, check whether `THIRD_PARTY_NOTICES.md` still references it — the entry should be updated or removed. + +--- + +## Severity guide + +| Level | Use for | +|------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **high** | 🚨 License violations: AGPL, copyleft, unlicensed, no-license code | +| **medium** | ⚠️ Missing **required** header fields, stripped required fields, missing/inconsistent NOTICES entries (even when header is complete), deleted/renamed vendored files needing NOTICES update | +| **low** | 👀 Cosmetic/style differences only (shortened license wording, comment style). **Never** use for a missing NOTICES entry or missing header field — those are always medium. 
| + +Warden relies on these severity levels when deciding whether to comment on PRs or require changes. Put the severity emoji **only on the finding title** (see Output) so reviewers can triage at a glance. + +## Output + +**No issues → empty response (say nothing).** + +Otherwise, report each finding ordered by severity (most severe first). + +### Emoji placement (required) + +Use the emoji from the severity guide (🚨, ⚠️, or 👀) — not the word `high`, `medium`, or `low`. + +| Field | Emoji? | Example | +|-------------------|--------------------------|----------------------------------------------------------------------------------------------------------------------------------------| +| **Title** | Yes — once, at the start | `⚠️ Copyright line stripped from vendored file header` | +| **Description** | **No** | `**io.sentry.cache.tape.FileObjectQueue** — The Copyright (C) 2010 Square, Inc. line was removed…` (see **Description subject** below) | +| **Verification** | **No** | Evidence steps only | +| **Suggested fix** | **No** | Fix text only | + +**Good (Warden PR comment):** + +``` +Title: ⚠️ Copyright line stripped from vendored file header +Description: **io.sentry.cache.tape.FileObjectQueue** — The `Copyright (C) 2010 Square, Inc.` line was removed from this vendored file's header. Please restore the copyright line. +``` + +**Bad — emoji in the description (never do this):** + +``` +Title: ⚠️ Copyright line stripped from vendored file header +Description: ⚠️ The `Copyright (C) 2010 Square, Inc.` line was removed… +``` + +**Bad — emoji before the class name:** + +``` +Title: ⚠️ Copyright line stripped from vendored file header +Description: ⚠️ **io.sentry.cache.tape.FileObjectQueue** — The copyright line was removed… +``` + +### Description subject (required) + +Every description **must** start with `**<subject>** —` (bold subject, space, em dash, space). 
Pick **one** subject by file type: + +| File type | Subject format | Example | +|-------------------------------------------------------------------------------------------|----------------------------------------------------------------------|----------------------------------------------------------------| +| Java / Kotlin source (`.java`, `.kt`) with a top-level type | Fully qualified class name (FQCN) | `**io.sentry.CircularFifoQueue** —` | +| Java / Kotlin with no single clear type (multiple top-level types, unclear which changed) | FQCN of the primary type under review, or repo-relative path if none | `**sentry/src/.../Foo.kt** —` | +| `THIRD_PARTY_NOTICES.md` | `THIRD_PARTY_NOTICES.md — <entry heading>` | `**THIRD_PARTY_NOTICES.md — Square — Seismic (Apache 2.0)** —` | +| Gradle / other scripts (e.g. `.kts`, `.gradle`) | Repo-relative path from repository root | `**build.gradle.kts** —` | + +- Prefer **FQCN** for `.java` / `.kt` vendored source (derive from `package` + primary public top-level class). Do not use file paths when a FQCN is clear. +- For license-tier / policy issues, include https://open.sentry.io/licensing/ in the description body. + +### Warden runs + +For each finding, set these fields exactly: + +| Field | Value | +|------------------|-------------------------------------------------------------------------------------------------------------------| +| **severity** | `high`, `medium`, or `low` — **never** put emoji here; Warden maps severity from this field, not from the title | +| **title** | `<emoji> <short title>` — emoji allowed **only** here (imperative, no class name) | +| **description** | `**<subject>** — <details>` — **plain text only**; subject per **Description subject** above | +| **verification** | Optional evidence steps — plain text only | + +**Description rules (Warden):** + +- **Must** match `**<subject>** — …` using the table in **Description subject**. +- **Must not** contain 🚨, ⚠️, 👀, or the words `high`, `medium`, or `low` as severity labels. 
+- **Must not** repeat the title or paraphrase it with an emoji prefix. + +**Good (NOTICES entry removed while scope files remain):** + +``` +Title: ⚠️ NOTICES entry removed for vendored code still in tree +Description: **THIRD_PARTY_NOTICES.md — Square — Seismic (Apache 2.0)** — The Seismic entry was removed but `io.sentry.android.core.SentryShakeDetector` still has an attribution header. Restore the entry or remove attribution from the scope files. +``` + +**Before submitting findings:** For every finding, confirm `description` does not match `[🚨⚠️👀]` and matches `^\*\*.+\*\* — `. If it contains any emoji, rewrite the description without it. + +### Local / IDE runs + +Use this numbered format — same title vs description split as above: + +``` +1\. <emoji> **<title>** + **<subject>** — <description> + +2\. <emoji> **<title>** + **<subject>** — <description> +``` + +Rules: + +- Put the severity emoji **only** on the title line (`1\. ⚠️ **…**`), never on the description line. +- The description line uses `**<subject>** —` per **Description subject** and must not contain 🚨, ⚠️, or 👀. +- **Escape the period** after the number (`1\.` not `1.`) so markdown does not collapse entries into a tight list. +- Leave an empty line between each numbered finding. + +## Validation (maintainers) + +Test samples live under `validation-tests/` and are excluded from normal runs via `.claude/**` in `warden.toml`. + +```bash +.claude/skills/check-code-attribution/validation-tests/check-code-attribution-tests.sh +``` + +Expected outcomes are in `validation-tests/EXPECTED.json`. The script creates isolated git worktrees, runs Warden with `--report-on medium --json`, and asserts per-scenario pass/fail. Scenarios marked `"isolated": true` in `EXPECTED.json` each get their own worktree to avoid Anthropic prompt-cache priming that can suppress findings below medium in concurrent batches. Exit 0 = all pass. 
+ +When manually reviewing a file under `validation-tests/scenarios/`, grep `validation-tests/THIRD_PARTY_NOTICES.catalog.md` in addition to root `THIRD_PARTY_NOTICES.md` in Quick triage step 2. See `validation-tests/README.md`. diff --git a/.claude/skills/check-code-attribution/validation-tests/EXPECTED.json b/.claude/skills/check-code-attribution/validation-tests/EXPECTED.json new file mode 100644 index 00000000000..a82637b84e2 --- /dev/null +++ b/.claude/skills/check-code-attribution/validation-tests/EXPECTED.json @@ -0,0 +1,53 @@ +[ + { + "id": "header-complete-and-notice-present", + "file": "HeaderCompleteAndNoticePresent.java", + "expectFinding": false, + "notes": "Header matches catalog entry" + }, + { + "id": "header-complete-but-notice-missing", + "file": "HeaderCompleteButNoticeMissing.java", + "expectFinding": true, + "isolated": true, + "notes": "Full header; no catalog / root NOTICES entry. Isolated: prompt-cache priming in a concurrent batch suppresses the missing-NOTICES finding below medium." + }, + { + "id": "header-missing-but-notice-present", + "file": "HeaderMissingButNoticePresent.java", + "expectFinding": true, + "isolated": true, + "notes": "NOTICES entry claims file is vendored but file has no attribution header. Isolated: a complete NOTICES entry suppresses the missing-header finding in a concurrent batch." 
+ }, + { + "id": "header-fully-stripped", + "file": "HeaderFullyStripped.java", + "expectFinding": true, + "notes": "Header has no required attribution fields" + }, + { + "id": "header-partially-stripped", + "file": "HeaderPartiallyStripped.java", + "expectFinding": true, + "notes": "Adapted from + URL only; no copyright or license name" + }, + { + "id": "header-missing-non-essential-info", + "file": "HeaderMissingNonEssentialInfo.java", + "expectFinding": false, + "notes": "All four required fields present; no license boilerplate — boilerplate is not required in the header" + }, + { + "id": "header-vs-notice-mismatch", + "file": "THIRD_PARTY_NOTICES.md", + "expectFinding": true, + "isolated": true, + "notes": "Copyright in metadata field does not match embedded license text. Isolated: mismatch finding needs an independent assertion free of interference from other NOTICES changes." + }, + { + "id": "new-license-type", + "file": "NewLicenseType.java", + "expectFinding": true, + "notes": "AGPL v3 license in file header — absolute ban, must be removed" + } +] diff --git a/.claude/skills/check-code-attribution/validation-tests/README.md b/.claude/skills/check-code-attribution/validation-tests/README.md new file mode 100644 index 00000000000..d1217dcb0c3 --- /dev/null +++ b/.claude/skills/check-code-attribution/validation-tests/README.md @@ -0,0 +1,41 @@ +# Attribution skill validation tests + +Self-contained samples for validating `check-code-attribution` without touching production SDK sources. + + +## Run the tests + +```bash +./check-code-attribution-tests.sh +``` + +Requires Node.js and an Anthropic API key (`WARDEN_ANTHROPIC_API_KEY` or `ANTHROPIC_API_KEY`). See SKILL.md "Warden CLI" section for all auth options. + +In practice, straight command line runs tend to be a bit flakier than asking Claude Code to run the tests for you. + +## Layout + +- `EXPECTED.json` — scenario IDs and expected outcomes (single source of truth). 
+- `THIRD_PARTY_NOTICES.catalog.md` — NOTICES-style entries for validation class names. +- `scenarios/` — `.java` files and `THIRD_PARTY_NOTICES.mismatch-snippet.md` (copyright-mismatch fixture). +- `check-code-attribution-tests.sh` — runs Warden on a temp branch and asserts per-scenario pass/fail. +- `assert-scenarios.mjs` — validation driver (`list-isolated`, `routing-set`, `assert` subcommands); parses Warden JSONL and checks outcomes from `EXPECTED.json`. + +### assert-scenarios.mjs commands + +```bash +node assert-scenarios.mjs validate EXPECTED.json scenarios/ # pre-flight (no API); run automatically by the shell script +node assert-scenarios.mjs list-isolated EXPECTED.json # id and file per isolated scenario +node assert-scenarios.mjs list-main-java EXPECTED.json scenarios/ # .java files for the main Warden batch +node assert-scenarios.mjs routing-set routing.json <id> <warden-jsonl-path> # update id → Warden JSONL path +node assert-scenarios.mjs assert EXPECTED.json routing.json +``` + +Warden runs are limited to 300s. On macOS the script uses `gtimeout` (from `brew install coreutils`) when available, otherwise GNU `timeout`, otherwise `perl` with `alarm`. + +## Add a scenario + +1. Add `scenarios/<ScenarioName>.java`. +2. Add or omit a catalog entry in `THIRD_PARTY_NOTICES.catalog.md`. +3. Add an entry to `EXPECTED.json`. +4. **Isolation (if needed):** If the scenario relies on a finding that could be suppressed by Anthropic prompt-cache priming when analyzed alongside many other files (e.g. a missing-NOTICES entry, or a missing header on a file that has a complete NOTICES entry), add `"isolated": true` to its `EXPECTED.json` entry. The test script creates a dedicated worktree for each isolated scenario automatically — no changes to the script itself are needed. 
diff --git a/.claude/skills/check-code-attribution/validation-tests/THIRD_PARTY_NOTICES.catalog.md b/.claude/skills/check-code-attribution/validation-tests/THIRD_PARTY_NOTICES.catalog.md new file mode 100644 index 00000000000..0b9a9af364b --- /dev/null +++ b/.claude/skills/check-code-attribution/validation-tests/THIRD_PARTY_NOTICES.catalog.md @@ -0,0 +1,130 @@ +# Test THIRD_PARTY_NOTICES catalog (not shipped) + +Used only when validating `check-code-attribution` against `validation-tests/scenarios/**`. +Grep this file in addition to the repository root `THIRD_PARTY_NOTICES.md`. + +--- + +## Example — HeaderFullyStripped (MIT) + +**Source:** https://github.com/example/attribution-fixtures
+**License:** MIT License
+**Copyright:** Copyright (c) 2016 Example Author + +### Scope + +Attribution validation sample. The code resides in `io.sentry.skills.verification.HeaderFullyStripped` (`validation-tests/scenarios/HeaderFullyStripped.java`). + +``` +MIT License + +Copyright (c) 2016 Example Author + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +``` + +--- + +## Example — HeaderMissingButNoticePresent (Apache 2.0) + +**Source:** https://github.com/example/notices-without-header
+**License:** Apache License 2.0
+**Copyright:** Copyright 2023 Example Corp. + +### Scope + +Attribution validation sample. The code resides in `io.sentry.skills.verification.HeaderMissingButNoticePresent`. + +``` +Copyright 2023 Example Corp. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +``` + +--- + +## Example — HeaderMissingNonEssentialInfo (MIT) + +**Source:** https://github.com/example/examplelib
+**License:** MIT License
+**Copyright:** Copyright 2020 Example Corp. + +### Scope + +Attribution validation sample. The code resides in `io.sentry.skills.verification.HeaderMissingNonEssentialInfo`. + +``` +MIT License + +Copyright (c) 2020 Example Corp. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +``` + +--- + +## Example — HeaderCompleteAndNoticePresent (Apache 2.0) + +**Source:** https://github.com/example/complete-with-notices
+**License:** Apache License 2.0
+**Copyright:** Copyright 2020 Example Authors + +### Scope + +Attribution validation sample. The code resides in `io.sentry.skills.verification.HeaderCompleteAndNoticePresent`. + +``` +Copyright 2020 Example Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +``` diff --git a/.claude/skills/check-code-attribution/validation-tests/assert-scenarios.mjs b/.claude/skills/check-code-attribution/validation-tests/assert-scenarios.mjs new file mode 100755 index 00000000000..118d99f10d7 --- /dev/null +++ b/.claude/skills/check-code-attribution/validation-tests/assert-scenarios.mjs @@ -0,0 +1,377 @@ +#!/usr/bin/env node +/** + * Validation driver for check-code-attribution scenario tests. + * + * Usage: + * node assert-scenarios.mjs validate <expected.json> <scenarios-dir> + * node assert-scenarios.mjs list-isolated <expected.json> + * node assert-scenarios.mjs list-main-java <expected.json> <scenarios-dir> + * node assert-scenarios.mjs routing-set <routing.json> <id> <jsonl-path> + * node assert-scenarios.mjs assert <expected.json> <dest-pkg> <routing.json> + * + * routing.json maps scenario id to Warden JSONL output path, e.g. { "main": "/tmp/..." }. + * Non-isolated scenarios use the "main" entry when no dedicated id is present.
+ */ + +import fs from 'node:fs'; +import path from 'node:path'; +import { pathToFileURL } from 'node:url'; + +const ISOLATED_FILE_JAVA = /\.java$/i; +const ISOLATED_FILE_NOTICES = 'THIRD_PARTY_NOTICES.md'; + +export function loadExpected(expectedPath) { + return JSON.parse(fs.readFileSync(expectedPath, 'utf8')); +} + +export function listIsolated(scenarios) { + return scenarios.filter((s) => s.isolated); +} + +/** Repo-relative path normalization for Warden JSONL matching. */ +export function normalizeRepoPath(filePath) { + if (!filePath) return filePath; + return filePath.replace(/\\/g, '/').replace(/^\.\//, '').replace(/\/+/g, '/'); +} + +/** True when a Warden-reported path refers to the expected scenario file. */ +export function pathMatchesWardenFile(reportedPath, wardenFile) { + const reported = normalizeRepoPath(reportedPath); + const expected = normalizeRepoPath(wardenFile); + if (reported === expected) return true; + const base = expected.split('/').pop(); + return base != null && reported.endsWith(`/${base}`); +} + +export function findingCountForFile(fileMap, wardenFile) { + const expected = normalizeRepoPath(wardenFile); + if (fileMap[expected] != null) return fileMap[expected]; + for (const [key, count] of Object.entries(fileMap)) { + if (pathMatchesWardenFile(key, wardenFile)) return count; + } + return 0; +} + +export function findingsForFile(findings, wardenFile) { + return findings.filter( + (f) => f.location && pathMatchesWardenFile(f.location.path, wardenFile), + ); +} + +export function listMainBatchJava(scenarios, scenariosDir) { + const isolatedJava = new Set( + listIsolated(scenarios) + .map((s) => s.file) + .filter((file) => ISOLATED_FILE_JAVA.test(file)), + ); + return fs + .readdirSync(scenariosDir) + .filter((name) => name.endsWith('.java') && !isolatedJava.has(name)) + .sort(); +} + +/** + * @returns {string[]} validation error messages (empty = ok) + */ +export function validateExpected(scenarios, scenariosDir) { + const errors = []; + 
+ if (!Array.isArray(scenarios)) { + return ['EXPECTED.json must be a JSON array']; + } + + const ids = new Set(); + const expectedJava = new Set(); + + for (const [index, s] of scenarios.entries()) { + const label = `entry ${index}`; + if (!s || typeof s !== 'object') { + errors.push(`${label}: must be an object`); + continue; + } + if (typeof s.id !== 'string' || !s.id) { + errors.push(`${label}: missing or empty "id"`); + } else { + if (ids.has(s.id)) errors.push(`duplicate id "${s.id}"`); + ids.add(s.id); + if (s.id === 'main') { + errors.push(`id "main" is reserved for routing.json`); + } + } + if (typeof s.file !== 'string' || !s.file) { + errors.push(`${label}: missing or empty "file"`); + } else if (ISOLATED_FILE_JAVA.test(s.file)) { + expectedJava.add(s.file); + const onDisk = path.join(scenariosDir, s.file); + if (!fs.existsSync(onDisk)) { + errors.push(`${s.id}: scenarios/${s.file} does not exist`); + } + } else if (s.file !== ISOLATED_FILE_NOTICES) { + errors.push( + `${s.id}: unsupported file "${s.file}" (use *.java or ${ISOLATED_FILE_NOTICES})`, + ); + } + if (typeof s.expectFinding !== 'boolean') { + errors.push(`${s.id ?? 
label}: "expectFinding" must be a boolean`); + } + if (s.isolated) { + if ( + !ISOLATED_FILE_JAVA.test(s.file) && + s.file !== ISOLATED_FILE_NOTICES + ) { + errors.push( + `${s.id}: isolated scenarios must use *.java or ${ISOLATED_FILE_NOTICES}`, + ); + } + } + } + + let diskJava = []; + try { + diskJava = fs.readdirSync(scenariosDir).filter((n) => n.endsWith('.java')); + } catch (e) { + errors.push(`cannot read scenarios dir ${scenariosDir}: ${e.message}`); + return errors; + } + + for (const name of diskJava) { + if (!expectedJava.has(name)) { + errors.push(`scenarios/${name} has no matching entry in EXPECTED.json`); + } + } + + if (listMainBatchJava(scenarios, scenariosDir).length === 0) { + errors.push('main Warden batch needs at least one non-isolated .java scenario'); + } + + return errors; +} + +export function parseWardenJsonl(jsonlPath) { + /** @type {Record} */ + const fileMap = {}; + const allFindings = []; + try { + const raw = fs.readFileSync(jsonlPath, 'utf8').trim(); + if (!raw) return { fileMap, findings: [] }; + const records = raw + .split('\n') + .filter((l) => l.trim()) + .map((l) => JSON.parse(l)); + for (const record of records) { + const file = record.chunk && record.chunk.file; + if (!file) continue; + const normalized = normalizeRepoPath(file); + const recordFindings = record.findings || []; + fileMap[normalized] = (fileMap[normalized] || 0) + recordFindings.length; + for (const f of recordFindings) { + allFindings.push({ + ...f, + location: f.location || { path: normalized, startLine: 1 }, + }); + } + } + } catch (e) { + console.error( + 'ERROR: Could not parse Warden output from ' + jsonlPath + ':', + e.message, + ); + process.exit(2); + } + return { fileMap, findings: allFindings }; +} + +export function routingSet(routingPath, id, jsonlPath) { + const routing = JSON.parse(fs.readFileSync(routingPath, 'utf8')); + routing[id] = jsonlPath; + fs.writeFileSync(routingPath, JSON.stringify(routing)); +} + +function 
wardenFileForScenario(destPkg, scenario) { + return scenario.file === ISOLATED_FILE_NOTICES + ? ISOLATED_FILE_NOTICES + : `${destPkg}/${scenario.file}`; +} + +function loadRouting(routingPath) { + /** @type {Record} */ + let routing; + try { + routing = JSON.parse(fs.readFileSync(routingPath, 'utf8')); + } catch (e) { + console.error(`ERROR: Could not read routing file ${routingPath}:`, e.message); + process.exit(2); + } + + if (typeof routing.main !== 'string' || !routing.main) { + console.error('ERROR: routing.json must include a non-empty "main" JSONL path.'); + process.exit(2); + } + return routing; +} + +function cmdValidate(expectedPath, scenariosDir) { + if (!expectedPath || !scenariosDir) { + console.error( + 'Usage: node assert-scenarios.mjs validate ', + ); + process.exit(2); + } + const errors = validateExpected(loadExpected(expectedPath), scenariosDir); + if (errors.length > 0) { + console.error('EXPECTED.json validation failed:'); + for (const err of errors) console.error(` - ${err}`); + process.exit(1); + } + console.log('EXPECTED.json OK'); +} + +function cmdListIsolated(expectedPath) { + for (const s of listIsolated(loadExpected(expectedPath))) { + process.stdout.write(`${s.id}\t${s.file}\n`); + } +} + +function cmdListMainJava(expectedPath, scenariosDir) { + if (!expectedPath || !scenariosDir) { + console.error( + 'Usage: node assert-scenarios.mjs list-main-java ', + ); + process.exit(2); + } + for (const name of listMainBatchJava(loadExpected(expectedPath), scenariosDir)) { + process.stdout.write(`${name}\n`); + } +} + +function cmdRoutingSet(routingPath, id, jsonlPath) { + if (!routingPath || !id || !jsonlPath) { + console.error( + 'Usage: node assert-scenarios.mjs routing-set ', + ); + process.exit(2); + } + routingSet(routingPath, id, jsonlPath); +} + +function cmdAssert(expectedPath, destPkg, routingPath) { + if (!expectedPath || !destPkg || !routingPath) { + console.error( + 'Usage: node assert-scenarios.mjs assert ', + ); + process.exit(2); 
+ } + + const routing = loadRouting(routingPath); + const scenarios = loadExpected(expectedPath); + + /** @type {Record>} */ + const parsed = {}; + function getSource(id) { + const jsonlPath = routing[id] ?? routing.main; + if (!parsed[jsonlPath]) parsed[jsonlPath] = parseWardenJsonl(jsonlPath); + return parsed[jsonlPath]; + } + + const GREEN = '\x1b[32m'; + const RED = '\x1b[31m'; + const RESET = '\x1b[0m'; + + const failures = []; + let pass = 0; + + for (const s of scenarios) { + if (s.isolated && !routing[s.id]) { + console.error( + `ERROR: isolated scenario "${s.id}" has no routing entry (missing Warden run?)`, + ); + process.exit(2); + } + + const wardenFile = wardenFileForScenario(destPkg, s); + const source = getSource(s.id); + const count = findingCountForFile(source.fileMap, wardenFile); + const passed = s.expectFinding ? count > 0 : count === 0; + + if (passed) { + console.log(`${GREEN}PASS${RESET} ${s.id}`); + pass++; + } else { + const reason = s.expectFinding + ? 'expected finding (>= medium), got none' + : `expected no finding (>= medium), got ${count}`; + console.log(`${RED}FAIL${RESET} ${s.id} (${reason})`); + + failures.push({ + id: s.id, + findings: findingsForFile(source.findings, wardenFile), + }); + } + } + + const total = scenarios.length; + console.log(''); + console.log(`${total} scenarios: ${pass} passed, ${total - pass} failed`); + + if (failures.length > 0) { + console.log(''); + console.log('Warden output'); + console.log('══════════════════════'); + + for (const { id, findings } of failures) { + console.log(''); + console.log(id); + console.log('-'.repeat(id.length)); + if (findings.length === 0) { + console.log('(Warden produced no findings for this file)'); + } else { + for (const f of findings) { + console.log(f.title); + if (f.description) console.log(f.description); + if (f.verification) console.log('\nVerification: ' + f.verification); + console.log(''); + } + } + } + + process.exit(1); + } +} + +function usage() { + 
console.error(`Usage: + node assert-scenarios.mjs validate + node assert-scenarios.mjs list-isolated + node assert-scenarios.mjs list-main-java + node assert-scenarios.mjs routing-set + node assert-scenarios.mjs assert `); + process.exit(2); +} + +function main() { + const [, , cmd, ...args] = process.argv; + switch (cmd) { + case 'validate': + cmdValidate(args[0], args[1]); + break; + case 'list-isolated': + if (!args[0]) usage(); + cmdListIsolated(args[0]); + break; + case 'list-main-java': + cmdListMainJava(args[0], args[1]); + break; + case 'routing-set': + cmdRoutingSet(args[0], args[1], args[2]); + break; + case 'assert': + cmdAssert(args[0], args[1], args[2]); + break; + default: + usage(); + } +} + +if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) { + main(); +} diff --git a/.claude/skills/check-code-attribution/validation-tests/check-code-attribution-tests.sh b/.claude/skills/check-code-attribution/validation-tests/check-code-attribution-tests.sh new file mode 100755 index 00000000000..713815734a2 --- /dev/null +++ b/.claude/skills/check-code-attribution/validation-tests/check-code-attribution-tests.sh @@ -0,0 +1,246 @@ +#!/usr/bin/env bash +# check-code-attribution-tests.sh — Validate the check-code-attribution skill against synthetic scenarios. +# +# Usage: +# ./check-code-attribution-tests.sh [--help] +# +# What it does: +# 1. Validates EXPECTED.json and scenario fixtures (no API calls). +# 2. Creates an isolated git worktree on a temp branch from HEAD. +# 3. Creates a diff (non-isolated .java files, NOTICES catalog, mismatch snippet), +# commits, and runs Warden on the main batch. +# 4. Scenarios marked "isolated" in EXPECTED.json each get their own worktree and Warden +# run to avoid prompt-cache priming that can suppress findings in concurrent batches. +# 5. Asserts per-scenario pass/fail against EXPECTED.json (>= medium findings only). +# 6. Prints Warden's actual output for each failing scenario. +# 7. 
Cleans up all worktrees. +# +# Requires: +# - Node.js / npx +# - One of: WARDEN_ANTHROPIC_API_KEY, ANTHROPIC_API_KEY, or Pi OAuth config +# (see SKILL.md "Warden CLI" section for setup options) + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../../../.." && pwd)" +SCENARIOS_DIR="$SCRIPT_DIR/scenarios" +CATALOG="$SCRIPT_DIR/THIRD_PARTY_NOTICES.catalog.md" +EXPECTED_JSON="$SCRIPT_DIR/EXPECTED.json" +VALIDATION="$SCRIPT_DIR/assert-scenarios.mjs" +MISMATCH_SNIPPET="$SCENARIOS_DIR/THIRD_PARTY_NOTICES.mismatch-snippet.md" + +# Destination path inside the worktree — must not appear in warden.toml ignorePaths. +DEST_PACKAGE_PATH="sentry/src/test/java/io/sentry/skills/verification" + +# Warden wall-clock limit (seconds). +TIMEOUT_SEC=300 + +die() { echo "ERROR: $*" >&2; exit 1; } + +show_usage() { + cat <<'EOF' +Usage: check-code-attribution-tests.sh [--help] + +Validates the check-code-attribution skill against all scenarios in EXPECTED.json. +Runs Warden on a temporary branch and asserts per-scenario pass/fail (>= medium findings). + +Prerequisites: + - Node.js (npx) + - API key: WARDEN_ANTHROPIC_API_KEY or ANTHROPIC_API_KEY + (or Pi OAuth: npx pi && /login — see SKILL.md "Warden CLI" section) + - Wall-clock limit: gtimeout (brew install coreutils), GNU timeout, or perl +EOF +} + +[[ "${1:-}" == "--help" || "${1:-}" == "-h" ]] && { show_usage; exit 0; } + +# --- prereq checks --- + +command -v node >/dev/null 2>&1 || die "node not found — install Node.js." +command -v npx >/dev/null 2>&1 || die "npx not found — install Node.js." +command -v git >/dev/null 2>&1 || die "git not found." + +# macOS: GNU timeout is `gtimeout` from coreutils; fall back to perl alarm. 
+TIMEOUT_CMD=() +if command -v gtimeout >/dev/null 2>&1; then + TIMEOUT_CMD=(gtimeout "$TIMEOUT_SEC") +elif command -v timeout >/dev/null 2>&1; then + TIMEOUT_CMD=(timeout "$TIMEOUT_SEC") +elif command -v perl >/dev/null 2>&1; then + TIMEOUT_CMD=(perl -e 'alarm shift; exec @ARGV' "$TIMEOUT_SEC") +else + die "Need gtimeout (brew install coreutils), GNU timeout, or perl for Warden wall-clock limit" +fi + +if [[ -z "${WARDEN_ANTHROPIC_API_KEY:-}" && -z "${ANTHROPIC_API_KEY:-}" ]]; then + if [[ ! -f "$HOME/.pi/agent/auth.json" ]]; then + die "No API key found. Set WARDEN_ANTHROPIC_API_KEY, ANTHROPIC_API_KEY, or run: npx pi && /login" + fi +fi + +node "$VALIDATION" validate "$EXPECTED_JSON" "$SCENARIOS_DIR" + +# --- cleanup tracking --- + +declare -a WORKTREES=() +declare -a BRANCHES=() +declare -a JSON_FILES=() + +cleanup() { + for wt in "${WORKTREES[@]+"${WORKTREES[@]}"}"; do + git -C "$REPO_ROOT" worktree remove --force "$wt" 2>/dev/null || true + done + for b in "${BRANCHES[@]+"${BRANCHES[@]}"}"; do + git -C "$REPO_ROOT" branch -D "$b" 2>/dev/null || true + done + (( ${#JSON_FILES[@]} )) && rm -f "${JSON_FILES[@]}" +} +trap cleanup EXIT + +# --- resolve base commit --- +# Branch from HEAD so the worktree includes the current skill definition. + +BASE=$(git -C "$REPO_ROOT" rev-parse HEAD || die "Cannot resolve HEAD.") +TS=$(date +%s) + +# --- helpers --- + +# Stages the given paths (if any) and commits in a validation worktree with fixed author metadata. +# Usage: git_commit_in_worktree <worktree> <message> [path...] +git_commit_in_worktree() { + local worktree="$1" message="$2" + shift 2 + if (($# > 0)); then + git -C "$worktree" add "$@" + fi + git -C "$worktree" \ + -c user.email="ci@sentry.io" \ + -c user.name="Validation Test" \ + commit --quiet -m "$message" +} + +# Creates a git worktree from $BASE and commits the NOTICES catalog as the Warden +# analysis base — so only fixture changes appear in the diff Warden analyzes. +# Prints the catalog-commit SHA to stdout.
+setup_catalog_base() { + local worktree="$1" branch="$2" + git -C "$REPO_ROOT" worktree add --quiet "$worktree" "$BASE" -b "$branch" + printf '\n' >> "$worktree/THIRD_PARTY_NOTICES.md" + sed "s|validation-tests/scenarios/|${DEST_PACKAGE_PATH}/|g" \ + "$CATALOG" >> "$worktree/THIRD_PARTY_NOTICES.md" + git_commit_in_worktree "$worktree" "test: apply NOTICES catalog [skip ci]" \ + THIRD_PARTY_NOTICES.md + git -C "$worktree" rev-parse HEAD +} + +# Appends the mismatch snippet to THIRD_PARTY_NOTICES.md, stripping the fixture's +# prose header (everything through the first `---` line) so only the NOTICES entry itself lands in the file. +append_mismatch_snippet() { + local worktree="$1" + printf '\n' >> "$worktree/THIRD_PARTY_NOTICES.md" + sed '1,/^---$/d' "$MISMATCH_SNIPPET" >> "$worktree/THIRD_PARTY_NOTICES.md" +} + +# Runs Warden on <base>..HEAD in <worktree> under the wall-clock limit and writes its JSON output to <json_out>; dies on failure or empty output. Usage: run_warden <base> <worktree> <json_out> <label> +run_warden() { + local base="$1" worktree="$2" json_out="$3" label="$4" + echo "Running Warden on ${base:0:7}..HEAD ($label)..." + : > "$json_out" + if ! "${TIMEOUT_CMD[@]}" npx @sentry/warden "${base}..HEAD" \ + --skill check-code-attribution \ + --fail-on off \ + --report-on medium \ + --json \ + -C "$worktree" \ + > "$json_out"; then + if [[ ! -s "$json_out" ]]; then + die "Warden failed for $label with no JSON output (check API key, network, and Warden logs)." + fi + die "Warden exited with an error for $label but left partial JSON in $json_out." # NOTE(review): callers register $json_out in JSON_FILES, which the EXIT trap removes — the path named here will be gone once the script exits + fi + [[ -s "$json_out" ]] || die "Warden succeeded but produced no JSON output for $label." +} + +# --- main worktree: non-isolated scenarios --- +# Isolated .java files are omitted here; they get dedicated worktrees below. + +echo "Creating worktrees from $(git -C "$REPO_ROOT" rev-parse --short "$BASE")..."
+echo "" + +MAIN_WORKTREE=$(mktemp -d) +MAIN_BRANCH="validation-main-${TS}" +MAIN_JSON=$(mktemp) +ROUTING_JSON_FILE=$(mktemp) +echo '{}' > "$ROUTING_JSON_FILE" +WORKTREES+=("$MAIN_WORKTREE") +BRANCHES+=("$MAIN_BRANCH") +JSON_FILES+=("$MAIN_JSON" "$ROUTING_JSON_FILE") + +MAIN_BASE=$(setup_catalog_base "$MAIN_WORKTREE" "$MAIN_BRANCH") + +DEST_DIR="$MAIN_WORKTREE/$DEST_PACKAGE_PATH" +mkdir -p "$DEST_DIR" + +shopt -s nullglob # NOTE(review): no filename glob is expanded anywhere in this script (the file list comes from list-main-java) — confirm this option is still needed +copied=0 +while IFS= read -r java_file; do + cp "$SCENARIOS_DIR/$java_file" "$DEST_DIR/" + copied=$((copied + 1)) +done < <(node "$VALIDATION" list-main-java "$EXPECTED_JSON" "$SCENARIOS_DIR") +echo "Copied ${copied} scenario files → $DEST_PACKAGE_PATH/ (non-isolated batch)" +append_mismatch_snippet "$MAIN_WORKTREE" +git_commit_in_worktree "$MAIN_WORKTREE" \ + "test: add check-code-attribution validation fixtures [skip ci]" \ + "$DEST_PACKAGE_PATH" THIRD_PARTY_NOTICES.md + +run_warden "$MAIN_BASE" "$MAIN_WORKTREE" "$MAIN_JSON" "main" +node "$VALIDATION" routing-set "$ROUTING_JSON_FILE" main "$MAIN_JSON" + +# --- isolated worktrees: one per scenario marked "isolated" in EXPECTED.json --- +# +# Scenarios where Anthropic prompt-cache priming can suppress findings in a concurrent +# batch get their own worktree and Warden run. EXPECTED.json is the single source of +# truth for which scenarios need isolation — add "isolated": true there, not here. +# Java isolates omit the mismatch snippet; the NOTICES mismatch scenario adds it alone.
+ +while IFS=$'\t' read -r id file; do + worktree=$(mktemp -d) + branch="validation-isolated-${TS}-${id//[^a-zA-Z0-9]/-}" + json=$(mktemp) + WORKTREES+=("$worktree") + BRANCHES+=("$branch") + JSON_FILES+=("$json") + + base=$(setup_catalog_base "$worktree" "$branch") + + commit_paths=() + if [[ "$file" == *.java ]]; then + dest_dir="$worktree/$DEST_PACKAGE_PATH" + mkdir -p "$dest_dir" + cp "$SCENARIOS_DIR/$file" "$dest_dir/" + commit_paths=("$DEST_PACKAGE_PATH") + elif [[ "$file" == "THIRD_PARTY_NOTICES.md" ]]; then + append_mismatch_snippet "$worktree" + commit_paths=(THIRD_PARTY_NOTICES.md) + else + die "Unsupported isolated scenario file: $file (id: $id)" + fi + + git_commit_in_worktree "$worktree" "test: isolated fixture for $id [skip ci]" \ + "${commit_paths[@]}" + + echo "" + run_warden "$base" "$worktree" "$json" "$id" + node "$VALIDATION" routing-set "$ROUTING_JSON_FILE" "$id" "$json" + +done < <(node "$VALIDATION" list-isolated "$EXPECTED_JSON") + +echo "" +echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + +# --- assert per-scenario --- +# +# ROUTING_JSON_FILE maps scenario id → Warden JSONL path; non-isolated scenarios use "main". + +node "$VALIDATION" assert "$EXPECTED_JSON" "$DEST_PACKAGE_PATH" "$ROUTING_JSON_FILE" diff --git a/.claude/skills/check-code-attribution/validation-tests/scenarios/HeaderCompleteAndNoticePresent.java b/.claude/skills/check-code-attribution/validation-tests/scenarios/HeaderCompleteAndNoticePresent.java new file mode 100644 index 00000000000..4dca9ad3603 --- /dev/null +++ b/.claude/skills/check-code-attribution/validation-tests/scenarios/HeaderCompleteAndNoticePresent.java @@ -0,0 +1,21 @@ +/* + * Adapted from https://github.com/example + * + * Copyright 2020 Example Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * https://github.com/example/something + */ +package io.sentry.skills.verification; + +public final class HeaderCompleteAndNoticePresent { + + public int sum(int a, int b) { + return a + b; + } +} diff --git a/.claude/skills/check-code-attribution/validation-tests/scenarios/HeaderCompleteButNoticeMissing.java b/.claude/skills/check-code-attribution/validation-tests/scenarios/HeaderCompleteButNoticeMissing.java new file mode 100644 index 00000000000..081d1848300 --- /dev/null +++ b/.claude/skills/check-code-attribution/validation-tests/scenarios/HeaderCompleteButNoticeMissing.java @@ -0,0 +1,17 @@ +/* + * Adapted from https://github.com/example + * + * Copyright 2024 Example Authors + * + * Licensed under the MIT License + * + * https://github.com/example/something + */ +package io.sentry.skills.verification; + +public final class HeaderCompleteButNoticeMissing { + + public boolean ok() { + return true; + } +} diff --git a/.claude/skills/check-code-attribution/validation-tests/scenarios/HeaderFullyStripped.java b/.claude/skills/check-code-attribution/validation-tests/scenarios/HeaderFullyStripped.java new file mode 100644 index 00000000000..6973848c61e --- /dev/null +++ b/.claude/skills/check-code-attribution/validation-tests/scenarios/HeaderFullyStripped.java @@ -0,0 +1,7 @@ +/* Attribution stripped — fixture for check-code-attribution validation only. 
*/ +package io.sentry.skills.verification; + +public final class HeaderFullyStripped { + + public void run() {} +} diff --git a/.claude/skills/check-code-attribution/validation-tests/scenarios/HeaderMissingButNoticePresent.java b/.claude/skills/check-code-attribution/validation-tests/scenarios/HeaderMissingButNoticePresent.java new file mode 100644 index 00000000000..5c4953ea3ad --- /dev/null +++ b/.claude/skills/check-code-attribution/validation-tests/scenarios/HeaderMissingButNoticePresent.java @@ -0,0 +1,8 @@ +package io.sentry.skills.verification; + +public final class HeaderMissingButNoticePresent { + + public int compute(int x) { + return x * 2; + } +} diff --git a/.claude/skills/check-code-attribution/validation-tests/scenarios/HeaderMissingNonEssentialInfo.java b/.claude/skills/check-code-attribution/validation-tests/scenarios/HeaderMissingNonEssentialInfo.java new file mode 100644 index 00000000000..c524a2593a4 --- /dev/null +++ b/.claude/skills/check-code-attribution/validation-tests/scenarios/HeaderMissingNonEssentialInfo.java @@ -0,0 +1,12 @@ +// Adapted from ExampleLib. +// Copyright 2020 Example Corp. +// Licensed under the MIT License. +// https://github.com/example/examplelib +package io.sentry.skills.verification; + +public final class HeaderMissingNonEssentialInfo { + + public int compute(int x) { + return x + 1; + } +} diff --git a/.claude/skills/check-code-attribution/validation-tests/scenarios/HeaderPartiallyStripped.java b/.claude/skills/check-code-attribution/validation-tests/scenarios/HeaderPartiallyStripped.java new file mode 100644 index 00000000000..0389934d94a --- /dev/null +++ b/.claude/skills/check-code-attribution/validation-tests/scenarios/HeaderPartiallyStripped.java @@ -0,0 +1,10 @@ +// Adapted from Example RateLimiter. 
+// https://github.com/example +package io.sentry.skills.verification; + +public final class HeaderPartiallyStripped { + + public synchronized boolean tryAcquire() { + return true; + } +} diff --git a/.claude/skills/check-code-attribution/validation-tests/scenarios/NewLicenseType.java b/.claude/skills/check-code-attribution/validation-tests/scenarios/NewLicenseType.java new file mode 100644 index 00000000000..e148f5a1a4f --- /dev/null +++ b/.claude/skills/check-code-attribution/validation-tests/scenarios/NewLicenseType.java @@ -0,0 +1,10 @@ +// Adapted from ExampleLib. +// Copyright 2020 Example Corp. +// Licensed under the GNU Affero General Public License v3.0. +// https://github.com/example/agpl-lib +package io.sentry.skills.verification; + +public final class NewLicenseType { + + public void run() {} +} diff --git a/.claude/skills/check-code-attribution/validation-tests/scenarios/THIRD_PARTY_NOTICES.mismatch-snippet.md b/.claude/skills/check-code-attribution/validation-tests/scenarios/THIRD_PARTY_NOTICES.mismatch-snippet.md new file mode 100644 index 00000000000..5a9b87285df --- /dev/null +++ b/.claude/skills/check-code-attribution/validation-tests/scenarios/THIRD_PARTY_NOTICES.mismatch-snippet.md @@ -0,0 +1,37 @@ +# Snippet fixture — MismatchLib entry for the isolated mismatch worktree. +# header-vs-notice-mismatch: copyright in metadata field does not match embedded license text. + +--- + +## Example — MismatchLib (MIT) + +**Source:** https://github.com/example/mismatch
+**License:** MIT License
+**Copyright:** Copyright (c) 2020 Wrong Holder + +### Scope + +Validation sample only. The code resides in `io.sentry.skills.verification.MismatchLib`. + +``` +MIT License + +Copyright (c) 2016 Correct Holder + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +``` diff --git a/.gitignore b/.gitignore index a7899736a86..f252087a5ab 100644 --- a/.gitignore +++ b/.gitignore @@ -37,3 +37,6 @@ spy.log # Auto-generated by dotagents — do not commit these files. agents.lock .agents/.gitignore + +# Warden local run logs +.warden/logs/ diff --git a/AGENTS.md b/AGENTS.md index 1784e4f950e..e6e49477d6a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -154,6 +154,8 @@ When adapting code from third-party libraries: ``` 2. Add a full attribution entry to `THIRD_PARTY_NOTICES.md` following the existing format (Source, License, Copyright, Scope, full license text) +3. 
Run the `check-code-attribution` skill locally or wait for it to be auto-run against your PR to check for required fields and verify new licenses against [Sentry's Open Source Legal Policy](https://open.sentry.io/licensing/). The skill definition lives at `.claude/skills/check-code-attribution/SKILL.md` (registered in `agents.toml`; `.agents/skills/` is a symlink to `.claude/skills/`). + ### Getting PR Information Use `gh pr view` to get PR details from the current branch. This is needed when adding changelog entries, which require the PR number. diff --git a/agents.toml b/agents.toml index b4c9e091b70..d9770ee7df5 100644 --- a/agents.toml +++ b/agents.toml @@ -35,3 +35,7 @@ source = "path:.agents/skills/test" [[skills]] name = "btrace-perfetto" source = "path:.agents/skills/btrace-perfetto" + +[[skills]] +name = "check-code-attribution" +source = "path:.agents/skills/check-code-attribution" diff --git a/warden.toml b/warden.toml new file mode 100644 index 00000000000..7bb2865b747 --- /dev/null +++ b/warden.toml @@ -0,0 +1,100 @@ +version = 1 + +[defaults] +model = "anthropic/claude-sonnet-4-6" + +# Warden's schema does not support per-skill verification config; this is the only +# placement available. Disabled for attribution policy checks: a second verifier +# pass often rejects valid header/NOTICES mismatches (e.g. "NOTICES still documents it"). +[defaults.verification] +enabled = false + +# Warden's schema does not support per-skill chunking config; these patterns apply +# globally but are tuned for check-code-attribution. Attribution checks need the full +# file header and a NOTICES cross-check — not isolated diff hunks. 
+[[defaults.chunking.filePatterns]] +pattern = "**/*.api" +mode = "skip" + +[[defaults.chunking.filePatterns]] +pattern = "**/gradlew" +mode = "skip" + +[[defaults.chunking.filePatterns]] +pattern = "**/gradlew.bat" +mode = "skip" + +[[defaults.chunking.filePatterns]] +pattern = "**/*.java" +mode = "whole-file" + +[[defaults.chunking.filePatterns]] +pattern = "**/*.kt" +mode = "whole-file" + +[[defaults.chunking.filePatterns]] +pattern = "**/*.kts" +mode = "whole-file" + +[[defaults.chunking.filePatterns]] +pattern = "THIRD_PARTY_NOTICES.md" +mode = "whole-file" + +# Coalesce hunks aggressively for any remaining per-hunk files +[defaults.chunking.coalesce] +enabled = true +maxGapLines = 100 +maxChunkSize = 16000 + +[[skills]] +name = "check-code-attribution" +maxTurns = 30 +# Phase 1: report only — Warden comments on PRs but does not block merges. +# Tighten to failOn = "medium" / requestChanges = true once the false-positive baseline is established. +failOn = "off" +reportOn = "medium" +ignorePaths = [ + # Infrastructure directories + ".agents/**", + ".claude/**", + ".cursor/**", + ".github/**", + ".gradle/**", + ".idea/**", + ".mvn/**", + "gradle/**", + # Generated files + "**/*.aidl", + "**/*.api", + "**/*.g.kt", + "**/*.interp", + "**/*.pb.java", + "**/*.tokens", + "**/databinding/*Binding.java", + "**/generated/**", + "**/gradlew", + "**/gradlew.bat", + "**/grpc/*Grpc.java", + "**/ksp/**", + "**/mvnw", + "**/mvnw.cmd", + # Binary files + "**/*.jar", + # Repo docs (attribution examples in prose, not vendored code) + "AGENTS.md", + "CHANGELOG.md", + "CLAUDE.md", + "**/README.md", + # Warden infrastructure + ".warden/**", + "warden.toml", +] + +[[skills.triggers]] +type = "pull_request" +actions = ["opened", "synchronize"] +requestChanges = false +failCheck = false + +[[skills.triggers]] +type = "local"