microsoft · JacksonWeber · May 23, 2026 · May 23, 2026
@@ -0,0 +1,123 @@
+# Benchmarks the PR build against its base revision and reports the
+# difference on the pull request.
+name: Performance
+
+on:
+  pull_request:
+  workflow_dispatch:
+
+# pull-requests: write is what lets the sticky-comment step post the report;
+# the rest of the job only reads the repository.
+permissions:
+  contents: read
+  pull-requests: write
+
+# One group per PR number (or per ref when there is no pull_request payload,
+# e.g. a manual dispatch); a newer run cancels the one still in progress.
+concurrency:
+  group: perf-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  benchmark:
+    name: Benchmark distro overhead
+    runs-on: ubuntu-latest
+    timeout-minutes: 40
+    env:
+      # Maximum tolerated slowdown in percent; passed to perf.compare as
+      # --threshold and echoed by the final failure step.
+      PERF_REGRESSION_THRESHOLD: "15"
+
+    steps:
+      # The two revisions are checked out side by side: pr/ holds the
+      # candidate and base/ holds the baseline.
+      - name: Check out PR branch
+        uses: actions/checkout@v4
+        with:
+          path: pr
+          fetch-depth: 0
+
+      - name: Check out base branch
+        uses: actions/checkout@v4
+        with:
+          # Without a pull_request payload the base SHA is empty, so the ref
+          # falls back to the repository's default branch.
+          ref: ${{ github.event.pull_request.base.sha || github.event.repository.default_branch }}
+          path: base
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      # ------------------------------------------------------------------
+      # Candidate (PR) run
+      # ------------------------------------------------------------------
+      # Installs the distro from the pr/ checkout plus the benchmark tooling.
+      - name: Install candidate (PR) distro
+        working-directory: pr
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install .
+          python -m pip install "pytest>=8.0" "pytest-benchmark>=4.0"
+
+      # pyproject.toml sets `addopts = "--benchmark-skip"`, and pytest-benchmark
+      # aborts with a usage error when --benchmark-skip and --benchmark-only
+      # are both present. `-o addopts=""` clears the configured default so the
+      # perf run can opt in.
+      - name: Run candidate benchmarks
+        working-directory: pr
+        run: |
+          python -m pytest -o addopts="" tests/perf \
+            --benchmark-only \
+            --benchmark-min-rounds=5 \
+            --benchmark-warmup=on \
+            --benchmark-json="$GITHUB_WORKSPACE/pr.json"
+
+      # ------------------------------------------------------------------
+      # Baseline (base branch) run.
+      # We always invoke the PR copy of the perf tests (tests/perf may not
+      # exist on base) but install the *base* distro into the same env first
+      # so the `microsoft.opentelemetry` import resolves to baseline code.
+      # ------------------------------------------------------------------
+      - name: Install baseline distro
+        working-directory: base
+        run: |
+          python -m pip install --force-reinstall --no-deps .
+          python -m pip install .
+          python -m pip install "pytest>=8.0" "pytest-benchmark>=4.0"
+
+      # Runs from pr/ on purpose (see the note above), so the PR's
+      # pyproject.toml applies here too and addopts must be cleared again.
+      # NOTE(review): `python -m pytest` puts the working directory (pr/) at
+      # the front of sys.path. If the package sources sit directly under pr/
+      # (flat layout), the import would pick up PR code rather than the
+      # installed baseline — confirm the repo layout rules this out.
+      - name: Run baseline benchmarks
+        working-directory: pr
+        run: |
+          python -m pytest -o addopts="" tests/perf \
+            --benchmark-only \
+            --benchmark-min-rounds=5 \
+            --benchmark-warmup=on \
+            --benchmark-json="$GITHUB_WORKSPACE/base.json"
+
+      # ------------------------------------------------------------------
+      # Compare + report
+      # ------------------------------------------------------------------
+      # The comparison's exit status is recorded as a step output instead of
+      # failing this step, so the job is failed by the final step only after
+      # the artifacts and the PR comment have been published.
+      - name: Compare results
+        id: compare
+        working-directory: pr
+        run: |
+          set +e
+          python -m perf.compare \
+            --baseline "$GITHUB_WORKSPACE/base.json" \
+            --candidate "$GITHUB_WORKSPACE/pr.json" \
+            --threshold "$PERF_REGRESSION_THRESHOLD" \
+            --output "$GITHUB_WORKSPACE/report.md"
+          echo "exit_code=$?" >> "$GITHUB_OUTPUT"
+
+      - name: Upload artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: perf-results
+          path: |
+            pr.json
+            base.json
+            report.md
+
+      - name: Post sticky PR comment
+        # Skip for PRs from forks — the default GITHUB_TOKEN has read-only
+        # access in that case and the comment API will reject the call. The
+        # JSON artifacts and the workflow log still contain the comparison.
+        if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository
+        continue-on-error: true
+        uses: marocchino/sticky-pull-request-comment@v2
+        with:
+          header: perf-comparison
+          path: report.md
+
+      # Any non-zero status from perf.compare lands here, including crashes
+      # and argument errors, so the message reports the exit code rather
+      # than asserting that a regression was measured.
+      - name: Fail on regression
+        if: steps.compare.outputs.exit_code != '0'
+        env:
+          COMPARE_EXIT_CODE: ${{ steps.compare.outputs.exit_code }}
+        run: |
+          echo "::error::perf.compare exited with code ${COMPARE_EXIT_CODE}: a gating perf scenario regressed beyond ${PERF_REGRESSION_THRESHOLD}% or the comparison itself failed (see the 'Compare results' step log)"
+          exit 1
@@ -1,6 +1,7 @@
 pytest>=8.0
 pytest-asyncio>=0.23.0
 pytest-cov>=5.0
+pytest-benchmark>=4.0
 black>=24.0
 microsoft-agents-activity
 microsoft-agents-hosting-core
@@ -0,0 +1,51 @@
+# Performance benchmarks
+
+This directory contains the regression-gating glue used by the `performance`
+CI workflow. The benchmarks themselves live in
+[`tests/perf/test_overhead.py`](../tests/perf/test_overhead.py) and are
+driven by [`pytest-benchmark`](https://pytest-benchmark.readthedocs.io/).
+
+## Scenarios
+
+| Test | Gating | What it measures |
+| --- | --- | --- |
+| `test_azure_monitor_span` | yes | `configure_azure_monitor` + `tracer.start_as_current_span` |
+| `test_azure_monitor_log`  | yes | `configure_azure_monitor` + `logger.info` |
+| `test_otel_span`          | no  | Plain `opentelemetry-sdk` `TracerProvider` reference |
+| `test_otel_log`           | no  | Plain `opentelemetry-sdk` `LoggerProvider` reference |
+
+Non-gating scenarios are informational only — they show how much overhead
+the distro adds on top of upstream and never fail CI.
+
+The gating flag is attached to each benchmark via
+`benchmark.extra_info["gating"]` so `perf/compare.py` can pick it up out of
+the pytest-benchmark JSON.
+
+## Running locally
+
+From the repo root, with the distro and dev deps installed
+(`pip install -e . && pip install -r dev_requirements.txt`):
+
+```bash
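+# Run the benchmarks and save the result. `-o addopts=""` clears the
+# `--benchmark-skip` default from pyproject.toml, which pytest-benchmark
+# refuses to combine with `--benchmark-only`.
+pytest -o addopts="" tests/perf --benchmark-only --benchmark-json=pr.json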
+
+# Compare against a previously-saved baseline (e.g. from main).
+python -m perf.compare --baseline base.json --candidate pr.json --threshold 15
+```
+
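+`--benchmark-skip` is applied by default through `addopts` in
+`pyproject.toml`, so a normal `pytest` invocation will *skip* the perf tests
+entirely. pytest-benchmark rejects `--benchmark-skip` combined with
+`--benchmark-only`, so opt in with `pytest -o addopts="" --benchmark-only`.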
+
+## CI
+
+`.github/workflows/performance.yml` runs the suite on every pull request:
+
+1. Install the PR branch, run `pytest tests/perf --benchmark-only
+   --benchmark-json=pr.json`.
+2. Check out the base branch, install it, repeat → `base.json`.
+3. `perf/compare.py` produces a markdown report and exits non-zero if any
+   *gating* scenario regresses by more than `PERF_REGRESSION_THRESHOLD`
+   percent (default 15).
+4. The report is posted as a sticky PR comment.
@@ -0,0 +1,125 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License in the project root for
+# license information.
+# --------------------------------------------------------------------------
+"""Compare two pytest-benchmark JSON files and gate on regression.
+
+Reads the JSON produced by ``pytest --benchmark-json=...`` for the base
+branch and the PR branch. A scenario "regresses" when its candidate median
+operation time is greater than the baseline median by more than
+``--threshold`` percent (i.e. slower).
+
+Emits a markdown comparison table on stdout (suitable for posting as a
+sticky PR comment). Exits with status ``1`` if any *gating* scenario
+regresses past the threshold; non-gating scenarios are reported but never
+fail the build.
+
+A scenario is treated as gating when its ``extra_info.gating`` field is
+``true`` in either file.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from typing import Any, Dict, Optional
+
+
+def _load_benchmarks(path: str) -> Dict[str, Dict[str, Any]]:
+    """Index the entries of a pytest-benchmark JSON file by scenario name.
+
+    The scenario name is the entry's ``name`` with a leading ``test_``
+    removed, so report rows match the names used in PR comments. Entries
+    without a ``name`` are keyed by the empty string, and a later entry
+    replaces an earlier one that maps to the same key.
+    """
+    prefix = "test_"
+    with open(path, "r", encoding="utf-8") as handle:
+        payload = json.load(handle)
+
+    by_scenario: Dict[str, Dict[str, Any]] = {}
+    for record in payload.get("benchmarks", []):
+        scenario = record.get("name", "")
+        key = scenario[len(prefix) :] if scenario.startswith(prefix) else scenario
+        by_scenario[key] = record
+    return by_scenario
+
+
+def _stats_seconds(entry: Optional[Dict[str, Any]]) -> Optional[float]:
+    """Return ``entry["stats"]["median"]``, or None when it is unavailable.
+
+    None is returned for a missing or empty entry and for an entry that has
+    no ``stats`` mapping or no ``median`` inside it.
+    """
+    return entry.get("stats", {}).get("median") if entry else None
+
+
+def _ops_per_sec(seconds: Optional[float]) -> float:
+    """Convert a per-operation time in seconds to operations per second.
+
+    Returns NaN when the time is missing, zero or negative.
+    """
+    if seconds and not seconds <= 0:
+        return 1.0 / seconds
+    return float("nan")
+
+
+def _pct_slower(base_s: float, cand_s: float) -> float:
+    """Return how much slower the candidate is than the baseline, in percent.
+
+    A positive result means the candidate takes longer (a regression); a
+    negative result means it is faster. A zero or negative baseline yields
+    0.0 because no ratio can be formed from it.
+    """
+    if base_s <= 0:
+        return 0.0
+    extra_time = cand_s - base_s
+    return extra_time / base_s * 100.0
+
+
+def _is_gating(*entries: Optional[Dict[str, Any]]) -> bool:
+    """Return True when any given benchmark entry has a truthy ``extra_info.gating``."""
+    return any(bool((entry.get("extra_info") or {}).get("gating", False)) for entry in entries if entry)
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Compare two pytest-benchmark JSON files and report regressions.
+
+    Builds a markdown table with one row per scenario found in either file,
+    writes it to stdout and, when ``--output`` is given, to that path too.
+
+    Args:
+        argv: Command-line arguments; ``None`` lets argparse read ``sys.argv``.
+
+    Returns:
+        ``1`` when at least one gating scenario is slower than the baseline
+        by more than ``--threshold`` percent, otherwise ``0``.
+
+    Errors from reading or parsing the input files are not caught here, and
+    argparse exits the process itself on invalid arguments.
+    """
+    parser = argparse.ArgumentParser(description="Compare two pytest-benchmark JSON files and gate on regression.")
+    parser.add_argument("--baseline", required=True, help="pytest-benchmark JSON for the base branch")
+    parser.add_argument("--candidate", required=True, help="pytest-benchmark JSON for the PR branch")
+    parser.add_argument("--threshold", type=float, default=15.0, help="Max allowed regression %% (default 15)")
+    parser.add_argument("--output", help="Write markdown report to this path in addition to stdout")
+    args = parser.parse_args(argv)
+
+    base = _load_benchmarks(args.baseline)
+    cand = _load_benchmarks(args.candidate)
+    all_names = sorted(set(base) | set(cand))
+
+    lines: list[str] = []
+    lines.append("### Performance comparison")
+    lines.append("")
+    lines.append(
+        f"Threshold: regressions >{args.threshold:.1f}% on gating scenarios fail the build. "
+        "Higher ops/s is better; positive Δ means the PR is slower."
+    )
+    lines.append("")
+    lines.append("| Scenario | Gating | Baseline (ops/s) | Candidate (ops/s) | Δ % | Status |")
+    lines.append("| --- | --- | ---: | ---: | ---: | :---: |")
+
+    any_regression = False
+    for name in all_names:
+        b = base.get(name)
+        c = cand.get(name)
+        # Gating if flagged in EITHER file, as the module docstring states;
+        # a candidate that drops the flag must not un-gate the scenario.
+        gating = _is_gating(b, c)
+        b_sec = _stats_seconds(b)
+        c_sec = _stats_seconds(c)
+        b_ops = _ops_per_sec(b_sec)
+        c_ops = _ops_per_sec(c_sec)
+        if b_sec is not None and c_sec is not None:
+            delta = _pct_slower(b_sec, c_sec)
+            regressed = gating and delta > args.threshold
+            # ❌ fails the build; ⚠️ marks a non-gating scenario past the threshold.
+            status = "❌" if regressed else ("⚠️" if delta > args.threshold else "✅")
+            if regressed:
+                any_regression = True
+            lines.append(
+                f"| `{name}` | {'yes' if gating else 'no'} | {b_ops:,.1f} | {c_ops:,.1f} | "
+                f"{delta:+.2f}% | {status} |"
+            )
+        else:
+            # Present in only one file (or no median recorded): nothing to compare.
+            lines.append(
+                f"| `{name}` | {'yes' if gating else 'no'} | "
+                f"{('—' if b_sec is None else f'{b_ops:,.1f}')} | "
+                f"{('—' if c_sec is None else f'{c_ops:,.1f}')} | — | ➖ |"
+            )
+
+    report = "\n".join(lines) + "\n"
+    sys.stdout.write(report)
+    if args.output:
+        with open(args.output, "w", encoding="utf-8") as f:
+            f.write(report)
+
+    if any_regression:
+        sys.stderr.write(f"\n[compare] FAIL: one or more gating scenarios regressed > {args.threshold:.1f}%\n")
+        return 1
+    return 0
+
+
+# Script entry point (e.g. ``python -m perf.compare``): main()'s return
+# value becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
@@ -135,3 +135,6 @@ disable = [
 
 [tool.pytest.ini_options]
 testpaths = ["tests"]
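+# Skip pytest-benchmark tests by default. pytest-benchmark rejects
+# --benchmark-skip combined with --benchmark-only, so perf runs must clear
+# this option first: pytest -o addopts="" --benchmark-only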
+addopts = "--benchmark-skip"