ServiceNow · katstankiewicz · May 20, 2026 · May 20, 2026 · May 20, 2026
diff --git a/docs/assets/index-Bc8XlX7Z.css b/docs/assets/index-Bc8XlX7Z.css
diff --git a/docs/assets/index-Bq1YCEGN.css b/docs/assets/index-Bq1YCEGN.css
diff --git a/docs/assets/index-Da27i3Q-.js b/docs/assets/index-DoEF8HaY.js
diff --git a/docs/index.html b/docs/index.html
@@ -9,8 +9,8 @@
     <link rel="preconnect" href="https://fonts.googleapis.com" />
     <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
     <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet" />
-    <script type="module" crossorigin src="./assets/index-Da27i3Q-.js"></script>
-    <link rel="stylesheet" crossorigin href="./assets/index-Bq1YCEGN.css">
+    <script type="module" crossorigin src="./assets/index-DoEF8HaY.js"></script>
+    <link rel="stylesheet" crossorigin href="./assets/index-Bc8XlX7Z.css">
   </head>
   <body>
     <div id="root"></div>

diff --git a/website/src/components/leaderboard/LeaderboardSection.tsx b/website/src/components/leaderboard/LeaderboardSection.tsx
@@ -19,31 +19,26 @@ const paretoInsights = [
   {
     title: 'No system clears 0.5 on both axes pass@1',
     description:
-      'Across 12 systems spanning all three architectures, no system simultaneously exceeds 0.5 on both EVA-A pass@1 and EVA-X pass@1 — joint accuracy–experience quality remains far from saturated.',
+      'Across 16 systems spanning all three architectures, no system simultaneously exceeds 0.5 on both EVA-A pass@1 and EVA-X pass@1 — joint accuracy–experience quality remains far from saturated.',
   },
   {
     title: 'Peak and reliable performance diverge',
     description:
       'Peak (pass@k) and reliable (pass^k) performance diverge substantially: the median pass@k–pass^k gap is 0.44 on EVA-A and 0.24 on EVA-X, indicating single-trial scores systematically overstate deployment-grade reliability.',
   },
   {
-    title: 'S2S systems anchor every Pareto frontier',
+    title: 'Architecture and SDK implementation both shape results',
     description:
-      'Speech-to-speech systems consistently sit on the Pareto frontier in every domain — their experience-side lead (driven by turn-taking and response latency) makes them uniformly Pareto-efficient relative to cascades.',
+      'The Pareto frontier spans both S2S and cascade architectures. Cascade results vary significantly depending on the SDK implementation used, with some cascade configurations achieving turn-taking scores competitive with S2S models. This suggests that integration choices can matter as much as the underlying models.',
   },
 ];
 
 // Supporting bullets drawn from §4.3 Robustness and §4.4 Failure Mode Analysis.
 const keyInsights = [
-  {
-    title: 'Architectures diverge on experience',
-    description:
-      'While the best cascade and S2S systems achieve comparable accuracy, experience quality diverges sharply along architecture lines, with the S2S–cascade gap on EVA-X driven almost entirely by turn-taking.',
-  },
   {
     title: 'Cascade accuracy–experience trade-off',
     description:
-      'Among cascade systems we observe a consistent accuracy–experience trade-off: the three highest-accuracy cascades have mean tool-call latencies above 5 s, while faster cascades trade accuracy for lower latency. No cascade exceeds 0.25 on both dimensions.',
+      'Among cascade systems we observe a consistent accuracy–experience trade-off: higher-accuracy cascades tend to have higher tool-call latencies, while faster cascades trade accuracy for lower latency.',
   },
   {
     title: 'Asymmetric degradation under perturbation',
@@ -53,7 +48,7 @@ const keyInsights = [
   {
     title: 'Named-entity transcription bottlenecks cascades',
     description:
-      'Across seven cascade systems, mean key-entity transcription accuracy is strongly correlated with mean task completion (Pearson r = 0.93, p = 0.002). Cascades below 70% key-entity transcription accuracy show task completion 39% lower than those above it.',
+      'Across nine cascade systems, mean key-entity transcription accuracy is strongly correlated with mean task completion. Cascades below 70% key-entity transcription accuracy show substantially lower task completion than those above it.',
   },
   {
     title: 'Faithfulness is decoupled from task completion',
@@ -65,6 +60,11 @@ const keyInsights = [
     description:
       'Entity errors — letter substitutions, digit omissions, spurious insertions, and phonetic confusions — are the dominant speech-fidelity failure mode. Even 1% per-turn fail rates compound over multi-turn interactions when the caller cannot detect the error from context.',
   },
+  {
+    title: 'Low-latency cascades close the experience gap',
+    description:
+      'Cascade systems built with low-latency models can outperform S2S models on experience. The fastest cascade system achieves the highest EVA-X pass@1 (0.82) of any system, with turn-taking (0.88) surpassing all S2S models — suggesting that latency, not architecture, is the primary driver of experience quality.',
+  },
 ];
 
 const DOMAIN_TABS: DomainOrPooled[] = ['pooled', 'airline', 'itsm', 'medical_hr'];