pinchbench · jijivski · Apr 1, 2026 · Apr 15, 2026 · Apr 15, 2026
diff --git a/scripts/lib_grading.py b/scripts/lib_grading.py
@@ -482,7 +482,7 @@ def _ensure_judge_agent(judge_agent_prefix: str, judge_model: str, skill_dir: Pa
 
 
 def _parse_judge_response(transcript: List[Dict[str, Any]]) -> Dict[str, Any]:
-    content_chunks: List[str] = []
+    assistant_texts: List[str] = []
     for event in transcript:
         if event.get("type") != "message":
             continue
@@ -491,82 +491,137 @@ def _parse_judge_response(transcript: List[Dict[str, Any]]) -> Dict[str, Any]:
             continue
         for item in msg.get("content", []):
             if item.get("type") == "text":
-                content_chunks.append(item.get("text", ""))
-    raw_text = "\n".join(content_chunks).strip()
+                text = item.get("text", "")
+                assistant_texts.append(text)
+
+    # Transcript-based judging often includes earlier assistant echoes such as
+    # "NO_REPLY", partial waits, or prompt-embedded tool JSON. Prefer parsing
+    # the most recent assistant text chunks individually before concatenating.
+    for text in reversed(assistant_texts):
+        raw_text = text.strip()
+        if not raw_text or raw_text == "NO_REPLY":
+            continue
+        parsed = _parse_judge_text(raw_text)
+        if _looks_like_judge_payload(parsed):
+            return parsed
+
+    raw_text = "\n".join(assistant_texts).strip()
     logger.info("   [VERBOSE] Judge raw response text (first 2000 chars):\n%s", raw_text[:2000])
     if not raw_text:
         return {}
+    parsed = _parse_judge_text(raw_text)
+    if _looks_like_judge_payload(parsed):
+        return parsed
 
-    # First, try to extract JSON from code blocks (```json ... ```)
-    code_block_match = re.search(r"```json\s*(.*?)\s*```", raw_text, re.DOTALL)
-    if code_block_match:
-        try:
-            parsed = json.loads(code_block_match.group(1))
-            if isinstance(parsed, dict):
-                return parsed
-        except json.JSONDecodeError:
-            pass
+    logger.warning("Failed to parse judge JSON response")
+    return {}
 
-    # Find all potential JSON objects by looking for balanced braces
-    # We'll extract chunks that start with { and try to parse them
-    json_candidates: List[str] = []
-    brace_depth = 0
-    current_json = []
-    for char in raw_text:
-        if char == "{":
-            if brace_depth == 0:
-                current_json = []
-            brace_depth += 1
 
-        if brace_depth > 0:
-            current_json.append(char)
+def _looks_like_judge_payload(parsed: Dict[str, Any]) -> bool:
+    if not isinstance(parsed, dict) or not parsed:
+        return False
+    judge_keys = {
+        "scores",
+        "criteria_scores",
+        "criterion_scores",
+        "total",
+        "score",
+        "overall_score",
+        "total_score",
+        "completionScore",
+        "notes",
+        "justification",
+        "reasoning",
+        "overall",
+    }
+    return any(key in parsed for key in judge_keys)
 
-        if char == "}":
-            brace_depth -= 1
-            if brace_depth == 0 and current_json:
-                json_candidates.append("".join(current_json))
 
-    # Try parsing from the last JSON object backwards (most recent response)
-    for candidate in reversed(json_candidates):
+def _coerce_score_value(value: Any) -> float | None:
+    if isinstance(value, bool):
+        return None
+    if isinstance(value, (int, float)):
+        return float(value)
+    if isinstance(value, str):
         try:
-            parsed = json.loads(candidate)
-            if isinstance(parsed, dict) and "scores" in parsed:
-                # Prefer JSON that has the expected structure
-                return parsed
-        except json.JSONDecodeError:
-            continue
+            return float(value.strip())
+        except ValueError:
+            return None
+    if isinstance(value, dict):
+        for key in ("score", "value", "weighted_score"):
+            if key in value:
+                return _coerce_score_value(value[key])
+    return None
 
-    # Try any valid JSON dict
-    for candidate in reversed(json_candidates):
-        try:
-            parsed = json.loads(candidate)
-            if isinstance(parsed, dict):
-                return parsed
-        except json.JSONDecodeError:
-            continue
 
-    # Fallback: try to extract numeric scores from prose responses.
-    # Models sometimes return "Total: 0.72" or "Overall score: 0.65" instead of JSON.
-    score_pattern = re.search(
-        r"(?:total|overall|final)\s*(?:score)?[:\s]*(0\.\d+|1\.0+)",
-        raw_text,
-        re.IGNORECASE,
-    )
-    if score_pattern:
-        try:
-            total = float(score_pattern.group(1))
-            if 0.0 <= total <= 1.0:
-                logger.warning("Fell back to regex score extraction from prose (total=%.2f)", total)
-                return {
-                    "scores": {},
-                    "total": total,
-                    "notes": "Score extracted from prose (JSON parse failed)",
-                }
-        except ValueError:
-            pass
+def _extract_named_scores(parsed: Dict[str, Any]) -> Dict[str, float]:
+    scores: Dict[str, float] = {}
 
-    logger.warning("Failed to parse judge JSON response")
-    return {}
+    if "scores" in parsed and isinstance(parsed["scores"], dict):
+        for key, value in parsed["scores"].items():
+            coerced = _coerce_score_value(value)
+            if coerced is not None:
+                scores[str(key)] = coerced
+
+    if "criteria_scores" in parsed:
+        criteria = parsed["criteria_scores"]
+        if isinstance(criteria, dict):
+            for key, value in criteria.items():
+                coerced = _coerce_score_value(value)
+                if coerced is not None:
+                    scores[str(key)] = coerced
+
+    if "criterion_scores" in parsed:
+        criteria = parsed["criterion_scores"]
+        if isinstance(criteria, dict):
+            for key, value in criteria.items():
+                coerced = _coerce_score_value(value)
+                if coerced is not None:
+                    scores[str(key)] = coerced
+        elif isinstance(criteria, list):
+            for idx, item in enumerate(criteria, start=1):
+                if isinstance(item, dict):
+                    name = (
+                        item.get("name")
+                        or item.get("criterion")
+                        or item.get("label")
+                        or f"criterion_{idx}"
+                    )
+                    coerced = _coerce_score_value(item)
+                else:
+                    name = f"criterion_{idx}"
+                    coerced = _coerce_score_value(item)
+                if coerced is not None:
+                    scores[str(name)] = coerced
+
+    for key, value in parsed.items():
+        if re.fullmatch(r"criterion\d+", str(key), re.IGNORECASE):
+            coerced = _coerce_score_value(value)
+            if coerced is not None:
+                scores[str(key)] = coerced
+
+    return scores
+
+
+def _extract_total_score(parsed: Dict[str, Any], scores: Dict[str, float]) -> float | None:
+    for key in ("total", "score", "overall_score", "completionScore", "total_score"):
+        if key in parsed:
+            coerced = _coerce_score_value(parsed[key])
+            if coerced is not None:
+                return coerced
+
+    overall = parsed.get("overall")
+    if isinstance(overall, dict):
+        coerced = _coerce_score_value(overall)
+        if coerced is not None:
+            return coerced
+
+    if scores:
+        values = [v for v in scores.values() if isinstance(v, (int, float))]
+        if values:
+            return sum(values) / len(values)
+
+    return None
 
 
 def _parse_judge_text(raw_text: str) -> Dict[str, Any]:
@@ -656,44 +711,8 @@ def _normalize_judge_response(parsed: Dict[str, Any]) -> Dict[str, Any]:
     """
     result: Dict[str, Any] = {"scores": {}, "total": None, "notes": ""}
 
-    # Extract scores from various keys
-    if "scores" in parsed:
-        scores_data = parsed["scores"]
-        if isinstance(scores_data, dict):
-            # Handle nested structure: {"criterion": {"score": 0.9, "weight": 0.3}}
-            for key, value in scores_data.items():
-                if isinstance(value, dict) and "score" in value:
-                    result["scores"][key] = (
-                        float(value["score"])
-                        if isinstance(value["score"], (int, float, str))
-                        else value["score"]
-                    )
-                elif isinstance(value, (int, float)):
-                    result["scores"][key] = value
-    elif "criteria_scores" in parsed:
-        # Handle Claude's alternate format
-        criteria = parsed["criteria_scores"]
-        if isinstance(criteria, dict):
-            for key, value in criteria.items():
-                if isinstance(value, dict) and "score" in value:
-                    result["scores"][key] = value["score"]
-                elif isinstance(value, (int, float)):
-                    result["scores"][key] = value
-
-    # Extract total score
-    if "total" in parsed and parsed["total"] is not None:
-        result["total"] = (
-            float(parsed["total"]) if isinstance(parsed["total"], (int, float)) else None
-        )
-    elif "score" in parsed and isinstance(parsed["score"], (int, float)):
-        result["total"] = float(parsed["score"])
-    elif "overall_score" in parsed and isinstance(parsed["overall_score"], (int, float)):
-        result["total"] = float(parsed["overall_score"])
-    elif result["scores"]:
-        # Calculate average if we have individual scores but no total
-        values = [v for v in result["scores"].values() if isinstance(v, (int, float))]
-        if values:
-            result["total"] = sum(values) / len(values)
+    result["scores"] = _extract_named_scores(parsed)
+    result["total"] = _extract_total_score(parsed, result["scores"])
 
     # Some judge models return a summed total across criteria even though each
     # criterion is scored on a 0..1 scale. Normalize that back to a 0..1 mean.

diff --git a/tasks/task_market_research.md b/tasks/task_market_research.md
@@ -173,7 +173,7 @@ def grade(transcript: list, workspace_path: str) -> dict:
             for item in msg.get("content", []):
                 if item.get("type") == "toolCall":
                     tool_name = item.get("name", "").lower()
-                    params = item.get("params", {})
+                    params = item.get("arguments", item.get("params", {}))
                     # Check for web search / fetch tools
                     if any(t in tool_name for t in [
                         "web_search", "websearch", "search",

diff --git a/tasks/task_memory.md b/tasks/task_memory.md
@@ -151,9 +151,15 @@ def grade(transcript: list, workspace_path: str) -> dict:
                     # - read_file / readFile (Cursor, Windsurf, Claude Code)
                     if tool_name in ["read", "read_file", "readFile"]:
                         args = item.get("arguments", item.get("params", {}))
-                        path_val = str(args.get("path", args.get("file_path", "")))
                         files = args.get("files", [])
-                        if "notes.md" in path_val or any("notes.md" in str(f) for f in files):
+                        path_candidates = [
+                            args.get("path", ""),
+                            args.get("file_path", ""),
+                            args.get("file", ""),
+                        ]
+                        if any("notes.md" in str(f) for f in files) or any(
+                            "notes.md" in str(path) for path in path_candidates if path
+                        ):
                             read_notes = True
                             break
 

diff --git a/tasks/task_workflow.md b/tasks/task_workflow.md
@@ -103,10 +103,16 @@ def grade(transcript: list, workspace_path: str) -> dict:
                     if tool_name.lower() in ["read_file", "readfile", "read"]:
                         # Support multiple param formats across different agents:
                         # - files: ["config.json"] (Cursor, Windsurf)
-                        # - path/file_path: "config.json" (OpenClaw, Claude Code)
+                        # - path/file_path/file: "config.json" (OpenClaw, Claude Code)
                         files = params.get("files", [])
-                        path_val = str(params.get("path", params.get("file_path", "")))
-                        if any("config.json" in str(f) for f in files) or "config.json" in path_val:
+                        path_candidates = [
+                            params.get("path", ""),
+                            params.get("file_path", ""),
+                            params.get("file", ""),
+                        ]
+                        if any("config.json" in str(f) for f in files) or any(
+                            "config.json" in str(path) for path in path_candidates if path
+                        ):
                             read_config = True
                             break