From b15a7dbf9ab8b5fceb8276b4dd6e99021ff01642 Mon Sep 17 00:00:00 2001
From: Shudipto Trafder <shudiptotrafder@gmail.com>
Date: Wed, 13 May 2026 22:57:27 +0600
Subject: [PATCH 1/4] feat: implement test command with pytest integration and
 configuration management

---
 agentflow_cli/cli/commands/test.py | 87 ++++++++++++++++++++++++++++++
 agentflow_cli/cli/core/config.py   | 11 ++++
 agentflow_cli/cli/main.py          | 39 ++++++++++++++
 3 files changed, 137 insertions(+)
 create mode 100644 agentflow_cli/cli/commands/test.py

diff --git a/agentflow_cli/cli/commands/test.py b/agentflow_cli/cli/commands/test.py
new file mode 100644
index 0000000..73b4b11
--- /dev/null
+++ b/agentflow_cli/cli/commands/test.py
@@ -0,0 +1,87 @@
+"""Test command implementation — thin pytest wrapper."""
+
+import subprocess
+import sys
+import webbrowser
+from pathlib import Path
+from typing import Any
+
+from agentflow_cli.cli.commands import BaseCommand
+from agentflow_cli.cli.core.config import ConfigManager
+
+
+class TestCommand(BaseCommand):
+    """Run the project's test suite via pytest."""
+
+    def execute(
+        self,
+        path: str | None = None,
+        coverage: bool = False,
+        html: bool = False,
+        keyword: str | None = None,
+        verbose: bool = False,
+        quiet: bool = False,
+        extra_args: tuple[str, ...] = (),
+        **kwargs: Any,
+    ) -> int:
+        """Build the pytest command line, run it in the cwd, and return its exit code.
+
+        ``path`` and ``coverage`` fall back to the ``test`` section of agentflow.json
+        when not given on the command line; ``coverage_threshold`` is read from
+        there only. ``extra_args`` are appended to the pytest invocation verbatim.
+        With ``html`` and coverage enabled, the HTML report is opened afterwards if
+        it exists.
+        """
+        project_root = Path.cwd()
+
+        # Load optional overrides from agentflow.json
+        cfg: dict[str, Any] = {}
+        config_manager = ConfigManager()
+        discovered = config_manager.auto_discover_config()
+        if discovered:
+            try:
+                config_manager.load_config(str(discovered))
+                cfg = config_manager.get_test_config()
+            except Exception:
+                self.logger.debug("Ignoring unreadable agentflow.json", exc_info=True)
+
+        # Explicit CLI path wins; fall back to agentflow.json; None = pytest auto-discovery
+        resolved_path: str | None = path or cfg.get("path") or None
+        resolved_coverage = coverage or cfg.get("coverage", False)
+        coverage_threshold: int | None = cfg.get("coverage_threshold")
+
+        location = str(project_root / resolved_path) if resolved_path else str(project_root)
+        self.output.print_banner("Test", f"Running tests in {location}")
+
+        cmd = [sys.executable, "-m", "pytest"]
+        if resolved_path:
+            cmd.append(resolved_path)
+
+        # pytest runs with -v by default here; only a lone --quiet switches to -q.
+        cmd.append("-q" if quiet and not verbose else "-v")
+
+        if resolved_coverage:
+            cmd += ["--cov=.", "--cov-report=term-missing", "--cov-report=html:htmlcov"]
+            if coverage_threshold is not None:
+                cmd.append(f"--cov-fail-under={coverage_threshold}")
+
+        if keyword:
+            cmd += ["-k", keyword]
+
+        cmd += list(extra_args)
+
+        self.logger.info("Running: %s", " ".join(cmd))
+
+        result = subprocess.run(cmd, cwd=project_root)  # noqa: PLW1510
+
+        if result.returncode == 0:
+            self.output.success("All tests passed.")
+        else:
+            self.output.error(f"Tests finished with exit code {result.returncode}.")
+
+        report_file = project_root / "htmlcov" / "index.html"
+        if html and resolved_coverage and report_file.is_file():
+            self.output.info(f"Opening coverage report: {report_file.as_uri()}", emoji=False)
+            webbrowser.open(report_file.as_uri())
+
+        return result.returncode
diff --git a/agentflow_cli/cli/core/config.py b/agentflow_cli/cli/core/config.py
index 8f2f06e..ad8f3d2 100644
--- a/agentflow_cli/cli/core/config.py
+++ b/agentflow_cli/cli/core/config.py
@@ -214,6 +214,17 @@ def get_config_value(self, key: str, default: Any = None) -> Any:
 
         return value
 
+    def get_test_config(self) -> dict[str, Any]:
+        """Return the optional 'test' section from agentflow.json.
+
+        Returns a dict with keys: path, coverage, coverage_threshold.
+        All fields are optional; callers should use .get() with their own defaults.
+        """
+        raw = self.get_config_value("test", default={})
+        if not isinstance(raw, dict):
+            return {}
+        return raw
+
     def resolve_env_file(self) -> Path | None:
         """Resolve environment file path from configuration.
 
diff --git a/agentflow_cli/cli/main.py b/agentflow_cli/cli/main.py
index b103084..ecd44b5 100644
--- a/agentflow_cli/cli/main.py
+++ b/agentflow_cli/cli/main.py
@@ -9,6 +9,7 @@
 from agentflow_cli.cli.commands.build import BuildCommand
 from agentflow_cli.cli.commands.init import InitCommand
 from agentflow_cli.cli.commands.skills import SkillsCommand
+from agentflow_cli.cli.commands.test import TestCommand
 from agentflow_cli.cli.commands.version import VersionCommand
 from agentflow_cli.cli.constants import (
     DEFAULT_CONFIG_FILE,
@@ -362,6 +363,44 @@ def skills(
         sys.exit(handle_exception(e))
 
 
+@app.command(
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def test(
+    ctx: typer.Context,
+    path: str | None = typer.Argument(None, help="Path to tests directory or file (omit to let pytest auto-discover)"),
+    coverage: bool = typer.Option(False, "--coverage", "-C", help="Run with coverage"),
+    html: bool = typer.Option(
+        False, "--html", help="Open HTML coverage report after run (requires --coverage)"
+    ),
+    keyword: str | None = typer.Option(
+        None, "-k", help="Only run tests matching this expression"
+    ),
+    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output"),
+    quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress all output except errors"),
+) -> None:
+    """Run project tests with pytest.
+
+    Any arguments after -- are forwarded verbatim to pytest.
+    """
+    setup_cli_logging(verbose=verbose, quiet=quiet)
+
+    try:
+        command = TestCommand(output)
+        exit_code = command.execute(
+            path=path,
+            coverage=coverage,
+            html=html,
+            keyword=keyword,
+            verbose=verbose,
+            quiet=quiet,
+            extra_args=tuple(ctx.args),
+        )
+        sys.exit(exit_code)
+    except Exception as e:
+        sys.exit(handle_exception(e))
+
+
 def main() -> None:
     """Main CLI entry point."""
     try:

From 61ba4df5630e200aa4713182956abf1852cd7838 Mon Sep 17 00:00:00 2001
From: Shudipto Trafder <shudiptotrafder@gmail.com>
Date: Wed, 13 May 2026 23:46:58 +0600
Subject: [PATCH 2/4] feat: add eval command for agent evaluations with report
 generation

---
 agentflow_cli/cli/commands/eval.py            | 259 ++++++++++++++++++
 agentflow_cli/cli/commands/test.py            |   2 +-
 agentflow_cli/cli/core/config.py              |  11 +
 agentflow_cli/cli/main.py                     |  68 ++++-
 .../templates/prod/evals/weather_agents.py    |  38 ++-
 5 files changed, 358 insertions(+), 20 deletions(-)
 create mode 100644 agentflow_cli/cli/commands/eval.py

diff --git a/agentflow_cli/cli/commands/eval.py b/agentflow_cli/cli/commands/eval.py
new file mode 100644
index 0000000..26c690e
--- /dev/null
+++ b/agentflow_cli/cli/commands/eval.py
@@ -0,0 +1,259 @@
+"""Eval command — discover and run agentflow evaluations, always generating reports."""
+
+from __future__ import annotations
+
+import asyncio
+import importlib
+import importlib.util
+import inspect
+import sys
+import webbrowser
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+from agentflow_cli.cli.commands import BaseCommand
+from agentflow_cli.cli.core.config import ConfigManager
+
+
+if TYPE_CHECKING:
+    from agentflow.qa.evaluation.eval_result import EvalReport
+
+
+class EvalCommand(BaseCommand):
+    """Discover and run agent evaluations; always write HTML + JSON reports."""
+
+    # ------------------------------------------------------------------
+    # Discovery
+    # ------------------------------------------------------------------
+
+    def _discover(self, target: Path) -> list[Path]:
+        """Return eval files under target. If target is a file, return it directly."""
+        if target.is_file():
+            return [target]
+
+        seen: dict[Path, None] = {}
+        for pattern in ("*_eval.py", "eval_*.py"):
+            for p in sorted(target.rglob(pattern)):
+                seen[p] = None
+        return list(seen)
+
+    # ------------------------------------------------------------------
+    # Module loading
+    # ------------------------------------------------------------------
+
+    def _load_module(self, path: Path) -> Any:
+        project_root = str(Path.cwd())
+        if project_root not in sys.path:
+            sys.path.insert(0, project_root)
+
+        spec = importlib.util.spec_from_file_location("_agentflow_eval", path)
+        mod = importlib.util.module_from_spec(spec)  # type: ignore[arg-type]
+        spec.loader.exec_module(mod)  # type: ignore[union-attr]
+        return mod
+
+    # ------------------------------------------------------------------
+    # Agent loading from agentflow.json
+    # ------------------------------------------------------------------
+
+    def _load_agent_from_config(self) -> Any:
+        config_manager = ConfigManager()
+        discovered = config_manager.auto_discover_config()
+        if not discovered:
+            raise RuntimeError("No agentflow.json found — cannot auto-load agent.")
+        config_manager.load_config(str(discovered))
+        agent_spec: str = config_manager.get_config_value("agent", default="")
+        if not agent_spec or ":" not in agent_spec:
+            raise RuntimeError(f"Invalid 'agent' field in agentflow.json: {agent_spec!r}")
+        module_path, attr = agent_spec.rsplit(":", 1)
+        mod = importlib.import_module(module_path)
+        return getattr(mod, attr)
+
+    # ------------------------------------------------------------------
+    # Per-file runner (sync wrapper — avoids nested asyncio.run() issues)
+    # ------------------------------------------------------------------
+
+    def _run_file_sync(self, path: Path) -> EvalReport | None:
+        """Load and run a single eval file. Returns EvalReport or None if skipped."""
+        self.output.info(f"Running: {path.name}", emoji=False)
+        mod = self._load_module(path)
+
+        # Option A: run() function — module has full control
+        if hasattr(mod, "run"):
+            result = mod.run()
+            if inspect.isawaitable(result):
+                return asyncio.run(result)
+            return result  # type: ignore[return-value]
+
+        # Option B: get_eval_set() + get_eval_config() — CLI loads agent
+        if hasattr(mod, "get_eval_set") and hasattr(mod, "get_eval_config"):
+            return self._run_with_evaluator(mod, mod.get_eval_set(), mod.get_eval_config())
+
+        # Option C: EVAL_CONFIG constant + get_eval_set()
+        if hasattr(mod, "EVAL_CONFIG") and hasattr(mod, "get_eval_set"):
+            return self._run_with_evaluator(mod, mod.get_eval_set(), mod.EVAL_CONFIG)
+
+        self.output.warning(
+            f"Skipping {path.name} — no run(), get_eval_set()+get_eval_config(), "
+            "or EVAL_CONFIG+get_eval_set() found."
+        )
+        return None
+
+    def _run_with_evaluator(self, mod: Any, eval_set: Any, config: Any) -> EvalReport:
+        from agentflow.qa.evaluation.collectors.trajectory_collector import TrajectoryCollector
+        from agentflow.qa.evaluation.evaluator import AgentEvaluator
+
+        # Prefer the graph already imported in the module (most common pattern)
+        graph = getattr(mod, "app", None) or self._load_agent_from_config()
+        collector = TrajectoryCollector(capture_all_events=True)
+        evaluator = AgentEvaluator(graph, collector, config=config)
+        return asyncio.run(evaluator.evaluate(eval_set))
+
+    # ------------------------------------------------------------------
+    # Report merging
+    # ------------------------------------------------------------------
+
+    def _merge_reports(self, reports: list[EvalReport]) -> EvalReport:
+        if len(reports) == 1:
+            return reports[0]
+
+        from agentflow.qa.evaluation.eval_result import EvalReport as ER
+
+        all_results = []
+        for r in reports:
+            all_results.extend(r.results)
+        return ER.create(
+            eval_set_id="combined_eval",
+            eval_set_name="Combined Evaluation",
+            results=all_results,
+        )
+
+    # ------------------------------------------------------------------
+    # Eval directory from agentflow.json
+    # ------------------------------------------------------------------
+
+    def _resolve_eval_dir(self) -> Path:
+        """Return cwd/<evaluation.directory from agentflow.json>, defaulting to cwd/evals."""
+        directory = "evals"
+        config_manager = ConfigManager()
+        discovered = config_manager.auto_discover_config()
+        if discovered:
+            try:
+                config_manager.load_config(str(discovered))
+                directory = config_manager.get_evaluation_config().get("directory") or directory
+            except Exception:
+                self.logger.debug("Ignoring unreadable agentflow.json", exc_info=True)
+        return Path.cwd() / directory
+
+    # ------------------------------------------------------------------
+    # Main entry point
+    # ------------------------------------------------------------------
+
+    def execute(  # noqa: PLR0912, PLR0915
+        self,
+        target: str | None = None,
+        output_dir: str = "eval_reports",
+        no_report: bool = False,
+        threshold: float | None = None,
+        open_report: bool = False,
+        verbose: bool = False,
+        quiet: bool = False,
+        **kwargs: Any,
+    ) -> int:
+        """Discover eval files, run them, and report on the merged result.
+
+        ``output_dir`` and ``threshold`` fall back to the ``evaluation`` section of
+        agentflow.json when left at their defaults. Returns 0 when the pass rate
+        reaches ``threshold`` (100% when no threshold is set), otherwise 1.
+        """
+        # Load optional overrides from agentflow.json
+        config_manager = ConfigManager()
+        discovered = config_manager.auto_discover_config()
+        if discovered:
+            try:
+                config_manager.load_config(str(discovered))
+                eval_cfg = config_manager.get_evaluation_config()
+                if output_dir == "eval_reports":
+                    output_dir = eval_cfg.get("output_dir") or output_dir
+                if threshold is None:
+                    threshold = eval_cfg.get("threshold")
+            except Exception:
+                self.logger.debug("Ignoring unreadable agentflow.json", exc_info=True)
+
+        # Resolve target path
+        target_path = Path(target) if target else self._resolve_eval_dir()
+        if not target_path.exists():
+            if target:
+                self.output.error(f"Path not found: {target}")
+            else:
+                self.output.error(
+                    f"Eval directory '{target_path}' not found. "
+                    "Create an evals/ directory or pass a file/folder path."
+                )
+            return 1
+
+        files = self._discover(target_path)
+        if not files:
+            self.output.error(f"No eval files found in {target_path}")
+            return 1
+
+        self.output.print_banner("Eval", f"Found {len(files)} eval file(s) in {target_path}")
+
+        # Run each file
+        reports: list[EvalReport] = []
+        for f in files:
+            try:
+                report = self._run_file_sync(f)
+                if report is not None:
+                    reports.append(report)
+            except Exception as exc:
+                self.output.error(f"Error in {f.name}: {exc}")
+                self.logger.exception("Eval file failed: %s", f)
+
+        if not reports:
+            self.output.error("No reports produced. Eval files need run() or get_eval_set().")
+            return 1
+
+        merged = self._merge_reports(reports)
+
+        # Exit code: meet the threshold when one is set, otherwise require a 100% pass rate.
+        pass_rate = merged.summary.pass_rate
+        required = 1.0 if threshold is None else threshold
+        return_code = 0 if pass_rate >= required else 1
+        if threshold is not None and return_code:
+            self.output.error(f"Pass rate {pass_rate:.1%} is below threshold {threshold:.1%}")
+
+        # Always generate file reports unless --no-report
+        if not no_report:
+            from agentflow.qa.evaluation.config.eval_config import ReporterConfig
+            from agentflow.qa.evaluation.reporters.manager import ReporterManager
+
+            manager = ReporterManager(
+                ReporterConfig(
+                    output_dir=output_dir,
+                    html=True,
+                    json_report=True,
+                    console=False,  # console output already handled by run() modules
+                    timestamp_files=True,
+                )
+            )
+            result = manager.run_all(merged)
+
+            if result.html_path:
+                self.output.success(f"HTML report: {result.html_path}")
+            if result.json_path:
+                self.output.info(f"JSON report: {result.json_path}", emoji=False)
+            if result.has_errors:
+                for name, err in result.errors:
+                    self.output.warning(f"Reporter error [{name}]: {err}")
+
+            if open_report and result.html_path:
+                webbrowser.open(Path(result.html_path).resolve().as_uri())
+
+        summary = merged.summary
+        self.output.info(
+            f"Results: {summary.passed_cases}/{summary.total_cases} passed "
+            f"({summary.pass_rate:.1%})",
+            emoji=False,
+        )
+
+        return return_code
diff --git a/agentflow_cli/cli/commands/test.py b/agentflow_cli/cli/commands/test.py
index 73b4b11..1ffce9f 100644
--- a/agentflow_cli/cli/commands/test.py
+++ b/agentflow_cli/cli/commands/test.py
@@ -72,7 +72,7 @@ def execute(
 
         self.logger.info("Running: %s", " ".join(cmd))
 
-        result = subprocess.run(cmd, cwd=project_root)  # noqa: PLW1510
+        result = subprocess.run(cmd, cwd=project_root)  # noqa: PLW1510, S603
 
         if result.returncode == 0:
             self.output.success("All tests passed.")
diff --git a/agentflow_cli/cli/core/config.py b/agentflow_cli/cli/core/config.py
index ad8f3d2..792c530 100644
--- a/agentflow_cli/cli/core/config.py
+++ b/agentflow_cli/cli/core/config.py
@@ -225,6 +225,17 @@ def get_test_config(self) -> dict[str, Any]:
             return {}
         return raw
 
+    def get_evaluation_config(self) -> dict[str, Any]:
+        """Return the optional 'evaluation' section from agentflow.json.
+
+        Returns a dict with keys: directory, output_dir, threshold, timestamp_files.
+        All fields are optional; callers should use .get() with their own defaults.
+        """
+        raw = self.get_config_value("evaluation", default={})
+        if not isinstance(raw, dict):
+            return {}
+        return raw
+
     def resolve_env_file(self) -> Path | None:
         """Resolve environment file path from configuration.
 
diff --git a/agentflow_cli/cli/main.py b/agentflow_cli/cli/main.py
index ecd44b5..3ee7aed 100644
--- a/agentflow_cli/cli/main.py
+++ b/agentflow_cli/cli/main.py
@@ -7,6 +7,7 @@
 
 from agentflow_cli.cli.commands.api import APICommand
 from agentflow_cli.cli.commands.build import BuildCommand
+from agentflow_cli.cli.commands.eval import EvalCommand
 from agentflow_cli.cli.commands.init import InitCommand
 from agentflow_cli.cli.commands.skills import SkillsCommand
 from agentflow_cli.cli.commands.test import TestCommand
@@ -368,14 +369,14 @@ def skills(
 )
 def test(
     ctx: typer.Context,
-    path: str | None = typer.Argument(None, help="Path to tests directory or file (omit to let pytest auto-discover)"),
+    path: str | None = typer.Argument(
+        None, help="Path to tests directory or file (omit to let pytest auto-discover)"
+    ),
     coverage: bool = typer.Option(False, "--coverage", "-C", help="Run with coverage"),
     html: bool = typer.Option(
         False, "--html", help="Open HTML coverage report after run (requires --coverage)"
     ),
-    keyword: str | None = typer.Option(
-        None, "-k", help="Only run tests matching this expression"
-    ),
+    keyword: str | None = typer.Option(None, "-k", help="Only run tests matching this expression"),
     verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output"),
     quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress all output except errors"),
 ) -> None:
@@ -401,6 +402,65 @@ def test(
         sys.exit(handle_exception(e))
 
 
+@app.command()
+def eval(
+    target: str | None = typer.Argument(
+        None,
+        help="File or directory to evaluate (default: evals/ from agentflow.json or cwd)",
+    ),
+    output_dir: str = typer.Option(
+        "eval_reports",
+        "--output",
+        "-o",
+        help="Directory for generated report files",
+    ),
+    no_report: bool = typer.Option(
+        False,
+        "--no-report",
+        help="Skip file report generation (console summary only)",
+    ),
+    threshold: float | None = typer.Option(
+        None,
+        "--threshold",
+        "-t",
+        help="Fail if overall pass rate is below this value (0.0–1.0)",
+    ),
+    open_report: bool = typer.Option(
+        False,
+        "--open",
+        help="Open the HTML report in the default browser after the run",
+    ),
+    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output"),
+    quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress all output except errors"),
+) -> None:
+    """Run agent evaluations.
+
+    Discovers *_eval.py / eval_*.py files in the target directory (default: evals/).
+    Always generates HTML + JSON reports in eval_reports/ unless --no-report is set.
+
+    Each eval file must expose one of:
+      run()                           # full control, returns EvalReport
+      get_eval_set() + get_eval_config()  # CLI loads agent from agentflow.json
+      EVAL_CONFIG + get_eval_set()        # same, config as a constant
+    """
+    setup_cli_logging(verbose=verbose, quiet=quiet)
+
+    try:
+        command = EvalCommand(output)
+        exit_code = command.execute(
+            target=target,
+            output_dir=output_dir,
+            no_report=no_report,
+            threshold=threshold,
+            open_report=open_report,
+            verbose=verbose,
+            quiet=quiet,
+        )
+        sys.exit(exit_code)
+    except Exception as e:
+        sys.exit(handle_exception(e))
+
+
 def main() -> None:
     """Main CLI entry point."""
     try:
diff --git a/agentflow_cli/cli/templates/prod/evals/weather_agents.py b/agentflow_cli/cli/templates/prod/evals/weather_agents.py
index 35a72f0..62e56c3 100644
--- a/agentflow_cli/cli/templates/prod/evals/weather_agents.py
+++ b/agentflow_cli/cli/templates/prod/evals/weather_agents.py
@@ -92,28 +92,36 @@ def build_weather_agent_eval_config() -> EvalConfig:
     )
 
 
-def run_weather_agent_eval(config: EvalConfig | None = None):
-    """Run the current compiled graph against the weather-agent eval set."""
-    config = config or build_weather_agent_eval_config()
+def get_eval_set() -> EvalSet:
+    """Return the eval set for CLI discovery (agentflow eval)."""
+    return build_weather_agent_eval_set()
+
+
+def get_eval_config() -> EvalConfig:
+    """Return the eval config for CLI discovery (agentflow eval)."""
+    return build_weather_agent_eval_config()
+
+
+def run() -> ReporterOutput:
+    """Entry point for agentflow eval CLI discovery.
+
+    Runs the full eval and returns the reporter output (HTML + JSON + JUnit).
+    """
+    config = build_weather_agent_eval_config()
     collector = TrajectoryCollector(capture_all_events=True)
-    return QuickEval.run_sync(
+    report = QuickEval.run_sync(
         graph=app,
         collector=collector,
         eval_set=build_weather_agent_eval_set(),
         config=config,
+        print_results=True,
     )
-
-
-def write_weather_agent_eval_reports(report=None) -> ReporterOutput:
-    """Write JSON, HTML, and JUnit XML report files for a completed eval report."""
-    config = build_weather_agent_eval_config()
-    eval_report = report or run_weather_agent_eval(config=config)
     EVAL_REPORT_DIR.mkdir(parents=True, exist_ok=True)
-    return ReporterManager(config.reporter).run_all(eval_report, output_dir=str(EVAL_REPORT_DIR))
+    return ReporterManager(config.reporter).run_all(report, output_dir=str(EVAL_REPORT_DIR))
 
 
 if __name__ == "__main__":
-    output = write_weather_agent_eval_reports()
-    print(f"JSON report: {output.json_path}")  # noqa: T201
-    print(f"HTML report: {output.html_path}")  # noqa: T201
-    print(f"JUnit report: {output.junit_path}")  # noqa: T201
+    reporter_output = run()
+    print(f"JSON report: {reporter_output.json_path}")  # noqa: T201
+    print(f"HTML report: {reporter_output.html_path}")  # noqa: T201
+    print(f"JUnit report: {reporter_output.junit_path}")  # noqa: T201

From c99dfcec4dec3789477f6eb18385af873e735b7f Mon Sep 17 00:00:00 2001
From: Shudipto Trafder <shudiptotrafder@gmail.com>
Date: Thu, 14 May 2026 00:09:14 +0600
Subject: [PATCH 3/4] feat: enhance eval command with default configuration and
 new evaluation structure

---
 agentflow_cli/cli/commands/eval.py            |  38 ++++--
 .../cli/templates/prod/agentflow.json         |  11 ++
 .../templates/prod/evals/weather_agents.py    | 127 ------------------
 .../prod/evals/weather_agents_eval.py         |  36 +++++
 .../templates/prod/tests/test_agent_eval.py   |  40 +++---
 5 files changed, 87 insertions(+), 165 deletions(-)
 delete mode 100644 agentflow_cli/cli/templates/prod/evals/weather_agents.py
 create mode 100644 agentflow_cli/cli/templates/prod/evals/weather_agents_eval.py

diff --git a/agentflow_cli/cli/commands/eval.py b/agentflow_cli/cli/commands/eval.py
index 26c690e..65b0ce8 100644
--- a/agentflow_cli/cli/commands/eval.py
+++ b/agentflow_cli/cli/commands/eval.py
@@ -72,30 +72,42 @@ def _load_agent_from_config(self) -> Any:
     # Per-file runner (sync wrapper — avoids nested asyncio.run() issues)
     # ------------------------------------------------------------------
 
+    def _default_config(self) -> Any:
+        from agentflow.qa.evaluation import CriterionConfig, EvalConfig, MatchType
+
+        return EvalConfig(
+            criteria={
+                "response": CriterionConfig(threshold=0.6, match_type=MatchType.ANY_ORDER),
+                "tool_usage": CriterionConfig(
+                    threshold=1.0, match_type=MatchType.EXACT, check_args=False
+                ),
+                "node_order": CriterionConfig(threshold=0.8, match_type=MatchType.IN_ORDER),
+            },
+        )
+
     def _run_file_sync(self, path: Path) -> EvalReport | None:
         """Load and run a single eval file. Returns EvalReport or None if skipped."""
         self.output.info(f"Running: {path.name}", emoji=False)
         mod = self._load_module(path)
 
-        # Option A: run() function — module has full control
+        # Option A: run() — module has full control
         if hasattr(mod, "run"):
             result = mod.run()
             if inspect.isawaitable(result):
                 return asyncio.run(result)
             return result  # type: ignore[return-value]
 
-        # Option B: get_eval_set() + get_eval_config() — CLI loads agent
-        if hasattr(mod, "get_eval_set") and hasattr(mod, "get_eval_config"):
-            return self._run_with_evaluator(mod, mod.get_eval_set(), mod.get_eval_config())
-
-        # Option C: EVAL_CONFIG constant + get_eval_set()
-        if hasattr(mod, "EVAL_CONFIG") and hasattr(mod, "get_eval_set"):
-            return self._run_with_evaluator(mod, mod.get_eval_set(), mod.EVAL_CONFIG)
-
-        self.output.warning(
-            f"Skipping {path.name} — no run(), get_eval_set()+get_eval_config(), "
-            "or EVAL_CONFIG+get_eval_set() found."
-        )
+        # Option B: get_eval_set() — config is optional; CLI fills in the default
+        if hasattr(mod, "get_eval_set"):
+            if hasattr(mod, "get_eval_config"):
+                config = mod.get_eval_config()
+            elif hasattr(mod, "EVAL_CONFIG"):
+                config = mod.EVAL_CONFIG
+            else:
+                config = self._default_config()
+            return self._run_with_evaluator(mod, mod.get_eval_set(), config)
+
+        self.output.warning(f"Skipping {path.name} — no run() or get_eval_set() found.")
         return None
 
     def _run_with_evaluator(self, mod: Any, eval_set: Any, config: Any) -> EvalReport:
diff --git a/agentflow_cli/cli/templates/prod/agentflow.json b/agentflow_cli/cli/templates/prod/agentflow.json
index 918b003..d0423c5 100644
--- a/agentflow_cli/cli/templates/prod/agentflow.json
+++ b/agentflow_cli/cli/templates/prod/agentflow.json
@@ -15,5 +15,16 @@
     "by": "ip",
     "trusted_proxy_headers": false,
     "exclude_paths": ["/health", "/docs", "/redoc", "/openapi.json"]
+  },
+  "test": {
+    "path": "tests",
+    "coverage": true,
+    "coverage_threshold": 70
+  },
+  "evaluation": {
+    "directory": "evals",
+    "output_dir": "eval_reports",
+    "threshold": 0.75,
+    "timestamp_files": true
   }
 }
diff --git a/agentflow_cli/cli/templates/prod/evals/weather_agents.py b/agentflow_cli/cli/templates/prod/evals/weather_agents.py
deleted file mode 100644
index 62e56c3..0000000
--- a/agentflow_cli/cli/templates/prod/evals/weather_agents.py
+++ /dev/null
@@ -1,127 +0,0 @@
-from pathlib import Path
-
-from agentflow.qa.evaluation import (
-    CriterionConfig,
-    EvalConfig,
-    EvalSet,
-    EvalSetBuilder,
-    MatchType,
-    QuickEval,
-    ReporterConfig,
-    ReporterManager,
-    ReporterOutput,
-    TrajectoryCollector,
-)
-
-from graph.agent import app
-
-
-EVAL_REPORT_DIR = Path("eval_reports")
-
-
-def build_weather_agent_eval_set() -> EvalSet:
-    """Build deterministic behavior checks for the weather agent."""
-    return (
-        EvalSetBuilder(name="weather-agent-regression")
-        .add_case(
-            query="Hi",
-            expected="assistant",
-            expected_node_order=["MAIN"],
-            name="greeting_response",
-            description="The agent responds to a greeting.",
-        )
-        .add_tool_test(
-            query="What is the weather in London?",
-            tool_name="get_weather",
-            tool_args={"location": "London"},
-            expected_response="London",
-            case_id="weather_london",
-        )
-        .add_tool_test(
-            query="Tell me the current weather in New York",
-            tool_name="get_weather",
-            tool_args={"location": "New York"},
-            expected_response="New York",
-            case_id="weather_new_york",
-        )
-        .add_tool_test(
-            query="How is the weather in Tokyo today?",
-            tool_name="get_weather",
-            tool_args={"location": "Tokyo"},
-            expected_response="Tokyo",
-            case_id="weather_tokyo",
-        )
-        .build()
-    )
-
-
-def build_weather_agent_eval_config() -> EvalConfig:
-    """Configure fast non-judge criteria for regression evaluation."""
-    return EvalConfig(
-        criteria={
-            "response": CriterionConfig(
-                threshold=0.6,
-                match_type=MatchType.ANY_ORDER,
-            ),
-            "tool_usage": CriterionConfig(
-                threshold=1.0,
-                match_type=MatchType.EXACT,
-                check_args=False,
-            ),
-            "node_order": CriterionConfig(
-                threshold=0.8,
-                match_type=MatchType.IN_ORDER,
-            ),
-        },
-        mock_mode=True,
-        verbose=True,
-        reporter=ReporterConfig(
-            enabled=True,
-            output_dir=str(EVAL_REPORT_DIR),
-            console=True,
-            json_report=True,
-            html=True,
-            junit_xml=True,
-            include_details=True,
-            include_trajectory=True,
-            include_node_responses=True,
-            include_actual_response=True,
-            include_tool_call_details=True,
-            timestamp_files=True,
-        ),
-    )
-
-
-def get_eval_set() -> EvalSet:
-    """Return the eval set for CLI discovery (agentflow eval)."""
-    return build_weather_agent_eval_set()
-
-
-def get_eval_config() -> EvalConfig:
-    """Return the eval config for CLI discovery (agentflow eval)."""
-    return build_weather_agent_eval_config()
-
-
-def run() -> ReporterOutput:
-    """Entry point for agentflow eval CLI discovery.
-
-    Runs the full eval and returns the reporter output (HTML + JSON + JUnit).
-    """
-    config = build_weather_agent_eval_config()
-    collector = TrajectoryCollector(capture_all_events=True)
-    report = QuickEval.run_sync(
-        graph=app,
-        collector=collector,
-        eval_set=build_weather_agent_eval_set(),
-        config=config,
-        print_results=True,
-    )
-    EVAL_REPORT_DIR.mkdir(parents=True, exist_ok=True)
-    return ReporterManager(config.reporter).run_all(report, output_dir=str(EVAL_REPORT_DIR))
-
-
-if __name__ == "__main__":
-    reporter_output = run()
-    print(f"JSON report: {reporter_output.json_path}")  # noqa: T201
-    print(f"HTML report: {reporter_output.html_path}")  # noqa: T201
-    print(f"JUnit report: {reporter_output.junit_path}")  # noqa: T201
diff --git a/agentflow_cli/cli/templates/prod/evals/weather_agents_eval.py b/agentflow_cli/cli/templates/prod/evals/weather_agents_eval.py
new file mode 100644
index 0000000..b863c26
--- /dev/null
+++ b/agentflow_cli/cli/templates/prod/evals/weather_agents_eval.py
@@ -0,0 +1,36 @@
+from agentflow.qa.evaluation import EvalSet, EvalSetBuilder
+
+
+def get_eval_set() -> EvalSet:  # hook discovered by `agentflow eval`
+    return (  # no eval config is defined here; eval.py notes the CLI fills in its default
+        EvalSetBuilder(name="weather-agent-regression")
+        .add_case(  # greeting case: declares an expected value and node order, no tool args
+            query="Hi",
+            expected="assistant",  # NOTE(review): presumably matched against the reply — confirm
+            expected_node_order=["MAIN"],
+            name="greeting_response",
+            description="The agent responds to a greeting.",
+        )
+        .add_tool_test(  # three get_weather cases follow, one per city, each phrased differently
+            query="What is the weather in London?",
+            tool_name="get_weather",
+            tool_args={"location": "London"},
+            expected_response="London",
+            case_id="weather_london",
+        )
+        .add_tool_test(
+            query="Tell me the current weather in New York",
+            tool_name="get_weather",
+            tool_args={"location": "New York"},
+            expected_response="New York",
+            case_id="weather_new_york",
+        )
+        .add_tool_test(
+            query="How is the weather in Tokyo today?",
+            tool_name="get_weather",
+            tool_args={"location": "Tokyo"},
+            expected_response="Tokyo",
+            case_id="weather_tokyo",  # asserted in tests/test_agent_eval.py; keep in sync
+        )
+        .build()
+    )
diff --git a/agentflow_cli/cli/templates/prod/tests/test_agent_eval.py b/agentflow_cli/cli/templates/prod/tests/test_agent_eval.py
index b4a5a34..b1fed1b 100644
--- a/agentflow_cli/cli/templates/prod/tests/test_agent_eval.py
+++ b/agentflow_cli/cli/templates/prod/tests/test_agent_eval.py
@@ -3,18 +3,20 @@
 import asyncio
 
 from agentflow.qa import QuickTest
-from agentflow.qa.evaluation import CriterionResult, EvalCaseResult, EvalReport, EvalSummary
-
-from evals.weather_agents import (
-    EVAL_REPORT_DIR,
-    build_weather_agent_eval_config,
-    build_weather_agent_eval_set,
-    write_weather_agent_eval_reports,
+from agentflow.qa.evaluation import (
+    CriterionResult,
+    EvalCaseResult,
+    EvalReport,
+    EvalSummary,
+    ReporterConfig,
+    ReporterManager,
 )
 
+from evals.weather_agents_eval import get_eval_set
+
 
-def test_weather_agent_eval_set_covers_core_weather_behaviors() -> None:
-    eval_set = build_weather_agent_eval_set()
+def test_eval_set_covers_core_weather_behaviors() -> None:
+    eval_set = get_eval_set()
 
     case_ids = {case.eval_id for case in eval_set.eval_cases}
     case_names = {case.name for case in eval_set.eval_cases}
@@ -27,20 +29,7 @@ def test_weather_agent_eval_set_covers_core_weather_behaviors() -> None:
     assert "weather_tokyo" in case_ids
 
 
-def test_weather_agent_eval_config_uses_deterministic_criteria() -> None:
-    config = build_weather_agent_eval_config()
-
-    assert config.mock_mode is True
-    assert set(config.criteria) == {"response", "tool_usage", "node_order"}
-    assert config.criteria["tool_usage"].check_args is False
-    assert config.reporter.output_dir == str(EVAL_REPORT_DIR)
-    assert config.reporter.json_report is True
-    assert config.reporter.html is True
-    assert config.reporter.junit_xml is True
-
-
-def test_write_weather_agent_eval_reports_creates_report_artifacts(tmp_path, monkeypatch) -> None:
-    monkeypatch.setattr("evals.weather_agents.EVAL_REPORT_DIR", tmp_path)
+def test_reporter_writes_html_and_json_artifacts(tmp_path) -> None:
     report = EvalReport(
         eval_set_id="weather-agent-regression",
         eval_set_name="weather-agent-regression",
@@ -69,11 +58,12 @@ def test_write_weather_agent_eval_reports_creates_report_artifacts(tmp_path, mon
         ),
     )
 
-    output = write_weather_agent_eval_reports(report)
+    output = ReporterManager(
+        ReporterConfig(output_dir=str(tmp_path), html=True, json_report=True, console=False)
+    ).run_all(report, output_dir=str(tmp_path))
 
     assert output.json_path is not None
     assert output.html_path is not None
-    assert output.junit_path is not None
     assert "weather-agent-regression" in tmp_path.joinpath(output.json_path).read_text(
         encoding="utf-8"
     )

From d6e6448da59b5d1628b5d1c1fc4112a1f36f52ed Mon Sep 17 00:00:00 2001
From: Shudipto Trafder <shudiptotrafder@gmail.com>
Date: Thu, 14 May 2026 00:24:12 +0600
Subject: [PATCH 4/4] feat: refactor eval and test commands for improved
 configuration handling and command naming

---
 agentflow_cli/cli/commands/eval.py | 30 +++++++++++++++---------------
 agentflow_cli/cli/commands/test.py |  8 ++++----
 agentflow_cli/cli/main.py          |  6 +++---
 3 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/agentflow_cli/cli/commands/eval.py b/agentflow_cli/cli/commands/eval.py
index 65b0ce8..36ae6d0 100644
--- a/agentflow_cli/cli/commands/eval.py
+++ b/agentflow_cli/cli/commands/eval.py
@@ -11,6 +11,13 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
+from agentflow.qa.evaluation import CriterionConfig, EvalConfig, MatchType
+from agentflow.qa.evaluation.collectors.trajectory_collector import TrajectoryCollector
+from agentflow.qa.evaluation.config.eval_config import ReporterConfig
+from agentflow.qa.evaluation.eval_result import EvalReport as ER
+from agentflow.qa.evaluation.evaluator import AgentEvaluator
+from agentflow.qa.evaluation.reporters.manager import ReporterManager
+
 from agentflow_cli.cli.commands import BaseCommand
 from agentflow_cli.cli.core.config import ConfigManager
 
@@ -73,8 +80,6 @@ def _load_agent_from_config(self) -> Any:
     # ------------------------------------------------------------------
 
     def _default_config(self) -> Any:
-        from agentflow.qa.evaluation import CriterionConfig, EvalConfig, MatchType
-
         return EvalConfig(
             criteria={
                 "response": CriterionConfig(threshold=0.6, match_type=MatchType.ANY_ORDER),
@@ -94,7 +99,7 @@ def _run_file_sync(self, path: Path) -> EvalReport | None:
         if hasattr(mod, "run"):
             result = mod.run()
             if inspect.isawaitable(result):
-                return asyncio.run(result)
+                return asyncio.run(result)  # type: ignore[arg-type]
             return result  # type: ignore[return-value]
 
         # Option B: get_eval_set() — config is optional; CLI fills in the default
@@ -111,9 +116,6 @@ def _run_file_sync(self, path: Path) -> EvalReport | None:
         return None
 
     def _run_with_evaluator(self, mod: Any, eval_set: Any, config: Any) -> EvalReport:
-        from agentflow.qa.evaluation.collectors.trajectory_collector import TrajectoryCollector
-        from agentflow.qa.evaluation.evaluator import AgentEvaluator
-
         # Prefer the graph already imported in the module (most common pattern)
         graph = getattr(mod, "app", None) or self._load_agent_from_config()
         collector = TrajectoryCollector(capture_all_events=True)
@@ -128,8 +130,6 @@ def _merge_reports(self, reports: list[EvalReport]) -> EvalReport:
         if len(reports) == 1:
             return reports[0]
 
-        from agentflow.qa.evaluation.eval_result import EvalReport as ER
-
         all_results = []
         for r in reports:
             all_results.extend(r.results)
@@ -153,7 +153,9 @@ def _resolve_eval_dir(self) -> Path:
                 eval_cfg = config_manager.get_evaluation_config()
                 directory = eval_cfg.get("directory", "evals")
             except Exception:
-                pass
+                self.logger.warning(
+                    "Failed to load eval directory from config; using default 'evals/'"
+                )
         return Path.cwd() / directory
 
     # ------------------------------------------------------------------
@@ -183,7 +185,9 @@ def execute(  # noqa: PLR0912, PLR0915
                 if threshold is None:
                     threshold = eval_cfg.get("threshold")
             except Exception:
-                pass
+                self.logger.warning(
+                    "Failed to load eval config from agentflow.json; using defaults"
+                )
 
         # Resolve target path
         if target:
@@ -227,8 +231,7 @@ def execute(  # noqa: PLR0912, PLR0915
         # Determine exit code
         if threshold is not None and merged.summary.pass_rate < threshold:
             self.output.error(
-                f"Pass rate {merged.summary.pass_rate:.1%} is below "
-                f"threshold {threshold:.1%}"
+                f"Pass rate {merged.summary.pass_rate:.1%} is below threshold {threshold:.1%}"
             )
             return_code = 1
         else:
@@ -236,9 +239,6 @@ def execute(  # noqa: PLR0912, PLR0915
 
         # Always generate file reports unless --no-report
         if not no_report:
-            from agentflow.qa.evaluation.config.eval_config import ReporterConfig
-            from agentflow.qa.evaluation.reporters.manager import ReporterManager
-
             manager = ReporterManager(
                 ReporterConfig(
                     output_dir=output_dir,
diff --git a/agentflow_cli/cli/commands/test.py b/agentflow_cli/cli/commands/test.py
index 1ffce9f..9dc8f2e 100644
--- a/agentflow_cli/cli/commands/test.py
+++ b/agentflow_cli/cli/commands/test.py
@@ -1,6 +1,6 @@
 """Test command implementation — thin pytest wrapper."""
 
-import subprocess
+import subprocess  # nosec: B404
 import sys
 import webbrowser
 from pathlib import Path
@@ -34,8 +34,8 @@ def execute(
             try:
                 config_manager.load_config(str(discovered))
                 cfg = config_manager.get_test_config()
-            except Exception:  # noqa: S110
-                pass  # config absent or invalid — proceed with CLI defaults
+            except Exception:  # nosec: B110
+                self.logger.warning("Failed to load test configuration from %s", discovered)
 
         # Explicit CLI path wins; fall back to agentflow.json; None = pytest auto-discovery
         resolved_path: str | None = path or cfg.get("path") or None
@@ -72,7 +72,7 @@ def execute(
 
         self.logger.info("Running: %s", " ".join(cmd))
 
-        result = subprocess.run(cmd, cwd=project_root)  # noqa: PLW1510, S603
+        result = subprocess.run(cmd, cwd=project_root, check=False)  # nosec: B603  # noqa: S603
 
         if result.returncode == 0:
             self.output.success("All tests passed.")
diff --git a/agentflow_cli/cli/main.py b/agentflow_cli/cli/main.py
index 3ee7aed..4311fe0 100644
--- a/agentflow_cli/cli/main.py
+++ b/agentflow_cli/cli/main.py
@@ -402,8 +402,8 @@ def test(
         sys.exit(handle_exception(e))
 
 
-@app.command()
-def eval(
+@app.command(name="eval")
+def eval_cmd(
     target: str | None = typer.Argument(
         None,
         help="File or directory to evaluate (default: evals/ from agentflow.json or cwd)",
@@ -423,7 +423,7 @@ def eval(
         None,
         "--threshold",
         "-t",
-        help="Fail if overall pass rate is below this value (0.0–1.0)",
+        help="Fail if overall pass rate is below this value (0.0-1.0)",
     ),
     open_report: bool = typer.Option(
         False,