From b15a7dbf9ab8b5fceb8276b4dd6e99021ff01642 Mon Sep 17 00:00:00 2001 From: Shudipto Trafder Date: Wed, 13 May 2026 22:57:27 +0600 Subject: [PATCH 1/4] feat: implement test command with pytest integration and configuration management --- agentflow_cli/cli/commands/test.py | 87 ++++++++++++++++++++++++++++++ agentflow_cli/cli/core/config.py | 11 ++++ agentflow_cli/cli/main.py | 39 ++++++++++++++ 3 files changed, 137 insertions(+) create mode 100644 agentflow_cli/cli/commands/test.py diff --git a/agentflow_cli/cli/commands/test.py b/agentflow_cli/cli/commands/test.py new file mode 100644 index 0000000..73b4b11 --- /dev/null +++ b/agentflow_cli/cli/commands/test.py @@ -0,0 +1,87 @@ +"""Test command implementation — thin pytest wrapper.""" + +import subprocess +import sys +import webbrowser +from pathlib import Path +from typing import Any + +from agentflow_cli.cli.commands import BaseCommand +from agentflow_cli.cli.core.config import ConfigManager + + +class TestCommand(BaseCommand): + """Run the project's test suite via pytest.""" + + def execute( + self, + path: str | None = None, + coverage: bool = False, + html: bool = False, + keyword: str | None = None, + verbose: bool = False, + quiet: bool = False, + extra_args: tuple[str, ...] = (), + **kwargs: Any, + ) -> int: + project_root = Path.cwd() + + # Load optional overrides from agentflow.json + cfg: dict[str, Any] = {} + config_manager = ConfigManager() + discovered = config_manager.auto_discover_config() + if discovered: + try: + config_manager.load_config(str(discovered)) + cfg = config_manager.get_test_config() + except Exception: # noqa: S110 + pass # config absent or invalid — proceed with CLI defaults + + # Explicit CLI path wins; fall back to agentflow.json; None = pytest auto-discovery + resolved_path: str | None = path or cfg.get("path") or None + resolved_coverage = coverage or cfg.get("coverage", False) + coverage_threshold: int | None = cfg.get("coverage_threshold") + + location = str(project_root / resolved_path) if resolved_path else str(project_root) + self.output.print_banner("Test", f"Running tests in {location}") + + cmd = [sys.executable, "-m", "pytest"] + if resolved_path: + cmd.append(resolved_path) + + if verbose: + cmd.append("-v") + elif quiet: + cmd.append("-q") + else: + cmd.append("-v") + + if resolved_coverage: + cmd += [ + "--cov=.", + "--cov-report=term-missing", + "--cov-report=html:htmlcov", + ] + if coverage_threshold is not None: + cmd.append(f"--cov-fail-under={coverage_threshold}") + + if keyword: + cmd += ["-k", keyword] + + cmd += list(extra_args) + + self.logger.info("Running: %s", " ".join(cmd)) + + result = subprocess.run(cmd, cwd=project_root) # noqa: PLW1510 + + if result.returncode == 0: + self.output.success("All tests passed.") + else: + self.output.error(f"Tests finished with exit code {result.returncode}.") + + if html and resolved_coverage: + report_path = (project_root / "htmlcov" / "index.html").as_uri() + self.output.info(f"Opening coverage report: {report_path}", emoji=False) + webbrowser.open(report_path) + + return result.returncode diff --git a/agentflow_cli/cli/core/config.py b/agentflow_cli/cli/core/config.py index 8f2f06e..ad8f3d2 100644 --- a/agentflow_cli/cli/core/config.py +++ b/agentflow_cli/cli/core/config.py @@ -214,6 +214,17 @@ def get_config_value(self, key: str, default: Any = None) -> Any: return value + def get_test_config(self) -> dict[str, Any]: + """Return the optional 'test' section from agentflow.json. + + Returns a dict with keys: path, coverage, coverage_threshold. + All fields are optional; callers should use .get() with their own defaults. + """ + raw = self.get_config_value("test", default={}) + if not isinstance(raw, dict): + return {} + return raw + def resolve_env_file(self) -> Path | None: """Resolve environment file path from configuration. diff --git a/agentflow_cli/cli/main.py b/agentflow_cli/cli/main.py index b103084..ecd44b5 100644 --- a/agentflow_cli/cli/main.py +++ b/agentflow_cli/cli/main.py @@ -9,6 +9,7 @@ from agentflow_cli.cli.commands.build import BuildCommand from agentflow_cli.cli.commands.init import InitCommand from agentflow_cli.cli.commands.skills import SkillsCommand +from agentflow_cli.cli.commands.test import TestCommand from agentflow_cli.cli.commands.version import VersionCommand from agentflow_cli.cli.constants import ( DEFAULT_CONFIG_FILE, @@ -362,6 +363,44 @@ def skills( sys.exit(handle_exception(e)) +@app.command( + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) +def test( + ctx: typer.Context, + path: str | None = typer.Argument(None, help="Path to tests directory or file (omit to let pytest auto-discover)"), + coverage: bool = typer.Option(False, "--coverage", "-C", help="Run with coverage"), + html: bool = typer.Option( + False, "--html", help="Open HTML coverage report after run (requires --coverage)" + ), + keyword: str | None = typer.Option( + None, "-k", help="Only run tests matching this expression" + ), + verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output"), + quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress all output except errors"), +) -> None: + """Run project tests with pytest. + + Any arguments after -- are forwarded verbatim to pytest. + """ + setup_cli_logging(verbose=verbose, quiet=quiet) + + try: + command = TestCommand(output) + exit_code = command.execute( + path=path, + coverage=coverage, + html=html, + keyword=keyword, + verbose=verbose, + quiet=quiet, + extra_args=tuple(ctx.args), + ) + sys.exit(exit_code) + except Exception as e: + sys.exit(handle_exception(e)) + + def main() -> None: """Main CLI entry point.""" try: From 61ba4df5630e200aa4713182956abf1852cd7838 Mon Sep 17 00:00:00 2001 From: Shudipto Trafder Date: Wed, 13 May 2026 23:46:58 +0600 Subject: [PATCH 2/4] feat: add eval command for agent evaluations with report generation --- agentflow_cli/cli/commands/eval.py | 259 ++++++++++++++++++ agentflow_cli/cli/commands/test.py | 2 +- agentflow_cli/cli/core/config.py | 11 + agentflow_cli/cli/main.py | 68 ++++- .../templates/prod/evals/weather_agents.py | 38 ++- 5 files changed, 358 insertions(+), 20 deletions(-) create mode 100644 agentflow_cli/cli/commands/eval.py diff --git a/agentflow_cli/cli/commands/eval.py b/agentflow_cli/cli/commands/eval.py new file mode 100644 index 0000000..26c690e --- /dev/null +++ b/agentflow_cli/cli/commands/eval.py @@ -0,0 +1,259 @@ +"""Eval command — discover and run agentflow evaluations, always generating reports.""" + +from __future__ import annotations + +import asyncio +import importlib +import importlib.util +import inspect +import sys +import webbrowser +from pathlib import Path +from typing import TYPE_CHECKING, Any + +from agentflow_cli.cli.commands import BaseCommand +from agentflow_cli.cli.core.config import ConfigManager + + +if TYPE_CHECKING: + from agentflow.qa.evaluation.eval_result import EvalReport + + +class EvalCommand(BaseCommand): + """Discover and run agent evaluations; always write HTML + JSON reports.""" + + # ------------------------------------------------------------------ + # Discovery + # ------------------------------------------------------------------ + + def _discover(self, target: Path) -> list[Path]: + """Return eval files under target. If target is a file, return it directly.""" + if target.is_file(): + return [target] + + seen: dict[Path, None] = {} + for pattern in ("*_eval.py", "eval_*.py"): + for p in sorted(target.rglob(pattern)): + seen[p] = None + return list(seen) + + # ------------------------------------------------------------------ + # Module loading + # ------------------------------------------------------------------ + + def _load_module(self, path: Path) -> Any: + project_root = str(Path.cwd()) + if project_root not in sys.path: + sys.path.insert(0, project_root) + + spec = importlib.util.spec_from_file_location("_agentflow_eval", path) + mod = importlib.util.module_from_spec(spec) # type: ignore[arg-type] + spec.loader.exec_module(mod) # type: ignore[union-attr] + return mod + + # ------------------------------------------------------------------ + # Agent loading from agentflow.json + # ------------------------------------------------------------------ + + def _load_agent_from_config(self) -> Any: + config_manager = ConfigManager() + discovered = config_manager.auto_discover_config() + if not discovered: + raise RuntimeError("No agentflow.json found — cannot auto-load agent.") + config_manager.load_config(str(discovered)) + agent_spec: str = config_manager.get_config_value("agent", default="") + if not agent_spec or ":" not in agent_spec: + raise RuntimeError(f"Invalid 'agent' field in agentflow.json: {agent_spec!r}") + module_path, attr = agent_spec.rsplit(":", 1) + mod = importlib.import_module(module_path) + return getattr(mod, attr) + + # ------------------------------------------------------------------ + # Per-file runner (sync wrapper — avoids nested asyncio.run() issues) + # ------------------------------------------------------------------ + + def _run_file_sync(self, path: Path) -> EvalReport | None: + """Load and run a single eval file. Returns EvalReport or None if skipped.""" + self.output.info(f"Running: {path.name}", emoji=False) + mod = self._load_module(path) + + # Option A: run() function — module has full control + if hasattr(mod, "run"): + result = mod.run() + if inspect.isawaitable(result): + return asyncio.run(result) + return result # type: ignore[return-value] + + # Option B: get_eval_set() + get_eval_config() — CLI loads agent + if hasattr(mod, "get_eval_set") and hasattr(mod, "get_eval_config"): + return self._run_with_evaluator(mod, mod.get_eval_set(), mod.get_eval_config()) + + # Option C: EVAL_CONFIG constant + get_eval_set() + if hasattr(mod, "EVAL_CONFIG") and hasattr(mod, "get_eval_set"): + return self._run_with_evaluator(mod, mod.get_eval_set(), mod.EVAL_CONFIG) + + self.output.warning( + f"Skipping {path.name} — no run(), get_eval_set()+get_eval_config(), " + "or EVAL_CONFIG+get_eval_set() found." + ) + return None + + def _run_with_evaluator(self, mod: Any, eval_set: Any, config: Any) -> EvalReport: + from agentflow.qa.evaluation.collectors.trajectory_collector import TrajectoryCollector + from agentflow.qa.evaluation.evaluator import AgentEvaluator + + # Prefer the graph already imported in the module (most common pattern) + graph = getattr(mod, "app", None) or self._load_agent_from_config() + collector = TrajectoryCollector(capture_all_events=True) + evaluator = AgentEvaluator(graph, collector, config=config) + return asyncio.run(evaluator.evaluate(eval_set)) + + # ------------------------------------------------------------------ + # Report merging + # ------------------------------------------------------------------ + + def _merge_reports(self, reports: list[EvalReport]) -> EvalReport: + if len(reports) == 1: + return reports[0] + + from agentflow.qa.evaluation.eval_result import EvalReport as ER + + all_results = [] + for r in reports: + all_results.extend(r.results) + return ER.create( + eval_set_id="combined_eval", + eval_set_name="Combined Evaluation", + results=all_results, + ) + + # ------------------------------------------------------------------ + # Eval directory from agentflow.json + # ------------------------------------------------------------------ + + def _resolve_eval_dir(self) -> Path: + config_manager = ConfigManager() + discovered = config_manager.auto_discover_config() + directory = "evals" + if discovered: + try: + config_manager.load_config(str(discovered)) + eval_cfg = config_manager.get_evaluation_config() + directory = eval_cfg.get("directory", "evals") + except Exception: + pass + return Path.cwd() / directory + + # ------------------------------------------------------------------ + # Main entry point + # ------------------------------------------------------------------ + + def execute( # noqa: PLR0912, PLR0915 + self, + target: str | None = None, + output_dir: str = "eval_reports", + no_report: bool = False, + threshold: float | None = None, + open_report: bool = False, + verbose: bool = False, + quiet: bool = False, + **kwargs: Any, + ) -> int: + # Load optional overrides from agentflow.json + config_manager = ConfigManager() + discovered = config_manager.auto_discover_config() + if discovered: + try: + config_manager.load_config(str(discovered)) + eval_cfg = config_manager.get_evaluation_config() + if output_dir == "eval_reports": + output_dir = eval_cfg.get("output_dir", output_dir) + if threshold is None: + threshold = eval_cfg.get("threshold") + except Exception: + pass + + # Resolve target path + if target: + target_path = Path(target) + if not target_path.exists(): + self.output.error(f"Path not found: {target}") + return 1 + else: + target_path = self._resolve_eval_dir() + if not target_path.exists(): + self.output.error( + f"Eval directory '{target_path}' not found. " + "Create an evals/ directory or pass a file/folder path." + ) + return 1 + + files = self._discover(target_path) + if not files: + self.output.error(f"No eval files found in {target_path}") + return 1 + + self.output.print_banner("Eval", f"Found {len(files)} eval file(s) in {target_path}") + + # Run each file + reports: list[EvalReport] = [] + for f in files: + try: + report = self._run_file_sync(f) + if report is not None: + reports.append(report) + except Exception as exc: + self.output.error(f"Error in {f.name}: {exc}") + self.logger.exception("Eval file failed: %s", f) + + if not reports: + self.output.error("No reports produced. Ensure eval files expose run().") + return 1 + + merged = self._merge_reports(reports) + + # Determine exit code + if threshold is not None and merged.summary.pass_rate < threshold: + self.output.error( + f"Pass rate {merged.summary.pass_rate:.1%} is below " + f"threshold {threshold:.1%}" + ) + return_code = 1 + else: + return_code = 0 if merged.summary.pass_rate == 1.0 else 1 + + # Always generate file reports unless --no-report + if not no_report: + from agentflow.qa.evaluation.config.eval_config import ReporterConfig + from agentflow.qa.evaluation.reporters.manager import ReporterManager + + manager = ReporterManager( + ReporterConfig( + output_dir=output_dir, + html=True, + json_report=True, + console=False, # console output already handled by run() modules + timestamp_files=True, + ) + ) + result = manager.run_all(merged) + + if result.html_path: + self.output.success(f"HTML report: {result.html_path}") + if result.json_path: + self.output.info(f"JSON report: {result.json_path}", emoji=False) + if result.has_errors: + for name, err in result.errors: + self.output.warning(f"Reporter error [{name}]: {err}") + + if open_report and result.html_path: + webbrowser.open(Path(result.html_path).as_uri()) + + summary = merged.summary + self.output.info( + f"Results: {summary.passed_cases}/{summary.total_cases} passed " + f"({summary.pass_rate:.1%})", + emoji=False, + ) + + return return_code diff --git a/agentflow_cli/cli/commands/test.py b/agentflow_cli/cli/commands/test.py index 73b4b11..1ffce9f 100644 --- a/agentflow_cli/cli/commands/test.py +++ b/agentflow_cli/cli/commands/test.py @@ -72,7 +72,7 @@ def execute( self.logger.info("Running: %s", " ".join(cmd)) - result = subprocess.run(cmd, cwd=project_root) # noqa: PLW1510 + result = subprocess.run(cmd, cwd=project_root) # noqa: PLW1510, S603 if result.returncode == 0: self.output.success("All tests passed.") diff --git a/agentflow_cli/cli/core/config.py b/agentflow_cli/cli/core/config.py index ad8f3d2..792c530 100644 --- a/agentflow_cli/cli/core/config.py +++ b/agentflow_cli/cli/core/config.py @@ -225,6 +225,17 @@ def get_test_config(self) -> dict[str, Any]: return {} return raw + def get_evaluation_config(self) -> dict[str, Any]: + """Return the optional 'evaluation' section from agentflow.json. + + Returns a dict with keys: directory, output_dir, threshold, timestamp_files. + All fields are optional; callers should use .get() with their own defaults. + """ + raw = self.get_config_value("evaluation", default={}) + if not isinstance(raw, dict): + return {} + return raw + def resolve_env_file(self) -> Path | None: """Resolve environment file path from configuration. diff --git a/agentflow_cli/cli/main.py b/agentflow_cli/cli/main.py index ecd44b5..3ee7aed 100644 --- a/agentflow_cli/cli/main.py +++ b/agentflow_cli/cli/main.py @@ -7,6 +7,7 @@ from agentflow_cli.cli.commands.api import APICommand from agentflow_cli.cli.commands.build import BuildCommand +from agentflow_cli.cli.commands.eval import EvalCommand from agentflow_cli.cli.commands.init import InitCommand from agentflow_cli.cli.commands.skills import SkillsCommand from agentflow_cli.cli.commands.test import TestCommand @@ -368,14 +369,14 @@ def skills( ) def test( ctx: typer.Context, - path: str | None = typer.Argument(None, help="Path to tests directory or file (omit to let pytest auto-discover)"), + path: str | None = typer.Argument( + None, help="Path to tests directory or file (omit to let pytest auto-discover)" + ), coverage: bool = typer.Option(False, "--coverage", "-C", help="Run with coverage"), html: bool = typer.Option( False, "--html", help="Open HTML coverage report after run (requires --coverage)" ), - keyword: str | None = typer.Option( - None, "-k", help="Only run tests matching this expression" - ), + keyword: str | None = typer.Option(None, "-k", help="Only run tests matching this expression"), verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output"), quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress all output except errors"), ) -> None: @@ -401,6 +402,65 @@ def test( sys.exit(handle_exception(e)) +@app.command() +def eval( + target: str | None = typer.Argument( + None, + help="File or directory to evaluate (default: evals/ from agentflow.json or cwd)", + ), + output_dir: str = typer.Option( + "eval_reports", + "--output", + "-o", + help="Directory for generated report files", + ), + no_report: bool = typer.Option( + False, + "--no-report", + help="Skip file report generation (console summary only)", + ), + threshold: float | None = typer.Option( + None, + "--threshold", + "-t", + help="Fail if overall pass rate is below this value (0.0–1.0)", + ), + open_report: bool = typer.Option( + False, + "--open", + help="Open the HTML report in the default browser after the run", + ), + verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output"), + quiet: bool = typer.Option(False, "--quiet", "-q", help="Suppress all output except errors"), +) -> None: + """Run agent evaluations. + + Discovers *_eval.py / eval_*.py files in the target directory (default: evals/). + Always generates HTML + JSON reports in eval_reports/ unless --no-report is set. + + Each eval file must expose one of: + run() # full control, returns EvalReport + get_eval_set() + get_eval_config() # CLI loads agent from agentflow.json + EVAL_CONFIG + get_eval_set() # same, config as a constant + """ + setup_cli_logging(verbose=verbose, quiet=quiet) + + try: + command = EvalCommand(output) + exit_code = command.execute( + target=target, + output_dir=output_dir, + no_report=no_report, + threshold=threshold, + open_report=open_report, + verbose=verbose, + quiet=quiet, + ) + sys.exit(exit_code) + except Exception as e: + sys.exit(handle_exception(e)) + + def main() -> None: """Main CLI entry point.""" try: diff --git a/agentflow_cli/cli/templates/prod/evals/weather_agents.py b/agentflow_cli/cli/templates/prod/evals/weather_agents.py index 35a72f0..62e56c3 100644 --- a/agentflow_cli/cli/templates/prod/evals/weather_agents.py +++ b/agentflow_cli/cli/templates/prod/evals/weather_agents.py @@ -92,28 +92,36 @@ def build_weather_agent_eval_config() -> EvalConfig: ) -def run_weather_agent_eval(config: EvalConfig | None = None): - """Run the current compiled graph against the weather-agent eval set.""" - config = config or build_weather_agent_eval_config() +def get_eval_set() -> EvalSet: + """Return the eval set for CLI discovery (agentflow eval).""" + return build_weather_agent_eval_set() + + +def get_eval_config() -> EvalConfig: + """Return the eval config for CLI discovery (agentflow eval).""" + return build_weather_agent_eval_config() + + +def run() -> ReporterOutput: + """Entry point for agentflow eval CLI discovery. + + Runs the full eval and returns the reporter output (HTML + JSON + JUnit). + """ + config = build_weather_agent_eval_config() collector = TrajectoryCollector(capture_all_events=True) - return QuickEval.run_sync( + report = QuickEval.run_sync( graph=app, collector=collector, eval_set=build_weather_agent_eval_set(), config=config, + print_results=True, ) - - -def write_weather_agent_eval_reports(report=None) -> ReporterOutput: - """Write JSON, HTML, and JUnit XML report files for a completed eval report.""" - config = build_weather_agent_eval_config() - eval_report = report or run_weather_agent_eval(config=config) EVAL_REPORT_DIR.mkdir(parents=True, exist_ok=True) - return ReporterManager(config.reporter).run_all(eval_report, output_dir=str(EVAL_REPORT_DIR)) + return ReporterManager(config.reporter).run_all(report, output_dir=str(EVAL_REPORT_DIR)) if __name__ == "__main__": - output = write_weather_agent_eval_reports() - print(f"JSON report: {output.json_path}") # noqa: T201 - print(f"HTML report: {output.html_path}") # noqa: T201 - print(f"JUnit report: {output.junit_path}") # noqa: T201 + reporter_output = run() + print(f"JSON report: {reporter_output.json_path}") # noqa: T201 + print(f"HTML report: {reporter_output.html_path}") # noqa: T201 + print(f"JUnit report: {reporter_output.junit_path}") # noqa: T201 From c99dfcec4dec3789477f6eb18385af873e735b7f Mon Sep 17 00:00:00 2001 From: Shudipto Trafder Date: Thu, 14 May 2026 00:09:14 +0600 Subject: [PATCH 3/4] feat: enhance eval command with default configuration and new evaluation structure --- agentflow_cli/cli/commands/eval.py | 38 ++++-- .../cli/templates/prod/agentflow.json | 11 ++ .../templates/prod/evals/weather_agents.py | 127 ------------------ .../prod/evals/weather_agents_eval.py | 36 +++++ .../templates/prod/tests/test_agent_eval.py | 40 +++--- 5 files changed, 87 insertions(+), 165 deletions(-) delete mode 100644 agentflow_cli/cli/templates/prod/evals/weather_agents.py create mode 100644 agentflow_cli/cli/templates/prod/evals/weather_agents_eval.py diff --git a/agentflow_cli/cli/commands/eval.py b/agentflow_cli/cli/commands/eval.py index 26c690e..65b0ce8 100644 --- a/agentflow_cli/cli/commands/eval.py +++ b/agentflow_cli/cli/commands/eval.py @@ -72,30 +72,42 @@ def _load_agent_from_config(self) -> Any: # Per-file runner (sync wrapper — avoids nested asyncio.run() issues) # ------------------------------------------------------------------ + def _default_config(self) -> Any: + from agentflow.qa.evaluation import CriterionConfig, EvalConfig, MatchType + + return EvalConfig( + criteria={ + "response": CriterionConfig(threshold=0.6, match_type=MatchType.ANY_ORDER), + "tool_usage": CriterionConfig( + threshold=1.0, match_type=MatchType.EXACT, check_args=False + ), + "node_order": CriterionConfig(threshold=0.8, match_type=MatchType.IN_ORDER), + }, + ) + def _run_file_sync(self, path: Path) -> EvalReport | None: """Load and run a single eval file. Returns EvalReport or None if skipped.""" self.output.info(f"Running: {path.name}", emoji=False) mod = self._load_module(path) - # Option A: run() function — module has full control + # Option A: run() — module has full control if hasattr(mod, "run"): result = mod.run() if inspect.isawaitable(result): return asyncio.run(result) return result # type: ignore[return-value] - # Option B: get_eval_set() + get_eval_config() — CLI loads agent - if hasattr(mod, "get_eval_set") and hasattr(mod, "get_eval_config"): - return self._run_with_evaluator(mod, mod.get_eval_set(), mod.get_eval_config()) - - # Option C: EVAL_CONFIG constant + get_eval_set() - if hasattr(mod, "EVAL_CONFIG") and hasattr(mod, "get_eval_set"): - return self._run_with_evaluator(mod, mod.get_eval_set(), mod.EVAL_CONFIG) - - self.output.warning( - f"Skipping {path.name} — no run(), get_eval_set()+get_eval_config(), " - "or EVAL_CONFIG+get_eval_set() found." - ) + # Option B: get_eval_set() — config is optional; CLI fills in the default + if hasattr(mod, "get_eval_set"): + if hasattr(mod, "get_eval_config"): + config = mod.get_eval_config() + elif hasattr(mod, "EVAL_CONFIG"): + config = mod.EVAL_CONFIG + else: + config = self._default_config() + return self._run_with_evaluator(mod, mod.get_eval_set(), config) + + self.output.warning(f"Skipping {path.name} — no run() or get_eval_set() found.") return None def _run_with_evaluator(self, mod: Any, eval_set: Any, config: Any) -> EvalReport: diff --git a/agentflow_cli/cli/templates/prod/agentflow.json b/agentflow_cli/cli/templates/prod/agentflow.json index 918b003..d0423c5 100644 --- a/agentflow_cli/cli/templates/prod/agentflow.json +++ b/agentflow_cli/cli/templates/prod/agentflow.json @@ -15,5 +15,16 @@ "by": "ip", "trusted_proxy_headers": false, "exclude_paths": ["/health", "/docs", "/redoc", "/openapi.json"] + }, + "test": { + "path": "tests", + "coverage": true, + "coverage_threshold": 70 + }, + "evaluation": { + "directory": "evals", + "output_dir": "eval_reports", + "threshold": 0.75, + "timestamp_files": true } } diff --git a/agentflow_cli/cli/templates/prod/evals/weather_agents.py b/agentflow_cli/cli/templates/prod/evals/weather_agents.py deleted file mode 100644 index 62e56c3..0000000 --- a/agentflow_cli/cli/templates/prod/evals/weather_agents.py +++ /dev/null @@ -1,127 +0,0 @@ -from pathlib import Path - -from agentflow.qa.evaluation import ( - CriterionConfig, - EvalConfig, - EvalSet, - EvalSetBuilder, - MatchType, - QuickEval, - ReporterConfig, - ReporterManager, - ReporterOutput, - TrajectoryCollector, -) - -from graph.agent import app - - -EVAL_REPORT_DIR = Path("eval_reports") - - -def build_weather_agent_eval_set() -> EvalSet: - """Build deterministic behavior checks for the weather agent.""" - return ( - EvalSetBuilder(name="weather-agent-regression") - .add_case( - query="Hi", - expected="assistant", - expected_node_order=["MAIN"], - name="greeting_response", - description="The agent responds to a greeting.", - ) - .add_tool_test( - query="What is the weather in London?", - tool_name="get_weather", - tool_args={"location": "London"}, - expected_response="London", - case_id="weather_london", - ) - .add_tool_test( - query="Tell me the current weather in New York", - tool_name="get_weather", - tool_args={"location": "New York"}, - expected_response="New York", - case_id="weather_new_york", - ) - .add_tool_test( - query="How is the weather in Tokyo today?", - tool_name="get_weather", - tool_args={"location": "Tokyo"}, - expected_response="Tokyo", - case_id="weather_tokyo", - ) - .build() - ) - - -def build_weather_agent_eval_config() -> EvalConfig: - """Configure fast non-judge criteria for regression evaluation.""" - return EvalConfig( - criteria={ - "response": CriterionConfig( - threshold=0.6, - match_type=MatchType.ANY_ORDER, - ), - "tool_usage": CriterionConfig( - threshold=1.0, - match_type=MatchType.EXACT, - check_args=False, - ), - "node_order": CriterionConfig( - threshold=0.8, - match_type=MatchType.IN_ORDER, - ), - }, - mock_mode=True, - verbose=True, - reporter=ReporterConfig( - enabled=True, - output_dir=str(EVAL_REPORT_DIR), - console=True, - json_report=True, - html=True, - junit_xml=True, - include_details=True, - include_trajectory=True, - include_node_responses=True, - include_actual_response=True, - include_tool_call_details=True, - timestamp_files=True, - ), - ) - - -def get_eval_set() -> EvalSet: - """Return the eval set for CLI discovery (agentflow eval).""" - return build_weather_agent_eval_set() - - -def get_eval_config() -> EvalConfig: - """Return the eval config for CLI discovery (agentflow eval).""" - return build_weather_agent_eval_config() - - -def run() -> ReporterOutput: - """Entry point for agentflow eval CLI discovery. - - Runs the full eval and returns the reporter output (HTML + JSON + JUnit). - """ - config = build_weather_agent_eval_config() - collector = TrajectoryCollector(capture_all_events=True) - report = QuickEval.run_sync( - graph=app, - collector=collector, - eval_set=build_weather_agent_eval_set(), - config=config, - print_results=True, - ) - EVAL_REPORT_DIR.mkdir(parents=True, exist_ok=True) - return ReporterManager(config.reporter).run_all(report, output_dir=str(EVAL_REPORT_DIR)) - - -if __name__ == "__main__": - reporter_output = run() - print(f"JSON report: {reporter_output.json_path}") # noqa: T201 - print(f"HTML report: {reporter_output.html_path}") # noqa: T201 - print(f"JUnit report: {reporter_output.junit_path}") # noqa: T201 diff --git a/agentflow_cli/cli/templates/prod/evals/weather_agents_eval.py b/agentflow_cli/cli/templates/prod/evals/weather_agents_eval.py new file mode 100644 index 0000000..b863c26 --- /dev/null +++ b/agentflow_cli/cli/templates/prod/evals/weather_agents_eval.py @@ -0,0 +1,36 @@ +from agentflow.qa.evaluation import EvalSet, EvalSetBuilder + + +def get_eval_set() -> EvalSet: + return ( + EvalSetBuilder(name="weather-agent-regression") + .add_case( + query="Hi", + expected="assistant", + expected_node_order=["MAIN"], + name="greeting_response", + description="The agent responds to a greeting.", + ) + .add_tool_test( + query="What is the weather in London?", + tool_name="get_weather", + tool_args={"location": "London"}, + expected_response="London", + case_id="weather_london", + ) + .add_tool_test( + query="Tell me the current weather in New York", + tool_name="get_weather", + tool_args={"location": "New York"}, + expected_response="New York", + case_id="weather_new_york", + ) + .add_tool_test( + query="How is the weather in Tokyo today?", + tool_name="get_weather", + tool_args={"location": "Tokyo"}, + expected_response="Tokyo", + case_id="weather_tokyo", + ) + .build() + ) diff --git a/agentflow_cli/cli/templates/prod/tests/test_agent_eval.py b/agentflow_cli/cli/templates/prod/tests/test_agent_eval.py index b4a5a34..b1fed1b 100644 --- a/agentflow_cli/cli/templates/prod/tests/test_agent_eval.py +++ b/agentflow_cli/cli/templates/prod/tests/test_agent_eval.py @@ -3,18 +3,20 @@ import asyncio from agentflow.qa import QuickTest -from agentflow.qa.evaluation import CriterionResult, EvalCaseResult, EvalReport, EvalSummary - -from evals.weather_agents import ( - EVAL_REPORT_DIR, - build_weather_agent_eval_config, - build_weather_agent_eval_set, - write_weather_agent_eval_reports, +from agentflow.qa.evaluation import ( + CriterionResult, + EvalCaseResult, + EvalReport, + EvalSummary, + ReporterConfig, + ReporterManager, ) +from evals.weather_agents_eval import get_eval_set + -def test_weather_agent_eval_set_covers_core_weather_behaviors() -> None: - eval_set = build_weather_agent_eval_set() +def test_eval_set_covers_core_weather_behaviors() -> None: + eval_set = get_eval_set() case_ids = {case.eval_id for case in eval_set.eval_cases} case_names = {case.name for case in eval_set.eval_cases} @@ -27,20 +29,7 @@ def test_weather_agent_eval_set_covers_core_weather_behaviors() -> None: assert "weather_tokyo" in case_ids -def test_weather_agent_eval_config_uses_deterministic_criteria() -> None: - config = build_weather_agent_eval_config() - - assert config.mock_mode is True - assert set(config.criteria) == {"response", "tool_usage", "node_order"} - assert config.criteria["tool_usage"].check_args is False - assert config.reporter.output_dir == str(EVAL_REPORT_DIR) - assert config.reporter.json_report is True - assert config.reporter.html is True - assert config.reporter.junit_xml is True - - -def test_write_weather_agent_eval_reports_creates_report_artifacts(tmp_path, monkeypatch) -> None: - monkeypatch.setattr("evals.weather_agents.EVAL_REPORT_DIR", tmp_path) +def test_reporter_writes_html_and_json_artifacts(tmp_path) -> None: report = EvalReport( eval_set_id="weather-agent-regression", eval_set_name="weather-agent-regression", @@ -69,11 +58,12 @@ def test_write_weather_agent_eval_reports_creates_report_artifacts(tmp_path, mon ), ) - output = write_weather_agent_eval_reports(report) + output = ReporterManager( + ReporterConfig(output_dir=str(tmp_path), html=True, json_report=True, console=False) + ).run_all(report, output_dir=str(tmp_path)) assert output.json_path is not None assert output.html_path is not None - assert output.junit_path is not None assert "weather-agent-regression" in tmp_path.joinpath(output.json_path).read_text( encoding="utf-8" ) From d6e6448da59b5d1628b5d1c1fc4112a1f36f52ed Mon Sep 17 00:00:00 2001 From: Shudipto Trafder Date: Thu, 14 May 2026 00:24:12 +0600 Subject: [PATCH 4/4] feat: refactor eval and test commands for improved configuration handling and command naming --- agentflow_cli/cli/commands/eval.py | 30 +++++++++++++++--------------- agentflow_cli/cli/commands/test.py | 8 ++++---- agentflow_cli/cli/main.py | 6 +++--- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/agentflow_cli/cli/commands/eval.py b/agentflow_cli/cli/commands/eval.py index 65b0ce8..36ae6d0 100644 --- a/agentflow_cli/cli/commands/eval.py +++ b/agentflow_cli/cli/commands/eval.py @@ -11,6 +11,13 @@ from pathlib import Path from typing import TYPE_CHECKING, Any +from agentflow.qa.evaluation import CriterionConfig, EvalConfig, MatchType +from agentflow.qa.evaluation.collectors.trajectory_collector import TrajectoryCollector +from agentflow.qa.evaluation.config.eval_config import ReporterConfig +from agentflow.qa.evaluation.eval_result import EvalReport as ER +from agentflow.qa.evaluation.evaluator import AgentEvaluator +from agentflow.qa.evaluation.reporters.manager import ReporterManager + from agentflow_cli.cli.commands import BaseCommand from agentflow_cli.cli.core.config import ConfigManager @@ -73,8 +80,6 @@ def _load_agent_from_config(self) -> Any: # ------------------------------------------------------------------ def _default_config(self) -> Any: - from agentflow.qa.evaluation import CriterionConfig, EvalConfig, MatchType - return EvalConfig( criteria={ "response": CriterionConfig(threshold=0.6, match_type=MatchType.ANY_ORDER), @@ -94,7 +99,7 @@ def _run_file_sync(self, path: Path) -> EvalReport | None: if hasattr(mod, "run"): result = mod.run() if inspect.isawaitable(result): - return asyncio.run(result) + return asyncio.run(result) # type: ignore[arg-type] return result # type: ignore[return-value] # Option B: get_eval_set() — config is optional; CLI fills in the default @@ -111,9 +116,6 @@ def _run_file_sync(self, path: Path) -> EvalReport | None: return None def _run_with_evaluator(self, mod: Any, eval_set: Any, config: Any) -> EvalReport: - from agentflow.qa.evaluation.collectors.trajectory_collector import TrajectoryCollector - from agentflow.qa.evaluation.evaluator import AgentEvaluator - # Prefer the graph already imported in the module (most common pattern) graph = getattr(mod, "app", None) or self._load_agent_from_config() collector = TrajectoryCollector(capture_all_events=True) @@ -128,8 +130,6 @@ def _merge_reports(self, reports: list[EvalReport]) -> EvalReport: if len(reports) == 1: return reports[0] - from agentflow.qa.evaluation.eval_result import EvalReport as ER - all_results = [] for r in reports: all_results.extend(r.results) @@ -153,7 +153,9 @@ def _resolve_eval_dir(self) -> Path: eval_cfg = config_manager.get_evaluation_config() directory = eval_cfg.get("directory", "evals") except Exception: - pass + self.logger.warning( + "Failed to load eval directory from config; using default 'evals/'" + ) return Path.cwd() / directory # ------------------------------------------------------------------ @@ -183,7 +185,9 @@ def execute( # noqa: PLR0912, PLR0915 if threshold is None: threshold = eval_cfg.get("threshold") except Exception: - pass + self.logger.warning( + "Failed to load eval config from agentflow.json; using defaults" + ) # Resolve target path if target: @@ -227,8 +231,7 @@ def execute( # noqa: PLR0912, PLR0915 # Determine exit code if threshold is not None and merged.summary.pass_rate < threshold: self.output.error( - f"Pass rate {merged.summary.pass_rate:.1%} is below " - f"threshold {threshold:.1%}" + f"Pass rate {merged.summary.pass_rate:.1%} is below threshold {threshold:.1%}" ) return_code = 1 else: @@ -236,9 +239,6 @@ def execute( # noqa: PLR0912, PLR0915 # Always generate file reports unless --no-report if not no_report: - from agentflow.qa.evaluation.config.eval_config import ReporterConfig - from agentflow.qa.evaluation.reporters.manager import ReporterManager - manager = ReporterManager( ReporterConfig( output_dir=output_dir, diff --git a/agentflow_cli/cli/commands/test.py b/agentflow_cli/cli/commands/test.py index 1ffce9f..9dc8f2e 100644 --- a/agentflow_cli/cli/commands/test.py +++ b/agentflow_cli/cli/commands/test.py @@ -1,6 +1,6 @@ """Test command implementation — thin pytest wrapper.""" -import subprocess +import subprocess # nosec: B404 import sys import webbrowser from pathlib import Path @@ -34,8 +34,8 @@ def execute( try: config_manager.load_config(str(discovered)) cfg = config_manager.get_test_config() - except Exception: # noqa: S110 - pass # config absent or invalid — proceed with CLI defaults + except Exception: # nosec: B110 + self.logger.warning("Failed to load test configuration from %s", discovered) # Explicit CLI path wins; fall back to agentflow.json; None = pytest auto-discovery resolved_path: str | None = path or cfg.get("path") or None @@ -72,7 +72,7 @@ def execute( self.logger.info("Running: %s", " ".join(cmd)) - result = subprocess.run(cmd, cwd=project_root) # noqa: PLW1510, S603 + result = subprocess.run(cmd, cwd=project_root, check=False) # nosec: B603 # noqa: S603 if result.returncode == 0: self.output.success("All tests passed.") diff --git a/agentflow_cli/cli/main.py b/agentflow_cli/cli/main.py index 3ee7aed..4311fe0 100644 --- a/agentflow_cli/cli/main.py +++ b/agentflow_cli/cli/main.py @@ -402,8 +402,8 @@ def test( sys.exit(handle_exception(e)) -@app.command() -def eval( +@app.command(name="eval") +def eval_cmd( target: str | None = typer.Argument( None, help="File or directory to evaluate (default: evals/ from agentflow.json or cwd)", @@ -423,7 +423,7 @@ def eval( None, "--threshold", "-t", - help="Fail if overall pass rate is below this value (0.0–1.0)", + help="Fail if overall pass rate is below this value (0.0-1.0)", ), open_report: bool = typer.Option( False,