From ef6e7c446baf6066ea223acd7bf711b7660a3948 Mon Sep 17 00:00:00 2001 From: "T.J Ariyawansa" Date: Sun, 10 May 2026 19:56:30 -0400 Subject: [PATCH 01/18] feat(evaluation): add DatasetClient and ServiceDatasetProvider Add Dataset Management SDK support with: - DatasetClient: pass-through client for all 11 dataset APIs (create/get/list/update/delete datasets, versions, examples) with 6 _and_wait helpers for async operations - ServiceDatasetProvider: fetches datasets from the service and returns SDK Dataset objects compatible with OnDemandEvaluationDatasetRunner and BatchEvaluationRunner - Unit tests (20 tests) and integration tests (17 tests, verified against live AWS) --- src/bedrock_agentcore/evaluation/__init__.py | 4 + .../evaluation/dataset_client.py | 258 ++++++++++++++++++ .../evaluation/runner/dataset_providers.py | 41 ++- .../evaluation/test_dataset_client.py | 248 +++++++++++++++++ .../test_service_dataset_provider.py | 130 +++++++++ tests_integ/evaluation/test_dataset_client.py | 179 ++++++++++++ .../test_service_dataset_provider.py | 167 ++++++++++++ 7 files changed, 1026 insertions(+), 1 deletion(-) create mode 100644 src/bedrock_agentcore/evaluation/dataset_client.py create mode 100644 tests/bedrock_agentcore/evaluation/test_dataset_client.py create mode 100644 tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py create mode 100644 tests_integ/evaluation/test_dataset_client.py create mode 100644 tests_integ/evaluation/test_service_dataset_provider.py diff --git a/src/bedrock_agentcore/evaluation/__init__.py b/src/bedrock_agentcore/evaluation/__init__.py index 57b0c277..7b4d0970 100644 --- a/src/bedrock_agentcore/evaluation/__init__.py +++ b/src/bedrock_agentcore/evaluation/__init__.py @@ -20,9 +20,11 @@ from bedrock_agentcore.evaluation.runner.batch.batch_evaluation_runner import ( BatchEvaluationRunner, ) +from bedrock_agentcore.evaluation.dataset_client import DatasetClient from bedrock_agentcore.evaluation.runner.dataset_providers import ( DatasetProvider, FileDatasetProvider, + ServiceDatasetProvider, ) from bedrock_agentcore.evaluation.runner.dataset_types import ( ActorProfile, @@ -77,6 +79,7 @@ "AgentInvokerOutput", "CloudWatchAgentSpanCollector", "Dataset", + "DatasetClient", "DatasetProvider", "EvaluationClient", "EvaluationResult", @@ -86,6 +89,7 @@ "EvaluatorOutput", "EvaluatorResult", "FileDatasetProvider", + "ServiceDatasetProvider", "Input", "OnDemandEvaluationDatasetRunner", "ReferenceInputs", diff --git a/src/bedrock_agentcore/evaluation/dataset_client.py b/src/bedrock_agentcore/evaluation/dataset_client.py new file mode 100644 index 00000000..19ba9e4b --- /dev/null +++ b/src/bedrock_agentcore/evaluation/dataset_client.py @@ -0,0 +1,258 @@ +"""DatasetClient for managing evaluation datasets.""" + +import logging +from typing import Any, Dict, Optional + +import boto3 +from botocore.config import Config + +from bedrock_agentcore._utils.config import WaitConfig +from bedrock_agentcore._utils.polling import wait_until, wait_until_deleted +from bedrock_agentcore._utils.snake_case import accept_snake_case_kwargs, convert_kwargs +from bedrock_agentcore._utils.user_agent import build_user_agent_suffix + +logger = logging.getLogger(__name__) + +_DATASET_FAILED_STATUSES = {"CREATE_FAILED", "UPDATE_FAILED"} + + +class DatasetClient: + """Client for managing evaluation datasets. + + Provides pass-through access to all dataset management APIs on the + bedrock-agentcore-control service, plus *_and_wait helpers for async operations. + + Example:: + + client = DatasetClient(region_name="us-west-2") + + # Create a dataset and wait for ACTIVE + dataset = client.create_dataset_and_wait( + datasetName="my-dataset", + schemaType="AGENTCORE_EVALUATION_PREDEFINED_V1", + source={"inlineExamples": {"examples": [...]}}, + ) + + # Pass-through to any dataset API + client.list_datasets(maxResults=10) + client.add_dataset_examples(datasetId="ds-123", examples=[...]) + """ + + _ALLOWED_CP_METHODS = { + # Dataset CRUD + "create_dataset", + "get_dataset", + "list_datasets", + "update_dataset", + "delete_dataset", + # Version management + "create_dataset_version", + "list_dataset_versions", + "delete_dataset_version", + # Examples management + "add_dataset_examples", + "update_dataset_examples", + "delete_dataset_examples", + "list_dataset_examples", + } + + def __init__( + self, + region_name: Optional[str] = None, + integration_source: Optional[str] = None, + boto3_session: Optional[boto3.Session] = None, + ): + """Initialize the DatasetClient. + + Args: + region_name: AWS region. Falls back to boto3 session region or us-west-2. + integration_source: Optional integration framework identifier for telemetry. + boto3_session: Optional boto3 Session. If not provided, a default is created. + """ + session = boto3_session if boto3_session else boto3.Session() + self.region_name = region_name or session.region_name or "us-west-2" + self.integration_source = integration_source + + user_agent_extra = build_user_agent_suffix(integration_source) + client_config = Config(user_agent_extra=user_agent_extra) + + self._cp_client = session.client( + "bedrock-agentcore-control", + region_name=self.region_name, + config=client_config, + ) + + logger.info("Initialized DatasetClient in region %s", self.region_name) + + def __getattr__(self, name: str): + """Dynamically forward allowlisted method calls to the boto3 client.""" + if name in self._ALLOWED_CP_METHODS and hasattr(self._cp_client, name): + method = getattr(self._cp_client, name) + logger.debug("Forwarding method '%s' to _cp_client", name) + return accept_snake_case_kwargs(method) + + raise AttributeError( + f"'{self.__class__.__name__}' object has no attribute '{name}'. " + f"Method not found on control plane client. " + f"Available methods can be found in the boto3 documentation for " + f"'bedrock-agentcore-control' service." + ) + + # *_and_wait methods + # ------------------------------------------------------------------------- + + def create_dataset_and_wait( + self, + wait_config: Optional[WaitConfig] = None, + **kwargs, + ) -> Dict[str, Any]: + """Create a dataset and wait for it to reach ACTIVE status. + + Args: + wait_config: Optional WaitConfig for polling behavior. + **kwargs: Arguments forwarded to the create_dataset API. + + Returns: + Dataset details when ACTIVE. + + Raises: + RuntimeError: If the dataset reaches CREATE_FAILED status. + TimeoutError: If the dataset doesn't become ACTIVE within max_wait. + """ + response = self._cp_client.create_dataset(**convert_kwargs(kwargs)) + dataset_id = response["datasetId"] + return wait_until( + lambda: self._cp_client.get_dataset(datasetId=dataset_id), + "ACTIVE", + {"CREATE_FAILED"}, + wait_config, + ) + + def delete_dataset_and_wait( + self, + wait_config: Optional[WaitConfig] = None, + **kwargs, + ) -> None: + """Delete a dataset and wait for deletion to complete. + + Args: + wait_config: Optional WaitConfig for polling behavior. + **kwargs: Arguments forwarded to the delete_dataset API. + + Raises: + TimeoutError: If the dataset isn't deleted within max_wait. + """ + response = self._cp_client.delete_dataset(**convert_kwargs(kwargs)) + dataset_id = response["datasetId"] + wait_until_deleted( + lambda: self._cp_client.get_dataset(datasetId=dataset_id), + wait_config=wait_config, + ) + + def create_dataset_version_and_wait( + self, + wait_config: Optional[WaitConfig] = None, + **kwargs, + ) -> Dict[str, Any]: + """Create a dataset version and wait for the dataset to reach ACTIVE status. + + Args: + wait_config: Optional WaitConfig for polling behavior. + **kwargs: Arguments forwarded to the create_dataset_version API. + + Returns: + Dataset details when ACTIVE. + + Raises: + RuntimeError: If the dataset reaches UPDATE_FAILED status. + TimeoutError: If the dataset doesn't become ACTIVE within max_wait. + """ + response = self._cp_client.create_dataset_version(**convert_kwargs(kwargs)) + dataset_id = response["datasetId"] + return wait_until( + lambda: self._cp_client.get_dataset(datasetId=dataset_id), + "ACTIVE", + {"UPDATE_FAILED"}, + wait_config, + ) + + def add_examples_and_wait( + self, + wait_config: Optional[WaitConfig] = None, + **kwargs, + ) -> Dict[str, Any]: + """Add examples to a dataset and wait for ACTIVE status. + + Args: + wait_config: Optional WaitConfig for polling behavior. + **kwargs: Arguments forwarded to the add_dataset_examples API. + + Returns: + Dataset details when ACTIVE. + + Raises: + RuntimeError: If the dataset reaches UPDATE_FAILED status. + TimeoutError: If the dataset doesn't become ACTIVE within max_wait. + """ + response = self._cp_client.add_dataset_examples(**convert_kwargs(kwargs)) + dataset_id = response["datasetId"] + return wait_until( + lambda: self._cp_client.get_dataset(datasetId=dataset_id), + "ACTIVE", + {"UPDATE_FAILED"}, + wait_config, + ) + + def update_examples_and_wait( + self, + wait_config: Optional[WaitConfig] = None, + **kwargs, + ) -> Dict[str, Any]: + """Update examples in a dataset and wait for ACTIVE status. + + Args: + wait_config: Optional WaitConfig for polling behavior. + **kwargs: Arguments forwarded to the update_dataset_examples API. + + Returns: + Dataset details when ACTIVE. + + Raises: + RuntimeError: If the dataset reaches UPDATE_FAILED status. + TimeoutError: If the dataset doesn't become ACTIVE within max_wait. + """ + response = self._cp_client.update_dataset_examples(**convert_kwargs(kwargs)) + dataset_id = response["datasetId"] + return wait_until( + lambda: self._cp_client.get_dataset(datasetId=dataset_id), + "ACTIVE", + {"UPDATE_FAILED"}, + wait_config, + ) + + def delete_examples_and_wait( + self, + wait_config: Optional[WaitConfig] = None, + **kwargs, + ) -> Dict[str, Any]: + """Delete examples from a dataset and wait for ACTIVE status. + + Args: + wait_config: Optional WaitConfig for polling behavior. + **kwargs: Arguments forwarded to the delete_dataset_examples API. + + Returns: + Dataset details when ACTIVE. + + Raises: + RuntimeError: If the dataset reaches UPDATE_FAILED status. + TimeoutError: If the dataset doesn't become ACTIVE within max_wait. + """ + response = self._cp_client.delete_dataset_examples(**convert_kwargs(kwargs)) + dataset_id = response["datasetId"] + return wait_until( + lambda: self._cp_client.get_dataset(datasetId=dataset_id), + "ACTIVE", + {"UPDATE_FAILED"}, + wait_config, + ) diff --git a/src/bedrock_agentcore/evaluation/runner/dataset_providers.py b/src/bedrock_agentcore/evaluation/runner/dataset_providers.py index f8cb2cb4..60501435 100644 --- a/src/bedrock_agentcore/evaluation/runner/dataset_providers.py +++ b/src/bedrock_agentcore/evaluation/runner/dataset_providers.py @@ -2,7 +2,7 @@ import json from abc import ABC, abstractmethod -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional from .dataset_types import ActorProfile, Dataset, PredefinedScenario, Scenario, SimulatedScenario, Turn @@ -62,3 +62,42 @@ def _parse_turn(raw: Dict[str, Any]) -> Turn: input=raw["input"], expected_response=raw.get("expected_response"), ) + + +class ServiceDatasetProvider(DatasetProvider): + """A dataset provider that loads a Dataset from the Dataset Management service.""" + + def __init__(self, dataset_id: str, version_id: Optional[str] = None, region_name: Optional[str] = None): + """Initialize with a dataset ID and optional version. + + Args: + dataset_id: The dataset ID to fetch. + version_id: Optional version ID. If omitted, fetches DRAFT. + region_name: AWS region. Falls back to boto3 session region or us-west-2. + """ + self._dataset_id = dataset_id + self._version_id = version_id + self._region_name = region_name + + def get_dataset(self) -> Dataset: + """Load and return the dataset from the Dataset Management service.""" + from bedrock_agentcore.evaluation.dataset_client import DatasetClient + + client = DatasetClient(region_name=self._region_name) + + # Fetch all examples via pagination + all_examples: List[Dict[str, Any]] = [] + kwargs: Dict[str, Any] = {"datasetId": self._dataset_id} + if self._version_id: + kwargs["datasetVersion"] = self._version_id + + while True: + response = client.list_dataset_examples(**kwargs) + all_examples.extend(response.get("examples", [])) + next_token = response.get("nextToken") + if not next_token: + break + kwargs["nextToken"] = next_token + + scenarios: List[Scenario] = [FileDatasetProvider._parse_scenario(example) for example in all_examples] + return Dataset(scenarios=scenarios) diff --git a/tests/bedrock_agentcore/evaluation/test_dataset_client.py b/tests/bedrock_agentcore/evaluation/test_dataset_client.py new file mode 100644 index 00000000..a5e69ad4 --- /dev/null +++ b/tests/bedrock_agentcore/evaluation/test_dataset_client.py @@ -0,0 +1,248 @@ +"""Tests for DatasetClient.""" + +from unittest.mock import MagicMock, patch + +import pytest + +from bedrock_agentcore._utils.config import WaitConfig +from bedrock_agentcore.evaluation.dataset_client import DatasetClient + + +class TestDatasetClientInit: + @patch("bedrock_agentcore.evaluation.dataset_client.boto3") + def test_default_region(self, mock_boto3): + mock_session = MagicMock() + mock_session.region_name = "us-east-1" + mock_boto3.Session.return_value = mock_session + + client = DatasetClient() + assert client.region_name == "us-east-1" + mock_session.client.assert_called_once() + + @patch("bedrock_agentcore.evaluation.dataset_client.boto3") + def test_explicit_region(self, mock_boto3): + mock_session = MagicMock() + mock_boto3.Session.return_value = mock_session + + client = DatasetClient(region_name="eu-west-1") + assert client.region_name == "eu-west-1" + + @patch("bedrock_agentcore.evaluation.dataset_client.boto3") + def test_custom_session(self, mock_boto3): + custom_session = MagicMock() + custom_session.region_name = "ap-southeast-1" + + client = DatasetClient(boto3_session=custom_session) + assert client.region_name == "ap-southeast-1" + custom_session.client.assert_called_once() + + +class TestDatasetClientPassthrough: + @patch("bedrock_agentcore.evaluation.dataset_client.boto3") + def test_allowed_method_forwarded(self, mock_boto3): + mock_session = MagicMock() + mock_boto3.Session.return_value = mock_session + mock_cp = mock_session.client.return_value + mock_cp.list_datasets.return_value = {"datasets": []} + + client = DatasetClient() + result = client.list_datasets(maxResults=10) + mock_cp.list_datasets.assert_called_once_with(maxResults=10) + + @patch("bedrock_agentcore.evaluation.dataset_client.boto3") + def test_snake_case_converted(self, mock_boto3): + mock_session = MagicMock() + mock_boto3.Session.return_value = mock_session + mock_cp = mock_session.client.return_value + mock_cp.list_datasets.return_value = {"datasets": []} + + client = DatasetClient() + client.list_datasets(max_results=5) + mock_cp.list_datasets.assert_called_once_with(maxResults=5) + + @patch("bedrock_agentcore.evaluation.dataset_client.boto3") + def test_non_allowed_method_raises(self, mock_boto3): + mock_session = MagicMock() + mock_boto3.Session.return_value = mock_session + + client = DatasetClient() + with pytest.raises(AttributeError, match="not_a_real_method"): + client.not_a_real_method() + + +class TestDatasetClientCreateAndWait: + @patch("bedrock_agentcore.evaluation.dataset_client.boto3") + def test_create_dataset_and_wait_success(self, mock_boto3): + mock_session = MagicMock() + mock_boto3.Session.return_value = mock_session + mock_cp = mock_session.client.return_value + + mock_cp.create_dataset.return_value = {"datasetId": "ds-123"} + mock_cp.get_dataset.return_value = {"datasetId": "ds-123", "status": "ACTIVE"} + + client = DatasetClient() + result = client.create_dataset_and_wait( + wait_config=WaitConfig(max_wait=10, poll_interval=1), + datasetName="test", + schemaType="AGENTCORE_EVALUATION_PREDEFINED_V1", + ) + + assert result["status"] == "ACTIVE" + mock_cp.create_dataset.assert_called_once() + mock_cp.get_dataset.assert_called_with(datasetId="ds-123") + + @patch("bedrock_agentcore.evaluation.dataset_client.boto3") + def test_create_dataset_and_wait_failure(self, mock_boto3): + mock_session = MagicMock() + mock_boto3.Session.return_value = mock_session + mock_cp = mock_session.client.return_value + + mock_cp.create_dataset.return_value = {"datasetId": "ds-123"} + mock_cp.get_dataset.return_value = { + "datasetId": "ds-123", + "status": "CREATE_FAILED", + "statusReasons": "Invalid source", + } + + client = DatasetClient() + with pytest.raises(RuntimeError, match="CREATE_FAILED"): + client.create_dataset_and_wait( + wait_config=WaitConfig(max_wait=5, poll_interval=1), + datasetName="test", + ) + + @patch("bedrock_agentcore.evaluation.dataset_client.boto3") + def test_create_dataset_and_wait_timeout(self, mock_boto3): + mock_session = MagicMock() + mock_boto3.Session.return_value = mock_session + mock_cp = mock_session.client.return_value + + mock_cp.create_dataset.return_value = {"datasetId": "ds-123"} + mock_cp.get_dataset.return_value = {"datasetId": "ds-123", "status": "CREATING"} + + client = DatasetClient() + with pytest.raises(TimeoutError): + client.create_dataset_and_wait( + wait_config=WaitConfig(max_wait=1, poll_interval=1), + datasetName="test", + ) + + +class TestDatasetClientDeleteAndWait: + @patch("bedrock_agentcore.evaluation.dataset_client.boto3") + def test_delete_dataset_and_wait_success(self, mock_boto3): + from botocore.exceptions import ClientError + + mock_session = MagicMock() + mock_boto3.Session.return_value = mock_session + mock_cp = mock_session.client.return_value + + mock_cp.delete_dataset.return_value = {"datasetId": "ds-123"} + mock_cp.get_dataset.side_effect = ClientError( + {"Error": {"Code": "ResourceNotFoundException", "Message": "Not found"}}, + "GetDataset", + ) + + client = DatasetClient() + client.delete_dataset_and_wait( + wait_config=WaitConfig(max_wait=10, poll_interval=1), + datasetId="ds-123", + ) + + mock_cp.delete_dataset.assert_called_once_with(datasetId="ds-123") + + +class TestDatasetClientVersionAndWait: + @patch("bedrock_agentcore.evaluation.dataset_client.boto3") + def test_create_version_and_wait_success(self, mock_boto3): + mock_session = MagicMock() + mock_boto3.Session.return_value = mock_session + mock_cp = mock_session.client.return_value + + mock_cp.create_dataset_version.return_value = {"datasetId": "ds-123"} + mock_cp.get_dataset.return_value = {"datasetId": "ds-123", "status": "ACTIVE"} + + client = DatasetClient() + result = client.create_dataset_version_and_wait( + wait_config=WaitConfig(max_wait=10, poll_interval=1), + datasetId="ds-123", + ) + + assert result["status"] == "ACTIVE" + + +class TestDatasetClientExamplesAndWait: + @patch("bedrock_agentcore.evaluation.dataset_client.boto3") + def test_add_examples_and_wait_success(self, mock_boto3): + mock_session = MagicMock() + mock_boto3.Session.return_value = mock_session + mock_cp = mock_session.client.return_value + + mock_cp.add_dataset_examples.return_value = {"datasetId": "ds-123"} + mock_cp.get_dataset.return_value = {"datasetId": "ds-123", "status": "ACTIVE"} + + client = DatasetClient() + result = client.add_examples_and_wait( + wait_config=WaitConfig(max_wait=10, poll_interval=1), + datasetId="ds-123", + examples=[{"scenario_id": "s1", "turns": [{"input": "hi"}]}], + ) + + assert result["status"] == "ACTIVE" + + @patch("bedrock_agentcore.evaluation.dataset_client.boto3") + def test_update_examples_and_wait_success(self, mock_boto3): + mock_session = MagicMock() + mock_boto3.Session.return_value = mock_session + mock_cp = mock_session.client.return_value + + mock_cp.update_dataset_examples.return_value = {"datasetId": "ds-123"} + mock_cp.get_dataset.return_value = {"datasetId": "ds-123", "status": "ACTIVE"} + + client = DatasetClient() + result = client.update_examples_and_wait( + wait_config=WaitConfig(max_wait=10, poll_interval=1), + datasetId="ds-123", + examples=[{"exampleId": "e1", "scenario_id": "s1", "turns": [{"input": "hi"}]}], + ) + + assert result["status"] == "ACTIVE" + + @patch("bedrock_agentcore.evaluation.dataset_client.boto3") + def test_delete_examples_and_wait_success(self, mock_boto3): + mock_session = MagicMock() + mock_boto3.Session.return_value = mock_session + mock_cp = mock_session.client.return_value + + mock_cp.delete_dataset_examples.return_value = {"datasetId": "ds-123"} + mock_cp.get_dataset.return_value = {"datasetId": "ds-123", "status": "ACTIVE"} + + client = DatasetClient() + result = client.delete_examples_and_wait( + wait_config=WaitConfig(max_wait=10, poll_interval=1), + datasetId="ds-123", + exampleIds=["e1"], + ) + + assert result["status"] == "ACTIVE" + + @patch("bedrock_agentcore.evaluation.dataset_client.boto3") + def test_add_examples_and_wait_failure(self, mock_boto3): + mock_session = MagicMock() + mock_boto3.Session.return_value = mock_session + mock_cp = mock_session.client.return_value + + mock_cp.add_dataset_examples.return_value = {"datasetId": "ds-123"} + mock_cp.get_dataset.return_value = { + "datasetId": "ds-123", + "status": "UPDATE_FAILED", + "statusReasons": "Schema validation failed", + } + + client = DatasetClient() + with pytest.raises(RuntimeError, match="UPDATE_FAILED"): + client.add_examples_and_wait( + wait_config=WaitConfig(max_wait=5, poll_interval=1), + datasetId="ds-123", + examples=[], + ) diff --git a/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py b/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py new file mode 100644 index 00000000..27260dc1 --- /dev/null +++ b/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py @@ -0,0 +1,130 @@ +"""Tests for ServiceDatasetProvider.""" + +from unittest.mock import MagicMock, patch + +import pytest + +from bedrock_agentcore.evaluation.runner.dataset_providers import ServiceDatasetProvider +from bedrock_agentcore.evaluation.runner.dataset_types import ( + Dataset, + PredefinedScenario, + SimulatedScenario, +) + +PATCH_TARGET = "bedrock_agentcore.evaluation.dataset_client.DatasetClient" + + +class TestServiceDatasetProvider: + def _make_provider(self, mock_client_instance, dataset_id="ds-123", version_id=None, region_name="us-west-2"): + with patch(PATCH_TARGET, return_value=mock_client_instance): + provider = ServiceDatasetProvider(dataset_id=dataset_id, version_id=version_id, region_name=region_name) + return provider.get_dataset() + + def test_get_dataset_predefined(self): + mock_client = MagicMock() + mock_client.list_dataset_examples.return_value = { + "examples": [ + { + "scenario_id": "s1", + "turns": [ + {"input": "Hello", "expected_response": "Hi!"}, + ], + "assertions": ["Be polite"], + "expected_trajectory": ["greet"], + }, + { + "scenario_id": "s2", + "turns": [{"input": "What is 2+2?"}], + }, + ], + } + + dataset = self._make_provider(mock_client) + + assert isinstance(dataset, Dataset) + assert len(dataset.scenarios) == 2 + + s1 = dataset.scenarios[0] + assert isinstance(s1, PredefinedScenario) + assert s1.scenario_id == "s1" + assert len(s1.turns) == 1 + assert s1.turns[0].input == "Hello" + assert s1.turns[0].expected_response == "Hi!" + assert s1.assertions == ["Be polite"] + assert s1.expected_trajectory == ["greet"] + + s2 = dataset.scenarios[1] + assert isinstance(s2, PredefinedScenario) + assert s2.scenario_id == "s2" + + def test_get_dataset_simulated(self): + mock_client = MagicMock() + mock_client.list_dataset_examples.return_value = { + "examples": [ + { + "scenario_id": "sim-1", + "scenario_description": "Frustrated customer", + "actor_profile": { + "traits": {"personality": "impatient"}, + "context": "Waiting 3 days", + "goal": "Get a refund", + }, + "input": "I want my money back!", + "max_turns": 5, + "assertions": ["Agent should empathize"], + }, + ], + } + + dataset = self._make_provider(mock_client, dataset_id="ds-456") + + assert isinstance(dataset, Dataset) + assert len(dataset.scenarios) == 1 + scenario = dataset.scenarios[0] + assert isinstance(scenario, SimulatedScenario) + assert scenario.scenario_id == "sim-1" + assert scenario.actor_profile.goal == "Get a refund" + assert scenario.max_turns == 5 + assert scenario.assertions == ["Agent should empathize"] + + def test_get_dataset_with_pagination(self): + mock_client = MagicMock() + mock_client.list_dataset_examples.side_effect = [ + { + "examples": [{"scenario_id": "s1", "turns": [{"input": "a"}]}], + "nextToken": "token-1", + }, + { + "examples": [{"scenario_id": "s2", "turns": [{"input": "b"}]}], + }, + ] + + dataset = self._make_provider(mock_client, dataset_id="ds-789") + + assert len(dataset.scenarios) == 2 + assert mock_client.list_dataset_examples.call_count == 2 + + def test_get_dataset_with_version_id(self): + mock_client = MagicMock() + mock_client.list_dataset_examples.return_value = { + "examples": [{"scenario_id": "s1", "turns": [{"input": "hi"}]}], + } + + with patch(PATCH_TARGET, return_value=mock_client): + provider = ServiceDatasetProvider( + dataset_id="ds-123", + version_id="v-1", + region_name="us-west-2", + ) + provider.get_dataset() + + call_kwargs = mock_client.list_dataset_examples.call_args[1] + assert call_kwargs["datasetId"] == "ds-123" + assert call_kwargs["datasetVersion"] == "v-1" + + def test_get_dataset_empty_raises(self): + mock_client = MagicMock() + mock_client.list_dataset_examples.return_value = {"examples": []} + + with pytest.raises(ValueError, match="scenarios must not be empty"): + self._make_provider(mock_client, dataset_id="ds-empty") diff --git a/tests_integ/evaluation/test_dataset_client.py b/tests_integ/evaluation/test_dataset_client.py new file mode 100644 index 00000000..908055d9 --- /dev/null +++ b/tests_integ/evaluation/test_dataset_client.py @@ -0,0 +1,179 @@ +"""Integration tests for DatasetClient. + +Run with: + uv run pytest tests_integ/evaluation/test_dataset_client.py -xvs --log-cli-level=INFO +""" + +import os +import time + +import pytest +from botocore.exceptions import ClientError + +from bedrock_agentcore.evaluation.dataset_client import DatasetClient + + +@pytest.mark.integration +class TestDatasetClientPassthrough: + """Read-only passthrough tests. No resources needed.""" + + @classmethod + def setup_class(cls): + cls.region = os.environ.get("BEDROCK_TEST_REGION", "us-west-2") + cls.client = DatasetClient(region_name=cls.region) + + @pytest.mark.order(1) + def test_list_datasets_passthrough(self): + response = self.client.list_datasets() + assert "datasets" in response + + @pytest.mark.order(2) + def test_list_datasets_snake_case(self): + response = self.client.list_datasets(max_results=5) + assert "datasets" in response + + @pytest.mark.order(3) + def test_get_nonexistent_dataset_raises(self): + with pytest.raises(ClientError) as exc_info: + self.client.get_dataset(datasetId="nonexistent-dataset-id") + assert exc_info.value.response["Error"]["Code"] == "ResourceNotFoundException" + + @pytest.mark.order(4) + def test_non_allowlisted_method_raises(self): + with pytest.raises(AttributeError): + self.client.not_a_real_method() + + +@pytest.mark.integration +class TestDatasetCrud: + """Full CRUD lifecycle tests for datasets.""" + + @classmethod + def setup_class(cls): + cls.region = os.environ.get("BEDROCK_TEST_REGION", "us-west-2") + cls.client = DatasetClient(region_name=cls.region) + cls.test_prefix = f"sdk_integ_{int(time.time())}" + cls.dataset_ids = [] + + @classmethod + def teardown_class(cls): + for did in cls.dataset_ids: + try: + cls.client.delete_dataset(datasetId=did) + except Exception as e: + print(f"Failed to delete dataset {did}: {e}") + + @pytest.mark.order(5) + def test_create_dataset_and_wait_inline(self): + """Create a dataset with inline examples and wait for ACTIVE.""" + dataset = self.client.create_dataset_and_wait( + datasetName=f"{self.test_prefix}_predefined", + schemaType="AGENTCORE_EVALUATION_PREDEFINED_V1", + source={ + "inlineExamples": { + "examples": [ + { + "scenario_id": "test-scenario-1", + "turns": [ + {"input": "Hello", "expected_response": "Hi there!"} + ], + } + ] + } + }, + ) + self.__class__.dataset_ids.append(dataset["datasetId"]) + assert dataset["status"] == "ACTIVE" + + @pytest.mark.order(6) + def test_get_dataset(self): + """Get dataset metadata.""" + if not self.dataset_ids: + pytest.skip("prerequisite test did not create dataset") + dataset = self.client.get_dataset(datasetId=self.dataset_ids[0]) + assert dataset["datasetId"] == self.dataset_ids[0] + assert dataset["status"] == "ACTIVE" + + @pytest.mark.order(7) + def test_list_datasets_contains_created(self): + """List datasets includes the one we created.""" + if not self.dataset_ids: + pytest.skip("prerequisite test did not create dataset") + response = self.client.list_datasets() + ids = [d["datasetId"] for d in response["datasets"]] + assert self.dataset_ids[0] in ids + + @pytest.mark.order(8) + def test_update_dataset_metadata(self): + """Update dataset description (sync operation).""" + if not self.dataset_ids: + pytest.skip("prerequisite test did not create dataset") + response = self.client.update_dataset( + datasetId=self.dataset_ids[0], + description="updated by integ test", + ) + assert response["datasetId"] == self.dataset_ids[0] + # Verify via get + dataset = self.client.get_dataset(datasetId=self.dataset_ids[0]) + assert dataset.get("description") == "updated by integ test" + + @pytest.mark.order(9) + def test_add_examples_and_wait(self): + """Add examples to dataset and wait for ACTIVE.""" + if not self.dataset_ids: + pytest.skip("prerequisite test did not create dataset") + dataset = self.client.add_examples_and_wait( + datasetId=self.dataset_ids[0], + source={ + "inlineExamples": { + "examples": [ + { + "scenario_id": "test-scenario-2", + "turns": [ + {"input": "What is 2+2?", "expected_response": "4"} + ], + } + ] + } + }, + ) + assert dataset["status"] == "ACTIVE" + + @pytest.mark.order(10) + def test_list_examples(self): + """List examples in dataset.""" + if not self.dataset_ids: + pytest.skip("prerequisite test did not create dataset") + response = self.client.list_dataset_examples(datasetId=self.dataset_ids[0]) + assert "examples" in response + assert len(response["examples"]) >= 2 + + @pytest.mark.order(11) + def test_create_dataset_version_and_wait(self): + """Create a version from DRAFT and wait for ACTIVE.""" + if not self.dataset_ids: + pytest.skip("prerequisite test did not create dataset") + dataset = self.client.create_dataset_version_and_wait( + datasetId=self.dataset_ids[0], + ) + assert dataset["status"] == "ACTIVE" + + @pytest.mark.order(12) + def test_list_dataset_versions(self): + """List versions of the dataset.""" + if not self.dataset_ids: + pytest.skip("prerequisite test did not create dataset") + response = self.client.list_dataset_versions(datasetId=self.dataset_ids[0]) + assert "versions" in response + assert len(response["versions"]) >= 1 + + @pytest.mark.order(13) + def test_delete_dataset_and_wait(self): + """Delete dataset and wait for removal.""" + if not self.dataset_ids: + pytest.skip("prerequisite test did not create dataset") + did = self.dataset_ids.pop(0) + self.client.delete_dataset_and_wait(datasetId=did) + with pytest.raises(ClientError) as exc_info: + self.client.get_dataset(datasetId=did) + assert exc_info.value.response["Error"]["Code"] == "ResourceNotFoundException" diff --git a/tests_integ/evaluation/test_service_dataset_provider.py b/tests_integ/evaluation/test_service_dataset_provider.py new file mode 100644 index 00000000..770f10ad --- /dev/null +++ b/tests_integ/evaluation/test_service_dataset_provider.py @@ -0,0 +1,167 @@ +"""Integration tests for ServiceDatasetProvider. + +Tests that ServiceDatasetProvider can fetch a dataset from the service +and return it as an SDK Dataset object usable by OnDemandEvaluationDatasetRunner. + +Run with: + uv run pytest tests_integ/evaluation/test_service_dataset_provider.py -xvs --log-cli-level=INFO +""" + +import os +import time + +import pytest + +from bedrock_agentcore.evaluation.dataset_client import DatasetClient +from bedrock_agentcore.evaluation.runner.dataset_providers import ServiceDatasetProvider +from bedrock_agentcore.evaluation.runner.dataset_types import ( + Dataset, + PredefinedScenario, + SimulatedScenario, +) + + +@pytest.mark.integration +class TestServiceDatasetProvider: + """Tests ServiceDatasetProvider with both PREDEFINED and SYNTHETIC schema types.""" + + @classmethod + def setup_class(cls): + cls.region = os.environ.get("BEDROCK_TEST_REGION", "us-west-2") + cls.client = DatasetClient(region_name=cls.region) + cls.test_prefix = f"sdk_integ_provider_{int(time.time())}" + cls.dataset_ids = [] + + # Create a PREDEFINED_TURNS dataset + predefined = cls.client.create_dataset_and_wait( + datasetName=f"{cls.test_prefix}_predefined", + schemaType="AGENTCORE_EVALUATION_PREDEFINED_V1", + source={ + "inlineExamples": { + "examples": [ + { + "scenario_id": "greeting-scenario", + "turns": [ + {"input": "Hello", "expected_response": "Hi!"}, + {"input": "How are you?", "expected_response": "I'm good!"}, + ], + "assertions": ["Agent should be polite"], + "expected_trajectory": ["greet_user"], + }, + { + "scenario_id": "math-scenario", + "turns": [ + {"input": "What is 2+2?", "expected_response": "4"}, + ], + }, + ] + } + }, + ) + cls.predefined_dataset_id = predefined["datasetId"] + cls.dataset_ids.append(cls.predefined_dataset_id) + + # Create a version for version-specific fetch test + cls.client.create_dataset_version_and_wait(datasetId=cls.predefined_dataset_id) + + # Create a SYNTHETIC dataset + synthetic = cls.client.create_dataset_and_wait( + datasetName=f"{cls.test_prefix}_synthetic", + schemaType="AGENTCORE_EVALUATION_SIMULATED_V1", + source={ + "inlineExamples": { + "examples": [ + { + "scenario_id": "frustrated-customer", + "scenario_description": "Customer wants a refund", + "actor_profile": { + "traits": {"personality": "impatient"}, + "context": "Has been waiting 3 days", + "goal": "Get a refund", + }, + "input": "I want my money back!", + "max_turns": 5, + "assertions": ["Agent should empathize"], + }, + ] + } + }, + ) + cls.synthetic_dataset_id = synthetic["datasetId"] + cls.dataset_ids.append(cls.synthetic_dataset_id) + + @classmethod + def teardown_class(cls): + for did in cls.dataset_ids: + try: + cls.client.delete_dataset(datasetId=did) + except Exception as e: + print(f"Failed to delete dataset {did}: {e}") + + # --- Predefined turns tests --- + + @pytest.mark.order(1) + def test_get_predefined_dataset(self): + """ServiceDatasetProvider returns a valid SDK Dataset with PredefinedScenarios.""" + provider = ServiceDatasetProvider( + dataset_id=self.predefined_dataset_id, + region_name=self.region, + ) + dataset = provider.get_dataset() + + assert isinstance(dataset, Dataset) + assert len(dataset.scenarios) == 2 + for scenario in dataset.scenarios: + assert isinstance(scenario, PredefinedScenario) + + @pytest.mark.order(2) + def test_predefined_fields_preserved(self): + """Scenario fields (turns, assertions, trajectory) are preserved.""" + provider = ServiceDatasetProvider( + dataset_id=self.predefined_dataset_id, + region_name=self.region, + ) + dataset = provider.get_dataset() + + greeting = next(s for s in dataset.scenarios if s.scenario_id == "greeting-scenario") + assert len(greeting.turns) == 2 + assert greeting.turns[0].input == "Hello" + assert greeting.turns[0].expected_response == "Hi!" + assert greeting.assertions == ["Agent should be polite"] + assert greeting.expected_trajectory == ["greet_user"] + + @pytest.mark.order(3) + def test_get_predefined_with_version(self): + """ServiceDatasetProvider can fetch a specific version.""" + versions_resp = self.client.list_dataset_versions(datasetId=self.predefined_dataset_id) + version_id = str(versions_resp["versions"][0]["datasetVersion"]) + + provider = ServiceDatasetProvider( + dataset_id=self.predefined_dataset_id, + version_id=version_id, + region_name=self.region, + ) + dataset = provider.get_dataset() + + assert isinstance(dataset, Dataset) + assert len(dataset.scenarios) == 2 + + # --- Synthetic tests --- + + @pytest.mark.order(4) + def test_get_synthetic_dataset(self): + """ServiceDatasetProvider returns SimulatedScenarios for SYNTHETIC schema.""" + provider = ServiceDatasetProvider( + dataset_id=self.synthetic_dataset_id, + region_name=self.region, + ) + dataset = provider.get_dataset() + + assert isinstance(dataset, Dataset) + assert len(dataset.scenarios) == 1 + scenario = dataset.scenarios[0] + assert isinstance(scenario, SimulatedScenario) + assert scenario.scenario_id == "frustrated-customer" + assert scenario.actor_profile.goal == "Get a refund" + assert scenario.max_turns == 5 + assert scenario.assertions == ["Agent should empathize"] From 71d0f02fee725c484df3698f65bf145b17f956d2 Mon Sep 17 00:00:00 2001 From: "T.J Ariyawansa" Date: Sun, 10 May 2026 20:10:40 -0400 Subject: [PATCH 02/18] refactor(evaluation): use downloadUrl for ServiceDatasetProvider Switch ServiceDatasetProvider from list_dataset_examples pagination to downloading the JSONL file via the presigned downloadUrl from GetDataset. Single HTTP request is simpler and faster for large datasets. --- .../evaluation/runner/dataset_providers.py | 29 ++-- .../test_service_dataset_provider.py | 146 +++++++++--------- 2 files changed, 91 insertions(+), 84 deletions(-) diff --git a/src/bedrock_agentcore/evaluation/runner/dataset_providers.py b/src/bedrock_agentcore/evaluation/runner/dataset_providers.py index 60501435..43b0f8ce 100644 --- a/src/bedrock_agentcore/evaluation/runner/dataset_providers.py +++ b/src/bedrock_agentcore/evaluation/runner/dataset_providers.py @@ -4,6 +4,8 @@ from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional +import requests + from .dataset_types import ActorProfile, Dataset, PredefinedScenario, Scenario, SimulatedScenario, Turn @@ -80,24 +82,31 @@ def __init__(self, dataset_id: str, version_id: Optional[str] = None, region_nam self._region_name = region_name def get_dataset(self) -> Dataset: - """Load and return the dataset from the Dataset Management service.""" + """Load and return the dataset from the Dataset Management service. + + Fetches the dataset via the presigned download URL returned by GetDataset. + The URL points to a JSONL file where each line is one example. + """ from bedrock_agentcore.evaluation.dataset_client import DatasetClient client = DatasetClient(region_name=self._region_name) - # Fetch all examples via pagination - all_examples: List[Dict[str, Any]] = [] kwargs: Dict[str, Any] = {"datasetId": self._dataset_id} if self._version_id: kwargs["datasetVersion"] = self._version_id - while True: - response = client.list_dataset_examples(**kwargs) - all_examples.extend(response.get("examples", [])) - next_token = response.get("nextToken") - if not next_token: - break - kwargs["nextToken"] = next_token + response = client.get_dataset(**kwargs) + download_url = response.get("downloadUrl") + if not download_url: + raise ValueError(f"Dataset {self._dataset_id} has no downloadUrl. Status: {response.get('status')}") + + r = requests.get(download_url) + r.raise_for_status() + + all_examples: List[Dict[str, Any]] = [] + for line in r.text.strip().split("\n"): + if line: + all_examples.append(json.loads(line)) scenarios: List[Scenario] = [FileDatasetProvider._parse_scenario(example) for example in all_examples] return Dataset(scenarios=scenarios) diff --git a/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py b/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py index 27260dc1..8328613b 100644 --- a/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py +++ b/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py @@ -11,35 +11,50 @@ SimulatedScenario, ) -PATCH_TARGET = "bedrock_agentcore.evaluation.dataset_client.DatasetClient" +PATCH_CLIENT = "bedrock_agentcore.evaluation.dataset_client.DatasetClient" +PATCH_REQUESTS = "bedrock_agentcore.evaluation.runner.dataset_providers.requests" + + +def _jsonl(*examples): + """Build a JSONL string from example dicts.""" + import json + + return "\n".join(json.dumps(e) for e in examples) class TestServiceDatasetProvider: - def _make_provider(self, mock_client_instance, dataset_id="ds-123", version_id=None, region_name="us-west-2"): - with patch(PATCH_TARGET, return_value=mock_client_instance): - provider = ServiceDatasetProvider(dataset_id=dataset_id, version_id=version_id, region_name=region_name) - return provider.get_dataset() + def _make_provider(self, jsonl_content, dataset_id="ds-123", version_id=None, get_response=None): + mock_client = MagicMock() + if get_response is None: + get_response = {"datasetId": dataset_id, "status": "ACTIVE", "downloadUrl": "https://example.com/dataset.jsonl"} + mock_client.get_dataset.return_value = get_response + + mock_response = MagicMock() + mock_response.text = jsonl_content + mock_response.raise_for_status = MagicMock() + + with patch(PATCH_CLIENT, return_value=mock_client), patch(PATCH_REQUESTS) as mock_requests: + mock_requests.get.return_value = mock_response + provider = ServiceDatasetProvider(dataset_id=dataset_id, version_id=version_id, region_name="us-west-2") + return provider.get_dataset(), mock_client, mock_requests def test_get_dataset_predefined(self): - mock_client = MagicMock() - mock_client.list_dataset_examples.return_value = { - "examples": [ - { - "scenario_id": "s1", - "turns": [ - {"input": "Hello", "expected_response": "Hi!"}, - ], - "assertions": ["Be polite"], - "expected_trajectory": ["greet"], - }, - { - "scenario_id": "s2", - "turns": [{"input": "What is 2+2?"}], - }, - ], - } + content = _jsonl( + { + "exampleId": "e1", + "scenario_id": "s1", + "turns": [{"input": "Hello", "expected_response": "Hi!"}], + "assertions": ["Be polite"], + "expected_trajectory": ["greet"], + }, + { + "exampleId": "e2", + "scenario_id": "s2", + "turns": [{"input": "What is 2+2?"}], + }, + ) - dataset = self._make_provider(mock_client) + dataset, mock_client, mock_requests = self._make_provider(content) assert isinstance(dataset, Dataset) assert len(dataset.scenarios) == 2 @@ -47,7 +62,6 @@ def test_get_dataset_predefined(self): s1 = dataset.scenarios[0] assert isinstance(s1, PredefinedScenario) assert s1.scenario_id == "s1" - assert len(s1.turns) == 1 assert s1.turns[0].input == "Hello" assert s1.turns[0].expected_response == "Hi!" assert s1.assertions == ["Be polite"] @@ -58,25 +72,23 @@ def test_get_dataset_predefined(self): assert s2.scenario_id == "s2" def test_get_dataset_simulated(self): - mock_client = MagicMock() - mock_client.list_dataset_examples.return_value = { - "examples": [ - { - "scenario_id": "sim-1", - "scenario_description": "Frustrated customer", - "actor_profile": { - "traits": {"personality": "impatient"}, - "context": "Waiting 3 days", - "goal": "Get a refund", - }, - "input": "I want my money back!", - "max_turns": 5, - "assertions": ["Agent should empathize"], + content = _jsonl( + { + "exampleId": "e1", + "scenario_id": "sim-1", + "scenario_description": "Frustrated customer", + "actor_profile": { + "traits": {"personality": "impatient"}, + "context": "Waiting 3 days", + "goal": "Get a refund", }, - ], - } + "input": "I want my money back!", + "max_turns": 5, + "assertions": ["Agent should empathize"], + }, + ) - dataset = self._make_provider(mock_client, dataset_id="ds-456") + dataset, _, _ = self._make_provider(content, dataset_id="ds-456") assert isinstance(dataset, Dataset) assert len(dataset.scenarios) == 1 @@ -87,44 +99,30 @@ def test_get_dataset_simulated(self): assert scenario.max_turns == 5 assert scenario.assertions == ["Agent should empathize"] - def test_get_dataset_with_pagination(self): - mock_client = MagicMock() - mock_client.list_dataset_examples.side_effect = [ - { - "examples": [{"scenario_id": "s1", "turns": [{"input": "a"}]}], - "nextToken": "token-1", - }, - { - "examples": [{"scenario_id": "s2", "turns": [{"input": "b"}]}], - }, - ] + def test_downloads_from_presigned_url(self): + content = _jsonl({"scenario_id": "s1", "turns": [{"input": "hi"}]}) - dataset = self._make_provider(mock_client, dataset_id="ds-789") + _, mock_client, mock_requests = self._make_provider(content) - assert len(dataset.scenarios) == 2 - assert mock_client.list_dataset_examples.call_count == 2 + mock_client.get_dataset.assert_called_once_with(datasetId="ds-123") + mock_requests.get.assert_called_once_with("https://example.com/dataset.jsonl") def test_get_dataset_with_version_id(self): - mock_client = MagicMock() - mock_client.list_dataset_examples.return_value = { - "examples": [{"scenario_id": "s1", "turns": [{"input": "hi"}]}], - } - - with patch(PATCH_TARGET, return_value=mock_client): - provider = ServiceDatasetProvider( - dataset_id="ds-123", - version_id="v-1", - region_name="us-west-2", - ) - provider.get_dataset() - - call_kwargs = mock_client.list_dataset_examples.call_args[1] - assert call_kwargs["datasetId"] == "ds-123" - assert call_kwargs["datasetVersion"] == "v-1" + content = _jsonl({"scenario_id": "s1", "turns": [{"input": "hi"}]}) - def test_get_dataset_empty_raises(self): + _, mock_client, _ = self._make_provider(content, dataset_id="ds-123", version_id="1") + + mock_client.get_dataset.assert_called_once_with(datasetId="ds-123", datasetVersion="1") + + def test_get_dataset_no_download_url_raises(self): mock_client = MagicMock() - mock_client.list_dataset_examples.return_value = {"examples": []} + mock_client.get_dataset.return_value = {"datasetId": "ds-123", "status": "CREATING"} + + with patch(PATCH_CLIENT, return_value=mock_client), patch(PATCH_REQUESTS): + provider = ServiceDatasetProvider(dataset_id="ds-123") + with pytest.raises(ValueError, match="no downloadUrl"): + provider.get_dataset() + def test_get_dataset_empty_raises(self): with pytest.raises(ValueError, match="scenarios must not be empty"): - self._make_provider(mock_client, dataset_id="ds-empty") + self._make_provider("", dataset_id="ds-empty") From 1c7f8dcbcbfa62427670ee99e2553faef18cbafe Mon Sep 17 00:00:00 2001 From: "T.J Ariyawansa" Date: Sun, 10 May 2026 21:32:04 -0400 Subject: [PATCH 03/18] test(evaluation): add runner integ test helpers and skipped runner tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - helpers.py: get_or_create_agent_runtime(), make_agent_invoker() with retry logic and warmup for cold start handling - test_runners_with_service_dataset.py: OnDemandRunner + ServiceDatasetProvider end-to-end test (skipped until a working deployed agent is available — current account has 30s init timeout that prevents cold starts) --- tests_integ/evaluation/helpers.py | 181 ++++++++++++++++++ .../test_runners_with_service_dataset.py | 117 +++++++++++ 2 files changed, 298 insertions(+) create mode 100644 tests_integ/evaluation/helpers.py create mode 100644 tests_integ/evaluation/test_runners_with_service_dataset.py diff --git a/tests_integ/evaluation/helpers.py b/tests_integ/evaluation/helpers.py new file mode 100644 index 00000000..1b532887 --- /dev/null +++ b/tests_integ/evaluation/helpers.py @@ -0,0 +1,181 @@ +"""Shared helpers for evaluation integration tests.""" + +import json +import logging +import time +import uuid +from typing import Any, Dict + +import boto3 + +from bedrock_agentcore.evaluation.runner.invoker_types import AgentInvokerFn, AgentInvokerInput, AgentInvokerOutput + +logger = logging.getLogger(__name__) + +DEFAULT_AGENT_NAME = "sdk_integ_echo_bundled" +DEFAULT_REGION = "us-west-2" +DEFAULT_ROLE_ARN = "arn:aws:iam::619071331382:role/Kyros-Bedrock-AgentCore" +DEFAULT_S3_BUCKET = "codetest-us-west-2-619071331382-do-not-delete" +DEFAULT_S3_KEY = "integ-test-agents/echo-agent-bundled.zip" + + +def get_or_create_agent_runtime( + agent_name: str = DEFAULT_AGENT_NAME, + region: str = DEFAULT_REGION, + role_arn: str = DEFAULT_ROLE_ARN, + s3_bucket: str = DEFAULT_S3_BUCKET, + s3_key: str = DEFAULT_S3_KEY, +) -> Dict[str, str]: + """Find an existing READY agent runtime by name, or create one. + + The agent is never deleted — it's reused across test runs. + + Returns: + {"runtime_id": "...", "runtime_arn": "...", "endpoint_arn": "..."} + """ + cp = boto3.client("bedrock-agentcore-control", region_name=region) + + # Search for existing runtime by name + runtime = _find_runtime_by_name(cp, agent_name) + + if runtime is None: + logger.info("No runtime found with name=%s, creating...", agent_name) + runtime = _create_runtime(cp, agent_name, role_arn, s3_bucket, s3_key) + + runtime_id = runtime["agentRuntimeId"] + runtime_arn = runtime["agentRuntimeArn"] + logger.info("Using runtime: %s (%s)", runtime_id, runtime_arn) + + # Find or create endpoint + endpoint_arn = _get_or_create_endpoint(cp, runtime_id) + + # Warm up the agent to avoid cold start timeouts on first invoke + _warmup_agent(runtime_arn, region) + + return { + "runtime_id": runtime_id, + "runtime_arn": runtime_arn, + "endpoint_arn": endpoint_arn, + } + + +def make_agent_invoker(runtime_arn: str, region: str = DEFAULT_REGION) -> AgentInvokerFn: + """Create an AgentInvokerFn that calls the deployed agent via invoke_agent_runtime.""" + dp = boto3.client("bedrock-agentcore", region_name=region) + + def invoker(input: AgentInvokerInput) -> AgentInvokerOutput: + payload = input.payload if isinstance(input.payload, str) else json.dumps(input.payload) + last_error = None + for attempt in range(3): + try: + response = dp.invoke_agent_runtime( + agentRuntimeArn=runtime_arn, + payload=payload.encode(), + runtimeSessionId=input.session_id if input.session_id and len(input.session_id) >= 33 else str(uuid.uuid4()), + ) + body = response.get("response", "") + if hasattr(body, "read"): + body = body.read().decode() + result = json.loads(body) if body else {} + return AgentInvokerOutput(agent_output=result.get("output", result)) + except Exception as e: + last_error = e + if "initialization time exceeded" in str(e).lower() and attempt < 2: + import time as _t + _t.sleep(5) + continue + raise + raise last_error # type: ignore[misc] + + return invoker + + +# --- Private helpers --- + + +def _find_runtime_by_name(cp, agent_name: str) -> Any: + """List runtimes and find one matching the name with READY status.""" + next_token = None + while True: + kwargs: Dict[str, Any] = {"maxResults": 100} + if next_token: + kwargs["nextToken"] = next_token + response = cp.list_agent_runtimes(**kwargs) + for rt in response.get("agentRuntimes", []): + if rt.get("agentRuntimeName") == agent_name and rt.get("status") == "READY": + return rt + next_token = response.get("nextToken") + if not next_token: + break + return None + + +def _create_runtime(cp, agent_name: str, role_arn: str, s3_bucket: str, s3_key: str) -> Dict[str, Any]: + """Create a runtime and wait for READY.""" + response = cp.create_agent_runtime( + agentRuntimeName=agent_name, + roleArn=role_arn, + networkConfiguration={"networkMode": "PUBLIC"}, + agentRuntimeArtifact={ + "codeConfiguration": { + "code": {"s3": {"bucket": s3_bucket, "prefix": s3_key}}, + "runtime": "PYTHON_3_12", + "entryPoint": ["agent.py"], + } + }, + ) + runtime_id = response["agentRuntimeId"] + logger.info("Created runtime %s, waiting for READY...", runtime_id) + + for _ in range(60): + rt = cp.get_agent_runtime(agentRuntimeId=runtime_id) + status = rt["status"] + if status == "READY": + return rt + if "FAILED" in status: + raise RuntimeError(f"Runtime creation failed: {rt.get('statusReasons')}") + time.sleep(5) + + raise TimeoutError(f"Runtime {runtime_id} did not reach READY within 300s") + + +def _warmup_agent(runtime_arn: str, region: str) -> None: + """Send a warmup invoke to avoid cold start timeouts on first real call.""" + dp = boto3.client("bedrock-agentcore", region_name=region) + for attempt in range(5): + try: + resp = dp.invoke_agent_runtime( + agentRuntimeArn=runtime_arn, + payload=json.dumps({"input": "warmup"}).encode(), + runtimeSessionId=str(uuid.uuid4()), + ) + body = resp.get("response", "") + if hasattr(body, "read"): + body.read() + logger.info("Agent warmup successful on attempt %d", attempt + 1) + return + except Exception as e: + logger.debug("Warmup attempt %d failed: %s", attempt + 1, e) + time.sleep(10) + logger.warning("Agent warmup failed after 5 attempts — cold start may occur") + + +def _get_or_create_endpoint(cp, runtime_id: str) -> str: + """Find a READY endpoint or create one.""" + eps = cp.list_agent_runtime_endpoints(agentRuntimeId=runtime_id) + for ep in eps.get("runtimeEndpoints", []): + if ep.get("status") == "READY": + return ep["agentRuntimeEndpointArn"] + + # Create endpoint + logger.info("No READY endpoint found, creating...") + cp.create_agent_runtime_endpoint(agentRuntimeId=runtime_id, name="default") + + for _ in range(60): + eps = cp.list_agent_runtime_endpoints(agentRuntimeId=runtime_id) + for ep in eps.get("runtimeEndpoints", []): + if ep.get("status") == "READY": + return ep["agentRuntimeEndpointArn"] + time.sleep(5) + + raise TimeoutError(f"Endpoint for runtime {runtime_id} did not reach READY within 300s") diff --git a/tests_integ/evaluation/test_runners_with_service_dataset.py b/tests_integ/evaluation/test_runners_with_service_dataset.py new file mode 100644 index 00000000..d7d0cfa0 --- /dev/null +++ b/tests_integ/evaluation/test_runners_with_service_dataset.py @@ -0,0 +1,117 @@ +"""Integration tests for OnDemandEvaluationDatasetRunner and BatchEvaluationRunner +using ServiceDatasetProvider. + +These tests verify the full pipeline: + ServiceDatasetProvider → Dataset → Runner → Agent Invocation → Evaluation + +Requires a deployed agent runtime. Set INTEG_AGENT_NAME env var to the name of a +READY agent runtime, or the test will attempt to create one using defaults in helpers.py. + +Run with: + uv run pytest tests_integ/evaluation/test_runners_with_service_dataset.py -xvs --log-cli-level=INFO +""" + +import os +import time + +import pytest + +from bedrock_agentcore.evaluation.dataset_client import DatasetClient +from bedrock_agentcore.evaluation.runner.dataset_providers import ServiceDatasetProvider +from bedrock_agentcore.evaluation.runner.on_demand.config import EvaluationRunConfig, EvaluatorConfig +from bedrock_agentcore.evaluation.runner.on_demand.on_demand_runner import OnDemandEvaluationDatasetRunner + +from .helpers import get_or_create_agent_runtime, make_agent_invoker + + +@pytest.mark.integration +@pytest.mark.skip(reason="Requires a warm deployed agent runtime. See helpers.py for setup.") +class TestOnDemandRunnerWithServiceDataset: + """OnDemandEvaluationDatasetRunner + ServiceDatasetProvider end-to-end.""" + + @classmethod + def setup_class(cls): + cls.region = os.environ.get("BEDROCK_TEST_REGION", "us-west-2") + agent_name = os.environ.get("INTEG_AGENT_NAME", None) + + kwargs = {"region": cls.region} + if agent_name: + kwargs["agent_name"] = agent_name + + agent_info = get_or_create_agent_runtime(**kwargs) + cls.runtime_arn = agent_info["runtime_arn"] + cls.runtime_id = agent_info["runtime_id"] + + cls.client = DatasetClient(region_name=cls.region) + cls.test_prefix = f"sdk_integ_runner_{int(time.time())}" + cls.dataset_ids = [] + + ds = cls.client.create_dataset_and_wait( + datasetName=f"{cls.test_prefix}_ondemand", + schemaType="AGENTCORE_EVALUATION_PREDEFINED_V1", + source={ + "inlineExamples": { + "examples": [ + { + "scenario_id": "greeting", + "turns": [ + {"input": "Hello", "expected_response": "Hi there!"} + ], + "assertions": ["Agent should respond politely"], + }, + { + "scenario_id": "math", + "turns": [ + {"input": "What is 2+2?", "expected_response": "4"} + ], + }, + ] + } + }, + ) + cls.dataset_id = ds["datasetId"] + cls.dataset_ids.append(cls.dataset_id) + + @classmethod + def teardown_class(cls): + for did in cls.dataset_ids: + try: + cls.client.delete_dataset(datasetId=did) + except Exception as e: + print(f"Failed to delete dataset {did}: {e}") + + def test_on_demand_runner_executes_scenarios(self): + """OnDemandRunner invokes agent for each scenario from ServiceDatasetProvider.""" + provider = ServiceDatasetProvider( + dataset_id=self.dataset_id, + region_name=self.region, + ) + dataset = provider.get_dataset() + + runner = OnDemandEvaluationDatasetRunner(region=self.region) + invoker = make_agent_invoker(self.runtime_arn, self.region) + + from bedrock_agentcore.evaluation.agent_span_collector import CloudWatchAgentSpanCollector + + collector = CloudWatchAgentSpanCollector( + log_group_name=f"/aws/bedrock-agentcore/runtimes/{self.runtime_id}-DEFAULT", + region=self.region, + ) + + config = EvaluationRunConfig( + evaluator_config=EvaluatorConfig(evaluator_ids=["Builtin.Helpfulness"]), + evaluation_delay_seconds=60, + max_concurrent_scenarios=2, + ) + + result = runner.run( + config=config, + dataset=dataset, + agent_invoker=invoker, + span_collector=collector, + ) + + assert len(result.scenario_results) == 2 + scenario_ids = {r.scenario_id for r in result.scenario_results} + assert "greeting" in scenario_ids + assert "math" in scenario_ids From 7855fdaa7ead5e0cd8f211df74495515a4f25ac2 Mon Sep 17 00:00:00 2001 From: "T.J Ariyawansa" Date: Sun, 10 May 2026 21:57:31 -0400 Subject: [PATCH 04/18] test(evaluation): runner integ test passes with real agent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update test_runners_with_service_dataset.py to use env-var config: - INTEG_AGENT_RUNTIME_ARN: skips if not set - BEDROCK_TEST_REGION: region matching the agent - Verified end-to-end: ServiceDatasetProvider → OnDemandRunner → real agent invocation → COMPLETED --- .../test_runners_with_service_dataset.py | 77 +++++++++++++------ 1 file changed, 53 insertions(+), 24 deletions(-) diff --git a/tests_integ/evaluation/test_runners_with_service_dataset.py b/tests_integ/evaluation/test_runners_with_service_dataset.py index d7d0cfa0..49ffb939 100644 --- a/tests_integ/evaluation/test_runners_with_service_dataset.py +++ b/tests_integ/evaluation/test_runners_with_service_dataset.py @@ -1,46 +1,73 @@ -"""Integration tests for OnDemandEvaluationDatasetRunner and BatchEvaluationRunner -using ServiceDatasetProvider. +"""Integration tests for OnDemandEvaluationDatasetRunner using ServiceDatasetProvider. These tests verify the full pipeline: ServiceDatasetProvider → Dataset → Runner → Agent Invocation → Evaluation -Requires a deployed agent runtime. Set INTEG_AGENT_NAME env var to the name of a -READY agent runtime, or the test will attempt to create one using defaults in helpers.py. +Required env vars: + INTEG_AGENT_RUNTIME_ARN: ARN of a deployed, invokable agent runtime + BEDROCK_TEST_REGION: AWS region (must match the agent's region) + +Optional env vars: + INTEG_AGENT_LOG_GROUP: CloudWatch log group for the agent's spans + (defaults to /aws/bedrock-agentcore/runtimes/{runtime_id}-DEFAULT) Run with: + export INTEG_AGENT_RUNTIME_ARN=arn:aws:bedrock-agentcore:us-east-1:619071331382:runtime/hosted_agent_j135m-HDtuIw2zSo + export BEDROCK_TEST_REGION=us-east-1 uv run pytest tests_integ/evaluation/test_runners_with_service_dataset.py -xvs --log-cli-level=INFO """ +import json import os import time +import uuid +import boto3 import pytest from bedrock_agentcore.evaluation.dataset_client import DatasetClient from bedrock_agentcore.evaluation.runner.dataset_providers import ServiceDatasetProvider +from bedrock_agentcore.evaluation.runner.invoker_types import AgentInvokerInput, AgentInvokerOutput from bedrock_agentcore.evaluation.runner.on_demand.config import EvaluationRunConfig, EvaluatorConfig from bedrock_agentcore.evaluation.runner.on_demand.on_demand_runner import OnDemandEvaluationDatasetRunner -from .helpers import get_or_create_agent_runtime, make_agent_invoker +RUNTIME_ARN = os.environ.get("INTEG_AGENT_RUNTIME_ARN") +REGION = os.environ.get("BEDROCK_TEST_REGION", "us-west-2") + + +def _make_invoker(runtime_arn: str, region: str): + """Create an invoker that sends {prompt: ...} to the agent.""" + dp = boto3.client("bedrock-agentcore", region_name=region) + + def invoker(input: AgentInvokerInput) -> AgentInvokerOutput: + prompt = input.payload if isinstance(input.payload, str) else json.dumps(input.payload) + resp = dp.invoke_agent_runtime( + agentRuntimeArn=runtime_arn, + payload=json.dumps({"prompt": prompt}).encode(), + runtimeSessionId=input.session_id if input.session_id and len(input.session_id) >= 33 else str(uuid.uuid4()), + ) + body = resp["response"].read().decode() + result = json.loads(body) if body else {} + return AgentInvokerOutput(agent_output=result.get("result", result)) + + return invoker @pytest.mark.integration -@pytest.mark.skip(reason="Requires a warm deployed agent runtime. See helpers.py for setup.") +@pytest.mark.skipif(not RUNTIME_ARN, reason="INTEG_AGENT_RUNTIME_ARN not set") class TestOnDemandRunnerWithServiceDataset: """OnDemandEvaluationDatasetRunner + ServiceDatasetProvider end-to-end.""" @classmethod def setup_class(cls): - cls.region = os.environ.get("BEDROCK_TEST_REGION", "us-west-2") - agent_name = os.environ.get("INTEG_AGENT_NAME", None) + cls.region = REGION + cls.runtime_arn = RUNTIME_ARN + cls.runtime_id = cls.runtime_arn.split("/")[-1] - kwargs = {"region": cls.region} - if agent_name: - kwargs["agent_name"] = agent_name - - agent_info = get_or_create_agent_runtime(**kwargs) - cls.runtime_arn = agent_info["runtime_arn"] - cls.runtime_id = agent_info["runtime_id"] + cls.log_group = os.environ.get( + "INTEG_AGENT_LOG_GROUP", + f"/aws/bedrock-agentcore/runtimes/{cls.runtime_id}-DEFAULT", + ) cls.client = DatasetClient(region_name=cls.region) cls.test_prefix = f"sdk_integ_runner_{int(time.time())}" @@ -54,16 +81,12 @@ def setup_class(cls): "examples": [ { "scenario_id": "greeting", - "turns": [ - {"input": "Hello", "expected_response": "Hi there!"} - ], + "turns": [{"input": "Hello", "expected_response": "Hi there!"}], "assertions": ["Agent should respond politely"], }, { "scenario_id": "math", - "turns": [ - {"input": "What is 2+2?", "expected_response": "4"} - ], + "turns": [{"input": "What is 2+2?", "expected_response": "4"}], }, ] } @@ -80,6 +103,7 @@ def teardown_class(cls): except Exception as e: print(f"Failed to delete dataset {did}: {e}") + @pytest.mark.order(1) def test_on_demand_runner_executes_scenarios(self): """OnDemandRunner invokes agent for each scenario from ServiceDatasetProvider.""" provider = ServiceDatasetProvider( @@ -87,20 +111,21 @@ def test_on_demand_runner_executes_scenarios(self): region_name=self.region, ) dataset = provider.get_dataset() + assert len(dataset.scenarios) == 2 runner = OnDemandEvaluationDatasetRunner(region=self.region) - invoker = make_agent_invoker(self.runtime_arn, self.region) + invoker = _make_invoker(self.runtime_arn, self.region) from bedrock_agentcore.evaluation.agent_span_collector import CloudWatchAgentSpanCollector collector = CloudWatchAgentSpanCollector( - log_group_name=f"/aws/bedrock-agentcore/runtimes/{self.runtime_id}-DEFAULT", + log_group_name=self.log_group, region=self.region, ) config = EvaluationRunConfig( evaluator_config=EvaluatorConfig(evaluator_ids=["Builtin.Helpfulness"]), - evaluation_delay_seconds=60, + evaluation_delay_seconds=30, max_concurrent_scenarios=2, ) @@ -111,7 +136,11 @@ def test_on_demand_runner_executes_scenarios(self): span_collector=collector, ) + # Both scenarios should have been executed assert len(result.scenario_results) == 2 scenario_ids = {r.scenario_id for r in result.scenario_results} assert "greeting" in scenario_ids assert "math" in scenario_ids + # Both should complete (agent invocation succeeded) + for sr in result.scenario_results: + assert sr.status == "COMPLETED", f"Scenario {sr.scenario_id} failed: {sr.error}" From 43b70d963ddbcf5995ea0dc7aaea234617a275b3 Mon Sep 17 00:00:00 2001 From: "T.J Ariyawansa" Date: Sun, 10 May 2026 22:09:32 -0400 Subject: [PATCH 05/18] refactor: extract region from agent ARN, no separate BEDROCK_TEST_REGION needed --- tests_integ/evaluation/test_runners_with_service_dataset.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests_integ/evaluation/test_runners_with_service_dataset.py b/tests_integ/evaluation/test_runners_with_service_dataset.py index 49ffb939..9d0d24c9 100644 --- a/tests_integ/evaluation/test_runners_with_service_dataset.py +++ b/tests_integ/evaluation/test_runners_with_service_dataset.py @@ -5,7 +5,7 @@ Required env vars: INTEG_AGENT_RUNTIME_ARN: ARN of a deployed, invokable agent runtime - BEDROCK_TEST_REGION: AWS region (must match the agent's region) + (region is extracted from the ARN automatically) Optional env vars: INTEG_AGENT_LOG_GROUP: CloudWatch log group for the agent's spans @@ -13,7 +13,6 @@ Run with: export INTEG_AGENT_RUNTIME_ARN=arn:aws:bedrock-agentcore:us-east-1:619071331382:runtime/hosted_agent_j135m-HDtuIw2zSo - export BEDROCK_TEST_REGION=us-east-1 uv run pytest tests_integ/evaluation/test_runners_with_service_dataset.py -xvs --log-cli-level=INFO """ @@ -32,7 +31,7 @@ from bedrock_agentcore.evaluation.runner.on_demand.on_demand_runner import OnDemandEvaluationDatasetRunner RUNTIME_ARN = os.environ.get("INTEG_AGENT_RUNTIME_ARN") -REGION = os.environ.get("BEDROCK_TEST_REGION", "us-west-2") +REGION = RUNTIME_ARN.split(":")[3] if RUNTIME_ARN else os.environ.get("BEDROCK_TEST_REGION", "us-west-2") def _make_invoker(runtime_arn: str, region: str): From d354f758dd182b71d2b4d2b0991a643b6586c009 Mon Sep 17 00:00:00 2001 From: "T.J Ariyawansa" Date: Sun, 10 May 2026 22:54:45 -0400 Subject: [PATCH 06/18] fix(evaluation): address PR review feedback - ServiceDatasetProvider: accept client in __init__ (eliminates region_name) - ServiceDatasetProvider: validate schemaType against supported runner schemas - ServiceDatasetProvider: proper error message on download failure - Remove helpers.py (not needed) - Add unit tests for unsupported schema and download failure cases --- .../evaluation/runner/dataset_providers.py | 40 +++- .../test_service_dataset_provider.py | 97 ++++++++-- tests_integ/evaluation/helpers.py | 181 ------------------ .../test_runners_with_service_dataset.py | 6 +- .../test_service_dataset_provider.py | 8 +- 5 files changed, 118 insertions(+), 214 deletions(-) delete mode 100644 tests_integ/evaluation/helpers.py diff --git a/src/bedrock_agentcore/evaluation/runner/dataset_providers.py b/src/bedrock_agentcore/evaluation/runner/dataset_providers.py index 43b0f8ce..87550bbb 100644 --- a/src/bedrock_agentcore/evaluation/runner/dataset_providers.py +++ b/src/bedrock_agentcore/evaluation/runner/dataset_providers.py @@ -66,42 +66,66 @@ def _parse_turn(raw: Dict[str, Any]) -> Turn: ) +SUPPORTED_SCHEMA_TYPES = { + "AGENTCORE_EVALUATION_PREDEFINED_V1", + "AGENTCORE_EVALUATION_SIMULATED_V1", +} + + class ServiceDatasetProvider(DatasetProvider): """A dataset provider that loads a Dataset from the Dataset Management service.""" - def __init__(self, dataset_id: str, version_id: Optional[str] = None, region_name: Optional[str] = None): + def __init__(self, dataset_id: str, version_id: Optional[str] = None, client=None): """Initialize with a dataset ID and optional version. Args: dataset_id: The dataset ID to fetch. version_id: Optional version ID. If omitted, fetches DRAFT. - region_name: AWS region. Falls back to boto3 session region or us-west-2. + client: Optional DatasetClient instance. If not provided, a default is created. """ self._dataset_id = dataset_id self._version_id = version_id - self._region_name = region_name + self._client = client def get_dataset(self) -> Dataset: """Load and return the dataset from the Dataset Management service. Fetches the dataset via the presigned download URL returned by GetDataset. The URL points to a JSONL file where each line is one example. + + Raises: + ValueError: If the dataset has no downloadUrl or has an unsupported schemaType. + RuntimeError: If the dataset content cannot be downloaded. """ - from bedrock_agentcore.evaluation.dataset_client import DatasetClient + if self._client is None: + from bedrock_agentcore.evaluation.dataset_client import DatasetClient - client = DatasetClient(region_name=self._region_name) + self._client = DatasetClient() kwargs: Dict[str, Any] = {"datasetId": self._dataset_id} if self._version_id: kwargs["datasetVersion"] = self._version_id - response = client.get_dataset(**kwargs) + response = self._client.get_dataset(**kwargs) + + schema_type = response.get("schemaType") + if schema_type and schema_type not in SUPPORTED_SCHEMA_TYPES: + raise ValueError( + f"Dataset schema type '{schema_type}' is not supported by the evaluation runners. " + f"Supported types: {sorted(SUPPORTED_SCHEMA_TYPES)}" + ) + download_url = response.get("downloadUrl") if not download_url: raise ValueError(f"Dataset {self._dataset_id} has no downloadUrl. Status: {response.get('status')}") - r = requests.get(download_url) - r.raise_for_status() + try: + r = requests.get(download_url) + r.raise_for_status() + except requests.RequestException as e: + raise RuntimeError( + f"Couldn't download dataset from S3 bucket: {e}" + ) from e all_examples: List[Dict[str, Any]] = [] for line in r.text.strip().split("\n"): diff --git a/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py b/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py index 8328613b..850d3724 100644 --- a/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py +++ b/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py @@ -11,7 +11,6 @@ SimulatedScenario, ) -PATCH_CLIENT = "bedrock_agentcore.evaluation.dataset_client.DatasetClient" PATCH_REQUESTS = "bedrock_agentcore.evaluation.runner.dataset_providers.requests" @@ -22,20 +21,30 @@ def _jsonl(*examples): return "\n".join(json.dumps(e) for e in examples) +def _mock_client(get_response, jsonl_content): + """Create a mock DatasetClient and mock requests.""" + mock_client = MagicMock() + mock_client.get_dataset.return_value = get_response + return mock_client, jsonl_content + + class TestServiceDatasetProvider: - def _make_provider(self, jsonl_content, dataset_id="ds-123", version_id=None, get_response=None): + def _run_provider(self, jsonl_content, dataset_id="ds-123", version_id=None, schema_type="AGENTCORE_EVALUATION_PREDEFINED_V1"): mock_client = MagicMock() - if get_response is None: - get_response = {"datasetId": dataset_id, "status": "ACTIVE", "downloadUrl": "https://example.com/dataset.jsonl"} - mock_client.get_dataset.return_value = get_response + mock_client.get_dataset.return_value = { + "datasetId": dataset_id, + "status": "ACTIVE", + "schemaType": schema_type, + "downloadUrl": "https://example.com/dataset.jsonl", + } mock_response = MagicMock() mock_response.text = jsonl_content mock_response.raise_for_status = MagicMock() - with patch(PATCH_CLIENT, return_value=mock_client), patch(PATCH_REQUESTS) as mock_requests: + with patch(PATCH_REQUESTS) as mock_requests: mock_requests.get.return_value = mock_response - provider = ServiceDatasetProvider(dataset_id=dataset_id, version_id=version_id, region_name="us-west-2") + provider = ServiceDatasetProvider(dataset_id=dataset_id, version_id=version_id, client=mock_client) return provider.get_dataset(), mock_client, mock_requests def test_get_dataset_predefined(self): @@ -54,7 +63,7 @@ def test_get_dataset_predefined(self): }, ) - dataset, mock_client, mock_requests = self._make_provider(content) + dataset, mock_client, mock_requests = self._run_provider(content) assert isinstance(dataset, Dataset) assert len(dataset.scenarios) == 2 @@ -88,7 +97,7 @@ def test_get_dataset_simulated(self): }, ) - dataset, _, _ = self._make_provider(content, dataset_id="ds-456") + dataset, _, _ = self._run_provider(content, dataset_id="ds-456", schema_type="AGENTCORE_EVALUATION_SIMULATED_V1") assert isinstance(dataset, Dataset) assert len(dataset.scenarios) == 1 @@ -102,7 +111,7 @@ def test_get_dataset_simulated(self): def test_downloads_from_presigned_url(self): content = _jsonl({"scenario_id": "s1", "turns": [{"input": "hi"}]}) - _, mock_client, mock_requests = self._make_provider(content) + _, mock_client, mock_requests = self._run_provider(content) mock_client.get_dataset.assert_called_once_with(datasetId="ds-123") mock_requests.get.assert_called_once_with("https://example.com/dataset.jsonl") @@ -110,19 +119,71 @@ def test_downloads_from_presigned_url(self): def test_get_dataset_with_version_id(self): content = _jsonl({"scenario_id": "s1", "turns": [{"input": "hi"}]}) - _, mock_client, _ = self._make_provider(content, dataset_id="ds-123", version_id="1") + _, mock_client, _ = self._run_provider(content, dataset_id="ds-123", version_id="1") mock_client.get_dataset.assert_called_once_with(datasetId="ds-123", datasetVersion="1") def test_get_dataset_no_download_url_raises(self): mock_client = MagicMock() - mock_client.get_dataset.return_value = {"datasetId": "ds-123", "status": "CREATING"} + mock_client.get_dataset.return_value = { + "datasetId": "ds-123", + "status": "CREATING", + "schemaType": "AGENTCORE_EVALUATION_PREDEFINED_V1", + } - with patch(PATCH_CLIENT, return_value=mock_client), patch(PATCH_REQUESTS): - provider = ServiceDatasetProvider(dataset_id="ds-123") - with pytest.raises(ValueError, match="no downloadUrl"): - provider.get_dataset() + provider = ServiceDatasetProvider(dataset_id="ds-123", client=mock_client) + with pytest.raises(ValueError, match="no downloadUrl"): + provider.get_dataset() def test_get_dataset_empty_raises(self): - with pytest.raises(ValueError, match="scenarios must not be empty"): - self._make_provider("", dataset_id="ds-empty") + mock_client = MagicMock() + mock_client.get_dataset.return_value = { + "datasetId": "ds-empty", + "status": "ACTIVE", + "schemaType": "AGENTCORE_EVALUATION_PREDEFINED_V1", + "downloadUrl": "https://example.com/dataset.jsonl", + } + + mock_response = MagicMock() + mock_response.text = "" + mock_response.raise_for_status = MagicMock() + + with patch(PATCH_REQUESTS) as mock_requests: + mock_requests.get.return_value = mock_response + provider = ServiceDatasetProvider(dataset_id="ds-empty", client=mock_client) + with pytest.raises(ValueError, match="scenarios must not be empty"): + provider.get_dataset() + + def test_unsupported_schema_type_raises(self): + mock_client = MagicMock() + mock_client.get_dataset.return_value = { + "datasetId": "ds-123", + "status": "ACTIVE", + "schemaType": "RAGAS_V1", + "downloadUrl": "https://example.com/dataset.jsonl", + } + + provider = ServiceDatasetProvider(dataset_id="ds-123", client=mock_client) + with pytest.raises(ValueError, match="not supported by the evaluation runners"): + provider.get_dataset() + + def test_download_failure_raises_runtime_error(self): + import requests as real_requests + + mock_client = MagicMock() + mock_client.get_dataset.return_value = { + "datasetId": "ds-123", + "status": "ACTIVE", + "schemaType": "AGENTCORE_EVALUATION_PREDEFINED_V1", + "downloadUrl": "https://example.com/dataset.jsonl", + } + + mock_response = MagicMock() + mock_response.raise_for_status.side_effect = real_requests.HTTPError("403 Forbidden") + + with patch(PATCH_REQUESTS) as mock_requests: + mock_requests.get.return_value = mock_response + mock_requests.RequestException = real_requests.RequestException + provider = ServiceDatasetProvider(dataset_id="ds-123", client=mock_client) + with pytest.raises(RuntimeError, match="Couldn't download dataset"): + provider.get_dataset() diff --git a/tests_integ/evaluation/helpers.py b/tests_integ/evaluation/helpers.py deleted file mode 100644 index 1b532887..00000000 --- a/tests_integ/evaluation/helpers.py +++ /dev/null @@ -1,181 +0,0 @@ -"""Shared helpers for evaluation integration tests.""" - -import json -import logging -import time -import uuid -from typing import Any, Dict - -import boto3 - -from bedrock_agentcore.evaluation.runner.invoker_types import AgentInvokerFn, AgentInvokerInput, AgentInvokerOutput - -logger = logging.getLogger(__name__) - -DEFAULT_AGENT_NAME = "sdk_integ_echo_bundled" -DEFAULT_REGION = "us-west-2" -DEFAULT_ROLE_ARN = "arn:aws:iam::619071331382:role/Kyros-Bedrock-AgentCore" -DEFAULT_S3_BUCKET = "codetest-us-west-2-619071331382-do-not-delete" -DEFAULT_S3_KEY = "integ-test-agents/echo-agent-bundled.zip" - - -def get_or_create_agent_runtime( - agent_name: str = DEFAULT_AGENT_NAME, - region: str = DEFAULT_REGION, - role_arn: str = DEFAULT_ROLE_ARN, - s3_bucket: str = DEFAULT_S3_BUCKET, - s3_key: str = DEFAULT_S3_KEY, -) -> Dict[str, str]: - """Find an existing READY agent runtime by name, or create one. - - The agent is never deleted — it's reused across test runs. - - Returns: - {"runtime_id": "...", "runtime_arn": "...", "endpoint_arn": "..."} - """ - cp = boto3.client("bedrock-agentcore-control", region_name=region) - - # Search for existing runtime by name - runtime = _find_runtime_by_name(cp, agent_name) - - if runtime is None: - logger.info("No runtime found with name=%s, creating...", agent_name) - runtime = _create_runtime(cp, agent_name, role_arn, s3_bucket, s3_key) - - runtime_id = runtime["agentRuntimeId"] - runtime_arn = runtime["agentRuntimeArn"] - logger.info("Using runtime: %s (%s)", runtime_id, runtime_arn) - - # Find or create endpoint - endpoint_arn = _get_or_create_endpoint(cp, runtime_id) - - # Warm up the agent to avoid cold start timeouts on first invoke - _warmup_agent(runtime_arn, region) - - return { - "runtime_id": runtime_id, - "runtime_arn": runtime_arn, - "endpoint_arn": endpoint_arn, - } - - -def make_agent_invoker(runtime_arn: str, region: str = DEFAULT_REGION) -> AgentInvokerFn: - """Create an AgentInvokerFn that calls the deployed agent via invoke_agent_runtime.""" - dp = boto3.client("bedrock-agentcore", region_name=region) - - def invoker(input: AgentInvokerInput) -> AgentInvokerOutput: - payload = input.payload if isinstance(input.payload, str) else json.dumps(input.payload) - last_error = None - for attempt in range(3): - try: - response = dp.invoke_agent_runtime( - agentRuntimeArn=runtime_arn, - payload=payload.encode(), - runtimeSessionId=input.session_id if input.session_id and len(input.session_id) >= 33 else str(uuid.uuid4()), - ) - body = response.get("response", "") - if hasattr(body, "read"): - body = body.read().decode() - result = json.loads(body) if body else {} - return AgentInvokerOutput(agent_output=result.get("output", result)) - except Exception as e: - last_error = e - if "initialization time exceeded" in str(e).lower() and attempt < 2: - import time as _t - _t.sleep(5) - continue - raise - raise last_error # type: ignore[misc] - - return invoker - - -# --- Private helpers --- - - -def _find_runtime_by_name(cp, agent_name: str) -> Any: - """List runtimes and find one matching the name with READY status.""" - next_token = None - while True: - kwargs: Dict[str, Any] = {"maxResults": 100} - if next_token: - kwargs["nextToken"] = next_token - response = cp.list_agent_runtimes(**kwargs) - for rt in response.get("agentRuntimes", []): - if rt.get("agentRuntimeName") == agent_name and rt.get("status") == "READY": - return rt - next_token = response.get("nextToken") - if not next_token: - break - return None - - -def _create_runtime(cp, agent_name: str, role_arn: str, s3_bucket: str, s3_key: str) -> Dict[str, Any]: - """Create a runtime and wait for READY.""" - response = cp.create_agent_runtime( - agentRuntimeName=agent_name, - roleArn=role_arn, - networkConfiguration={"networkMode": "PUBLIC"}, - agentRuntimeArtifact={ - "codeConfiguration": { - "code": {"s3": {"bucket": s3_bucket, "prefix": s3_key}}, - "runtime": "PYTHON_3_12", - "entryPoint": ["agent.py"], - } - }, - ) - runtime_id = response["agentRuntimeId"] - logger.info("Created runtime %s, waiting for READY...", runtime_id) - - for _ in range(60): - rt = cp.get_agent_runtime(agentRuntimeId=runtime_id) - status = rt["status"] - if status == "READY": - return rt - if "FAILED" in status: - raise RuntimeError(f"Runtime creation failed: {rt.get('statusReasons')}") - time.sleep(5) - - raise TimeoutError(f"Runtime {runtime_id} did not reach READY within 300s") - - -def _warmup_agent(runtime_arn: str, region: str) -> None: - """Send a warmup invoke to avoid cold start timeouts on first real call.""" - dp = boto3.client("bedrock-agentcore", region_name=region) - for attempt in range(5): - try: - resp = dp.invoke_agent_runtime( - agentRuntimeArn=runtime_arn, - payload=json.dumps({"input": "warmup"}).encode(), - runtimeSessionId=str(uuid.uuid4()), - ) - body = resp.get("response", "") - if hasattr(body, "read"): - body.read() - logger.info("Agent warmup successful on attempt %d", attempt + 1) - return - except Exception as e: - logger.debug("Warmup attempt %d failed: %s", attempt + 1, e) - time.sleep(10) - logger.warning("Agent warmup failed after 5 attempts — cold start may occur") - - -def _get_or_create_endpoint(cp, runtime_id: str) -> str: - """Find a READY endpoint or create one.""" - eps = cp.list_agent_runtime_endpoints(agentRuntimeId=runtime_id) - for ep in eps.get("runtimeEndpoints", []): - if ep.get("status") == "READY": - return ep["agentRuntimeEndpointArn"] - - # Create endpoint - logger.info("No READY endpoint found, creating...") - cp.create_agent_runtime_endpoint(agentRuntimeId=runtime_id, name="default") - - for _ in range(60): - eps = cp.list_agent_runtime_endpoints(agentRuntimeId=runtime_id) - for ep in eps.get("runtimeEndpoints", []): - if ep.get("status") == "READY": - return ep["agentRuntimeEndpointArn"] - time.sleep(5) - - raise TimeoutError(f"Endpoint for runtime {runtime_id} did not reach READY within 300s") diff --git a/tests_integ/evaluation/test_runners_with_service_dataset.py b/tests_integ/evaluation/test_runners_with_service_dataset.py index 9d0d24c9..d261031c 100644 --- a/tests_integ/evaluation/test_runners_with_service_dataset.py +++ b/tests_integ/evaluation/test_runners_with_service_dataset.py @@ -35,7 +35,7 @@ def _make_invoker(runtime_arn: str, region: str): - """Create an invoker that sends {prompt: ...} to the agent.""" + """Create an invoker compatible with the hosted echo agent.""" dp = boto3.client("bedrock-agentcore", region_name=region) def invoker(input: AgentInvokerInput) -> AgentInvokerOutput: @@ -61,7 +61,7 @@ class TestOnDemandRunnerWithServiceDataset: def setup_class(cls): cls.region = REGION cls.runtime_arn = RUNTIME_ARN - cls.runtime_id = cls.runtime_arn.split("/")[-1] + cls.runtime_id = cls.runtime_arn.split("/")[-1] # type: ignore[union-attr] cls.log_group = os.environ.get( "INTEG_AGENT_LOG_GROUP", @@ -107,7 +107,7 @@ def test_on_demand_runner_executes_scenarios(self): """OnDemandRunner invokes agent for each scenario from ServiceDatasetProvider.""" provider = ServiceDatasetProvider( dataset_id=self.dataset_id, - region_name=self.region, + client=self.client, ) dataset = provider.get_dataset() assert len(dataset.scenarios) == 2 diff --git a/tests_integ/evaluation/test_service_dataset_provider.py b/tests_integ/evaluation/test_service_dataset_provider.py index 770f10ad..519589d1 100644 --- a/tests_integ/evaluation/test_service_dataset_provider.py +++ b/tests_integ/evaluation/test_service_dataset_provider.py @@ -105,7 +105,7 @@ def test_get_predefined_dataset(self): """ServiceDatasetProvider returns a valid SDK Dataset with PredefinedScenarios.""" provider = ServiceDatasetProvider( dataset_id=self.predefined_dataset_id, - region_name=self.region, + client=self.client, ) dataset = provider.get_dataset() @@ -119,7 +119,7 @@ def test_predefined_fields_preserved(self): """Scenario fields (turns, assertions, trajectory) are preserved.""" provider = ServiceDatasetProvider( dataset_id=self.predefined_dataset_id, - region_name=self.region, + client=self.client, ) dataset = provider.get_dataset() @@ -139,7 +139,7 @@ def test_get_predefined_with_version(self): provider = ServiceDatasetProvider( dataset_id=self.predefined_dataset_id, version_id=version_id, - region_name=self.region, + client=self.client, ) dataset = provider.get_dataset() @@ -153,7 +153,7 @@ def test_get_synthetic_dataset(self): """ServiceDatasetProvider returns SimulatedScenarios for SYNTHETIC schema.""" provider = ServiceDatasetProvider( dataset_id=self.synthetic_dataset_id, - region_name=self.region, + client=self.client, ) dataset = provider.get_dataset() From 30ab84937d272979284b21565037d1626e9ba1f4 Mon Sep 17 00:00:00 2001 From: "T.J Ariyawansa" Date: Sun, 10 May 2026 23:08:53 -0400 Subject: [PATCH 07/18] fix(evaluation): address second round of PR feedback - ServiceDatasetProvider: import DatasetClient at top, default in __init__ - ScenarioExecutor: add schema_type field, override in Predefined/Simulated - ServiceDatasetProvider: collect supported schemas dynamically from executors - delete_dataset_and_wait: add DELETE_FAILED as failed status --- .../evaluation/dataset_client.py | 1 + .../evaluation/runner/dataset_providers.py | 25 +++++++++---------- .../evaluation/runner/scenario_executor.py | 5 ++++ 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/bedrock_agentcore/evaluation/dataset_client.py b/src/bedrock_agentcore/evaluation/dataset_client.py index 19ba9e4b..6eed289b 100644 --- a/src/bedrock_agentcore/evaluation/dataset_client.py +++ b/src/bedrock_agentcore/evaluation/dataset_client.py @@ -146,6 +146,7 @@ def delete_dataset_and_wait( dataset_id = response["datasetId"] wait_until_deleted( lambda: self._cp_client.get_dataset(datasetId=dataset_id), + failed={"DELETE_FAILED"}, wait_config=wait_config, ) diff --git a/src/bedrock_agentcore/evaluation/runner/dataset_providers.py b/src/bedrock_agentcore/evaluation/runner/dataset_providers.py index 87550bbb..13a89c67 100644 --- a/src/bedrock_agentcore/evaluation/runner/dataset_providers.py +++ b/src/bedrock_agentcore/evaluation/runner/dataset_providers.py @@ -6,6 +6,8 @@ import requests +from bedrock_agentcore.evaluation.dataset_client import DatasetClient + from .dataset_types import ActorProfile, Dataset, PredefinedScenario, Scenario, SimulatedScenario, Turn @@ -66,16 +68,17 @@ def _parse_turn(raw: Dict[str, Any]) -> Turn: ) -SUPPORTED_SCHEMA_TYPES = { - "AGENTCORE_EVALUATION_PREDEFINED_V1", - "AGENTCORE_EVALUATION_SIMULATED_V1", -} +def _get_supported_schema_types() -> set: + """Collect supported schema types from all ScenarioExecutor subclasses.""" + from .scenario_executor import ScenarioExecutor + + return {cls.model_fields["schema_type"].default for cls in ScenarioExecutor.__subclasses__() if cls.model_fields["schema_type"].default} class ServiceDatasetProvider(DatasetProvider): """A dataset provider that loads a Dataset from the Dataset Management service.""" - def __init__(self, dataset_id: str, version_id: Optional[str] = None, client=None): + def __init__(self, dataset_id: str, version_id: Optional[str] = None, client: Optional[DatasetClient] = None): """Initialize with a dataset ID and optional version. Args: @@ -85,7 +88,7 @@ def __init__(self, dataset_id: str, version_id: Optional[str] = None, client=Non """ self._dataset_id = dataset_id self._version_id = version_id - self._client = client + self._client = client if client is not None else DatasetClient() def get_dataset(self) -> Dataset: """Load and return the dataset from the Dataset Management service. @@ -97,11 +100,6 @@ def get_dataset(self) -> Dataset: ValueError: If the dataset has no downloadUrl or has an unsupported schemaType. RuntimeError: If the dataset content cannot be downloaded. """ - if self._client is None: - from bedrock_agentcore.evaluation.dataset_client import DatasetClient - - self._client = DatasetClient() - kwargs: Dict[str, Any] = {"datasetId": self._dataset_id} if self._version_id: kwargs["datasetVersion"] = self._version_id @@ -109,10 +107,11 @@ def get_dataset(self) -> Dataset: response = self._client.get_dataset(**kwargs) schema_type = response.get("schemaType") - if schema_type and schema_type not in SUPPORTED_SCHEMA_TYPES: + supported = _get_supported_schema_types() + if schema_type and schema_type not in supported: raise ValueError( f"Dataset schema type '{schema_type}' is not supported by the evaluation runners. " - f"Supported types: {sorted(SUPPORTED_SCHEMA_TYPES)}" + f"Supported types: {sorted(supported)}" ) download_url = response.get("downloadUrl") diff --git a/src/bedrock_agentcore/evaluation/runner/scenario_executor.py b/src/bedrock_agentcore/evaluation/runner/scenario_executor.py index b042fcff..27a27c00 100644 --- a/src/bedrock_agentcore/evaluation/runner/scenario_executor.py +++ b/src/bedrock_agentcore/evaluation/runner/scenario_executor.py @@ -70,6 +70,7 @@ class ScenarioExecutor(BaseModel, ABC): model_config = ConfigDict(arbitrary_types_allowed=True) agent_invoker: AgentInvokerFn + schema_type: str = "" @abstractmethod def run_scenario(self, scenario: Scenario) -> ScenarioExecutionResult: @@ -79,6 +80,8 @@ def run_scenario(self, scenario: Scenario) -> ScenarioExecutionResult: class PredefinedScenarioExecutor(ScenarioExecutor): """Runs a PredefinedScenario by iterating its explicit turns.""" + schema_type: str = "AGENTCORE_EVALUATION_PREDEFINED_V1" + def run_scenario(self, scenario: Scenario) -> ScenarioExecutionResult: """Execute a predefined scenario by invoking the agent for each turn.""" logger.debug("Running scenario %s (%d turn(s))", scenario.scenario_id, len(scenario.turns)) @@ -138,6 +141,8 @@ class SimulatedScenarioExecutor(ScenarioExecutor): for JSON parsing heuristics. """ + schema_type: str = "AGENTCORE_EVALUATION_SIMULATED_V1" + simulation_config: Optional[SimulationConfig] = None def run_scenario(self, scenario: Scenario) -> ScenarioExecutionResult: From 625190dafe637a4cae298b7a6c593c577f7eeb8b Mon Sep 17 00:00:00 2001 From: "T.J Ariyawansa" Date: Sun, 10 May 2026 23:11:23 -0400 Subject: [PATCH 08/18] style: fix lint and formatting --- .../evaluation/runner/dataset_providers.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/bedrock_agentcore/evaluation/runner/dataset_providers.py b/src/bedrock_agentcore/evaluation/runner/dataset_providers.py index 13a89c67..1f846f90 100644 --- a/src/bedrock_agentcore/evaluation/runner/dataset_providers.py +++ b/src/bedrock_agentcore/evaluation/runner/dataset_providers.py @@ -72,7 +72,11 @@ def _get_supported_schema_types() -> set: """Collect supported schema types from all ScenarioExecutor subclasses.""" from .scenario_executor import ScenarioExecutor - return {cls.model_fields["schema_type"].default for cls in ScenarioExecutor.__subclasses__() if cls.model_fields["schema_type"].default} + return { + cls.model_fields["schema_type"].default + for cls in ScenarioExecutor.__subclasses__() + if cls.model_fields["schema_type"].default + } class ServiceDatasetProvider(DatasetProvider): @@ -122,9 +126,7 @@ def get_dataset(self) -> Dataset: r = requests.get(download_url) r.raise_for_status() except requests.RequestException as e: - raise RuntimeError( - f"Couldn't download dataset from S3 bucket: {e}" - ) from e + raise RuntimeError(f"Couldn't download dataset from S3 bucket: {e}") from e all_examples: List[Dict[str, Any]] = [] for line in r.text.strip().split("\n"): From a7c784d3cbb997cc735e55ab91ebc9d9b34e504d Mon Sep 17 00:00:00 2001 From: "T.J Ariyawansa" Date: Sun, 10 May 2026 23:13:47 -0400 Subject: [PATCH 09/18] style: fix remaining line-too-long issues --- .../evaluation/test_service_dataset_provider.py | 8 ++++++-- .../evaluation/test_runners_with_service_dataset.py | 8 +++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py b/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py index 850d3724..1686fcb1 100644 --- a/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py +++ b/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py @@ -29,7 +29,9 @@ def _mock_client(get_response, jsonl_content): class TestServiceDatasetProvider: - def _run_provider(self, jsonl_content, dataset_id="ds-123", version_id=None, schema_type="AGENTCORE_EVALUATION_PREDEFINED_V1"): + def _run_provider( + self, jsonl_content, dataset_id="ds-123", version_id=None, schema_type="AGENTCORE_EVALUATION_PREDEFINED_V1" + ): mock_client = MagicMock() mock_client.get_dataset.return_value = { "datasetId": dataset_id, @@ -97,7 +99,9 @@ def test_get_dataset_simulated(self): }, ) - dataset, _, _ = self._run_provider(content, dataset_id="ds-456", schema_type="AGENTCORE_EVALUATION_SIMULATED_V1") + dataset, _, _ = self._run_provider( + content, dataset_id="ds-456", schema_type="AGENTCORE_EVALUATION_SIMULATED_V1" + ) assert isinstance(dataset, Dataset) assert len(dataset.scenarios) == 1 diff --git a/tests_integ/evaluation/test_runners_with_service_dataset.py b/tests_integ/evaluation/test_runners_with_service_dataset.py index d261031c..3cc07c2e 100644 --- a/tests_integ/evaluation/test_runners_with_service_dataset.py +++ b/tests_integ/evaluation/test_runners_with_service_dataset.py @@ -12,8 +12,8 @@ (defaults to /aws/bedrock-agentcore/runtimes/{runtime_id}-DEFAULT) Run with: - export INTEG_AGENT_RUNTIME_ARN=arn:aws:bedrock-agentcore:us-east-1:619071331382:runtime/hosted_agent_j135m-HDtuIw2zSo - uv run pytest tests_integ/evaluation/test_runners_with_service_dataset.py -xvs --log-cli-level=INFO + export INTEG_AGENT_RUNTIME_ARN= + uv run pytest tests_integ/evaluation/test_runners_with_service_dataset.py -xvs """ import json @@ -43,7 +43,9 @@ def invoker(input: AgentInvokerInput) -> AgentInvokerOutput: resp = dp.invoke_agent_runtime( agentRuntimeArn=runtime_arn, payload=json.dumps({"prompt": prompt}).encode(), - runtimeSessionId=input.session_id if input.session_id and len(input.session_id) >= 33 else str(uuid.uuid4()), + runtimeSessionId=( + input.session_id if input.session_id and len(input.session_id) >= 33 else str(uuid.uuid4()) + ), ) body = resp["response"].read().decode() result = json.loads(body) if body else {} From 94860c16f50f49f78652607dbc20b609d6835665 Mon Sep 17 00:00:00 2001 From: "T.J Ariyawansa" Date: Sun, 10 May 2026 23:15:45 -0400 Subject: [PATCH 10/18] fix: remove unused variable assignment (F841) --- tests/bedrock_agentcore/evaluation/test_dataset_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/bedrock_agentcore/evaluation/test_dataset_client.py b/tests/bedrock_agentcore/evaluation/test_dataset_client.py index a5e69ad4..b0c7dc8b 100644 --- a/tests/bedrock_agentcore/evaluation/test_dataset_client.py +++ b/tests/bedrock_agentcore/evaluation/test_dataset_client.py @@ -46,7 +46,7 @@ def test_allowed_method_forwarded(self, mock_boto3): mock_cp.list_datasets.return_value = {"datasets": []} client = DatasetClient() - result = client.list_datasets(maxResults=10) + client.list_datasets(maxResults=10) mock_cp.list_datasets.assert_called_once_with(maxResults=10) @patch("bedrock_agentcore.evaluation.dataset_client.boto3") From 72e2f2e2704120abcaf74cf348cc63c3f181abde Mon Sep 17 00:00:00 2001 From: "T.J Ariyawansa" Date: Sun, 10 May 2026 23:40:08 -0400 Subject: [PATCH 11/18] refactor: move schema_type to Scenario classes, _parse_scenario to module level - Add schema_type field to Scenario base, PredefinedScenario, SimulatedScenario - Remove schema_type from ScenarioExecutor (doesn't belong there) - Move _parse_scenario from FileDatasetProvider to module-level function - SUPPORTED_SCHEMA_TYPES derived from Scenario classes directly --- .../evaluation/runner/dataset_providers.py | 95 +++++++++---------- .../evaluation/runner/dataset_types.py | 4 + .../evaluation/runner/scenario_executor.py | 5 - 3 files changed, 48 insertions(+), 56 deletions(-) diff --git a/src/bedrock_agentcore/evaluation/runner/dataset_providers.py b/src/bedrock_agentcore/evaluation/runner/dataset_providers.py index 1f846f90..b28e8306 100644 --- a/src/bedrock_agentcore/evaluation/runner/dataset_providers.py +++ b/src/bedrock_agentcore/evaluation/runner/dataset_providers.py @@ -8,7 +8,46 @@ from bedrock_agentcore.evaluation.dataset_client import DatasetClient -from .dataset_types import ActorProfile, Dataset, PredefinedScenario, Scenario, SimulatedScenario, Turn +from .dataset_types import ( + ActorProfile, + Dataset, + PredefinedScenario, + Scenario, + SimulatedScenario, + Turn, +) + +SUPPORTED_SCHEMA_TYPES = { + PredefinedScenario.model_fields["schema_type"].default, + SimulatedScenario.model_fields["schema_type"].default, +} + + +def _parse_scenario(raw: Dict[str, Any]) -> PredefinedScenario | SimulatedScenario: + """Parse a raw dict into a PredefinedScenario or SimulatedScenario.""" + if "turns" in raw: + return PredefinedScenario( + scenario_id=raw["scenario_id"], + turns=[Turn(input=t["input"], expected_response=t.get("expected_response")) for t in raw["turns"]], + expected_trajectory=raw.get("expected_trajectory"), + assertions=raw.get("assertions"), + metadata=raw.get("metadata"), + ) + else: + missing = [k for k in ("scenario_id", "actor_profile", "input") if k not in raw] + if missing: + raise ValueError( + f"Scenario '{raw.get('scenario_id', '?')}' is missing required fields for SimulatedScenario: {missing}" + ) + return SimulatedScenario( + scenario_id=raw["scenario_id"], + scenario_description=raw.get("scenario_description", ""), + actor_profile=ActorProfile(**raw["actor_profile"]), + input=raw["input"], + max_turns=raw.get("max_turns", 10), + assertions=raw.get("assertions"), + metadata=raw.get("metadata"), + ) class DatasetProvider(ABC): @@ -30,54 +69,9 @@ def get_dataset(self) -> Dataset: """Load and return the dataset from the JSON file.""" with open(self._file_path) as f: data = json.load(f) - scenarios: List[Scenario] = [self._parse_scenario(s) for s in data["scenarios"]] + scenarios: List[Scenario] = [_parse_scenario(s) for s in data["scenarios"]] return Dataset(scenarios=scenarios) - @staticmethod - def _parse_scenario(raw: Dict[str, Any]) -> PredefinedScenario | SimulatedScenario: - if "turns" in raw: - return PredefinedScenario( - scenario_id=raw["scenario_id"], - turns=[FileDatasetProvider._parse_turn(t) for t in raw["turns"]], - expected_trajectory=raw.get("expected_trajectory"), - assertions=raw.get("assertions"), - metadata=raw.get("metadata"), - ) - else: - missing = [k for k in ("scenario_id", "actor_profile", "input") if k not in raw] - if missing: - raise ValueError( - f"Scenario '{raw.get('scenario_id', '?')}' is missing required fields " - f"for SimulatedScenario: {missing}" - ) - return SimulatedScenario( - scenario_id=raw["scenario_id"], - scenario_description=raw.get("scenario_description", ""), - actor_profile=ActorProfile(**raw["actor_profile"]), - input=raw["input"], - max_turns=raw.get("max_turns", 10), - assertions=raw.get("assertions"), - metadata=raw.get("metadata"), - ) - - @staticmethod - def _parse_turn(raw: Dict[str, Any]) -> Turn: - return Turn( - input=raw["input"], - expected_response=raw.get("expected_response"), - ) - - -def _get_supported_schema_types() -> set: - """Collect supported schema types from all ScenarioExecutor subclasses.""" - from .scenario_executor import ScenarioExecutor - - return { - cls.model_fields["schema_type"].default - for cls in ScenarioExecutor.__subclasses__() - if cls.model_fields["schema_type"].default - } - class ServiceDatasetProvider(DatasetProvider): """A dataset provider that loads a Dataset from the Dataset Management service.""" @@ -111,11 +105,10 @@ def get_dataset(self) -> Dataset: response = self._client.get_dataset(**kwargs) schema_type = response.get("schemaType") - supported = _get_supported_schema_types() - if schema_type and schema_type not in supported: + if schema_type and schema_type not in SUPPORTED_SCHEMA_TYPES: raise ValueError( f"Dataset schema type '{schema_type}' is not supported by the evaluation runners. " - f"Supported types: {sorted(supported)}" + f"Supported types: {sorted(SUPPORTED_SCHEMA_TYPES)}" ) download_url = response.get("downloadUrl") @@ -133,5 +126,5 @@ def get_dataset(self) -> Dataset: if line: all_examples.append(json.loads(line)) - scenarios: List[Scenario] = [FileDatasetProvider._parse_scenario(example) for example in all_examples] + scenarios: List[Scenario] = [_parse_scenario(example) for example in all_examples] return Dataset(scenarios=scenarios) diff --git a/src/bedrock_agentcore/evaluation/runner/dataset_types.py b/src/bedrock_agentcore/evaluation/runner/dataset_types.py index 6d8b6206..4e9f63b6 100644 --- a/src/bedrock_agentcore/evaluation/runner/dataset_types.py +++ b/src/bedrock_agentcore/evaluation/runner/dataset_types.py @@ -37,6 +37,7 @@ class Turn(BaseModel): class Scenario(BaseModel): """Base class for evaluation scenarios.""" + schema_type: str = "" scenario_id: str assertions: Optional[List[str]] = None metadata: Optional[Dict[str, Any]] = None @@ -45,6 +46,7 @@ class Scenario(BaseModel): class PredefinedScenario(Scenario): """A scenario with a predefined conversation flow.""" + schema_type: str = "AGENTCORE_EVALUATION_PREDEFINED_V1" turns: List[Turn] expected_trajectory: Optional[List[str]] = None @@ -72,6 +74,8 @@ class SimulatedScenario(Scenario): Defaults to 10. """ + schema_type: str = "AGENTCORE_EVALUATION_SIMULATED_V1" + model_config = ConfigDict(arbitrary_types_allowed=True) scenario_description: str = "" diff --git a/src/bedrock_agentcore/evaluation/runner/scenario_executor.py b/src/bedrock_agentcore/evaluation/runner/scenario_executor.py index 27a27c00..b042fcff 100644 --- a/src/bedrock_agentcore/evaluation/runner/scenario_executor.py +++ b/src/bedrock_agentcore/evaluation/runner/scenario_executor.py @@ -70,7 +70,6 @@ class ScenarioExecutor(BaseModel, ABC): model_config = ConfigDict(arbitrary_types_allowed=True) agent_invoker: AgentInvokerFn - schema_type: str = "" @abstractmethod def run_scenario(self, scenario: Scenario) -> ScenarioExecutionResult: @@ -80,8 +79,6 @@ def run_scenario(self, scenario: Scenario) -> ScenarioExecutionResult: class PredefinedScenarioExecutor(ScenarioExecutor): """Runs a PredefinedScenario by iterating its explicit turns.""" - schema_type: str = "AGENTCORE_EVALUATION_PREDEFINED_V1" - def run_scenario(self, scenario: Scenario) -> ScenarioExecutionResult: """Execute a predefined scenario by invoking the agent for each turn.""" logger.debug("Running scenario %s (%d turn(s))", scenario.scenario_id, len(scenario.turns)) @@ -141,8 +138,6 @@ class SimulatedScenarioExecutor(ScenarioExecutor): for JSON parsing heuristics. """ - schema_type: str = "AGENTCORE_EVALUATION_SIMULATED_V1" - simulation_config: Optional[SimulationConfig] = None def run_scenario(self, scenario: Scenario) -> ScenarioExecutionResult: From ff0c85bdf5a721be8856639819dc2039a75eecc7 Mon Sep 17 00:00:00 2001 From: "T.J Ariyawansa" Date: Sun, 10 May 2026 23:43:43 -0400 Subject: [PATCH 12/18] fix: remove unused _DATASET_FAILED_STATUSES constant --- src/bedrock_agentcore/evaluation/dataset_client.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/bedrock_agentcore/evaluation/dataset_client.py b/src/bedrock_agentcore/evaluation/dataset_client.py index 6eed289b..20db5779 100644 --- a/src/bedrock_agentcore/evaluation/dataset_client.py +++ b/src/bedrock_agentcore/evaluation/dataset_client.py @@ -13,8 +13,6 @@ logger = logging.getLogger(__name__) -_DATASET_FAILED_STATUSES = {"CREATE_FAILED", "UPDATE_FAILED"} - class DatasetClient: """Client for managing evaluation datasets. From b503993c220b606f71a9f726d1c74296d8089a57 Mon Sep 17 00:00:00 2001 From: "T.J Ariyawansa" Date: Mon, 11 May 2026 11:09:01 -0400 Subject: [PATCH 13/18] fix(evaluation): address reviewer findings - Add timeout=60 to requests.get() for presigned URL download (#1) - Use r.content.decode("utf-8") instead of r.text for explicit encoding (#3) - Replace model_fields introspection with plain constant set (#8) - Guard __getattr__ against recursion when _cp_client not initialized (#5) - Remove dead _mock_client function from tests (#11) --- .../evaluation/dataset_client.py | 3 +++ .../evaluation/runner/dataset_providers.py | 20 +++++++++++-------- .../test_service_dataset_provider.py | 11 ++-------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/bedrock_agentcore/evaluation/dataset_client.py b/src/bedrock_agentcore/evaluation/dataset_client.py index 20db5779..07469e21 100644 --- a/src/bedrock_agentcore/evaluation/dataset_client.py +++ b/src/bedrock_agentcore/evaluation/dataset_client.py @@ -84,6 +84,9 @@ def __init__( def __getattr__(self, name: str): """Dynamically forward allowlisted method calls to the boto3 client.""" + if "_cp_client" not in self.__dict__: + raise AttributeError(name) + if name in self._ALLOWED_CP_METHODS and hasattr(self._cp_client, name): method = getattr(self._cp_client, name) logger.debug("Forwarding method '%s' to _cp_client", name) diff --git a/src/bedrock_agentcore/evaluation/runner/dataset_providers.py b/src/bedrock_agentcore/evaluation/runner/dataset_providers.py index b28e8306..78937afc 100644 --- a/src/bedrock_agentcore/evaluation/runner/dataset_providers.py +++ b/src/bedrock_agentcore/evaluation/runner/dataset_providers.py @@ -18,11 +18,10 @@ ) SUPPORTED_SCHEMA_TYPES = { - PredefinedScenario.model_fields["schema_type"].default, - SimulatedScenario.model_fields["schema_type"].default, + "AGENTCORE_EVALUATION_PREDEFINED_V1", + "AGENTCORE_EVALUATION_SIMULATED_V1", } - def _parse_scenario(raw: Dict[str, Any]) -> PredefinedScenario | SimulatedScenario: """Parse a raw dict into a PredefinedScenario or SimulatedScenario.""" if "turns" in raw: @@ -76,7 +75,12 @@ def get_dataset(self) -> Dataset: class ServiceDatasetProvider(DatasetProvider): """A dataset provider that loads a Dataset from the Dataset Management service.""" - def __init__(self, dataset_id: str, version_id: Optional[str] = None, client: Optional[DatasetClient] = None): + def __init__( + self, + dataset_id: str, + version_id: Optional[str] = None, + client: Optional[DatasetClient] = None, + ): """Initialize with a dataset ID and optional version. Args: @@ -107,8 +111,8 @@ def get_dataset(self) -> Dataset: schema_type = response.get("schemaType") if schema_type and schema_type not in SUPPORTED_SCHEMA_TYPES: raise ValueError( - f"Dataset schema type '{schema_type}' is not supported by the evaluation runners. " - f"Supported types: {sorted(SUPPORTED_SCHEMA_TYPES)}" + f"Dataset schema type '{schema_type}' is not supported by the " + f"evaluation runners. Supported types: {sorted(SUPPORTED_SCHEMA_TYPES)}" ) download_url = response.get("downloadUrl") @@ -116,13 +120,13 @@ def get_dataset(self) -> Dataset: raise ValueError(f"Dataset {self._dataset_id} has no downloadUrl. Status: {response.get('status')}") try: - r = requests.get(download_url) + r = requests.get(download_url, timeout=60) r.raise_for_status() except requests.RequestException as e: raise RuntimeError(f"Couldn't download dataset from S3 bucket: {e}") from e all_examples: List[Dict[str, Any]] = [] - for line in r.text.strip().split("\n"): + for line in r.content.decode("utf-8").strip().split("\n"): if line: all_examples.append(json.loads(line)) diff --git a/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py b/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py index 1686fcb1..9208cfcc 100644 --- a/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py +++ b/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py @@ -21,13 +21,6 @@ def _jsonl(*examples): return "\n".join(json.dumps(e) for e in examples) -def _mock_client(get_response, jsonl_content): - """Create a mock DatasetClient and mock requests.""" - mock_client = MagicMock() - mock_client.get_dataset.return_value = get_response - return mock_client, jsonl_content - - class TestServiceDatasetProvider: def _run_provider( self, jsonl_content, dataset_id="ds-123", version_id=None, schema_type="AGENTCORE_EVALUATION_PREDEFINED_V1" @@ -41,7 +34,7 @@ def _run_provider( } mock_response = MagicMock() - mock_response.text = jsonl_content + mock_response.content = jsonl_content.encode("utf-8") mock_response.raise_for_status = MagicMock() with patch(PATCH_REQUESTS) as mock_requests: @@ -118,7 +111,7 @@ def test_downloads_from_presigned_url(self): _, mock_client, mock_requests = self._run_provider(content) mock_client.get_dataset.assert_called_once_with(datasetId="ds-123") - mock_requests.get.assert_called_once_with("https://example.com/dataset.jsonl") + mock_requests.get.assert_called_once_with("https://example.com/dataset.jsonl", timeout=60) def test_get_dataset_with_version_id(self): content = _jsonl({"scenario_id": "s1", "turns": [{"input": "hi"}]}) From 85fc63878777e081caafc47d0a722711e5f529ba Mon Sep 17 00:00:00 2001 From: "T.J Ariyawansa" Date: Mon, 11 May 2026 11:13:30 -0400 Subject: [PATCH 14/18] fix(evaluation): stream JSONL download and consolidate test fixtures - Stream JSONL via iter_lines() instead of loading entire file into memory (#2) - Consolidate repetitive test mock setup with pytest fixtures (#12) --- .../evaluation/runner/dataset_providers.py | 7 +- .../evaluation/test_dataset_client.py | 124 ++++++------------ .../test_service_dataset_provider.py | 6 +- 3 files changed, 47 insertions(+), 90 deletions(-) diff --git a/src/bedrock_agentcore/evaluation/runner/dataset_providers.py b/src/bedrock_agentcore/evaluation/runner/dataset_providers.py index 78937afc..73757ee5 100644 --- a/src/bedrock_agentcore/evaluation/runner/dataset_providers.py +++ b/src/bedrock_agentcore/evaluation/runner/dataset_providers.py @@ -22,6 +22,7 @@ "AGENTCORE_EVALUATION_SIMULATED_V1", } + def _parse_scenario(raw: Dict[str, Any]) -> PredefinedScenario | SimulatedScenario: """Parse a raw dict into a PredefinedScenario or SimulatedScenario.""" if "turns" in raw: @@ -120,15 +121,15 @@ def get_dataset(self) -> Dataset: raise ValueError(f"Dataset {self._dataset_id} has no downloadUrl. Status: {response.get('status')}") try: - r = requests.get(download_url, timeout=60) + r = requests.get(download_url, timeout=60, stream=True) r.raise_for_status() except requests.RequestException as e: raise RuntimeError(f"Couldn't download dataset from S3 bucket: {e}") from e all_examples: List[Dict[str, Any]] = [] - for line in r.content.decode("utf-8").strip().split("\n"): + for line in r.iter_lines(decode_unicode=False): if line: - all_examples.append(json.loads(line)) + all_examples.append(json.loads(line.decode("utf-8"))) scenarios: List[Scenario] = [_parse_scenario(example) for example in all_examples] return Dataset(scenarios=scenarios) diff --git a/tests/bedrock_agentcore/evaluation/test_dataset_client.py b/tests/bedrock_agentcore/evaluation/test_dataset_client.py index b0c7dc8b..da93855d 100644 --- a/tests/bedrock_agentcore/evaluation/test_dataset_client.py +++ b/tests/bedrock_agentcore/evaluation/test_dataset_client.py @@ -8,8 +8,23 @@ from bedrock_agentcore.evaluation.dataset_client import DatasetClient +@pytest.fixture +def mock_boto3(): + with patch("bedrock_agentcore.evaluation.dataset_client.boto3") as m: + yield m + + +@pytest.fixture +def client_and_cp(mock_boto3): + """Return a (DatasetClient, mock_cp_client) tuple.""" + mock_session = MagicMock() + mock_boto3.Session.return_value = mock_session + mock_cp = mock_session.client.return_value + client = DatasetClient() + return client, mock_cp + + class TestDatasetClientInit: - @patch("bedrock_agentcore.evaluation.dataset_client.boto3") def test_default_region(self, mock_boto3): mock_session = MagicMock() mock_session.region_name = "us-east-1" @@ -19,7 +34,6 @@ def test_default_region(self, mock_boto3): assert client.region_name == "us-east-1" mock_session.client.assert_called_once() - @patch("bedrock_agentcore.evaluation.dataset_client.boto3") def test_explicit_region(self, mock_boto3): mock_session = MagicMock() mock_boto3.Session.return_value = mock_session @@ -27,7 +41,6 @@ def test_explicit_region(self, mock_boto3): client = DatasetClient(region_name="eu-west-1") assert client.region_name == "eu-west-1" - @patch("bedrock_agentcore.evaluation.dataset_client.boto3") def test_custom_session(self, mock_boto3): custom_session = MagicMock() custom_session.region_name = "ap-southeast-1" @@ -38,49 +51,32 @@ def test_custom_session(self, mock_boto3): class TestDatasetClientPassthrough: - @patch("bedrock_agentcore.evaluation.dataset_client.boto3") - def test_allowed_method_forwarded(self, mock_boto3): - mock_session = MagicMock() - mock_boto3.Session.return_value = mock_session - mock_cp = mock_session.client.return_value + def test_allowed_method_forwarded(self, client_and_cp): + client, mock_cp = client_and_cp mock_cp.list_datasets.return_value = {"datasets": []} - client = DatasetClient() client.list_datasets(maxResults=10) mock_cp.list_datasets.assert_called_once_with(maxResults=10) - @patch("bedrock_agentcore.evaluation.dataset_client.boto3") - def test_snake_case_converted(self, mock_boto3): - mock_session = MagicMock() - mock_boto3.Session.return_value = mock_session - mock_cp = mock_session.client.return_value + def test_snake_case_converted(self, client_and_cp): + client, mock_cp = client_and_cp mock_cp.list_datasets.return_value = {"datasets": []} - client = DatasetClient() client.list_datasets(max_results=5) mock_cp.list_datasets.assert_called_once_with(maxResults=5) - @patch("bedrock_agentcore.evaluation.dataset_client.boto3") - def test_non_allowed_method_raises(self, mock_boto3): - mock_session = MagicMock() - mock_boto3.Session.return_value = mock_session - - client = DatasetClient() + def test_non_allowed_method_raises(self, client_and_cp): + client, _ = client_and_cp with pytest.raises(AttributeError, match="not_a_real_method"): client.not_a_real_method() class TestDatasetClientCreateAndWait: - @patch("bedrock_agentcore.evaluation.dataset_client.boto3") - def test_create_dataset_and_wait_success(self, mock_boto3): - mock_session = MagicMock() - mock_boto3.Session.return_value = mock_session - mock_cp = mock_session.client.return_value - + def test_create_dataset_and_wait_success(self, client_and_cp): + client, mock_cp = client_and_cp mock_cp.create_dataset.return_value = {"datasetId": "ds-123"} mock_cp.get_dataset.return_value = {"datasetId": "ds-123", "status": "ACTIVE"} - client = DatasetClient() result = client.create_dataset_and_wait( wait_config=WaitConfig(max_wait=10, poll_interval=1), datasetName="test", @@ -91,12 +87,8 @@ def test_create_dataset_and_wait_success(self, mock_boto3): mock_cp.create_dataset.assert_called_once() mock_cp.get_dataset.assert_called_with(datasetId="ds-123") - @patch("bedrock_agentcore.evaluation.dataset_client.boto3") - def test_create_dataset_and_wait_failure(self, mock_boto3): - mock_session = MagicMock() - mock_boto3.Session.return_value = mock_session - mock_cp = mock_session.client.return_value - + def test_create_dataset_and_wait_failure(self, client_and_cp): + client, mock_cp = client_and_cp mock_cp.create_dataset.return_value = {"datasetId": "ds-123"} mock_cp.get_dataset.return_value = { "datasetId": "ds-123", @@ -104,23 +96,17 @@ def test_create_dataset_and_wait_failure(self, mock_boto3): "statusReasons": "Invalid source", } - client = DatasetClient() with pytest.raises(RuntimeError, match="CREATE_FAILED"): client.create_dataset_and_wait( wait_config=WaitConfig(max_wait=5, poll_interval=1), datasetName="test", ) - @patch("bedrock_agentcore.evaluation.dataset_client.boto3") - def test_create_dataset_and_wait_timeout(self, mock_boto3): - mock_session = MagicMock() - mock_boto3.Session.return_value = mock_session - mock_cp = mock_session.client.return_value - + def test_create_dataset_and_wait_timeout(self, client_and_cp): + client, mock_cp = client_and_cp mock_cp.create_dataset.return_value = {"datasetId": "ds-123"} mock_cp.get_dataset.return_value = {"datasetId": "ds-123", "status": "CREATING"} - client = DatasetClient() with pytest.raises(TimeoutError): client.create_dataset_and_wait( wait_config=WaitConfig(max_wait=1, poll_interval=1), @@ -129,21 +115,16 @@ def test_create_dataset_and_wait_timeout(self, mock_boto3): class TestDatasetClientDeleteAndWait: - @patch("bedrock_agentcore.evaluation.dataset_client.boto3") - def test_delete_dataset_and_wait_success(self, mock_boto3): + def test_delete_dataset_and_wait_success(self, client_and_cp): from botocore.exceptions import ClientError - mock_session = MagicMock() - mock_boto3.Session.return_value = mock_session - mock_cp = mock_session.client.return_value - + client, mock_cp = client_and_cp mock_cp.delete_dataset.return_value = {"datasetId": "ds-123"} mock_cp.get_dataset.side_effect = ClientError( {"Error": {"Code": "ResourceNotFoundException", "Message": "Not found"}}, "GetDataset", ) - client = DatasetClient() client.delete_dataset_and_wait( wait_config=WaitConfig(max_wait=10, poll_interval=1), datasetId="ds-123", @@ -153,16 +134,11 @@ def test_delete_dataset_and_wait_success(self, mock_boto3): class TestDatasetClientVersionAndWait: - @patch("bedrock_agentcore.evaluation.dataset_client.boto3") - def test_create_version_and_wait_success(self, mock_boto3): - mock_session = MagicMock() - mock_boto3.Session.return_value = mock_session - mock_cp = mock_session.client.return_value - + def test_create_version_and_wait_success(self, client_and_cp): + client, mock_cp = client_and_cp mock_cp.create_dataset_version.return_value = {"datasetId": "ds-123"} mock_cp.get_dataset.return_value = {"datasetId": "ds-123", "status": "ACTIVE"} - client = DatasetClient() result = client.create_dataset_version_and_wait( wait_config=WaitConfig(max_wait=10, poll_interval=1), datasetId="ds-123", @@ -172,16 +148,11 @@ def test_create_version_and_wait_success(self, mock_boto3): class TestDatasetClientExamplesAndWait: - @patch("bedrock_agentcore.evaluation.dataset_client.boto3") - def test_add_examples_and_wait_success(self, mock_boto3): - mock_session = MagicMock() - mock_boto3.Session.return_value = mock_session - mock_cp = mock_session.client.return_value - + def test_add_examples_and_wait_success(self, client_and_cp): + client, mock_cp = client_and_cp mock_cp.add_dataset_examples.return_value = {"datasetId": "ds-123"} mock_cp.get_dataset.return_value = {"datasetId": "ds-123", "status": "ACTIVE"} - client = DatasetClient() result = client.add_examples_and_wait( wait_config=WaitConfig(max_wait=10, poll_interval=1), datasetId="ds-123", @@ -190,16 +161,11 @@ def test_add_examples_and_wait_success(self, mock_boto3): assert result["status"] == "ACTIVE" - @patch("bedrock_agentcore.evaluation.dataset_client.boto3") - def test_update_examples_and_wait_success(self, mock_boto3): - mock_session = MagicMock() - mock_boto3.Session.return_value = mock_session - mock_cp = mock_session.client.return_value - + def test_update_examples_and_wait_success(self, client_and_cp): + client, mock_cp = client_and_cp mock_cp.update_dataset_examples.return_value = {"datasetId": "ds-123"} mock_cp.get_dataset.return_value = {"datasetId": "ds-123", "status": "ACTIVE"} - client = DatasetClient() result = client.update_examples_and_wait( wait_config=WaitConfig(max_wait=10, poll_interval=1), datasetId="ds-123", @@ -208,16 +174,11 @@ def test_update_examples_and_wait_success(self, mock_boto3): assert result["status"] == "ACTIVE" - @patch("bedrock_agentcore.evaluation.dataset_client.boto3") - def test_delete_examples_and_wait_success(self, mock_boto3): - mock_session = MagicMock() - mock_boto3.Session.return_value = mock_session - mock_cp = mock_session.client.return_value - + def test_delete_examples_and_wait_success(self, client_and_cp): + client, mock_cp = client_and_cp mock_cp.delete_dataset_examples.return_value = {"datasetId": "ds-123"} mock_cp.get_dataset.return_value = {"datasetId": "ds-123", "status": "ACTIVE"} - client = DatasetClient() result = client.delete_examples_and_wait( wait_config=WaitConfig(max_wait=10, poll_interval=1), datasetId="ds-123", @@ -226,12 +187,8 @@ def test_delete_examples_and_wait_success(self, mock_boto3): assert result["status"] == "ACTIVE" - @patch("bedrock_agentcore.evaluation.dataset_client.boto3") - def test_add_examples_and_wait_failure(self, mock_boto3): - mock_session = MagicMock() - mock_boto3.Session.return_value = mock_session - mock_cp = mock_session.client.return_value - + def test_add_examples_and_wait_failure(self, client_and_cp): + client, mock_cp = client_and_cp mock_cp.add_dataset_examples.return_value = {"datasetId": "ds-123"} mock_cp.get_dataset.return_value = { "datasetId": "ds-123", @@ -239,7 +196,6 @@ def test_add_examples_and_wait_failure(self, mock_boto3): "statusReasons": "Schema validation failed", } - client = DatasetClient() with pytest.raises(RuntimeError, match="UPDATE_FAILED"): client.add_examples_and_wait( wait_config=WaitConfig(max_wait=5, poll_interval=1), diff --git a/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py b/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py index 9208cfcc..dde17e9e 100644 --- a/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py +++ b/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py @@ -34,7 +34,7 @@ def _run_provider( } mock_response = MagicMock() - mock_response.content = jsonl_content.encode("utf-8") + mock_response.iter_lines.return_value = [line.encode("utf-8") for line in jsonl_content.split("\n") if line] mock_response.raise_for_status = MagicMock() with patch(PATCH_REQUESTS) as mock_requests: @@ -111,7 +111,7 @@ def test_downloads_from_presigned_url(self): _, mock_client, mock_requests = self._run_provider(content) mock_client.get_dataset.assert_called_once_with(datasetId="ds-123") - mock_requests.get.assert_called_once_with("https://example.com/dataset.jsonl", timeout=60) + mock_requests.get.assert_called_once_with("https://example.com/dataset.jsonl", timeout=60, stream=True) def test_get_dataset_with_version_id(self): content = _jsonl({"scenario_id": "s1", "turns": [{"input": "hi"}]}) @@ -142,7 +142,7 @@ def test_get_dataset_empty_raises(self): } mock_response = MagicMock() - mock_response.text = "" + mock_response.iter_lines.return_value = [] mock_response.raise_for_status = MagicMock() with patch(PATCH_REQUESTS) as mock_requests: From f057e6b5ad96ee326f7bf2cd3738ce847658692a Mon Sep 17 00:00:00 2001 From: "T.J Ariyawansa" Date: Wed, 13 May 2026 10:43:17 -0400 Subject: [PATCH 15/18] refactor(evaluation): rename ServiceDatasetProvider to DatasetManagementServiceProvider Addresses PR review feedback: the name makes explicit which service the provider loads datasets from (Dataset Management Service). --- src/bedrock_agentcore/evaluation/__init__.py | 6 ++--- .../evaluation/runner/dataset_providers.py | 2 +- .../test_service_dataset_provider.py | 18 +++++++------- tests_integ/evaluation/test_dataset_client.py | 8 ++----- .../test_runners_with_service_dataset.py | 12 +++++----- .../test_service_dataset_provider.py | 24 +++++++++---------- 6 files changed, 34 insertions(+), 36 deletions(-) diff --git a/src/bedrock_agentcore/evaluation/__init__.py b/src/bedrock_agentcore/evaluation/__init__.py index 7b4d0970..a70c267f 100644 --- a/src/bedrock_agentcore/evaluation/__init__.py +++ b/src/bedrock_agentcore/evaluation/__init__.py @@ -6,6 +6,7 @@ EvaluatorOutput, custom_code_based_evaluator, ) +from bedrock_agentcore.evaluation.dataset_client import DatasetClient from bedrock_agentcore.evaluation.runner.batch.batch_evaluation_models import ( BatchEvaluationResult, BatchEvaluationRunConfig, @@ -20,11 +21,10 @@ from bedrock_agentcore.evaluation.runner.batch.batch_evaluation_runner import ( BatchEvaluationRunner, ) -from bedrock_agentcore.evaluation.dataset_client import DatasetClient from bedrock_agentcore.evaluation.runner.dataset_providers import ( + DatasetManagementServiceProvider, DatasetProvider, FileDatasetProvider, - ServiceDatasetProvider, ) from bedrock_agentcore.evaluation.runner.dataset_types import ( ActorProfile, @@ -89,7 +89,7 @@ "EvaluatorOutput", "EvaluatorResult", "FileDatasetProvider", - "ServiceDatasetProvider", + "DatasetManagementServiceProvider", "Input", "OnDemandEvaluationDatasetRunner", "ReferenceInputs", diff --git a/src/bedrock_agentcore/evaluation/runner/dataset_providers.py b/src/bedrock_agentcore/evaluation/runner/dataset_providers.py index 73757ee5..d7ad5598 100644 --- a/src/bedrock_agentcore/evaluation/runner/dataset_providers.py +++ b/src/bedrock_agentcore/evaluation/runner/dataset_providers.py @@ -73,7 +73,7 @@ def get_dataset(self) -> Dataset: return Dataset(scenarios=scenarios) -class ServiceDatasetProvider(DatasetProvider): +class DatasetManagementServiceProvider(DatasetProvider): """A dataset provider that loads a Dataset from the Dataset Management service.""" def __init__( diff --git a/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py b/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py index dde17e9e..3b9c8deb 100644 --- a/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py +++ b/tests/bedrock_agentcore/evaluation/test_service_dataset_provider.py @@ -1,10 +1,10 @@ -"""Tests for ServiceDatasetProvider.""" +"""Tests for DatasetManagementServiceProvider.""" from unittest.mock import MagicMock, patch import pytest -from bedrock_agentcore.evaluation.runner.dataset_providers import ServiceDatasetProvider +from bedrock_agentcore.evaluation.runner.dataset_providers import DatasetManagementServiceProvider from bedrock_agentcore.evaluation.runner.dataset_types import ( Dataset, PredefinedScenario, @@ -21,7 +21,7 @@ def _jsonl(*examples): return "\n".join(json.dumps(e) for e in examples) -class TestServiceDatasetProvider: +class TestDatasetManagementServiceProvider: def _run_provider( self, jsonl_content, dataset_id="ds-123", version_id=None, schema_type="AGENTCORE_EVALUATION_PREDEFINED_V1" ): @@ -39,7 +39,9 @@ def _run_provider( with patch(PATCH_REQUESTS) as mock_requests: mock_requests.get.return_value = mock_response - provider = ServiceDatasetProvider(dataset_id=dataset_id, version_id=version_id, client=mock_client) + provider = DatasetManagementServiceProvider( + dataset_id=dataset_id, version_id=version_id, client=mock_client + ) return provider.get_dataset(), mock_client, mock_requests def test_get_dataset_predefined(self): @@ -128,7 +130,7 @@ def test_get_dataset_no_download_url_raises(self): "schemaType": "AGENTCORE_EVALUATION_PREDEFINED_V1", } - provider = ServiceDatasetProvider(dataset_id="ds-123", client=mock_client) + provider = DatasetManagementServiceProvider(dataset_id="ds-123", client=mock_client) with pytest.raises(ValueError, match="no downloadUrl"): provider.get_dataset() @@ -147,7 +149,7 @@ def test_get_dataset_empty_raises(self): with patch(PATCH_REQUESTS) as mock_requests: mock_requests.get.return_value = mock_response - provider = ServiceDatasetProvider(dataset_id="ds-empty", client=mock_client) + provider = DatasetManagementServiceProvider(dataset_id="ds-empty", client=mock_client) with pytest.raises(ValueError, match="scenarios must not be empty"): provider.get_dataset() @@ -160,7 +162,7 @@ def test_unsupported_schema_type_raises(self): "downloadUrl": "https://example.com/dataset.jsonl", } - provider = ServiceDatasetProvider(dataset_id="ds-123", client=mock_client) + provider = DatasetManagementServiceProvider(dataset_id="ds-123", client=mock_client) with pytest.raises(ValueError, match="not supported by the evaluation runners"): provider.get_dataset() @@ -181,6 +183,6 @@ def test_download_failure_raises_runtime_error(self): with patch(PATCH_REQUESTS) as mock_requests: mock_requests.get.return_value = mock_response mock_requests.RequestException = real_requests.RequestException - provider = ServiceDatasetProvider(dataset_id="ds-123", client=mock_client) + provider = DatasetManagementServiceProvider(dataset_id="ds-123", client=mock_client) with pytest.raises(RuntimeError, match="Couldn't download dataset"): provider.get_dataset() diff --git a/tests_integ/evaluation/test_dataset_client.py b/tests_integ/evaluation/test_dataset_client.py index 908055d9..dc8c3dc8 100644 --- a/tests_integ/evaluation/test_dataset_client.py +++ b/tests_integ/evaluation/test_dataset_client.py @@ -74,9 +74,7 @@ def test_create_dataset_and_wait_inline(self): "examples": [ { "scenario_id": "test-scenario-1", - "turns": [ - {"input": "Hello", "expected_response": "Hi there!"} - ], + "turns": [{"input": "Hello", "expected_response": "Hi there!"}], } ] } @@ -129,9 +127,7 @@ def test_add_examples_and_wait(self): "examples": [ { "scenario_id": "test-scenario-2", - "turns": [ - {"input": "What is 2+2?", "expected_response": "4"} - ], + "turns": [{"input": "What is 2+2?", "expected_response": "4"}], } ] } diff --git a/tests_integ/evaluation/test_runners_with_service_dataset.py b/tests_integ/evaluation/test_runners_with_service_dataset.py index 3cc07c2e..daf4619b 100644 --- a/tests_integ/evaluation/test_runners_with_service_dataset.py +++ b/tests_integ/evaluation/test_runners_with_service_dataset.py @@ -1,7 +1,7 @@ -"""Integration tests for OnDemandEvaluationDatasetRunner using ServiceDatasetProvider. +"""Integration tests for OnDemandEvaluationDatasetRunner using DatasetManagementServiceProvider. These tests verify the full pipeline: - ServiceDatasetProvider → Dataset → Runner → Agent Invocation → Evaluation + DatasetManagementServiceProvider → Dataset → Runner → Agent Invocation → Evaluation Required env vars: INTEG_AGENT_RUNTIME_ARN: ARN of a deployed, invokable agent runtime @@ -25,7 +25,7 @@ import pytest from bedrock_agentcore.evaluation.dataset_client import DatasetClient -from bedrock_agentcore.evaluation.runner.dataset_providers import ServiceDatasetProvider +from bedrock_agentcore.evaluation.runner.dataset_providers import DatasetManagementServiceProvider from bedrock_agentcore.evaluation.runner.invoker_types import AgentInvokerInput, AgentInvokerOutput from bedrock_agentcore.evaluation.runner.on_demand.config import EvaluationRunConfig, EvaluatorConfig from bedrock_agentcore.evaluation.runner.on_demand.on_demand_runner import OnDemandEvaluationDatasetRunner @@ -57,7 +57,7 @@ def invoker(input: AgentInvokerInput) -> AgentInvokerOutput: @pytest.mark.integration @pytest.mark.skipif(not RUNTIME_ARN, reason="INTEG_AGENT_RUNTIME_ARN not set") class TestOnDemandRunnerWithServiceDataset: - """OnDemandEvaluationDatasetRunner + ServiceDatasetProvider end-to-end.""" + """OnDemandEvaluationDatasetRunner + DatasetManagementServiceProvider end-to-end.""" @classmethod def setup_class(cls): @@ -106,8 +106,8 @@ def teardown_class(cls): @pytest.mark.order(1) def test_on_demand_runner_executes_scenarios(self): - """OnDemandRunner invokes agent for each scenario from ServiceDatasetProvider.""" - provider = ServiceDatasetProvider( + """OnDemandRunner invokes agent for each scenario from DatasetManagementServiceProvider.""" + provider = DatasetManagementServiceProvider( dataset_id=self.dataset_id, client=self.client, ) diff --git a/tests_integ/evaluation/test_service_dataset_provider.py b/tests_integ/evaluation/test_service_dataset_provider.py index 519589d1..2999f4d8 100644 --- a/tests_integ/evaluation/test_service_dataset_provider.py +++ b/tests_integ/evaluation/test_service_dataset_provider.py @@ -1,6 +1,6 @@ -"""Integration tests for ServiceDatasetProvider. +"""Integration tests for DatasetManagementServiceProvider. -Tests that ServiceDatasetProvider can fetch a dataset from the service +Tests that DatasetManagementServiceProvider can fetch a dataset from the service and return it as an SDK Dataset object usable by OnDemandEvaluationDatasetRunner. Run with: @@ -13,7 +13,7 @@ import pytest from bedrock_agentcore.evaluation.dataset_client import DatasetClient -from bedrock_agentcore.evaluation.runner.dataset_providers import ServiceDatasetProvider +from bedrock_agentcore.evaluation.runner.dataset_providers import DatasetManagementServiceProvider from bedrock_agentcore.evaluation.runner.dataset_types import ( Dataset, PredefinedScenario, @@ -22,8 +22,8 @@ @pytest.mark.integration -class TestServiceDatasetProvider: - """Tests ServiceDatasetProvider with both PREDEFINED and SYNTHETIC schema types.""" +class TestDatasetManagementServiceProvider: + """Tests DatasetManagementServiceProvider with both PREDEFINED and SYNTHETIC schema types.""" @classmethod def setup_class(cls): @@ -102,8 +102,8 @@ def teardown_class(cls): @pytest.mark.order(1) def test_get_predefined_dataset(self): - """ServiceDatasetProvider returns a valid SDK Dataset with PredefinedScenarios.""" - provider = ServiceDatasetProvider( + """DatasetManagementServiceProvider returns a valid SDK Dataset with PredefinedScenarios.""" + provider = DatasetManagementServiceProvider( dataset_id=self.predefined_dataset_id, client=self.client, ) @@ -117,7 +117,7 @@ def test_get_predefined_dataset(self): @pytest.mark.order(2) def test_predefined_fields_preserved(self): """Scenario fields (turns, assertions, trajectory) are preserved.""" - provider = ServiceDatasetProvider( + provider = DatasetManagementServiceProvider( dataset_id=self.predefined_dataset_id, client=self.client, ) @@ -132,11 +132,11 @@ def test_predefined_fields_preserved(self): @pytest.mark.order(3) def test_get_predefined_with_version(self): - """ServiceDatasetProvider can fetch a specific version.""" + """DatasetManagementServiceProvider can fetch a specific version.""" versions_resp = self.client.list_dataset_versions(datasetId=self.predefined_dataset_id) version_id = str(versions_resp["versions"][0]["datasetVersion"]) - provider = ServiceDatasetProvider( + provider = DatasetManagementServiceProvider( dataset_id=self.predefined_dataset_id, version_id=version_id, client=self.client, @@ -150,8 +150,8 @@ def test_get_predefined_with_version(self): @pytest.mark.order(4) def test_get_synthetic_dataset(self): - """ServiceDatasetProvider returns SimulatedScenarios for SYNTHETIC schema.""" - provider = ServiceDatasetProvider( + """DatasetManagementServiceProvider returns SimulatedScenarios for SYNTHETIC schema.""" + provider = DatasetManagementServiceProvider( dataset_id=self.synthetic_dataset_id, client=self.client, ) From 51f10554ca7845cff50bf3464157cbe9074fa9e3 Mon Sep 17 00:00:00 2001 From: "T.J Ariyawansa" Date: Wed, 13 May 2026 13:43:22 -0400 Subject: [PATCH 16/18] fix: Removed delete_dataset_version api from allowlist --- src/bedrock_agentcore/evaluation/dataset_client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/bedrock_agentcore/evaluation/dataset_client.py b/src/bedrock_agentcore/evaluation/dataset_client.py index 07469e21..c899bad1 100644 --- a/src/bedrock_agentcore/evaluation/dataset_client.py +++ b/src/bedrock_agentcore/evaluation/dataset_client.py @@ -46,7 +46,6 @@ class DatasetClient: # Version management "create_dataset_version", "list_dataset_versions", - "delete_dataset_version", # Examples management "add_dataset_examples", "update_dataset_examples", From 8307342192ca7cb33e129efd455117ddad9e7435 Mon Sep 17 00:00:00 2001 From: "T.J Ariyawansa" Date: Wed, 13 May 2026 16:42:46 -0400 Subject: [PATCH 17/18] feat(evaluation): support JSONL files in FileDatasetProvider Dispatch on file extension: paths ending in .jsonl are read line-by-line (one scenario per line). All other paths keep the existing {"scenarios": [...]} JSON shape. Adds 8 unit tests covering predefined/simulated/mixed JSONL content, blank-line tolerance, malformed lines, and extension dispatch. --- .../evaluation/runner/dataset_providers.py | 21 ++-- .../evaluation/test_dataset_parser.py | 95 +++++++++++++++++++ 2 files changed, 110 insertions(+), 6 deletions(-) diff --git a/src/bedrock_agentcore/evaluation/runner/dataset_providers.py b/src/bedrock_agentcore/evaluation/runner/dataset_providers.py index d7ad5598..081d626e 100644 --- a/src/bedrock_agentcore/evaluation/runner/dataset_providers.py +++ b/src/bedrock_agentcore/evaluation/runner/dataset_providers.py @@ -59,17 +59,26 @@ def get_dataset(self) -> Dataset: class FileDatasetProvider(DatasetProvider): - """A dataset provider that loads a Dataset from a JSON file.""" + """A dataset provider that loads a Dataset from a JSON or JSONL file. + + JSON format: {"scenarios": [{...}, {...}]} + JSONL format: one scenario JSON object per line. + Format is selected by file extension (".jsonl" → JSONL, otherwise JSON). + """ def __init__(self, file_path: str): - """Initialize with a path to a JSON dataset file.""" + """Initialize with a path to a JSON or JSONL dataset file.""" self._file_path = file_path def get_dataset(self) -> Dataset: - """Load and return the dataset from the JSON file.""" - with open(self._file_path) as f: - data = json.load(f) - scenarios: List[Scenario] = [_parse_scenario(s) for s in data["scenarios"]] + """Load and return the dataset from the file.""" + if self._file_path.endswith(".jsonl"): + with open(self._file_path) as f: + raw_examples = [json.loads(line) for line in f if line.strip()] + else: + with open(self._file_path) as f: + raw_examples = json.load(f)["scenarios"] + scenarios: List[Scenario] = [_parse_scenario(s) for s in raw_examples] return Dataset(scenarios=scenarios) diff --git a/tests/bedrock_agentcore/evaluation/test_dataset_parser.py b/tests/bedrock_agentcore/evaluation/test_dataset_parser.py index da605375..5a39c97d 100644 --- a/tests/bedrock_agentcore/evaluation/test_dataset_parser.py +++ b/tests/bedrock_agentcore/evaluation/test_dataset_parser.py @@ -167,3 +167,98 @@ def test_parse_mixed_predefined_and_simulated(self, tmp_path): assert len(dataset.scenarios) == 2 assert isinstance(dataset.scenarios[0], PredefinedScenario) assert isinstance(dataset.scenarios[1], SimulatedScenario) + + +class TestFileDatasetProviderJsonl: + def _write_jsonl_and_load(self, tmp_path, scenarios): + file_path = tmp_path / "dataset.jsonl" + file_path.write_text("\n".join(json.dumps(s) for s in scenarios)) + return FileDatasetProvider(str(file_path)).get_dataset() + + def test_parse_jsonl_predefined(self, tmp_path): + scenarios = [ + { + "scenario_id": "s1", + "turns": [{"input": "Hello", "expected_response": "Hi"}], + }, + { + "scenario_id": "s2", + "turns": [{"input": "What is 2+2?"}], + }, + ] + dataset = self._write_jsonl_and_load(tmp_path, scenarios) + assert isinstance(dataset, Dataset) + assert len(dataset.scenarios) == 2 + assert all(isinstance(s, PredefinedScenario) for s in dataset.scenarios) + assert dataset.scenarios[0].scenario_id == "s1" + assert dataset.scenarios[0].turns[0].expected_response == "Hi" + + def test_parse_jsonl_simulated(self, tmp_path): + scenarios = [ + { + "scenario_id": "sim-1", + "scenario_description": "Customer orders pizza", + "actor_profile": {"context": "ctx", "goal": "Order a pizza"}, + "input": "I'd like a pizza", + "max_turns": 5, + } + ] + dataset = self._write_jsonl_and_load(tmp_path, scenarios) + assert len(dataset.scenarios) == 1 + scenario = dataset.scenarios[0] + assert isinstance(scenario, SimulatedScenario) + assert scenario.actor_profile.goal == "Order a pizza" + assert scenario.max_turns == 5 + + def test_parse_jsonl_mixed(self, tmp_path): + scenarios = [ + {"scenario_id": "pre-1", "turns": [{"input": "hi"}]}, + { + "scenario_id": "sim-1", + "scenario_description": "desc", + "actor_profile": {"context": "ctx", "goal": "goal"}, + "input": "hello", + }, + ] + dataset = self._write_jsonl_and_load(tmp_path, scenarios) + assert len(dataset.scenarios) == 2 + assert isinstance(dataset.scenarios[0], PredefinedScenario) + assert isinstance(dataset.scenarios[1], SimulatedScenario) + + def test_parse_jsonl_skips_blank_lines(self, tmp_path): + file_path = tmp_path / "dataset.jsonl" + file_path.write_text( + '{"scenario_id": "s1", "turns": [{"input": "hi"}]}\n' + "\n" + " \n" + '{"scenario_id": "s2", "turns": [{"input": "bye"}]}\n' + ) + dataset = FileDatasetProvider(str(file_path)).get_dataset() + assert len(dataset.scenarios) == 2 + assert {s.scenario_id for s in dataset.scenarios} == {"s1", "s2"} + + def test_parse_jsonl_empty_file_raises(self, tmp_path): + file_path = tmp_path / "empty.jsonl" + file_path.write_text("") + with pytest.raises(ValueError, match="scenarios must not be empty"): + FileDatasetProvider(str(file_path)).get_dataset() + + def test_parse_jsonl_invalid_line_raises(self, tmp_path): + file_path = tmp_path / "bad.jsonl" + file_path.write_text('{"scenario_id": "s1", "turns": [{"input": "hi"}]}\nnot json\n') + provider = FileDatasetProvider(str(file_path)) + with pytest.raises(json.JSONDecodeError): + provider.get_dataset() + + def test_parse_jsonl_missing_required_field_raises(self, tmp_path): + scenarios = [{"scenario_id": "s1"}] + with pytest.raises(ValueError): + self._write_jsonl_and_load(tmp_path, scenarios) + + def test_jsonl_extension_dispatch(self, tmp_path): + # A .json file with one-line-per-object content should NOT be parsed as JSONL. + file_path = tmp_path / "looks-like-jsonl.json" + file_path.write_text('{"scenario_id": "s1", "turns": [{"input": "hi"}]}\n') + # Without "scenarios" wrapper, JSON parse yields a dict that fails the lookup. + with pytest.raises(KeyError): + FileDatasetProvider(str(file_path)).get_dataset() From 97a2448d71e879c5696247f7f953f0c01d5bdaee Mon Sep 17 00:00:00 2001 From: "T.J Ariyawansa" Date: Wed, 20 May 2026 18:00:20 -0400 Subject: [PATCH 18/18] fix(evaluation): handle version-specific deletes in delete_dataset_and_wait MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A version-specific delete (datasetVersion provided) does not remove the dataset itself — it transitions the dataset to UPDATING and back to ACTIVE. The previous waiter polled for ResourceNotFoundException and timed out. Branch on whether datasetVersion is passed: - Without datasetVersion: poll until ResourceNotFoundException (DELETE_FAILED) - With datasetVersion: poll until ACTIVE (UPDATE_FAILED), return dataset dict Add unit tests for both version-delete paths (success + UPDATE_FAILED) and an integ test that creates two versions, deletes the oldest via delete_dataset_and_wait, and verifies the dataset stays ACTIVE. --- .../evaluation/dataset_client.py | 26 ++++++++-- .../evaluation/test_dataset_client.py | 31 +++++++++++ tests_integ/evaluation/test_dataset_client.py | 52 ++++++++++++++++++- 3 files changed, 103 insertions(+), 6 deletions(-) diff --git a/src/bedrock_agentcore/evaluation/dataset_client.py b/src/bedrock_agentcore/evaluation/dataset_client.py index c899bad1..2442f774 100644 --- a/src/bedrock_agentcore/evaluation/dataset_client.py +++ b/src/bedrock_agentcore/evaluation/dataset_client.py @@ -132,23 +132,41 @@ def delete_dataset_and_wait( self, wait_config: Optional[WaitConfig] = None, **kwargs, - ) -> None: - """Delete a dataset and wait for deletion to complete. + ) -> Optional[Dict[str, Any]]: + """Delete a dataset (or a single version) and wait for completion. + + - Full delete (no ``datasetVersion``): polls until ``GetDataset`` + raises ``ResourceNotFoundException``. Fails on ``DELETE_FAILED``. + - Version-specific delete (``datasetVersion`` provided): the dataset + itself is not removed. Polls ``GetDataset`` until status returns to + ``ACTIVE``. Fails on ``UPDATE_FAILED``. Returns the dataset details. Args: wait_config: Optional WaitConfig for polling behavior. **kwargs: Arguments forwarded to the delete_dataset API. Raises: - TimeoutError: If the dataset isn't deleted within max_wait. + RuntimeError: On ``DELETE_FAILED`` or ``UPDATE_FAILED``. + TimeoutError: If the operation does not finish within ``max_wait``. """ - response = self._cp_client.delete_dataset(**convert_kwargs(kwargs)) + converted = convert_kwargs(kwargs) + response = self._cp_client.delete_dataset(**converted) dataset_id = response["datasetId"] + + if "datasetVersion" in converted: + return wait_until( + lambda: self._cp_client.get_dataset(datasetId=dataset_id), + "ACTIVE", + {"UPDATE_FAILED"}, + wait_config, + ) + wait_until_deleted( lambda: self._cp_client.get_dataset(datasetId=dataset_id), failed={"DELETE_FAILED"}, wait_config=wait_config, ) + return None def create_dataset_version_and_wait( self, diff --git a/tests/bedrock_agentcore/evaluation/test_dataset_client.py b/tests/bedrock_agentcore/evaluation/test_dataset_client.py index da93855d..c51523c8 100644 --- a/tests/bedrock_agentcore/evaluation/test_dataset_client.py +++ b/tests/bedrock_agentcore/evaluation/test_dataset_client.py @@ -132,6 +132,37 @@ def test_delete_dataset_and_wait_success(self, client_and_cp): mock_cp.delete_dataset.assert_called_once_with(datasetId="ds-123") + def test_delete_dataset_version_and_wait_returns_active(self, client_and_cp): + client, mock_cp = client_and_cp + mock_cp.delete_dataset.return_value = {"datasetId": "ds-123"} + mock_cp.get_dataset.return_value = {"datasetId": "ds-123", "status": "ACTIVE"} + + result = client.delete_dataset_and_wait( + wait_config=WaitConfig(max_wait=10, poll_interval=1), + datasetId="ds-123", + datasetVersion="1", + ) + + assert result is not None + assert result["status"] == "ACTIVE" + mock_cp.delete_dataset.assert_called_once_with(datasetId="ds-123", datasetVersion="1") + + def test_delete_dataset_version_and_wait_failure(self, client_and_cp): + client, mock_cp = client_and_cp + mock_cp.delete_dataset.return_value = {"datasetId": "ds-123"} + mock_cp.get_dataset.return_value = { + "datasetId": "ds-123", + "status": "UPDATE_FAILED", + "statusReasons": "Workflow failed", + } + + with pytest.raises(RuntimeError, match="UPDATE_FAILED"): + client.delete_dataset_and_wait( + wait_config=WaitConfig(max_wait=5, poll_interval=1), + datasetId="ds-123", + datasetVersion="1", + ) + class TestDatasetClientVersionAndWait: def test_create_version_and_wait_success(self, client_and_cp): diff --git a/tests_integ/evaluation/test_dataset_client.py b/tests_integ/evaluation/test_dataset_client.py index dc8c3dc8..6fa0f73b 100644 --- a/tests_integ/evaluation/test_dataset_client.py +++ b/tests_integ/evaluation/test_dataset_client.py @@ -164,12 +164,60 @@ def test_list_dataset_versions(self): assert len(response["versions"]) >= 1 @pytest.mark.order(13) + def test_delete_dataset_version_and_wait(self): + """Version-specific delete returns the dataset to ACTIVE, dataset still exists. + + Requires at least 2 versions. Creates a second version first so the deletion + does not target the last remaining version (which the service rejects). + """ + if not self.dataset_ids: + pytest.skip("prerequisite test did not create dataset") + did = self.dataset_ids[0] + + # Make a second version so we have something we can safely delete. + self.client.add_examples_and_wait( + datasetId=did, + source={ + "inlineExamples": { + "examples": [ + { + "scenario_id": "version-delete-fixture", + "turns": [{"input": "filler", "expected_response": "ok"}], + } + ] + } + }, + ) + self.client.create_dataset_version_and_wait(datasetId=did) + + versions_before = self.client.list_dataset_versions(datasetId=did)["versions"] + assert len(versions_before) >= 2, "need >= 2 versions for version-specific delete" + # Delete the oldest version to keep the most recent one. + target_version = str(min(int(v["datasetVersion"]) for v in versions_before)) + + result = self.client.delete_dataset_and_wait(datasetId=did, datasetVersion=target_version) + + # Version-specific delete returns the dataset (not None) at ACTIVE. + assert result is not None + assert result["status"] == "ACTIVE" + + # Dataset itself is still present. + ds = self.client.get_dataset(datasetId=did) + assert ds["datasetId"] == did + + # Target version is gone. + versions_after = self.client.list_dataset_versions(datasetId=did)["versions"] + remaining = [str(v["datasetVersion"]) for v in versions_after] + assert target_version not in remaining + + @pytest.mark.order(14) def test_delete_dataset_and_wait(self): - """Delete dataset and wait for removal.""" + """Full delete (no datasetVersion) removes the dataset.""" if not self.dataset_ids: pytest.skip("prerequisite test did not create dataset") did = self.dataset_ids.pop(0) - self.client.delete_dataset_and_wait(datasetId=did) + result = self.client.delete_dataset_and_wait(datasetId=did) + assert result is None with pytest.raises(ClientError) as exc_info: self.client.get_dataset(datasetId=did) assert exc_info.value.response["Error"]["Code"] == "ResourceNotFoundException"