From 16768af328f37c37451636d04ca646034d830ba0 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 11 May 2026 15:48:18 +0000 Subject: [PATCH 1/4] type `push_data` and `user_data` with `JsonSerializable` instead of `Any` --- pyproject.toml | 2 +- src/crawlee/_types.py | 12 +++++----- src/crawlee/_utils/file.py | 8 +++---- .../_abstract_http/_abstract_http_crawler.py | 6 ++--- src/crawlee/crawlers/_basic/_basic_crawler.py | 6 ++--- .../_playwright/_playwright_crawler.py | 3 ++- src/crawlee/sessions/_models.py | 4 +++- src/crawlee/sessions/_session.py | 8 ++++--- .../storage_clients/_base/_dataset_client.py | 22 +++++++++++++++---- .../_file_system/_dataset_client.py | 12 +++++----- .../_memory/_dataset_client.py | 15 ++++++++----- .../storage_clients/_redis/_dataset_client.py | 8 ++++--- .../storage_clients/_sql/_dataset_client.py | 8 ++++--- src/crawlee/storage_clients/models.py | 7 ++++-- src/crawlee/storages/_dataset.py | 10 ++++----- uv.lock | 2 +- 16 files changed, 82 insertions(+), 51 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 699d68dffe..6d56a9fec9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,7 @@ dependencies = [ "pydantic>=2.11.0", "pyee>=9.0.0", "tldextract>=5.1.0", - "typing-extensions>=4.1.0", + "typing-extensions>=4.10.0", "yarl>=1.18.0", ] diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index 1e35510340..16fd9e18b4 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -198,7 +198,7 @@ class PushDataKwargs(TypedDict): class PushDataFunctionCall(PushDataKwargs): - data: list[dict[str, Any]] | dict[str, Any] + data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable] dataset_id: str | None dataset_name: str | None dataset_alias: str | None @@ -300,7 +300,7 @@ async def add_requests( async def push_data( self, - data: list[dict[str, Any]] | dict[str, Any], + data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable], dataset_id: str | None = None, dataset_name: str | None = None, dataset_alias: str | None = None, @@ -392,7 +392,7 @@ def __call__( selector: str | None = None, attribute: str | None = None, label: str | None = None, - user_data: dict[str, Any] | None = None, + user_data: dict[str, JsonSerializable] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, rq_id: str | None = None, rq_name: str | None = None, @@ -417,7 +417,7 @@ def __call__( selector: str | None = None, attribute: str | None = None, label: str | None = None, - user_data: dict[str, Any] | None = None, + user_data: dict[str, JsonSerializable] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, requests: Sequence[str | Request] | None = None, rq_id: str | None = None, @@ -465,7 +465,7 @@ def __call__( selector: str = 'a', attribute: str = 'href', label: str | None = None, - user_data: dict[str, Any] | None = None, + user_data: dict[str, JsonSerializable] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> Coroutine[None, None, list[Request]]: @@ -543,7 +543,7 @@ class PushDataFunction(Protocol): def __call__( self, - data: list[dict[str, Any]] | dict[str, Any], + data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable], dataset_id: str | None = None, dataset_name: str | None = None, dataset_alias: str | None = None, diff --git a/src/crawlee/_utils/file.py b/src/crawlee/_utils/file.py index 1d297fa724..b3ee5662f4 100644 --- a/src/crawlee/_utils/file.py +++ b/src/crawlee/_utils/file.py @@ -10,12 +10,12 @@ from typing import TYPE_CHECKING, overload if TYPE_CHECKING: - from collections.abc import AsyncIterator + from collections.abc import AsyncIterator, Mapping from typing import Any, TextIO from typing_extensions import Unpack - from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs + from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs, JsonSerializable if sys.platform == 'win32': @@ -150,7 +150,7 @@ async def atomic_write( async def export_json_to_stream( - iterator: AsyncIterator[dict[str, Any]], + iterator: AsyncIterator[Mapping[str, JsonSerializable]], dst: TextIO, **kwargs: Unpack[ExportDataJsonKwargs], ) -> None: @@ -159,7 +159,7 @@ async def export_json_to_stream( async def export_csv_to_stream( - iterator: AsyncIterator[dict[str, Any]], + iterator: AsyncIterator[Mapping[str, JsonSerializable]], dst: TextIO, **kwargs: Unpack[ExportDataCsvKwargs], ) -> None: diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 56046f0b64..c2fe8f5c74 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -4,7 +4,7 @@ import logging from abc import ABC from datetime import timedelta -from typing import TYPE_CHECKING, Any, Generic +from typing import TYPE_CHECKING, Generic from more_itertools import partition from pydantic import ValidationError @@ -26,7 +26,7 @@ from typing_extensions import Unpack from crawlee import RequestTransformAction - from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, ExtractLinksFunction + from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, ExtractLinksFunction, JsonSerializable from ._abstract_http_parser import AbstractHttpParser @@ -200,7 +200,7 @@ async def extract_links( selector: str = 'a', attribute: str = 'href', label: str | None = None, - user_data: dict[str, Any] | None = None, + user_data: dict[str, JsonSerializable] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, **kwargs: Unpack[EnqueueLinksKwargs], diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 1c79c4eadd..207a1968ad 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -80,7 +80,7 @@ if TYPE_CHECKING: import re - from collections.abc import Iterator + from collections.abc import Iterator, Mapping from contextlib import AbstractAsyncContextManager from crawlee._types import ( @@ -941,7 +941,7 @@ async def export_data( async def _push_data( self, - data: list[dict[str, Any]] | dict[str, Any], + data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable], dataset_id: str | None = None, dataset_name: str | None = None, dataset_alias: str | None = None, @@ -1015,7 +1015,7 @@ async def enqueue_links( selector: str | None = None, attribute: str | None = None, label: str | None = None, - user_data: dict[str, Any] | None = None, + user_data: dict[str, JsonSerializable] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, requests: Sequence[str | Request] | None = None, diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 19b98c79c3..73e4d26d36 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -53,6 +53,7 @@ HttpHeaders, HttpMethod, HttpPayload, + JsonSerializable, ) from crawlee.browsers._types import BrowserType @@ -384,7 +385,7 @@ async def extract_links( selector: str = 'a', attribute: str = 'href', label: str | None = None, - user_data: dict | None = None, + user_data: dict[str, JsonSerializable] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, **kwargs: Unpack[EnqueueLinksKwargs], diff --git a/src/crawlee/sessions/_models.py b/src/crawlee/sessions/_models.py index 2f5b4a0483..c2e17ad439 100644 --- a/src/crawlee/sessions/_models.py +++ b/src/crawlee/sessions/_models.py @@ -13,6 +13,8 @@ computed_field, ) +from crawlee._types import JsonSerializable + from ._cookies import CookieParam from ._session import Session @@ -24,7 +26,7 @@ class SessionModel(BaseModel): id: Annotated[str, Field(alias='id')] max_age: Annotated[timedelta, Field(alias='maxAge')] - user_data: Annotated[dict, Field(alias='userData')] + user_data: Annotated[dict[str, JsonSerializable], Field(alias='userData')] max_error_score: Annotated[float, Field(alias='maxErrorScore')] error_score_decrement: Annotated[float, Field(alias='errorScoreDecrement')] created_at: Annotated[datetime, Field(alias='createdAt')] diff --git a/src/crawlee/sessions/_session.py b/src/crawlee/sessions/_session.py index b36d1b4970..6663d43cfe 100644 --- a/src/crawlee/sessions/_session.py +++ b/src/crawlee/sessions/_session.py @@ -11,8 +11,10 @@ from crawlee.sessions._cookies import CookieParam, SessionCookies if TYPE_CHECKING: + from collections.abc import Mapping, MutableMapping from http.cookiejar import CookieJar + from crawlee._types import JsonSerializable from crawlee.sessions._models import SessionModel logger = getLogger(__name__) @@ -36,7 +38,7 @@ def __init__( *, id: str | None = None, max_age: timedelta = timedelta(minutes=50), - user_data: dict | None = None, + user_data: Mapping[str, JsonSerializable] | None = None, max_error_score: float = 3.0, error_score_decrement: float = 0.5, created_at: datetime | None = None, @@ -63,7 +65,7 @@ def __init__( """ self._id = id or crypto_random_object_id(length=10) self._max_age = max_age - self._user_data = user_data or {} + self._user_data: dict[str, JsonSerializable] = dict(user_data) if user_data is not None else {} self._max_error_score = max_error_score self._error_score_decrement = error_score_decrement self._created_at = created_at or datetime.now(timezone.utc) @@ -117,7 +119,7 @@ def id(self) -> str: return self._id @property - def user_data(self) -> dict: + def user_data(self) -> MutableMapping[str, JsonSerializable]: """Get the user data.""" return self._user_data diff --git a/src/crawlee/storage_clients/_base/_dataset_client.py b/src/crawlee/storage_clients/_base/_dataset_client.py index eb1099708d..1a9ea11b40 100644 --- a/src/crawlee/storage_clients/_base/_dataset_client.py +++ b/src/crawlee/storage_clients/_base/_dataset_client.py @@ -4,12 +4,20 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from collections.abc import AsyncIterator - from typing import Any + from collections.abc import AsyncIterator, Mapping, Sequence + from typing_extensions import TypeIs + + from crawlee._types import JsonSerializable from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata +def _is_list_of_items( + data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable], +) -> TypeIs[Sequence[Mapping[str, JsonSerializable]]]: + return isinstance(data, list) + + class DatasetClient(ABC): """An abstract class for dataset storage clients. @@ -42,7 +50,7 @@ async def purge(self) -> None: """ @abstractmethod - async def push_data(self, data: list[Any] | dict[str, Any]) -> None: + async def push_data(self, data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]) -> None: """Push data to the dataset. The backend method for the `Dataset.push_data` call. @@ -82,7 +90,7 @@ async def iterate_items( unwind: list[str] | None = None, skip_empty: bool = False, skip_hidden: bool = False, - ) -> AsyncIterator[dict[str, Any]]: + ) -> AsyncIterator[Mapping[str, JsonSerializable]]: """Iterate over the dataset items with filtering options. The backend method for the `Dataset.iterate_items` call. @@ -91,3 +99,9 @@ async def iterate_items( raise NotImplementedError if False: yield {} + + @staticmethod + def _is_list_of_items( + data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable], + ) -> TypeIs[Sequence[Mapping[str, JsonSerializable]]]: + return isinstance(data, list) diff --git a/src/crawlee/storage_clients/_file_system/_dataset_client.py b/src/crawlee/storage_clients/_file_system/_dataset_client.py index b970a98928..38c0ede0f1 100644 --- a/src/crawlee/storage_clients/_file_system/_dataset_client.py +++ b/src/crawlee/storage_clients/_file_system/_dataset_client.py @@ -3,6 +3,7 @@ import asyncio import json import shutil +from collections.abc import Mapping from datetime import datetime, timezone from logging import getLogger from pathlib import Path @@ -12,6 +13,7 @@ from typing_extensions import Self, override from crawlee._consts import METADATA_FILENAME +from crawlee._types import JsonSerializable from crawlee._utils.crypto import crypto_random_object_id from crawlee._utils.file import atomic_write, json_dumps from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs @@ -19,7 +21,7 @@ from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata if TYPE_CHECKING: - from collections.abc import AsyncIterator + from collections.abc import AsyncIterator, Sequence from crawlee.configuration import Configuration @@ -220,10 +222,10 @@ async def purge(self) -> None: ) @override - async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None: + async def push_data(self, data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]) -> None: async with self._lock: new_item_count = self._metadata.item_count - if isinstance(data, list): + if self._is_list_of_items(data): for item in data: new_item_count += 1 await self._push_item(item, new_item_count) @@ -304,7 +306,7 @@ async def get_data( selected_files = selected_files[:limit] # Read and parse each data file. - items = list[dict[str, Any]]() + items = list[Mapping[str, JsonSerializable]]() for file_path in selected_files: try: file_content = await asyncio.to_thread(file_path.read_text, encoding='utf-8') @@ -441,7 +443,7 @@ async def _update_metadata( data = await json_dumps(self._metadata.model_dump()) await atomic_write(self.path_to_metadata, data) - async def _push_item(self, item: dict[str, Any], item_id: int) -> None: + async def _push_item(self, item: Mapping[str, JsonSerializable], item_id: int) -> None: """Push a single item to the dataset. This method writes the item as a JSON file with a zero-padded numeric filename diff --git a/src/crawlee/storage_clients/_memory/_dataset_client.py b/src/crawlee/storage_clients/_memory/_dataset_client.py index 67abc6f6dc..f98e1f5296 100644 --- a/src/crawlee/storage_clients/_memory/_dataset_client.py +++ b/src/crawlee/storage_clients/_memory/_dataset_client.py @@ -1,18 +1,21 @@ from __future__ import annotations +from collections.abc import Mapping from datetime import datetime, timezone from logging import getLogger from typing import TYPE_CHECKING, Any from typing_extensions import Self, override +from crawlee._types import JsonSerializable from crawlee._utils.crypto import crypto_random_object_id from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs from crawlee.storage_clients._base import DatasetClient from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata if TYPE_CHECKING: - from collections.abc import AsyncIterator + from collections.abc import AsyncIterator, Sequence + logger = getLogger(__name__) @@ -41,7 +44,7 @@ def __init__( """ self._metadata = metadata - self._records = list[dict[str, Any]]() + self._records = list[Mapping[str, JsonSerializable]]() """List to hold dataset items. Each item is a dictionary representing a record.""" @override @@ -113,11 +116,11 @@ async def purge(self) -> None: ) @override - async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None: + async def push_data(self, data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]) -> None: metadata = await self.get_metadata() new_item_count = metadata.item_count - if isinstance(data, list): + if self._is_list_of_items(data): for item in data: new_item_count += 1 await self._push_item(item) @@ -203,7 +206,7 @@ async def iterate_items( unwind: list[str] | None = None, skip_empty: bool = False, skip_hidden: bool = False, - ) -> AsyncIterator[dict[str, Any]]: + ) -> AsyncIterator[Mapping[str, JsonSerializable]]: # Check for unsupported arguments and log a warning if found unsupported_args: dict[str, Any] = { 'clean': clean, @@ -260,7 +263,7 @@ async def _update_metadata( if new_item_count is not None: self._metadata.item_count = new_item_count - async def _push_item(self, item: dict[str, Any]) -> None: + async def _push_item(self, item: Mapping[str, JsonSerializable]) -> None: """Push a single item to the dataset. Args: diff --git a/src/crawlee/storage_clients/_redis/_dataset_client.py b/src/crawlee/storage_clients/_redis/_dataset_client.py index db2b6375c6..5777dc2cd5 100644 --- a/src/crawlee/storage_clients/_redis/_dataset_client.py +++ b/src/crawlee/storage_clients/_redis/_dataset_client.py @@ -14,11 +14,13 @@ from ._utils import await_redis_response if TYPE_CHECKING: - from collections.abc import AsyncIterator + from collections.abc import AsyncIterator, Mapping, Sequence from redis.asyncio import Redis from redis.asyncio.client import Pipeline + from crawlee._types import JsonSerializable + logger = getLogger(__name__) @@ -126,8 +128,8 @@ async def purge(self) -> None: @retry_on_error(RedisError) @override - async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None: - if isinstance(data, dict): + async def push_data(self, data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]) -> None: + if not self._is_list_of_items(data): data = [data] async with self._get_pipeline() as pipe: diff --git a/src/crawlee/storage_clients/_sql/_dataset_client.py b/src/crawlee/storage_clients/_sql/_dataset_client.py index d17031ca89..239ac16623 100644 --- a/src/crawlee/storage_clients/_sql/_dataset_client.py +++ b/src/crawlee/storage_clients/_sql/_dataset_client.py @@ -17,12 +17,14 @@ from ._db_models import DatasetItemDb, DatasetMetadataBufferDb, DatasetMetadataDb if TYPE_CHECKING: - from collections.abc import AsyncIterator + from collections.abc import AsyncIterator, Mapping, Sequence from sqlalchemy import Select from sqlalchemy.ext.asyncio import AsyncSession from typing_extensions import NotRequired + from crawlee._types import JsonSerializable + from ._storage_client import SqlStorageClient @@ -144,8 +146,8 @@ async def purge(self) -> None: @retry_on_error(SQLAlchemyError) @override - async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None: - if not isinstance(data, list): + async def push_data(self, data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]) -> None: + if not self._is_list_of_items(data): data = [data] db_items = [{'dataset_id': self._id, 'data': item} for item in data] diff --git a/src/crawlee/storage_clients/models.py b/src/crawlee/storage_clients/models.py index 2ebd65914d..1725340a53 100644 --- a/src/crawlee/storage_clients/models.py +++ b/src/crawlee/storage_clients/models.py @@ -6,10 +6,13 @@ from pydantic import BaseModel, BeforeValidator, ConfigDict, Field from typing_extensions import TypeVar -from crawlee._types import HttpMethod +from crawlee._types import HttpMethod, JsonSerializable from crawlee._utils.docs import docs_group from crawlee._utils.urls import validate_http_url +if TYPE_CHECKING: + from collections.abc import Mapping, Sequence + KvsValueType = TypeVar('KvsValueType', default=Any) @@ -129,7 +132,7 @@ class DatasetItemsListPage(BaseModel): # Workaround for Pydantic and type checkers when using Annotated with default_factory if TYPE_CHECKING: - items: list[dict] = [] + items: Sequence[Mapping[str, JsonSerializable]] = [] """The list of dataset items returned on this page.""" else: items: Annotated[list[dict], Field(default_factory=list)] diff --git a/src/crawlee/storages/_dataset.py b/src/crawlee/storages/_dataset.py index a0436aa576..67c201cfd4 100644 --- a/src/crawlee/storages/_dataset.py +++ b/src/crawlee/storages/_dataset.py @@ -15,12 +15,12 @@ from ._utils import validate_storage_name if TYPE_CHECKING: - from collections.abc import AsyncIterator + from collections.abc import AsyncIterator, Mapping, Sequence from typing import Any, Literal from typing_extensions import Unpack - from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs + from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs, JsonSerializable from crawlee.configuration import Configuration from crawlee.storage_clients import StorageClient from crawlee.storage_clients._base import DatasetClient @@ -134,7 +134,7 @@ async def drop(self) -> None: async def purge(self) -> None: await self._client.purge() - async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None: + async def push_data(self, data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]) -> None: """Store an object or an array of objects to the dataset. The size of the data is limited by the receiving API and therefore `push_data()` will only @@ -210,7 +210,7 @@ async def iterate_items( unwind: list[str] | None = None, skip_empty: bool = False, skip_hidden: bool = False, - ) -> AsyncIterator[dict[str, Any]]: + ) -> AsyncIterator[Mapping[str, JsonSerializable]]: """Iterate over items in the dataset according to specified filters and sorting. This method allows for asynchronously iterating through dataset items while applying various filters such as @@ -258,7 +258,7 @@ async def list_items( unwind: list[str] | None = None, skip_empty: bool = False, skip_hidden: bool = False, - ) -> list[dict[str, Any]]: + ) -> list[Mapping[str, JsonSerializable]]: """Retrieve a list of all items from the dataset according to specified filters and sorting. This method collects all dataset items into a list while applying various filters such as diff --git a/uv.lock b/uv.lock index a9e945e853..b63298c0c7 100644 --- a/uv.lock +++ b/uv.lock @@ -956,7 +956,7 @@ requires-dist = [ { name = "sqlalchemy", extras = ["asyncio"], marker = "extra == 'sql-sqlite'", specifier = ">=2.0.0,<3.0.0" }, { name = "tldextract", specifier = ">=5.1.0" }, { name = "typer", marker = "extra == 'cli'", specifier = ">=0.12.0" }, - { name = "typing-extensions", specifier = ">=4.1.0" }, + { name = "typing-extensions", specifier = ">=4.10.0" }, { name = "wrapt", marker = "extra == 'otel'", specifier = ">=1.17.0" }, { name = "yarl", specifier = ">=1.18.0" }, ] From 71a8fba5901cdefe9fd85a0e75eb21da02a0daee Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 13 May 2026 12:30:42 +0000 Subject: [PATCH 2/4] update `JsonSerializable` type --- src/crawlee/_types.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index 16fd9e18b4..d999149e7e 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -27,9 +27,7 @@ from crawlee.storage_clients import StorageClient from crawlee.storages import KeyValueStore - # Workaround for https://github.com/pydantic/pydantic/issues/9445 - J = TypeVar('J', bound='JsonSerializable') - JsonSerializable = list[J] | dict[str, J] | str | bool | int | float | None + JsonSerializable = dict[str, 'JsonSerializable'] | list['JsonSerializable'] | str | int | float | bool | None else: from pydantic import JsonValue as JsonSerializable From c84fffea5bf458d6f5ef7c82a125b6caf5661935 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 13 May 2026 19:03:10 +0000 Subject: [PATCH 3/4] remove garbage artefact --- src/crawlee/storage_clients/_base/_dataset_client.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/crawlee/storage_clients/_base/_dataset_client.py b/src/crawlee/storage_clients/_base/_dataset_client.py index 1a9ea11b40..68d3a6a3df 100644 --- a/src/crawlee/storage_clients/_base/_dataset_client.py +++ b/src/crawlee/storage_clients/_base/_dataset_client.py @@ -12,12 +12,6 @@ from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata -def _is_list_of_items( - data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable], -) -> TypeIs[Sequence[Mapping[str, JsonSerializable]]]: - return isinstance(data, list) - - class DatasetClient(ABC): """An abstract class for dataset storage clients. From 413ee179214f44efce04b278cb1bc25c7a4e4aba Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Fri, 15 May 2026 01:15:34 +0000 Subject: [PATCH 4/4] use consistent typing with `JsonSerializable` --- src/crawlee/_request.py | 9 +++++---- src/crawlee/_types.py | 12 ++++++------ .../_abstract_http/_abstract_http_crawler.py | 4 ++-- .../_adaptive_playwright_crawler.py | 13 ++++++++----- src/crawlee/crawlers/_basic/_basic_crawler.py | 8 ++++---- .../crawlers/_playwright/_playwright_crawler.py | 2 +- src/crawlee/sessions/_models.py | 3 ++- src/crawlee/sessions/_session.py | 2 +- .../storage_clients/_base/_dataset_client.py | 7 ++++--- .../storage_clients/_file_system/_dataset_client.py | 4 ++-- .../storage_clients/_memory/_dataset_client.py | 2 +- .../storage_clients/_redis/_dataset_client.py | 6 +++--- src/crawlee/storage_clients/_sql/_dataset_client.py | 4 ++-- src/crawlee/storage_clients/_sql/_db_models.py | 2 +- src/crawlee/storages/_key_value_store.py | 8 ++++---- 15 files changed, 46 insertions(+), 40 deletions(-) diff --git a/src/crawlee/_request.py b/src/crawlee/_request.py index acf37f4cd7..0d9598db8b 100644 --- a/src/crawlee/_request.py +++ b/src/crawlee/_request.py @@ -1,6 +1,6 @@ from __future__ import annotations -from collections.abc import Iterator, MutableMapping +from collections.abc import Iterator, Mapping, MutableMapping from datetime import datetime from enum import IntEnum from typing import TYPE_CHECKING, Annotated, Any, TypedDict, cast @@ -135,7 +135,7 @@ class RequestOptions(TypedDict): keep_url_fragment: NotRequired[bool] use_extended_unique_key: NotRequired[bool] always_enqueue: NotRequired[bool] - user_data: NotRequired[dict[str, JsonSerializable]] + user_data: NotRequired[Mapping[str, JsonSerializable]] no_retry: NotRequired[bool] enqueue_strategy: NotRequired[EnqueueStrategy] max_retries: NotRequired[int | None] @@ -200,7 +200,7 @@ class Request(BaseModel): headers: HttpHeaders = HttpHeaders() """HTTP request headers.""" - user_data: dict[str, JsonSerializable] = {} + user_data: MutableMapping[str, JsonSerializable] = {} """Custom user data assigned to the request. Use this to save any request related data to the request's scope, keeping them accessible on retries, failures etc. """ @@ -209,8 +209,9 @@ class Request(BaseModel): headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)] """HTTP request headers.""" + # Internally, the model contains `UserData`, this is just for convenience user_data: Annotated[ - dict[str, JsonSerializable], # Internally, the model contains `UserData`, this is just for convenience + MutableMapping[str, JsonSerializable], Field(alias='userData', default_factory=UserData), PlainValidator(user_data_adapter.validate_python), PlainSerializer( diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index d999149e7e..511fe33822 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -14,7 +14,7 @@ import json import logging import re - from collections.abc import Awaitable, Coroutine, Sequence + from collections.abc import Awaitable, Coroutine, MutableMapping, Sequence from typing_extensions import NotRequired, Required, Self, Unpack @@ -390,7 +390,7 @@ def __call__( selector: str | None = None, attribute: str | None = None, label: str | None = None, - user_data: dict[str, JsonSerializable] | None = None, + user_data: Mapping[str, JsonSerializable] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, rq_id: str | None = None, rq_name: str | None = None, @@ -415,7 +415,7 @@ def __call__( selector: str | None = None, attribute: str | None = None, label: str | None = None, - user_data: dict[str, JsonSerializable] | None = None, + user_data: Mapping[str, JsonSerializable] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, requests: Sequence[str | Request] | None = None, rq_id: str | None = None, @@ -463,7 +463,7 @@ def __call__( selector: str = 'a', attribute: str = 'href', label: str | None = None, - user_data: dict[str, JsonSerializable] | None = None, + user_data: Mapping[str, JsonSerializable] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> Coroutine[None, None, list[Request]]: @@ -614,8 +614,8 @@ class UseStateFunction(Protocol): def __call__( self, - default_value: dict[str, JsonSerializable] | None = None, - ) -> Coroutine[None, None, dict[str, JsonSerializable]]: + default_value: MutableMapping[str, JsonSerializable] | None = None, + ) -> Coroutine[None, None, MutableMapping[str, JsonSerializable]]: """Call dunder method. Args: diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index c2fe8f5c74..8d15a1d801 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -21,7 +21,7 @@ from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult, TSelectResult if TYPE_CHECKING: - from collections.abc import AsyncGenerator, Awaitable, Callable, Iterator + from collections.abc import AsyncGenerator, Awaitable, Callable, Iterator, Mapping from typing_extensions import Unpack @@ -200,7 +200,7 @@ async def extract_links( selector: str = 'a', attribute: str = 'href', label: str | None = None, - user_data: dict[str, JsonSerializable] | None = None, + user_data: Mapping[str, JsonSerializable] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, **kwargs: Unpack[EnqueueLinksKwargs], diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index bf5f0e6747..0cee23ab2f 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -42,6 +42,7 @@ from ._result_comparator import create_default_comparator if TYPE_CHECKING: + from collections.abc import MutableMapping from types import TracebackType from typing_extensions import Unpack @@ -286,7 +287,7 @@ async def _crawl_one( self, rendering_type: RenderingType, context: BasicCrawlingContext, - state: dict[str, JsonSerializable] | None = None, + state: MutableMapping[str, JsonSerializable] | None = None, ) -> SubCrawlerRun: """Perform a one request crawl with specific context pipeline and return `SubCrawlerRun`. @@ -297,8 +298,8 @@ async def _crawl_one( if state is not None: async def get_input_state( - default_value: dict[str, JsonSerializable] | None = None, # noqa:ARG001 # Intentionally unused arguments. Closure, that generates same output regardless of inputs. - ) -> dict[str, JsonSerializable]: + default_value: MutableMapping[str, JsonSerializable] | None = None, # noqa:ARG001 # Intentionally unused arguments. Closure, that generates same output regardless of inputs. + ) -> MutableMapping[str, JsonSerializable]: return state use_state_function = get_input_state @@ -411,8 +412,10 @@ async def _run_request_handler(self, context: BasicCrawlingContext) -> None: # avoid static crawl to modify the state. # (This static crawl is performed only to evaluate rendering type detection.) kvs = await context.get_key_value_store() - default_value = dict[str, JsonSerializable]() - old_state: dict[str, JsonSerializable] = await kvs.get_value(self._CRAWLEE_STATE_KEY, default_value) + default_value: MutableMapping[str, JsonSerializable] = {} + old_state: MutableMapping[str, JsonSerializable] = await kvs.get_value( + self._CRAWLEE_STATE_KEY, default_value + ) old_state_copy = deepcopy(old_state) pw_run = await self._crawl_one('client only', context=context) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 207a1968ad..be3da6dd27 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -80,7 +80,7 @@ if TYPE_CHECKING: import re - from collections.abc import Iterator, Mapping + from collections.abc import Iterator, Mapping, MutableMapping from contextlib import AbstractAsyncContextManager from crawlee._types import ( @@ -856,8 +856,8 @@ async def add_requests( async def use_state( self, - default_value: dict[str, JsonSerializable] | None = None, - ) -> dict[str, JsonSerializable]: + default_value: MutableMapping[str, JsonSerializable] | None = None, + ) -> MutableMapping[str, JsonSerializable]: kvs = await self.get_key_value_store() return await kvs.get_auto_saved_value(f'{self._CRAWLEE_STATE_KEY}_{self._id}', default_value) @@ -1015,7 +1015,7 @@ async def enqueue_links( selector: str | None = None, attribute: str | None = None, label: str | None = None, - user_data: dict[str, JsonSerializable] | None = None, + user_data: Mapping[str, JsonSerializable] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, requests: Sequence[str | Request] | None = None, diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 73e4d26d36..321c732fa6 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -385,7 +385,7 @@ async def extract_links( selector: str = 'a', attribute: str = 'href', label: str | None = None, - user_data: dict[str, JsonSerializable] | None = None, + user_data: Mapping[str, JsonSerializable] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, **kwargs: Unpack[EnqueueLinksKwargs], diff --git a/src/crawlee/sessions/_models.py b/src/crawlee/sessions/_models.py index c2e17ad439..fa0b64042c 100644 --- a/src/crawlee/sessions/_models.py +++ b/src/crawlee/sessions/_models.py @@ -1,5 +1,6 @@ from __future__ import annotations +from collections.abc import MutableMapping from datetime import datetime, timedelta from typing import Annotated, Any @@ -26,7 +27,7 @@ class SessionModel(BaseModel): id: Annotated[str, Field(alias='id')] max_age: Annotated[timedelta, Field(alias='maxAge')] - user_data: Annotated[dict[str, JsonSerializable], Field(alias='userData')] + user_data: Annotated[MutableMapping[str, JsonSerializable], Field(alias='userData')] max_error_score: Annotated[float, Field(alias='maxErrorScore')] error_score_decrement: Annotated[float, Field(alias='errorScoreDecrement')] created_at: Annotated[datetime, Field(alias='createdAt')] diff --git a/src/crawlee/sessions/_session.py b/src/crawlee/sessions/_session.py index 6663d43cfe..4ff3ee7efa 100644 --- a/src/crawlee/sessions/_session.py +++ b/src/crawlee/sessions/_session.py @@ -65,7 +65,7 @@ def __init__( """ self._id = id or crypto_random_object_id(length=10) self._max_age = max_age - self._user_data: dict[str, JsonSerializable] = dict(user_data) if user_data is not None else {} + self._user_data: MutableMapping[str, JsonSerializable] = dict(user_data) if user_data is not None else {} self._max_error_score = max_error_score self._error_score_decrement = error_score_decrement self._created_at = created_at or datetime.now(timezone.utc) diff --git a/src/crawlee/storage_clients/_base/_dataset_client.py b/src/crawlee/storage_clients/_base/_dataset_client.py index 68d3a6a3df..19b030a675 100644 --- a/src/crawlee/storage_clients/_base/_dataset_client.py +++ b/src/crawlee/storage_clients/_base/_dataset_client.py @@ -1,10 +1,11 @@ from __future__ import annotations from abc import ABC, abstractmethod +from collections.abc import Sequence from typing import TYPE_CHECKING if TYPE_CHECKING: - from collections.abc import AsyncIterator, Mapping, Sequence + from collections.abc import AsyncIterator, Mapping from typing_extensions import TypeIs @@ -95,7 +96,7 @@ async def iterate_items( yield {} @staticmethod - def _is_list_of_items( + def _is_sequence_of_items( data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable], ) -> TypeIs[Sequence[Mapping[str, JsonSerializable]]]: - return isinstance(data, list) + return isinstance(data, Sequence) diff --git a/src/crawlee/storage_clients/_file_system/_dataset_client.py b/src/crawlee/storage_clients/_file_system/_dataset_client.py index 38c0ede0f1..8fce7b4733 100644 --- a/src/crawlee/storage_clients/_file_system/_dataset_client.py +++ b/src/crawlee/storage_clients/_file_system/_dataset_client.py @@ -225,7 +225,7 @@ async def purge(self) -> None: async def push_data(self, data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]) -> None: async with self._lock: new_item_count = self._metadata.item_count - if self._is_list_of_items(data): + if self._is_sequence_of_items(data): for item in data: new_item_count += 1 await self._push_item(item, new_item_count) @@ -352,7 +352,7 @@ async def iterate_items( unwind: list[str] | None = None, skip_empty: bool = False, skip_hidden: bool = False, - ) -> AsyncIterator[dict[str, Any]]: + ) -> AsyncIterator[Mapping[str, JsonSerializable]]: # Check for unsupported arguments and log a warning if found. unsupported_args: dict[str, Any] = { 'clean': clean, diff --git a/src/crawlee/storage_clients/_memory/_dataset_client.py b/src/crawlee/storage_clients/_memory/_dataset_client.py index f98e1f5296..0028292e63 100644 --- a/src/crawlee/storage_clients/_memory/_dataset_client.py +++ b/src/crawlee/storage_clients/_memory/_dataset_client.py @@ -120,7 +120,7 @@ async def push_data(self, data: Sequence[Mapping[str, JsonSerializable]] | Mappi metadata = await self.get_metadata() new_item_count = metadata.item_count - if self._is_list_of_items(data): + if self._is_sequence_of_items(data): for item in data: new_item_count += 1 await self._push_item(item) diff --git a/src/crawlee/storage_clients/_redis/_dataset_client.py b/src/crawlee/storage_clients/_redis/_dataset_client.py index 5777dc2cd5..f2383b47de 100644 --- a/src/crawlee/storage_clients/_redis/_dataset_client.py +++ b/src/crawlee/storage_clients/_redis/_dataset_client.py @@ -129,7 +129,7 @@ async def purge(self) -> None: @retry_on_error(RedisError) @override async def push_data(self, data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]) -> None: - if not self._is_list_of_items(data): + if not self._is_sequence_of_items(data): data = [data] async with self._get_pipeline() as pipe: @@ -239,7 +239,7 @@ async def iterate_items( unwind: list[str] | None = None, skip_empty: bool = False, skip_hidden: bool = False, - ) -> AsyncIterator[dict[str, Any]]: + ) -> AsyncIterator[Mapping[str, JsonSerializable]]: """Iterate over dataset items one by one. This method yields items individually instead of loading all items at once, @@ -303,7 +303,7 @@ async def iterate_items( if skip_empty and not item: continue - yield cast('dict[str, Any]', item) + yield cast('Mapping[str, JsonSerializable]', item) async with self._get_pipeline() as pipe: await self._update_metadata(pipe, **_DatasetMetadataUpdateParams(update_accessed_at=True)) diff --git a/src/crawlee/storage_clients/_sql/_dataset_client.py b/src/crawlee/storage_clients/_sql/_dataset_client.py index 239ac16623..f130556351 100644 --- a/src/crawlee/storage_clients/_sql/_dataset_client.py +++ b/src/crawlee/storage_clients/_sql/_dataset_client.py @@ -147,7 +147,7 @@ async def purge(self) -> None: @retry_on_error(SQLAlchemyError) @override async def push_data(self, data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]) -> None: - if not self._is_list_of_items(data): + if not self._is_sequence_of_items(data): data = [data] db_items = [{'dataset_id': self._id, 'data': item} for item in data] @@ -219,7 +219,7 @@ async def iterate_items( unwind: list[str] | None = None, skip_empty: bool = False, skip_hidden: bool = False, - ) -> AsyncIterator[dict[str, Any]]: + ) -> AsyncIterator[Mapping[str, JsonSerializable]]: stmt = self._prepare_get_stmt( offset=offset, limit=limit, diff --git a/src/crawlee/storage_clients/_sql/_db_models.py b/src/crawlee/storage_clients/_sql/_db_models.py index db3f0a8e9f..3acb869a6f 100644 --- a/src/crawlee/storage_clients/_sql/_db_models.py +++ b/src/crawlee/storage_clients/_sql/_db_models.py @@ -189,7 +189,7 @@ class DatasetItemDb(Base): ) """Foreign key to metadata dataset record.""" - data: Mapped[list[dict[str, Any]] | dict[str, Any]] = mapped_column(JsonField, nullable=False) + data: Mapped[dict[str, Any]] = mapped_column(JsonField, nullable=False) """JSON serializable item data.""" # Relationship back to parent dataset diff --git a/src/crawlee/storages/_key_value_store.py b/src/crawlee/storages/_key_value_store.py index 264c2fd1c8..611e3db919 100644 --- a/src/crawlee/storages/_key_value_store.py +++ b/src/crawlee/storages/_key_value_store.py @@ -1,7 +1,7 @@ from __future__ import annotations import asyncio -from collections.abc import AsyncIterator +from collections.abc import MutableMapping # noqa: TC003 from logging import getLogger from typing import TYPE_CHECKING, Any, ClassVar, TypeVar, overload @@ -33,7 +33,7 @@ class AutosavedValue(RootModel): - root: dict[str, JsonSerializable] + root: MutableMapping[str, JsonSerializable] @docs_group('Storages') @@ -262,8 +262,8 @@ async def get_public_url(self, key: str) -> str: async def get_auto_saved_value( self, key: str, - default_value: dict[str, JsonSerializable] | None = None, - ) -> dict[str, JsonSerializable]: + default_value: MutableMapping[str, JsonSerializable] | None = None, + ) -> MutableMapping[str, JsonSerializable]: """Get a value from KVS that will be automatically saved on changes. Args: