From a5c54eee98a51d92dd1551bdbddce21444c80e5e Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Wed, 4 Mar 2026 21:19:10 +0000
Subject: [PATCH 1/6] add `discover_valid_sitemaps` helper

---
 src/crawlee/_utils/sitemap.py     | 142 +++++++++++++++++++++++++++++-
 src/crawlee/_utils/web.py         |   5 ++
 tests/unit/_utils/test_sitemap.py | 104 +++++++++++++++++++++-
 3 files changed, 248 insertions(+), 3 deletions(-)

diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py
index 95d1e26a5f..f77909869c 100644
--- a/src/crawlee/_utils/sitemap.py
+++ b/src/crawlee/_utils/sitemap.py
@@ -1,8 +1,10 @@
 from __future__ import annotations
 
 import asyncio
+import re
 import zlib
 from codecs import getincrementaldecoder
+from collections import defaultdict
 from contextlib import suppress
 from dataclasses import dataclass
 from datetime import datetime, timedelta
@@ -16,6 +18,9 @@
 from typing_extensions import NotRequired, override
 from yarl import URL
 
+from crawlee._utils.web import is_status_code_successful
+from crawlee.errors import ProxyError
+
 if TYPE_CHECKING:
     from collections.abc import AsyncGenerator
     from xml.sax.xmlreader import AttributesImpl
@@ -27,6 +32,8 @@
 
 VALID_CHANGE_FREQS = {'always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'}
 SITEMAP_HEADERS = {'accept': 'text/plain, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8'}
+SITEMAP_URL_PATTERN = re.compile(r'sitemap\.(?:xml|txt)(?:\.gz)?$', re.IGNORECASE)
+COMMON_SITEMAP_PATHS = ['/sitemap.xml', '/sitemap.txt', '/sitemap_index.xml']
 
 
 @dataclass()
@@ -384,7 +391,7 @@ def urls(self) -> list[str]:
     @classmethod
     async def try_common_names(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Sitemap:
         base_url = URL(url)
-        sitemap_urls = [str(base_url.with_path('/sitemap.xml')), str(base_url.with_path('/sitemap.txt'))]
+        sitemap_urls = [str(base_url.with_path(path)) for path in COMMON_SITEMAP_PATHS]
         return await cls.load(sitemap_urls, http_client, proxy_info)
 
     @classmethod
@@ -484,3 +491,136 @@ async def parse_sitemap(
                 yield result
         else:
             logger.warning(f'Invalid source configuration: {source}')
+
+
+async def _merge_async_generators(*generators: AsyncGenerator) -> AsyncGenerator:
+    queue: asyncio.Queue = asyncio.Queue()
+
+    end_feed = object()
+
+    async def feed(gen: AsyncGenerator) -> None:
+        try:
+            async for item in gen:
+                await queue.put(item)
+        except Exception:
+            logger.warning(f'Error in generator: {gen}', exc_info=True)
+        finally:
+            await queue.put(end_feed)
+
+    tasks = [asyncio.create_task(feed(gen)) for gen in generators]
+    remaining_tasks = len(tasks)
+
+    try:
+        while remaining_tasks > 0:
+            item = await queue.get()
+            if item is end_feed:
+                remaining_tasks -= 1
+            else:
+                yield item
+    finally:
+        for task in tasks:
+            task.cancel()
+
+
+async def _discover_for_hostname(
+    hostname: str,
+    hostname_urls: list[str],
+    *,
+    http_client: HttpClient,
+    proxy_info: ProxyInfo | None = None,
+    request_timeout: timedelta,
+    method_for_checking: Literal['HEAD', 'GET'] = 'HEAD',
+) -> AsyncGenerator[str, None]:
+    # Import here to avoid circular imports.
+    from crawlee._utils.robots import RobotsTxtFile  # noqa: PLC0415
+
+    domain_seen: set[str] = set()
+    hostname_urls = list(set(hostname_urls))  # Remove duplicates
+
+    def _check_and_add(url: str) -> bool:
+        if url in domain_seen:
+            return False
+        domain_seen.add(url)
+        return True
+
+    # Try getting sitemaps from robots.txt first
+    robots = await RobotsTxtFile.find(url=hostname_urls[0], http_client=http_client, proxy_info=proxy_info)
+    for sitemap_url in robots.get_sitemaps():
+        if _check_and_add(sitemap_url):
+            yield sitemap_url
+
+    # Check whether any of the provided URLs already points to a sitemap
+    sitemap_url = next((url for url in hostname_urls if SITEMAP_URL_PATTERN.search(url)), None)
+
+    if sitemap_url:
+        if _check_and_add(sitemap_url):
+            yield sitemap_url
+    else:
+        # Check common sitemap locations
+        base_url = URL(hostname_urls[0])
+        for path in COMMON_SITEMAP_PATHS:
+            candidate = str(base_url.with_path(path))
+            if candidate in domain_seen:
+                continue
+            try:
+                response = await http_client.send_request(
+                    candidate, method=method_for_checking, proxy_info=proxy_info, timeout=request_timeout
+                )
+                if is_status_code_successful(response.status_code) and _check_and_add(candidate):
+                    yield candidate
+            except ProxyError:
+                logger.warning(f'Proxy error when checking {candidate} with sitemap discovery for {hostname}')
+            except asyncio.TimeoutError:
+                logger.warning(f'Timeout when checking {candidate} with sitemap discovery for {hostname}')
+
+
+async def discover_valid_sitemaps(
+    urls: list[str],
+    *,
+    http_client: HttpClient,
+    proxy_info: ProxyInfo | None = None,
+    request_timeout: timedelta = timedelta(seconds=20),
+    method_for_checking: Literal['HEAD', 'GET'] = 'HEAD',
+) -> AsyncGenerator[str, None]:
+    """Discover related sitemaps for the given URLs.
+
+    Args:
+        urls: List of URLs to discover sitemaps for.
+        http_client: `HttpClient` to use for making requests.
+        proxy_info: Proxy configuration to use for requests.
+        request_timeout: Timeout for each request when checking for sitemaps.
+        method_for_checking: HTTP method to use when checking for sitemap existence (HEAD or GET).
+    """
+    # Use a set to track seen sitemap URLs and avoid duplicates
+    seen = set()
+
+    grouped_urls = defaultdict(list)
+    for url in urls:
+        try:
+            hostname = URL(url).host
+        except ValueError:
+            logger.warning(f'Invalid URL {url} skipped')
+            continue
+
+        if not hostname:
+            logger.warning(f'URL {url} without host skipped')
+            continue
+
+        grouped_urls[hostname].append(url)
+
+    generators = [
+        _discover_for_hostname(
+            hostname,
+            hostname_urls,
+            http_client=http_client,
+            proxy_info=proxy_info,
+            request_timeout=request_timeout,
+            method_for_checking=method_for_checking,
+        )
+        for hostname, hostname_urls in grouped_urls.items()
+    ]
+
+    async for sitemap_url in _merge_async_generators(*generators):
+        if sitemap_url not in seen:
+            seen.add(sitemap_url)
+            yield sitemap_url
diff --git a/src/crawlee/_utils/web.py b/src/crawlee/_utils/web.py
index 2624383abf..969803ac22 100644
--- a/src/crawlee/_utils/web.py
+++ b/src/crawlee/_utils/web.py
@@ -9,3 +9,8 @@ def is_status_code_client_error(value: int) -> bool:
 def is_status_code_server_error(value: int) -> bool:
     """Return `True` for 5xx status codes, `False` otherwise."""
     return value >= 500  # noqa: PLR2004
+
+
+def is_status_code_successful(value: int) -> bool:
+    """Return `True` for 2xx and 3xx status codes, `False` otherwise."""
+    return 200 <= value < 400  # noqa: PLR2004
diff --git a/tests/unit/_utils/test_sitemap.py b/tests/unit/_utils/test_sitemap.py
index 807090eaa4..5f2005ca16 100644
--- a/tests/unit/_utils/test_sitemap.py
+++ b/tests/unit/_utils/test_sitemap.py
@@ -1,11 +1,13 @@
 import base64
 import gzip
 from datetime import datetime
+from typing import Any
+from unittest.mock import AsyncMock, MagicMock
 
 from yarl import URL
 
-from crawlee._utils.sitemap import Sitemap, SitemapUrl, parse_sitemap
-from crawlee.http_clients._base import HttpClient
+from crawlee._utils.sitemap import Sitemap, SitemapUrl, discover_valid_sitemaps, parse_sitemap
+from crawlee.http_clients._base import HttpClient, HttpResponse
 
 BASIC_SITEMAP = """
 <?xml version="1.0" encoding="UTF-8"?>
@@ -46,6 +48,23 @@
 }
 
 
+def _make_mock_client(url_map: dict[str, tuple[int, bytes]]) -> AsyncMock:
+    async def send_request(url: str, **_kwargs: Any) -> HttpResponse:
+        status, body = 404, b''
+        for pattern, (s, b) in url_map.items():
+            if pattern in url:
+                status, body = s, b
+                break
+        response = MagicMock(spec=HttpResponse)
+        response.status_code = status
+        response.read = AsyncMock(return_value=body)
+        return response
+
+    client = AsyncMock(spec=HttpClient)
+    client.send_request.side_effect = send_request
+    return client
+
+
 def compress_gzip(data: str) -> bytes:
     """Compress a string using gzip."""
     return gzip.compress(data.encode())
@@ -246,3 +265,84 @@ async def test_sitemap_from_string() -> None:
 
     assert len(sitemap.urls) == 5
     assert set(sitemap.urls) == BASIC_RESULTS
+
+
+async def test_discover_sitemap_from_robots_txt() -> None:
+    """Sitemap URL found in robots.txt is yielded."""
+    robots_content = b'User-agent: *\nSitemap: http://example.com/custom-sitemap.xml'
+    http_client = _make_mock_client({'robots.txt': (200, robots_content)})
+
+    urls = [url async for url in discover_valid_sitemaps(['http://example.com/page'], http_client=http_client)]
+
+    assert urls == ['http://example.com/custom-sitemap.xml']
+
+
+async def test_discover_sitemap_from_common_paths() -> None:
+    """Sitemap is found at common paths when robots.txt has none."""
+    http_client = _make_mock_client(
+        {'/sitemap.xml': (200, b''), '/sitemap.txt': (200, b''), '/sitemap_index.xml': (200, b'')}
+    )
+
+    urls = [url async for url in discover_valid_sitemaps(['http://example.com/page'], http_client=http_client)]
+
+    assert urls == [
+        'http://example.com/sitemap.xml',
+        'http://example.com/sitemap.txt',
+        'http://example.com/sitemap_index.xml',
+    ]
+
+
+async def test_discover_sitemap_from_input_url() -> None:
+    """Input URL that is already a sitemap is yielded directly without checking common paths."""
+    http_client = _make_mock_client({'/sitemap.txt': (200, b'')})
+
+    urls = [url async for url in discover_valid_sitemaps(['http://example.com/sitemap.xml'], http_client=http_client)]
+
+    assert urls == ['http://example.com/sitemap.xml']
+
+
+async def test_discover_sitemap_deduplication() -> None:
+    """Sitemap URL found in robots.txt is not yielded again from common paths check."""
+    robots_content = b'User-agent: *\nSitemap: http://example.com/sitemap.xml'
+    http_client = _make_mock_client(
+        {
+            'robots.txt': (200, robots_content),
+            '/sitemap.xml': (200, b''),
+        }
+    )
+
+    urls = [url async for url in discover_valid_sitemaps(['http://example.com/page'], http_client=http_client)]
+
+    assert urls == ['http://example.com/sitemap.xml']
+
+
+async def test_discover_sitemaps_multiple_domains() -> None:
+    """Sitemaps from multiple domains are all discovered."""
+    http_client = _make_mock_client(
+        {
+            'domain-a.com/sitemap.xml': (200, b''),
+            'domain-b.com/sitemap.xml': (200, b''),
+        }
+    )
+
+    urls = [
+        url
+        async for url in discover_valid_sitemaps(
+            ['http://domain-a.com/page', 'http://domain-b.com/page'],
+            http_client=http_client,
+        )
+    ]
+
+    assert set(urls) == {
+        'http://domain-a.com/sitemap.xml',
+        'http://domain-b.com/sitemap.xml',
+    }
+
+
+async def test_discover_sitemap_url_without_host_skipped() -> None:
+    """URLs without a host are skipped."""
+    http_client = _make_mock_client({})
+
+    urls = [url async for url in discover_valid_sitemaps(['not-a-valid-url'], http_client=http_client)]
+
+    assert urls == []
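A minimal usage sketch of the helper this patch introduces (illustrative only: `HttpxHttpClient` stands in for any `HttpClient` implementation shipped with crawlee, and the crawled URLs are made up):

    import asyncio

    from crawlee._utils.sitemap import discover_valid_sitemaps
    from crawlee.http_clients import HttpxHttpClient


    async def main() -> None:
        # Sitemap URLs are yielded as soon as any hostname's probe finds them,
        # deduplicated across robots.txt hints, sitemap-like inputs, and common paths.
        async for sitemap_url in discover_valid_sitemaps(
            ['https://crawlee.dev/docs', 'https://example.com/'],
            http_client=HttpxHttpClient(),
        ):
            print(sitemap_url)


    asyncio.run(main())
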
From f1015a56a1357ac1ba5a520b9d43c7331410836b Mon Sep 17 00:00:00 2001
From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com>
Date: Wed, 4 Mar 2026 23:54:37 +0200
Subject: [PATCH 2/6] Update src/crawlee/_utils/sitemap.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 src/crawlee/_utils/sitemap.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py
index f77909869c..1eef7f9ad3 100644
--- a/src/crawlee/_utils/sitemap.py
+++ b/src/crawlee/_utils/sitemap.py
@@ -520,8 +520,7 @@ async def feed(gen: AsyncGenerator) -> None:
     finally:
         for task in tasks:
             task.cancel()
-
-
+        await asyncio.gather(*tasks, return_exceptions=True)
 async def _discover_for_hostname(
     hostname: str,
     hostname_urls: list[str],
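The point of awaiting freshly cancelled tasks: cancellation is only delivered the next time a task runs, so the extra `gather` guarantees every feed task has finished its cleanup before the merge generator exits, while `return_exceptions=True` keeps the resulting `CancelledError` out of the caller. A standalone sketch of that behavior (names are illustrative):

    import asyncio


    async def feed() -> None:
        try:
            await asyncio.sleep(3600)  # Stand-in for draining one source generator.
        finally:
            print('feed cleanup finished')  # Runs before gather() returns.


    async def main() -> None:
        task = asyncio.create_task(feed())
        await asyncio.sleep(0)  # Let the task start before cancelling it.
        task.cancel()
        # Collects the CancelledError in the result list instead of re-raising it.
        print(await asyncio.gather(task, return_exceptions=True))


    asyncio.run(main())
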
From 2b757ccdde42602c74fc282a328d11b1833c2f1e Mon Sep 17 00:00:00 2001
From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com>
Date: Thu, 5 Mar 2026 00:00:24 +0200
Subject: [PATCH 3/6] Update src/crawlee/_utils/sitemap.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 src/crawlee/_utils/sitemap.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py
index 1eef7f9ad3..77152cdded 100644
--- a/src/crawlee/_utils/sitemap.py
+++ b/src/crawlee/_utils/sitemap.py
@@ -549,11 +549,12 @@ def _check_and_add(url: str) -> bool:
         yield sitemap_url
 
     # Check whether any of the provided URLs already points to a sitemap
-    sitemap_url = next((url for url in hostname_urls if SITEMAP_URL_PATTERN.search(url)), None)
+    matching_sitemap_urls = [url for url in hostname_urls if SITEMAP_URL_PATTERN.search(url)]
 
-    if sitemap_url:
-        if _check_and_add(sitemap_url):
-            yield sitemap_url
+    if matching_sitemap_urls:
+        for sitemap_url in matching_sitemap_urls:
+            if _check_and_add(sitemap_url):
+                yield sitemap_url
     else:
         # Check common sitemap locations
         base_url = URL(hostname_urls[0])
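The behavioral change in isolation: `next()` considered only the first sitemap-like input URL, while the list comprehension lets every match through. A quick demo using the pattern as defined in PATCH 1/6 (it is tightened again in PATCH 6/6):

    import re

    SITEMAP_URL_PATTERN = re.compile(r'sitemap\.(?:xml|txt)(?:\.gz)?$', re.IGNORECASE)

    hostname_urls = [
        'http://example.com/products/sitemap.xml',
        'http://example.com/blog/sitemap.xml.gz',
        'http://example.com/about',
    ]

    matching_sitemap_urls = [url for url in hostname_urls if SITEMAP_URL_PATTERN.search(url)]
    print(matching_sitemap_urls)
    # ['http://example.com/products/sitemap.xml', 'http://example.com/blog/sitemap.xml.gz']
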
From 6dd21ab799f986dc24e019208b16cf7601e6c578 Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Wed, 4 Mar 2026 21:58:27 +0000
Subject: [PATCH 4/6] use gather without `return_exceptions`

---
 src/crawlee/_utils/sitemap.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py
index 77152cdded..81959e1046 100644
--- a/src/crawlee/_utils/sitemap.py
+++ b/src/crawlee/_utils/sitemap.py
@@ -520,7 +520,9 @@ async def feed(gen: AsyncGenerator) -> None:
     finally:
         for task in tasks:
             task.cancel()
-        await asyncio.gather(*tasks, return_exceptions=True)
+        await asyncio.gather(*tasks)
+
+
 async def _discover_for_hostname(
     hostname: str,
     hostname_urls: list[str],

From adbac6094aa5e57b52a19e14416a81a53b08f7e5 Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Wed, 4 Mar 2026 22:00:59 +0000
Subject: [PATCH 5/6] handle exception in discover

---
 src/crawlee/_utils/sitemap.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py
index 81959e1046..6381493c61 100644
--- a/src/crawlee/_utils/sitemap.py
+++ b/src/crawlee/_utils/sitemap.py
@@ -574,6 +574,8 @@ def _check_and_add(url: str) -> bool:
                 logger.warning(f'Proxy error when checking {candidate} with sitemap discovery for {hostname}')
             except asyncio.TimeoutError:
                 logger.warning(f'Timeout when checking {candidate} with sitemap discovery for {hostname}')
+            except Exception:
+                logger.warning(f'Error when checking {candidate} with sitemap discovery for {hostname}', exc_info=True)
 
 
 async def discover_valid_sitemaps(
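For context on the bare `asyncio.gather(*tasks)` used in PATCH 4/6: without `return_exceptions=True`, awaiting just-cancelled tasks re-raises `CancelledError` in the caller, which the final patch addresses by restoring the flag. A standalone sketch of the difference:

    import asyncio


    async def feed() -> None:
        await asyncio.sleep(3600)


    async def main() -> None:
        task = asyncio.create_task(feed())
        await asyncio.sleep(0)
        task.cancel()
        try:
            await asyncio.gather(task)  # No return_exceptions here.
        except asyncio.CancelledError:
            print('CancelledError escaped into the caller')


    asyncio.run(main())
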
From bd3587e457c41526844295d84fd371781e0f30fd Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Fri, 6 Mar 2026 12:22:58 +0000
Subject: [PATCH 6/6] fix

---
 src/crawlee/_utils/sitemap.py | 4 ++--
 src/crawlee/_utils/web.py     | 8 +++++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py
index 6381493c61..b90d2e6935 100644
--- a/src/crawlee/_utils/sitemap.py
+++ b/src/crawlee/_utils/sitemap.py
@@ -32,7 +32,7 @@
 
 VALID_CHANGE_FREQS = {'always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'}
 SITEMAP_HEADERS = {'accept': 'text/plain, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8'}
-SITEMAP_URL_PATTERN = re.compile(r'sitemap\.(?:xml|txt)(?:\.gz)?$', re.IGNORECASE)
+SITEMAP_URL_PATTERN = re.compile(r'\/sitemap\.(?:xml|txt)(?:\.gz)?$', re.IGNORECASE)
 COMMON_SITEMAP_PATHS = ['/sitemap.xml', '/sitemap.txt', '/sitemap_index.xml']
 
 
@@ -520,7 +520,7 @@ async def feed(gen: AsyncGenerator) -> None:
     finally:
         for task in tasks:
             task.cancel()
-        await asyncio.gather(*tasks)
+        await asyncio.gather(*tasks, return_exceptions=True)
 
 
 async def _discover_for_hostname(
diff --git a/src/crawlee/_utils/web.py b/src/crawlee/_utils/web.py
index 969803ac22..ff00480a67 100644
--- a/src/crawlee/_utils/web.py
+++ b/src/crawlee/_utils/web.py
@@ -1,16 +1,18 @@
 from __future__ import annotations
 
+from http import HTTPStatus
+
 
 def is_status_code_client_error(value: int) -> bool:
     """Return `True` for 4xx status codes, `False` otherwise."""
-    return 400 <= value <= 499  # noqa: PLR2004
+    return HTTPStatus.BAD_REQUEST <= value < HTTPStatus.INTERNAL_SERVER_ERROR
 
 
 def is_status_code_server_error(value: int) -> bool:
     """Return `True` for 5xx status codes, `False` otherwise."""
-    return value >= 500  # noqa: PLR2004
+    return value >= HTTPStatus.INTERNAL_SERVER_ERROR
 
 
 def is_status_code_successful(value: int) -> bool:
     """Return `True` for 2xx and 3xx status codes, `False` otherwise."""
-    return 200 <= value < 400  # noqa: PLR2004
+    return HTTPStatus.OK <= value < HTTPStatus.BAD_REQUEST
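Two quick checks of what the final patch changes, runnable as-is:

    import re
    from http import HTTPStatus

    # The anchored pattern now requires a '/' right before 'sitemap.*', so
    # lookalike filenames no longer count as direct sitemap URLs.
    SITEMAP_URL_PATTERN = re.compile(r'\/sitemap\.(?:xml|txt)(?:\.gz)?$', re.IGNORECASE)

    print(bool(SITEMAP_URL_PATTERN.search('http://example.com/sitemap.xml')))       # True
    print(bool(SITEMAP_URL_PATTERN.search('http://example.com/news-sitemap.xml')))  # False

    # HTTPStatus members are IntEnum values, so they slot directly into range
    # checks and replace the bare 200/400/500 literals and their noqa comments.
    assert HTTPStatus.OK <= 301 < HTTPStatus.BAD_REQUEST  # A 3xx counts as successful.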