From a5c54eee98a51d92dd1551bdbddce21444c80e5e Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Wed, 4 Mar 2026 21:19:10 +0000
Subject: [PATCH 1/6] add `discover_valid_sitemaps` helper

---
 src/crawlee/_utils/sitemap.py     | 142 +++++++++++++++++++++++++++++-
 src/crawlee/_utils/web.py         |   5 ++
 tests/unit/_utils/test_sitemap.py | 104 +++++++++++++++++++++-
 3 files changed, 248 insertions(+), 3 deletions(-)

diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py
index 95d1e26a5f..f77909869c 100644
--- a/src/crawlee/_utils/sitemap.py
+++ b/src/crawlee/_utils/sitemap.py
@@ -1,8 +1,10 @@
 from __future__ import annotations
 
 import asyncio
+import re
 import zlib
 from codecs import getincrementaldecoder
+from collections import defaultdict
 from contextlib import suppress
 from dataclasses import dataclass
 from datetime import datetime, timedelta
@@ -16,6 +18,9 @@
 from typing_extensions import NotRequired, override
 from yarl import URL
 
+from crawlee._utils.web import is_status_code_successful
+from crawlee.errors import ProxyError
+
 if TYPE_CHECKING:
     from collections.abc import AsyncGenerator
     from xml.sax.xmlreader import AttributesImpl
@@ -27,6 +32,8 @@
 
 VALID_CHANGE_FREQS = {'always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'}
 SITEMAP_HEADERS = {'accept': 'text/plain, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8'}
+SITEMAP_URL_PATTERN = re.compile(r'sitemap\.(?:xml|txt)(?:\.gz)?$', re.IGNORECASE)
+COMMON_SITEMAP_PATHS = ['/sitemap.xml', '/sitemap.txt', '/sitemap_index.xml']
 
 
 @dataclass()
@@ -384,7 +391,7 @@ def urls(self) -> list[str]:
     @classmethod
     async def try_common_names(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Sitemap:
         base_url = URL(url)
-        sitemap_urls = [str(base_url.with_path('/sitemap.xml')), str(base_url.with_path('/sitemap.txt'))]
+        sitemap_urls = [str(base_url.with_path(path)) for path in COMMON_SITEMAP_PATHS]
         return await cls.load(sitemap_urls, http_client, proxy_info)
 
     @classmethod
@@ -484,3 +491,136 @@ async def parse_sitemap(
                 yield result
         else:
             logger.warning(f'Invalid source configuration: {source}')
+
+
+async def _merge_async_generators(*generators: AsyncGenerator) -> AsyncGenerator:
+    queue: asyncio.Queue = asyncio.Queue()
+
+    end_feed = object()
+
+    async def feed(gen: AsyncGenerator) -> None:
+        try:
+            async for item in gen:
+                await queue.put(item)
+        except Exception:
+            logger.warning(f'Error in generator: {gen}', exc_info=True)
+        finally:
+            await queue.put(end_feed)
+
+    tasks = [asyncio.create_task(feed(gen)) for gen in generators]
+    remaining_tasks = len(tasks)
+
+    try:
+        while remaining_tasks > 0:
+            item = await queue.get()
+            if item is end_feed:
+                remaining_tasks -= 1
+            else:
+                yield item
+    finally:
+        for task in tasks:
+            task.cancel()
+
+
+async def _discover_for_hostname(
+    hostname: str,
+    hostname_urls: list[str],
+    *,
+    http_client: HttpClient,
+    proxy_info: ProxyInfo | None = None,
+    request_timeout: timedelta,
+    method_for_checking: Literal['HEAD', 'GET'] = 'HEAD',
+) -> AsyncGenerator[str, None]:
+    # Import here to avoid circular imports.
+    from crawlee._utils.robots import RobotsTxtFile  # noqa: PLC0415
+
+    domain_seen: set[str] = set()
+    hostname_urls = list(set(hostname_urls))  # Remove duplicates
+
+    def _check_and_add(url: str) -> bool:
+        if url in domain_seen:
+            return False
+        domain_seen.add(url)
+        return True
+
+    # Try getting sitemaps from robots.txt first
+    robots = await RobotsTxtFile.find(url=hostname_urls[0], http_client=http_client, proxy_info=proxy_info)
+    for sitemap_url in robots.get_sitemaps():
+        if _check_and_add(sitemap_url):
+            yield sitemap_url
+
+    # Check whether any of the provided URLs already points to a sitemap
+    sitemap_url = next((url for url in hostname_urls if SITEMAP_URL_PATTERN.search(url)), None)
+
+    if sitemap_url:
+        if _check_and_add(sitemap_url):
+            yield sitemap_url
+    else:
+        # Check common sitemap locations
+        base_url = URL(hostname_urls[0])
+        for path in COMMON_SITEMAP_PATHS:
+            candidate = str(base_url.with_path(path))
+            if candidate in domain_seen:
+                continue
+            try:
+                response = await http_client.send_request(
+                    candidate, method=method_for_checking, proxy_info=proxy_info, timeout=request_timeout
+                )
+                if is_status_code_successful(response.status_code) and _check_and_add(candidate):
+                    yield candidate
+            except ProxyError:
+                logger.warning(f'Proxy error when checking {candidate} with sitemap discovery for {hostname}')
+            except asyncio.TimeoutError:
+                logger.warning(f'Timeout when checking {candidate} with sitemap discovery for {hostname}')
+
+
+async def discover_valid_sitemaps(
+    urls: list[str],
+    *,
+    http_client: HttpClient,
+    proxy_info: ProxyInfo | None = None,
+    request_timeout: timedelta = timedelta(seconds=20),
+    method_for_checking: Literal['HEAD', 'GET'] = 'HEAD',
+) -> AsyncGenerator[str, None]:
+    """Discover related sitemaps for the given URLs.
+
+    Args:
+        urls: List of URLs to discover sitemaps for.
+        http_client: `HttpClient` to use for making requests.
+        proxy_info: Proxy configuration to use for requests.
+        request_timeout: Timeout for each request when checking for sitemaps.
+        method_for_checking: HTTP method to use when checking for sitemap existence (HEAD or GET).
+    """
+    # Use a set to track seen sitemap URLs and avoid duplicates
+    seen = set()
+
+    grouped_urls = defaultdict(list)
+    for url in urls:
+        try:
+            hostname = URL(url).host
+        except ValueError:
+            logger.warning(f'Invalid URL {url} skipped')
+            continue
+
+        if not hostname:
+            logger.warning(f'URL {url} without host skipped')
+            continue
+
+        grouped_urls[hostname].append(url)
+
+    generators = [
+        _discover_for_hostname(
+            hostname,
+            hostname_urls,
+            http_client=http_client,
+            proxy_info=proxy_info,
+            request_timeout=request_timeout,
+            method_for_checking=method_for_checking,
+        )
+        for hostname, hostname_urls in grouped_urls.items()
+    ]
+
+    async for sitemap_url in _merge_async_generators(*generators):
+        if sitemap_url not in seen:
+            seen.add(sitemap_url)
+            yield sitemap_url
diff --git a/src/crawlee/_utils/web.py b/src/crawlee/_utils/web.py
index 2624383abf..969803ac22 100644
--- a/src/crawlee/_utils/web.py
+++ b/src/crawlee/_utils/web.py
@@ -9,3 +9,8 @@ def is_status_code_client_error(value: int) -> bool:
 def is_status_code_server_error(value: int) -> bool:
     """Return `True` for 5xx status codes, `False` otherwise."""
     return value >= 500  # noqa: PLR2004
+
+
+def is_status_code_successful(value: int) -> bool:
+    """Return `True` for 2xx and 3xx status codes, `False` otherwise."""
+    return 200 <= value < 400  # noqa: PLR2004
diff --git a/tests/unit/_utils/test_sitemap.py b/tests/unit/_utils/test_sitemap.py
index 807090eaa4..5f2005ca16 100644
--- a/tests/unit/_utils/test_sitemap.py
+++ b/tests/unit/_utils/test_sitemap.py
@@ -1,11 +1,13 @@
 import base64
 import gzip
 from datetime import datetime
+from typing import Any
+from unittest.mock import AsyncMock, MagicMock
 
 from yarl import URL
 
-from crawlee._utils.sitemap import Sitemap, SitemapUrl, parse_sitemap
-from crawlee.http_clients._base import HttpClient
+from crawlee._utils.sitemap import Sitemap, SitemapUrl, discover_valid_sitemaps, parse_sitemap
+from crawlee.http_clients._base import HttpClient, HttpResponse
 
 BASIC_SITEMAP = """
 <?xml version="1.0" encoding="UTF-8"?>
@@ -46,6 +48,23 @@
 }
 
 
+def _make_mock_client(url_map: dict[str, tuple[int, bytes]]) -> AsyncMock:
+    async def send_request(url: str, **_kwargs: Any) -> HttpResponse:
+        status, body = 404, b''
+        for pattern, (s, b) in url_map.items():
+            if pattern in url:
+                status, body = s, b
+                break
+        response = MagicMock(spec=HttpResponse)
+        response.status_code = status
+        response.read = AsyncMock(return_value=body)
+        return response
+
+    client = AsyncMock(spec=HttpClient)
+    client.send_request.side_effect = send_request
+    return client
+
+
 def compress_gzip(data: str) -> bytes:
     """Compress a string using gzip."""
     return gzip.compress(data.encode())
@@ -246,3 +265,84 @@ async def test_sitemap_from_string() -> None:
 
     assert len(sitemap.urls) == 5
     assert set(sitemap.urls) == BASIC_RESULTS
+
+
+async def test_discover_sitemap_from_robots_txt() -> None:
+    """Sitemap URL found in robots.txt is yielded."""
+    robots_content = b'User-agent: *\nSitemap: http://example.com/custom-sitemap.xml'
+    http_client = _make_mock_client({'robots.txt': (200, robots_content)})
+
+    urls = [url async for url in discover_valid_sitemaps(['http://example.com/page'], http_client=http_client)]
+
+    assert urls == ['http://example.com/custom-sitemap.xml']
+
+
+async def test_discover_sitemap_from_common_paths() -> None:
+    """Sitemap is found at common paths when robots.txt has none."""
+    http_client = _make_mock_client(
+        {'/sitemap.xml': (200, b''), '/sitemap.txt': (200, b''), '/sitemap_index.xml': (200, b'')}
+    )
+
+    urls = [url async for url in discover_valid_sitemaps(['http://example.com/page'], http_client=http_client)]
+
+    assert urls == [
+        'http://example.com/sitemap.xml',
+        'http://example.com/sitemap.txt',
+        'http://example.com/sitemap_index.xml',
+    ]
+
+
+async def test_discover_sitemap_from_input_url() -> None:
+    """Input URL that is already a sitemap is yielded directly without checking common paths."""
+    http_client = _make_mock_client({'/sitemap.txt': (200, b'')})
+
+    urls = [url async for url in discover_valid_sitemaps(['http://example.com/sitemap.xml'], http_client=http_client)]
+
+    assert urls == ['http://example.com/sitemap.xml']
+
+
+async def test_discover_sitemap_deduplication() -> None:
+    """Sitemap URL found in robots.txt is not yielded again from common paths check."""
+    robots_content = b'User-agent: *\nSitemap: http://example.com/sitemap.xml'
+    http_client = _make_mock_client(
+        {
+            'robots.txt': (200, robots_content),
+            '/sitemap.xml': (200, b''),
+        }
+    )
+
+    urls = [url async for url in discover_valid_sitemaps(['http://example.com/page'], http_client=http_client)]
+
+    assert urls == ['http://example.com/sitemap.xml']
+
+
+async def test_discover_sitemaps_multiple_domains() -> None:
+    """Sitemaps from multiple domains are all discovered."""
+    http_client = _make_mock_client(
+        {
+            'domain-a.com/sitemap.xml': (200, b''),
+            'domain-b.com/sitemap.xml': (200, b''),
+        }
+    )
+
+    urls = [
+        url
+        async for url in discover_valid_sitemaps(
+            ['http://domain-a.com/page', 'http://domain-b.com/page'],
+            http_client=http_client,
+        )
+    ]
+
+    assert set(urls) == {
+        'http://domain-a.com/sitemap.xml',
+        'http://domain-b.com/sitemap.xml',
+    }
+
+
+async def test_discover_sitemap_url_without_host_skipped() -> None:
+    """URLs without a host are skipped."""
+    http_client = _make_mock_client({})
+
+    urls = [url async for url in discover_valid_sitemaps(['not-a-valid-url'], http_client=http_client)]
+
+    assert urls == []
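A minimal usage sketch of the helper this patch introduces (illustrative only: `HttpxHttpClient` stands in for any `HttpClient` implementation shipped with crawlee, and the crawled URLs are made up):

    import asyncio

    from crawlee._utils.sitemap import discover_valid_sitemaps
    from crawlee.http_clients import HttpxHttpClient


    async def main() -> None:
        # Sitemap URLs are yielded as soon as any hostname's probe finds them,
        # deduplicated across robots.txt hints, sitemap-like inputs, and common paths.
        async for sitemap_url in discover_valid_sitemaps(
            ['https://crawlee.dev/docs', 'https://example.com/'],
            http_client=HttpxHttpClient(),
        ):
            print(sitemap_url)


    asyncio.run(main())
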
From f1015a56a1357ac1ba5a520b9d43c7331410836b Mon Sep 17 00:00:00 2001
From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com>
Date: Wed, 4 Mar 2026 23:54:37 +0200
Subject: [PATCH 2/6] Update src/crawlee/_utils/sitemap.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 src/crawlee/_utils/sitemap.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py
index f77909869c..1eef7f9ad3 100644
--- a/src/crawlee/_utils/sitemap.py
+++ b/src/crawlee/_utils/sitemap.py
@@ -520,8 +520,7 @@ async def feed(gen: AsyncGenerator) -> None:
     finally:
         for task in tasks:
             task.cancel()
-
-
+        await asyncio.gather(*tasks, return_exceptions=True)
 async def _discover_for_hostname(
     hostname: str,
     hostname_urls: list[str],
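The point of awaiting freshly cancelled tasks: cancellation is only delivered the next time a task runs, so the extra `gather` guarantees every feed task has finished its cleanup before the merge generator exits, while `return_exceptions=True` keeps the resulting `CancelledError` out of the caller. A standalone sketch of that behavior (names are illustrative):

    import asyncio


    async def feed() -> None:
        try:
            await asyncio.sleep(3600)  # Stand-in for draining one source generator.
        finally:
            print('feed cleanup finished')  # Runs before gather() returns.


    async def main() -> None:
        task = asyncio.create_task(feed())
        await asyncio.sleep(0)  # Let the task start before cancelling it.
        task.cancel()
        # Collects the CancelledError in the result list instead of re-raising it.
        print(await asyncio.gather(task, return_exceptions=True))


    asyncio.run(main())
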
From 2b757ccdde42602c74fc282a328d11b1833c2f1e Mon Sep 17 00:00:00 2001
From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com>
Date: Thu, 5 Mar 2026 00:00:24 +0200
Subject: [PATCH 3/6] Update src/crawlee/_utils/sitemap.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 src/crawlee/_utils/sitemap.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py
index 1eef7f9ad3..77152cdded 100644
--- a/src/crawlee/_utils/sitemap.py
+++ b/src/crawlee/_utils/sitemap.py
@@ -549,11 +549,12 @@ def _check_and_add(url: str) -> bool:
         yield sitemap_url
 
     # Check whether any of the provided URLs already points to a sitemap
-    sitemap_url = next((url for url in hostname_urls if SITEMAP_URL_PATTERN.search(url)), None)
+    matching_sitemap_urls = [url for url in hostname_urls if SITEMAP_URL_PATTERN.search(url)]
 
-    if sitemap_url:
-        if _check_and_add(sitemap_url):
-            yield sitemap_url
+    if matching_sitemap_urls:
+        for sitemap_url in matching_sitemap_urls:
+            if _check_and_add(sitemap_url):
+                yield sitemap_url
     else:
         # Check common sitemap locations
         base_url = URL(hostname_urls[0])
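The behavioral change in isolation: `next()` considered only the first sitemap-like input URL, while the list comprehension lets every match through. A quick demo using the pattern as defined in PATCH 1/6 (it is tightened again in PATCH 6/6):

    import re

    SITEMAP_URL_PATTERN = re.compile(r'sitemap\.(?:xml|txt)(?:\.gz)?$', re.IGNORECASE)

    hostname_urls = [
        'http://example.com/products/sitemap.xml',
        'http://example.com/blog/sitemap.xml.gz',
        'http://example.com/about',
    ]

    matching_sitemap_urls = [url for url in hostname_urls if SITEMAP_URL_PATTERN.search(url)]
    print(matching_sitemap_urls)
    # ['http://example.com/products/sitemap.xml', 'http://example.com/blog/sitemap.xml.gz']
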
From 6dd21ab799f986dc24e019208b16cf7601e6c578 Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Wed, 4 Mar 2026 21:58:27 +0000
Subject: [PATCH 4/6] use gather without `return_exceptions`

---
 src/crawlee/_utils/sitemap.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py
index 77152cdded..81959e1046 100644
--- a/src/crawlee/_utils/sitemap.py
+++ b/src/crawlee/_utils/sitemap.py
@@ -520,7 +520,9 @@ async def feed(gen: AsyncGenerator) -> None:
     finally:
         for task in tasks:
             task.cancel()
-        await asyncio.gather(*tasks, return_exceptions=True)
+        await asyncio.gather(*tasks)
+
+
 async def _discover_for_hostname(
     hostname: str,
     hostname_urls: list[str],

From adbac6094aa5e57b52a19e14416a81a53b08f7e5 Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Wed, 4 Mar 2026 22:00:59 +0000
Subject: [PATCH 5/6] handle exception in discover

---
 src/crawlee/_utils/sitemap.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py
index 81959e1046..6381493c61 100644
--- a/src/crawlee/_utils/sitemap.py
+++ b/src/crawlee/_utils/sitemap.py
@@ -574,6 +574,8 @@ def _check_and_add(url: str) -> bool:
                 logger.warning(f'Proxy error when checking {candidate} with sitemap discovery for {hostname}')
             except asyncio.TimeoutError:
                 logger.warning(f'Timeout when checking {candidate} with sitemap discovery for {hostname}')
+            except Exception:
+                logger.warning(f'Error when checking {candidate} with sitemap discovery for {hostname}', exc_info=True)
 
 
 async def discover_valid_sitemaps(
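For context on the bare `asyncio.gather(*tasks)` used in PATCH 4/6: without `return_exceptions=True`, awaiting just-cancelled tasks re-raises `CancelledError` in the caller, which the final patch addresses by restoring the flag. A standalone sketch of the difference:

    import asyncio


    async def feed() -> None:
        await asyncio.sleep(3600)


    async def main() -> None:
        task = asyncio.create_task(feed())
        await asyncio.sleep(0)
        task.cancel()
        try:
            await asyncio.gather(task)  # No return_exceptions here.
        except asyncio.CancelledError:
            print('CancelledError escaped into the caller')


    asyncio.run(main())
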
From bd3587e457c41526844295d84fd371781e0f30fd Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Fri, 6 Mar 2026 12:22:58 +0000
Subject: [PATCH 6/6] fix

---
 src/crawlee/_utils/sitemap.py | 4 ++--
 src/crawlee/_utils/web.py     | 8 +++++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py
index 6381493c61..b90d2e6935 100644
--- a/src/crawlee/_utils/sitemap.py
+++ b/src/crawlee/_utils/sitemap.py
@@ -32,7 +32,7 @@
 
 VALID_CHANGE_FREQS = {'always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'}
 SITEMAP_HEADERS = {'accept': 'text/plain, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8'}
-SITEMAP_URL_PATTERN = re.compile(r'sitemap\.(?:xml|txt)(?:\.gz)?$', re.IGNORECASE)
+SITEMAP_URL_PATTERN = re.compile(r'\/sitemap\.(?:xml|txt)(?:\.gz)?$', re.IGNORECASE)
 COMMON_SITEMAP_PATHS = ['/sitemap.xml', '/sitemap.txt', '/sitemap_index.xml']
 
 
@@ -520,7 +520,7 @@ async def feed(gen: AsyncGenerator) -> None:
     finally:
         for task in tasks:
             task.cancel()
-        await asyncio.gather(*tasks)
+        await asyncio.gather(*tasks, return_exceptions=True)
 
 
 async def _discover_for_hostname(
diff --git a/src/crawlee/_utils/web.py b/src/crawlee/_utils/web.py
index 969803ac22..ff00480a67 100644
--- a/src/crawlee/_utils/web.py
+++ b/src/crawlee/_utils/web.py
@@ -1,16 +1,18 @@
 from __future__ import annotations
 
+from http import HTTPStatus
+
 
 def is_status_code_client_error(value: int) -> bool:
     """Return `True` for 4xx status codes, `False` otherwise."""
-    return 400 <= value <= 499  # noqa: PLR2004
+    return HTTPStatus.BAD_REQUEST <= value < HTTPStatus.INTERNAL_SERVER_ERROR
 
 
 def is_status_code_server_error(value: int) -> bool:
     """Return `True` for 5xx status codes, `False` otherwise."""
-    return value >= 500  # noqa: PLR2004
+    return value >= HTTPStatus.INTERNAL_SERVER_ERROR
 
 
 def is_status_code_successful(value: int) -> bool:
     """Return `True` for 2xx and 3xx status codes, `False` otherwise."""
-    return 200 <= value < 400  # noqa: PLR2004
+    return HTTPStatus.OK <= value < HTTPStatus.BAD_REQUEST
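Two quick checks of what the final patch changes, runnable as-is:

    import re
    from http import HTTPStatus

    # The anchored pattern now requires a '/' right before 'sitemap.*', so
    # lookalike filenames no longer count as direct sitemap URLs.
    SITEMAP_URL_PATTERN = re.compile(r'\/sitemap\.(?:xml|txt)(?:\.gz)?$', re.IGNORECASE)

    print(bool(SITEMAP_URL_PATTERN.search('http://example.com/sitemap.xml')))       # True
    print(bool(SITEMAP_URL_PATTERN.search('http://example.com/news-sitemap.xml')))  # False

    # HTTPStatus members are IntEnum values, so they slot directly into range
    # checks and replace the bare 200/400/500 literals and their noqa comments.
    assert HTTPStatus.OK <= 301 < HTTPStatus.BAD_REQUEST  # A 3xx counts as successful.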