diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py
index f00004ca3a..65248c3451 100644
--- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py
+++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py
@@ -2,6 +2,7 @@
 
 import asyncio
 import logging
+import sys
 from dataclasses import dataclass
 from datetime import timedelta
 from itertools import cycle
@@ -911,3 +912,29 @@ async def request_handler(_context: AdaptivePlaywrightCrawlingContext) -> None:
     await crawler.run(test_urls[:1])
 
     mocked_handler.assert_called()
+
+
+@pytest.mark.parametrize(
+    ('optional_module_name'),
+    [
+        pytest.param('playwright', id='playwright'),
+        pytest.param('jaro', id='jaro'),
+        pytest.param('sklearn', id='sklearn'),
+    ],
+)
+def test_import_error_handled(optional_module_name: str) -> None:
+    # Block the package itself (even if it has not been imported yet) together with all of its cached
+    # submodules, so that cached submodule entries cannot bypass the blocked top-level package.
+    blocked = {
+        mod_name: None
+        for mod_name in (optional_module_name, *sys.modules)
+        if mod_name == optional_module_name or mod_name.startswith(f'{optional_module_name}.')
+    }
+
+    with patch.dict('sys.modules', blocked):
+        for mod_name in list(sys.modules):
+            if mod_name == 'crawlee.crawlers' or mod_name.startswith('crawlee.crawlers._adaptive_playwright'):
+                sys.modules.pop(mod_name, None)
+
+        with pytest.raises(ImportError):
+            from crawlee.crawlers import AdaptivePlaywrightCrawler  # noqa: F401 PLC0415
diff --git a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
index ea31cc42b5..9a828b0078 100644
--- a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
+++ b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import asyncio
+import sys
 from datetime import timedelta
 from typing import TYPE_CHECKING
 from unittest import mock
@@ -489,3 +490,14 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
         mock.call(str(server_url / 'page_3')),
     ]
     visit.assert_has_calls(expected_visit_calls, any_order=True)
+
+
+def test_import_error_handled() -> None:
+    # Simulate ImportError for BeautifulSoup
+    with mock.patch.dict('sys.modules', {'bs4': None}):
+        # Invalidate BeautifulSoupCrawler import
+        for mod_name in list(sys.modules):
+            if mod_name == 'crawlee.crawlers' or mod_name.startswith('crawlee.crawlers._beautifulsoup'):
+                sys.modules.pop(mod_name, None)
+        with pytest.raises(ImportError):
+            from crawlee.crawlers import BeautifulSoupCrawler  # noqa: F401 PLC0415
diff --git a/tests/unit/crawlers/_parsel/test_parsel_crawler.py b/tests/unit/crawlers/_parsel/test_parsel_crawler.py
index 71d8ada1c5..02f5b61a86 100644
--- a/tests/unit/crawlers/_parsel/test_parsel_crawler.py
+++ b/tests/unit/crawlers/_parsel/test_parsel_crawler.py
@@ -218,24 +218,16 @@ async def test_handle_blocked_status_code(server_url: URL, http_client: HttpClie
     assert crawler._statistics.error_tracker.total == 1
 
 
-# TODO: Remove the skip mark when the test is fixed:
-# https://github.com/apify/crawlee-python/issues/838
-@pytest.mark.skip(reason='The test does not work with `crawlee._utils.try_import.ImportWrapper`.')
 def test_import_error_handled() -> None:
     # Simulate ImportError for parsel
     with mock.patch.dict('sys.modules', {'parsel': None}):
         # Invalidate ParselCrawler import
-        sys.modules.pop('crawlee.crawlers', None)
-        sys.modules.pop('crawlee.crawlers._parsel', None)
-        with pytest.raises(ImportError) as import_error:
+        for mod_name in list(sys.modules):
+            if mod_name == 'crawlee.crawlers' or mod_name.startswith('crawlee.crawlers._parsel'):
+                sys.modules.pop(mod_name, None)
+        with pytest.raises(ImportError):
             from crawlee.crawlers import ParselCrawler  # noqa: F401 PLC0415
 
-        # Check if the raised ImportError contains the expected message
-        assert str(import_error.value) == (
-            "To import this, you need to install the 'parsel' extra."
-            "For example, if you use pip, run `pip install 'crawlee[parsel]'`."
-        )
-
 
 async def test_json(server_url: URL, http_client: HttpClient) -> None:
     crawler = ParselCrawler(http_client=http_client)
diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py
index ab965dfd0c..78d1789f99 100644
--- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py
+++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py
@@ -3,6 +3,7 @@
 import asyncio
 import json
 import logging
+import sys
 from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Literal
 from unittest import mock
@@ -1239,3 +1240,18 @@ async def failed_handler(context: BasicCrawlingContext | PlaywrightCrawlingConte
 
     assert error_handler_calls == [HELLO_WORLD.decode(), HELLO_WORLD.decode()]
     assert failed_handler_calls == [HELLO_WORLD.decode()]
+
+
+def test_import_error_handled() -> None:
+    # Block the package itself (even if it has not been imported yet) together with all of its cached submodules.
+    blocked = {
+        mod_name: None
+        for mod_name in ('playwright', *sys.modules)
+        if mod_name == 'playwright' or mod_name.startswith('playwright.')
+    }
+    with mock.patch.dict('sys.modules', blocked):
+        for mod_name in list(sys.modules):
+            if mod_name.startswith('crawlee.crawlers._playwright'):
+                sys.modules.pop(mod_name, None)
+        with pytest.raises(ImportError):
+            from crawlee.crawlers._playwright import PlaywrightCrawler  # noqa: F401 PLC0415
diff --git a/tests/unit/crawlers/_stagehand/test_stagehand_crawler.py b/tests/unit/crawlers/_stagehand/test_stagehand_crawler.py
index 566f3bcb92..745a4c0da4 100644
--- a/tests/unit/crawlers/_stagehand/test_stagehand_crawler.py
+++ b/tests/unit/crawlers/_stagehand/test_stagehand_crawler.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import sys
 from typing import TYPE_CHECKING
 from unittest.mock import AsyncMock, MagicMock, patch
 
@@ -165,3 +166,18 @@ async def handler(context: StagehandCrawlingContext) -> None:
 
     assert isinstance(method_mock.call_args.kwargs['page'], Page)
     assert argument in method_mock.call_args.kwargs
+
+
+def test_import_error_handled() -> None:
+    # Block the package itself (even if it has not been imported yet) together with all of its cached submodules.
+    blocked = {
+        mod_name: None
+        for mod_name in ('stagehand', *sys.modules)
+        if mod_name == 'stagehand' or mod_name.startswith('stagehand.')
+    }
+    with patch.dict('sys.modules', blocked):
+        for mod_name in list(sys.modules):
+            if mod_name.startswith(('crawlee.crawlers._stagehand', 'crawlee.browsers._stagehand_browser_plugin')):
+                sys.modules.pop(mod_name, None)
+        with pytest.raises(ImportError):
+            from crawlee.crawlers._stagehand import StagehandCrawler  # noqa: F401 PLC0415
diff --git a/tests/unit/http_clients/test_http_clients.py b/tests/unit/http_clients/test_http_clients.py
index fb1ea43eca..330bd2072c 100644
--- a/tests/unit/http_clients/test_http_clients.py
+++ b/tests/unit/http_clients/test_http_clients.py
@@ -1,7 +1,10 @@
 from __future__ import annotations
 
+import importlib
 import os
+import sys
 from typing import TYPE_CHECKING
+from unittest.mock import patch
 
 import pytest
 from curl_cffi import CurlHttpVersion
@@ -263,3 +266,25 @@ async def test_stream_rejects_non_http_scheme(http_client: HttpClient) -> None:
     with pytest.raises(ValidationError):
         async with http_client.stream('gopher://127.0.0.1:6379/_PING'):
             pass
+
+
+@pytest.mark.parametrize(
+    ('optional_module_name', 'import_path'),
+    [
+        pytest.param('curl_cffi', 'crawlee.http_clients._curl_impersonate', id='curl_impersonate'),
+        pytest.param('httpx', 'crawlee.http_clients._httpx', id='httpx'),
+    ],
+)
+def test_import_error_handled(optional_module_name: str, import_path: str) -> None:
+    # Block the package itself (even if it has not been imported yet) together with all of its cached submodules.
+    blocked = {
+        mod_name: None
+        for mod_name in (optional_module_name, *sys.modules)
+        if mod_name == optional_module_name or mod_name.startswith(f'{optional_module_name}.')
+    }
+    with patch.dict('sys.modules', blocked):
+        for mod_name in list(sys.modules):
+            if mod_name.startswith(import_path):
+                sys.modules.pop(mod_name, None)
+        with pytest.raises(ImportError):
+            importlib.import_module(import_path)
diff --git a/tests/unit/otel/test_crawler_instrumentor.py b/tests/unit/otel/test_crawler_instrumentor.py
index 9af7078341..39e835550d 100644
--- a/tests/unit/otel/test_crawler_instrumentor.py
+++ b/tests/unit/otel/test_crawler_instrumentor.py
@@ -1,8 +1,11 @@
 import io
 import json
 import re
+import sys
 from unittest import mock
+from unittest.mock import patch
 
+import pytest
 from opentelemetry.sdk.resources import Resource
 from opentelemetry.sdk.trace import TracerProvider
 from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor
@@ -103,3 +106,16 @@ async def test_crawler_instrumentor_capability(server_url: URL) -> None:
 
     # Check that trace_ids of unrelated traces are not the same.
     assert telemetry_data[0]['context']['trace_id'] != telemetry_data[-1]['context']['trace_id']
+
+
+def test_import_error_handled() -> None:
+    # Block the package itself (even if it has not been imported yet) together with all of its cached submodules.
+    blocked = {
+        mod_name: None
+        for mod_name in ('opentelemetry', *sys.modules)
+        if mod_name == 'opentelemetry' or mod_name.startswith('opentelemetry.')
+    }
+    with patch.dict('sys.modules', blocked):
+        sys.modules.pop('crawlee.otel.crawler_instrumentor', None)
+        with pytest.raises(ImportError):
+            from crawlee.otel.crawler_instrumentor import CrawlerInstrumentor  # noqa: F401 PLC0415
diff --git a/tests/unit/storage_clients/_redis/test_redis_storage_client.py b/tests/unit/storage_clients/_redis/test_redis_storage_client.py
new file mode 100644
index 0000000000..38b2a773fc
--- /dev/null
+++ b/tests/unit/storage_clients/_redis/test_redis_storage_client.py
@@ -0,0 +1,19 @@
+from __future__ import annotations
+
+import sys
+from unittest.mock import patch
+
+import pytest
+
+
+def test_import_error_handled() -> None:
+    # Block the package itself (even if it has not been imported yet) together with all of its cached submodules.
+    blocked = {
+        mod_name: None for mod_name in ('redis', *sys.modules) if mod_name == 'redis' or mod_name.startswith('redis.')
+    }
+    with patch.dict('sys.modules', blocked):
+        for mod_name in list(sys.modules):
+            if mod_name.startswith('crawlee.storage_clients._redis'):
+                sys.modules.pop(mod_name, None)
+        with pytest.raises(ImportError):
+            from crawlee.storage_clients._redis import RedisStorageClient  # noqa: F401 PLC0415
diff --git a/tests/unit/storage_clients/_sql/test_sql_storage_client.py b/tests/unit/storage_clients/_sql/test_sql_storage_client.py
index 93202b54b9..1b43cc2169 100644
--- a/tests/unit/storage_clients/_sql/test_sql_storage_client.py
+++ b/tests/unit/storage_clients/_sql/test_sql_storage_client.py
@@ -1,7 +1,10 @@
 from __future__ import annotations
 
+import sys
 from typing import TYPE_CHECKING
+from unittest.mock import patch
 
+import pytest
 from sqlalchemy import text
 from sqlalchemy.ext.asyncio import create_async_engine
 
@@ -49,3 +52,18 @@ async def test_sqlite_wal_mode_not_applied_with_custom_engine(tmp_path: Path) ->
     async with engine.begin() as conn:
         result = await conn.execute(text('PRAGMA journal_mode'))
         assert result.scalar() != 'wal'
+
+
+def test_import_error_handled() -> None:
+    # Block the package itself (even if it has not been imported yet) together with all of its cached submodules.
+    blocked = {
+        mod_name: None
+        for mod_name in ('sqlalchemy', *sys.modules)
+        if mod_name == 'sqlalchemy' or mod_name.startswith('sqlalchemy.')
+    }
+    with patch.dict('sys.modules', blocked):
+        for mod_name in list(sys.modules):
+            if mod_name.startswith('crawlee.storage_clients._sql'):
+                sys.modules.pop(mod_name, None)
+        with pytest.raises(ImportError):
+            from crawlee.storage_clients._sql import SqlStorageClient  # noqa: F401 PLC0415
diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py
index 6ac320caf6..66430ebc02 100644
--- a/tests/unit/test_cli.py
+++ b/tests/unit/test_cli.py
@@ -1,7 +1,8 @@
 from __future__ import annotations
 
 import os
-from unittest.mock import ANY, Mock
+import sys
+from unittest.mock import ANY, Mock, patch
 
 import pytest
 import readchar
@@ -251,3 +252,28 @@ def test_create_existing_folder_interactive_multiple_attempts(
             'install_project': True,
         },
     )
+
+
+@pytest.mark.parametrize(
+    ('optional_module_name'),
+    [
+        pytest.param('cookiecutter', id='cookiecutter'),
+        pytest.param('inquirer', id='inquirer'),
+        pytest.param('rich', id='rich'),
+        pytest.param('typer', id='typer'),
+    ],
+)
+def test_import_error_handled(optional_module_name: str) -> None:
+    # Block the package itself (even if it has not been imported yet) together with all of its cached
+    # submodules, so that cached submodule entries cannot bypass the blocked top-level package.
+    blocked = {
+        mod_name: None
+        for mod_name in (optional_module_name, *sys.modules)
+        if mod_name == optional_module_name or mod_name.startswith(f'{optional_module_name}.')
+    }
+
+    with patch.dict('sys.modules', blocked):
+        sys.modules.pop('crawlee._cli', None)
+
+        with pytest.raises(ImportError):
+            from crawlee._cli import cli  # noqa: F401 PLC0415