diff --git a/.github/workflows/on_schedule_tests.yaml b/.github/workflows/on_schedule_tests.yaml index bf041faa08..96c4edb98d 100644 --- a/.github/workflows/on_schedule_tests.yaml +++ b/.github/workflows/on_schedule_tests.yaml @@ -27,7 +27,7 @@ jobs: fail-fast: false max-parallel: 12 matrix: - crawler-type: ["playwright_camoufox", "playwright_chrome", "playwright_firefox", "playwright_webkit", "playwright", "parsel", "beautifulsoup"] + crawler-type: ["playwright_camoufox", "playwright_chrome", "playwright_firefox", "playwright_webkit", "playwright", "parsel", "beautifulsoup", "adaptive_beautifulsoup", "adaptive_parsel", "stagehand"] http-client: ["httpx", "curl_impersonate", "impit"] package-manager: ["pip", "uv", "poetry"] diff --git a/src/crawlee/project_template/cookiecutter.json b/src/crawlee/project_template/cookiecutter.json index 53e0c8f445..0d50dbd766 100644 --- a/src/crawlee/project_template/cookiecutter.json +++ b/src/crawlee/project_template/cookiecutter.json @@ -1,7 +1,7 @@ { "project_name": "crawlee-python-project", "__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}", - "crawler_type": ["beautifulsoup", "parsel", "playwright", "playwright-camoufox", "playwright-chrome", "playwright-firefox", "playwright-webkit"], + "crawler_type": ["beautifulsoup", "parsel", "adaptive-beautifulsoup", "adaptive-parsel", "playwright", "playwright-camoufox", "playwright-chrome", "playwright-firefox", "playwright-webkit", "stagehand"], "__crawler_type": "{{ cookiecutter.crawler_type|lower|replace('-', '_') }}", "http_client": ["impit", "httpx", "curl-impersonate"], "package_manager": ["poetry", "pip", "uv"], diff --git a/src/crawlee/project_template/hooks/post_gen_project.py b/src/crawlee/project_template/hooks/post_gen_project.py index dc90ec516c..c7e69b030b 100644 --- a/src/crawlee/project_template/hooks/post_gen_project.py +++ b/src/crawlee/project_template/hooks/post_gen_project.py @@ -1,3 +1,4 @@ +# % set needs_playwright = 
cookiecutter.crawler_type == 'playwright' or cookiecutter.crawler_type.startswith('adaptive-') or cookiecutter.crawler_type == 'stagehand' import platform import subprocess from pathlib import Path @@ -12,7 +13,7 @@ subprocess.check_call(['uv', 'sync']) # % endif -# % if cookiecutter.crawler_type == 'playwright' +# % if needs_playwright manager = "{{ cookiecutter.package_manager }}" subprocess.check_call([manager, 'run', 'playwright', 'install']) # % endif @@ -38,7 +39,7 @@ subprocess.check_output([str(path / 'pip'), 'freeze']).decode() ) -# % if cookiecutter.crawler_type == 'playwright' +# % if needs_playwright subprocess.check_call([str(path / 'playwright'), 'install']) # % endif # % endif diff --git a/src/crawlee/project_template/templates/main_adaptive_beautifulsoup.py b/src/crawlee/project_template/templates/main_adaptive_beautifulsoup.py new file mode 100644 index 0000000000..7c536498b9 --- /dev/null +++ b/src/crawlee/project_template/templates/main_adaptive_beautifulsoup.py @@ -0,0 +1,12 @@ +# % extends 'main.py' + +# % block import +from crawlee.crawlers import AdaptivePlaywrightCrawler +# % endblock + +# % block instantiation +crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser( + request_handler=router, + max_requests_per_crawl=10, + {{ self.http_client_instantiation() }}) +# % endblock diff --git a/src/crawlee/project_template/templates/main_adaptive_parsel.py b/src/crawlee/project_template/templates/main_adaptive_parsel.py new file mode 100644 index 0000000000..ff789928e0 --- /dev/null +++ b/src/crawlee/project_template/templates/main_adaptive_parsel.py @@ -0,0 +1,12 @@ +# % extends 'main.py' + +# % block import +from crawlee.crawlers import AdaptivePlaywrightCrawler +# % endblock + +# % block instantiation +crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( + request_handler=router, + max_requests_per_crawl=10, + {{ self.http_client_instantiation() }}) +# % endblock diff --git 
a/src/crawlee/project_template/templates/main_stagehand.py b/src/crawlee/project_template/templates/main_stagehand.py new file mode 100644 index 0000000000..0a63a5153e --- /dev/null +++ b/src/crawlee/project_template/templates/main_stagehand.py @@ -0,0 +1,23 @@ +# % extends 'main.py' + +# % block import +import os + +from crawlee.browsers import StagehandOptions +from crawlee.crawlers import StagehandCrawler +# % endblock + +# % block instantiation +model_api_key = os.environ.get('OPENAI_API_KEY') +if model_api_key is None: + raise ValueError('The OPENAI_API_KEY environment variable is not set.') + +crawler = StagehandCrawler( + request_handler=router, + headless=True, + max_requests_per_crawl=10, + stagehand_options=StagehandOptions( + model_api_key=model_api_key, + ), + {{ self.http_client_instantiation() }}) +# % endblock diff --git a/src/crawlee/project_template/templates/routes_adaptive_beautifulsoup.py b/src/crawlee/project_template/templates/routes_adaptive_beautifulsoup.py new file mode 100644 index 0000000000..b0d8333853 --- /dev/null +++ b/src/crawlee/project_template/templates/routes_adaptive_beautifulsoup.py @@ -0,0 +1,19 @@ +from crawlee.crawlers import AdaptivePlaywrightCrawlingContext +from crawlee.router import Router + +router = Router[AdaptivePlaywrightCrawlingContext]() + + +@router.default_handler +async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + """Default request handler.""" + context.log.info(f'Processing {context.request.url} ...') + title = context.parsed_content.find('title') + await context.push_data( + { + 'url': context.request.loaded_url, + 'title': title.text if title else None, + } + ) + + await context.enqueue_links() diff --git a/src/crawlee/project_template/templates/routes_adaptive_parsel.py b/src/crawlee/project_template/templates/routes_adaptive_parsel.py new file mode 100644 index 0000000000..95e767075d --- /dev/null +++ b/src/crawlee/project_template/templates/routes_adaptive_parsel.py @@ -0,0 
+1,19 @@ +from crawlee.crawlers import AdaptivePlaywrightCrawlingContext +from crawlee.router import Router + +router = Router[AdaptivePlaywrightCrawlingContext]() + + +@router.default_handler +async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None: + """Default request handler.""" + context.log.info(f'Processing {context.request.url} ...') + title = context.parsed_content.xpath('//title/text()').get() + await context.push_data( + { + 'url': context.request.loaded_url, + 'title': title, + } + ) + + await context.enqueue_links() diff --git a/src/crawlee/project_template/templates/routes_stagehand.py b/src/crawlee/project_template/templates/routes_stagehand.py new file mode 100644 index 0000000000..b197250b16 --- /dev/null +++ b/src/crawlee/project_template/templates/routes_stagehand.py @@ -0,0 +1,21 @@ +from crawlee.crawlers import StagehandCrawlingContext +from crawlee.router import Router + +router = Router[StagehandCrawlingContext]() + + +@router.default_handler +async def default_handler(context: StagehandCrawlingContext) -> None: + """Default request handler.""" + context.log.info(f'Processing {context.request.url} ...') + + data = await context.page.extract(instruction='Get the page title and main heading.') + + await context.push_data( + { + 'url': context.request.loaded_url, + 'data': data.model_dump(), + } + ) + + await context.enqueue_links() diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile b/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile index 323181d058..4f958871e2 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile @@ -1,19 +1,20 @@ # First, specify the base Docker image. # You can see the Docker images from Apify at https://hub.docker.com/r/apify/. # You can also use any other image from Docker Hub. 
-# % if cookiecutter.crawler_type == 'playwright' -FROM apify/actor-python-playwright:3.13 +# % if cookiecutter.crawler_type == 'playwright' or cookiecutter.crawler_type.startswith('adaptive-') or cookiecutter.crawler_type == 'stagehand' +# % set base_image = 'apify/actor-python-playwright:3.13' # % elif cookiecutter.crawler_type == 'playwright-camoufox' -FROM apify/actor-python-playwright-camoufox:3.13 +# % set base_image = 'apify/actor-python-playwright-camoufox:3.13' # % elif cookiecutter.crawler_type == 'playwright-chrome' -FROM apify/actor-python-playwright-chrome:3.13 +# % set base_image = 'apify/actor-python-playwright-chrome:3.13' # % elif cookiecutter.crawler_type == 'playwright-firefox' -FROM apify/actor-python-playwright-firefox:3.13 +# % set base_image = 'apify/actor-python-playwright-firefox:3.13' # % elif cookiecutter.crawler_type == 'playwright-webkit' -FROM apify/actor-python-playwright-webkit:3.13 +# % set base_image = 'apify/actor-python-playwright-webkit:3.13' # % else -FROM apify/actor-python:3.13 +# % set base_image = 'apify/actor-python:3.13' # % endif +FROM {{ base_image }} RUN apt update && apt install -yq git && rm -rf /var/lib/apt/lists/* diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml b/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml index 2de6aa2532..f74127ec6f 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml @@ -1,12 +1,14 @@ -# % if cookiecutter.crawler_type.startswith('playwright') +# % if cookiecutter.crawler_type.startswith('adaptive-') +# % set extras = ['adaptive-crawler', 'beautifulsoup', 'parsel'] +# % elif cookiecutter.crawler_type.startswith('playwright') # % set extras = ['playwright'] +# % elif cookiecutter.crawler_type == 'stagehand' +# % set extras = ['stagehand'] # % else # % set extras = [cookiecutter.crawler_type] # % endif -# % if 
cookiecutter.http_client == 'curl-impersonate' -# % do extras.append('curl-impersonate') -# % elif cookiecutter.http_client == 'httpx' -# % do extras.append('httpx') +# % if cookiecutter.http_client in ('httpx', 'curl-impersonate') +# % do extras.append(cookiecutter.http_client) # % endif [project] diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt b/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt index 8ac28ed5e4..47c6091d77 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt @@ -1,18 +1,19 @@ # % if cookiecutter.crawler_type == 'playwright-camoufox' camoufox[geoip]~=0.4.5 # % endif -# % if cookiecutter.crawler_type.startswith('playwright') +# % if cookiecutter.crawler_type.startswith('adaptive-') +# % set extras = ['adaptive-crawler', 'beautifulsoup', 'parsel'] +# % elif cookiecutter.crawler_type.startswith('playwright') # % set extras = ['playwright'] +# % elif cookiecutter.crawler_type == 'stagehand' +# % set extras = ['stagehand'] # % else # % set extras = [cookiecutter.crawler_type] # % endif # % if cookiecutter.enable_apify_integration apify # % endif -# % if cookiecutter.http_client == 'curl-impersonate' -# % do extras.append('curl-impersonate') -# % endif -# % if cookiecutter.http_client == 'httpx' -# % do extras.append('httpx') +# % if cookiecutter.http_client in ('httpx', 'curl-impersonate') +# % do extras.append(cookiecutter.http_client) # % endif crawlee[{{ extras | join(',') }}] diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index 81945a760a..25c5ab8695 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -20,6 +20,9 @@ def pytest_configure(config: Config) -> None: 'playwright_webkit', 'parsel', 'beautifulsoup', + 'adaptive_beautifulsoup', + 'adaptive_parsel', + 'stagehand', 'uv', 'poetry', 'pip', diff --git 
a/tests/e2e/project_template/test_static_crawlers_templates.py b/tests/e2e/project_template/test_static_crawlers_templates.py index bd40f0c8ac..358f05fe50 100644 --- a/tests/e2e/project_template/test_static_crawlers_templates.py +++ b/tests/e2e/project_template/test_static_crawlers_templates.py @@ -26,6 +26,9 @@ pytest.param('playwright', marks=pytest.mark.playwright), pytest.param('parsel', marks=pytest.mark.parsel), pytest.param('beautifulsoup', marks=pytest.mark.beautifulsoup), + pytest.param('adaptive-beautifulsoup', marks=pytest.mark.adaptive_beautifulsoup), + pytest.param('adaptive-parsel', marks=pytest.mark.adaptive_parsel), + pytest.param('stagehand', marks=pytest.mark.stagehand), ], ) @pytest.mark.parametrize( @@ -106,6 +109,14 @@ async def test_static_crawler_actor_at_apify( client = ApifyClientAsync(token=os.getenv('APIFY_TEST_USER_API_TOKEN')) actor = client.actor(actor_id) + # The Stagehand template needs a real OPENAI_API_KEY at run time (main.py raises without it), so only validate the build and skip the run. + if crawler_type == 'stagehand': + try: + assert build_process.returncode == 0 + finally: + await actor.delete() + return + # Run actor try: assert build_process.returncode == 0