Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/on_schedule_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
fail-fast: false
max-parallel: 12
matrix:
crawler-type: ["playwright_camoufox", "playwright_chrome", "playwright_firefox", "playwright_webkit", "playwright", "parsel", "beautifulsoup"]
crawler-type: ["playwright_camoufox", "playwright_chrome", "playwright_firefox", "playwright_webkit", "playwright", "parsel", "beautifulsoup", "adaptive_beautifulsoup", "adaptive_parsel", "stagehand"]
http-client: ["httpx", "curl_impersonate", "impit"]
package-manager: ["pip", "uv", "poetry"]

Expand Down
2 changes: 1 addition & 1 deletion src/crawlee/project_template/cookiecutter.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"project_name": "crawlee-python-project",
"__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}",
"crawler_type": ["beautifulsoup", "parsel", "playwright", "playwright-camoufox", "playwright-chrome", "playwright-firefox", "playwright-webkit"],
"crawler_type": ["beautifulsoup", "parsel", "adaptive-beautifulsoup", "adaptive-parsel", "playwright", "playwright-camoufox", "playwright-chrome", "playwright-firefox", "playwright-webkit", "stagehand"],
"__crawler_type": "{{ cookiecutter.crawler_type|lower|replace('-', '_') }}",
"http_client": ["impit", "httpx", "curl-impersonate"],
"package_manager": ["poetry", "pip", "uv"],
Expand Down
5 changes: 3 additions & 2 deletions src/crawlee/project_template/hooks/post_gen_project.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# % set needs_playwright = cookiecutter.crawler_type == 'playwright' or cookiecutter.crawler_type.startswith('adaptive-') or cookiecutter.crawler_type == 'stagehand'
import platform
import subprocess
from pathlib import Path
Expand All @@ -12,7 +13,7 @@
subprocess.check_call(['uv', 'sync'])
# % endif

# % if cookiecutter.crawler_type == 'playwright'
# % if needs_playwright
manager = "{{ cookiecutter.package_manager }}"
subprocess.check_call([manager, 'run', 'playwright', 'install'])
# % endif
Expand All @@ -38,7 +39,7 @@
subprocess.check_output([str(path / 'pip'), 'freeze']).decode()
)

# % if cookiecutter.crawler_type == 'playwright'
# % if needs_playwright
subprocess.check_call([str(path / 'playwright'), 'install'])
# % endif
# % endif
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# % extends 'main.py'
# Cookiecutter/Jinja template (line-statement prefix '# %') that generates the
# project's main.py for the 'adaptive-beautifulsoup' crawler type by overriding
# blocks of the base 'main.py' template. Text outside blocks is not rendered.

# % block import
from crawlee.crawlers import AdaptivePlaywrightCrawler
# % endblock

# % block instantiation
# Adaptive crawler combining BeautifulSoup static parsing with a Playwright
# browser (the project setup installs Playwright for this crawler type).
crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
request_handler=router,
max_requests_per_crawl=10,
{{ self.http_client_instantiation() }})
# % endblock
12 changes: 12 additions & 0 deletions src/crawlee/project_template/templates/main_adaptive_parsel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# % extends 'main.py'
# Cookiecutter/Jinja template (line-statement prefix '# %') that generates the
# project's main.py for the 'adaptive-parsel' crawler type by overriding
# blocks of the base 'main.py' template. Text outside blocks is not rendered.

# % block import
from crawlee.crawlers import AdaptivePlaywrightCrawler
# % endblock

# % block instantiation
# Adaptive crawler combining Parsel static parsing with a Playwright
# browser (the project setup installs Playwright for this crawler type).
crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
request_handler=router,
max_requests_per_crawl=10,
{{ self.http_client_instantiation() }})
# % endblock
23 changes: 23 additions & 0 deletions src/crawlee/project_template/templates/main_stagehand.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# % extends 'main.py'
# Cookiecutter/Jinja template (line-statement prefix '# %') that generates the
# project's main.py for the 'stagehand' crawler type by overriding blocks of
# the base 'main.py' template. Text outside blocks is not rendered.

# % block import
import os

from crawlee.browsers import StagehandOptions
from crawlee.crawlers import StagehandCrawler
# % endblock

# % block instantiation
# Stagehand requires a model API key; fail fast with a clear message instead of
# letting the crawler error out later with a missing-credentials failure.
model_api_key = os.environ.get('OPENAI_API_KEY')
if model_api_key is None:
raise ValueError('The OPENAI_API_KEY environment variable is not set.')

crawler = StagehandCrawler(
request_handler=router,
headless=True,
max_requests_per_crawl=10,
stagehand_options=StagehandOptions(
model_api_key=model_api_key,
),
{{ self.http_client_instantiation() }})
# % endblock
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from crawlee.crawlers import AdaptivePlaywrightCrawlingContext
from crawlee.router import Router

# Request router for the adaptive crawler; only the default handler is registered.
router = Router[AdaptivePlaywrightCrawlingContext]()


@router.default_handler
async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
    """Process a request: log it, store the page title, and enqueue found links."""
    context.log.info(f'Processing {context.request.url} ...')

    # BeautifulSoup lookup; `find` returns None when no <title> tag exists.
    title_tag = context.parsed_content.find('title')
    record = {
        'url': context.request.loaded_url,
        'title': title_tag.text if title_tag else None,
    }
    await context.push_data(record)

    await context.enqueue_links()
19 changes: 19 additions & 0 deletions src/crawlee/project_template/templates/routes_adaptive_parsel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from crawlee.crawlers import AdaptivePlaywrightCrawlingContext
from crawlee.router import Router

# Request router for the adaptive crawler; only the default handler is registered.
router = Router[AdaptivePlaywrightCrawlingContext]()


@router.default_handler
async def default_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
    """Process a request: log it, store the page title, and enqueue found links."""
    context.log.info(f'Processing {context.request.url} ...')

    # Parsel XPath; `.get()` yields the first match or None when absent.
    payload = {
        'url': context.request.loaded_url,
        'title': context.parsed_content.xpath('//title/text()').get(),
    }
    await context.push_data(payload)

    await context.enqueue_links()
21 changes: 21 additions & 0 deletions src/crawlee/project_template/templates/routes_stagehand.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from crawlee.crawlers import StagehandCrawlingContext
from crawlee.router import Router

# Request router for the Stagehand crawler; only the default handler is registered.
router = Router[StagehandCrawlingContext]()


@router.default_handler
async def default_handler(context: StagehandCrawlingContext) -> None:
    """Process a request: extract data via Stagehand, store it, enqueue links."""
    context.log.info(f'Processing {context.request.url} ...')

    # Natural-language extraction; the result is a pydantic-style model
    # (it exposes `model_dump()`), serialized before being pushed.
    extracted = await context.page.extract(instruction='Get the page title and main heading.')
    record = {
        'url': context.request.loaded_url,
        'data': extracted.model_dump(),
    }
    await context.push_data(record)

    await context.enqueue_links()
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
# % if cookiecutter.crawler_type == 'playwright'
FROM apify/actor-python-playwright:3.13
# % if cookiecutter.crawler_type == 'playwright' or cookiecutter.crawler_type.startswith('adaptive-') or cookiecutter.crawler_type == 'stagehand'
# % set base_image = 'apify/actor-python-playwright:3.13'
# % elif cookiecutter.crawler_type == 'playwright-camoufox'
FROM apify/actor-python-playwright-camoufox:3.13
# % set base_image = 'apify/actor-python-playwright-camoufox:3.13'
# % elif cookiecutter.crawler_type == 'playwright-chrome'
FROM apify/actor-python-playwright-chrome:3.13
# % set base_image = 'apify/actor-python-playwright-chrome:3.13'
# % elif cookiecutter.crawler_type == 'playwright-firefox'
FROM apify/actor-python-playwright-firefox:3.13
# % set base_image = 'apify/actor-python-playwright-firefox:3.13'
# % elif cookiecutter.crawler_type == 'playwright-webkit'
FROM apify/actor-python-playwright-webkit:3.13
# % set base_image = 'apify/actor-python-playwright-webkit:3.13'
# % else
FROM apify/actor-python:3.13
# % set base_image = 'apify/actor-python:3.13'
# % endif
FROM {{ base_image }}

RUN apt update && apt install -yq git && rm -rf /var/lib/apt/lists/*

Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
# % if cookiecutter.crawler_type.startswith('playwright')
# % if cookiecutter.crawler_type.startswith('adaptive-')
# % set extras = ['adaptive-crawler', 'beautifulsoup', 'parsel']
# % elif cookiecutter.crawler_type.startswith('playwright')
# % set extras = ['playwright']
# % elif cookiecutter.crawler_type == 'stagehand'
# % set extras = ['stagehand']
# % else
# % set extras = [cookiecutter.crawler_type]
# % endif
# % if cookiecutter.http_client == 'curl-impersonate'
# % do extras.append('curl-impersonate')
# % elif cookiecutter.http_client == 'httpx'
# % do extras.append('httpx')
# % if cookiecutter.http_client in ('httpx', 'curl-impersonate')
# % do extras.append(cookiecutter.http_client)
# % endif

[project]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
# % if cookiecutter.crawler_type == 'playwright-camoufox'
camoufox[geoip]~=0.4.5
# % endif
# % if cookiecutter.crawler_type.startswith('playwright')
# % if cookiecutter.crawler_type.startswith('adaptive-')
# % set extras = ['adaptive-crawler', 'beautifulsoup', 'parsel']
# % elif cookiecutter.crawler_type.startswith('playwright')
# % set extras = ['playwright']
# % elif cookiecutter.crawler_type == 'stagehand'
# % set extras = ['stagehand']
# % else
# % set extras = [cookiecutter.crawler_type]
# % endif
# % if cookiecutter.enable_apify_integration
apify
# % endif
# % if cookiecutter.http_client == 'curl-impersonate'
# % do extras.append('curl-impersonate')
# % endif
# % if cookiecutter.http_client == 'httpx'
# % do extras.append('httpx')
# % if cookiecutter.http_client in ('httpx', 'curl-impersonate')
# % do extras.append(cookiecutter.http_client)
# % endif
crawlee[{{ extras | join(',') }}]
3 changes: 3 additions & 0 deletions tests/e2e/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ def pytest_configure(config: Config) -> None:
'playwright_webkit',
'parsel',
'beautifulsoup',
'adaptive_beautifulsoup',
'adaptive_parsel',
'stagehand',
'uv',
'poetry',
'pip',
Expand Down
11 changes: 11 additions & 0 deletions tests/e2e/project_template/test_static_crawlers_templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@
pytest.param('playwright', marks=pytest.mark.playwright),
pytest.param('parsel', marks=pytest.mark.parsel),
pytest.param('beautifulsoup', marks=pytest.mark.beautifulsoup),
pytest.param('adaptive-beautifulsoup', marks=pytest.mark.adaptive_beautifulsoup),
pytest.param('adaptive-parsel', marks=pytest.mark.adaptive_parsel),
pytest.param('stagehand', marks=pytest.mark.stagehand),
],
)
@pytest.mark.parametrize(
Expand Down Expand Up @@ -106,6 +109,14 @@ async def test_static_crawler_actor_at_apify(
client = ApifyClientAsync(token=os.getenv('APIFY_TEST_USER_API_TOKEN'))
actor = client.actor(actor_id)

# The template ships a placeholder API key, so only validate the build and skip the run.
if crawler_type == 'stagehand':
try:
assert build_process.returncode == 0
finally:
await actor.delete()
return

# Run actor
try:
assert build_process.returncode == 0
Expand Down
Loading