From a62a06b521835f15efa24c27d28cdd26eac3c87d Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 16:50:16 +0200 Subject: [PATCH 1/3] docs: add Crawl4AI guide --- docs/01_introduction/quick-start.mdx | 1 + docs/03_guides/10_crawl4ai.mdx | 111 ++++++++++++++++++ .../code/crawl4ai_project/Dockerfile | 19 +++ .../crawl4ai_project/my_actor/__init__.py | 0 .../crawl4ai_project/my_actor/__main__.py | 8 ++ .../code/crawl4ai_project/my_actor/main.py | 73 ++++++++++++ .../code/crawl4ai_project/my_actor/scraper.py | 46 ++++++++ pyproject.toml | 4 + 8 files changed, 262 insertions(+) create mode 100644 docs/03_guides/10_crawl4ai.mdx create mode 100644 docs/03_guides/code/crawl4ai_project/Dockerfile create mode 100644 docs/03_guides/code/crawl4ai_project/my_actor/__init__.py create mode 100644 docs/03_guides/code/crawl4ai_project/my_actor/__main__.py create mode 100644 docs/03_guides/code/crawl4ai_project/my_actor/main.py create mode 100644 docs/03_guides/code/crawl4ai_project/my_actor/scraper.py diff --git a/docs/01_introduction/quick-start.mdx b/docs/01_introduction/quick-start.mdx index da166da9..6bd65f39 100644 --- a/docs/01_introduction/quick-start.mdx +++ b/docs/01_introduction/quick-start.mdx @@ -105,4 +105,5 @@ To see how you can integrate the Apify SDK with popular web scraping libraries, - [Selenium](../guides/selenium) - [Crawlee](../guides/crawlee) - [Scrapy](../guides/scrapy) +- [Crawl4AI](../guides/crawl4ai) - [Running webserver](../guides/running-webserver) diff --git a/docs/03_guides/10_crawl4ai.mdx b/docs/03_guides/10_crawl4ai.mdx new file mode 100644 index 00000000..7dc996d8 --- /dev/null +++ b/docs/03_guides/10_crawl4ai.mdx @@ -0,0 +1,111 @@ +--- +id: crawl4ai +title: Use Crawl4AI +description: Build an Apify Actor that scrapes web pages into LLM-ready markdown using the Crawl4AI library. 
+--- + +import CodeBlock from '@theme/CodeBlock'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +import Crawl4aiMain from '!!raw-loader!./code/crawl4ai_project/my_actor/main.py'; +import Crawl4aiScraper from '!!raw-loader!./code/crawl4ai_project/my_actor/scraper.py'; +import Crawl4aiEntrypoint from '!!raw-loader!./code/crawl4ai_project/my_actor/__main__.py'; +import Crawl4aiDockerfile from '!!raw-loader!./code/crawl4ai_project/Dockerfile'; + +In this guide, you'll learn how to use the [Crawl4AI](https://crawl4ai.com/) library in your Apify Actors. + +## Introduction + +[Crawl4AI](https://crawl4ai.com/) is an open-source, asynchronous web crawler built for LLM and AI workflows. It renders a page in a real browser and turns the result into clean, structured markdown that's ready to feed into a language model or a retrieval-augmented generation (RAG) pipeline, while still giving you the raw HTML, extracted links, and media when you need them. + +Some of the features that make Crawl4AI a good fit for Apify Actors: + +- **LLM-ready markdown** - Crawl4AI converts each page into clean markdown, stripping boilerplate and optionally filtering content, so the output can be fed straight into a language model. +- **Real browser rendering** - Pages are loaded in a [Playwright](https://playwright.dev/)-driven browser, so JavaScript-heavy and dynamically rendered websites work out of the box. +- **Built-in link and media extraction** - Every crawl returns the page's links already split into `internal` and `external` groups, together with the media it found, which makes recursive crawling straightforward. +- **Flexible extraction strategies** - Beyond markdown, Crawl4AI can extract structured data with CSS/XPath schemas or with an LLM, all configured per request. +- **First-class async support** - The `AsyncWebCrawler` is built on `asyncio`, which integrates naturally with the asyncio-based Apify SDK. 
+- **Per-request proxy** - Each request can be routed through its own proxy, which pairs well with Apify Proxy and its rotating IP addresses. + +Crawl4AI drives a real browser through Playwright, so after installing the library you need to download the browser binaries once with the `crawl4ai-setup` command: + +```bash +pip install crawl4ai +crawl4ai-setup +``` + +## Example Actor + +The following Actor recursively crawls pages, starting from the URLs in the Actor input and following links up to a user-defined maximum depth. It uses Crawl4AI's `AsyncWebCrawler` to render each page through [Apify Proxy](https://docs.apify.com/platform/proxy), stores the page's markdown in the dataset, and follows the internal links that Crawl4AI discovers. + +The code is split into three small modules, following the structure of the Apify Python Actor templates: + +- `my_actor/main.py` - The Actor's main coroutine. It handles the [Actor](https://docs.apify.com/platform/actors) lifecycle, reads the input, sets up [Apify Proxy](https://docs.apify.com/platform/proxy) and the [request queue](https://docs.apify.com/platform/storage/request-queue), opens a single browser-backed crawler, and drives the crawl. +- `my_actor/scraper.py` - The Crawl4AI-specific logic. A single `scrape_page` function crawls a page and returns the extracted data together with the links found on it. +- `my_actor/__main__.py` - The entry point that runs the `main` coroutine with `asyncio`. + + + + + {Crawl4aiMain} + + + + + {Crawl4aiScraper} + + + + + {Crawl4aiEntrypoint} + + + + +A few things worth pointing out: + +- A single `AsyncWebCrawler` is opened once and reused for every request. The crawler manages one browser instance, so reusing it across the whole crawl is far cheaper than launching a new browser per page. +- Keeping the crawling and parsing in `scrape_page` separates the Crawl4AI-specific code from the Actor's orchestration logic. 
The function returns the extracted data together with the discovered links, so `my_actor/main.py` decides what to store and what to enqueue. +- `result.markdown` is the rendered page as clean markdown, and `result.metadata` carries page-level fields such as the title - exactly the kind of output you want when preparing data for an LLM. +- `result.links` already separates `internal` (same-site) links from `external` ones, so the example follows only the internal links to keep the crawl on the same website. +- `CacheMode.BYPASS` tells Crawl4AI to always fetch a fresh copy of the page instead of serving it from its local cache. + +## Using Apify Proxy + +Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. In the example above, `my_actor/main.py` creates a proxy configuration with `Actor.create_proxy_configuration` and passes a fresh proxy URL to `scrape_page` for every request, which forwards it to Crawl4AI's per-request `CrawlerRunConfig`. + +`ProxyConfig.from_string` parses the proxy URL returned by `ProxyConfiguration.new_url` (for example `http://groups-RESIDENTIAL:<password>@proxy.apify.com:8000`) into the server, username, and password that the browser needs - the browser cannot take the credentials embedded directly in the URL. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. + +## Running on the Apify platform + +Because Crawl4AI renders pages in a real browser, the Actor image needs a browser and its system-level dependencies. Build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser - Crawl4AI reuses those binaries, so no separate browser-install step is required in the Dockerfile. 
+ +Add `apify` and `crawl4ai` to your `requirements.txt`: + +```text +apify +crawl4ai +``` + + + + + {Crawl4aiDockerfile} + + + + +The example pins the Python 3.13 base image because some of Crawl4AI's dependencies do not yet publish wheels for the newest Python versions, which would otherwise force a slow source build during the image build. + +## Conclusion + +In this guide, you learned how to use Crawl4AI in your Apify Actors. You can now render pages in a real browser, turn them into LLM-ready markdown, follow the links Crawl4AI discovers, route requests through Apify Proxy, and run the whole thing on the Apify platform. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! + +## Additional resources + +- [Crawl4AI: Official documentation](https://docs.crawl4ai.com/) +- [Crawl4AI: AsyncWebCrawler and configuration](https://docs.crawl4ai.com/api/async-webcrawler/) +- [Crawl4AI: Proxy and security](https://docs.crawl4ai.com/advanced/proxy-security/) +- [Crawl4AI: GitHub repository](https://github.com/unclecode/crawl4ai) +- [Apify: Proxy management](https://docs.apify.com/platform/proxy) diff --git a/docs/03_guides/code/crawl4ai_project/Dockerfile b/docs/03_guides/code/crawl4ai_project/Dockerfile new file mode 100644 index 00000000..348f6ff2 --- /dev/null +++ b/docs/03_guides/code/crawl4ai_project/Dockerfile @@ -0,0 +1,19 @@ +# Use the Apify Playwright base image, which already ships a browser together +# with all of its system-level dependencies. Crawl4AI drives this browser +# through Playwright and reuses the binaries the image provides, so no separate +# browser-install step is needed. 
+# +# The Python 3.13 image is used because some of Crawl4AI's dependencies do not +# yet publish wheels for newer Python versions. +FROM apify/actor-python-playwright:3.13-1.60.0 + +# Copy just requirements.txt first to leverage the Docker build cache. +COPY --chown=myuser:myuser requirements.txt ./ +RUN pip install -r requirements.txt + +# Copy the rest of the source code and verify that it compiles. +COPY --chown=myuser:myuser . ./ +RUN python -m compileall -q my_actor/ + +# Specify how to launch the Actor. +CMD ["python", "-m", "my_actor"] diff --git a/docs/03_guides/code/crawl4ai_project/my_actor/__init__.py b/docs/03_guides/code/crawl4ai_project/my_actor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/03_guides/code/crawl4ai_project/my_actor/__main__.py b/docs/03_guides/code/crawl4ai_project/my_actor/__main__.py new file mode 100644 index 00000000..6aeaf3d5 --- /dev/null +++ b/docs/03_guides/code/crawl4ai_project/my_actor/__main__.py @@ -0,0 +1,8 @@ +from __future__ import annotations + +import asyncio + +from .main import main + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/crawl4ai_project/my_actor/main.py b/docs/03_guides/code/crawl4ai_project/my_actor/main.py new file mode 100644 index 00000000..4e6befe6 --- /dev/null +++ b/docs/03_guides/code/crawl4ai_project/my_actor/main.py @@ -0,0 +1,73 @@ +from __future__ import annotations + +from crawl4ai import AsyncWebCrawler, BrowserConfig + +from apify import Actor, Request + +from .scraper import scrape_page + + +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Retrieve the Actor input, and use default values if not provided. + actor_input = await Actor.get_input() or {} + start_urls = actor_input.get('start_urls', [{'url': 'https://crawlee.dev'}]) + max_depth = actor_input.get('max_depth', 1) + + # Exit if no start URLs are provided. 
+ if not start_urls: + Actor.log.info('No start URLs specified in Actor input, exiting...') + await Actor.exit() + + # Create a proxy configuration that routes requests through Apify Proxy. + proxy_configuration = await Actor.create_proxy_configuration() + + # Open the default request queue for handling URLs to be processed. + request_queue = await Actor.open_request_queue() + + # Enqueue the start URLs. Their crawl depth defaults to 0. + for start_url in start_urls: + url = start_url.get('url') + Actor.log.info(f'Enqueuing {url} ...') + await request_queue.add_request(Request.from_url(url)) + + # Configure the headless browser that Crawl4AI drives. + browser_config = BrowserConfig(headless=True) + + # Open a single browser-backed crawler and reuse it for every request. + async with AsyncWebCrawler(config=browser_config) as crawler: + # Process the URLs from the request queue. + while request := await request_queue.fetch_next_request(): + url = request.url + + # Read the crawl depth tracked by the request itself. + depth = request.crawl_depth + Actor.log.info(f'Scraping {url} (depth={depth}) ...') + + try: + # Get a fresh proxy URL for each request (None if no proxy set up). + proxy_url = None + if proxy_configuration: + proxy_url = await proxy_configuration.new_url() + + # Crawl the page and extract its markdown and nested links. + data, links = await scrape_page(crawler, url, proxy_url=proxy_url) + + # Store the extracted data to the default dataset. + await Actor.push_data(data) + + # If we are not too deep yet, enqueue the links we found one + # level deeper than the current page. + if depth < max_depth: + for link_url in links: + new_request = Request.from_url(link_url) + new_request.crawl_depth = depth + 1 + await request_queue.add_request(new_request) + + except Exception: + Actor.log.exception(f'Cannot extract data from {url}.') + + finally: + # Mark the request as handled so it is not processed again. 
+ await request_queue.mark_request_as_handled(request) diff --git a/docs/03_guides/code/crawl4ai_project/my_actor/scraper.py b/docs/03_guides/code/crawl4ai_project/my_actor/scraper.py new file mode 100644 index 00000000..f96f76e3 --- /dev/null +++ b/docs/03_guides/code/crawl4ai_project/my_actor/scraper.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from crawl4ai import CacheMode, CrawlerRunConfig, ProxyConfig + +if TYPE_CHECKING: + from crawl4ai import AsyncWebCrawler + + +async def scrape_page( + crawler: AsyncWebCrawler, + url: str, + *, + proxy_url: str | None = None, +) -> tuple[dict[str, Any], list[str]]: + """Crawl a single page with Crawl4AI and extract its markdown and links. + + The page is rendered in the browser managed by `crawler`, and Crawl4AI turns + the result into clean, LLM-ready markdown. Setting `proxy_config` on the + per-request `CrawlerRunConfig` routes this request through Apify Proxy, so + every page can use a fresh IP address. + """ + run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + proxy_config=ProxyConfig.from_string(proxy_url) if proxy_url else None, + ) + + result = await crawler.arun(url, config=run_config) + if not result.success: + raise RuntimeError(result.error_message or f'Failed to crawl {url}') + + # `result.markdown` is the rendered page as clean markdown, and + # `result.metadata` carries page-level fields such as the title. + data = { + 'url': result.url, + 'title': (result.metadata or {}).get('title'), + 'markdown': str(result.markdown), + } + + # Crawl4AI already splits links into `internal` (same site) and `external`. + # We follow only the internal ones to keep the crawl on the same website. 
+ internal_links = result.links.get('internal', []) + links = [link['href'] for link in internal_links if link.get('href')] + + return data, links diff --git a/pyproject.toml b/pyproject.toml index d17bdc01..38846e70 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -181,6 +181,10 @@ indent-style = "space" # Local imports in Scrapy project. "TID252", # Prefer absolute imports over relative imports from parent modules ] +"**/docs/**/crawl4ai_project/**" = [ + # Local imports are mixed up with the Apify SDK. + "I001", # Import block is un-sorted or un-formatted +] [tool.ruff.lint.flake8-quotes] docstring-quotes = "double" From aef18134a30dbf2159b73ab2b111406815c40c04 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 20:45:18 +0200 Subject: [PATCH 2/3] docs: renumber Crawl4AI guide to 08 and switch to a single-file example --- .../{10_crawl4ai.mdx => 08_crawl4ai.mdx} | 57 ++------ docs/03_guides/code/08_crawl4ai.py | 124 ++++++++++++++++++ .../code/crawl4ai_project/Dockerfile | 19 --- .../crawl4ai_project/my_actor/__init__.py | 0 .../crawl4ai_project/my_actor/__main__.py | 8 -- .../code/crawl4ai_project/my_actor/main.py | 73 ----------- .../code/crawl4ai_project/my_actor/scraper.py | 46 ------- 7 files changed, 137 insertions(+), 190 deletions(-) rename docs/03_guides/{10_crawl4ai.mdx => 08_crawl4ai.mdx} (68%) create mode 100644 docs/03_guides/code/08_crawl4ai.py delete mode 100644 docs/03_guides/code/crawl4ai_project/Dockerfile delete mode 100644 docs/03_guides/code/crawl4ai_project/my_actor/__init__.py delete mode 100644 docs/03_guides/code/crawl4ai_project/my_actor/__main__.py delete mode 100644 docs/03_guides/code/crawl4ai_project/my_actor/main.py delete mode 100644 docs/03_guides/code/crawl4ai_project/my_actor/scraper.py diff --git a/docs/03_guides/10_crawl4ai.mdx b/docs/03_guides/08_crawl4ai.mdx similarity index 68% rename from docs/03_guides/10_crawl4ai.mdx rename to docs/03_guides/08_crawl4ai.mdx index 7dc996d8..0802c002 100644 --- 
a/docs/03_guides/10_crawl4ai.mdx +++ b/docs/03_guides/08_crawl4ai.mdx @@ -1,19 +1,14 @@ --- id: crawl4ai -title: Use Crawl4AI +title: LLM-ready scraping with Crawl4AI description: Build an Apify Actor that scrapes web pages into LLM-ready markdown using the Crawl4AI library. --- -import CodeBlock from '@theme/CodeBlock'; -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; -import Crawl4aiMain from '!!raw-loader!./code/crawl4ai_project/my_actor/main.py'; -import Crawl4aiScraper from '!!raw-loader!./code/crawl4ai_project/my_actor/scraper.py'; -import Crawl4aiEntrypoint from '!!raw-loader!./code/crawl4ai_project/my_actor/__main__.py'; -import Crawl4aiDockerfile from '!!raw-loader!./code/crawl4ai_project/Dockerfile'; +import Crawl4aiExample from '!!raw-loader!roa-loader!./code/08_crawl4ai.py'; -In this guide, you'll learn how to use the [Crawl4AI](https://crawl4ai.com/) library in your Apify Actors. +In this guide, you'll learn how to use the [Crawl4AI](https://crawl4ai.com/) library for LLM-ready web scraping in your Apify Actors. ## Introduction @@ -39,41 +34,23 @@ crawl4ai-setup The following Actor recursively crawls pages, starting from the URLs in the Actor input and following links up to a user-defined maximum depth. It uses Crawl4AI's `AsyncWebCrawler` to render each page through [Apify Proxy](https://docs.apify.com/platform/proxy), stores the page's markdown in the dataset, and follows the internal links that Crawl4AI discovers. -The code is split into three small modules, following the structure of the Apify Python Actor templates: - -- `my_actor/main.py` - The Actor's main coroutine. 
It handles the [Actor](https://docs.apify.com/platform/actors) lifecycle, reads the input, sets up [Apify Proxy](https://docs.apify.com/platform/proxy) and the [request queue](https://docs.apify.com/platform/storage/request-queue), opens a single browser-backed crawler, and drives the crawl. -- `my_actor/scraper.py` - The Crawl4AI-specific logic. A single `scrape_page` function crawls a page and returns the extracted data together with the links found on it. -- `my_actor/__main__.py` - The entry point that runs the `main` coroutine with `asyncio`. - - - - - {Crawl4aiMain} - - - - - {Crawl4aiScraper} - - - - - {Crawl4aiEntrypoint} - - - +The whole Actor fits in a single file. A `scrape_page` helper holds the Crawl4AI-specific crawling and parsing, an `enqueue_links` helper adds the discovered links to the request queue one level deeper unless the maximum depth has been reached, and the `main` coroutine handles the [Actor](https://docs.apify.com/platform/actors) lifecycle, reads the input, sets up [Apify Proxy](https://docs.apify.com/platform/proxy) and the [request queue](https://docs.apify.com/platform/storage/request-queue), opens a single browser-backed crawler, and drives the crawl. Note that `main` also caps the crawl at 50 requests (`max_requests`); raise or remove that cap to follow more pages: + + {Crawl4aiExample} + A few things worth pointing out: - A single `AsyncWebCrawler` is opened once and reused for every request. The crawler manages one browser instance, so reusing it across the whole crawl is far cheaper than launching a new browser per page. -- Keeping the crawling and parsing in `scrape_page` separates the Crawl4AI-specific code from the Actor's orchestration logic. The function returns the extracted data together with the discovered links, so `my_actor/main.py` decides what to store and what to enqueue. +- Keeping the crawling and parsing in `scrape_page` separates the Crawl4AI-specific code from the Actor's orchestration logic. The function returns the extracted data together with the discovered links, so `main` decides what to store and what to enqueue. 
- `result.markdown` is the rendered page as clean markdown, and `result.metadata` carries page-level fields such as the title - exactly the kind of output you want when preparing data for an LLM. - `result.links` already separates `internal` (same-site) links from `external` ones, so the example follows only the internal links to keep the crawl on the same website. - `CacheMode.BYPASS` tells Crawl4AI to always fetch a fresh copy of the page instead of serving it from its local cache. ## Using Apify Proxy -Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. In the example above, `my_actor/main.py` creates a proxy configuration with `Actor.create_proxy_configuration` and passes a fresh proxy URL to `scrape_page` for every request, which forwards it to Crawl4AI's per-request `CrawlerRunConfig`. +Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. In the example above, `main` creates a proxy configuration with `Actor.create_proxy_configuration` and passes a fresh proxy URL to `scrape_page` for every request, which forwards it to Crawl4AI's per-request `CrawlerRunConfig`. `ProxyConfig.from_string` parses the proxy URL returned by `ProxyConfiguration.new_url` (for example `http://groups-RESIDENTIAL:<password>@proxy.apify.com:8000`) into the server, username, and password that the browser needs - the browser cannot take the credentials embedded directly in the URL. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. 
@@ -81,6 +58,8 @@ Running on the Apify platform gives your scraper access to [Apify Proxy](https:/ Because Crawl4AI renders pages in a real browser, the Actor image needs a browser and its system-level dependencies. Build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser - Crawl4AI reuses those binaries, so no separate browser-install step is required in the Dockerfile. +Pin the Python 3.13 variant of that image (for example `apify/actor-python-playwright:3.13-1.60.0`), because some of Crawl4AI's dependencies do not yet publish wheels for the newest Python versions, which would otherwise force a slow source build during the image build. + Add `apify` and `crawl4ai` to your `requirements.txt`: ```text @@ -88,16 +67,6 @@ apify crawl4ai ``` - - - - {Crawl4aiDockerfile} - - - - -The example pins the Python 3.13 base image because some of Crawl4AI's dependencies do not yet publish wheels for the newest Python versions, which would otherwise force a slow source build during the image build. - ## Conclusion In this guide, you learned how to use Crawl4AI in your Apify Actors. You can now render pages in a real browser, turn them into LLM-ready markdown, follow the links Crawl4AI discovers, route requests through Apify Proxy, and run the whole thing on the Apify platform. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 
diff --git a/docs/03_guides/code/08_crawl4ai.py b/docs/03_guides/code/08_crawl4ai.py new file mode 100644 index 00000000..1c7884c1 --- /dev/null +++ b/docs/03_guides/code/08_crawl4ai.py @@ -0,0 +1,124 @@ +import asyncio +from typing import Any + +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CacheMode, + CrawlerRunConfig, + ProxyConfig, +) + +from apify import Actor, Request +from apify.storages import RequestQueue + + +async def scrape_page( + crawler: AsyncWebCrawler, + url: str, + *, + proxy_url: str | None = None, +) -> tuple[dict[str, Any], list[str]]: + """Crawl a page with Crawl4AI and return its markdown and same-site links.""" + run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + proxy_config=ProxyConfig.from_string(proxy_url) if proxy_url else None, + ) + + result = await crawler.arun(url, config=run_config) + if not result.success: + raise RuntimeError(result.error_message or f'Failed to crawl {url}') + + data = { + 'url': result.url, + 'title': (result.metadata or {}).get('title'), + 'markdown': str(result.markdown), + } + + # Crawl4AI already classifies links; follow only the internal ones. + internal_links = result.links.get('internal', []) + links = [link['href'] for link in internal_links if link.get('href')] + + return data, links + + +async def enqueue_links( + request_queue: RequestQueue, + links: list[str], + *, + depth: int, + max_depth: int, +) -> None: + """Enqueue the links one level deeper, unless max_depth was reached.""" + if depth >= max_depth: + return + + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + request = Request.from_url(link_url) + request.crawl_depth = depth + 1 + await request_queue.add_request(request) + + +async def main() -> None: + async with Actor: + # Read the Actor input. 
+ actor_input = await Actor.get_input() or {} + start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) + max_depth = actor_input.get('maxDepth', 1) + + if not start_urls: + Actor.log.info('No start URLs specified in Actor input, exiting...') + await Actor.exit() + + # Set up Apify Proxy and the request queue. + proxy_configuration = await Actor.create_proxy_configuration() + request_queue = await Actor.open_request_queue() + + # Enqueue the start URLs (crawl depth defaults to 0). + for start_url in start_urls: + url = start_url.get('url') + Actor.log.info(f'Enqueuing start URL: {url}') + await request_queue.add_request(Request.from_url(url)) + + # Cap the crawl; raise or remove to follow more pages. + max_requests = 50 + handled_requests = 0 + + # Reuse one headless browser-backed crawler for every request. + browser_config = BrowserConfig(headless=True) + + async with AsyncWebCrawler(config=browser_config) as crawler: + while handled_requests < max_requests and ( + request := await request_queue.fetch_next_request() + ): + handled_requests += 1 + url = request.url + depth = request.crawl_depth + Actor.log.info(f'Scraping {url} (depth={depth}) ...') + + try: + # Fresh proxy URL per request (None if no proxy). + proxy_url = None + if proxy_configuration: + proxy_url = await proxy_configuration.new_url() + + data, links = await scrape_page(crawler, url, proxy_url=proxy_url) + await Actor.push_data(data) + Actor.log.info( + f'Stored data from {url} ' + f'(title={data["title"]!r}, {len(links)} links found).' 
+ ) + await enqueue_links( + request_queue, links, depth=depth, max_depth=max_depth + ) + + except Exception: + Actor.log.exception(f'Cannot extract data from {url}.') + + finally: + await request_queue.mark_request_as_handled(request) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/crawl4ai_project/Dockerfile b/docs/03_guides/code/crawl4ai_project/Dockerfile deleted file mode 100644 index 348f6ff2..00000000 --- a/docs/03_guides/code/crawl4ai_project/Dockerfile +++ /dev/null @@ -1,19 +0,0 @@ -# Use the Apify Playwright base image, which already ships a browser together -# with all of its system-level dependencies. Crawl4AI drives this browser -# through Playwright and reuses the binaries the image provides, so no separate -# browser-install step is needed. -# -# The Python 3.13 image is used because some of Crawl4AI's dependencies do not -# yet publish wheels for newer Python versions. -FROM apify/actor-python-playwright:3.13-1.60.0 - -# Copy just requirements.txt first to leverage the Docker build cache. -COPY --chown=myuser:myuser requirements.txt ./ -RUN pip install -r requirements.txt - -# Copy the rest of the source code and verify that it compiles. -COPY --chown=myuser:myuser . ./ -RUN python -m compileall -q my_actor/ - -# Specify how to launch the Actor. 
-CMD ["python", "-m", "my_actor"] diff --git a/docs/03_guides/code/crawl4ai_project/my_actor/__init__.py b/docs/03_guides/code/crawl4ai_project/my_actor/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/03_guides/code/crawl4ai_project/my_actor/__main__.py b/docs/03_guides/code/crawl4ai_project/my_actor/__main__.py deleted file mode 100644 index 6aeaf3d5..00000000 --- a/docs/03_guides/code/crawl4ai_project/my_actor/__main__.py +++ /dev/null @@ -1,8 +0,0 @@ -from __future__ import annotations - -import asyncio - -from .main import main - -if __name__ == '__main__': - asyncio.run(main()) diff --git a/docs/03_guides/code/crawl4ai_project/my_actor/main.py b/docs/03_guides/code/crawl4ai_project/my_actor/main.py deleted file mode 100644 index 4e6befe6..00000000 --- a/docs/03_guides/code/crawl4ai_project/my_actor/main.py +++ /dev/null @@ -1,73 +0,0 @@ -from __future__ import annotations - -from crawl4ai import AsyncWebCrawler, BrowserConfig - -from apify import Actor, Request - -from .scraper import scrape_page - - -async def main() -> None: - # Enter the context of the Actor. - async with Actor: - # Retrieve the Actor input, and use default values if not provided. - actor_input = await Actor.get_input() or {} - start_urls = actor_input.get('start_urls', [{'url': 'https://crawlee.dev'}]) - max_depth = actor_input.get('max_depth', 1) - - # Exit if no start URLs are provided. - if not start_urls: - Actor.log.info('No start URLs specified in Actor input, exiting...') - await Actor.exit() - - # Create a proxy configuration that routes requests through Apify Proxy. - proxy_configuration = await Actor.create_proxy_configuration() - - # Open the default request queue for handling URLs to be processed. - request_queue = await Actor.open_request_queue() - - # Enqueue the start URLs. Their crawl depth defaults to 0. 
- for start_url in start_urls: - url = start_url.get('url') - Actor.log.info(f'Enqueuing {url} ...') - await request_queue.add_request(Request.from_url(url)) - - # Configure the headless browser that Crawl4AI drives. - browser_config = BrowserConfig(headless=True) - - # Open a single browser-backed crawler and reuse it for every request. - async with AsyncWebCrawler(config=browser_config) as crawler: - # Process the URLs from the request queue. - while request := await request_queue.fetch_next_request(): - url = request.url - - # Read the crawl depth tracked by the request itself. - depth = request.crawl_depth - Actor.log.info(f'Scraping {url} (depth={depth}) ...') - - try: - # Get a fresh proxy URL for each request (None if no proxy set up). - proxy_url = None - if proxy_configuration: - proxy_url = await proxy_configuration.new_url() - - # Crawl the page and extract its markdown and nested links. - data, links = await scrape_page(crawler, url, proxy_url=proxy_url) - - # Store the extracted data to the default dataset. - await Actor.push_data(data) - - # If we are not too deep yet, enqueue the links we found one - # level deeper than the current page. - if depth < max_depth: - for link_url in links: - new_request = Request.from_url(link_url) - new_request.crawl_depth = depth + 1 - await request_queue.add_request(new_request) - - except Exception: - Actor.log.exception(f'Cannot extract data from {url}.') - - finally: - # Mark the request as handled so it is not processed again. 
- await request_queue.mark_request_as_handled(request) diff --git a/docs/03_guides/code/crawl4ai_project/my_actor/scraper.py b/docs/03_guides/code/crawl4ai_project/my_actor/scraper.py deleted file mode 100644 index f96f76e3..00000000 --- a/docs/03_guides/code/crawl4ai_project/my_actor/scraper.py +++ /dev/null @@ -1,46 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Any - -from crawl4ai import CacheMode, CrawlerRunConfig, ProxyConfig - -if TYPE_CHECKING: - from crawl4ai import AsyncWebCrawler - - -async def scrape_page( - crawler: AsyncWebCrawler, - url: str, - *, - proxy_url: str | None = None, -) -> tuple[dict[str, Any], list[str]]: - """Crawl a single page with Crawl4AI and extract its markdown and links. - - The page is rendered in the browser managed by `crawler`, and Crawl4AI turns - the result into clean, LLM-ready markdown. Setting `proxy_config` on the - per-request `CrawlerRunConfig` routes this request through Apify Proxy, so - every page can use a fresh IP address. - """ - run_config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - proxy_config=ProxyConfig.from_string(proxy_url) if proxy_url else None, - ) - - result = await crawler.arun(url, config=run_config) - if not result.success: - raise RuntimeError(result.error_message or f'Failed to crawl {url}') - - # `result.markdown` is the rendered page as clean markdown, and - # `result.metadata` carries page-level fields such as the title. - data = { - 'url': result.url, - 'title': (result.metadata or {}).get('title'), - 'markdown': str(result.markdown), - } - - # Crawl4AI already splits links into `internal` (same site) and `external`. - # We follow only the internal ones to keep the crawl on the same website. 
- internal_links = result.links.get('internal', []) - links = [link['href'] for link in internal_links if link.get('href')] - - return data, links From d6d4dcd1811db37e9a532aaa58625a0db741986e Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 21:03:05 +0200 Subject: [PATCH 3/3] chore: drop unused ruff ignore for the removed Crawl4AI project --- pyproject.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 38846e70..d17bdc01 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -181,10 +181,6 @@ indent-style = "space" # Local imports in Scrapy project. "TID252", # Prefer absolute imports over relative imports from parent modules ] -"**/docs/**/crawl4ai_project/**" = [ - # Local imports are mixed up with the Apify SDK. - "I001", # Import block is un-sorted or un-formatted -] [tool.ruff.lint.flake8-quotes] docstring-quotes = "double"