From 54e153d74860c271eb31142644ad088a0b4f6569 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 10:40:25 +0200 Subject: [PATCH 1/5] docs: Add Scrapling guide --- docs/01_introduction/quick-start.mdx | 1 + docs/03_guides/09_scrapling.mdx | 123 +++++++++++++++++++++++++++ docs/03_guides/code/09_scrapling.py | 95 +++++++++++++++++++++ 3 files changed, 219 insertions(+) create mode 100644 docs/03_guides/09_scrapling.mdx create mode 100644 docs/03_guides/code/09_scrapling.py diff --git a/docs/01_introduction/quick-start.mdx b/docs/01_introduction/quick-start.mdx index da166da9..c0f8bec3 100644 --- a/docs/01_introduction/quick-start.mdx +++ b/docs/01_introduction/quick-start.mdx @@ -105,4 +105,5 @@ To see how you can integrate the Apify SDK with popular web scraping libraries, - [Selenium](../guides/selenium) - [Crawlee](../guides/crawlee) - [Scrapy](../guides/scrapy) +- [Scrapling](../guides/scrapling) - [Running webserver](../guides/running-webserver) diff --git a/docs/03_guides/09_scrapling.mdx b/docs/03_guides/09_scrapling.mdx new file mode 100644 index 00000000..459e5a25 --- /dev/null +++ b/docs/03_guides/09_scrapling.mdx @@ -0,0 +1,123 @@ +--- +id: scrapling +title: Use Scrapling +description: Build an Apify Actor that scrapes web pages using the Scrapling adaptive web scraping library. +--- + +import CodeBlock from '@theme/CodeBlock'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import ScraplingExample from '!!raw-loader!roa-loader!./code/09_scrapling.py'; + +In this guide, you'll learn how to use the [Scrapling](https://scrapling.readthedocs.io/) library in your Apify Actors. + +## Introduction + +[Scrapling](https://scrapling.readthedocs.io/) is an adaptive web scraping library for Python that combines fetching and parsing behind a single, high-level API. It can fetch a page with fast HTTP requests or with a real browser, parse the result with familiar CSS selectors and XPath, and even relocate your selectors automatically when a website's structure changes. + +Some of the features that make Scrapling a good fit for Apify Actors: + +- **Multiple fetchers** - A single API exposes a fast HTTP client with browser TLS-fingerprint impersonation, as well as full browser automation for JavaScript-heavy or protected pages. +- **Adaptive selectors** - Scrapling can remember the elements you scraped and find them again after a website redesign, so your scrapers keep working with fewer manual fixes. +- **Anti-bot evasion** - Built-in stealth features (browser impersonation, realistic headers, and automatic Cloudflare Turnstile solving with the browser fetchers) help you avoid being blocked. +- **Familiar parsing API** - Elements are selected with CSS selectors (including the `::text` and `::attr()` pseudo-elements) or XPath, with a Scrapy/Parsel-like `.get()` and `.getall()` interface. +- **First-class async support** - Every fetcher has an asynchronous variant, which integrates naturally with the asyncio-based Apify SDK. + +Scrapling's parser works on its own, while the fetchers are an optional extra. Install Scrapling with the `fetchers` extra to get the HTTP and browser fetchers: + +```bash +pip install "scrapling[fetchers]" +``` + +## Choosing a fetcher + +All of Scrapling's fetchers are importable from `scrapling.fetchers`. Pick the one that matches the website you're scraping: + +- **`Fetcher` / `AsyncFetcher`** - Plain HTTP requests via `.get()`, `.post()`, `.put()`, and `.delete()`. Fast and lightweight, with optional browser TLS-fingerprint impersonation (`impersonate`) and realistic headers (`stealthy_headers`). This is the best choice for static pages and APIs, and it needs no browser binaries. +- **`DynamicFetcher` / `DynamicSession`** - Full browser automation based on [Playwright](https://playwright.dev/), for pages that require JavaScript rendering or interaction. Fetch a page with `.fetch()` or its async variant `.async_fetch()`. +- **`StealthyFetcher` / `StealthySession`** - A stealth-hardened browser fetcher that can automatically solve Cloudflare Turnstile challenges (`solve_cloudflare=True`). Use it for the most heavily protected websites. + +The returned `Response` object is also a Scrapling selector, so you can call `.css()`, `.xpath()`, `.find_all()`, and the other parsing methods on it directly. + +The HTTP fetchers work with just the `scrapling[fetchers]` extra. The browser-based fetchers (`DynamicFetcher` and `StealthyFetcher`) additionally need browser binaries, which you download with the `scrapling install` command - see [Running browser-based fetchers](#running-browser-based-fetchers) below. + +The example Actor in this guide uses the HTTP `AsyncFetcher`, which is the simplest to deploy and pairs well with Apify Proxy. + +## Example Actor + +The following Actor recursively scrapes titles from all linked pages, up to a user-defined maximum depth, starting from the URLs in the Actor input. It uses Scrapling's `AsyncFetcher` to fetch each page through [Apify Proxy](https://docs.apify.com/platform/proxy), and CSS selectors to extract the title, headings, and links. + + + {ScraplingExample} + + +A few things worth pointing out: + +- The response of `AsyncFetcher.get` is a Scrapling selector, so `response.css('title::text').get()` reads the page title and `response.css('a::attr(href)').getall()` returns every link's `href` in one call. +- `response.urljoin(link_href)` resolves relative links against the page URL, so you can enqueue them directly. +- The `impersonate='chrome'` and `stealthy_headers=True` options make the request look like it comes from a real Chrome browser, which - combined with Apify Proxy - reduces the chance of being blocked. + +## Using Apify Proxy + +Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. The example above creates a proxy configuration and passes a fresh proxy URL to every request: + +```python +proxy_configuration = await Actor.create_proxy_configuration() +... +proxy_url = None +if proxy_configuration: + proxy_url = await proxy_configuration.new_url() + +response = await AsyncFetcher.get(url, proxy=proxy_url) +``` + +Scrapling accepts the proxy as a URL string (for example `http://user:pass@proxy.apify.com:8000`), which is exactly what `ProxyConfiguration.new_url` returns. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. The browser-based fetchers accept the same `proxy` argument. + +## Running browser-based fetchers + +`DynamicFetcher` and `StealthyFetcher` drive a real browser, so they need the browser binaries installed with the `scrapling install` command. Locally, run it once after installing the `scrapling[fetchers]` extra: + +```bash +scrapling install +``` + +On the Apify platform, the Actor runs in a Docker container, so the browsers have to be installed during the image build. Build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser together with all of its system-level dependencies, and then download the browser binaries that Scrapling expects: + + +{`FROM apify/actor-python-playwright:3.14-1.60.0 + +COPY --chown=myuser:myuser requirements.txt ./ +RUN pip install -r requirements.txt + +# Download the browser binaries Scrapling needs. The base image already provides +# their system-level dependencies, so run this step as root. +USER root +RUN scrapling install +USER myuser + +COPY --chown=myuser:myuser . ./ +RUN python -m compileall -q my_actor/ + +CMD ["python", "-m", "my_actor"]`} + + +Fetching a page then only differs in which fetcher you call - the parsing API is identical: + +```python +from scrapling.fetchers import DynamicFetcher + +response = await DynamicFetcher.async_fetch(url, headless=True, network_idle=True) +quotes = response.css('.quote .text::text').getall() +``` + +## Conclusion + +In this guide, you learned how to use Scrapling in your Apify Actors. You can now fetch pages with Scrapling's HTTP or browser-based fetchers, extract data with its CSS and XPath selectors, route requests through Apify Proxy, and run the whole thing on the Apify platform. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! + +## Additional resources + +- [Scrapling: Official documentation](https://scrapling.readthedocs.io/) +- [Scrapling: Fetchers](https://scrapling.readthedocs.io/en/latest/fetching/choosing/) +- [Scrapling: Parsing and selecting elements](https://scrapling.readthedocs.io/en/latest/parsing/selection/) +- [Scrapling: GitHub repository](https://github.com/D4Vinci/Scrapling) +- [Apify: Proxy management](https://docs.apify.com/platform/proxy) diff --git a/docs/03_guides/code/09_scrapling.py b/docs/03_guides/code/09_scrapling.py new file mode 100644 index 00000000..fed1b5ae --- /dev/null +++ b/docs/03_guides/code/09_scrapling.py @@ -0,0 +1,95 @@ +from __future__ import annotations + +from scrapling.fetchers import AsyncFetcher + +from apify import Actor, Request + + +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Retrieve the Actor input, and use default values if not provided. + actor_input = await Actor.get_input() or {} + start_urls = actor_input.get('start_urls', [{'url': 'https://crawlee.dev'}]) + max_depth = actor_input.get('max_depth', 1) + + # Exit if no start URLs are provided. + if not start_urls: + Actor.log.info('No start URLs specified in Actor input, exiting...') + await Actor.exit() + + # Create a proxy configuration that routes requests through Apify Proxy. + proxy_configuration = await Actor.create_proxy_configuration() + + # Open the default request queue for handling URLs to be processed. + request_queue = await Actor.open_request_queue() + + # Enqueue the start URLs with an initial crawl depth of 0. + for start_url in start_urls: + url = start_url.get('url') + Actor.log.info(f'Enqueuing {url} ...') + new_request = Request.from_url(url, user_data={'depth': 0}) + await request_queue.add_request(new_request) + + # Process the URLs from the request queue. + while request := await request_queue.fetch_next_request(): + url = request.url + + if not isinstance(request.user_data['depth'], (str, int)): + raise TypeError('Request.depth is an unexpected type.') + + depth = int(request.user_data['depth']) + Actor.log.info(f'Scraping {url} (depth={depth}) ...') + + try: + # Get a fresh proxy URL for each request (None if no proxy is set up). + proxy_url = None + if proxy_configuration: + proxy_url = await proxy_configuration.new_url() + + # Fetch the page with Scrapling's asynchronous HTTP fetcher. The + # `impersonate` and `stealthy_headers` options make the request look + # like it comes from a real Chrome browser, reducing the chance of + # being blocked. The returned response is also a Scrapling selector. + response = await AsyncFetcher.get( + url, + proxy=proxy_url, + impersonate='chrome', + stealthy_headers=True, + timeout=60, + ) + + # If the current depth is less than max_depth, find nested links + # and enqueue them. The `::attr(href)` pseudo-selector reads the + # attribute, and `response.urljoin` resolves it against the page URL. + if depth < max_depth: + for link_href in response.css('a::attr(href)').getall(): + link_url = response.urljoin(link_href) + + if link_url.startswith(('http://', 'https://')): + Actor.log.info(f'Enqueuing {link_url} ...') + new_request = Request.from_url( + link_url, + user_data={'depth': depth + 1}, + ) + await request_queue.add_request(new_request) + + # Extract the desired data using Scrapling's CSS selectors. The + # `::text` pseudo-element returns the text content of the elements. + data = { + 'url': url, + 'title': response.css('title::text').get(), + 'h1s': response.css('h1::text').getall(), + 'h2s': response.css('h2::text').getall(), + 'h3s': response.css('h3::text').getall(), + } + + # Store the extracted data to the default dataset. + await Actor.push_data(data) + + except Exception: + Actor.log.exception(f'Cannot extract data from {url}.') + + finally: + # Mark the request as handled to ensure it is not processed again. + await request_queue.mark_request_as_handled(request) From 29c4c8a8a35a83410f998ad8f6d6efea0f9decbf Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 11:24:10 +0200 Subject: [PATCH 2/5] docs: Split Scrapling guide example into modules and use code tabs --- docs/03_guides/09_scrapling.mdx | 93 +++++++++--------- docs/03_guides/code/09_scrapling.py | 95 ------------------- .../code/scrapling_browser_project/Dockerfile | 21 ++++ .../my_actor/scraper.py | 45 +++++++++ .../scrapling_project/my_actor/__init__.py | 0 .../scrapling_project/my_actor/__main__.py | 8 ++ .../code/scrapling_project/my_actor/main.py | 71 ++++++++++++++ .../scrapling_project/my_actor/scraper.py | 47 +++++++++ pyproject.toml | 4 + 9 files changed, 245 insertions(+), 139 deletions(-) delete mode 100644 docs/03_guides/code/09_scrapling.py create mode 100644 docs/03_guides/code/scrapling_browser_project/Dockerfile create mode 100644 docs/03_guides/code/scrapling_browser_project/my_actor/scraper.py create mode 100644 docs/03_guides/code/scrapling_project/my_actor/__init__.py create mode 100644 docs/03_guides/code/scrapling_project/my_actor/__main__.py create mode 100644 docs/03_guides/code/scrapling_project/my_actor/main.py create mode 100644 docs/03_guides/code/scrapling_project/my_actor/scraper.py diff --git a/docs/03_guides/09_scrapling.mdx b/docs/03_guides/09_scrapling.mdx index 459e5a25..3e76ebca 100644 --- a/docs/03_guides/09_scrapling.mdx +++ b/docs/03_guides/09_scrapling.mdx @@ -5,9 +5,14 @@ description: Build an Apify Actor that scrapes web pages using the Scrapling ada --- import CodeBlock from '@theme/CodeBlock'; -import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; -import ScraplingExample from '!!raw-loader!roa-loader!./code/09_scrapling.py'; +import ScraplingMain from '!!raw-loader!./code/scrapling_project/my_actor/main.py'; +import ScraplingScraper from '!!raw-loader!./code/scrapling_project/my_actor/scraper.py'; +import ScraplingEntrypoint from '!!raw-loader!./code/scrapling_project/my_actor/__main__.py'; +import ScraplingBrowserScraper from '!!raw-loader!./code/scrapling_browser_project/my_actor/scraper.py'; +import ScraplingBrowserDockerfile from '!!raw-loader!./code/scrapling_browser_project/Dockerfile'; In this guide, you'll learn how to use the [Scrapling](https://scrapling.readthedocs.io/) library in your Apify Actors. @@ -47,29 +52,40 @@ The example Actor in this guide uses the HTTP `AsyncFetcher`, which is the simpl The following Actor recursively scrapes titles from all linked pages, up to a user-defined maximum depth, starting from the URLs in the Actor input. It uses Scrapling's `AsyncFetcher` to fetch each page through [Apify Proxy](https://docs.apify.com/platform/proxy), and CSS selectors to extract the title, headings, and links. - - {ScraplingExample} - +The code is split into three small modules, following the structure of the Apify Python Actor templates: + +- `my_actor/main.py` - The Actor's main coroutine. It handles the [Actor](https://docs.apify.com/platform/actors) lifecycle, reads the input, sets up [Apify Proxy](https://docs.apify.com/platform/proxy) and the [request queue](https://docs.apify.com/platform/storage/request-queue), and drives the crawl. +- `my_actor/scraper.py` - The Scrapling-specific logic. A single `scrape_page` function fetches a page and returns the extracted data together with the links found on it. +- `my_actor/__main__.py` - The entry point that runs the `main` coroutine with `asyncio`. + + + + + {ScraplingMain} + + + + + {ScraplingScraper} + + + + + {ScraplingEntrypoint} + + + A few things worth pointing out: +- Keeping the fetching and parsing in `scrape_page` separates the Scrapling-specific code from the Actor's orchestration logic. The function returns the extracted data together with the discovered links, so `my_actor/main.py` decides what to store and what to enqueue. - The response of `AsyncFetcher.get` is a Scrapling selector, so `response.css('title::text').get()` reads the page title and `response.css('a::attr(href)').getall()` returns every link's `href` in one call. - `response.urljoin(link_href)` resolves relative links against the page URL, so you can enqueue them directly. - The `impersonate='chrome'` and `stealthy_headers=True` options make the request look like it comes from a real Chrome browser, which - combined with Apify Proxy - reduces the chance of being blocked. ## Using Apify Proxy -Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. The example above creates a proxy configuration and passes a fresh proxy URL to every request: - -```python -proxy_configuration = await Actor.create_proxy_configuration() -... -proxy_url = None -if proxy_configuration: - proxy_url = await proxy_configuration.new_url() - -response = await AsyncFetcher.get(url, proxy=proxy_url) -``` +Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. In the example above, `my_actor/main.py` creates a proxy configuration with `Actor.create_proxy_configuration` and passes a fresh proxy URL to `scrape_page` for every request, which forwards it to Scrapling's `proxy` argument. Scrapling accepts the proxy as a URL string (for example `http://user:pass@proxy.apify.com:8000`), which is exactly what `ProxyConfiguration.new_url` returns. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. The browser-based fetchers accept the same `proxy` argument. @@ -81,34 +97,23 @@ Scrapling accepts the proxy as a URL string (for example `http://user:pass@proxy scrapling install ``` -On the Apify platform, the Actor runs in a Docker container, so the browsers have to be installed during the image build. Build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser together with all of its system-level dependencies, and then download the browser binaries that Scrapling expects: - - -{`FROM apify/actor-python-playwright:3.14-1.60.0 - -COPY --chown=myuser:myuser requirements.txt ./ -RUN pip install -r requirements.txt - -# Download the browser binaries Scrapling needs. The base image already provides -# their system-level dependencies, so run this step as root. -USER root -RUN scrapling install -USER myuser - -COPY --chown=myuser:myuser . ./ -RUN python -m compileall -q my_actor/ - -CMD ["python", "-m", "my_actor"]`} - - -Fetching a page then only differs in which fetcher you call - the parsing API is identical: - -```python -from scrapling.fetchers import DynamicFetcher - -response = await DynamicFetcher.async_fetch(url, headless=True, network_idle=True) -quotes = response.css('.quote .text::text').getall() -``` +Switching the example Actor from HTTP to a real browser only takes two changes - the rest of the project, including `my_actor/main.py`, stays exactly the same: + +1. Swap the fetcher call in `my_actor/scraper.py` for `DynamicFetcher.async_fetch`. The parsing API is identical, so the data extraction is unchanged. +2. Build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser together with all of its system-level dependencies, and run `scrapling install` during the build to download the browser binaries that Scrapling expects. + + + + + {ScraplingBrowserScraper} + + + + + {ScraplingBrowserDockerfile} + + + ## Conclusion diff --git a/docs/03_guides/code/09_scrapling.py b/docs/03_guides/code/09_scrapling.py deleted file mode 100644 index fed1b5ae..00000000 --- a/docs/03_guides/code/09_scrapling.py +++ /dev/null @@ -1,95 +0,0 @@ -from __future__ import annotations - -from scrapling.fetchers import AsyncFetcher - -from apify import Actor, Request - - -async def main() -> None: - # Enter the context of the Actor. - async with Actor: - # Retrieve the Actor input, and use default values if not provided. - actor_input = await Actor.get_input() or {} - start_urls = actor_input.get('start_urls', [{'url': 'https://crawlee.dev'}]) - max_depth = actor_input.get('max_depth', 1) - - # Exit if no start URLs are provided. - if not start_urls: - Actor.log.info('No start URLs specified in Actor input, exiting...') - await Actor.exit() - - # Create a proxy configuration that routes requests through Apify Proxy. - proxy_configuration = await Actor.create_proxy_configuration() - - # Open the default request queue for handling URLs to be processed. - request_queue = await Actor.open_request_queue() - - # Enqueue the start URLs with an initial crawl depth of 0. - for start_url in start_urls: - url = start_url.get('url') - Actor.log.info(f'Enqueuing {url} ...') - new_request = Request.from_url(url, user_data={'depth': 0}) - await request_queue.add_request(new_request) - - # Process the URLs from the request queue. - while request := await request_queue.fetch_next_request(): - url = request.url - - if not isinstance(request.user_data['depth'], (str, int)): - raise TypeError('Request.depth is an unexpected type.') - - depth = int(request.user_data['depth']) - Actor.log.info(f'Scraping {url} (depth={depth}) ...') - - try: - # Get a fresh proxy URL for each request (None if no proxy is set up). - proxy_url = None - if proxy_configuration: - proxy_url = await proxy_configuration.new_url() - - # Fetch the page with Scrapling's asynchronous HTTP fetcher. The - # `impersonate` and `stealthy_headers` options make the request look - # like it comes from a real Chrome browser, reducing the chance of - # being blocked. The returned response is also a Scrapling selector. - response = await AsyncFetcher.get( - url, - proxy=proxy_url, - impersonate='chrome', - stealthy_headers=True, - timeout=60, - ) - - # If the current depth is less than max_depth, find nested links - # and enqueue them. The `::attr(href)` pseudo-selector reads the - # attribute, and `response.urljoin` resolves it against the page URL. - if depth < max_depth: - for link_href in response.css('a::attr(href)').getall(): - link_url = response.urljoin(link_href) - - if link_url.startswith(('http://', 'https://')): - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url( - link_url, - user_data={'depth': depth + 1}, - ) - await request_queue.add_request(new_request) - - # Extract the desired data using Scrapling's CSS selectors. The - # `::text` pseudo-element returns the text content of the elements. - data = { - 'url': url, - 'title': response.css('title::text').get(), - 'h1s': response.css('h1::text').getall(), - 'h2s': response.css('h2::text').getall(), - 'h3s': response.css('h3::text').getall(), - } - - # Store the extracted data to the default dataset. - await Actor.push_data(data) - - except Exception: - Actor.log.exception(f'Cannot extract data from {url}.') - - finally: - # Mark the request as handled to ensure it is not processed again. - await request_queue.mark_request_as_handled(request) diff --git a/docs/03_guides/code/scrapling_browser_project/Dockerfile b/docs/03_guides/code/scrapling_browser_project/Dockerfile new file mode 100644 index 00000000..38b30c60 --- /dev/null +++ b/docs/03_guides/code/scrapling_browser_project/Dockerfile @@ -0,0 +1,21 @@ +# Use the Apify Playwright base image, which already ships a browser together +# with all of its system-level dependencies. +FROM apify/actor-python-playwright:3.14-1.60.0 + +# Copy just requirements.txt first to leverage the Docker build cache. +COPY --chown=myuser:myuser requirements.txt ./ +RUN pip install -r requirements.txt + +# Download the browser binaries that Scrapling expects. The base image already +# provides their system-level dependencies, so run this step as root and then +# switch back to the unprivileged user. +USER root +RUN scrapling install +USER myuser + +# Copy the rest of the source code and verify that it compiles. +COPY --chown=myuser:myuser . ./ +RUN python -m compileall -q my_actor/ + +# Specify how to launch the Actor. +CMD ["python", "-m", "my_actor"] diff --git a/docs/03_guides/code/scrapling_browser_project/my_actor/scraper.py b/docs/03_guides/code/scrapling_browser_project/my_actor/scraper.py new file mode 100644 index 00000000..fb7d4579 --- /dev/null +++ b/docs/03_guides/code/scrapling_browser_project/my_actor/scraper.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from typing import Any + +from scrapling.fetchers import DynamicFetcher + + +async def scrape_page( + url: str, + *, + proxy_url: str | None = None, +) -> tuple[dict[str, Any], list[str]]: + """Fetch a single page in a real browser and extract its data and links. + + `DynamicFetcher` drives a real browser via Playwright, so it can render + JavaScript-heavy pages. `network_idle` waits until the page stops making + network requests before the HTML is captured. Apart from the fetcher call, + everything else - including the parsing - is identical to the HTTP version. + """ + response = await DynamicFetcher.async_fetch( + url, + proxy=proxy_url, + headless=True, + network_idle=True, + ) + + # Extract the desired data using CSS selectors. The `::text` pseudo-element + # returns the text content of the matched elements. + data = { + 'url': url, + 'title': response.css('title::text').get(), + 'h1s': response.css('h1::text').getall(), + 'h2s': response.css('h2::text').getall(), + 'h3s': response.css('h3::text').getall(), + } + + # Collect absolute links from the page. The `::attr(href)` pseudo-selector + # reads the attribute and `response.urljoin` resolves it against the page URL. + links: list[str] = [] + for href in response.css('a::attr(href)').getall(): + link_url = response.urljoin(href) + if link_url.startswith(('http://', 'https://')): + links.append(link_url) + + return data, links diff --git a/docs/03_guides/code/scrapling_project/my_actor/__init__.py b/docs/03_guides/code/scrapling_project/my_actor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/03_guides/code/scrapling_project/my_actor/__main__.py b/docs/03_guides/code/scrapling_project/my_actor/__main__.py new file mode 100644 index 00000000..6aeaf3d5 --- /dev/null +++ b/docs/03_guides/code/scrapling_project/my_actor/__main__.py @@ -0,0 +1,8 @@ +from __future__ import annotations + +import asyncio + +from .main import main + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/scrapling_project/my_actor/main.py b/docs/03_guides/code/scrapling_project/my_actor/main.py new file mode 100644 index 00000000..d2cd36e7 --- /dev/null +++ b/docs/03_guides/code/scrapling_project/my_actor/main.py @@ -0,0 +1,71 @@ +from __future__ import annotations + +from apify import Actor, Request + +from .scraper import scrape_page + + +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Retrieve the Actor input, and use default values if not provided. + actor_input = await Actor.get_input() or {} + start_urls = actor_input.get('start_urls', [{'url': 'https://crawlee.dev'}]) + max_depth = actor_input.get('max_depth', 1) + + # Exit if no start URLs are provided. + if not start_urls: + Actor.log.info('No start URLs specified in Actor input, exiting...') + await Actor.exit() + + # Create a proxy configuration that routes requests through Apify Proxy. + proxy_configuration = await Actor.create_proxy_configuration() + + # Open the default request queue for handling URLs to be processed. + request_queue = await Actor.open_request_queue() + + # Enqueue the start URLs with an initial crawl depth of 0. + for start_url in start_urls: + url = start_url.get('url') + Actor.log.info(f'Enqueuing {url} ...') + request = Request.from_url(url, user_data={'depth': 0}) + await request_queue.add_request(request) + + # Process the URLs from the request queue. + while request := await request_queue.fetch_next_request(): + url = request.url + + if not isinstance(request.user_data['depth'], (str, int)): + raise TypeError('Request.depth is an unexpected type.') + + depth = int(request.user_data['depth']) + Actor.log.info(f'Scraping {url} (depth={depth}) ...') + + try: + # Get a fresh proxy URL for each request (None if no proxy set up). + proxy_url = None + if proxy_configuration: + proxy_url = await proxy_configuration.new_url() + + # Fetch the page and extract its data and nested links. + data, links = await scrape_page(url, proxy_url=proxy_url) + + # Store the extracted data to the default dataset. + await Actor.push_data(data) + + # If we are not too deep yet, enqueue the links we found. + if depth < max_depth: + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + new_request = Request.from_url( + link_url, + user_data={'depth': depth + 1}, + ) + await request_queue.add_request(new_request) + + except Exception: + Actor.log.exception(f'Cannot extract data from {url}.') + + finally: + # Mark the request as handled so it is not processed again. + await request_queue.mark_request_as_handled(request) diff --git a/docs/03_guides/code/scrapling_project/my_actor/scraper.py b/docs/03_guides/code/scrapling_project/my_actor/scraper.py new file mode 100644 index 00000000..b840db82 --- /dev/null +++ b/docs/03_guides/code/scrapling_project/my_actor/scraper.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from typing import Any + +from scrapling.fetchers import AsyncFetcher + + +async def scrape_page( + url: str, + *, + proxy_url: str | None = None, +) -> tuple[dict[str, Any], list[str]]: + """Fetch a single page with Scrapling and extract its data and links. + + The page is fetched with Scrapling's asynchronous HTTP fetcher. The + `impersonate` and `stealthy_headers` options make the request look like it + comes from a real Chrome browser, which reduces the chance of being blocked. + The returned response is also a Scrapling selector, so it can be queried with + CSS selectors directly. + """ + response = await AsyncFetcher.get( + url, + proxy=proxy_url, + impersonate='chrome', + stealthy_headers=True, + timeout=60, + ) + + # Extract the desired data using CSS selectors. The `::text` pseudo-element + # returns the text content of the matched elements. + data = { + 'url': url, + 'title': response.css('title::text').get(), + 'h1s': response.css('h1::text').getall(), + 'h2s': response.css('h2::text').getall(), + 'h3s': response.css('h3::text').getall(), + } + + # Collect absolute links from the page. The `::attr(href)` pseudo-selector + # reads the attribute and `response.urljoin` resolves it against the page URL. + links: list[str] = [] + for href in response.css('a::attr(href)').getall(): + link_url = response.urljoin(href) + if link_url.startswith(('http://', 'https://')): + links.append(link_url) + + return data, links diff --git a/pyproject.toml b/pyproject.toml index d17bdc01..d8697219 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -181,6 +181,10 @@ indent-style = "space" # Local imports in Scrapy project. "TID252", # Prefer absolute imports over relative imports from parent modules ] +"**/docs/**/scrapling_project/**" = [ + # Local imports are mixed up with the Apify SDK. + "I001", # Import block is un-sorted or un-formatted +] [tool.ruff.lint.flake8-quotes] docstring-quotes = "double" From 2a41a3f3e19b1e664adcbe35a39bfdacc58e816d Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 12:00:53 +0200 Subject: [PATCH 3/5] docs: use Request.crawl_depth for depth tracking in Scrapling example --- .../code/scrapling_project/my_actor/main.py | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/docs/03_guides/code/scrapling_project/my_actor/main.py b/docs/03_guides/code/scrapling_project/my_actor/main.py index d2cd36e7..52e9ef4c 100644 --- a/docs/03_guides/code/scrapling_project/my_actor/main.py +++ b/docs/03_guides/code/scrapling_project/my_actor/main.py @@ -24,21 +24,18 @@ async def main() -> None: # Open the default request queue for handling URLs to be processed. request_queue = await Actor.open_request_queue() - # Enqueue the start URLs with an initial crawl depth of 0. + # Enqueue the start URLs. Their crawl depth defaults to 0. for start_url in start_urls: url = start_url.get('url') Actor.log.info(f'Enqueuing {url} ...') - request = Request.from_url(url, user_data={'depth': 0}) - await request_queue.add_request(request) + await request_queue.add_request(Request.from_url(url)) # Process the URLs from the request queue. while request := await request_queue.fetch_next_request(): url = request.url - if not isinstance(request.user_data['depth'], (str, int)): - raise TypeError('Request.depth is an unexpected type.') - - depth = int(request.user_data['depth']) + # Read the crawl depth tracked by the request itself. + depth = request.crawl_depth Actor.log.info(f'Scraping {url} (depth={depth}) ...') try: @@ -53,14 +50,13 @@ async def main() -> None: # Store the extracted data to the default dataset. await Actor.push_data(data) - # If we are not too deep yet, enqueue the links we found. + # If we are not too deep yet, enqueue the links we found one + # level deeper than the current page. if depth < max_depth: for link_url in links: Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url( - link_url, - user_data={'depth': depth + 1}, - ) + new_request = Request.from_url(link_url) + new_request.crawl_depth = depth + 1 await request_queue.add_request(new_request) except Exception: From 910df14999f02c3f22e9fac77322148a4f0630e2 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 20:45:03 +0200 Subject: [PATCH 4/5] docs: renumber Scrapling guide to 07 and switch to a single-file example --- .../{09_scrapling.mdx => 07_scrapling.mdx} | 74 +++-------- docs/03_guides/code/07_scrapling.py | 122 ++++++++++++++++++ .../scraper.py => 07_scrapling_browser.py} | 16 +-- .../code/scrapling_browser_project/Dockerfile | 21 --- .../scrapling_project/my_actor/__init__.py | 0 .../scrapling_project/my_actor/__main__.py | 8 -- .../code/scrapling_project/my_actor/main.py | 67 ---------- .../scrapling_project/my_actor/scraper.py | 47 ------- 8 files changed, 146 insertions(+), 209 deletions(-) rename docs/03_guides/{09_scrapling.mdx => 07_scrapling.mdx} (63%) create mode 100644 docs/03_guides/code/07_scrapling.py rename docs/03_guides/code/{scrapling_browser_project/my_actor/scraper.py => 07_scrapling_browser.py} (52%) delete mode 100644 docs/03_guides/code/scrapling_browser_project/Dockerfile delete mode 100644 docs/03_guides/code/scrapling_project/my_actor/__init__.py delete mode 100644 docs/03_guides/code/scrapling_project/my_actor/__main__.py delete mode 100644 docs/03_guides/code/scrapling_project/my_actor/main.py delete mode 100644 docs/03_guides/code/scrapling_project/my_actor/scraper.py diff --git a/docs/03_guides/09_scrapling.mdx b/docs/03_guides/07_scrapling.mdx similarity index 63% rename from docs/03_guides/09_scrapling.mdx rename to docs/03_guides/07_scrapling.mdx index 3e76ebca..63e948e5 100644 --- a/docs/03_guides/09_scrapling.mdx +++ b/docs/03_guides/07_scrapling.mdx @@ -1,20 +1,16 @@ --- id: scrapling -title: Use Scrapling +title: Adaptive scraping with Scrapling description: Build an Apify Actor that scrapes web pages using the Scrapling adaptive web scraping library. --- import CodeBlock from '@theme/CodeBlock'; -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; -import ScraplingMain from '!!raw-loader!./code/scrapling_project/my_actor/main.py'; -import ScraplingScraper from '!!raw-loader!./code/scrapling_project/my_actor/scraper.py'; -import ScraplingEntrypoint from '!!raw-loader!./code/scrapling_project/my_actor/__main__.py'; -import ScraplingBrowserScraper from '!!raw-loader!./code/scrapling_browser_project/my_actor/scraper.py'; -import ScraplingBrowserDockerfile from '!!raw-loader!./code/scrapling_browser_project/Dockerfile'; +import ScraplingExample from '!!raw-loader!roa-loader!./code/07_scrapling.py'; +import ScraplingBrowserScraper from '!!raw-loader!./code/07_scrapling_browser.py'; -In this guide, you'll learn how to use the [Scrapling](https://scrapling.readthedocs.io/) library in your Apify Actors. +In this guide, you'll learn how to use the [Scrapling](https://scrapling.readthedocs.io/) library for adaptive web scraping in your Apify Actors. ## Introduction @@ -50,42 +46,24 @@ The example Actor in this guide uses the HTTP `AsyncFetcher`, which is the simpl ## Example Actor -The following Actor recursively scrapes titles from all linked pages, up to a user-defined maximum depth, starting from the URLs in the Actor input. It uses Scrapling's `AsyncFetcher` to fetch each page through [Apify Proxy](https://docs.apify.com/platform/proxy), and CSS selectors to extract the title, headings, and links. - -The code is split into three small modules, following the structure of the Apify Python Actor templates: - -- `my_actor/main.py` - The Actor's main coroutine. It handles the [Actor](https://docs.apify.com/platform/actors) lifecycle, reads the input, sets up [Apify Proxy](https://docs.apify.com/platform/proxy) and the [request queue](https://docs.apify.com/platform/storage/request-queue), and drives the crawl. -- `my_actor/scraper.py` - The Scrapling-specific logic. A single `scrape_page` function fetches a page and returns the extracted data together with the links found on it. -- `my_actor/__main__.py` - The entry point that runs the `main` coroutine with `asyncio`. - - - - - {ScraplingMain} - - - - - {ScraplingScraper} - - - - - {ScraplingEntrypoint} - - - +The following Actor recursively scrapes data from linked pages on the same site, up to a user-defined maximum depth, starting from the URLs in the Actor input. It uses Scrapling's `AsyncFetcher` to fetch each page through [Apify Proxy](https://docs.apify.com/platform/proxy), and CSS selectors to extract the title, headings, and links. + +The whole Actor fits in a single file. A `scrape_page` helper holds the Scrapling-specific fetching and parsing, while the `main` coroutine handles the [Actor](https://docs.apify.com/platform/actors) lifecycle, reads the input, sets up [Apify Proxy](https://docs.apify.com/platform/proxy) and the [request queue](https://docs.apify.com/platform/storage/request-queue), and drives the crawl: + + + {ScraplingExample} + A few things worth pointing out: -- Keeping the fetching and parsing in `scrape_page` separates the Scrapling-specific code from the Actor's orchestration logic. The function returns the extracted data together with the discovered links, so `my_actor/main.py` decides what to store and what to enqueue. +- Keeping the fetching and parsing in `scrape_page` separates the Scrapling-specific code from the Actor's orchestration logic. The function returns the extracted data together with the discovered links, so `main` decides what to store and what to enqueue. - The response of `AsyncFetcher.get` is a Scrapling selector, so `response.css('title::text').get()` reads the page title and `response.css('a::attr(href)').getall()` returns every link's `href` in one call. - `response.urljoin(link_href)` resolves relative links against the page URL, so you can enqueue them directly. - The `impersonate='chrome'` and `stealthy_headers=True` options make the request look like it comes from a real Chrome browser, which - combined with Apify Proxy - reduces the chance of being blocked. ## Using Apify Proxy -Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. In the example above, `my_actor/main.py` creates a proxy configuration with `Actor.create_proxy_configuration` and passes a fresh proxy URL to `scrape_page` for every request, which forwards it to Scrapling's `proxy` argument. +Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. In the example above, `main` creates a proxy configuration with `Actor.create_proxy_configuration` and passes a fresh proxy URL to `scrape_page` for every request, which forwards it to Scrapling's `proxy` argument. Scrapling accepts the proxy as a URL string (for example `http://user:pass@proxy.apify.com:8000`), which is exactly what `ProxyConfiguration.new_url` returns. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. The browser-based fetchers accept the same `proxy` argument. @@ -97,23 +75,13 @@ Scrapling accepts the proxy as a URL string (for example `http://user:pass@proxy scrapling install ``` -Switching the example Actor from HTTP to a real browser only takes two changes - the rest of the project, including `my_actor/main.py`, stays exactly the same: - -1. Swap the fetcher call in `my_actor/scraper.py` for `DynamicFetcher.async_fetch`. The parsing API is identical, so the data extraction is unchanged. -2. Build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser together with all of its system-level dependencies, and run `scrapling install` during the build to download the browser binaries that Scrapling expects. - - - - - {ScraplingBrowserScraper} - - - - - {ScraplingBrowserDockerfile} - - - +Switching the example Actor from HTTP to a real browser takes only one code change - swap the `AsyncFetcher.get` call in `scrape_page` for `DynamicFetcher.async_fetch`. The parsing API is identical, so the rest of the Actor stays exactly the same: + + + {ScraplingBrowserScraper} + + +To run this on the Apify platform, build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser together with all of its system-level dependencies, and run `scrapling install` during the Docker build to download the browser binaries that Scrapling expects. ## Conclusion diff --git a/docs/03_guides/code/07_scrapling.py b/docs/03_guides/code/07_scrapling.py new file mode 100644 index 00000000..49aab31b --- /dev/null +++ b/docs/03_guides/code/07_scrapling.py @@ -0,0 +1,122 @@ +import asyncio +from typing import Any +from urllib.parse import urlsplit + +from scrapling.fetchers import AsyncFetcher + +from apify import Actor, Request +from apify.storages import RequestQueue + + +async def scrape_page( + url: str, + *, + proxy_url: str | None = None, +) -> tuple[dict[str, Any], list[str]]: + """Fetch a page with Scrapling's HTTP fetcher and return data and links.""" + # `impersonate` and `stealthy_headers` make the request look like Chrome. + response = await AsyncFetcher.get( + url, + proxy=proxy_url, + impersonate='chrome', + stealthy_headers=True, + timeout=60, + ) + + data = { + 'url': url, + 'title': response.css('title::text').get(), + 'h1s': response.css('h1::text').getall(), + 'h2s': response.css('h2::text').getall(), + 'h3s': response.css('h3::text').getall(), + } + + # Keep only absolute links on the same host. + links: list[str] = [] + host = urlsplit(url).netloc + for href in response.css('a::attr(href)').getall(): + link_url = response.urljoin(href) + if not link_url.startswith(('http://', 'https://')): + continue + if urlsplit(link_url).netloc == host: + links.append(link_url) + + return data, links + + +async def enqueue_links( + request_queue: RequestQueue, + links: list[str], + *, + depth: int, + max_depth: int, +) -> None: + """Enqueue the links one level deeper, unless max_depth was reached.""" + if depth >= max_depth: + return + + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + request = Request.from_url(link_url) + request.crawl_depth = depth + 1 + await request_queue.add_request(request) + + +async def main() -> None: + async with Actor: + # Read the Actor input. + actor_input = await Actor.get_input() or {} + start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) + max_depth = actor_input.get('maxDepth', 1) + + if not start_urls: + Actor.log.info('No start URLs specified in Actor input, exiting...') + await Actor.exit() + + # Set up Apify Proxy and the request queue. + proxy_configuration = await Actor.create_proxy_configuration() + request_queue = await Actor.open_request_queue() + + # Enqueue the start URLs (crawl depth defaults to 0). + for start_url in start_urls: + url = start_url.get('url') + Actor.log.info(f'Enqueuing start URL: {url}') + await request_queue.add_request(Request.from_url(url)) + + # Cap the crawl; raise or remove to follow more pages. + max_requests = 50 + handled_requests = 0 + + while handled_requests < max_requests and ( + request := await request_queue.fetch_next_request() + ): + handled_requests += 1 + url = request.url + depth = request.crawl_depth + Actor.log.info(f'Scraping {url} (depth={depth}) ...') + + try: + # Fresh proxy URL per request (None if no proxy). + proxy_url = None + if proxy_configuration: + proxy_url = await proxy_configuration.new_url() + + data, links = await scrape_page(url, proxy_url=proxy_url) + await Actor.push_data(data) + Actor.log.info( + f'Stored data from {url} ' + f'(title={data["title"]!r}, {len(links)} links found).' + ) + await enqueue_links( + request_queue, links, depth=depth, max_depth=max_depth + ) + + except Exception: + Actor.log.exception(f'Cannot extract data from {url}.') + + finally: + await request_queue.mark_request_as_handled(request) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/scrapling_browser_project/my_actor/scraper.py b/docs/03_guides/code/07_scrapling_browser.py similarity index 52% rename from docs/03_guides/code/scrapling_browser_project/my_actor/scraper.py rename to docs/03_guides/code/07_scrapling_browser.py index fb7d4579..3eb50e24 100644 --- a/docs/03_guides/code/scrapling_browser_project/my_actor/scraper.py +++ b/docs/03_guides/code/07_scrapling_browser.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from typing import Any from scrapling.fetchers import DynamicFetcher @@ -10,13 +8,8 @@ async def scrape_page( *, proxy_url: str | None = None, ) -> tuple[dict[str, Any], list[str]]: - """Fetch a single page in a real browser and extract its data and links. - - `DynamicFetcher` drives a real browser via Playwright, so it can render - JavaScript-heavy pages. `network_idle` waits until the page stops making - network requests before the HTML is captured. Apart from the fetcher call, - everything else - including the parsing - is identical to the HTTP version. - """ + """Fetch a page in a real browser with Scrapling and return data and links.""" + # `network_idle` waits until the page stops making network requests. response = await DynamicFetcher.async_fetch( url, proxy=proxy_url, @@ -24,8 +17,6 @@ async def scrape_page( network_idle=True, ) - # Extract the desired data using CSS selectors. The `::text` pseudo-element - # returns the text content of the matched elements. data = { 'url': url, 'title': response.css('title::text').get(), @@ -34,8 +25,7 @@ async def scrape_page( 'h3s': response.css('h3::text').getall(), } - # Collect absolute links from the page. The `::attr(href)` pseudo-selector - # reads the attribute and `response.urljoin` resolves it against the page URL. + # Collect absolute links from the page. links: list[str] = [] for href in response.css('a::attr(href)').getall(): link_url = response.urljoin(href) diff --git a/docs/03_guides/code/scrapling_browser_project/Dockerfile b/docs/03_guides/code/scrapling_browser_project/Dockerfile deleted file mode 100644 index 38b30c60..00000000 --- a/docs/03_guides/code/scrapling_browser_project/Dockerfile +++ /dev/null @@ -1,21 +0,0 @@ -# Use the Apify Playwright base image, which already ships a browser together -# with all of its system-level dependencies. -FROM apify/actor-python-playwright:3.14-1.60.0 - -# Copy just requirements.txt first to leverage the Docker build cache. -COPY --chown=myuser:myuser requirements.txt ./ -RUN pip install -r requirements.txt - -# Download the browser binaries that Scrapling expects. The base image already -# provides their system-level dependencies, so run this step as root and then -# switch back to the unprivileged user. -USER root -RUN scrapling install -USER myuser - -# Copy the rest of the source code and verify that it compiles. -COPY --chown=myuser:myuser . ./ -RUN python -m compileall -q my_actor/ - -# Specify how to launch the Actor. -CMD ["python", "-m", "my_actor"] diff --git a/docs/03_guides/code/scrapling_project/my_actor/__init__.py b/docs/03_guides/code/scrapling_project/my_actor/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/03_guides/code/scrapling_project/my_actor/__main__.py b/docs/03_guides/code/scrapling_project/my_actor/__main__.py deleted file mode 100644 index 6aeaf3d5..00000000 --- a/docs/03_guides/code/scrapling_project/my_actor/__main__.py +++ /dev/null @@ -1,8 +0,0 @@ -from __future__ import annotations - -import asyncio - -from .main import main - -if __name__ == '__main__': - asyncio.run(main()) diff --git a/docs/03_guides/code/scrapling_project/my_actor/main.py b/docs/03_guides/code/scrapling_project/my_actor/main.py deleted file mode 100644 index 52e9ef4c..00000000 --- a/docs/03_guides/code/scrapling_project/my_actor/main.py +++ /dev/null @@ -1,67 +0,0 @@ -from __future__ import annotations - -from apify import Actor, Request - -from .scraper import scrape_page - - -async def main() -> None: - # Enter the context of the Actor. - async with Actor: - # Retrieve the Actor input, and use default values if not provided. - actor_input = await Actor.get_input() or {} - start_urls = actor_input.get('start_urls', [{'url': 'https://crawlee.dev'}]) - max_depth = actor_input.get('max_depth', 1) - - # Exit if no start URLs are provided. - if not start_urls: - Actor.log.info('No start URLs specified in Actor input, exiting...') - await Actor.exit() - - # Create a proxy configuration that routes requests through Apify Proxy. - proxy_configuration = await Actor.create_proxy_configuration() - - # Open the default request queue for handling URLs to be processed. - request_queue = await Actor.open_request_queue() - - # Enqueue the start URLs. Their crawl depth defaults to 0. - for start_url in start_urls: - url = start_url.get('url') - Actor.log.info(f'Enqueuing {url} ...') - await request_queue.add_request(Request.from_url(url)) - - # Process the URLs from the request queue. - while request := await request_queue.fetch_next_request(): - url = request.url - - # Read the crawl depth tracked by the request itself. - depth = request.crawl_depth - Actor.log.info(f'Scraping {url} (depth={depth}) ...') - - try: - # Get a fresh proxy URL for each request (None if no proxy set up). - proxy_url = None - if proxy_configuration: - proxy_url = await proxy_configuration.new_url() - - # Fetch the page and extract its data and nested links. - data, links = await scrape_page(url, proxy_url=proxy_url) - - # Store the extracted data to the default dataset. - await Actor.push_data(data) - - # If we are not too deep yet, enqueue the links we found one - # level deeper than the current page. - if depth < max_depth: - for link_url in links: - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url(link_url) - new_request.crawl_depth = depth + 1 - await request_queue.add_request(new_request) - - except Exception: - Actor.log.exception(f'Cannot extract data from {url}.') - - finally: - # Mark the request as handled so it is not processed again. - await request_queue.mark_request_as_handled(request) diff --git a/docs/03_guides/code/scrapling_project/my_actor/scraper.py b/docs/03_guides/code/scrapling_project/my_actor/scraper.py deleted file mode 100644 index b840db82..00000000 --- a/docs/03_guides/code/scrapling_project/my_actor/scraper.py +++ /dev/null @@ -1,47 +0,0 @@ -from __future__ import annotations - -from typing import Any - -from scrapling.fetchers import AsyncFetcher - - -async def scrape_page( - url: str, - *, - proxy_url: str | None = None, -) -> tuple[dict[str, Any], list[str]]: - """Fetch a single page with Scrapling and extract its data and links. - - The page is fetched with Scrapling's asynchronous HTTP fetcher. The - `impersonate` and `stealthy_headers` options make the request look like it - comes from a real Chrome browser, which reduces the chance of being blocked. - The returned response is also a Scrapling selector, so it can be queried with - CSS selectors directly. - """ - response = await AsyncFetcher.get( - url, - proxy=proxy_url, - impersonate='chrome', - stealthy_headers=True, - timeout=60, - ) - - # Extract the desired data using CSS selectors. The `::text` pseudo-element - # returns the text content of the matched elements. - data = { - 'url': url, - 'title': response.css('title::text').get(), - 'h1s': response.css('h1::text').getall(), - 'h2s': response.css('h2::text').getall(), - 'h3s': response.css('h3::text').getall(), - } - - # Collect absolute links from the page. The `::attr(href)` pseudo-selector - # reads the attribute and `response.urljoin` resolves it against the page URL. - links: list[str] = [] - for href in response.css('a::attr(href)').getall(): - link_url = response.urljoin(href) - if link_url.startswith(('http://', 'https://')): - links.append(link_url) - - return data, links From 404bdfb23d4e951b3d63f9660f8f6ef6e8107533 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 21:02:44 +0200 Subject: [PATCH 5/5] chore: drop unused ruff ignore for the removed Scrapling project --- pyproject.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d8697219..d17bdc01 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -181,10 +181,6 @@ indent-style = "space" # Local imports in Scrapy project. "TID252", # Prefer absolute imports over relative imports from parent modules ] -"**/docs/**/scrapling_project/**" = [ - # Local imports are mixed up with the Apify SDK. - "I001", # Import block is un-sorted or un-formatted -] [tool.ruff.lint.flake8-quotes] docstring-quotes = "double"