From 65f8e0d1bdce589c6ba3249290e4257eb1da7473 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 13:24:12 +0200 Subject: [PATCH 1/2] docs: Flatten scraper examples and fix guide inaccuracies --- docs/03_guides/06_scrapy.mdx | 6 +- docs/03_guides/07_running_webserver.mdx | 12 ++- docs/03_guides/code/01_beautifulsoup_httpx.py | 89 ++++++++++-------- docs/03_guides/code/02_parsel_impit.py | 93 ++++++++++--------- docs/03_guides/code/03_playwright.py | 86 ++++++++++------- docs/03_guides/code/04_selenium.py | 77 ++++++++------- 6 files changed, 209 insertions(+), 154 deletions(-) diff --git a/docs/03_guides/06_scrapy.mdx b/docs/03_guides/06_scrapy.mdx index 12525609..81409ab2 100644 --- a/docs/03_guides/06_scrapy.mdx +++ b/docs/03_guides/06_scrapy.mdx @@ -23,9 +23,9 @@ In this guide, you'll learn how to use the [Scrapy](https://scrapy.org/) framewo ## Integrating Scrapy with the Apify platform -The Apify SDK provides an Apify-Scrapy integration. The main challenge of this is to combine two asynchronous frameworks that use different event loop implementations. Scrapy uses [Twisted](https://twisted.org/) for asynchronous execution, while the Apify SDK is based on [asyncio](https://docs.python.org/3/library/asyncio.html). The key thing is to install the Twisted's `asyncioreactor` to run Twisted's asyncio compatible event loop. The `apify.scrapy.run_scrapy_actor` function handles this reactor installation automatically. This allows both Twisted and asyncio to run on a single event loop, enabling a Scrapy spider to run as an Apify Actor with minimal modifications. +The Apify SDK provides an Apify-Scrapy integration. The main challenge of this is to combine two asynchronous frameworks that use different event loop implementations. Scrapy uses [Twisted](https://twisted.org/) for asynchronous execution, while the Apify SDK is based on [asyncio](https://docs.python.org/3/library/asyncio.html). 
The key thing is to install Twisted's `asyncioreactor` to run Twisted's asyncio-compatible event loop. The `apify.scrapy.run_scrapy_actor` function handles this reactor installation automatically. This allows both Twisted and asyncio to run on a single event loop, enabling a Scrapy spider to run as an Apify Actor with minimal modifications. - + {UnderscoreMainExample} @@ -74,7 +74,7 @@ For further details, see the [Scrapy migration guide](https://docs.apify.com/cli The following example shows a Scrapy Actor that scrapes page titles and enqueues links found on each page. This example aligns with the structure provided in the Apify Actor templates. - + {UnderscoreMainExample} diff --git a/docs/03_guides/07_running_webserver.mdx b/docs/03_guides/07_running_webserver.mdx index c17c313b..9b63976a 100644 --- a/docs/03_guides/07_running_webserver.mdx +++ b/docs/03_guides/07_running_webserver.mdx @@ -18,9 +18,9 @@ The URL is available in the following places: - In Apify Console, on the Actor run details page as the **Container URL** field. - In the API as the `container_url` property of the [Run object](https://docs.apify.com/api/v2#/reference/actors/run-object/get-run). -- In the Actor as the `Actor.configuration.container_url` property. +- In the Actor as the `Actor.configuration.web_server_url` property. -The web server running inside the container must listen at the port defined by the `Actor.configuration.container_port` property. When running Actors locally, the port defaults to `4321`, so the web server will be accessible at `http://localhost:4321`. +The web server running inside the container must listen at the port defined by the `Actor.configuration.web_server_port` property. When running Actors locally, the port defaults to `4321`, so the web server will be accessible at `http://localhost:4321`. 
## Example Actor @@ -30,6 +30,14 @@ The following example shows how to start a simple web server in your Actor, whic {WebserverExample} +## Actor Standby + +The example above runs a web server for the duration of a single Actor run. With [Actor Standby](https://docs.apify.com/platform/actors/development/programming-interface/standby), you can instead expose your Actor as an always-ready HTTP API: the platform keeps the Actor running in the background and routes incoming HTTP requests to the web server inside it, spinning up additional instances as the load grows. + +From the SDK's perspective, a Standby Actor is built the same way as the web server above — start an HTTP server listening on the port from `Actor.configuration.web_server_port`. The difference is operational: instead of doing its work once and exiting, a Standby Actor stays up and serves requests. This makes it a good fit for low-latency, on-demand use cases, such as serving scraped data or acting as a microservice. + +To get started quickly, use the [Standby Python template](https://apify.com/templates/python-standby). For details on enabling Standby, request routing, and readiness probes, see the [Actor Standby documentation](https://docs.apify.com/platform/actors/development/programming-interface/standby). + ## Conclusion In this guide, you learned how to run a web server inside your Apify Actor. By leveraging the container URL and port provided by the platform, you can expose HTTP endpoints for monitoring, reporting, or serving content during Actor execution. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). 
diff --git a/docs/03_guides/code/01_beautifulsoup_httpx.py b/docs/03_guides/code/01_beautifulsoup_httpx.py index 5dbfab2a..86e83868 100644 --- a/docs/03_guides/code/01_beautifulsoup_httpx.py +++ b/docs/03_guides/code/01_beautifulsoup_httpx.py @@ -1,4 +1,5 @@ import asyncio +from typing import Any from urllib.parse import urljoin import httpx @@ -7,6 +8,40 @@ from apify import Actor, Request +async def scrape_page( + client: httpx.AsyncClient, url: str +) -> tuple[dict[str, Any], list[str]]: + """Fetch a single page with HTTPX and extract its data and links. + + Keeping the fetching and parsing in this helper keeps the Actor's main loop + shallow. It returns the extracted data together with the links found on the + page, so `main` only has to decide what to store and what to enqueue. + """ + # Fetch the HTTP response from the specified URL using HTTPX. + response = await client.get(url, follow_redirects=True) + + # Parse the HTML content using Beautiful Soup. + soup = BeautifulSoup(response.content, 'html.parser') + + # Extract the desired data. + data = { + 'url': url, + 'title': soup.title.string if soup.title else None, + 'h1s': [h1.text for h1 in soup.find_all('h1')], + 'h2s': [h2.text for h2 in soup.find_all('h2')], + 'h3s': [h3.text for h3 in soup.find_all('h3')], + } + + # Collect absolute links found on the page so the caller can enqueue them. + links: list[str] = [] + for link in soup.find_all('a'): + link_url = urljoin(url, link.get('href')) + if link_url.startswith(('http://', 'https://')): + links.append(link_url) + + return data, links + + async def main() -> None: # Enter the context of the Actor. async with Actor: @@ -23,12 +58,11 @@ async def main() -> None: # Open the default request queue for handling URLs to be processed. request_queue = await Actor.open_request_queue() - # Enqueue the start URLs with an initial crawl depth of 0. + # Enqueue the start URLs. Their crawl depth defaults to 0. 
for start_url in start_urls: url = start_url.get('url') Actor.log.info(f'Enqueuing {url} ...') - new_request = Request.from_url(url, user_data={'depth': 0}) - await request_queue.add_request(new_request) + await request_queue.add_request(Request.from_url(url)) # Create an HTTPX client to fetch the HTML content of the URLs. async with httpx.AsyncClient() as client: @@ -36,52 +70,31 @@ async def main() -> None: while request := await request_queue.fetch_next_request(): url = request.url - if not isinstance(request.user_data['depth'], (str, int)): - raise TypeError('Request.depth is an unexpected type.') - - depth = int(request.user_data['depth']) + # Read the crawl depth tracked by the request itself. + depth = request.crawl_depth Actor.log.info(f'Scraping {url} (depth={depth}) ...') try: - # Fetch the HTTP response from the specified URL using HTTPX. - response = await client.get(url, follow_redirects=True) - - # Parse the HTML content using Beautiful Soup. - soup = BeautifulSoup(response.content, 'html.parser') - - # If the current depth is less than max_depth, find nested links - # and enqueue them. - if depth < max_depth: - for link in soup.find_all('a'): - link_href = link.get('href') - link_url = urljoin(url, link_href) - - if link_url.startswith(('http://', 'https://')): - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url( - link_url, - user_data={'depth': depth + 1}, - ) - await request_queue.add_request(new_request) - - # Extract the desired data. - data = { - 'url': url, - 'title': soup.title.string if soup.title else None, - 'h1s': [h1.text for h1 in soup.find_all('h1')], - 'h2s': [h2.text for h2 in soup.find_all('h2')], - 'h3s': [h3.text for h3 in soup.find_all('h3')], - } + # Fetch the page and extract its data and nested links. + data, links = await scrape_page(client, url) # Store the extracted data to the default dataset. await Actor.push_data(data) + # If we are not too deep yet, enqueue the links we found. 
+ if depth < max_depth: + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + new_request = Request.from_url(link_url) + new_request.crawl_depth = depth + 1 + await request_queue.add_request(new_request) + except Exception: Actor.log.exception(f'Cannot extract data from {url}.') finally: - # Mark the request as handled to ensure it is not processed again. - await request_queue.mark_request_as_handled(new_request) + # Mark the request as handled so it is not processed again. + await request_queue.mark_request_as_handled(request) if __name__ == '__main__': diff --git a/docs/03_guides/code/02_parsel_impit.py b/docs/03_guides/code/02_parsel_impit.py index 21b5e74f..1a0c4f77 100644 --- a/docs/03_guides/code/02_parsel_impit.py +++ b/docs/03_guides/code/02_parsel_impit.py @@ -1,4 +1,5 @@ import asyncio +from typing import Any from urllib.parse import urljoin import impit @@ -7,6 +8,40 @@ from apify import Actor, Request +async def scrape_page( + client: impit.AsyncClient, url: str +) -> tuple[dict[str, Any], list[str]]: + """Fetch a single page with Impit and extract its data and links. + + Keeping the fetching and parsing in this helper keeps the Actor's main loop + shallow. It returns the extracted data together with the links found on the + page, so `main` only has to decide what to store and what to enqueue. + """ + # Fetch the HTTP response from the specified URL using Impit. + response = await client.get(url) + + # Parse the HTML content using a Parsel selector. + selector = parsel.Selector(text=response.text) + + # Extract the desired data using Parsel selectors. + data = { + 'url': url, + 'title': selector.css('title::text').get(), + 'h1s': selector.css('h1::text').getall(), + 'h2s': selector.css('h2::text').getall(), + 'h3s': selector.css('h3::text').getall(), + } + + # Collect absolute links found on the page so the caller can enqueue them. 
+ links: list[str] = [] + for link_href in selector.css('a::attr(href)').getall(): + link_url = urljoin(url, link_href) + if link_url.startswith(('http://', 'https://')): + links.append(link_url) + + return data, links + + async def main() -> None: # Enter the context of the Actor. async with Actor: @@ -23,12 +58,11 @@ async def main() -> None: # Open the default request queue for handling URLs to be processed. request_queue = await Actor.open_request_queue() - # Enqueue the start URLs with an initial crawl depth of 0. + # Enqueue the start URLs. Their crawl depth defaults to 0. for start_url in start_urls: url = start_url.get('url') Actor.log.info(f'Enqueuing {url} ...') - new_request = Request.from_url(url, user_data={'depth': 0}) - await request_queue.add_request(new_request) + await request_queue.add_request(Request.from_url(url)) # Create an Impit client to fetch the HTML content of the URLs. async with impit.AsyncClient() as client: @@ -36,57 +70,30 @@ async def main() -> None: while request := await request_queue.fetch_next_request(): url = request.url - if not isinstance(request.user_data['depth'], (str, int)): - raise TypeError('Request.depth is an unexpected type.') - - depth = int(request.user_data['depth']) + # Read the crawl depth tracked by the request itself. + depth = request.crawl_depth Actor.log.info(f'Scraping {url} (depth={depth}) ...') try: - # Fetch the HTTP response from the specified URL using Impit. - response = await client.get(url) - - # Parse the HTML content using Parsel Selector. - selector = parsel.Selector(text=response.text) - - # If the current depth is less than max_depth, find nested links - # and enqueue them. 
- if depth < max_depth: - # Extract all links using CSS selector - links = selector.css('a::attr(href)').getall() - for link_href in links: - link_url = urljoin(url, link_href) - - if link_url.startswith(('http://', 'https://')): - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url( - link_url, - user_data={'depth': depth + 1}, - ) - await request_queue.add_request(new_request) - - # Extract the desired data using Parsel selectors. - title = selector.css('title::text').get() - h1s = selector.css('h1::text').getall() - h2s = selector.css('h2::text').getall() - h3s = selector.css('h3::text').getall() - - data = { - 'url': url, - 'title': title, - 'h1s': h1s, - 'h2s': h2s, - 'h3s': h3s, - } + # Fetch the page and extract its data and nested links. + data, links = await scrape_page(client, url) # Store the extracted data to the default dataset. await Actor.push_data(data) + # If we are not too deep yet, enqueue the links we found. + if depth < max_depth: + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + new_request = Request.from_url(link_url) + new_request.crawl_depth = depth + 1 + await request_queue.add_request(new_request) + except Exception: Actor.log.exception(f'Cannot extract data from {url}.') finally: - # Mark the request as handled to ensure it is not processed again. + # Mark the request as handled so it is not processed again. await request_queue.mark_request_as_handled(request) diff --git a/docs/03_guides/code/03_playwright.py b/docs/03_guides/code/03_playwright.py index 3eecb4ac..1f2fc1d7 100644 --- a/docs/03_guides/code/03_playwright.py +++ b/docs/03_guides/code/03_playwright.py @@ -1,7 +1,8 @@ import asyncio +from typing import Any from urllib.parse import urljoin -from playwright.async_api import async_playwright +from playwright.async_api import BrowserContext, async_playwright from apify import Actor, Request @@ -11,6 +12,39 @@ # in the Actor's Docker image. 
+async def scrape_page( + context: BrowserContext, url: str +) -> tuple[dict[str, Any], list[str]]: + """Open a page in the browser, extract its data, and collect its links. + + Keeping the page handling in this helper keeps the Actor's main loop shallow. + It returns the extracted data together with the links found on the page, so + `main` only has to decide what to store and what to enqueue. + """ + page = await context.new_page() + try: + await page.goto(url) + + # Extract the desired data. + data = { + 'url': url, + 'title': await page.title(), + } + + # Collect absolute links found on the page so the caller can enqueue them. + links: list[str] = [] + for link in await page.locator('a').all(): + link_href = await link.get_attribute('href') + link_url = urljoin(url, link_href) + if link_url.startswith(('http://', 'https://')): + links.append(link_url) + + return data, links + + finally: + await page.close() + + async def main() -> None: # Enter the context of the Actor. async with Actor: @@ -21,18 +55,17 @@ async def main() -> None: # Exit if no start URLs are provided. if not start_urls: - Actor.log.info('No start URLs specified in actor input, exiting...') + Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() # Open the default request queue for handling URLs to be processed. request_queue = await Actor.open_request_queue() - # Enqueue the start URLs with an initial crawl depth of 0. + # Enqueue the start URLs. Their crawl depth defaults to 0. 
for start_url in start_urls: url = start_url.get('url') Actor.log.info(f'Enqueuing {url} ...') - new_request = Request.from_url(url, user_data={'depth': 0}) - await request_queue.add_request(new_request) + await request_queue.add_request(Request.from_url(url)) Actor.log.info('Launching Playwright...') @@ -49,47 +82,30 @@ async def main() -> None: while request := await request_queue.fetch_next_request(): url = request.url - if not isinstance(request.user_data['depth'], (str, int)): - raise TypeError('Request.depth is an unexpected type.') - - depth = int(request.user_data['depth']) + # Read the crawl depth tracked by the request itself. + depth = request.crawl_depth Actor.log.info(f'Scraping {url} (depth={depth}) ...') try: - # Open a new page in the browser context and navigate to the URL. - page = await context.new_page() - await page.goto(url) - - # If the current depth is less than max_depth, find nested links - # and enqueue them. - if depth < max_depth: - for link in await page.locator('a').all(): - link_href = await link.get_attribute('href') - link_url = urljoin(url, link_href) - - if link_url.startswith(('http://', 'https://')): - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url( - link_url, - user_data={'depth': depth + 1}, - ) - await request_queue.add_request(new_request) - - # Extract the desired data. - data = { - 'url': url, - 'title': await page.title(), - } + # Fetch the page and extract its data and nested links. + data, links = await scrape_page(context, url) # Store the extracted data to the default dataset. await Actor.push_data(data) + # If we are not too deep yet, enqueue the links we found. 
+ if depth < max_depth: + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + new_request = Request.from_url(link_url) + new_request.crawl_depth = depth + 1 + await request_queue.add_request(new_request) + except Exception: Actor.log.exception(f'Cannot extract data from {url}.') finally: - await page.close() - # Mark the request as handled to ensure it is not processed again. + # Mark the request as handled so it is not processed again. await request_queue.mark_request_as_handled(request) diff --git a/docs/03_guides/code/04_selenium.py b/docs/03_guides/code/04_selenium.py index 4b427a7a..42dc3509 100644 --- a/docs/03_guides/code/04_selenium.py +++ b/docs/03_guides/code/04_selenium.py @@ -1,4 +1,5 @@ import asyncio +from typing import Any from urllib.parse import urljoin from selenium import webdriver @@ -14,6 +15,32 @@ # in the Actor's Docker image. +def scrape_page(driver: webdriver.Chrome, url: str) -> tuple[dict[str, Any], list[str]]: + """Navigate to a page with Selenium, extract its data, and collect its links. + + These are blocking WebDriver calls, so the Actor's main loop runs this helper + in a worker thread via `asyncio.to_thread`. It returns the extracted data + together with the links found on the page, so `main` only has to decide what + to store and what to enqueue. + """ + driver.get(url) + + # Extract the desired data. + data = { + 'url': url, + 'title': driver.title, + } + + # Collect absolute links found on the page so the caller can enqueue them. + links: list[str] = [] + for link in driver.find_elements(By.TAG_NAME, 'a'): + link_url = urljoin(url, link.get_attribute('href')) + if link_url.startswith(('http://', 'https://')): + links.append(link_url) + + return data, links + + async def main() -> None: # Enter the context of the Actor. async with Actor: @@ -24,18 +51,17 @@ async def main() -> None: # Exit if no start URLs are provided. 
if not start_urls: - Actor.log.info('No start URLs specified in actor input, exiting...') + Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() # Open the default request queue for handling URLs to be processed. request_queue = await Actor.open_request_queue() - # Enqueue the start URLs with an initial crawl depth of 0. + # Enqueue the start URLs. Their crawl depth defaults to 0. for start_url in start_urls: url = start_url.get('url') Actor.log.info(f'Enqueuing {url} ...') - new_request = Request.from_url(url, user_data={'depth': 0}) - await request_queue.add_request(new_request) + await request_queue.add_request(Request.from_url(url)) # Launch a new Selenium Chrome WebDriver and configure it. Actor.log.info('Launching Chrome WebDriver...') @@ -57,46 +83,31 @@ async def main() -> None: while request := await request_queue.fetch_next_request(): url = request.url - if not isinstance(request.user_data['depth'], (str, int)): - raise TypeError('Request.depth is an unexpected type.') - - depth = int(request.user_data['depth']) + # Read the crawl depth tracked by the request itself. + depth = request.crawl_depth Actor.log.info(f'Scraping {url} (depth={depth}) ...') try: - # Navigate to the URL using Selenium WebDriver. Use asyncio.to_thread - # for non-blocking execution. - await asyncio.to_thread(driver.get, url) - - # If the current depth is less than max_depth, find nested links - # and enqueue them. - if depth < max_depth: - for link in driver.find_elements(By.TAG_NAME, 'a'): - link_href = link.get_attribute('href') - link_url = urljoin(url, link_href) - - if link_url.startswith(('http://', 'https://')): - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url( - link_url, - user_data={'depth': depth + 1}, - ) - await request_queue.add_request(new_request) - - # Extract the desired data. - data = { - 'url': url, - 'title': driver.title, - } + # Fetch the page and extract its data and nested links. 
The blocking + # WebDriver calls run in a worker thread to keep the loop responsive. + data, links = await asyncio.to_thread(scrape_page, driver, url) # Store the extracted data to the default dataset. await Actor.push_data(data) + # If we are not too deep yet, enqueue the links we found. + if depth < max_depth: + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + new_request = Request.from_url(link_url) + new_request.crawl_depth = depth + 1 + await request_queue.add_request(new_request) + except Exception: Actor.log.exception(f'Cannot extract data from {url}.') finally: - # Mark the request as handled to ensure it is not processed again. + # Mark the request as handled so it is not processed again. await request_queue.mark_request_as_handled(request) driver.quit() From 10c81997481042e6145e412d24aa9df45dd12f14 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 20:47:08 +0200 Subject: [PATCH 2/2] docs: retitle and unify existing guides; move web server guide to 12 --- docs/01_introduction/quick-start.mdx | 26 ++- docs/03_guides/01_beautifulsoup_httpx.mdx | 10 +- docs/03_guides/02_parsel_impit.mdx | 10 +- docs/03_guides/03_playwright.mdx | 12 +- docs/03_guides/04_selenium.mdx | 14 +- docs/03_guides/05_crawlee.mdx | 8 +- docs/03_guides/06_scrapy.mdx | 4 +- ...webserver.mdx => 12_running_webserver.mdx} | 28 ++- docs/03_guides/code/01_beautifulsoup_httpx.py | 125 +++++++----- docs/03_guides/code/02_parsel_impit.py | 125 +++++++----- docs/03_guides/code/03_playwright.py | 109 ++++++---- docs/03_guides/code/04_selenium.py | 186 +++++++++++++----- .../code/05_crawlee_beautifulsoup.py | 38 ++-- docs/03_guides/code/05_crawlee_parsel.py | 38 ++-- docs/03_guides/code/05_crawlee_playwright.py | 46 +++-- .../code/{07_webserver.py => 12_webserver.py} | 6 +- docs/03_guides/code/12_webserver_fastapi.py | 48 +++++ .../code/scrapy_project/src/__main__.py | 2 +- .../03_guides/code/scrapy_project/src/main.py | 6 +- 
.../code/scrapy_project/src/settings.py | 2 +- .../code/scrapy_project/src/spiders/title.py | 29 +-- 21 files changed, 558 insertions(+), 314 deletions(-) rename docs/03_guides/{07_running_webserver.mdx => 12_running_webserver.mdx} (66%) rename docs/03_guides/code/{07_webserver.py => 12_webserver.py} (87%) create mode 100644 docs/03_guides/code/12_webserver_fastapi.py diff --git a/docs/01_introduction/quick-start.mdx b/docs/01_introduction/quick-start.mdx index da166da9..c74bd848 100644 --- a/docs/01_introduction/quick-start.mdx +++ b/docs/01_introduction/quick-start.mdx @@ -67,7 +67,7 @@ The Actor's source code is in the `src` folder. This folder contains two importa {MainExample} - + {UnderscoreMainExample} @@ -97,12 +97,20 @@ To learn more about the features of the Apify SDK and how to use them, check out ### Guides -To see how you can integrate the Apify SDK with popular web scraping libraries, check out our guides: +To see how you can integrate the Apify SDK with popular scraping libraries and frameworks, check out these guides: -- [BeautifulSoup with HTTPX](../guides/beautifulsoup-httpx) -- [Parsel with Impit](../guides/parsel-impit) -- [Playwright](../guides/playwright) -- [Selenium](../guides/selenium) -- [Crawlee](../guides/crawlee) -- [Scrapy](../guides/scrapy) -- [Running webserver](../guides/running-webserver) +- [Scraping with BeautifulSoup and HTTPX](../guides/beautifulsoup-httpx) +- [Scraping with Parsel and Impit](../guides/parsel-impit) +- [Browser automation with Playwright](../guides/playwright) +- [Browser automation with Selenium](../guides/selenium) +- [Building crawlers with Crawlee](../guides/crawlee) +- [Building crawlers with Scrapy](../guides/scrapy) +- [Adaptive scraping with Scrapling](../guides/scrapling) +- [LLM-ready scraping with Crawl4AI](../guides/crawl4ai) +- [Browser AI agents with Browser Use](../guides/browser-use) + +For other aspects of Actor development, explore these guides: + +- [Project management with uv](../guides/uv) 
+- [Input validation with Pydantic](../guides/input-validation) +- [Running a web server](../guides/running-webserver) diff --git a/docs/03_guides/01_beautifulsoup_httpx.mdx b/docs/03_guides/01_beautifulsoup_httpx.mdx index ba15df03..2ae47ded 100644 --- a/docs/03_guides/01_beautifulsoup_httpx.mdx +++ b/docs/03_guides/01_beautifulsoup_httpx.mdx @@ -1,6 +1,6 @@ --- id: beautifulsoup-httpx -title: Use BeautifulSoup with HTTPX +title: Scraping with BeautifulSoup and HTTPX description: Build an Apify Actor that scrapes web pages using BeautifulSoup and HTTPX. --- @@ -8,7 +8,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import BeautifulSoupHttpxExample from '!!raw-loader!roa-loader!./code/01_beautifulsoup_httpx.py'; -In this guide, you'll learn how to use the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) library with the [HTTPX](https://www.python-httpx.org/) library in your Apify Actors. +In this guide, you'll learn how to scrape web pages with the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) and [HTTPX](https://www.python-httpx.org/) libraries in your Apify Actors. ## Introduction @@ -20,12 +20,16 @@ To create an Actor which uses those libraries, start from the [BeautifulSoup & P ## Example Actor -Below is a simple Actor that recursively scrapes titles from all linked websites, up to a specified maximum depth, starting from URLs provided in the Actor input. It uses [HTTPX](https://www.python-httpx.org/) for fetching pages and [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for parsing their content to extract titles and links to other pages. +Below is a simple Actor that recursively scrapes data from linked pages on the same site, up to a specified maximum depth, starting from URLs provided in the Actor input. 
It uses [HTTPX](https://www.python-httpx.org/) for fetching pages through [Apify Proxy](https://docs.apify.com/platform/proxy) and [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for parsing their content to extract the title, headings, and links to other pages. {BeautifulSoupHttpxExample} +## Using Apify Proxy + +Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. The example creates a proxy configuration with `Actor.create_proxy_configuration` and fetches a fresh proxy URL for every request, so each page goes through a different IP. A new HTTPX client is created per request to apply that URL. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. + ## Conclusion In this guide, you learned how to use the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) with the [HTTPX](https://www.python-httpx.org/) in your Apify Actors. By combining these libraries, you can efficiently extract data from HTML or XML files, making it easy to build web scraping tasks in Python. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 
diff --git a/docs/03_guides/02_parsel_impit.mdx b/docs/03_guides/02_parsel_impit.mdx index da5a2866..d91ebea2 100644 --- a/docs/03_guides/02_parsel_impit.mdx +++ b/docs/03_guides/02_parsel_impit.mdx @@ -1,6 +1,6 @@ --- id: parsel-impit -title: Use Parsel with Impit +title: Scraping with Parsel and Impit description: Build an Apify Actor that scrapes web pages using Parsel selectors and the Impit HTTP client. --- @@ -8,7 +8,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import ParselImpitExample from '!!raw-loader!roa-loader!./code/02_parsel_impit.py'; -In this guide, you'll learn how to combine the [Parsel](https://github.com/scrapy/parsel) and [Impit](https://github.com/apify/impit) libraries when building Apify Actors. +In this guide, you'll learn how to scrape web pages with the [Parsel](https://github.com/scrapy/parsel) and [Impit](https://github.com/apify/impit) libraries in your Apify Actors. ## Introduction @@ -18,12 +18,16 @@ In this guide, you'll learn how to combine the [Parsel](https://github.com/scrap ## Example Actor -The following example shows a simple Actor that recursively scrapes titles from linked pages, up to a user-defined maximum depth. It uses [Impit](https://github.com/apify/impit) to fetch pages and [Parsel](https://github.com/scrapy/parsel) to extract titles and discover new links. +The following example shows a simple Actor that recursively scrapes data from linked pages on the same site, up to a user-defined maximum depth. It uses [Impit](https://github.com/apify/impit) to fetch pages through [Apify Proxy](https://docs.apify.com/platform/proxy) and [Parsel](https://github.com/scrapy/parsel) to extract the title, headings, and links. {ParselImpitExample} +## Using Apify Proxy + +Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. 
The example creates a proxy configuration with `Actor.create_proxy_configuration` and fetches a fresh proxy URL for every request, so each page goes through a different IP. A new Impit client is created per request to apply that URL. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. + ## Conclusion In this guide, you learned how to use [Parsel](https://github.com/scrapy/parsel) with [Impit](https://github.com/apify/impit) in your Apify Actors. By combining these libraries, you get a powerful and efficient solution for web scraping: [Parsel](https://github.com/scrapy/parsel) provides excellent CSS selector and XPath support for data extraction, while [Impit](https://github.com/apify/impit) offers a fast and simple HTTP client built by Apify. This combination makes it easy to build scalable web scraping tasks in Python. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/docs/03_guides/03_playwright.mdx b/docs/03_guides/03_playwright.mdx index 0e20b9e4..11b57b7e 100644 --- a/docs/03_guides/03_playwright.mdx +++ b/docs/03_guides/03_playwright.mdx @@ -1,6 +1,6 @@ --- id: playwright -title: Use Playwright +title: Browser automation with Playwright description: Build an Apify Actor that scrapes dynamic web pages using Playwright browser automation. --- @@ -11,7 +11,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import PlaywrightExample from '!!raw-loader!roa-loader!./code/03_playwright.py'; -In this guide, you'll learn how to use [Playwright](https://playwright.dev) for web scraping in your Apify Actors. 
+In this guide, you'll learn how to use [Playwright](https://playwright.dev) for browser automation and web scraping in your Apify Actors. ## Introduction @@ -48,14 +48,18 @@ playwright install --with-deps` ## Example Actor -This is a simple Actor that recursively scrapes titles from all linked websites, up to a maximum depth, starting from URLs in the Actor input. +This is a simple Actor that recursively scrapes data from linked pages on the same site, up to a maximum depth, starting from URLs in the Actor input. -It uses Playwright to open the pages in an automated Chrome browser, and to extract the title and anchor elements after the pages load. +It uses Playwright to open the pages in an automated Chrome browser, and to extract the title, headings, and links after the pages load. {PlaywrightExample} +## Using Apify Proxy + +Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. The example creates a proxy configuration with `Actor.create_proxy_configuration` and launches the browser through it. Playwright applies the proxy at the browser level, so the whole run shares a single proxy URL rather than rotating per request; the `to_playwright_proxy` helper splits that URL into the `server`, `username`, and `password` fields Playwright expects. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. + ## Conclusion In this guide you learned how to create Actors that use Playwright to scrape websites. Playwright is a powerful tool that can be used to manage browser instances and scrape websites that require JavaScript execution. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. 
If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/docs/03_guides/04_selenium.mdx b/docs/03_guides/04_selenium.mdx index e878c3a6..ae4ccbef 100644 --- a/docs/03_guides/04_selenium.mdx +++ b/docs/03_guides/04_selenium.mdx @@ -1,6 +1,6 @@ --- id: selenium -title: Use Selenium +title: Browser automation with Selenium description: Build an Apify Actor that scrapes dynamic web pages using Selenium WebDriver. --- @@ -8,7 +8,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import SeleniumExample from '!!raw-loader!roa-loader!./code/04_selenium.py'; -In this guide, you'll learn how to use [Selenium](https://www.selenium.dev/) for web scraping in your Apify Actors. +In this guide, you'll learn how to use [Selenium](https://www.selenium.dev/) for browser automation and web scraping in your Apify Actors. ## Introduction @@ -32,14 +32,20 @@ Refer to the [Selenium documentation](https://www.selenium.dev/documentation/web ## Example Actor -This is a simple Actor that recursively scrapes titles from all linked websites, up to a maximum depth, starting from URLs in the Actor input. +This is a simple Actor that recursively scrapes data from linked pages on the same site, up to a maximum depth, starting from URLs in the Actor input. -It uses Selenium ChromeDriver to open the pages in an automated Chrome browser, and to extract the title and anchor elements after the pages load. +It uses Selenium ChromeDriver to open the pages in an automated Chrome browser, and to extract the title, headings, and links after the pages load. {SeleniumExample} +## Using Apify Proxy + +Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. 
The example creates a proxy configuration with `Actor.create_proxy_configuration` and routes the browser through it for the whole run. + +Chrome ignores the credentials passed in the `--proxy-server` flag, so an authenticated proxy such as Apify Proxy has to be configured from inside a small extension. The `proxy_auth_extension` helper builds one at runtime: its service worker sets the proxy server and answers the browser's authentication challenge with the username and password. Note that the new headless mode (`--headless=new`) is required for Chrome to load the extension. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. + ## Conclusion In this guide you learned how to use Selenium for web scraping in Apify Actors. You can now create your own Actors that use Selenium to scrape dynamic websites and interact with web pages just like a human would. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/docs/03_guides/05_crawlee.mdx b/docs/03_guides/05_crawlee.mdx index 34bb0f46..f0aa67f6 100644 --- a/docs/03_guides/05_crawlee.mdx +++ b/docs/03_guides/05_crawlee.mdx @@ -1,6 +1,6 @@ --- id: crawlee -title: Use Crawlee +title: Building crawlers with Crawlee description: Build Apify Actors using Crawlee's BeautifulSoupCrawler, ParselCrawler, or PlaywrightCrawler. 
--- @@ -10,7 +10,7 @@ import CrawleeBeautifulSoupExample from '!!raw-loader!roa-loader!./code/05_crawl import CrawleeParselExample from '!!raw-loader!roa-loader!./code/05_crawlee_parsel.py'; import CrawleePlaywrightExample from '!!raw-loader!roa-loader!./code/05_crawlee_playwright.py'; -In this guide, you'll learn how to use the [Crawlee](https://crawlee.dev/python) library in your Apify Actors. +In this guide, you'll learn how to build web crawlers with the [Crawlee](https://crawlee.dev/python) library in your Apify Actors. ## Introduction @@ -42,6 +42,10 @@ The [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler {CrawleePlaywrightExample} +## Using Apify Proxy + +All three crawlers above route their requests through [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. `Actor.create_proxy_configuration` returns a Crawlee-compatible proxy configuration, which is passed to the crawler as `proxy_configuration`; Crawlee then rotates the proxy IP for every request on its own. Because the configuration is only available inside the running Actor, the crawler is created in `main` and the request handler is registered on a standalone [`Router`](https://crawlee.dev/python/api/class/Router) up front. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. + ## Conclusion In this guide, you learned how to use the [Crawlee](https://crawlee.dev/python) library in your Apify Actors. By using the [`BeautifulSoupCrawler`](https://crawlee.dev/python/api/class/BeautifulSoupCrawler), [`ParselCrawler`](https://crawlee.dev/python/api/class/ParselCrawler), and [`PlaywrightCrawler`](https://crawlee.dev/python/api/class/PlaywrightCrawler) crawlers, you can efficiently scrape static or dynamic web pages, making it easy to build web scraping tasks in Python. 
See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! diff --git a/docs/03_guides/06_scrapy.mdx b/docs/03_guides/06_scrapy.mdx index 81409ab2..4af24354 100644 --- a/docs/03_guides/06_scrapy.mdx +++ b/docs/03_guides/06_scrapy.mdx @@ -1,6 +1,6 @@ --- id: scrapy -title: Use Scrapy +title: Building crawlers with Scrapy description: Convert Scrapy spiders into Apify Actors with platform storage and proxy integration. --- @@ -15,7 +15,7 @@ import ItemsExample from '!!raw-loader!./code/scrapy_project/src/items.py'; import SpidersExample from '!!raw-loader!./code/scrapy_project/src/spiders/title.py'; import SettingsExample from '!!raw-loader!./code/scrapy_project/src/settings.py'; -In this guide, you'll learn how to use the [Scrapy](https://scrapy.org/) framework in your Apify Actors. +In this guide, you'll learn how to build web crawlers with the [Scrapy](https://scrapy.org/) framework in your Apify Actors. ## Introduction diff --git a/docs/03_guides/07_running_webserver.mdx b/docs/03_guides/12_running_webserver.mdx similarity index 66% rename from docs/03_guides/07_running_webserver.mdx rename to docs/03_guides/12_running_webserver.mdx index 9b63976a..7b946e86 100644 --- a/docs/03_guides/07_running_webserver.mdx +++ b/docs/03_guides/12_running_webserver.mdx @@ -1,12 +1,13 @@ --- id: running-webserver -title: Run a web server +title: Running a web server description: Run an HTTP server inside your Actor for monitoring or serving content during execution. 
--- import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; -import WebserverExample from '!!raw-loader!roa-loader!./code/07_webserver.py'; +import WebserverExample from '!!raw-loader!roa-loader!./code/12_webserver.py'; +import WebserverFastApiExample from '!!raw-loader!roa-loader!./code/12_webserver_fastapi.py'; In this guide, you'll learn how to run a web server inside your Apify Actor. This is useful for monitoring Actor progress, creating custom APIs, or serving content during the Actor run. @@ -30,6 +31,29 @@ The following example shows how to start a simple web server in your Actor, whic {WebserverExample} +## Using FastAPI + +The example above relies only on Python's standard library, which keeps it dependency-free but leaves you handling requests by hand. For anything beyond a single endpoint, a web framework such as [FastAPI](https://fastapi.tiangolo.com/) is a better fit - it gives you routing, request parsing, and automatic JSON responses, and is served by an ASGI server like [uvicorn](https://www.uvicorn.org/). + +Install both, for example by adding them to your `requirements.txt`: + +```text +fastapi +uvicorn[standard] +``` + +The following Actor serves the same processed-items counter as before, but through a FastAPI endpoint. The key difference is that uvicorn runs inside the Actor's event loop as a background task, bound to `Actor.configuration.web_server_port` so the platform routes the container URL to it: + + + {WebserverFastApiExample} + + +A few things worth pointing out: + +- `uvicorn.Server(...).serve()` is a coroutine, so it runs as an `asyncio` task alongside the Actor's own work instead of blocking it. Setting `server.should_exit = True` triggers a graceful shutdown once the work is done. +- The server binds to `0.0.0.0` (all interfaces) rather than `localhost`, so it's reachable through the container URL, not only from inside the container. 
+- The same pattern powers an [Actor Standby](#actor-standby) service - swap the one-off work loop for an Actor that just keeps serving requests. + ## Actor Standby The example above runs a web server for the duration of a single Actor run. With [Actor Standby](https://docs.apify.com/platform/actors/development/programming-interface/standby), you can instead expose your Actor as an always-ready HTTP API: the platform keeps the Actor running in the background and routes incoming HTTP requests to the web server inside it, spinning up additional instances as the load grows. diff --git a/docs/03_guides/code/01_beautifulsoup_httpx.py b/docs/03_guides/code/01_beautifulsoup_httpx.py index 86e83868..adc03361 100644 --- a/docs/03_guides/code/01_beautifulsoup_httpx.py +++ b/docs/03_guides/code/01_beautifulsoup_httpx.py @@ -1,29 +1,26 @@ import asyncio from typing import Any -from urllib.parse import urljoin +from urllib.parse import urljoin, urlsplit import httpx from bs4 import BeautifulSoup from apify import Actor, Request +from apify.storages import RequestQueue async def scrape_page( - client: httpx.AsyncClient, url: str + url: str, + *, + proxy_url: str | None = None, ) -> tuple[dict[str, Any], list[str]]: - """Fetch a single page with HTTPX and extract its data and links. + """Fetch a page with HTTPX and return its data and same-site links.""" + # A fresh client per call lets each request use a new proxy URL. + async with httpx.AsyncClient(proxy=proxy_url) as client: + response = await client.get(url, follow_redirects=True) - Keeping the fetching and parsing in this helper keeps the Actor's main loop - shallow. It returns the extracted data together with the links found on the - page, so `main` only has to decide what to store and what to enqueue. - """ - # Fetch the HTTP response from the specified URL using HTTPX. - response = await client.get(url, follow_redirects=True) - - # Parse the HTML content using Beautiful Soup. 
soup = BeautifulSoup(response.content, 'html.parser') - # Extract the desired data. data = { 'url': url, 'title': soup.title.string if soup.title else None, @@ -32,69 +29,91 @@ async def scrape_page( 'h3s': [h3.text for h3 in soup.find_all('h3')], } - # Collect absolute links found on the page so the caller can enqueue them. + # Keep only absolute links on the same host. links: list[str] = [] + host = urlsplit(url).netloc for link in soup.find_all('a'): link_url = urljoin(url, link.get('href')) - if link_url.startswith(('http://', 'https://')): + if not link_url.startswith(('http://', 'https://')): + continue + if urlsplit(link_url).netloc == host: links.append(link_url) return data, links +async def enqueue_links( + request_queue: RequestQueue, + links: list[str], + *, + depth: int, + max_depth: int, +) -> None: + """Enqueue the links one level deeper, unless max_depth was reached.""" + if depth >= max_depth: + return + + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + request = Request.from_url(link_url) + request.crawl_depth = depth + 1 + await request_queue.add_request(request) + + async def main() -> None: - # Enter the context of the Actor. async with Actor: - # Retrieve the Actor input, and use default values if not provided. + # Read the Actor input. actor_input = await Actor.get_input() or {} - start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}]) - max_depth = actor_input.get('max_depth', 1) + start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) + max_depth = actor_input.get('maxDepth', 1) - # Exit if no start URLs are provided. if not start_urls: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() - # Open the default request queue for handling URLs to be processed. + # Set up Apify Proxy and the request queue. + proxy_configuration = await Actor.create_proxy_configuration() request_queue = await Actor.open_request_queue() - # Enqueue the start URLs. 
Their crawl depth defaults to 0. + # Enqueue the start URLs (crawl depth defaults to 0). for start_url in start_urls: url = start_url.get('url') - Actor.log.info(f'Enqueuing {url} ...') + Actor.log.info(f'Enqueuing start URL: {url}') await request_queue.add_request(Request.from_url(url)) - # Create an HTTPX client to fetch the HTML content of the URLs. - async with httpx.AsyncClient() as client: - # Process the URLs from the request queue. - while request := await request_queue.fetch_next_request(): - url = request.url - - # Read the crawl depth tracked by the request itself. - depth = request.crawl_depth - Actor.log.info(f'Scraping {url} (depth={depth}) ...') - - try: - # Fetch the page and extract its data and nested links. - data, links = await scrape_page(client, url) - - # Store the extracted data to the default dataset. - await Actor.push_data(data) - - # If we are not too deep yet, enqueue the links we found. - if depth < max_depth: - for link_url in links: - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url(link_url) - new_request.crawl_depth = depth + 1 - await request_queue.add_request(new_request) - - except Exception: - Actor.log.exception(f'Cannot extract data from {url}.') - - finally: - # Mark the request as handled so it is not processed again. - await request_queue.mark_request_as_handled(request) + # Cap the crawl; raise or remove to follow more pages. + max_requests = 50 + handled_requests = 0 + + while handled_requests < max_requests and ( + request := await request_queue.fetch_next_request() + ): + handled_requests += 1 + url = request.url + depth = request.crawl_depth + Actor.log.info(f'Scraping {url} (depth={depth}) ...') + + try: + # Fresh proxy URL per request (None if no proxy). 
+ proxy_url = None + if proxy_configuration: + proxy_url = await proxy_configuration.new_url() + + data, links = await scrape_page(url, proxy_url=proxy_url) + await Actor.push_data(data) + Actor.log.info( + f'Stored data from {url} ' + f'(title={data["title"]!r}, {len(links)} links found).' + ) + await enqueue_links( + request_queue, links, depth=depth, max_depth=max_depth + ) + + except Exception: + Actor.log.exception(f'Cannot extract data from {url}.') + + finally: + await request_queue.mark_request_as_handled(request) if __name__ == '__main__': diff --git a/docs/03_guides/code/02_parsel_impit.py b/docs/03_guides/code/02_parsel_impit.py index 1a0c4f77..c937f48e 100644 --- a/docs/03_guides/code/02_parsel_impit.py +++ b/docs/03_guides/code/02_parsel_impit.py @@ -1,29 +1,26 @@ import asyncio from typing import Any -from urllib.parse import urljoin +from urllib.parse import urljoin, urlsplit import impit import parsel from apify import Actor, Request +from apify.storages import RequestQueue async def scrape_page( - client: impit.AsyncClient, url: str + url: str, + *, + proxy_url: str | None = None, ) -> tuple[dict[str, Any], list[str]]: - """Fetch a single page with Impit and extract its data and links. + """Fetch a page with Impit and return its data and same-site links.""" + # A fresh client per call lets each request use a new proxy URL. + async with impit.AsyncClient(proxy=proxy_url) as client: + response = await client.get(url) - Keeping the fetching and parsing in this helper keeps the Actor's main loop - shallow. It returns the extracted data together with the links found on the - page, so `main` only has to decide what to store and what to enqueue. - """ - # Fetch the HTTP response from the specified URL using Impit. - response = await client.get(url) - - # Parse the HTML content using a Parsel selector. selector = parsel.Selector(text=response.text) - # Extract the desired data using Parsel selectors. 
data = { 'url': url, 'title': selector.css('title::text').get(), @@ -32,69 +29,91 @@ async def scrape_page( 'h3s': selector.css('h3::text').getall(), } - # Collect absolute links found on the page so the caller can enqueue them. + # Keep only absolute links on the same host. links: list[str] = [] + host = urlsplit(url).netloc for link_href in selector.css('a::attr(href)').getall(): link_url = urljoin(url, link_href) - if link_url.startswith(('http://', 'https://')): + if not link_url.startswith(('http://', 'https://')): + continue + if urlsplit(link_url).netloc == host: links.append(link_url) return data, links +async def enqueue_links( + request_queue: RequestQueue, + links: list[str], + *, + depth: int, + max_depth: int, +) -> None: + """Enqueue the links one level deeper, unless max_depth was reached.""" + if depth >= max_depth: + return + + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + request = Request.from_url(link_url) + request.crawl_depth = depth + 1 + await request_queue.add_request(request) + + async def main() -> None: - # Enter the context of the Actor. async with Actor: - # Retrieve the Actor input, and use default values if not provided. + # Read the Actor input. actor_input = await Actor.get_input() or {} - start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}]) - max_depth = actor_input.get('max_depth', 1) + start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) + max_depth = actor_input.get('maxDepth', 1) - # Exit if no start URLs are provided. if not start_urls: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() - # Open the default request queue for handling URLs to be processed. + # Set up Apify Proxy and the request queue. + proxy_configuration = await Actor.create_proxy_configuration() request_queue = await Actor.open_request_queue() - # Enqueue the start URLs. Their crawl depth defaults to 0. 
+ # Enqueue the start URLs (crawl depth defaults to 0). for start_url in start_urls: url = start_url.get('url') - Actor.log.info(f'Enqueuing {url} ...') + Actor.log.info(f'Enqueuing start URL: {url}') await request_queue.add_request(Request.from_url(url)) - # Create an Impit client to fetch the HTML content of the URLs. - async with impit.AsyncClient() as client: - # Process the URLs from the request queue. - while request := await request_queue.fetch_next_request(): - url = request.url - - # Read the crawl depth tracked by the request itself. - depth = request.crawl_depth - Actor.log.info(f'Scraping {url} (depth={depth}) ...') - - try: - # Fetch the page and extract its data and nested links. - data, links = await scrape_page(client, url) - - # Store the extracted data to the default dataset. - await Actor.push_data(data) - - # If we are not too deep yet, enqueue the links we found. - if depth < max_depth: - for link_url in links: - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url(link_url) - new_request.crawl_depth = depth + 1 - await request_queue.add_request(new_request) - - except Exception: - Actor.log.exception(f'Cannot extract data from {url}.') - - finally: - # Mark the request as handled so it is not processed again. - await request_queue.mark_request_as_handled(request) + # Cap the crawl; raise or remove to follow more pages. + max_requests = 50 + handled_requests = 0 + + while handled_requests < max_requests and ( + request := await request_queue.fetch_next_request() + ): + handled_requests += 1 + url = request.url + depth = request.crawl_depth + Actor.log.info(f'Scraping {url} (depth={depth}) ...') + + try: + # Fresh proxy URL per request (None if no proxy). 
+ proxy_url = None + if proxy_configuration: + proxy_url = await proxy_configuration.new_url() + + data, links = await scrape_page(url, proxy_url=proxy_url) + await Actor.push_data(data) + Actor.log.info( + f'Stored data from {url} ' + f'(title={data["title"]!r}, {len(links)} links found).' + ) + await enqueue_links( + request_queue, links, depth=depth, max_depth=max_depth + ) + + except Exception: + Actor.log.exception(f'Cannot extract data from {url}.') + + finally: + await request_queue.mark_request_as_handled(request) if __name__ == '__main__': diff --git a/docs/03_guides/code/03_playwright.py b/docs/03_guides/code/03_playwright.py index 1f2fc1d7..46c89867 100644 --- a/docs/03_guides/code/03_playwright.py +++ b/docs/03_guides/code/03_playwright.py @@ -1,42 +1,51 @@ import asyncio from typing import Any -from urllib.parse import urljoin +from urllib.parse import urljoin, urlsplit from playwright.async_api import BrowserContext, async_playwright from apify import Actor, Request +from apify.storages import RequestQueue -# Note: To run this Actor locally, ensure that Playwright browsers are installed. -# Run `playwright install --with-deps` in the Actor's virtual environment to install them. -# When running on the Apify platform, these dependencies are already included -# in the Actor's Docker image. +# To run locally, install the browsers first: `playwright install --with-deps`. +# On the Apify platform they are already in the Actor's Docker image. + + +def to_playwright_proxy(proxy_url: str) -> dict[str, str]: + """Split an Apify Proxy URL into Playwright's server/username/password.""" + parts = urlsplit(proxy_url) + return { + 'server': f'{parts.scheme}://{parts.hostname}:{parts.port}', + 'username': parts.username or '', + 'password': parts.password or '', + } async def scrape_page( context: BrowserContext, url: str ) -> tuple[dict[str, Any], list[str]]: - """Open a page in the browser, extract its data, and collect its links. 
- - Keeping the page handling in this helper keeps the Actor's main loop shallow. - It returns the extracted data together with the links found on the page, so - `main` only has to decide what to store and what to enqueue. - """ + """Open the URL in a new page and return its data and same-site links.""" page = await context.new_page() try: await page.goto(url) - # Extract the desired data. data = { 'url': url, 'title': await page.title(), + 'h1s': [await h1.text_content() for h1 in await page.locator('h1').all()], + 'h2s': [await h2.text_content() for h2 in await page.locator('h2').all()], + 'h3s': [await h3.text_content() for h3 in await page.locator('h3').all()], } - # Collect absolute links found on the page so the caller can enqueue them. + # Keep only absolute links on the same host. links: list[str] = [] + host = urlsplit(url).netloc for link in await page.locator('a').all(): link_href = await link.get_attribute('href') link_url = urljoin(url, link_href) - if link_url.startswith(('http://', 'https://')): + if not link_url.startswith(('http://', 'https://')): + continue + if urlsplit(link_url).netloc == host: links.append(link_url) return data, links @@ -45,67 +54,83 @@ async def scrape_page( await page.close() +async def enqueue_links( + request_queue: RequestQueue, + links: list[str], + *, + depth: int, + max_depth: int, +) -> None: + """Enqueue the links one level deeper, unless max_depth was reached.""" + if depth >= max_depth: + return + + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + request = Request.from_url(link_url) + request.crawl_depth = depth + 1 + await request_queue.add_request(request) + + async def main() -> None: - # Enter the context of the Actor. async with Actor: - # Retrieve the Actor input, and use default values if not provided. + # Read the Actor input. 
actor_input = await Actor.get_input() or {} - start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}]) - max_depth = actor_input.get('max_depth', 1) + start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) + max_depth = actor_input.get('maxDepth', 1) - # Exit if no start URLs are provided. if not start_urls: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() - # Open the default request queue for handling URLs to be processed. - request_queue = await Actor.open_request_queue() + # Playwright proxies at the browser level, so one URL is shared per run. + proxy_configuration = await Actor.create_proxy_configuration() + proxy_url = await proxy_configuration.new_url() if proxy_configuration else None - # Enqueue the start URLs. Their crawl depth defaults to 0. + # Open the request queue and enqueue the start URLs (crawl depth 0). + request_queue = await Actor.open_request_queue() for start_url in start_urls: url = start_url.get('url') - Actor.log.info(f'Enqueuing {url} ...') + Actor.log.info(f'Enqueuing start URL: {url}') await request_queue.add_request(Request.from_url(url)) + # Cap the crawl; raise or remove to follow more pages. + max_requests = 50 + handled_requests = 0 + Actor.log.info('Launching Playwright...') - # Launch Playwright and open a new browser context. async with async_playwright() as playwright: - # Configure the browser to launch in headless mode as per Actor configuration. browser = await playwright.chromium.launch( headless=Actor.configuration.headless, - args=['--disable-gpu'], + proxy=to_playwright_proxy(proxy_url) if proxy_url else None, + args=['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu'], ) context = await browser.new_context() - # Process the URLs from the request queue. 
- while request := await request_queue.fetch_next_request(): + while handled_requests < max_requests and ( + request := await request_queue.fetch_next_request() + ): + handled_requests += 1 url = request.url - - # Read the crawl depth tracked by the request itself. depth = request.crawl_depth Actor.log.info(f'Scraping {url} (depth={depth}) ...') try: - # Fetch the page and extract its data and nested links. data, links = await scrape_page(context, url) - - # Store the extracted data to the default dataset. await Actor.push_data(data) - - # If we are not too deep yet, enqueue the links we found. - if depth < max_depth: - for link_url in links: - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url(link_url) - new_request.crawl_depth = depth + 1 - await request_queue.add_request(new_request) + Actor.log.info( + f'Stored data from {url} ' + f'(title={data["title"]!r}, {len(links)} links found).' + ) + await enqueue_links( + request_queue, links, depth=depth, max_depth=max_depth + ) except Exception: Actor.log.exception(f'Cannot extract data from {url}.') finally: - # Mark the request as handled so it is not processed again. await request_queue.mark_request_as_handled(request) diff --git a/docs/03_guides/code/04_selenium.py b/docs/03_guides/code/04_selenium.py index 42dc3509..8bf08817 100644 --- a/docs/03_guides/code/04_selenium.py +++ b/docs/03_guides/code/04_selenium.py @@ -1,113 +1,191 @@ import asyncio +import json +from pathlib import Path +from tempfile import mkdtemp from typing import Any -from urllib.parse import urljoin +from urllib.parse import urljoin, urlsplit +from zipfile import ZipFile from selenium import webdriver from selenium.webdriver.chrome.options import Options as ChromeOptions from selenium.webdriver.common.by import By from apify import Actor, Request +from apify.storages import RequestQueue -# To run this Actor locally, you need to have the Selenium Chromedriver installed. 
-# Follow the installation guide at: +# To run locally, install the Selenium Chromedriver: # https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/ -# When running on the Apify platform, the Chromedriver is already included -# in the Actor's Docker image. +# On the Apify platform it is already in the Actor's Docker image. -def scrape_page(driver: webdriver.Chrome, url: str) -> tuple[dict[str, Any], list[str]]: - """Navigate to a page with Selenium, extract its data, and collect its links. +def proxy_auth_extension(proxy_url: str) -> str: + """Build a Chrome extension that routes Chrome through an authenticated proxy.""" + parts = urlsplit(proxy_url) + + manifest = { + 'name': 'Apify Proxy', + 'version': '1.0.0', + 'manifest_version': 3, + 'permissions': ['proxy', 'webRequest', 'webRequestAuthProvider'], + 'host_permissions': ['<all_urls>'], + 'background': {'service_worker': 'background.js'}, + 'minimum_chrome_version': '108', + } + + # The service worker sets the proxy and answers the auth challenge. 
+ proxy_config = json.dumps( + { + 'mode': 'fixed_servers', + 'rules': { + 'singleProxy': { + 'scheme': parts.scheme, + 'host': parts.hostname, + 'port': parts.port, + }, + }, + } + ) + credentials = json.dumps( + {'username': parts.username or '', 'password': parts.password or ''} + ) + background = ( + 'chrome.proxy.settings.set(' + '{value: ' + proxy_config + ', scope: "regular"});\n' + 'chrome.webRequest.onAuthRequired.addListener(\n' + ' () => ({authCredentials: ' + credentials + '}),\n' + ' {urls: ["<all_urls>"]},\n' + ' ["blocking"],\n' + ');\n' + ) + + extension_path = Path(mkdtemp()) / 'apify_proxy.zip' + with ZipFile(extension_path, 'w') as archive: + archive.writestr('manifest.json', json.dumps(manifest)) + archive.writestr('background.js', background) + return str(extension_path) + + +def build_chrome_driver(proxy_url: str | None = None) -> webdriver.Chrome: + """Create a headless Chrome WebDriver, optionally routed through a proxy.""" + chrome_options = ChromeOptions() + + if Actor.configuration.headless: + # The new headless mode is required to load the proxy extension. + chrome_options.add_argument('--headless=new') + + chrome_options.add_argument('--no-sandbox') + chrome_options.add_argument('--disable-dev-shm-usage') + chrome_options.add_argument('--disable-gpu') + + if proxy_url: + chrome_options.add_extension(proxy_auth_extension(proxy_url)) + chrome_options.add_argument( + '--disable-features=DisableLoadExtensionCommandLineSwitch' + ) + + return webdriver.Chrome(options=chrome_options) + - These are blocking WebDriver calls, so the Actor's main loop runs this helper - in a worker thread via `asyncio.to_thread`. It returns the extracted data - together with the links found on the page, so `main` only has to decide what - to store and what to enqueue. 
- """ +def scrape_page(driver: webdriver.Chrome, url: str) -> tuple[dict[str, Any], list[str]]: + """Navigate to the URL with Selenium and return its data and same-site links.""" driver.get(url) - # Extract the desired data. data = { 'url': url, 'title': driver.title, + 'h1s': [el.text for el in driver.find_elements(By.TAG_NAME, 'h1')], + 'h2s': [el.text for el in driver.find_elements(By.TAG_NAME, 'h2')], + 'h3s': [el.text for el in driver.find_elements(By.TAG_NAME, 'h3')], } - # Collect absolute links found on the page so the caller can enqueue them. + # Keep only absolute links on the same host. links: list[str] = [] + host = urlsplit(url).netloc for link in driver.find_elements(By.TAG_NAME, 'a'): link_url = urljoin(url, link.get_attribute('href')) - if link_url.startswith(('http://', 'https://')): + if not link_url.startswith(('http://', 'https://')): + continue + if urlsplit(link_url).netloc == host: links.append(link_url) return data, links +async def enqueue_links( + request_queue: RequestQueue, + links: list[str], + *, + depth: int, + max_depth: int, +) -> None: + """Enqueue the links one level deeper, unless max_depth was reached.""" + if depth >= max_depth: + return + + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + request = Request.from_url(link_url) + request.crawl_depth = depth + 1 + await request_queue.add_request(request) + + async def main() -> None: - # Enter the context of the Actor. async with Actor: - # Retrieve the Actor input, and use default values if not provided. + # Read the Actor input. actor_input = await Actor.get_input() or {} - start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}]) - max_depth = actor_input.get('max_depth', 1) + start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) + max_depth = actor_input.get('maxDepth', 1) - # Exit if no start URLs are provided. 
if not start_urls: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() - # Open the default request queue for handling URLs to be processed. - request_queue = await Actor.open_request_queue() + # Selenium proxies at the browser level, so one URL is shared per run. + proxy_configuration = await Actor.create_proxy_configuration() - # Enqueue the start URLs. Their crawl depth defaults to 0. + # Open the request queue and enqueue the start URLs (crawl depth 0). + request_queue = await Actor.open_request_queue() for start_url in start_urls: url = start_url.get('url') - Actor.log.info(f'Enqueuing {url} ...') + Actor.log.info(f'Enqueuing start URL: {url}') await request_queue.add_request(Request.from_url(url)) - # Launch a new Selenium Chrome WebDriver and configure it. - Actor.log.info('Launching Chrome WebDriver...') - chrome_options = ChromeOptions() - - if Actor.configuration.headless: - chrome_options.add_argument('--headless') + # Cap the crawl; raise or remove to follow more pages. + max_requests = 50 + handled_requests = 0 - chrome_options.add_argument('--no-sandbox') - chrome_options.add_argument('--disable-dev-shm-usage') - driver = webdriver.Chrome(options=chrome_options) + # Fresh proxy URL for the run (None if no proxy). + proxy_url = None + if proxy_configuration: + proxy_url = await proxy_configuration.new_url() - # Test WebDriver setup by navigating to an example page. - driver.get('http://www.example.com') - if driver.title != 'Example Domain': - raise ValueError('Failed to open example page.') + Actor.log.info('Launching Chrome WebDriver...') + driver = build_chrome_driver(proxy_url) - # Process the URLs from the request queue. - while request := await request_queue.fetch_next_request(): + while handled_requests < max_requests and ( + request := await request_queue.fetch_next_request() + ): + handled_requests += 1 url = request.url - - # Read the crawl depth tracked by the request itself. 
depth = request.crawl_depth Actor.log.info(f'Scraping {url} (depth={depth}) ...') try: - # Fetch the page and extract its data and nested links. The blocking - # WebDriver calls run in a worker thread to keep the loop responsive. + # Blocking WebDriver calls run in a worker thread. data, links = await asyncio.to_thread(scrape_page, driver, url) - - # Store the extracted data to the default dataset. await Actor.push_data(data) - - # If we are not too deep yet, enqueue the links we found. - if depth < max_depth: - for link_url in links: - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url(link_url) - new_request.crawl_depth = depth + 1 - await request_queue.add_request(new_request) + Actor.log.info( + f'Stored data from {url} ' + f'(title={data["title"]!r}, {len(links)} links found).' + ) + await enqueue_links( + request_queue, links, depth=depth, max_depth=max_depth + ) except Exception: Actor.log.exception(f'Cannot extract data from {url}.') finally: - # Mark the request as handled so it is not processed again. await request_queue.mark_request_as_handled(request) driver.quit() diff --git a/docs/03_guides/code/05_crawlee_beautifulsoup.py b/docs/03_guides/code/05_crawlee_beautifulsoup.py index 4d3a81d7..d3767109 100644 --- a/docs/03_guides/code/05_crawlee_beautifulsoup.py +++ b/docs/03_guides/code/05_crawlee_beautifulsoup.py @@ -1,22 +1,19 @@ import asyncio from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +from crawlee.router import Router from apify import Actor -# Create a crawler. -crawler = BeautifulSoupCrawler( - # Limit the crawl to max requests. Remove or increase it for crawling all links. - max_requests_per_crawl=50, -) +# Define the router up front; the crawler is created later in `main`. +router = Router[BeautifulSoupCrawlingContext]() -# Define a request handler, which will be called for every request. -@crawler.router.default_handler +# Handler called for every request. 
+@router.default_handler async def request_handler(context: BeautifulSoupCrawlingContext) -> None: - Actor.log.info(f'Scraping {context.request.url}...') + Actor.log.info(f'Scraping {context.request.url} ...') - # Extract the desired data. data = { 'url': context.request.url, 'title': context.soup.title.string if context.soup.title else None, @@ -25,29 +22,38 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: 'h3s': [h3.text for h3 in context.soup.find_all('h3')], } - # Store the extracted data to the default dataset. await context.push_data(data) + Actor.log.info(f'Stored data from {context.request.url} (title={data["title"]!r}).') - # Enqueue additional links found on the current page. + # Enqueue links found on the page. await context.enqueue_links(strategy='same-domain') async def main() -> None: - # Enter the context of the Actor. async with Actor: - # Retrieve the Actor input, and use default values if not provided. + # Read the Actor input. actor_input = await Actor.get_input() or {} start_urls = [ url.get('url') - for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}]) + for url in actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) ] - # Exit if no start URLs are provided. if not start_urls: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() - # Run the crawler with the starting requests. + # Crawlee rotates the proxy URL per request on its own. + proxy_configuration = await Actor.create_proxy_configuration() + if proxy_configuration is None: + raise RuntimeError('Failed to create the proxy configuration.') + + crawler = BeautifulSoupCrawler( + proxy_configuration=proxy_configuration, + request_handler=router, + # Cap the crawl; remove or increase to follow all links. 
+ max_requests_per_crawl=50, + ) + await crawler.run(start_urls) diff --git a/docs/03_guides/code/05_crawlee_parsel.py b/docs/03_guides/code/05_crawlee_parsel.py index 31f39d8b..32723b00 100644 --- a/docs/03_guides/code/05_crawlee_parsel.py +++ b/docs/03_guides/code/05_crawlee_parsel.py @@ -1,22 +1,19 @@ import asyncio from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from crawlee.router import Router from apify import Actor -# Create a crawler. -crawler = ParselCrawler( - # Limit the crawl to max requests. Remove or increase it for crawling all links. - max_requests_per_crawl=50, -) +# Define the router up front; the crawler is created later in `main`. +router = Router[ParselCrawlingContext]() -# Define a request handler, which will be called for every request. -@crawler.router.default_handler +# Handler called for every request. +@router.default_handler async def request_handler(context: ParselCrawlingContext) -> None: - Actor.log.info(f'Scraping {context.request.url}...') + Actor.log.info(f'Scraping {context.request.url} ...') - # Extract the desired data. data = { 'url': context.request.url, 'title': context.selector.xpath('//title/text()').get(), @@ -25,29 +22,38 @@ async def request_handler(context: ParselCrawlingContext) -> None: 'h3s': context.selector.xpath('//h3/text()').getall(), } - # Store the extracted data to the default dataset. await context.push_data(data) + Actor.log.info(f'Stored data from {context.request.url} (title={data["title"]!r}).') - # Enqueue additional links found on the current page. + # Enqueue links found on the page. await context.enqueue_links(strategy='same-domain') async def main() -> None: - # Enter the context of the Actor. async with Actor: - # Retrieve the Actor input, and use default values if not provided. + # Read the Actor input. 
actor_input = await Actor.get_input() or {} start_urls = [ url.get('url') - for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}]) + for url in actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) ] - # Exit if no start URLs are provided. if not start_urls: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() - # Run the crawler with the starting requests. + # Crawlee rotates the proxy URL per request on its own. + proxy_configuration = await Actor.create_proxy_configuration() + if proxy_configuration is None: + raise RuntimeError('Failed to create the proxy configuration.') + + crawler = ParselCrawler( + proxy_configuration=proxy_configuration, + request_handler=router, + # Cap the crawl; remove or increase to follow all links. + max_requests_per_crawl=50, + ) + await crawler.run(start_urls) diff --git a/docs/03_guides/code/05_crawlee_playwright.py b/docs/03_guides/code/05_crawlee_playwright.py index be4ea29e..56337a31 100644 --- a/docs/03_guides/code/05_crawlee_playwright.py +++ b/docs/03_guides/code/05_crawlee_playwright.py @@ -1,25 +1,19 @@ import asyncio from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext +from crawlee.router import Router from apify import Actor -# Create a crawler. -crawler = PlaywrightCrawler( - # Limit the crawl to max requests. Remove or increase it for crawling all links. - max_requests_per_crawl=50, - # Run the browser in a headless mode. - headless=True, - browser_launch_options={'args': ['--disable-gpu']}, -) +# Define the router up front; the crawler is created later in `main`. +router = Router[PlaywrightCrawlingContext]() -# Define a request handler, which will be called for every request. -@crawler.router.default_handler +# Handler called for every request. 
+@router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: - Actor.log.info(f'Scraping {context.request.url}...') + Actor.log.info(f'Scraping {context.request.url} ...') - # Extract the desired data. data = { 'url': context.request.url, 'title': await context.page.title(), @@ -28,29 +22,43 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: 'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()], } - # Store the extracted data to the default dataset. await context.push_data(data) + Actor.log.info(f'Stored data from {context.request.url} (title={data["title"]!r}).') - # Enqueue additional links found on the current page. + # Enqueue links found on the page. await context.enqueue_links(strategy='same-domain') async def main() -> None: - # Enter the context of the Actor. async with Actor: - # Retrieve the Actor input, and use default values if not provided. + # Read the Actor input. actor_input = await Actor.get_input() or {} start_urls = [ url.get('url') - for url in actor_input.get('start_urls', [{'url': 'https://apify.com'}]) + for url in actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) ] - # Exit if no start URLs are provided. if not start_urls: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() - # Run the crawler with the starting requests. + # Crawlee rotates the proxy URL per request on its own. + proxy_configuration = await Actor.create_proxy_configuration() + if proxy_configuration is None: + raise RuntimeError('Failed to create the proxy configuration.') + + # Common Chrome flags for running the browser in a container. + browser_args = ['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu'] + + crawler = PlaywrightCrawler( + proxy_configuration=proxy_configuration, + request_handler=router, + # Cap the crawl; remove or increase to follow all links. 
+ max_requests_per_crawl=50, + headless=True, + browser_launch_options={'args': browser_args}, + ) + await crawler.run(start_urls) diff --git a/docs/03_guides/code/07_webserver.py b/docs/03_guides/code/12_webserver.py similarity index 87% rename from docs/03_guides/code/07_webserver.py rename to docs/03_guides/code/12_webserver.py index 66ecfe3c..1cb23c1f 100644 --- a/docs/03_guides/code/07_webserver.py +++ b/docs/03_guides/code/12_webserver.py @@ -10,7 +10,7 @@ class RequestHandler(BaseHTTPRequestHandler): """A handler that prints the number of processed items on every GET request.""" - def do_get(self) -> None: + def do_GET(self) -> None: self.log_request() self.send_response(200) self.end_headers() @@ -18,7 +18,7 @@ def do_get(self) -> None: def run_server() -> None: - """Start the HTTP server on the provided port, and save a reference to the server.""" + """Start the HTTP server and keep a reference to it.""" global http_server with ThreadingHTTPServer( ('', Actor.configuration.web_server_port), RequestHandler @@ -43,7 +43,7 @@ async def main() -> None: if http_server is None: raise RuntimeError('HTTP server not started') - # Signal the HTTP server to shut down, and wait for it to finish. + # Signal the server to shut down and wait. http_server.shutdown() await run_server_task diff --git a/docs/03_guides/code/12_webserver_fastapi.py b/docs/03_guides/code/12_webserver_fastapi.py new file mode 100644 index 00000000..08768eb0 --- /dev/null +++ b/docs/03_guides/code/12_webserver_fastapi.py @@ -0,0 +1,48 @@ +import asyncio + +import uvicorn +from fastapi import FastAPI + +from apify import Actor + +# Counter the server reports and the Actor updates. +processed_items = 0 + +# FastAPI app with a single endpoint. 
+app = FastAPI() + + +@app.get('/') +async def index() -> dict[str, int]: + """Respond to every GET request with the number of processed items.""" + return {'processed_items': processed_items} + + +async def main() -> None: + global processed_items + async with Actor: + # Serve the app on the platform's web server port; 0.0.0.0 exposes it. + config = uvicorn.Config( + app, + host='0.0.0.0', # noqa: S104 + port=Actor.configuration.web_server_port, + ) + server = uvicorn.Server(config) + + # Run the server in the background. + server_task = asyncio.create_task(server.serve()) + Actor.log.info(f'Server running at {Actor.configuration.web_server_url}') + + # Simulate work, updating the reported counter. + for _ in range(100): + await asyncio.sleep(1) + processed_items += 1 + Actor.log.info(f'Processed items: {processed_items}') + + # Signal the server to shut down and wait. + server.should_exit = True + await server_task + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/scrapy_project/src/__main__.py b/docs/03_guides/code/scrapy_project/src/__main__.py index 807447c9..f9b27ed5 100644 --- a/docs/03_guides/code/scrapy_project/src/__main__.py +++ b/docs/03_guides/code/scrapy_project/src/__main__.py @@ -7,7 +7,7 @@ # Import your main Actor coroutine here. from .main import main -# Ensure the location to the Scrapy settings module is defined. +# Point Scrapy at the settings module. os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings' diff --git a/docs/03_guides/code/scrapy_project/src/main.py b/docs/03_guides/code/scrapy_project/src/main.py index d8b67984..b234b171 100644 --- a/docs/03_guides/code/scrapy_project/src/main.py +++ b/docs/03_guides/code/scrapy_project/src/main.py @@ -14,16 +14,16 @@ async def main() -> None: """Apify Actor main coroutine for executing the Scrapy spider.""" async with Actor: - # Retrieve and process Actor input. + # Read the Actor input. 
actor_input = await Actor.get_input() or {} start_urls = [url['url'] for url in actor_input.get('startUrls', [])] allowed_domains = actor_input.get('allowedDomains') proxy_config = actor_input.get('proxyConfiguration') - # Apply Apify settings, which will override the Scrapy project settings. + # Apply Apify settings (override the Scrapy project settings). settings = apply_apify_settings(proxy_config=proxy_config) - # Create AsyncCrawlerRunner and execute the Scrapy spider. + # Run the Scrapy spider. crawler_runner = AsyncCrawlerRunner(settings) await crawler_runner.crawl( Spider, diff --git a/docs/03_guides/code/scrapy_project/src/settings.py b/docs/03_guides/code/scrapy_project/src/settings.py index 5c0e56e3..67ae1a03 100644 --- a/docs/03_guides/code/scrapy_project/src/settings.py +++ b/docs/03_guides/code/scrapy_project/src/settings.py @@ -5,7 +5,7 @@ ROBOTSTXT_OBEY = True SPIDER_MODULES = ['src.spiders'] TELNETCONSOLE_ENABLED = False -# Do not change the Twisted reactor unless you really know what you are doing. +# Don't change the Twisted reactor unless you know what you're doing. TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor' HTTPCACHE_ENABLED = True HTTPCACHE_EXPIRATION_SECS = 7200 diff --git a/docs/03_guides/code/scrapy_project/src/spiders/title.py b/docs/03_guides/code/scrapy_project/src/spiders/title.py index 7223a53d..8111ee31 100644 --- a/docs/03_guides/code/scrapy_project/src/spiders/title.py +++ b/docs/03_guides/code/scrapy_project/src/spiders/title.py @@ -14,11 +14,7 @@ class TitleSpider(Spider): - """A spider that scrapes web pages to extract titles and discover new links. - - This spider retrieves the content of the <title> element from each page and queues - any valid hyperlinks for further crawling. - """ + """A spider that extracts page titles and queues links for further crawling.""" name = 'title_spider' @@ -32,36 +28,21 @@ def __init__( *args: Any, **kwargs: Any, ) -> None: - """A default constructor. 
- - Args: - start_urls: URLs to start the scraping from. - allowed_domains: Domains that the scraper is allowed to crawl. - *args: Additional positional arguments. - **kwargs: Additional keyword arguments. - """ + """Store the start URLs and allowed domains.""" super().__init__(*args, **kwargs) self.start_urls = start_urls self.allowed_domains = allowed_domains def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]: - """Parse the web page response. - - Args: - response: The web page response. - - Yields: - Yields scraped `TitleItem` and new `Request` objects for links. - """ + """Yield a `TitleItem` and a `Request` for each link on the page.""" self.logger.info('TitleSpider is parsing %s...', response) - # Extract and yield the TitleItem + # Yield the title item. url = response.url title = response.css('title::text').extract_first() yield TitleItem(url=url, title=title) - # Extract all links from the page, create `Request` objects out of them, - # and yield them. + # Yield a request for each link. for link_href in response.css('a::attr("href")'): link_url = urljoin(response.url, link_href.get()) if link_url.startswith(('http://', 'https://')):