-
Notifications
You must be signed in to change notification settings - Fork 15
Expand file tree
/
Copy pathcrawl_basic_async.py
More file actions
36 lines (30 loc) · 1.25 KB
/
crawl_basic_async.py
File metadata and controls
36 lines (30 loc) · 1.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# Load environment variables from a local .env file (presumably the
# ScrapeGraphAI API key — confirm against the SDK's config docs) BEFORE
# importing the SDK, so the client can read them at import/construction time.
from dotenv import load_dotenv
load_dotenv()
import asyncio
from scrapegraph_py import AsyncScrapeGraphAI, CrawlRequest
async def main():
    """Start an async crawl of scrapegraphai.com and poll it to completion.

    Kicks off a crawl job (at most 5 pages, depth 2), polls the job every
    2 seconds while its status is "running" (printing progress), and once
    the job reaches a terminal state ("completed" or "failed") prints each
    crawled page with its status.

    Raises:
        Whatever the scrapegraph_py SDK raises on transport failures;
        API-level failures are reported via the response's status/error
        fields and handled here by printing and returning early.
    """
    async with AsyncScrapeGraphAI() as sgai:
        start_res = await sgai.crawl.start(CrawlRequest(
            url="https://scrapegraphai.com/",
            max_pages=5,
            max_depth=2,
        ))
        if start_res.status != "success" or not start_res.data:
            print("Failed to start:", start_res.error)
            return  # nothing to poll

        crawl_id = start_res.data.id
        print("Crawl started:", crawl_id)

        status = start_res.data.status
        get_res = None  # last successful poll response, if any
        while status == "running":
            await asyncio.sleep(2)  # polling interval; avoid hammering the API
            get_res = await sgai.crawl.get(crawl_id)
            if get_res.status != "success" or not get_res.data:
                print("Failed to get status:", get_res.error)
                return
            status = get_res.data.status
            print(f"Progress: {get_res.data.finished}/{get_res.data.total} - {status}")

        if status in ("completed", "failed"):
            # Fix: if the job was already terminal on the start response the
            # loop never ran and get_res was unbound — fetch the final state
            # once so the page report works in that case too.
            if get_res is None:
                get_res = await sgai.crawl.get(crawl_id)
                if get_res.status != "success" or not get_res.data:
                    print("Failed to get status:", get_res.error)
                    return
            print("\nPages crawled:")
            for page in get_res.data.pages:
                print(f" {page.url} - {page.status}")
asyncio.run(main())