
Commit ddf0912

fix: improve the Python test suite and fix solutions using Crawlee (see #2112)
1 parent 1a41bb3

File tree

4 files changed: +69 -59 lines


sources/academy/webscraping/scraping_basics_python/exercises/crawlee_f1_drivers.py

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ async def handle_driver(context: BeautifulSoupCrawlingContext) -> None:
     )
 
     await crawler.run(["https://www.f1academy.com/Racing-Series/Drivers"])
-    await crawler.export_data_json(path="dataset.json", ensure_ascii=False, indent=2)  # type: ignore[attr-defined]
+    await crawler.export_data("dataset.json")
 
 
 if __name__ == "__main__":
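
Both Crawlee exercise solutions make the same one-line swap (the same hunk appears in crawlee_netflix_ratings.py below): the removed export_data_json() call, which needed a type: ignore comment to satisfy type checking, becomes the supported export_data() method, which infers the output format from the file extension. A minimal sketch of the resulting pattern, assuming a recent Crawlee release (import paths have moved between versions) and a simplified, hypothetical handler:

# Requires the crawlee[beautifulsoup] extra, as in the test suite below.
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def handle(context: BeautifulSoupCrawlingContext) -> None:
        # Push one record per page into Crawlee's default dataset.
        title = context.soup.title.string if context.soup.title else None
        await context.push_data({"url": context.request.url, "title": title})

    await crawler.run(["https://www.f1academy.com/Racing-Series/Drivers"])
    # export_data() writes the default dataset to disk, inferring JSON
    # from the .json extension, with no keyword arguments needed.
    await crawler.export_data("dataset.json")


if __name__ == "__main__":
    asyncio.run(main())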

sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ async def handle_imdb(context: BeautifulSoupCrawlingContext) -> None:
     )
 
     await crawler.run(["https://www.netflix.com/tudum/top10"])
-    await crawler.export_data_json(path="dataset.json", ensure_ascii=False, indent=2)  # type: ignore[attr-defined]
+    await crawler.export_data("dataset.json")
 
 
 if __name__ == "__main__":

sources/academy/webscraping/scraping_basics_python/exercises/products.json

Lines changed: 0 additions & 12 deletions
This file was deleted.
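
The static fixture could be deleted because, as the test.bats changes below show, the "filters products from JSON" test now writes its own products.json before running the script, and the new teardown() removes it after each test. The exercise script itself is not part of this diff; a purely hypothetical sketch of process_products_json.py, consistent only with the fixture and the test's expected output, assuming a 50000 price threshold:

import json

# Hypothetical reconstruction: the real exercise script isn't shown in this commit.
with open("products.json") as file:
    products = json.load(file)

for product in products:
    # Keep only expensive products; the 50000 threshold is an assumption.
    if product["price"] > 50000:
        print(
            f"{{ title: '{product['title']}', "
            f"minPrice: {product['minPrice']}, "
            f"price: {product['price']} }}"
        )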

sources/academy/webscraping/scraping_basics_python/exercises/test.bats

Lines changed: 67 additions & 45 deletions
@@ -2,122 +2,144 @@ setup_file() {
     cd "$BATS_TEST_DIRNAME"
 }
 
-teardown_file() {
-    rm -rf storage dataset.json
+teardown() {
+    rm -rf products.json storage dataset.json
 }
 
-# retry_run() {
-#     for attempt in 1 2 3; do
-#         run "$@"
-#         (( status == 0 )) && return 0
-#         sleep 1
-#     done
-#     return "$status"
-# }
-
 @test "outputs the HTML with Star Wars products" {
     run uv run --with=httpx python lego.py
+
     [[ "$output" == *"Millennium Falcon"* ]]
 }
 
 @test "counts the number of F1 Academy teams" {
     run uv run --with=httpx --with=beautifulsoup4 python f1academy_teams.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == "6" ]]
 }
 
 @test "counts the number of F1 Academy drivers" {
     run uv run --with=httpx --with=beautifulsoup4 python f1academy_drivers.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == "18" ]]
 }
 
 @test "lists African countries" {
     run uv run --with=httpx --with=beautifulsoup4 python wikipedia_countries.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]]
+    [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
 @test "lists African countries with a single selector" {
     run uv run --with=httpx --with=beautifulsoup4 python wikipedia_countries_single_selector.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]]
+    [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
 @test "lists Guardian F1 article titles" {
     run uv run --with=httpx --with=beautifulsoup4 python guardian_f1_titles.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == *' F1 '* ]]
+    [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
 @test "prints warehouse stock counts" {
     run uv run --with=httpx --with=beautifulsoup4 python warehouse_units.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]]
+    [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]]
+    [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
 @test "prints warehouse stock counts using regex" {
     run uv run --with=httpx --with=beautifulsoup4 python warehouse_units_regex.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]]
+    [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]]
+    [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
 @test "prints Guardian F1 titles with publish dates" {
     run uv run --with=httpx --with=beautifulsoup4 python guardian_publish_dates.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == *' F1 '* ]]
+    [[ "$output" == *' | Sun '* ]]  # has info about date, Sundays are very likely
+    [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
 @test "filters products from JSON" {
+    echo '[{"title":"Premium Speakers","minPrice":75000,"price":75000},{"title":"Budget Headphones","minPrice":25000,"price":25000}]' > products.json
+
     run uv run python process_products_json.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == "{ title: 'Premium Speakers', minPrice: 75000, price: 75000 }" ]]
 }
 
 @test "lists Wikipedia country links" {
     run uv run --with=httpx --with=beautifulsoup4 python wikipedia_country_links.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == *$'https://en.wikipedia.org/wiki/Algeria\nhttps://en.wikipedia.org/wiki/Angola\n'* ]]
+    [[ "$output" == *$'https://en.wikipedia.org/wiki/R%C3%A9union\n'* ]]
+    [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
 @test "lists Guardian F1 article links" {
     run uv run --with=httpx --with=beautifulsoup4 python guardian_f1_links.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == *'https://www.theguardian.com/sport/'* ]]
+    [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
 @test "prints Wikipedia calling codes" {
     run uv run --with=httpx --with=beautifulsoup4 python wikipedia_calling_codes.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == *$'https://en.wikipedia.org/wiki/Comoros +269\n'* ]]
+    [[ "$output" == *$'https://en.wikipedia.org/wiki/Sahrawi_Arab_Democratic_Republic null\n'* ]]
+    [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
 @test "lists Guardian F1 authors" {
     run uv run --with=httpx --with=beautifulsoup4 python guardian_f1_authors.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == *' F1 '* ]]
+    [[ "$output" == *'Giles Richards: '* ]]  # writes most of them (we'll have to change this if they fire him)
+    [[ "$output" == *'Guardian sport: '* || "$output" == *'PM Media: '* ]]
+    [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
 @test "lists Python database jobs" {
     run uv run --with=httpx --with=beautifulsoup4 python python_jobs_database.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == *"'title': '"* ]]
+    [[ "$output" == *"'company': '"* ]]
+    [[ "$output" == *"'url': 'https://www.python.org/jobs/"* ]]
+    [[ "$output" == *"'posted_on': datetime.date("* ]]
}
 
 @test "finds the shortest CNN sports article" {
     run uv run --with=httpx --with=beautifulsoup4 python cnn_sports_shortest_article.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == 'https://edition.cnn.com/'* ]]
 }
 
 @test "scrapes F1 Academy driver details with Crawlee" {
     run uv run --with=crawlee[beautifulsoup] python crawlee_f1_drivers.py
-    [[ -n "$output" || -f dataset.json ]]
-    rm -f dataset.json
+
+    (( status == 0 ))
+    [[ -f dataset.json ]]
+    [[ $(cat dataset.json | jq '. | length') == "18" ]]
+    [[ $(cat dataset.json | jq -c '.[0] | keys') == '["dob","instagram_url","name","nationality","team","url"]' ]]
+    [[ $(cat dataset.json | jq '.[].url') == *"https://www.f1academy.com/Racing-Series/Drivers/"* ]]
 }
 
 @test "scrapes Netflix ratings with Crawlee" {
     run uv run --with=crawlee[beautifulsoup] python crawlee_netflix_ratings.py
-    [[ -n "$output" || -f dataset.json ]]
-    rm -f dataset.json
+
+    (( status == 0 ))
+    [[ -f dataset.json ]]
+    [[ $(cat dataset.json | jq '. | length') == "10" ]]
+    [[ $(cat dataset.json | jq -c '.[0] | keys') == '["url","title","rating"]' ]]
+    [[ $(cat dataset.json | jq '.[].url') == *"https://www.imdb.com/title/"* ]]
 }
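
The Crawlee tests now assert on the exported dataset instead of merely checking that something was printed: the jq one-liners check the record count, the key set of the first record, and the presence of an expected URL prefix. For readers who don't use jq, a rough Python equivalent of the F1 Academy assertions (values copied from the test above):

import json

with open("dataset.json") as file:
    records = json.load(file)

# 18 drivers scraped in total.
assert len(records) == 18

# jq's `keys` returns keys sorted alphabetically, hence sorted() here.
assert sorted(records[0]) == ["dob", "instagram_url", "name", "nationality", "team", "url"]

# The bats glob only requires the prefix to appear somewhere in the
# concatenated URLs, so any() mirrors it more closely than all() would.
assert any(
    "https://www.f1academy.com/Racing-Series/Drivers/" in record["url"]
    for record in records
)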
