
Commit ddf0912

fix: improve the Python test suite and fix solutions using Crawlee (see #2112)
1 parent 1a41bb3

File tree

4 files changed: +69 -59 lines


sources/academy/webscraping/scraping_basics_python/exercises/crawlee_f1_drivers.py

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ async def handle_driver(context: BeautifulSoupCrawlingContext) -> None:
     )
 
     await crawler.run(["https://www.f1academy.com/Racing-Series/Drivers"])
-    await crawler.export_data_json(path="dataset.json", ensure_ascii=False, indent=2)  # type: ignore[attr-defined]
+    await crawler.export_data("dataset.json")
 
 
 if __name__ == "__main__":
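
Both Crawlee exercise solutions make the same one-line swap (the same hunk appears in crawlee_netflix_ratings.py below): the removed export_data_json() call, which needed a type: ignore comment to satisfy type checking, becomes the supported export_data() method, which infers the output format from the file extension. A minimal sketch of the resulting pattern, assuming a recent Crawlee release (import paths have moved between versions) and a simplified, hypothetical handler:

# Requires the crawlee[beautifulsoup] extra, as in the test suite below.
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def handle(context: BeautifulSoupCrawlingContext) -> None:
        # Push one record per page into Crawlee's default dataset.
        title = context.soup.title.string if context.soup.title else None
        await context.push_data({"url": context.request.url, "title": title})

    await crawler.run(["https://www.f1academy.com/Racing-Series/Drivers"])
    # export_data() writes the default dataset to disk, inferring JSON
    # from the .json extension, with no keyword arguments needed.
    await crawler.export_data("dataset.json")


if __name__ == "__main__":
    asyncio.run(main())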

sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ async def handle_imdb(context: BeautifulSoupCrawlingContext) -> None:
     )
 
     await crawler.run(["https://www.netflix.com/tudum/top10"])
-    await crawler.export_data_json(path="dataset.json", ensure_ascii=False, indent=2)  # type: ignore[attr-defined]
+    await crawler.export_data("dataset.json")
 
 
 if __name__ == "__main__":

sources/academy/webscraping/scraping_basics_python/exercises/products.json

Lines changed: 0 additions & 12 deletions
This file was deleted.
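
The static fixture could be deleted because, as the test.bats changes below show, the "filters products from JSON" test now writes its own products.json before running the script, and the new teardown() removes it after each test. The exercise script itself is not part of this diff; a purely hypothetical sketch of process_products_json.py, consistent only with the fixture and the test's expected output, assuming a 50000 price threshold:

import json

# Hypothetical reconstruction: the real exercise script isn't shown in this commit.
with open("products.json") as file:
    products = json.load(file)

for product in products:
    # Keep only expensive products; the 50000 threshold is an assumption.
    if product["price"] > 50000:
        print(
            f"{{ title: '{product['title']}', "
            f"minPrice: {product['minPrice']}, "
            f"price: {product['price']} }}"
        )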

sources/academy/webscraping/scraping_basics_python/exercises/test.bats

Lines changed: 67 additions & 45 deletions
@@ -2,122 +2,144 @@ setup_file() {
     cd "$BATS_TEST_DIRNAME"
 }
 
-teardown_file() {
-    rm -rf storage dataset.json
+teardown() {
+    rm -rf products.json storage dataset.json
 }
 
-# retry_run() {
-#     for attempt in 1 2 3; do
-#         run "$@"
-#         (( status == 0 )) && return 0
-#         sleep 1
-#     done
-#     return "$status"
-# }
-
 @test "outputs the HTML with Star Wars products" {
     run uv run --with=httpx python lego.py
+
     [[ "$output" == *"Millennium Falcon"* ]]
 }
 
 @test "counts the number of F1 Academy teams" {
     run uv run --with=httpx --with=beautifulsoup4 python f1academy_teams.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == "6" ]]
 }
 
 @test "counts the number of F1 Academy drivers" {
     run uv run --with=httpx --with=beautifulsoup4 python f1academy_drivers.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == "18" ]]
 }
 
 @test "lists African countries" {
     run uv run --with=httpx --with=beautifulsoup4 python wikipedia_countries.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]]
+    [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
 @test "lists African countries with a single selector" {
     run uv run --with=httpx --with=beautifulsoup4 python wikipedia_countries_single_selector.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == *$'Comoros\nDemocratic Republic of the Congo\n'* ]]
+    [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
 @test "lists Guardian F1 article titles" {
     run uv run --with=httpx --with=beautifulsoup4 python guardian_f1_titles.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == *' F1 '* ]]
+    [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
 @test "prints warehouse stock counts" {
     run uv run --with=httpx --with=beautifulsoup4 python warehouse_units.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]]
+    [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]]
+    [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
 @test "prints warehouse stock counts using regex" {
     run uv run --with=httpx --with=beautifulsoup4 python warehouse_units_regex.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == *$'JBL Flip 4 Waterproof Portable Bluetooth Speaker | 672\n'* ]]
+    [[ "$output" == *$'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 77\n'* ]]
+    [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
 @test "prints Guardian F1 titles with publish dates" {
     run uv run --with=httpx --with=beautifulsoup4 python guardian_publish_dates.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == *' F1 '* ]]
+    [[ "$output" == *' | Sun '* ]]  # has info about date, Sundays are very likely
+    [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
 @test "filters products from JSON" {
+    echo '[{"title":"Premium Speakers","minPrice":75000,"price":75000},{"title":"Budget Headphones","minPrice":25000,"price":25000}]' > products.json
+
     run uv run python process_products_json.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == "{ title: 'Premium Speakers', minPrice: 75000, price: 75000 }" ]]
 }
 
 @test "lists Wikipedia country links" {
     run uv run --with=httpx --with=beautifulsoup4 python wikipedia_country_links.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == *$'https://en.wikipedia.org/wiki/Algeria\nhttps://en.wikipedia.org/wiki/Angola\n'* ]]
+    [[ "$output" == *$'https://en.wikipedia.org/wiki/R%C3%A9union\n'* ]]
+    [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
 @test "lists Guardian F1 article links" {
     run uv run --with=httpx --with=beautifulsoup4 python guardian_f1_links.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == *'https://www.theguardian.com/sport/'* ]]
+    [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
 @test "prints Wikipedia calling codes" {
     run uv run --with=httpx --with=beautifulsoup4 python wikipedia_calling_codes.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == *$'https://en.wikipedia.org/wiki/Comoros +269\n'* ]]
+    [[ "$output" == *$'https://en.wikipedia.org/wiki/Sahrawi_Arab_Democratic_Republic null\n'* ]]
+    [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
 @test "lists Guardian F1 authors" {
     run uv run --with=httpx --with=beautifulsoup4 python guardian_f1_authors.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == *' F1 '* ]]
+    [[ "$output" == *'Giles Richards: '* ]]  # writes most of them (we'll have to change this if they fire him)
+    [[ "$output" == *'Guardian sport: '* || "$output" == *'PM Media: '* ]]
+    [[ $(echo "$output" | wc -l) -gt 5 ]]
 }
 
 @test "lists Python database jobs" {
     run uv run --with=httpx --with=beautifulsoup4 python python_jobs_database.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == *"'title': '"* ]]
+    [[ "$output" == *"'company': '"* ]]
+    [[ "$output" == *"'url': 'https://www.python.org/jobs/"* ]]
+    [[ "$output" == *"'posted_on': datetime.date("* ]]
}
 
 @test "finds the shortest CNN sports article" {
     run uv run --with=httpx --with=beautifulsoup4 python cnn_sports_shortest_article.py
-    (( status == 0 ))
-    [[ -n "$output" ]]
+
+    [[ "$output" == 'https://edition.cnn.com/'* ]]
 }
 
 @test "scrapes F1 Academy driver details with Crawlee" {
     run uv run --with=crawlee[beautifulsoup] python crawlee_f1_drivers.py
-    [[ -n "$output" || -f dataset.json ]]
-    rm -f dataset.json
+
+    (( status == 0 ))
+    [[ -f dataset.json ]]
+    [[ $(cat dataset.json | jq '. | length') == "18" ]]
+    [[ $(cat dataset.json | jq -c '.[0] | keys') == '["dob","instagram_url","name","nationality","team","url"]' ]]
+    [[ $(cat dataset.json | jq '.[].url') == *"https://www.f1academy.com/Racing-Series/Drivers/"* ]]
 }
 
 @test "scrapes Netflix ratings with Crawlee" {
     run uv run --with=crawlee[beautifulsoup] python crawlee_netflix_ratings.py
-    [[ -n "$output" || -f dataset.json ]]
-    rm -f dataset.json
+
+    (( status == 0 ))
+    [[ -f dataset.json ]]
+    [[ $(cat dataset.json | jq '. | length') == "10" ]]
+    [[ $(cat dataset.json | jq -c '.[0] | keys') == '["url","title","rating"]' ]]
+    [[ $(cat dataset.json | jq '.[].url') == *"https://www.imdb.com/title/"* ]]
 }
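
The Crawlee tests now assert on the exported dataset instead of merely checking that something was printed: the jq one-liners check the record count, the key set of the first record, and the presence of an expected URL prefix. For readers who don't use jq, a rough Python equivalent of the F1 Academy assertions (values copied from the test above):

import json

with open("dataset.json") as file:
    records = json.load(file)

# 18 drivers scraped in total.
assert len(records) == 18

# jq's `keys` returns keys sorted alphabetically, hence sorted() here.
assert sorted(records[0]) == ["dob", "instagram_url", "name", "nationality", "team", "url"]

# The bats glob only requires the prefix to appear somewhere in the
# concatenated URLs, so any() mirrors it more closely than all() would.
assert any(
    "https://www.f1academy.com/Racing-Series/Drivers/" in record["url"]
    for record in records
)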
