-
Notifications
You must be signed in to change notification settings - Fork 108
Expand file tree
/
Copy pathscrape_api_docs.py
More file actions
339 lines (275 loc) · 11 KB
/
scrape_api_docs.py
File metadata and controls
339 lines (275 loc) · 11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.9"
# dependencies = [
# "requests",
# "beautifulsoup4",
# ]
# ///
"""
Scrape Flickr API documentation and store method details locally.
This script fetches all API method documentation from the Flickr website
and saves them as JSON files in the api-docs/ directory for later use
in creating test cases.
"""
import json
import os
import re
import time
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
BASE_URL = "https://www.flickr.com/services/api/"
OUTPUT_DIR = "api-docs"
DELAY_BETWEEN_REQUESTS = 0.5 # Be polite to the server
def get_soup(url, session, timeout=30):
    """Fetch a URL and return a BeautifulSoup object.

    Args:
        url: Absolute URL to fetch.
        session: requests.Session used for the request (connection reuse).
        timeout: Seconds to wait before aborting the request. Added because
            requests has NO default timeout, so a hung server would block
            the scraper forever.

    Returns:
        BeautifulSoup tree parsed with the stdlib "html.parser" backend.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    response = session.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page.
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")
def get_all_method_links(session):
    """Collect name/url pairs for every API method on the main index page."""
    index_page = get_soup(BASE_URL, session)
    found = []
    # Method pages follow the flickr.<group>.<method>.html naming scheme.
    for anchor in index_page.find_all("a", href=True):
        target = anchor["href"]
        if not re.match(r".*flickr\.[a-z]+\.[a-zA-Z.]+\.html$", target):
            continue
        label = anchor.get_text(strip=True)
        # Keep only anchors whose visible text is an actual method name.
        if label.startswith("flickr."):
            found.append({"name": label, "url": urljoin(BASE_URL, target)})
    return found
def sanitize_xml_response(xml_text):
    """
    Fix malformed XML in API response examples.

    Flickr's documentation sometimes contains XML with unescaped quotes
    inside attribute values (e.g., title=""quoted text""). This function
    escapes those stray quotes as &quot; so the example parses as XML.

    Args:
        xml_text: Raw example-response text, possibly None or empty.

    Returns:
        The text with in-value quotes escaped; falsy input returned as-is.
    """
    if not xml_text:
        return xml_text

    # BUG FIX: the previous implementation was a no-op — its "repairs"
    # reassigned identical strings ('"' + s[1:] when s already started with
    # '"'), and its regex required a doubled quote strictly *inside* the
    # value, so it could not even match the documented title=""..."" case.
    #
    # Strategy: an attribute value really ends at the quote that is followed
    # by either another attribute (whitespace + name=") or the end of the
    # tag (optional whitespace + "/>" or ">"). Every quote before that
    # boundary belongs to the value and must be escaped.
    def fix_attr_value(match):
        attr_name = match.group(1)
        value = match.group(2)
        if '"' not in value:
            # Well-formed attribute: leave it untouched.
            return match.group(0)
        return '{}="{}"'.format(attr_name, value.replace('"', '&quot;'))

    return re.sub(
        r'(\w+)="(.*?)"(?=\s+\w+="|\s*/?>)',
        fix_attr_value,
        xml_text,
    )
def parse_method_page(soup, method_name):
    """Parse a method documentation page and extract structured data.

    Args:
        soup: BeautifulSoup tree of one method's documentation page.
        method_name: Dotted API method name, e.g. "flickr.photos.search".

    Returns:
        dict with keys: name, description, authentication, arguments
        (list of dicts), response (example XML string), errors (list of
        dicts). Any section missing from the page is left at its empty
        default.
    """
    data = {
        "name": method_name,
        "description": "",
        "authentication": "",
        "arguments": [],
        "response": "",
        "errors": [],
    }
    # Content is in div.InfoCase (old table-based layout).
    info_case = soup.find("div", class_="InfoCase")
    if not info_case:
        # Unrecognized page layout: return the empty skeleton rather than raise.
        return data
    # Get description - text after h1 but before first h3.
    h1 = info_case.find("h1")
    if h1:
        description_parts = []
        for sibling in h1.next_siblings:
            # Stop at the first section heading; everything before it is prose.
            if sibling.name == "h3":
                break
            if hasattr(sibling, "get_text"):
                text = sibling.get_text(strip=True)
                if text:
                    description_parts.append(text)
            elif isinstance(sibling, str) and sibling.strip():
                # Bare text nodes (NavigableString) have no get_text().
                description_parts.append(sibling.strip())
        data["description"] = " ".join(description_parts)
    # Find all h3 sections: map lowercase heading text -> elements that
    # follow it, up to (but not including) the next h3.
    sections = {}
    current_section = None
    current_content = []
    for elem in info_case.children:
        if hasattr(elem, "name") and elem.name == "h3":
            if current_section:
                sections[current_section] = current_content
            current_section = elem.get_text(strip=True).lower()
            current_content = []
        elif current_section:
            current_content.append(elem)
    # Flush the trailing section (no h3 follows it).
    if current_section:
        sections[current_section] = current_content
    # Parse authentication: flatten the section's text into one string.
    if "authentication" in sections:
        auth_text = []
        for elem in sections["authentication"]:
            if hasattr(elem, "get_text"):
                auth_text.append(elem.get_text(strip=True))
            elif isinstance(elem, str) and elem.strip():
                auth_text.append(elem.strip())
        data["authentication"] = " ".join(auth_text)
    # Parse arguments - they're in a DL (definition list).
    if "arguments" in sections:
        data["arguments"] = parse_arguments_dl(sections["arguments"])
    # Parse example response - preserve XML formatting (no strip=True here).
    if "example response" in sections:
        for elem in sections["example response"]:
            # Skip NavigableString elements (whitespace, etc.)
            if not hasattr(elem, "name") or elem.name is None:
                continue
            # Look for pre or code tags that contain the XML.
            response_text = None
            if elem.name == "pre":
                response_text = elem.get_text()
            elif elem.name == "code":
                response_text = elem.get_text()
            else:
                # Check if pre/code is nested inside this element.
                pre = elem.find("pre")
                if pre:
                    response_text = pre.get_text()
                else:
                    code = elem.find("code")
                    if code:
                        response_text = code.get_text()
            if response_text:
                # Sanitize any malformed XML (e.g., unescaped quotes);
                # take the first XML block found and stop looking.
                data["response"] = sanitize_xml_response(response_text)
                break
    # Parse error codes.
    if "error codes" in sections:
        data["errors"] = parse_errors(sections["error codes"])
    return data
def parse_arguments_dl(content):
    """Extract argument records from the DL elements of an Arguments section.

    Each DT opens a new argument ("name (Required)" / "name (Optional)");
    each DD supplies the description for the argument opened before it.
    """

    def _new_arg(label):
        # Parse "arg_name (Required)" or "arg_name (Optional)".
        m = re.match(r"([a-z_]+)\s*\((Required|Optional)\)", label)
        if m:
            return {
                "name": m.group(1),
                "required": m.group(2) == "Required",
                "description": "",
            }
        # Fallback: treat the first word as the name.
        return {
            "name": label.split()[0] if label else "",
            "required": "Required" in label,
            "description": "",
        }

    parsed = []
    for node in content:
        if not hasattr(node, "name") or node.name != "dl":
            continue
        # Walk the DT/DD pairs; a DT flushes the previous pending argument.
        pending = None
        for item in node.children:
            if not hasattr(item, "name"):
                continue
            if item.name == "dt":
                if pending:
                    parsed.append(pending)
                pending = _new_arg(item.get_text(strip=True))
            elif item.name == "dd" and pending:
                pending["description"] = item.get_text(strip=True)
        if pending:
            parsed.append(pending)
    return parsed
def parse_errors(content):
    """Turn the flat text of an "Error Codes" section into structured records.

    Entries look like "1: Too many tags in ALL query Description here" and
    are delimited by the next "N:" marker (or the end of the text).
    """
    # Flatten every element of the section into one blob of text.
    pieces = []
    for node in content:
        if hasattr(node, "get_text"):
            pieces.append(node.get_text(strip=True))
        elif isinstance(node, str) and node.strip():
            pieces.append(node.strip())
    blob = " ".join(pieces)

    def _split_title(body):
        # Prefer a sentence boundary between title and description;
        # otherwise fall back to the first line.
        sentence = body.split(". ", 1)
        if len(sentence) == 2:
            return sentence[0].strip(), sentence[1].strip()
        first, _, rest = body.partition("\n")
        return first.strip(), rest.strip()

    records = []
    for code, body in re.findall(r"(\d+):\s*([^\d]+?)(?=\d+:|$)", blob):
        title, description = _split_title(body.strip())
        records.append(
            {"code": int(code), "title": title, "description": description}
        )
    return records
def scrape_method(method_info, session):
    """Download and parse the documentation page for a single API method."""
    name = method_info["name"]
    print(f" Fetching {name}...")
    page = get_soup(method_info["url"], session)
    return parse_method_page(page, name)
def save_method(method_data, output_dir):
    """Write one method's parsed data to <output_dir>/<method name>.json.

    The filename is the dotted method name plus ".json"
    (flickr.photos.search -> flickr.photos.search.json).

    Returns:
        The path of the file that was written.
    """
    destination = os.path.join(output_dir, method_data["name"] + ".json")
    # ensure_ascii=False keeps any non-ASCII doc text readable in the file.
    with open(destination, "w", encoding="utf-8") as handle:
        json.dump(method_data, handle, indent=2, ensure_ascii=False)
    return destination
def main():
    """Scrape every Flickr API method's documentation into OUTPUT_DIR."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # One session so HTTP connections are reused across all requests.
    session = requests.Session()
    session.headers.update(
        {
            "User-Agent": "FlickrAPIScraper/1.0 (python-flickr-api test helper)"
        }
    )

    print("Fetching API method list...")
    methods = get_all_method_links(session)
    print(f"Found {len(methods)} API methods")

    # Write an index of all method names alongside the per-method files.
    index = {"methods": [m["name"] for m in methods], "count": len(methods)}
    with open(os.path.join(OUTPUT_DIR, "_index.json"), "w") as f:
        json.dump(index, f, indent=2)

    print(f"\nScraping method documentation to {OUTPUT_DIR}/")
    for i, method in enumerate(methods, 1):
        try:
            filepath = save_method(scrape_method(method, session), OUTPUT_DIR)
            print(f" [{i}/{len(methods)}] Saved {filepath}")
        except Exception as e:
            # Keep going on a single bad page; report and move on.
            print(f" [{i}/{len(methods)}] ERROR scraping {method['name']}: {e}")
        # Be polite: pause between requests.
        time.sleep(DELAY_BETWEEN_REQUESTS)

    print(f"\nDone! Scraped {len(methods)} methods to {OUTPUT_DIR}/")


if __name__ == "__main__":
    main()