diff --git a/env.example b/env.example
index d30d3762..b09efc28 100644
--- a/env.example
+++ b/env.example
@@ -37,3 +37,10 @@
 # https://docs.github.com/en/rest/authentication/authenticating-to-the-rest-api
 
 # GH_TOKEN =
+
+
+# Smithsonian
+
+# https://edan.si.edu/openaccess/apidocs/
+
+# DATA_GOV_API_KEY =
diff --git a/scripts/1-fetch/smithsonian_fetch.py b/scripts/1-fetch/smithsonian_fetch.py
new file mode 100755
index 00000000..2bf5c5bf
--- /dev/null
+++ b/scripts/1-fetch/smithsonian_fetch.py
@@ -0,0 +1,260 @@
+#!/usr/bin/env python
+"""
+Fetch metrics usage from Smithsonian Institution Open Access API.
+"""
+
+# Standard library
+import argparse
+import csv
+import os
+import sys
+import textwrap
+import traceback
+from operator import itemgetter
+
+# Third-party
+import requests
+from pygments import highlight
+from pygments.formatters import TerminalFormatter
+from pygments.lexers import PythonTracebackLexer
+
+# Add parent directory so shared can be imported
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
+
+# First-party/Local
+import shared  # noqa: E402
+
+# Setup
+LOGGER, PATHS = shared.setup(__file__)
+
+# Constants
+DATA_GOV_API_KEY = os.getenv("DATA_GOV_API_KEY")
+FILE_1_METRICS = os.path.join(PATHS["data_phase"], "smithsonian_1_metrics.csv")
+FILE_2_UNITS = os.path.join(PATHS["data_phase"], "smithsonian_2_units.csv")
+HEADER_1_METRICS = [
+    "CC0_RECORDS",
+    "CC0_RECORDS_WITH_CC0_MEDIA",
+    "CC0_MEDIA",
+    "CC0_MEDIA_PERCENTAGE",
+    "TOTAL_OBJECTS",
+]
+HEADER_2_UNITS = [
+    "UNIT",
+    "CC0_RECORDS",
+    "CC0_RECORDS_WITH_CC0_MEDIA",
+    "TOTAL_OBJECTS",
+]
+QUARTER = os.path.basename(PATHS["data_quarter"])
+
+
+def parse_arguments():
+    """
+    Parse command-line options, returns parsed argument namespace.
+    """
+    LOGGER.info("Parsing command-line options")
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--enable-save",
+        action="store_true",
+        help="Enable saving results",
+    )
+    parser.add_argument(
+        "--enable-git",
+        action="store_true",
+        help="Enable git actions (fetch, merge, add, commit, and push)",
+    )
+    args = parser.parse_args()
+    if not args.enable_save and args.enable_git:
+        parser.error("--enable-git requires --enable-save")
+    return args
+
+
+def check_for_completion():
+    """
+    Raise shared.QuantifyingException (exit code 0) if the metrics file has
+    at least one row and the units file has more than 30 rows.
+    """
+    completed_metrics = False
+    completed_units = False
+
+    try:
+        with open(
+            FILE_1_METRICS, "r", encoding="utf-8", newline=""
+        ) as file_obj:
+            reader = csv.DictReader(file_obj, dialect="unix")
+            if len(list(reader)) > 0:
+                completed_metrics = True
+    except FileNotFoundError:
+        pass  # File may not be found without --enable-save, etc.
+
+    try:
+        with open(
+            FILE_2_UNITS, "r", encoding="utf-8", newline=""
+        ) as file_obj:
+            reader = csv.DictReader(file_obj, dialect="unix")
+            # NOTE(review): 30 is presumably the minimum expected number
+            # of unit rows; confirm
+            if len(list(reader)) > 30:
+                completed_units = True
+    except FileNotFoundError:
+        pass  # File may not be found without --enable-save, etc.
+
+    if completed_metrics and completed_units:
+        raise shared.QuantifyingException(
+            f"Data fetch completed for {QUARTER}", 0
+        )
+
+
+def write_data(args, data_metrics, data_units):
+    """
+    Write metrics and units rows to their CSV files if --enable-save was
+    given, returns args.
+    """
+    if not args.enable_save:
+        return args
+
+    # Create data directory for this phase
+    os.makedirs(PATHS["data_phase"], exist_ok=True)
+
+    with open(FILE_1_METRICS, "w", encoding="utf-8", newline="\n") as file_obj:
+        writer = csv.DictWriter(
+            file_obj, fieldnames=HEADER_1_METRICS, dialect="unix"
+        )
+        writer.writeheader()
+        for row in data_metrics:
+            writer.writerow(row)
+
+    with open(FILE_2_UNITS, "w", encoding="utf-8", newline="\n") as file_obj:
+        writer = csv.DictWriter(
+            file_obj, fieldnames=HEADER_2_UNITS, dialect="unix"
+        )
+        writer.writeheader()
+        for row in data_units:
+            writer.writerow(row)
+
+    return args
+
+
+def redact_api_key(text):
+    """
+    Return str(text) with the API key replaced, so it is safe to log.
+    """
+    text = str(text)
+    if DATA_GOV_API_KEY:
+        text = text.replace(DATA_GOV_API_KEY, "<REDACTED>")
+    return text
+
+
+def parse_stats(data):
+    """
+    Convert the API "response" object into metrics and units rows, returns
+    (data_metrics, data_units). Units with zero total objects are omitted.
+    """
+    metrics = data["metrics"]
+    data_metrics = [
+        {
+            "CC0_MEDIA": metrics["CC0_media"],
+            "CC0_MEDIA_PERCENTAGE": metrics["CC0_media_percentage"],
+            "CC0_RECORDS": metrics["CC0_records"],
+            "CC0_RECORDS_WITH_CC0_MEDIA": metrics[
+                "CC0_records_with_CC0_media"
+            ],
+            "TOTAL_OBJECTS": data["total_objects"],
+        }
+    ]
+    data_units = []
+    for unit in data["units"]:
+        if unit["total_objects"] == 0:
+            continue
+        data_units.append(
+            {
+                "UNIT": unit["unit"],
+                "CC0_RECORDS": unit["metrics"]["CC0_records"],
+                "CC0_RECORDS_WITH_CC0_MEDIA": unit["metrics"][
+                    "CC0_records_with_CC0_media"
+                ],
+                "TOTAL_OBJECTS": unit["total_objects"],
+            }
+        )
+    data_units.sort(key=itemgetter("UNIT"))
+    return data_metrics, data_units
+
+
+def query_smithsonian(args, session):
+    """
+    Fetch stats from the API, returns (data_metrics, data_units).
+
+    Raises shared.QuantifyingException (exit code 1) if the API key is not
+    set, the request fails, or the response is missing an expected key.
+    """
+    if not DATA_GOV_API_KEY:
+        raise shared.QuantifyingException(
+            "Authentication (DATA_GOV_API_KEY) required. Please ensure your"
+            " API key is set in .env",
+            1,
+        )
+    LOGGER.info("Fetch data from API")
+    url = "https://api.si.edu/openaccess/api/v1.0/stats"
+    params = {"api_key": DATA_GOV_API_KEY}
+    # The request URL (including api_key) can appear in requests' exception
+    # messages, so redact it and drop the exception context before raising
+    try:
+        with session.get(url, params=params) as response:
+            response.raise_for_status()
+            data = response.json()["response"]
+        data_metrics, data_units = parse_stats(data)
+    except requests.HTTPError as e:
+        message = f"HTTP Error: {redact_api_key(e)}"
+        raise shared.QuantifyingException(message, 1) from None
+    except requests.RequestException as e:
+        message = f"Request Exception: {redact_api_key(e)}"
+        raise shared.QuantifyingException(message, 1) from None
+    except KeyError as e:
+        raise shared.QuantifyingException(f"KeyError: {e}", 1) from e
+    LOGGER.info(f"Fetched stats for {len(data_units)} units")
+    return data_metrics, data_units
+
+
+def main():
+    args = parse_arguments()
+    shared.paths_log(LOGGER, PATHS)
+    check_for_completion()
+    session = shared.get_session()
+    data_metrics, data_units = query_smithsonian(args, session)
+    args = write_data(args, data_metrics, data_units)
+    args = shared.git_add_and_commit(
+        args,
+        PATHS["repo"],
+        PATHS["data_quarter"],
+        f"Add and commit new Smithsonian data for {QUARTER}",
+    )
+    shared.git_push_changes(args, PATHS["repo"])
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except shared.QuantifyingException as e:
+        if e.exit_code == 0:
+            LOGGER.info(e.message)
+        else:
+            LOGGER.error(e.message)
+        sys.exit(e.exit_code)
+    except SystemExit as e:
+        if e.code != 0:
+            LOGGER.error(f"System exit with code: {e.code}")
+        sys.exit(e.code)
+    except KeyboardInterrupt:
+        LOGGER.info("(130) Halted via KeyboardInterrupt.")
+        sys.exit(130)
+    except Exception:
+        traceback_formatted = textwrap.indent(
+            highlight(
+                traceback.format_exc(),
+                PythonTracebackLexer(),
+                TerminalFormatter(),
+            ),
+            "    ",
+        )
+        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
+        sys.exit(1)
diff --git a/sources.md b/sources.md
index 2f559bef..0aae9fc6 100644
--- a/sources.md
+++ b/sources.md
@@ -147,6 +147,21 @@ license_version breakdown.
 - Supported licenses: `by`, `by-nc`, `by-nc-nd`, `by-nc-sa`, `by-nd`, `by-sa`,
   `cc0`, `nc-sampling+`, `pdm`, `sampling+`
 
+## Smithsonian
+
+**Description:** The Smithsonian Institution Open Access API offers a metrics
+API for stats about CC0 objects/media.
+
+**API documentation link:**
+- [metrics - Documentation](https://edan.si.edu/openaccess/apidocs/#api-metrics)
+- [Developer Manual - api.data.gov](https://api.data.gov/docs/developer-manual/)
+
+**API information:**
+- API key required
+- Hourly Limit: 1,000 requests per hour
+- Data available in a JSON format
+
+
 ## Wikipedia
 
 **Description:** The Wikipedia API allows users to query statistics of pages,