diff --git a/scripts/2-process/gcs_process.py b/scripts/2-process/gcs_process.py
index c5d354b7..fefbba0f 100755
--- a/scripts/2-process/gcs_process.py
+++ b/scripts/2-process/gcs_process.py
@@ -311,7 +311,9 @@ def main():
 
     # Count data
     file1_count = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv")
-    count_data = pd.read_csv(file1_count, usecols=["TOOL_IDENTIFIER", "COUNT"])
+    count_data = shared.open_data_file(
+        LOGGER, file1_count, usecols=["TOOL_IDENTIFIER", "COUNT"]
+    )
     process_product_totals(args, count_data)
     process_latest_prior_retired_totals(args, count_data)
     process_totals_by_free_cultural(args, count_data)
@@ -321,8 +323,10 @@ def main():
     file2_language = shared.path_join(
         PATHS["data_1-fetch"], "gcs_2_count_by_language.csv"
     )
-    language_data = pd.read_csv(
-        file2_language, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
+    language_data = shared.open_data_file(
+        LOGGER,
+        file2_language,
+        usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"],
     )
     process_totals_by_language(args, language_data)
 
@@ -330,8 +334,8 @@ def main():
     file3_country = shared.path_join(
         PATHS["data_1-fetch"], "gcs_3_count_by_country.csv"
     )
-    country_data = pd.read_csv(
-        file3_country, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
+    country_data = shared.open_data_file(
+        LOGGER, file3_country, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
     )
     process_totals_by_country(args, country_data)
 
diff --git a/scripts/2-process/github_process.py b/scripts/2-process/github_process.py
index ae9d261a..6009629d 100755
--- a/scripts/2-process/github_process.py
+++ b/scripts/2-process/github_process.py
@@ -178,7 +178,9 @@ def main():
     shared.git_fetch_and_merge(args, PATHS["repo"])
 
     file_count = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv")
-    count_data = pd.read_csv(file_count, usecols=["TOOL_IDENTIFIER", "COUNT"])
+    count_data = shared.open_data_file(
+        LOGGER, file_count, usecols=["TOOL_IDENTIFIER", "COUNT"]
+    )
     process_totals_by_license(args, count_data)
     process_totals_by_restriction(args, count_data)
 
diff --git a/scripts/2-process/wikipedia_process.py b/scripts/2-process/wikipedia_process.py
index 22d5743e..25393e5a 100755
--- a/scripts/2-process/wikipedia_process.py
+++ b/scripts/2-process/wikipedia_process.py
@@ -151,7 +151,9 @@ def main():
     file_count = shared.path_join(
         PATHS["data_1-fetch"], "wikipedia_count_by_languages.csv"
     )
-    count_data = pd.read_csv(file_count, usecols=["LANGUAGE_NAME_EN", "COUNT"])
+    count_data = shared.open_data_file(
+        LOGGER, file_count, usecols=["LANGUAGE_NAME_EN", "COUNT"]
+    )
     process_language_representation(args, count_data)
     process_highest_language_usage(args, count_data)
     process_least_language_usage(args, count_data)
diff --git a/scripts/3-report/gcs_report.py b/scripts/3-report/gcs_report.py
index 105313fa..359796a9 100755
--- a/scripts/3-report/gcs_report.py
+++ b/scripts/3-report/gcs_report.py
@@ -11,7 +11,6 @@
 import traceback
 
 # Third-party
-import pandas as pd
 from pygments import highlight
 from pygments.formatters import TerminalFormatter
 from pygments.lexers import PythonTracebackLexer
@@ -80,7 +79,7 @@ def gcs_intro(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool product"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     total_count = f"{data['Count'].sum():,d}"
     shared.update_readme(
         args,
@@ -111,7 +110,8 @@ def plot_products(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool product"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
+    data = data[::-1]  # reverse order
 
     title = "Products totals and percentages"
 
@@ -156,7 +156,7 @@ def plot_tool_status(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(name_label, ascending=False, inplace=True)
 
     title = "CC legal tools status"
@@ -199,7 +199,7 @@ def plot_latest_tools(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(name_label, ascending=False, inplace=True)
 
     title = "Latest CC legal tools"
@@ -241,7 +241,7 @@ def plot_prior_tools(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(name_label, ascending=False, inplace=True)
 
     title = "Prior CC legal tools"
@@ -286,7 +286,7 @@ def plot_retired_tools(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(name_label, ascending=False, inplace=True)
 
     title = "Retired CC legal tools"
@@ -332,7 +332,7 @@ def plot_countries_highest_usage(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Country"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     total_count = f"{data['Count'].sum():,d}"
     data.sort_values(data_label, ascending=False, inplace=True)
     data = data[:10]  # limit to highest 10
@@ -385,7 +385,7 @@ def plot_languages_highest_usage(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Language"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     total_count = f"{data['Count'].sum():,d}"
     data.sort_values(data_label, ascending=False, inplace=True)
     data = data[:10]  # limit to highest 10
@@ -439,7 +439,7 @@ def plot_free_culture(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Category"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
 
     title = "Approved for Free Cultural Works"
     plt = plot.combined_plot(
diff --git a/scripts/3-report/github_report.py b/scripts/3-report/github_report.py
index 7de0189c..37979175 100755
--- a/scripts/3-report/github_report.py
+++ b/scripts/3-report/github_report.py
@@ -11,7 +11,6 @@
 import traceback
 
 # Third-party
-import pandas as pd
 from pygments import highlight
 from pygments.formatters import TerminalFormatter
 from pygments.lexers import PythonTracebackLexer
@@ -77,11 +76,8 @@ def load_data(args):
         PATHS["data"], f"{selected_quarter}", "1-fetch", "github_1_count.csv"
     )
 
-    if not os.path.exists(file_path):
-        LOGGER.error(f"Data file not found: {file_path}")
-        return pd.DataFrame()
+    data = shared.open_data_file(LOGGER, file_path)
 
-    data = pd.read_csv(file_path)
     LOGGER.info(f"Data loaded from {file_path}")
     return data
 
@@ -97,7 +93,7 @@ def github_intro(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "TOOL_IDENTIFIER"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     total_repositories = data.loc["Total public repositories", "COUNT"]
     cc_total = data[data.index.str.startswith("CC")]["COUNT"].sum()
     cc_percentage = f"{(cc_total / total_repositories) * 100:.2f}%"
@@ -152,7 +148,7 @@ def plot_totals_by_license_type(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "License"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(data_label, ascending=True, inplace=True)
     title = "Totals by license type"
     plt = plot.combined_plot(
@@ -201,7 +197,7 @@ def plot_totals_by_restriction(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Category"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(name_label, ascending=False, inplace=True)
     title = "Totals by restriction"
     plt = plot.combined_plot(
diff --git a/scripts/3-report/wikipedia_report.py b/scripts/3-report/wikipedia_report.py
index 284cb216..83a92fa3 100755
--- a/scripts/3-report/wikipedia_report.py
+++ b/scripts/3-report/wikipedia_report.py
@@ -11,7 +11,6 @@
 import traceback
 
 # Third-party
-import pandas as pd
 from pygments import highlight
 from pygments.formatters import TerminalFormatter
 from pygments.lexers import PythonTracebackLexer
@@ -87,9 +86,11 @@ def wikipedia_intro(args):
     )
     name_label = "LANGUAGE_NAME_EN"
     name_label_top10 = "Language"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     total_articles = data["COUNT"].sum()
-    top10 = pd.read_csv(file_path_top10, index_col=name_label_top10)
+    top10 = shared.open_data_file(
+        LOGGER, file_path_top10, index_col=name_label_top10
+    )
     top10_articles = top10["Count"].sum()
     top10_percentage = (top10_articles / total_articles) * 100
     average_articles = total_articles / len(data)
@@ -131,7 +132,7 @@ def plot_language_representation(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Category"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(data_label, ascending=True, inplace=True)
     title = "Language Representation"
     plt = plot.combined_plot(
@@ -176,7 +177,7 @@ def plot_highest_language_usage(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Language"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(data_label, ascending=True, inplace=True)
     title = "Most represented languages"
     plt = plot.combined_plot(
@@ -219,7 +220,7 @@ def plot_least_language_usage(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Language"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(data_label, ascending=True, inplace=True)
     title = "Least represented languages"
     plt = plot.combined_plot(
diff --git a/scripts/shared.py b/scripts/shared.py
index 541988fc..509801d9 100644
--- a/scripts/shared.py
+++ b/scripts/shared.py
@@ -6,6 +6,7 @@
 from datetime import datetime, timezone
 
 # Third-party
+import pandas as pd
 from git import InvalidGitRepositoryError, NoSuchPathError, Repo
 from pandas import PeriodIndex
 from requests import Session
@@ -66,6 +67,38 @@ def get_session(accept_header=None, session=None):
     return session
 
 
+def open_data_file(
+    logger,
+    file_path,
+    usecols=None,
+    index_col=None,
+):
+    """
+    Open a CSV data file safely and convert expected errors into
+    QuantifyingException. This shared function ensures all process/report
+    scripts benefit from the same error handling.
+    """
+    try:
+        # Reading the file
+        return pd.read_csv(file_path, usecols=usecols, index_col=index_col)
+    # File does not exist
+    except FileNotFoundError:
+        raise QuantifyingException(
+            message=f"Data file not found: {file_path}", exit_code=1
+        )
+    # Empty or invalid CSV file
+    except pd.errors.EmptyDataError:
+        raise QuantifyingException(
+            message=f"CSV file is empty or invalid: {file_path}", exit_code=1
+        )
+    # Permission denied
+    except PermissionError:
+        raise QuantifyingException(
+            message=f"Permission denied when accessing data file: {file_path}",
+            exit_code=1,
+        )
+
+
 def git_fetch_and_merge(args, repo_path, branch=None):
     if not args.enable_git:
         return
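
For reviewers, a minimal usage sketch of the new helper (not part of the diff). The file path, the top-level handler, and the QuantifyingException attribute names shown below are assumptions inferred from this patch, not verbatim from the repository.

# Usage sketch (assumed): a process/report script opens its fetched CSV
# through shared.open_data_file() and lets the resulting
# QuantifyingException surface in a single top-level handler.
import logging
import sys

import shared  # scripts/shared.py as modified by this patch

LOGGER = logging.getLogger(__name__)


def main():
    # Hypothetical path; the real scripts build it with shared.path_join()
    # and PATHS["data_1-fetch"].
    file_count = "data/2024Q4/1-fetch/github_1_count.csv"
    count_data = shared.open_data_file(
        LOGGER, file_count, usecols=["TOOL_IDENTIFIER", "COUNT"]
    )
    LOGGER.info(f"loaded {len(count_data)} rows")


if __name__ == "__main__":
    try:
        main()
    except shared.QuantifyingException as e:
        # FileNotFoundError, EmptyDataError, and PermissionError are all
        # re-raised by open_data_file() as QuantifyingException; the
        # exit_code attribute is assumed from the keyword argument used
        # in the patch, so fall back to 1 if it is named differently.
        LOGGER.error(str(e))
        sys.exit(getattr(e, "exit_code", 1))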