From afdbbeaf5d97ecdcadf77715414a67705fa3b311 Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Fri, 19 Dec 2025 18:23:21 +0100 Subject: [PATCH 1/2] added function for automation --- scripts/1-fetch/wikipedia_fetch.py | 13 ++++++++ scripts/2-process/github_process.py | 44 -------------------------- scripts/2-process/wikipedia_process.py | 1 - 3 files changed, 13 insertions(+), 45 deletions(-) diff --git a/scripts/1-fetch/wikipedia_fetch.py b/scripts/1-fetch/wikipedia_fetch.py index 7d937d58..c3236aba 100755 --- a/scripts/1-fetch/wikipedia_fetch.py +++ b/scripts/1-fetch/wikipedia_fetch.py @@ -63,6 +63,18 @@ def parse_arguments(): return args +def check_for_completion(): + try: + with open(FILE_LANGUAGES, "r", newline="") as file_obj: + reader = csv.DictReader(file_obj, dialect="unix") + if len(list(reader)) > 0: + raise shared.QuantifyingException( + f"Data fetch completed for {QUARTER}", 0 + ) + except FileNotFoundError: + pass # File may not be found without --enable-save, etc. + + def write_data(args, tool_data): if not args.enable_save: return args @@ -157,6 +169,7 @@ def query_wikipedia_languages(session): def main(): args = parse_arguments() shared.paths_log(LOGGER, PATHS) + check_for_completion() shared.git_fetch_and_merge(args, PATHS["repo"]) session = shared.get_session() tool_data = query_wikipedia_languages(session) diff --git a/scripts/2-process/github_process.py b/scripts/2-process/github_process.py index 6009629d..ca0a24c3 100755 --- a/scripts/2-process/github_process.py +++ b/scripts/2-process/github_process.py @@ -11,7 +11,6 @@ import traceback # Third-party -# import pandas as pd import pandas as pd # Add parent directory so shared can be imported @@ -129,49 +128,6 @@ def process_totals_by_restriction(args, count_data): data_to_csv(args, data, file_path) -# def load_quarter_data(quarter): -# """ -# Load data for a specific quarter. -# """ -# file_path = os.path.join(PATHS["data"], f"{quarter}", -# "1-fetch", "github_fetched") -# if not os.path.exists(file_path): -# LOGGER.error(f"Data file for quarter {quarter} not found.") -# return None -# return pd.read_csv(file_path) - - -# def compare_data(current_quarter, previous_quarter): -# """ -# Compare data between two quarters. -# """ -# current_data = load_quarter_data(current_quarter) -# previous_data = load_quarter_data(previous_quarter) - -# if current_data is None or previous_data is None: -# return - -# Process data to compare totals - - -# def parse_arguments(): -# """ -# Parses command-line arguments, returns parsed arguments. -# """ -# LOGGER.info("Parsing command-line arguments") -# parser = argparse.ArgumentParser( -# description="Google Custom Search Comparison Report") -# parser.add_argument( -# "--current_quarter", type=str, required=True, -# help="Current quarter for comparison (e.g., 2024Q3)" -# ) -# parser.add_argument( -# "--previous_quarter", type=str, required=True, -# help="Previous quarter for comparison (e.g., 2024Q2)" -# ) -# return parser.parse_args() - - def main(): args = parse_arguments() shared.paths_log(LOGGER, PATHS) diff --git a/scripts/2-process/wikipedia_process.py b/scripts/2-process/wikipedia_process.py index 25393e5a..e9886ffc 100755 --- a/scripts/2-process/wikipedia_process.py +++ b/scripts/2-process/wikipedia_process.py @@ -147,7 +147,6 @@ def main(): args = parse_arguments() shared.paths_log(LOGGER, PATHS) shared.git_fetch_and_merge(args, PATHS["repo"]) - file_count = shared.path_join( PATHS["data_1-fetch"], "wikipedia_count_by_languages.csv" ) From b5c80e01f1fe5fb01233030f7d91634b056845db Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Fri, 26 Dec 2025 03:01:19 +0100 Subject: [PATCH 2/2] added check function for process script --- scripts/1-fetch/wikipedia_fetch.py | 2 +- scripts/2-process/github_process.py | 9 +++++++++ scripts/2-process/wikipedia_process.py | 10 ++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/scripts/1-fetch/wikipedia_fetch.py b/scripts/1-fetch/wikipedia_fetch.py index c3236aba..efabc327 100755 --- a/scripts/1-fetch/wikipedia_fetch.py +++ b/scripts/1-fetch/wikipedia_fetch.py @@ -67,7 +67,7 @@ def check_for_completion(): try: with open(FILE_LANGUAGES, "r", newline="") as file_obj: reader = csv.DictReader(file_obj, dialect="unix") - if len(list(reader)) > 0: + if len(list(reader)) > 300: raise shared.QuantifyingException( f"Data fetch completed for {QUARTER}", 0 ) diff --git a/scripts/2-process/github_process.py b/scripts/2-process/github_process.py index ca0a24c3..27945613 100755 --- a/scripts/2-process/github_process.py +++ b/scripts/2-process/github_process.py @@ -59,6 +59,13 @@ def parse_arguments(): return args +def check_for_data_file(file_path): + if os.path.exists(file_path): + raise shared.QuantifyingException( + f"Processed data already exists for {QUARTER}", 0 + ) + + def data_to_csv(args, data, file_path): if not args.enable_save: return @@ -91,6 +98,7 @@ def process_totals_by_license(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "github_totals_by_license.csv" ) + check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -125,6 +133,7 @@ def process_totals_by_restriction(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "github_totals_by_restriction.csv" ) + check_for_data_file(file_path) data_to_csv(args, data, file_path) diff --git a/scripts/2-process/wikipedia_process.py b/scripts/2-process/wikipedia_process.py index e9886ffc..7712b26a 100755 --- a/scripts/2-process/wikipedia_process.py +++ b/scripts/2-process/wikipedia_process.py @@ -63,6 +63,13 @@ def parse_arguments(): return args +def check_for_data_file(file_path): + if os.path.exists(file_path): + raise shared.QuantifyingException( + f"Processed data already exists for {QUARTER}", 0 + ) + + def data_to_csv(args, data, file_path): if not args.enable_save: return @@ -91,6 +98,7 @@ def process_highest_language_usage(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "wikipedia_highest_language_usage.csv" ) + check_for_data_file(file_path) data_to_csv(args, top_10, file_path) @@ -114,6 +122,7 @@ def process_least_language_usage(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "wikipedia_least_language_usage.csv" ) + check_for_data_file(file_path) data_to_csv(args, bottom_10, file_path) @@ -140,6 +149,7 @@ def process_language_representation(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "wikipedia_language_representation.csv" ) + check_for_data_file(file_path) data_to_csv(args, language_counts, file_path)