From afdbbeaf5d97ecdcadf77715414a67705fa3b311 Mon Sep 17 00:00:00 2001
From: Oreoluwa Oluwasina <oreoluwaoluwasina@gmail.com>
Date: Fri, 19 Dec 2025 18:23:21 +0100
Subject: [PATCH 1/2] Add completion check to wikipedia_fetch; remove dead code

---
 scripts/1-fetch/wikipedia_fetch.py     | 13 ++++++++
 scripts/2-process/github_process.py    | 44 --------------------------
 scripts/2-process/wikipedia_process.py |  1 -
 3 files changed, 13 insertions(+), 45 deletions(-)

diff --git a/scripts/1-fetch/wikipedia_fetch.py b/scripts/1-fetch/wikipedia_fetch.py
index 7d937d58..c3236aba 100755
--- a/scripts/1-fetch/wikipedia_fetch.py
+++ b/scripts/1-fetch/wikipedia_fetch.py
@@ -63,6 +63,18 @@ def parse_arguments():
     return args
 
 
+def check_for_completion():
+    try:
+        with open(FILE_LANGUAGES, "r", newline="") as file_obj:
+            reader = csv.DictReader(file_obj, dialect="unix")
+            if len(list(reader)) > 0:
+                raise shared.QuantifyingException(
+                    f"Data fetch completed for {QUARTER}", 0
+                )
+    except FileNotFoundError:
+        pass  # File may not be found without --enable-save, etc.
+
+
 def write_data(args, tool_data):
     if not args.enable_save:
         return args
@@ -157,6 +169,7 @@ def query_wikipedia_languages(session):
 def main():
     args = parse_arguments()
     shared.paths_log(LOGGER, PATHS)
+    check_for_completion()
     shared.git_fetch_and_merge(args, PATHS["repo"])
     session = shared.get_session()
     tool_data = query_wikipedia_languages(session)
diff --git a/scripts/2-process/github_process.py b/scripts/2-process/github_process.py
index 6009629d..ca0a24c3 100755
--- a/scripts/2-process/github_process.py
+++ b/scripts/2-process/github_process.py
@@ -11,7 +11,6 @@
 import traceback
 
 # Third-party
-# import pandas as pd
 import pandas as pd
 
 # Add parent directory so shared can be imported
@@ -129,49 +128,6 @@ def process_totals_by_restriction(args, count_data):
     data_to_csv(args, data, file_path)
 
 
-# def load_quarter_data(quarter):
-#     """
-#     Load data for a specific quarter.
-#     """
-#     file_path = os.path.join(PATHS["data"], f"{quarter}",
-#       "1-fetch", "github_fetched")
-#     if not os.path.exists(file_path):
-#         LOGGER.error(f"Data file for quarter {quarter} not found.")
-#         return None
-#     return pd.read_csv(file_path)
-
-
-# def compare_data(current_quarter, previous_quarter):
-#     """
-#     Compare data between two quarters.
-#     """
-#     current_data = load_quarter_data(current_quarter)
-#     previous_data = load_quarter_data(previous_quarter)
-
-#     if current_data is None or previous_data is None:
-#         return
-
-#     Process data to compare totals
-
-
-# def parse_arguments():
-#     """
-#     Parses command-line arguments, returns parsed arguments.
-#     """
-#     LOGGER.info("Parsing command-line arguments")
-#     parser = argparse.ArgumentParser(
-#       description="Google Custom Search Comparison Report")
-#     parser.add_argument(
-#         "--current_quarter", type=str, required=True,
-#       help="Current quarter for comparison (e.g., 2024Q3)"
-#     )
-#     parser.add_argument(
-#         "--previous_quarter", type=str, required=True,
-#           help="Previous quarter for comparison (e.g., 2024Q2)"
-#     )
-#     return parser.parse_args()
-
-
 def main():
     args = parse_arguments()
     shared.paths_log(LOGGER, PATHS)
diff --git a/scripts/2-process/wikipedia_process.py b/scripts/2-process/wikipedia_process.py
index 25393e5a..e9886ffc 100755
--- a/scripts/2-process/wikipedia_process.py
+++ b/scripts/2-process/wikipedia_process.py
@@ -147,7 +147,6 @@ def main():
     args = parse_arguments()
     shared.paths_log(LOGGER, PATHS)
     shared.git_fetch_and_merge(args, PATHS["repo"])
-
     file_count = shared.path_join(
         PATHS["data_1-fetch"], "wikipedia_count_by_languages.csv"
     )

From b5c80e01f1fe5fb01233030f7d91634b056845db Mon Sep 17 00:00:00 2001
From: Oreoluwa Oluwasina <oreoluwaoluwasina@gmail.com>
Date: Fri, 26 Dec 2025 03:01:19 +0100
Subject: [PATCH 2/2] Add existing-data checks to process scripts; raise fetch row threshold

---
 scripts/1-fetch/wikipedia_fetch.py     |  2 +-
 scripts/2-process/github_process.py    |  9 +++++++++
 scripts/2-process/wikipedia_process.py | 10 ++++++++++
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/scripts/1-fetch/wikipedia_fetch.py b/scripts/1-fetch/wikipedia_fetch.py
index c3236aba..efabc327 100755
--- a/scripts/1-fetch/wikipedia_fetch.py
+++ b/scripts/1-fetch/wikipedia_fetch.py
@@ -67,7 +67,7 @@ def check_for_completion():
     try:
         with open(FILE_LANGUAGES, "r", newline="") as file_obj:
             reader = csv.DictReader(file_obj, dialect="unix")
-            if len(list(reader)) > 0:
+            if len(list(reader)) > 300:
                 raise shared.QuantifyingException(
                     f"Data fetch completed for {QUARTER}", 0
                 )
diff --git a/scripts/2-process/github_process.py b/scripts/2-process/github_process.py
index ca0a24c3..27945613 100755
--- a/scripts/2-process/github_process.py
+++ b/scripts/2-process/github_process.py
@@ -59,6 +59,13 @@ def parse_arguments():
     return args
 
 
+def check_for_data_file(file_path):
+    if os.path.exists(file_path):
+        raise shared.QuantifyingException(
+            f"Processed data already exists for {QUARTER}", 0
+        )
+
+
 def data_to_csv(args, data, file_path):
     if not args.enable_save:
         return
@@ -91,6 +98,7 @@ def process_totals_by_license(args, count_data):
     file_path = shared.path_join(
         PATHS["data_phase"], "github_totals_by_license.csv"
     )
+    check_for_data_file(file_path)
     data_to_csv(args, data, file_path)
 
 
@@ -125,6 +133,7 @@ def process_totals_by_restriction(args, count_data):
     file_path = shared.path_join(
         PATHS["data_phase"], "github_totals_by_restriction.csv"
     )
+    check_for_data_file(file_path)
     data_to_csv(args, data, file_path)
 
 
diff --git a/scripts/2-process/wikipedia_process.py b/scripts/2-process/wikipedia_process.py
index e9886ffc..7712b26a 100755
--- a/scripts/2-process/wikipedia_process.py
+++ b/scripts/2-process/wikipedia_process.py
@@ -63,6 +63,13 @@ def parse_arguments():
     return args
 
 
+def check_for_data_file(file_path):
+    if os.path.exists(file_path):
+        raise shared.QuantifyingException(
+            f"Processed data already exists for {QUARTER}", 0
+        )
+
+
 def data_to_csv(args, data, file_path):
     if not args.enable_save:
         return
@@ -91,6 +98,7 @@ def process_highest_language_usage(args, count_data):
     file_path = shared.path_join(
         PATHS["data_phase"], "wikipedia_highest_language_usage.csv"
     )
+    check_for_data_file(file_path)
     data_to_csv(args, top_10, file_path)
 
 
@@ -114,6 +122,7 @@ def process_least_language_usage(args, count_data):
     file_path = shared.path_join(
         PATHS["data_phase"], "wikipedia_least_language_usage.csv"
     )
+    check_for_data_file(file_path)
     data_to_csv(args, bottom_10, file_path)
 
 
@@ -140,6 +149,7 @@ def process_language_representation(args, count_data):
     file_path = shared.path_join(
         PATHS["data_phase"], "wikipedia_language_representation.csv"
     )
+    check_for_data_file(file_path)
     data_to_csv(args, language_counts, file_path)