AOSSIE-Org · Archit381 · Feb 25, 2026 · Feb 19, 2026 · Feb 21, 2026 · Feb 21, 2026
diff --git a/.coderabbit.yaml b/.coderabbit.yaml
@@ -157,6 +157,8 @@ reviews:
     - Confirm that the code meets the project's requirements and objectives
     - Confirm that copyright years are up-to date whenever a file is changed
     - Point out redundant obvious comments that do not add clarity to the code
+    - Ensure that comments are concise and suggest more concise comment statements if possible
+    - Discourage usage of verbose comment styles such as NatSpec
-    - Ensure that comments are concise and suggest more concise comment statements if possible
-    - Discourage usage of verbose comment styles such as NatSpec
+    - Ensure that comments are concise and suggest more concise comment statements if possible
+    - Discourage usage of verbose comment styles (except NatSpec in Solidity files, where it is standard)
-    - Ensure that comments are concise and suggest more concise comment statements if possible
-    - Discourage usage of verbose comment styles such as NatSpec
+    - Ensure that comments are concise and suggest more concise comment statements if possible
+    - Discourage usage of verbose comment styles (except NatSpec in Solidity files, where it is standard)
     - Look for code duplication
     - Suggest code completions when:
         - seeing a TODO comment
@@ -275,4 +277,4 @@ reviews:
         - Image optimization (appropriate size and format)
         - Proper @2x and @3x variants for different screen densities
         - SVG assets are optimized
-        - Font files are licensed and optimized
+        - Font files are licensed and optimized
diff --git a/.gitignore b/.gitignore
@@ -324,8 +324,11 @@ TSWLatexianTemp*
 # option is specified. Footnotes are the stored in a file with suffix Notes.bib.
 # Uncomment the next line to have this generated file ignored.
 #*Notes.bib
+
+data/
-
-data/
+*.egg-info/
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+/data/
-
-data/
+*.egg-info/
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+/data/
 *.egg-info/
 __pycache__/
 *.pyc
 *.pyo
 *.pyd
+*.bz2
diff --git a/README.md b/README.md
@@ -277,4 +277,4 @@ Thanks a lot for spending your time helping TODO grow. Keep rocking 🥂
 
 [![Contributors](https://contrib.rocks/image?repo=AOSSIE-Org/TODO)](https://github.com/AOSSIE-Org/TODO/graphs/contributors)
 
-© 2025 AOSSIE 
+© 2025 AOSSIE 
diff --git a/examples/demo_util.py b/examples/demo_util.py
@@ -0,0 +1,23 @@
+"""
+Demo for preprocessing pipeline.
+
+Run with:
+    python -m examples.demo_util examples/sample_wiki.xml.bz2
+"""
+
+import logging
+import sys
+
+from openverifiablellm.utils import extract_text_from_xml
+
+logger = logging.getLogger(__name__)
+
+if __name__ == "__main__":
+    # The dump path is the first positional argument.
+    if len(sys.argv) < 2:
+        print("Usage: python -m examples.demo_util <input_dump>")
+        sys.exit(1)
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(levelname)s - %(message)s",
+    )
+    extract_text_from_xml(sys.argv[1])
diff --git a/examples/hash_demo.py b/examples/hash_demo.py
diff --git a/examples/sample_wiki.py b/examples/sample_wiki.py
@@ -0,0 +1,18 @@
+import bz2
+
+# Wikitext markup inside <text> is XML-escaped. A literal <ref> tag would be
+# parsed as a child element, and an element's .text only holds the characters
+# before its first child, so everything after "Hello " would be lost.
+xml_content = """<?xml version="1.0" encoding="UTF-8"?>
+<mediawiki>
+  <page>
+    <revision>
+      <text>
+        Hello &lt;ref&gt;citation&lt;/ref&gt; world.
+        This is [[Python|programming language]]
+        {{Wikipedia }}is a free online encyclopedia.
+      </text>
+    </revision>
+  </page>
+</mediawiki>
+"""
+
+# The output path is relative to the current working directory.
+with bz2.open("examples/sample_wiki.xml.bz2", "wt", encoding="utf-8") as f:
+    f.write(xml_content)
diff --git a/examples/sample_wiki.txt b/examples/sample_wiki.txt
diff --git a/examples/sample_wiki.xml.bz2 b/examples/sample_wiki.xml.bz2
diff --git a/openverifiablellm/dataset_hash.py b/openverifiablellm/dataset_hash.py
diff --git a/openverifiablellm/utils.py b/openverifiablellm/utils.py
@@ -0,0 +1,156 @@
+import bz2
+import re
+import defusedxml.ElementTree as ET
+from pathlib import Path
+import sys
+from typing import Union
+import hashlib
+import logging
+import json
+import platform
+
+logger = logging.getLogger(__name__)
+
+# extract clean wikipage from actual wikipage
+def extract_text_from_xml(input_path):
+    """
+    Process a compressed Wikipedia XML dump into cleaned plain text.
+
+    The dump is streamed with `iterparse`. For every completed <page>
+    element, the first descendant <text> element is cleaned with
+    `clean_wikitext()` and, when the result is non-empty, written to the
+    output file followed by a blank line. Afterwards `generate_manifest()`
+    is called with the input and output paths.
+
+    Parameters
+    ----------
+    input_path : str or Path
+        Path to the compressed Wikipedia XML (.bz2) dump file.
+
+    Output
+    ------
+    Creates (relative to the current working directory):
+        data/processed/wiki_clean.txt
+    """
+    input_path = Path(input_path)
+
+    # Fixed output path
+    output_dir = Path.cwd() / "data" / "processed"
+    output_dir.mkdir(parents=True, exist_ok=True)
+    output_path = output_dir / "wiki_clean.txt"
+
+    with bz2.open(input_path, "rb") as f:
+        # "start" events are requested only so the root element can be captured.
+        context = ET.iterparse(f, events=("start", "end"))
+        _, root = next(context)
+
+        with open(output_path, "w", encoding="utf-8") as out:
+            for event, elem in context:
+                if event != "end":
+                    continue
+                # Compare the namespace-stripped tag exactly; a suffix test
+                # would also accept unrelated tags that merely end in "page".
+                if elem.tag.rsplit("}", 1)[-1] != "page":
+                    continue
+
+                text_elem = elem.find(".//{*}text")
+                if text_elem is not None and text_elem.text:
+                    cleaned = clean_wikitext(text_elem.text)
+                    if cleaned:
+                        out.write(cleaned + "\n\n")
+
+                elem.clear()
+                # Also drop the finished page from the root; otherwise every
+                # emptied <page> stays referenced for the whole parse.
+                root.clear()
+    logger.info("Preprocessing complete. Output saved to %s", output_path)
+    generate_manifest(input_path, output_path)
+
+# generate data manifest
+def generate_manifest(raw_path, processed_path):
+    """
+    Write a JSON manifest describing a raw dump and its processed output.
+
+    The manifest records the raw file name, the date parsed from that name
+    by `extract_dump_date()`, the SHA256 of both files from
+    `compute_sha256()`, a fixed preprocessing version string and the running
+    Python version. It is written to data/dataset_manifest.json under the
+    current working directory.
+
+    Parameters
+    ----------
+    raw_path : str or Path
+        Path to the raw dump file.
+    processed_path : str or Path
+        Path to the processed text file; must already exist.
+
+    Raises
+    ------
+    FileNotFoundError
+        If `processed_path` does not exist.
+    """
+    raw_path = Path(raw_path)
+    processed_path = Path(processed_path)
+
+    # Fail early with a clear message instead of hashing a missing file.
+    if not processed_path.exists():
+        raise FileNotFoundError(
+            f"Processed file not found at {processed_path}. Run preprocessing first."
+        )
+
+    manifest = {
+        "wikipedia_dump": raw_path.name,
+        "dump_date": extract_dump_date(raw_path.name),
+        "raw_sha256": compute_sha256(str(raw_path)),
+        "processed_sha256": compute_sha256(str(processed_path)),
-        "raw_sha256": compute_sha256(str(raw_path)),
-        "processed_sha256": compute_sha256(str(processed_path)),
+        "raw_sha256": compute_sha256(raw_path),
+        "processed_sha256": compute_sha256(processed_path),
-        "raw_sha256": compute_sha256(str(raw_path)),
-        "processed_sha256": compute_sha256(str(processed_path)),
+        "raw_sha256": compute_sha256(raw_path),
+        "processed_sha256": compute_sha256(processed_path),
+        "preprocessing_version": "v1",
+        "python_version": platform.python_version()
+    }
+    # The manifest location is resolved against the current working directory.
+    project_root = Path.cwd()
+    manifest_path = project_root / "data" / "dataset_manifest.json"
+    manifest_path.parent.mkdir(parents=True, exist_ok=True)
+
+    with open(manifest_path, "w") as f:
+        json.dump(manifest, f, indent=2)
-    with open(manifest_path, "w") as f:
-        json.dump(manifest, f, indent=2)
+    with open(manifest_path, "w", encoding="utf-8") as f:
+        json.dump(manifest, f, indent=2)
-    with open(manifest_path, "w") as f:
-        json.dump(manifest, f, indent=2)
+    with open(manifest_path, "w", encoding="utf-8") as f:
+        json.dump(manifest, f, indent=2)
+
+    logger.info("Manifest written to %s", manifest_path)
+
+# helpers
+def compute_sha256(file_path: Union[str, Path]) -> str:
+    """
+    Return the SHA256 hex digest of a file's contents.
+
+    The file is read in 8192-byte chunks, so it is never loaded into
+    memory as a whole.
+
+    Parameters
+    ----------
+    file_path : Union[str, Path]
+        Location of the file to hash.
+
+    Returns
+    -------
+    str
+        Hexadecimal SHA256 digest.
+    """
+    digest = hashlib.sha256()
+
+    with Path(file_path).open("rb") as stream:
+        # iter() with a sentinel stops once read() returns b"" at end of file.
+        for block in iter(lambda: stream.read(8192), b""):
+            digest.update(block)
+
+    return digest.hexdigest()
+
+def extract_dump_date(filename: str):
-def extract_dump_date(filename: str):
+def extract_dump_date(filename: str) -> str:
-def extract_dump_date(filename: str):
+def extract_dump_date(filename: str) -> str:
+    """
+    Return the date embedded in a dump file name.
+
+    The name is split on "-" and the first part made of exactly eight
+    digits is reformatted as XXXX-XX-XX (presumably YYYY-MM-DD).
+    Returns "unknown" when no such part exists.
+    """
+    parts = filename.split("-")
+    for part in parts:
+        if part.isdigit() and len(part) == 8:
+            return f"{part[:4]}-{part[4:6]}-{part[6:]}"
+    return "unknown"
+
+def clean_wikitext(text: str) -> str:
+    """
+    Basic deterministic wikitext cleaning.
+
+    Steps, in order:
+    1. Remove {{templates}}, innermost first, so nested templates vanish.
+    2. Remove self-closing <ref ... /> tags, then <ref ...>...</ref> pairs.
+    3. Remove any remaining <...> tags.
+    4. Replace [[target]] with "target" and [[target|label]] with "label"
+       (everything after the first "|" is kept), innermost first.
+    5. Collapse whitespace runs to one space and strip the ends.
+
+    Note:
+    This uses simple regex-based rules for speed and consistency.
+    It does NOT fully parse MediaWiki syntax.
+
+    Limitations:
+    - Unbalanced {{ }} or [[ ]] delimiters are left in the text.
+    - A ">" inside a tag attribute ends the tag early.
+    - This is not a complete MediaWiki parser.
+    """
+    # A template body may not contain "{{" or "}}", so each pass removes only
+    # the innermost templates; repeat until none are left.
+    template = r"\{\{(?:(?!\{\{|\}\}).)*\}\}"
+    while True:
+        text, removed = re.subn(template, "", text, flags=re.DOTALL)
+        if not removed:
+            break
+
+    # Self-closing refs go first so they cannot pair with a later </ref> and
+    # swallow the text in between. \b keeps <references> out of both rules.
+    text = re.sub(r"<ref\b[^>]*/>", "", text)
+    text = re.sub(r"<ref\b[^>]*>.*?</ref>", "", text, flags=re.DOTALL)
+    text = re.sub(r"<.*?>", "", text)
+
+    # Excluding brackets from the target and label stops a match from running
+    # across "]]" into the next link (e.g. "[[A]] and [[B|C]]").
+    link = r"\[\[(?:[^\[\]|]*\|)?([^\[\]]*)\]\]"
+    while True:
+        text, replaced = re.subn(link, r"\1", text)
+        if not replaced:
+            break
+
+    text = re.sub(r"\s+", " ", text)
+    return text.strip()
+
+if __name__ == "__main__":
+    # The dump path is the first positional argument; extras are ignored.
+    cli_args = sys.argv[1:]
+    if not cli_args:
+        print("Usage: python -m openverifiablellm.utils <input_dump>")
+        sys.exit(1)
+
+    logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
+    extract_text_from_xml(cli_args[0])
diff --git a/pyproject.toml b/pyproject.toml
@@ -11,6 +11,15 @@ authors = [
 ]
 requires-python = ">=3.9"
 
+dependencies= [
+    "defusedxml"
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest"
+]
+
 [tool.setuptools.packages.find]
 include = ["openverifiablellm*"]
 
diff --git a/tests/test_dataset_hash.py b/tests/test_dataset_hash.py
Original file line number	Diff line number	Diff line change
Expand Up		@@ -277,4 +277,4 @@ Thanks a lot for spending your time helping TODO grow. Keep rocking 🥂

		[![Contributors](https://contrib.rocks/image?repo=AOSSIE-Org/TODO)](https://github.com/AOSSIE-Org/TODO/graphs/contributors)

		© 2025 AOSSIE
		© 2025 AOSSIE