diff --git a/.coderabbit.yaml b/.coderabbit.yaml index 877065c..19d6c04 100644 --- a/.coderabbit.yaml +++ b/.coderabbit.yaml @@ -157,6 +157,8 @@ reviews: - Confirm that the code meets the project's requirements and objectives - Confirm that copyright years are up-to date whenever a file is changed - Point out redundant obvious comments that do not add clarity to the code + - Ensure that comments are concise and suggest more concise comment statements if possible + - Discourage usage of verbose comment styles such as NatSpec - Look for code duplication - Suggest code completions when: - seeing a TODO comment @@ -275,4 +277,4 @@ reviews: - Image optimization (appropriate size and format) - Proper @2x and @3x variants for different screen densities - SVG assets are optimized - - Font files are licensed and optimized + - Font files are licensed and optimized \ No newline at end of file diff --git a/.gitignore b/.gitignore index 46f6993..392cb81 100644 --- a/.gitignore +++ b/.gitignore @@ -324,8 +324,11 @@ TSWLatexianTemp* # option is specified. Footnotes are the stored in a file with suffix Notes.bib. # Uncomment the next line to have this generated file ignored. #*Notes.bib + +data/ *.egg-info/ __pycache__/ *.pyc *.pyo *.pyd +*.bz2 diff --git a/README.md b/README.md index 3c5adf2..d76f8be 100644 --- a/README.md +++ b/README.md @@ -277,4 +277,4 @@ Thanks a lot for spending your time helping TODO grow. Keep rocking 🥂 [![Contributors](https://contrib.rocks/image?repo=AOSSIE-Org/TODO)](https://github.com/AOSSIE-Org/TODO/graphs/contributors) -© 2025 AOSSIE +© 2025 AOSSIE \ No newline at end of file diff --git a/examples/demo_util.py b/examples/demo_util.py new file mode 100644 index 0000000..9f41446 --- /dev/null +++ b/examples/demo_util.py @@ -0,0 +1,23 @@ +import sys +import logging +from openverifiablellm.utils import extract_text_from_xml + +logger = logging.getLogger(__name__) + +""" +Demo for preprocessing pipeline. 
+
+Run with:
+    python -m examples.demo_util examples/sample_wiki.xml.bz2
+"""
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python -m examples.demo_util <path/to/dump.xml.bz2>")
+        sys.exit(1)
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(levelname)s - %(message)s"
+    )
+    extract_text_from_xml(sys.argv[1])
\ No newline at end of file
diff --git a/examples/hash_demo.py b/examples/hash_demo.py
deleted file mode 100644
index ad8842c..0000000
--- a/examples/hash_demo.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from pathlib import Path
-from openverifiablellm.dataset_hash import compute_sha256
-
-
-if __name__ == "__main__":
-    current_dir = Path(__file__).parent
-    dataset_path = current_dir / "sample_wiki.txt"
-
-    dataset_hash = compute_sha256(dataset_path)
-
-    print("Dataset Hash:")
-    print(dataset_hash)
\ No newline at end of file
diff --git a/examples/sample_wiki.py b/examples/sample_wiki.py
new file mode 100644
index 0000000..a59646b
--- /dev/null
+++ b/examples/sample_wiki.py
@@ -0,0 +1,18 @@
+import bz2
+
+xml_content = """<mediawiki>
+<page>
+<title>Sample</title>
+<revision>
+<text>
+    Hello <ref>citation</ref> world.
+    This is [[Python|programming language]]
+    {{Wikipedia }}is a free online encyclopedia.
+</text>
+</revision>
+</page>
+</mediawiki>
+"""
+
+with bz2.open("examples/sample_wiki.xml.bz2", "wt", encoding="utf-8") as f:
+    f.write(xml_content)
\ No newline at end of file
diff --git a/examples/sample_wiki.txt b/examples/sample_wiki.txt
deleted file mode 100644
index c30144b..0000000
--- a/examples/sample_wiki.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-Wikipedia is a free online encyclopedia.
-It is maintained by a community of volunteers.
-This is a small reproducibility sample.
diff --git a/examples/sample_wiki.xml.bz2 b/examples/sample_wiki.xml.bz2 new file mode 100644 index 0000000..ba8e6f1 Binary files /dev/null and b/examples/sample_wiki.xml.bz2 differ diff --git a/openverifiablellm/dataset_hash.py b/openverifiablellm/dataset_hash.py deleted file mode 100644 index 0a6f114..0000000 --- a/openverifiablellm/dataset_hash.py +++ /dev/null @@ -1,31 +0,0 @@ -import hashlib -from pathlib import Path -from typing import Union - - -def compute_sha256(file_path: Union[str, Path]) -> str: - """ - Compute SHA256 hash of a file. - - This provides a deterministic fingerprint of the dataset, - enabling reproducibility and verification. - - Parameters - ---------- - file_path : Union[str, Path] - Path to the dataset file (string or Path-like). - - Returns - ------- - str - SHA256 hash string. - """ - path = Path(file_path) - - sha256 = hashlib.sha256() - - with path.open("rb") as f: - while chunk := f.read(8192): - sha256.update(chunk) - - return sha256.hexdigest() \ No newline at end of file diff --git a/openverifiablellm/utils.py b/openverifiablellm/utils.py new file mode 100644 index 0000000..084e1e2 --- /dev/null +++ b/openverifiablellm/utils.py @@ -0,0 +1,156 @@ +import bz2 +import re +import defusedxml.ElementTree as ET +from pathlib import Path +import sys +from typing import Union +import hashlib +import logging +import json +import platform + +logger = logging.getLogger(__name__) + +# extract clean wikipage from actual wikipage +def extract_text_from_xml(input_path): + """ + Process a compressed Wikipedia XML dump into cleaned plain text. + + Each element is parsed, its revision text is extracted, + cleaned using `clean_wikitext()`, and appended to a single + output text file. + + The processed output is saved to: + data/processed/wiki_clean.txt + + Parameters + ---------- + input_path : str or Path + Path to the compressed Wikipedia XML (.bz2) dump file. 
+ + Output + ------ + Creates: + data/processed/wiki_clean.txt + """ + input_path = Path(input_path) + + # Fixed output path + project_root = Path.cwd() + output_dir = project_root / "data" / "processed" + output_dir.mkdir(parents=True, exist_ok=True) + + output_path = output_dir / "wiki_clean.txt" + + with bz2.open(input_path, "rb") as f: + context = ET.iterparse(f, events=("end",)) + + with open(output_path, "w", encoding="utf-8") as out: + for _, elem in context: + if elem.tag.endswith("page"): + text_elem = elem.find(".//{*}text") + + if text_elem is not None and text_elem.text: + cleaned = clean_wikitext(text_elem.text) + if cleaned: + out.write(cleaned + "\n\n") + + elem.clear() + logger.info("Preprocessing complete. Output saved to %s", output_path) + generate_manifest(input_path,output_path) + +# generate data manifest +def generate_manifest(raw_path, processed_path): + raw_path = Path(raw_path) + processed_path = Path(processed_path) + + if not processed_path.exists(): + raise FileNotFoundError( + f"Processed file not found at {processed_path}. Run preprocessing first." + ) + + manifest = { + "wikipedia_dump": raw_path.name, + "dump_date": extract_dump_date(raw_path.name), + "raw_sha256": compute_sha256(str(raw_path)), + "processed_sha256": compute_sha256(str(processed_path)), + "preprocessing_version": "v1", + "python_version": platform.python_version() + } + project_root = Path.cwd() + manifest_path = project_root / "data" / "dataset_manifest.json" + manifest_path.parent.mkdir(parents=True, exist_ok=True) + + with open(manifest_path, "w") as f: + json.dump(manifest, f, indent=2) + + logger.info("Manifest written to %s", manifest_path) + +# helpers +def compute_sha256(file_path: Union[str, Path]) -> str: + """ + Compute SHA256 hash of a file. + + This provides a deterministic fingerprint of the dataset, + enabling reproducibility and verification. + + Parameters + ---------- + file_path : Union[str, Path] + Path to the dataset file (string or Path-like). 
+
+    Returns
+    -------
+    str
+        SHA256 hash string.
+    """
+    path = Path(file_path)
+
+    sha256 = hashlib.sha256()
+
+    with path.open("rb") as f:
+        while chunk := f.read(8192):
+            sha256.update(chunk)
+
+    return sha256.hexdigest()
+
+def extract_dump_date(filename: str):
+    parts = filename.split("-")
+    for part in parts:
+        if part.isdigit() and len(part) == 8:
+            return f"{part[:4]}-{part[4:6]}-{part[6:]}"
+    return "unknown"
+
+def clean_wikitext(text: str) -> str:
+    """
+    Basic deterministic wikitext cleaning.
+
+    Note:
+        This uses simple regex-based rules for speed and consistency.
+        It does NOT fully parse MediaWiki syntax.
+
+    Limitations:
+        - Deeply nested templates may not be fully removed.
+        - Some complex cases may not be perfectly handled.
+        - This is not a complete MediaWiki parser.
+
+    These limitations are acceptable for lightweight, deterministic preprocessing.
+    """
+    text = re.sub(r"\{\{.*?\}\}", "", text, flags=re.DOTALL)
+    text = re.sub(r"<ref.*?</ref>", "", text, flags=re.DOTALL)
+    text = re.sub(r"<.*?>", "", text)
+    text = re.sub(r"\[\[.*?\|(.*?)\]\]", r"\1", text)
+    text = re.sub(r"\[\[(.*?)\]\]", r"\1", text)
+    text = re.sub(r"\s+", " ", text)
+    return text.strip()
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python -m openverifiablellm.utils <path/to/dump.xml.bz2>")
+        sys.exit(1)
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(levelname)s - %(message)s"
+    )
+    extract_text_from_xml(sys.argv[1])
diff --git a/pyproject.toml b/pyproject.toml
index 34e03d9..121d3bc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,6 +11,15 @@ authors = [
 ]
 requires-python = ">=3.9"
 
+dependencies = [
+    "defusedxml"
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest"
+]
+
 [tool.setuptools.packages.find]
 include = ["openverifiablellm*"]
 
diff --git a/tests/test_dataset_hash.py b/tests/test_dataset_hash.py
deleted file mode 100644
index b560baf..0000000
--- a/tests/test_dataset_hash.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import os
-import hashlib
-import tempfile
-import pytest -from openverifiablellm.dataset_hash import compute_sha256 - - -def test_correct_sha256_output(tmp_path): - # Create a temporary file - file = tmp_path / "sample.txt" - content = "hello wikipedia" - file.write_text(content, encoding="utf-8") - - # Expected hash using standard hashlib - expected = hashlib.sha256(content.encode("utf-8")).hexdigest() - - # Hash using your function - actual = compute_sha256(str(file)) - - # Verify correctness - assert actual == expected - - -def test_different_content_different_hash(tmp_path): - file1 = tmp_path / "content_a.txt" - file2 = tmp_path / "content_b.txt" - - file1.write_text("Content A", encoding="utf-8") - file2.write_text("Content B", encoding="utf-8") - - assert compute_sha256(file1) != compute_sha256(file2) - - -def test_file_not_found(): - with pytest.raises(FileNotFoundError): - compute_sha256("non_existent_file.txt") \ No newline at end of file diff --git a/tests/test_util.py b/tests/test_util.py new file mode 100644 index 0000000..c0cede3 --- /dev/null +++ b/tests/test_util.py @@ -0,0 +1,129 @@ +import bz2 +import hashlib +import pytest +from openverifiablellm import utils + +""" +Unit and integration tests for OpenVerifiableLLM preprocessing pipeline. 
+
+Run with:
+    pip install -e ".[dev]"
+    pytest
+"""
+
+# --------------- clean_wikitext tests ------------------------------------
+
+def test_clean_wikitext_removes_templates_and_refs():
+    text = "Hello {{Infobox}} <ref>cite</ref> world"
+    cleaned = utils.clean_wikitext(text)
+    assert cleaned == "Hello world"
+
+
+def test_clean_wikitext_handles_links():
+    text = "This is [[Python|programming language]] and [[India]]"
+    cleaned = utils.clean_wikitext(text)
+    assert cleaned == "This is programming language and India"
+
+
+def test_clean_wikitext_collapses_whitespace():
+    text = "Hello world\n\n test"
+    cleaned = utils.clean_wikitext(text)
+    assert cleaned == "Hello world test"
+
+# --------------- extract_dump_date tests ------------------------------------
+
+def test_extract_dump_date_valid():
+    filename = "simplewiki-20260201-pages-articles.xml.bz2"
+    assert utils.extract_dump_date(filename) == "2026-02-01"
+
+
+def test_extract_dump_date_invalid():
+    filename = "no-date-file.xml.bz2"
+    assert utils.extract_dump_date(filename) == "unknown"
+
+# --------------- generate manifest ------------------------------------
+
+def test_generate_manifest_raises_if_processed_missing(tmp_path):
+    raw_file = tmp_path / "raw.txt"
+    raw_file.write_text("dummy")
+
+    missing_file = tmp_path / "missing.txt"
+
+    with pytest.raises(FileNotFoundError):
+        utils.generate_manifest(raw_file, missing_file)
+
+def test_generate_manifest_runs_if_file_exists(tmp_path, monkeypatch):
+    monkeypatch.chdir(tmp_path)
+
+    raw_file = tmp_path / "raw.txt"
+    raw_file.write_text("dummy")
+
+    processed_file = tmp_path / "processed.txt"
+    processed_file.write_text("cleaned")
+
+    utils.generate_manifest(raw_file, processed_file)
+
+    manifest_file = tmp_path / "data/dataset_manifest.json"
+    assert manifest_file.exists()
+
+# --------------- compute_sha256 ------------------------------------
+
+def test_correct_sha256_output(tmp_path):
+    # Create a temporary file
+    file = tmp_path / "sample.txt"
+    content = "hello 
wikipedia"
+    file.write_text(content, encoding="utf-8")
+
+    # Expected hash using standard hashlib
+    expected = hashlib.sha256(content.encode("utf-8")).hexdigest()
+
+    # Hash using your function
+    actual = utils.compute_sha256(str(file))
+
+    # Verify correctness
+    assert actual == expected
+
+
+def test_different_content_different_hash(tmp_path):
+    file1 = tmp_path / "content_a.txt"
+    file2 = tmp_path / "content_b.txt"
+
+    file1.write_text("Content A", encoding="utf-8")
+    file2.write_text("Content B", encoding="utf-8")
+
+    assert utils.compute_sha256(file1) != utils.compute_sha256(file2)
+
+
+def test_file_not_found():
+    with pytest.raises(FileNotFoundError):
+        utils.compute_sha256("non_existent_file.txt")
+
+# --------------- extract_text_from_xml tests ------------------------------------
+
+def test_extract_text_from_xml_end_to_end(tmp_path, monkeypatch):
+
+    xml_content = """<mediawiki>
+    <page>
+        <revision>
+            <text>Hello [[World]]</text>
+        </revision>
+    </page>
+    </mediawiki>
+    """
+
+    input_file = tmp_path / "simplewiki-20260201-pages.xml.bz2"
+
+    with bz2.open(input_file, "wt", encoding="utf-8") as f:
+        f.write(xml_content)
+
+    # Redirect project root
+    monkeypatch.chdir(tmp_path)
+
+    utils.extract_text_from_xml(input_file)
+
+    processed_file = tmp_path / "data/processed/wiki_clean.txt"
+    assert processed_file.exists()
+
+    assert "Hello World" in processed_file.read_text()
+    
\ No newline at end of file