-
-
Notifications
You must be signed in to change notification settings - Fork 22
feat: add deterministic preprocessing and dataset identity tracking #2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
4e36e15
63182df
3221575
180e33d
106fd2d
abadc03
c959087
5ab32d6
bce705a
89041a0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -324,8 +324,11 @@ TSWLatexianTemp* | |||||||||||||||||
| # option is specified. Footnotes are then stored in a file with suffix Notes.bib. | ||||||||||||||||||
| # Uncomment the next line to have this generated file ignored. | ||||||||||||||||||
| #*Notes.bib | ||||||||||||||||||
|
|
||||||||||||||||||
| data/ | ||||||||||||||||||
|
Comment on lines
+327
to
+328
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧹 Nitpick | 🔵 Trivial Consider anchoring Two minor points:
🔧 Proposed fix-
-data/
*.egg-info/
__pycache__/
*.pyc
*.pyo
*.pyd
+/data/📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||||||
| *.egg-info/ | ||||||||||||||||||
| __pycache__/ | ||||||||||||||||||
| *.pyc | ||||||||||||||||||
| *.pyo | ||||||||||||||||||
| *.pyd | ||||||||||||||||||
| *.bz2 | ||||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| import sys | ||
| import logging | ||
| from openverifiablellm.utils import extract_text_from_xml | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧹 Nitpick | 🔵 Trivial
All logging output comes from inside ♻️ Proposed fix-logger = logging.getLogger(__name__)
-
if __name__ == "__main__":And remove the unused 🤖 Prompt for AI Agents |
||
|
|
||
| """ | ||
| Demo for preprocessing pipeline. | ||
|
|
||
| Run with: | ||
| python -m examples.demo_util examples\sample_wiki.xml.bz2 | ||
| """ | ||
|
|
||
| if __name__ == "__main__": | ||
| if len(sys.argv) < 2: | ||
| print("Usage: python -m examples.demo_util <input_dump>") | ||
| sys.exit(1) | ||
tani-dubey marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| logging.basicConfig( | ||
| level=logging.INFO, | ||
| format="%(levelname)s - %(message)s" | ||
| ) | ||
| extract_text_from_xml(sys.argv[1]) | ||
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| import bz2 | ||
|
|
||
| xml_content = """<?xml version="1.0" encoding="UTF-8"?> | ||
| <mediawiki> | ||
| <page> | ||
| <revision> | ||
| <text> | ||
| Hello <ref>citation</ref> world. | ||
| This is [[Python|programming language]] | ||
| {{Wikipedia }}is a free online encyclopedia. | ||
| </text> | ||
| </revision> | ||
| </page> | ||
| </mediawiki> | ||
| """ | ||
|
|
||
| with bz2.open("examples/sample_wiki.xml.bz2", "wt", encoding="utf-8") as f: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hardcoded CWD-relative path will break when the script is not run from the project root.
🔧 Proposed fix-with bz2.open("examples/sample_wiki.xml.bz2", "wt", encoding="utf-8") as f:
+with bz2.open(Path(__file__).parent / "sample_wiki.xml.bz2", "wt", encoding="utf-8") as f:Also add 🤖 Prompt for AI Agents |
||
| f.write(xml_content) | ||
tani-dubey marked this conversation as resolved.
Show resolved
Hide resolved
|
||
This file was deleted.
This file was deleted.
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,156 @@ | ||||||||||
| import bz2 | ||||||||||
| import re | ||||||||||
| import defusedxml.ElementTree as ET | ||||||||||
| from pathlib import Path | ||||||||||
| import sys | ||||||||||
| from typing import Union | ||||||||||
| import hashlib | ||||||||||
| import logging | ||||||||||
| import json | ||||||||||
| import platform | ||||||||||
|
|
||||||||||
| logger = logging.getLogger(__name__) | ||||||||||
|
|
||||||||||
| # extract clean wikipage from actual wikipage | ||||||||||
def extract_text_from_xml(input_path):
    """
    Process a compressed Wikipedia XML dump into cleaned plain text.

    The dump is streamed element by element. For every element whose tag
    ends with "page", the first nested <text> element is located, its
    content is cleaned using `clean_wikitext()`, and any non-empty result
    is written to a single output text file followed by a blank line.

    After the text file has been written, `generate_manifest()` is called
    with the input and output paths. The completion message is logged only
    once that call has returned, so it is not emitted when manifest
    generation raises.

    Parameters
    ----------
    input_path : str or Path
        Path to the compressed Wikipedia XML (.bz2) dump file.

    Output
    ------
    Creates (overwriting any existing file), relative to the current
    working directory:
        data/processed/wiki_clean.txt
    """
    input_path = Path(input_path)

    # Fixed output path, resolved against the current working directory.
    project_root = Path.cwd()
    output_dir = project_root / "data" / "processed"
    output_dir.mkdir(parents=True, exist_ok=True)

    output_path = output_dir / "wiki_clean.txt"

    with bz2.open(input_path, "rb") as f:
        context = ET.iterparse(f, events=("end",))

        with open(output_path, "w", encoding="utf-8") as out:
            for _, elem in context:
                # Only act once a whole page element has been parsed.
                if not elem.tag.endswith("page"):
                    continue

                # "{*}" matches the <text> element in any XML namespace.
                text_elem = elem.find(".//{*}text")

                if text_elem is not None and text_elem.text:
                    cleaned = clean_wikitext(text_elem.text)
                    if cleaned:
                        out.write(cleaned + "\n\n")

                # Drop the finished page's children so memory stays bounded
                # while streaming. This must happen only at page level:
                # clearing every element would empty <text> before its page
                # is processed.
                elem.clear()

    # Write the manifest first so that "complete" is only reported when
    # every step of the pipeline has succeeded.
    generate_manifest(input_path, output_path)
    logger.info("Preprocessing complete. Output saved to %s", output_path)
|
Comment on lines
+59
to
+60
Contributor
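There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Log message emitted before `generate_manifest()` runs: "Preprocessing complete" is logged even if manifest generation subsequently fails.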
🔧 Proposed fix- logger.info("Preprocessing complete. Output saved to %s", output_path)
- generate_manifest(input_path,output_path)
+ generate_manifest(input_path, output_path)
+ logger.info("Preprocessing complete. Output: %s", output_path)🤖 Prompt for AI Agents |
||||||||||
|
|
||||||||||
| # generate data manifest | ||||||||||
| def generate_manifest(raw_path, processed_path): | ||||||||||
| raw_path = Path(raw_path) | ||||||||||
| processed_path = Path(processed_path) | ||||||||||
|
|
||||||||||
| if not processed_path.exists(): | ||||||||||
| raise FileNotFoundError( | ||||||||||
| f"Processed file not found at {processed_path}. Run preprocessing first." | ||||||||||
| ) | ||||||||||
|
Comment on lines
+67
to
+70
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧹 Nitpick | 🔵 Trivial Ruff TRY003: move the exception message into a custom exception class. The long inline message in ♻️ Proposed fix if not processed_path.exists():
- raise FileNotFoundError(
- f"Processed file not found at {processed_path}. Run preprocessing first."
- )
+ raise FileNotFoundError(f"Processed file not found: {processed_path}")🧰 Tools🪛 Ruff (0.15.1)[warning] 68-70: Avoid specifying long messages outside the exception class (TRY003) 🤖 Prompt for AI Agents |
||||||||||
|
|
||||||||||
| manifest = { | ||||||||||
| "wikipedia_dump": raw_path.name, | ||||||||||
| "dump_date": extract_dump_date(raw_path.name), | ||||||||||
| "raw_sha256": compute_sha256(str(raw_path)), | ||||||||||
| "processed_sha256": compute_sha256(str(processed_path)), | ||||||||||
|
Comment on lines
+75
to
+76
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🧹 Nitpick | 🔵 Trivial Unnecessary `str()` conversion: `compute_sha256` already accepts `Union[str, Path]`. ♻️ Proposed fix
- "raw_sha256": compute_sha256(str(raw_path)),
- "processed_sha256": compute_sha256(str(processed_path)),
+ "raw_sha256": compute_sha256(raw_path),
+ "processed_sha256": compute_sha256(processed_path),📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||
| "preprocessing_version": "v1", | ||||||||||
| "python_version": platform.python_version() | ||||||||||
| } | ||||||||||
| project_root = Path.cwd() | ||||||||||
| manifest_path = project_root / "data" / "dataset_manifest.json" | ||||||||||
| manifest_path.parent.mkdir(parents=True, exist_ok=True) | ||||||||||
|
|
||||||||||
| with open(manifest_path, "w") as f: | ||||||||||
| json.dump(manifest, f, indent=2) | ||||||||||
|
Comment on lines
+84
to
+85
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Missing `encoding="utf-8"` when opening the manifest file. On Windows, where the default locale encoding is not UTF-8, non-ASCII characters in any manifest field (e.g., a dump filename with non-ASCII characters) would silently corrupt the JSON output. 🔧 Proposed fix
- with open(manifest_path, "w") as f:
+ with open(manifest_path, "w", encoding="utf-8") as f:📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||
|
|
||||||||||
| logger.info("Manifest written to %s", manifest_path) | ||||||||||
|
|
||||||||||
| # helpers | ||||||||||
def compute_sha256(file_path: Union[str, Path]) -> str:
    """
    Return the SHA256 digest of a file's contents.

    The file is read in binary mode in fixed-size blocks, so the whole
    file is never held in memory at once. The resulting digest serves as
    a deterministic fingerprint for reproducibility and verification.

    Parameters
    ----------
    file_path : Union[str, Path]
        Location of the file to hash (string or Path).

    Returns
    -------
    str
        Hexadecimal SHA256 digest of the file's bytes.
    """
    digest = hashlib.sha256()

    with Path(file_path).open("rb") as stream:
        # iter() with a sentinel keeps calling read() until it returns
        # b"", which signals end of file.
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)

    return digest.hexdigest()
|
|
||||||||||
| def extract_dump_date(filename: str): | ||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧹 Nitpick | 🔵 Trivial
♻️ Proposed fix-def extract_dump_date(filename: str):
+def extract_dump_date(filename: str) -> str:📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||
| parts = filename.split("-") | ||||||||||
| for part in parts: | ||||||||||
| if part.isdigit() and len(part) == 8: | ||||||||||
| return f"{part[:4]}-{part[4:6]}-{part[6:]}" | ||||||||||
| return "unknown" | ||||||||||
|
|
||||||||||
def clean_wikitext(text: str) -> str:
    """
    Basic deterministic wikitext cleaning.

    Steps, applied in order:
      1. Remove ``{{...}}`` templates (may span lines).
      2. Remove self-closing ``<ref ... />`` tags.
      3. Remove paired ``<ref ...>...</ref>`` blocks (may span lines).
      4. Remove any remaining ``<...>`` tags.
      5. Replace piped links ``[[target|label]]`` with ``label``.
      6. Replace plain links ``[[target]]`` with ``target``.
      7. Collapse whitespace runs to a single space and strip the ends.

    Note:
        This uses simple regex-based rules for speed and consistency.
        It does NOT fully parse MediaWiki syntax.

    Limitations:
        - Deeply nested templates may not be fully removed.
        - Links nested inside other links are not fully resolved.
        - This is not a complete MediaWiki parser.

    These limitations are acceptable for lightweight, deterministic preprocessing.

    Parameters
    ----------
    text : str
        Raw wikitext of a single page revision.

    Returns
    -------
    str
        Cleaned single-line text; empty string if nothing remains.
    """
    text = re.sub(r"\{\{.*?\}\}", "", text, flags=re.DOTALL)

    # Self-closing refs go first. Otherwise the opening-tag pattern would
    # accept "<ref name=x />" as an opener and delete all text up to the
    # next unrelated "</ref>". "\b" keeps "<references />" from being
    # treated as a ref.
    text = re.sub(r"<ref\b[^>]*/\s*>", "", text)
    text = re.sub(r"<ref\b[^>]*>.*?</ref>", "", text, flags=re.DOTALL)
    text = re.sub(r"<.*?>", "", text)

    # The link target may not contain brackets or a pipe, so a match cannot
    # start at an earlier unpiped link and run on to a later pipe
    # (e.g. "[[A]] and [[B|c]]" must yield "A and c", not "c").
    text = re.sub(r"\[\[[^\[\]|\n]*\|(.*?)\]\]", r"\1", text)
    text = re.sub(r"\[\[(.*?)\]\]", r"\1", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()
|
|
||||||||||
| if __name__ == "__main__": | ||||||||||
| if len(sys.argv) < 2: | ||||||||||
| print("Usage: python -m openverifiablellm.utils <input_dump>") | ||||||||||
| sys.exit(1) | ||||||||||
|
|
||||||||||
tani-dubey marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||
| logging.basicConfig( | ||||||||||
| level=logging.INFO, | ||||||||||
| format="%(levelname)s - %(message)s" | ||||||||||
| ) | ||||||||||
| extract_text_from_xml(sys.argv[1]) | ||||||||||
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Discourage NatSpec globally conflicts with Solidity best practices.
`reviews.instructions` applies to all file types. NatSpec is the standard documentation format for Solidity contracts (used by `solc --userdoc`/`--devdoc` and block explorers like Etherscan). Discouraging it globally will produce review comments suppressing documentation in `.sol` files. Scope this instruction to non-Solidity files, or remove the NatSpec mention:
🔧 Proposed fix
📝 Committable suggestion
🤖 Prompt for AI Agents