From 4e36e156b51cac2256ff881dff4c34da720081c7 Mon Sep 17 00:00:00 2001 From: tanii1125 Date: Fri, 20 Feb 2026 01:14:07 +0530 Subject: [PATCH 1/7] feat: add deterministic preprocessing and dataset identity tracking --- .gitignore | 3 + README.md | 222 ++++++------------ .../generate_manifest.cpython-313.pyc | Bin 0 -> 1915 bytes .../__pycache__/hash_utils.cpython-313.pyc | Bin 0 -> 1053 bytes scripts/generate_manifest.py | 38 +++ scripts/hash_utils.py | 13 + scripts/preprocess.py | 107 +++++++++ 7 files changed, 238 insertions(+), 145 deletions(-) create mode 100644 scripts/__pycache__/generate_manifest.cpython-313.pyc create mode 100644 scripts/__pycache__/hash_utils.cpython-313.pyc create mode 100644 scripts/generate_manifest.py create mode 100644 scripts/hash_utils.py create mode 100644 scripts/preprocess.py diff --git a/.gitignore b/.gitignore index 9308a4b..3846644 100644 --- a/.gitignore +++ b/.gitignore @@ -324,3 +324,6 @@ TSWLatexianTemp* # option is specified. Footnotes are the stored in a file with suffix Notes.bib. # Uncomment the next line to have this generated file ignored. #*Notes.bib + +dataset_manifest.json +data \ No newline at end of file diff --git a/README.md b/README.md index 3c5adf2..365984d 100644 --- a/README.md +++ b/README.md @@ -48,192 +48,124 @@ ---
-

TODO: Project Name

+

OpenVerifiableLLM – Deterministic Dataset Pipeline

-[TODO](https://TODO.stability.nexus/) is a ... TODO: Project Description. +OpenVerifiableLLM is a deterministic Wikipedia preprocessing and dataset verification pipeline designed to support fully reproducible LLM training. ---- - -## ๐Ÿš€ Features - -TODO: List your main features here: - -- **Feature 1**: Description -- **Feature 2**: Description -- **Feature 3**: Description -- **Feature 4**: Description - ---- - -## ๐Ÿ’ป Tech Stack - -TODO: Update based on your project - -### Frontend -- React / Next.js / Flutter / React Native -- TypeScript -- TailwindCSS +It ensures that: -### Backend -- Flask / FastAPI / Node.js / Supabase -- Database: PostgreSQL / SQLite / MongoDB - -### AI/ML (if applicable) -- LangChain / LangGraph / LlamaIndex -- Google Gemini / OpenAI / Anthropic Claude -- Vector Database: Weaviate / Pinecone / Chroma -- RAG / Prompt Engineering / Agent Frameworks - -### Blockchain (if applicable) -- Solidity / solana / cardano / ergo Smart Contracts -- Hardhat / Truffle / foundry -- Web3.js / Ethers.js / Wagmi -- OpenZeppelin / alchemy / Infura - ---- - -## โœ… Project Checklist - -TODO: Complete applicable items based on your project type - -- [ ] **The protocol** (if applicable): - - [ ] has been described and formally specified in a paper. - - [ ] has had its main properties mathematically proven. - - [ ] has been formally verified. -- [ ] **The smart contracts** (if applicable): - - [ ] were thoroughly reviewed by at least two knights of The Stable Order. - - [ ] were deployed to: [Add deployment details] -- [ ] **The mobile app** (if applicable): - - [ ] has an _About_ page containing the Stability Nexus's logo and pointing to the social media accounts of the Stability Nexus. - - [ ] is available for download as a release in this repo. - - [ ] is available in the relevant app stores. -- [ ] **The AI/ML components** (if applicable): - - [ ] LLM/model selection and configuration are documented. 
- - [ ] Prompts and system instructions are version-controlled. - - [ ] Content safety and moderation mechanisms are implemented. - - [ ] API keys and rate limits are properly managed. +- The same Wikipedia dump always produces identical processed output. +- Dataset fingerprints (SHA256 hashes) are generated for verification. +- A manifest file captures dataset identity and environment metadata. --- -## ๐Ÿ”— Repository Links - -TODO: Update with your repository structure +## ๐Ÿš€ Features -1. [Main Repository](https://github.com/AOSSIE-Org/TODO) -2. [Frontend](https://github.com/AOSSIE-Org/TODO/tree/main/frontend) (if separate) -3. [Backend](https://github.com/AOSSIE-Org/TODO/tree/main/backend) (if separate) +- **Deterministic Wikipedia preprocessing** +- **Wikitext cleaning (templates, references, links removed)** +- **Stable XML parsing with memory-efficient streaming** +- **SHA256 hashing of raw and processed datasets** +- **Automatic dataset manifest generation** +- **Reproducible data identity tracking** --- -## ๐Ÿ—๏ธ Architecture Diagram - -TODO: Add your system architecture diagram here - -``` -[Architecture Diagram Placeholder] -``` - -You can create architecture diagrams using: -- [Draw.io](https://draw.io) -- [Excalidraw](https://excalidraw.com) -- [Lucidchart](https://lucidchart.com) -- [Mermaid](https://mermaid.js.org) (for code-based diagrams) +## ๐Ÿ’ป Tech Stack -Example structure to include: -- Frontend components -- Backend services -- Database architecture -- External APIs/services -- Data flow between components +- Python 3.9+ +- `xml.etree.ElementTree` (stream parsing) +- `bz2` (compressed dump handling) +- `hashlib` (SHA256 hashing) +- `pathlib` +- `re` (deterministic cleaning) --- -## ๐Ÿ”„ User Flow - -TODO: Add user flow diagrams showing how users interact with your application - -``` -[User Flow Diagram Placeholder] +## ๐Ÿ“‚ Project Structure + +```text +OpenVerifiableLLM/ +โ”‚ +โ”œโ”€โ”€ scripts/ +โ”‚ โ”œโ”€โ”€ preprocess.py +โ”‚ โ”œโ”€โ”€ 
generate_manifest.py +โ”‚ โ””โ”€โ”€ hash_utils.py +โ”‚ +โ”œโ”€โ”€ data/ +โ”‚ โ”œโ”€โ”€ raw/ +โ”‚ โ””โ”€โ”€ processed/ +โ”‚ +โ””โ”€โ”€ dataset_manifest.json ``` -### Key User Journeys - -TODO: Document main user flows: - -1. **User Journey 1**: Description - - Step 1 - - Step 2 - - Step 3 - -2. **User Journey 2**: Description - - Step 1 - - Step 2 - - Step 3 - -3. **User Journey 3**: Description - - Step 1 - - Step 2 - - Step 3 - --- ## ๏ฟฝ๐Ÿ€ Getting Started ### Prerequisites -TODO: List what developers need installed +- Python 3.9+ +- Wikipedia dump from: + https://dumps.wikimedia.org/ -- Node.js 18+ / Python 3.9+ / Flutter SDK -- npm / yarn / pnpm -- [Any specific tools or accounts needed] +Recommended for testing: +- `simplewiki-YYYYMMDD-pages-articles.xml.bz2` -### Installation +--- -TODO: Provide detailed setup instructions +### Installation #### 1. Clone the Repository ```bash -git clone https://github.com/AOSSIE-Org/TODO.git -cd TODO +git clone https://github.com/AOSSIE-Org/OpenVerifiableLLM.git +cd OpenVerifiableLLM ``` -#### 2. Install Dependencies +### โ–ถ Running the Pipeline + +#### Step 1 โ€” Place Dump File +Move your Wikipedia dump into: ```bash -npm install -# or -yarn install -# or -pnpm install +data/raw/ ``` -#### 3. Configure Environment Variables(.env.example) +Example: -Create a `.env` file in the root directory: - -```env -# Add your environment variables here -API_KEY=your_api_key -DATABASE_URL=your_database_url +```bash +data/raw/simplewiki-20260201-pages-articles.xml.bz2 ``` - -#### 4. Run the Development Server +#### Step 2 โ€” Run Preprocessing ```bash -npm run dev -# or -yarn dev -# or -pnpm dev +python scripts/preprocess.py data/raw/simplewiki-20260201-pages-articles.xml.bz2 ``` - -#### 5. Open your Browser - -Navigate to [http://localhost:3000](http://localhost:3000) to see the application. - -For detailed setup instructions, please refer to our [Installation Guide](./docs/INSTALL_GUIDE.md) (if you have one). 
+This will: +- Create `data/processed/wiki_clean.txt` +- Generate `dataset_manifest.json` +- Compute `SHA256` hashes + +#### ๐Ÿ“œ Example Manifest + +```json +{ + "wikipedia_dump": "simplewiki-20260201-pages-articles.xml.bz2", + "dump_date": "2026-02-01", + "raw_sha256": "...", + "processed_sha256": "...", + "preprocessing_version": "v1", + "python_version": "3.13.2" +} +``` +## ๐Ÿ“ˆ Future Extensions +- Deterministic tokenization stage +- Token-level hashing +- Multi-GPU training reproducibility +- Environment containerization (Docker) +- Full checkpoint verification protocol --- diff --git a/scripts/__pycache__/generate_manifest.cpython-313.pyc b/scripts/__pycache__/generate_manifest.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..26c58ea894fad36fb8981a06a8faa6bbe404c955 GIT binary patch literal 1915 zcma(R?@JqJ__-gsTyh$Lw5H7iBzP>%`s|604wq!dwBej~T+aJWE)_>=kaDC%jDWxGTPKfe1+R z)Bd+v#TSK}JP{Qk3M4SY#Qi(eY%Ey^e;%urowFq2F>jEw>q>?AuuzdQm%b-t&Lvl#5PC_u{RDNeGU zHH96X0&q2Y0VZj{91kIU4{1)}8#J!~rO){+R)*6cfebUgxR7+1{v8^@4rUm31Uwvn z$+%?{=ZrY#_$;%a+m2tivib}X9L68#2zA8qnJTd@hj$STLzZP{AUiz$fvFiEX++QI z>WzYS_3E{0D?@bCwx(w^LnErKrDs(`&uNyOFc+LQZQdqo#!hETvt~Mr*W?&lqfS|$ z;IRZ-O&!b0d&#wP)uvNRm$zHPpYvN%%ks#*k>$|`qu*r8r@z)K1K%23k?7js+IuVc zm5Ba%+;llo}a&{gprP3&+3@Ms^7bd6|M<9l!^*oSio4M0*SSi))@&A7B zeV=Oa^ipje^(wyq)M_-cr={>IS}zwSIy~wXj)B+EiZ3A;Z0a@dP7xIUP(Al77+mVO zcQb+QeiULb)uo6^;3DVFGS0L2EoQDU8cNXf3mY6dp$~fMhO$`695(+cknxv{zRR70 z921w4i*I1lsb@`6%xIRSWzWv(xAb(Tps7Z}p0^h-UM8eShUM#?f=u^-Y!q!dS1cLX zggj9)WRqxh3%W6b8U(w02cHCoy)zIOoDj8YYFS-%8{G*s+T95fbuPWXXq?uD8=fS5 z$a4CQMl8K(I3aVv&KC`jUkuYAEX{s3TQ@CKW9D{fhH;e%x{=ikdr`htA3oU|$E<9X zGBoyFsZdx*#6yIRvm{|;s?I^Pyo-RRmN(o&Xcx4kuGz+SoE0S4q zlFIlZt^tR4o#1J#ao!PWpeAls^xjM6J}-J3!C`V1wTa6_CbT!5HqXsQWF&#|R{UGP z;;{s^miCpg2cIpCy$~Q0U5{R@hKHAue@G`bCHb)=Z%VPpQmiI6uZP}$B97NYX<584 zuC#q2;R<-Y>tX*_{k8C!^8Al?zq?xv57i=_KXjGz-}hG|Z`Dq|`7>8ZR(sy5o*dcg z?5;&)o6&)d=s-1k?s>DnRlqloK`^u)>MmPP0=<>nHL+@)H};DW%&S&D 
zU9$Cp<>s7pQ$;#>ZZ>`b(oShNW8%ncF7wluS?5l=l* zFK{7p#Au@4{1^NWG)2ULn)Hv3bT&n_;_!zgV;)NEyWgQy?@Po`J!ak>723G0sRydx+ zEDYl?1?M_po%NL!WEf*O|F66%jzJbFc$o?$opF%4b>oMD$0Og;Rrl2*a*2)FjT#D>v5)AF2C8A8_cz&7!RO zpTtj_cllCT2#fE-zU#|oAb;U*7S;HL`jGta4`EaQwrPqBUpYWN61bDdh9=`#vQOp_u literal 0 HcmV?d00001 diff --git a/scripts/generate_manifest.py b/scripts/generate_manifest.py new file mode 100644 index 0000000..058098c --- /dev/null +++ b/scripts/generate_manifest.py @@ -0,0 +1,38 @@ +import json +import sys +import platform +from pathlib import Path +from hash_utils import sha256_file + + +def extract_dump_date(filename: str): + parts = filename.split("-") + for part in parts: + if part.isdigit() and len(part) == 8: + return f"{part[:4]}-{part[4:6]}-{part[6:]}" + return "unknown" + + +def generate_manifest(raw_path, processed_path): + raw_path = Path(raw_path) + + # Automatically infer processed file path + processed_path = Path("data/processed/wiki_clean.txt") + + if not processed_path.exists(): + print("Error: Processed file not found. 
Run preprocessing first.") + sys.exit(1) + + manifest = { + "wikipedia_dump": raw_path.name, + "dump_date": extract_dump_date(raw_path.name), + "raw_sha256": sha256_file(str(raw_path)), + "processed_sha256": sha256_file(str(processed_path)), + "preprocessing_version": "v1", + "python_version": platform.python_version() + } + + with open("dataset_manifest.json", "w") as f: + json.dump(manifest, f, indent=2) + + print("Manifest generated successfully.") diff --git a/scripts/hash_utils.py b/scripts/hash_utils.py new file mode 100644 index 0000000..88a2cb5 --- /dev/null +++ b/scripts/hash_utils.py @@ -0,0 +1,13 @@ +import hashlib +import sys + +def sha256_file(filepath): + sha256 = hashlib.sha256() + with open(filepath, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + sha256.update(chunk) + return sha256.hexdigest() + +if __name__ == "__main__": + path = sys.argv[1] + print(sha256_file(path)) \ No newline at end of file diff --git a/scripts/preprocess.py b/scripts/preprocess.py new file mode 100644 index 0000000..4cf5cba --- /dev/null +++ b/scripts/preprocess.py @@ -0,0 +1,107 @@ +# import bz2 +# import re +# import xml.etree.ElementTree as ET +# from pathlib import Path +# import sys + + +# def clean_wikitext(text: str) -> str: +# # Remove templates {{...}} +# text = re.sub(r"\{\{.*?\}\}", "", text, flags=re.DOTALL) + +# # Remove references ... 
+# text = re.sub(r".*?", "", text, flags=re.DOTALL) + +# # Remove HTML tags +# text = re.sub(r"<.*?>", "", text) + +# # Convert [[Link|Text]] โ†’ Text +# text = re.sub(r"\[\[.*?\|(.*?)\]\]", r"\1", text) + +# # Convert [[Link]] โ†’ Link +# text = re.sub(r"\[\[(.*?)\]\]", r"\1", text) + +# # Remove multiple spaces/newlines +# text = re.sub(r"\s+", " ", text) + +# return text.strip() + + +# def extract_text_from_xml(input_path, output_path): +# Path(output_path).parent.mkdir(parents=True, exist_ok=True) + +# with bz2.open(input_path, "rb") as f: +# context = ET.iterparse(f, events=("end",)) + +# with open(output_path, "w", encoding="utf-8") as out: +# for event, elem in context: +# if elem.tag.endswith("page"): +# text_elem = elem.find(".//{*}text") + +# if text_elem is not None and text_elem.text: +# cleaned = clean_wikitext(text_elem.text) +# if cleaned: +# out.write(cleaned + "\n\n") + +# elem.clear() + + +# if __name__ == "__main__": +# input_path = sys.argv[1] +# output_path = sys.argv[2] +# extract_text_from_xml(input_path, output_path) +# print("Preprocessing complete.") + +import bz2 +import re +import xml.etree.ElementTree as ET +from pathlib import Path +import sys +from generate_manifest import generate_manifest + + +def clean_wikitext(text: str) -> str: + text = re.sub(r"\{\{.*?\}\}", "", text, flags=re.DOTALL) + text = re.sub(r".*?", "", text, flags=re.DOTALL) + text = re.sub(r"<.*?>", "", text) + text = re.sub(r"\[\[.*?\|(.*?)\]\]", r"\1", text) + text = re.sub(r"\[\[(.*?)\]\]", r"\1", text) + text = re.sub(r"\s+", " ", text) + return text.strip() + + +def extract_text_from_xml(input_path): + input_path = Path(input_path) + + # Fixed output path + output_dir = Path("data/processed") + output_dir.mkdir(parents=True, exist_ok=True) + + output_path = output_dir / "wiki_clean.txt" + + with bz2.open(input_path, "rb") as f: + context = ET.iterparse(f, events=("end",)) + + with open(output_path, "w", encoding="utf-8") as out: + for event, elem in context: + if 
elem.tag.endswith("page"): + text_elem = elem.find(".//{*}text") + + if text_elem is not None and text_elem.text: + cleaned = clean_wikitext(text_elem.text) + if cleaned: + out.write(cleaned + "\n\n") + + elem.clear() + + print(f"Preprocessing complete. Output saved to {output_path}") + + # ๐Ÿ”ฅ Automatically generate manifest + generate_manifest(input_path, output_path) + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python preprocess.py ") + sys.exit(1) + + extract_text_from_xml(sys.argv[1]) From 63182df505a913da8681f9a3529fed7e4b3a8b87 Mon Sep 17 00:00:00 2001 From: tanii1125 Date: Sat, 21 Feb 2026 13:21:17 +0530 Subject: [PATCH 2/7] updated .coderabbit.yaml --- .coderabbit.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.coderabbit.yaml b/.coderabbit.yaml index 877065c..19d6c04 100644 --- a/.coderabbit.yaml +++ b/.coderabbit.yaml @@ -157,6 +157,8 @@ reviews: - Confirm that the code meets the project's requirements and objectives - Confirm that copyright years are up-to date whenever a file is changed - Point out redundant obvious comments that do not add clarity to the code + - Ensure that comments are concise and suggest more concise comment statements if possible + - Discourage usage of verbose comment styles such as NatSpec - Look for code duplication - Suggest code completions when: - seeing a TODO comment @@ -275,4 +277,4 @@ reviews: - Image optimization (appropriate size and format) - Proper @2x and @3x variants for different screen densities - SVG assets are optimized - - Font files are licensed and optimized + - Font files are licensed and optimized \ No newline at end of file From 32215759ce1b63a7ad41f75a120beb3b1ae30c31 Mon Sep 17 00:00:00 2001 From: tanii1125 Date: Sat, 21 Feb 2026 15:57:41 +0530 Subject: [PATCH 3/7] resolved coderabbitai reviews --- .gitignore | 3 +- README.md | 13 ++- pytest.ini | 2 + requirements.txt | 2 + scripts/__init__.py | 0 scripts/generate_manifest.py | 20 +++-- 
scripts/hash_utils.py | 7 +- scripts/preprocess.py | 81 +++++------------- ...rate_manifest.cpython-313-pytest-9.0.2.pyc | Bin 0 -> 1229 bytes tests/test_generate_manifest.py | 21 +++++ 10 files changed, 76 insertions(+), 73 deletions(-) create mode 100644 pytest.ini create mode 100644 requirements.txt create mode 100644 scripts/__init__.py create mode 100644 tests/__pycache__/test_generate_manifest.cpython-313-pytest-9.0.2.pyc create mode 100644 tests/test_generate_manifest.py diff --git a/.gitignore b/.gitignore index 3846644..1d7036a 100644 --- a/.gitignore +++ b/.gitignore @@ -325,5 +325,4 @@ TSWLatexianTemp* # Uncomment the next line to have this generated file ignored. #*Notes.bib -dataset_manifest.json -data \ No newline at end of file +data/ \ No newline at end of file diff --git a/README.md b/README.md index 365984d..0a68f5e 100644 --- a/README.md +++ b/README.md @@ -108,9 +108,10 @@ OpenVerifiableLLM/ - Python 3.9+ - Wikipedia dump from: - https://dumps.wikimedia.org/ + Recommended for testing: + - `simplewiki-YYYYMMDD-pages-articles.xml.bz2` --- @@ -127,6 +128,7 @@ cd OpenVerifiableLLM ### โ–ถ Running the Pipeline #### Step 1 โ€” Place Dump File + Move your Wikipedia dump into: ```bash @@ -138,12 +140,15 @@ Example: ```bash data/raw/simplewiki-20260201-pages-articles.xml.bz2 ``` + #### Step 2 โ€” Run Preprocessing ```bash -python scripts/preprocess.py data/raw/simplewiki-20260201-pages-articles.xml.bz2 +python -m scripts.preprocess ``` + This will: + - Create `data/processed/wiki_clean.txt` - Generate `dataset_manifest.json` - Compute `SHA256` hashes @@ -160,7 +165,9 @@ This will: "python_version": "3.13.2" } ``` + ## ๐Ÿ“ˆ Future Extensions + - Deterministic tokenization stage - Token-level hashing - Multi-GPU training reproducibility @@ -209,4 +216,4 @@ Thanks a lot for spending your time helping TODO grow. 
Keep rocking ๐Ÿฅ‚ [![Contributors](https://contrib.rocks/image?repo=AOSSIE-Org/TODO)](https://github.com/AOSSIE-Org/TODO/graphs/contributors) -ยฉ 2025 AOSSIE +ยฉ 2025 AOSSIE diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..03f586d --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +pythonpath = . \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..634c013 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +defusedxml>=0.7.1 +pytest \ No newline at end of file diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/generate_manifest.py b/scripts/generate_manifest.py index 058098c..8e73c49 100644 --- a/scripts/generate_manifest.py +++ b/scripts/generate_manifest.py @@ -1,9 +1,10 @@ import json -import sys import platform from pathlib import Path -from hash_utils import sha256_file +from .hash_utils import sha256_file +# Anchor paths to project root (two levels up from this file) +_PROJECT_ROOT = Path(__file__).resolve().parent.parent def extract_dump_date(filename: str): parts = filename.split("-") @@ -15,13 +16,12 @@ def extract_dump_date(filename: str): def generate_manifest(raw_path, processed_path): raw_path = Path(raw_path) - - # Automatically infer processed file path - processed_path = Path("data/processed/wiki_clean.txt") + processed_path = Path(processed_path) if not processed_path.exists(): - print("Error: Processed file not found. Run preprocessing first.") - sys.exit(1) + raise FileNotFoundError( + f"Processed file not found at {processed_path}. Run preprocessing first." 
+ ) manifest = { "wikipedia_dump": raw_path.name, @@ -32,7 +32,9 @@ def generate_manifest(raw_path, processed_path): "python_version": platform.python_version() } - with open("dataset_manifest.json", "w") as f: + manifest_path = _PROJECT_ROOT / "dataset_manifest.json" + + with open(manifest_path, "w") as f: json.dump(manifest, f, indent=2) - print("Manifest generated successfully.") + print(f"Manifest written to {manifest_path}") \ No newline at end of file diff --git a/scripts/hash_utils.py b/scripts/hash_utils.py index 88a2cb5..6106807 100644 --- a/scripts/hash_utils.py +++ b/scripts/hash_utils.py @@ -3,11 +3,16 @@ def sha256_file(filepath): sha256 = hashlib.sha256() + chunk_size = 1024 * 1024 # 1 MiB for better throughput + with open(filepath, "rb") as f: - for chunk in iter(lambda: f.read(4096), b""): + for chunk in iter(lambda: f.read(chunk_size), b""): sha256.update(chunk) return sha256.hexdigest() if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python hash_utils.py ") + sys.exit(1) path = sys.argv[1] print(sha256_file(path)) \ No newline at end of file diff --git a/scripts/preprocess.py b/scripts/preprocess.py index 4cf5cba..236a6cc 100644 --- a/scripts/preprocess.py +++ b/scripts/preprocess.py @@ -1,66 +1,31 @@ -# import bz2 -# import re -# import xml.etree.ElementTree as ET -# from pathlib import Path -# import sys - - -# def clean_wikitext(text: str) -> str: -# # Remove templates {{...}} -# text = re.sub(r"\{\{.*?\}\}", "", text, flags=re.DOTALL) - -# # Remove references ... 
-# text = re.sub(r".*?", "", text, flags=re.DOTALL) - -# # Remove HTML tags -# text = re.sub(r"<.*?>", "", text) - -# # Convert [[Link|Text]] โ†’ Text -# text = re.sub(r"\[\[.*?\|(.*?)\]\]", r"\1", text) - -# # Convert [[Link]] โ†’ Link -# text = re.sub(r"\[\[(.*?)\]\]", r"\1", text) - -# # Remove multiple spaces/newlines -# text = re.sub(r"\s+", " ", text) - -# return text.strip() - - -# def extract_text_from_xml(input_path, output_path): -# Path(output_path).parent.mkdir(parents=True, exist_ok=True) - -# with bz2.open(input_path, "rb") as f: -# context = ET.iterparse(f, events=("end",)) - -# with open(output_path, "w", encoding="utf-8") as out: -# for event, elem in context: -# if elem.tag.endswith("page"): -# text_elem = elem.find(".//{*}text") - -# if text_elem is not None and text_elem.text: -# cleaned = clean_wikitext(text_elem.text) -# if cleaned: -# out.write(cleaned + "\n\n") - -# elem.clear() - - -# if __name__ == "__main__": -# input_path = sys.argv[1] -# output_path = sys.argv[2] -# extract_text_from_xml(input_path, output_path) -# print("Preprocessing complete.") - import bz2 import re -import xml.etree.ElementTree as ET +import defusedxml.ElementTree as ET from pathlib import Path import sys -from generate_manifest import generate_manifest +from .generate_manifest import generate_manifest def clean_wikitext(text: str) -> str: + """ + Basic deterministic wikitext cleaning. + + NOTE: + This implementation intentionally uses regex-based approximations + for performance and determinism. It does NOT fully parse MediaWiki syntax. + + Known limitations: + - Nested templates like {{Infobox | birth={{Date|1990|1|1}}}} + are not fully handled. The non-greedy template regex may leave + stray closing braces (e.g., "}}") in deeply nested structures. + - Self-closing references such as are only partially + handled. While generic tag stripping removes the tag itself, + complex edge cases may not be fully normalized. 
+ - This is not a complete MediaWiki parser and should not be relied + upon for perfectly structured wikitext normalization. + + These trade-offs are acceptable for v1 deterministic preprocessing. + """ text = re.sub(r"\{\{.*?\}\}", "", text, flags=re.DOTALL) text = re.sub(r".*?", "", text, flags=re.DOTALL) text = re.sub(r"<.*?>", "", text) @@ -83,7 +48,7 @@ def extract_text_from_xml(input_path): context = ET.iterparse(f, events=("end",)) with open(output_path, "w", encoding="utf-8") as out: - for event, elem in context: + for _, elem in context: if elem.tag.endswith("page"): text_elem = elem.find(".//{*}text") @@ -96,7 +61,7 @@ def extract_text_from_xml(input_path): print(f"Preprocessing complete. Output saved to {output_path}") - # ๐Ÿ”ฅ Automatically generate manifest + # Automatically generate manifest generate_manifest(input_path, output_path) if __name__ == "__main__": diff --git a/tests/__pycache__/test_generate_manifest.cpython-313-pytest-9.0.2.pyc b/tests/__pycache__/test_generate_manifest.cpython-313-pytest-9.0.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b295c70fab69f451015dfc335bf413a8944f83eb GIT binary patch literal 1229 zcmbVL&1(}u6o31XWb@U~)*7OOu6oH^RUXRh}FxML5J0FFZy6;vU`JQ4jcUKm*5;+f=m}$!0JUL zVifarD8Wr-6pE6e<_<&D+RS=eo*yXGDwXnvy@gT<&+qTuXz>MbS-bg*YJ9?h&Vsiy;yjR19e^Zuid_} zcF!fweM0S`ZLOEd;^N&kUWN&4o)|;ct@{%kaPr9Pe3)q$O_x^lgfUVuyJcwtD{_|E zESd%!B$7>|mAKHjbD+ld)nrReHbc9r{%QDZv(g&A*jW4$PCQ9JPVXi&dtu|fmTG2p zwW~iBh>Y!rjdx+=km11Lb!;Z8yp2X?QlVo4;iZH?D8Br!Kvv{Gib?Is0mZ;;h;>v9 zB`^Xx8o)T^655XU=@6q1<;%o!NWqY4h+{faqr4p&L*ozh@yDh`2l(OG-^i~yj8o62 zZIUe;M@WY;A&w85Lh}#ON7647t&#CY?m!*dSJN#u-L!Yr%#R=hPCwn)Q^(tw4@T72 z)@ydzvmNHgZn<@{@1@L+II|WGoqBfF$xxcUxKh%pwFj=%=QL5q1ev0L(?RqSG zLT`c{p&rrAE1;(^p+ki>RjC$gWin0E7{h-ISPIQIMUteC0FU5d`~zJ7u7;$E_K+rB H5vjib@(LrW literal 0 HcmV?d00001 diff --git a/tests/test_generate_manifest.py b/tests/test_generate_manifest.py new file mode 100644 index 0000000..987e86d --- /dev/null +++ 
b/tests/test_generate_manifest.py @@ -0,0 +1,21 @@ +import pytest +from scripts.generate_manifest import generate_manifest + +def test_generate_manifest_raises_if_processed_missing(tmp_path): + raw_file = tmp_path / "raw.txt" + raw_file.write_text("dummy") + + missing_file = tmp_path / "missing.txt" + + with pytest.raises(FileNotFoundError): + generate_manifest(raw_file, missing_file) + +def test_generate_manifest_runs_if_file_exists(tmp_path): + raw_file = tmp_path / "raw.txt" + raw_file.write_text("dummy") + + processed_file = tmp_path / "processed.txt" + processed_file.write_text("cleaned") + + # Should not raise + generate_manifest(raw_file, processed_file) \ No newline at end of file From c959087a0cac70e63673a6a4bebc14845dfd7dbc Mon Sep 17 00:00:00 2001 From: tanii1125 Date: Sun, 22 Feb 2026 22:37:34 +0530 Subject: [PATCH 4/7] Refactor: move helper functions to utils.py, written tests, and clean structure --- README.md | 81 +++++++++++++--- examples/demo_util.py | 19 ++++ examples/hash_demo.py | 12 --- examples/sample_wiki.txt | 3 - examples/sample_wiki.xml | 18 ++++ examples/sample_wiki.xml.bz2 | Bin 0 -> 236 bytes openverifiablellm/dataset_hash.py | 31 ------ openverifiablellm/utils.py | 156 ++++++++++++++++++++++++++++++ pytest.ini | 2 - scripts/__init__.py | 0 scripts/generate_manifest.py | 40 -------- scripts/hash_utils.py | 18 ---- scripts/preprocess.py | 72 -------------- tests/test_dataset_hash.py | 36 ------- tests/test_generate_manifest.py | 21 ---- tests/test_util.py | 120 +++++++++++++++++++++++ 16 files changed, 380 insertions(+), 249 deletions(-) create mode 100644 examples/demo_util.py delete mode 100644 examples/hash_demo.py delete mode 100644 examples/sample_wiki.txt create mode 100644 examples/sample_wiki.xml create mode 100644 examples/sample_wiki.xml.bz2 delete mode 100644 openverifiablellm/dataset_hash.py create mode 100644 openverifiablellm/utils.py delete mode 100644 pytest.ini delete mode 100644 scripts/__init__.py delete mode 100644 
scripts/generate_manifest.py delete mode 100644 scripts/hash_utils.py delete mode 100644 scripts/preprocess.py delete mode 100644 tests/test_dataset_hash.py delete mode 100644 tests/test_generate_manifest.py create mode 100644 tests/test_util.py diff --git a/README.md b/README.md index 0a68f5e..300a40d 100644 --- a/README.md +++ b/README.md @@ -88,16 +88,24 @@ It ensures that: ```text OpenVerifiableLLM/ โ”‚ -โ”œโ”€โ”€ scripts/ -โ”‚ โ”œโ”€โ”€ preprocess.py -โ”‚ โ”œโ”€โ”€ generate_manifest.py -โ”‚ โ””โ”€โ”€ hash_utils.py +โ”œโ”€โ”€ data/ โ† created automatically at runtime +โ”‚ โ”œโ”€โ”€ dataset_manifest.json +โ”‚ โ””โ”€โ”€ processed/ +โ”‚ โ””โ”€โ”€ wiki_clean.txt +| +โ”œโ”€โ”€ examples/ +โ”‚ โ”œโ”€โ”€ demo_util.py +โ”‚ โ”œโ”€โ”€ sample_wiki.xml +โ”‚ โ””โ”€โ”€ sample_wiki.xml.bz2 โ”‚ -โ”œโ”€โ”€ data/ -โ”‚ โ”œโ”€โ”€ raw/ -โ”‚ โ””โ”€โ”€ processed/ +โ”œโ”€โ”€ openverifiablellm/ +โ”‚ โ”œโ”€โ”€ __init__.py โ† (should exist) +โ”‚ โ””โ”€โ”€ utils.py โ”‚ -โ””โ”€โ”€ dataset_manifest.json +โ”œโ”€โ”€ tests/ +โ”‚ โ””โ”€โ”€ test_util.py +โ”‚ +โ””โ”€โ”€ requirements.txt ``` --- @@ -127,24 +135,37 @@ cd OpenVerifiableLLM ### โ–ถ Running the Pipeline -#### Step 1 โ€” Place Dump File +#### Step 1 โ€” Install the Package -Move your Wikipedia dump into: +From the project root: ```bash -data/raw/ +pip install -e . ``` +#### Step 2 โ€” Place Dump File + +Move your Wikipedia dump into the project root directory +(the same directory that contains the ```openverifiablellm/``` folder). + Example: ```bash -data/raw/simplewiki-20260201-pages-articles.xml.bz2 +simplewiki-20260201-pages-articles.xml.bz2 ``` -#### Step 2 โ€” Run Preprocessing +Copy the file path to use as the argument when running preprocessing. 
+ +Example (relative path): ```bash -python -m scripts.preprocess +simplewiki-20260201-pages-articles.xml.bz2 +``` + +#### Step 3 โ€” Run Preprocessing + +```bash +python -m openverifiablellm.utils simplewiki-20260201-pages-articles.xml.bz2 ``` This will: @@ -152,6 +173,7 @@ This will: - Create `data/processed/wiki_clean.txt` - Generate `dataset_manifest.json` - Compute `SHA256` hashes +- Log preprocessing status #### ๐Ÿ“œ Example Manifest @@ -166,6 +188,37 @@ This will: } ``` +--- + +## ๐Ÿงช Running Tests + +To verify correctness and reproducibility: + +```bash +pytest +``` + +This runs: + +- Unit tests for: + + - `clean_wikitext` + - `compute_sha256` + - `extract_dump_date` + - `generate_manifest` + +- Integration test for: + + - `extract_text_from_xml` (end-to-end pipeline using a synthetic .bz2 file) + +All tests should pass: + +```text +11 passed in 0.xx s +``` + +--- + ## ๐Ÿ“ˆ Future Extensions - Deterministic tokenization stage diff --git a/examples/demo_util.py b/examples/demo_util.py new file mode 100644 index 0000000..ac818e1 --- /dev/null +++ b/examples/demo_util.py @@ -0,0 +1,19 @@ +## run via- +## python -m examples.demo_util examples\sample_wiki.xml.bz2 + +import sys +import logging +from openverifiablellm.utils import extract_text_from_xml + +logger = logging.getLogger(__name__) + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python preprocess.py ") + sys.exit(1) + + logging.basicConfig( + level=logging.INFO, + format="%(levelname)s - %(message)s" + ) + extract_text_from_xml(sys.argv[1]) \ No newline at end of file diff --git a/examples/hash_demo.py b/examples/hash_demo.py deleted file mode 100644 index ad8842c..0000000 --- a/examples/hash_demo.py +++ /dev/null @@ -1,12 +0,0 @@ -from pathlib import Path -from openverifiablellm.dataset_hash import compute_sha256 - - -if __name__ == "__main__": - current_dir = Path(__file__).parent - dataset_path = current_dir / "sample_wiki.txt" - - dataset_hash = compute_sha256(dataset_path) - - 
print("Dataset Hash:") - print(dataset_hash) \ No newline at end of file diff --git a/examples/sample_wiki.txt b/examples/sample_wiki.txt deleted file mode 100644 index c30144b..0000000 --- a/examples/sample_wiki.txt +++ /dev/null @@ -1,3 +0,0 @@ -Wikipedia is a free online encyclopedia. -It is maintained by a community of volunteers. -This is a small reproducibility sample. diff --git a/examples/sample_wiki.xml b/examples/sample_wiki.xml new file mode 100644 index 0000000..a59646b --- /dev/null +++ b/examples/sample_wiki.xml @@ -0,0 +1,18 @@ +import bz2 + +xml_content = """ + + + + + Hello citation world. + This is [[Python|programming language]] + {{Wikipedia }}is a free online encyclopedia. + + + + +""" + +with bz2.open("examples/sample_wiki.xml.bz2", "wt", encoding="utf-8") as f: + f.write(xml_content) \ No newline at end of file diff --git a/examples/sample_wiki.xml.bz2 b/examples/sample_wiki.xml.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..ba8e6f1f5c469f6e8fbc8b554ded8fb6d4cc6f65 GIT binary patch literal 236 zcmV str: - """ - Compute SHA256 hash of a file. - - This provides a deterministic fingerprint of the dataset, - enabling reproducibility and verification. - - Parameters - ---------- - file_path : Union[str, Path] - Path to the dataset file (string or Path-like). - - Returns - ------- - str - SHA256 hash string. 
- """ - path = Path(file_path) - - sha256 = hashlib.sha256() - - with path.open("rb") as f: - while chunk := f.read(8192): - sha256.update(chunk) - - return sha256.hexdigest() \ No newline at end of file diff --git a/openverifiablellm/utils.py b/openverifiablellm/utils.py new file mode 100644 index 0000000..ba7f042 --- /dev/null +++ b/openverifiablellm/utils.py @@ -0,0 +1,156 @@ +import bz2 +import re +import defusedxml.ElementTree as ET +from pathlib import Path +import sys +from typing import Union +import hashlib +import logging +import json +import platform + +logger = logging.getLogger(__name__) + +# extract clean wikipage from actual wikipage +def extract_text_from_xml(input_path): + """ + Process a compressed Wikipedia XML dump into cleaned plain text. + + Each element is parsed, its revision text is extracted, + cleaned using `clean_wikitext()`, and appended to a single + output text file. + + The processed output is saved to: + data/processed/wiki_clean.txt + + Parameters + ---------- + input_path : str or Path + Path to the compressed Wikipedia XML (.bz2) dump file. + + Output + ------ + Creates: + data/processed/wiki_clean.txt + """ + input_path = Path(input_path) + + # Fixed output path + project_root = Path.cwd() + output_dir = project_root / "data" / "processed" + output_dir.mkdir(parents=True, exist_ok=True) + + output_path = output_dir / "wiki_clean.txt" + + with bz2.open(input_path, "rb") as f: + context = ET.iterparse(f, events=("end",)) + + with open(output_path, "w", encoding="utf-8") as out: + for _, elem in context: + if elem.tag.endswith("page"): + text_elem = elem.find(".//{*}text") + + if text_elem is not None and text_elem.text: + cleaned = clean_wikitext(text_elem.text) + if cleaned: + out.write(cleaned + "\n\n") + + elem.clear() + logger.info("Preprocessing complete. 
Output saved to %s", output_path) + generate_manifest(input_path,output_path) + +# generate data manifest +def generate_manifest(raw_path, processed_path): + raw_path = Path(raw_path) + processed_path = Path(processed_path) + + if not processed_path.exists(): + raise FileNotFoundError( + f"Processed file not found at {processed_path}. Run preprocessing first." + ) + + manifest = { + "wikipedia_dump": raw_path.name, + "dump_date": extract_dump_date(raw_path.name), + "raw_sha256": compute_sha256(str(raw_path)), + "processed_sha256": compute_sha256(str(processed_path)), + "preprocessing_version": "v1", + "python_version": platform.python_version() + } + project_root = Path.cwd() + manifest_path = project_root / "data" / "dataset_manifest.json" + manifest_path.parent.mkdir(parents=True, exist_ok=True) + + with open(manifest_path, "w") as f: + json.dump(manifest, f, indent=2) + + logger.info("Manifest written to %s", manifest_path) + +# helpers +def compute_sha256(file_path: Union[str, Path]) -> str: + """ + Compute SHA256 hash of a file. + + This provides a deterministic fingerprint of the dataset, + enabling reproducibility and verification. + + Parameters + ---------- + file_path : Union[str, Path] + Path to the dataset file (string or Path-like). + + Returns + ------- + str + SHA256 hash string. + """ + path = Path(file_path) + + sha256 = hashlib.sha256() + + with path.open("rb") as f: + while chunk := f.read(8192): + sha256.update(chunk) + + return sha256.hexdigest() + +def extract_dump_date(filename: str): + parts = filename.split("-") + for part in parts: + if part.isdigit() and len(part) == 8: + return f"{part[:4]}-{part[4:6]}-{part[6:]}" + return "unknown" + +def clean_wikitext(text: str) -> str: + """ + Basic deterministic wikitext cleaning. + + Note: + This uses simple regex-based rules for speed and consistency. + It does NOT fully parse MediaWiki syntax. + + Limitations: + - Deeply nested templates may not be fully removed. 
+ - Some complex cases may not be perfectly handled. + - This is not a complete MediaWiki parser. + + These limitations are acceptable for lightweight, deterministic preprocessing. + """ + text = re.sub(r"\{\{.*?\}\}", "", text, flags=re.DOTALL) + text = re.sub(r".*?", "", text, flags=re.DOTALL) + text = re.sub(r"<.*?>", "", text) + text = re.sub(r"\[\[.*?\|(.*?)\]\]", r"\1", text) + text = re.sub(r"\[\[(.*?)\]\]", r"\1", text) + text = re.sub(r"\s+", " ", text) + return text.strip() + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python preprocess.py ") + sys.exit(1) + + logging.basicConfig( + level=logging.INFO, + format="%(levelname)s - %(message)s" + ) + extract_text_from_xml(sys.argv[1]) diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index 03f586d..0000000 --- a/pytest.ini +++ /dev/null @@ -1,2 +0,0 @@ -[pytest] -pythonpath = . \ No newline at end of file diff --git a/scripts/__init__.py b/scripts/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/generate_manifest.py b/scripts/generate_manifest.py deleted file mode 100644 index 8e73c49..0000000 --- a/scripts/generate_manifest.py +++ /dev/null @@ -1,40 +0,0 @@ -import json -import platform -from pathlib import Path -from .hash_utils import sha256_file - -# Anchor paths to project root (two levels up from this file) -_PROJECT_ROOT = Path(__file__).resolve().parent.parent - -def extract_dump_date(filename: str): - parts = filename.split("-") - for part in parts: - if part.isdigit() and len(part) == 8: - return f"{part[:4]}-{part[4:6]}-{part[6:]}" - return "unknown" - - -def generate_manifest(raw_path, processed_path): - raw_path = Path(raw_path) - processed_path = Path(processed_path) - - if not processed_path.exists(): - raise FileNotFoundError( - f"Processed file not found at {processed_path}. Run preprocessing first." 
- ) - - manifest = { - "wikipedia_dump": raw_path.name, - "dump_date": extract_dump_date(raw_path.name), - "raw_sha256": sha256_file(str(raw_path)), - "processed_sha256": sha256_file(str(processed_path)), - "preprocessing_version": "v1", - "python_version": platform.python_version() - } - - manifest_path = _PROJECT_ROOT / "dataset_manifest.json" - - with open(manifest_path, "w") as f: - json.dump(manifest, f, indent=2) - - print(f"Manifest written to {manifest_path}") \ No newline at end of file diff --git a/scripts/hash_utils.py b/scripts/hash_utils.py deleted file mode 100644 index 6106807..0000000 --- a/scripts/hash_utils.py +++ /dev/null @@ -1,18 +0,0 @@ -import hashlib -import sys - -def sha256_file(filepath): - sha256 = hashlib.sha256() - chunk_size = 1024 * 1024 # 1 MiB for better throughput - - with open(filepath, "rb") as f: - for chunk in iter(lambda: f.read(chunk_size), b""): - sha256.update(chunk) - return sha256.hexdigest() - -if __name__ == "__main__": - if len(sys.argv) < 2: - print("Usage: python hash_utils.py ") - sys.exit(1) - path = sys.argv[1] - print(sha256_file(path)) \ No newline at end of file diff --git a/scripts/preprocess.py b/scripts/preprocess.py deleted file mode 100644 index 236a6cc..0000000 --- a/scripts/preprocess.py +++ /dev/null @@ -1,72 +0,0 @@ -import bz2 -import re -import defusedxml.ElementTree as ET -from pathlib import Path -import sys -from .generate_manifest import generate_manifest - - -def clean_wikitext(text: str) -> str: - """ - Basic deterministic wikitext cleaning. - - NOTE: - This implementation intentionally uses regex-based approximations - for performance and determinism. It does NOT fully parse MediaWiki syntax. - - Known limitations: - - Nested templates like {{Infobox | birth={{Date|1990|1|1}}}} - are not fully handled. The non-greedy template regex may leave - stray closing braces (e.g., "}}") in deeply nested structures. - - Self-closing references such as are only partially - handled. 
While generic tag stripping removes the tag itself, - complex edge cases may not be fully normalized. - - This is not a complete MediaWiki parser and should not be relied - upon for perfectly structured wikitext normalization. - - These trade-offs are acceptable for v1 deterministic preprocessing. - """ - text = re.sub(r"\{\{.*?\}\}", "", text, flags=re.DOTALL) - text = re.sub(r".*?", "", text, flags=re.DOTALL) - text = re.sub(r"<.*?>", "", text) - text = re.sub(r"\[\[.*?\|(.*?)\]\]", r"\1", text) - text = re.sub(r"\[\[(.*?)\]\]", r"\1", text) - text = re.sub(r"\s+", " ", text) - return text.strip() - - -def extract_text_from_xml(input_path): - input_path = Path(input_path) - - # Fixed output path - output_dir = Path("data/processed") - output_dir.mkdir(parents=True, exist_ok=True) - - output_path = output_dir / "wiki_clean.txt" - - with bz2.open(input_path, "rb") as f: - context = ET.iterparse(f, events=("end",)) - - with open(output_path, "w", encoding="utf-8") as out: - for _, elem in context: - if elem.tag.endswith("page"): - text_elem = elem.find(".//{*}text") - - if text_elem is not None and text_elem.text: - cleaned = clean_wikitext(text_elem.text) - if cleaned: - out.write(cleaned + "\n\n") - - elem.clear() - - print(f"Preprocessing complete. 
Output saved to {output_path}") - - # Automatically generate manifest - generate_manifest(input_path, output_path) - -if __name__ == "__main__": - if len(sys.argv) < 2: - print("Usage: python preprocess.py ") - sys.exit(1) - - extract_text_from_xml(sys.argv[1]) diff --git a/tests/test_dataset_hash.py b/tests/test_dataset_hash.py deleted file mode 100644 index b560baf..0000000 --- a/tests/test_dataset_hash.py +++ /dev/null @@ -1,36 +0,0 @@ -import os -import hashlib -import tempfile -import pytest -from openverifiablellm.dataset_hash import compute_sha256 - - -def test_correct_sha256_output(tmp_path): - # Create a temporary file - file = tmp_path / "sample.txt" - content = "hello wikipedia" - file.write_text(content, encoding="utf-8") - - # Expected hash using standard hashlib - expected = hashlib.sha256(content.encode("utf-8")).hexdigest() - - # Hash using your function - actual = compute_sha256(str(file)) - - # Verify correctness - assert actual == expected - - -def test_different_content_different_hash(tmp_path): - file1 = tmp_path / "content_a.txt" - file2 = tmp_path / "content_b.txt" - - file1.write_text("Content A", encoding="utf-8") - file2.write_text("Content B", encoding="utf-8") - - assert compute_sha256(file1) != compute_sha256(file2) - - -def test_file_not_found(): - with pytest.raises(FileNotFoundError): - compute_sha256("non_existent_file.txt") \ No newline at end of file diff --git a/tests/test_generate_manifest.py b/tests/test_generate_manifest.py deleted file mode 100644 index 987e86d..0000000 --- a/tests/test_generate_manifest.py +++ /dev/null @@ -1,21 +0,0 @@ -import pytest -from scripts.generate_manifest import generate_manifest - -def test_generate_manifest_raises_if_processed_missing(tmp_path): - raw_file = tmp_path / "raw.txt" - raw_file.write_text("dummy") - - missing_file = tmp_path / "missing.txt" - - with pytest.raises(FileNotFoundError): - generate_manifest(raw_file, missing_file) - -def 
test_generate_manifest_runs_if_file_exists(tmp_path): - raw_file = tmp_path / "raw.txt" - raw_file.write_text("dummy") - - processed_file = tmp_path / "processed.txt" - processed_file.write_text("cleaned") - - # Should not raise - generate_manifest(raw_file, processed_file) \ No newline at end of file diff --git a/tests/test_util.py b/tests/test_util.py new file mode 100644 index 0000000..f163968 --- /dev/null +++ b/tests/test_util.py @@ -0,0 +1,120 @@ +import bz2 +import hashlib +import pytest +from openverifiablellm import utils +# run test by-- +## pip install -e . (from project root) +## pytest + +# --------------- clean_wikitext tests ------------------------------------ + +def test_clean_wikitext_removes_templates_and_refs(): + text = "Hello {{Infobox}} cite world" + cleaned = utils.clean_wikitext(text) + assert cleaned == "Hello world" + + +def test_clean_wikitext_handles_links(): + text = "This is [[Python|programming language]] and [[India]]" + cleaned = utils.clean_wikitext(text) + assert cleaned == "This is programming language and India" + + +def test_clean_wikitext_collapses_whitespace(): + text = "Hello world\n\n test" + cleaned = utils.clean_wikitext(text) + assert cleaned == "Hello world test" + +# --------------- extract_dump_date tests ------------------------------------ + +def test_extract_dump_date_valid(): + filename = "simplewiki-20260201-pages-articles.xml.bz2" + assert utils.extract_dump_date(filename) == "2026-02-01" + + +def test_extract_dump_date_invalid(): + filename = "no-date-file.xml.bz2" + assert utils.extract_dump_date(filename) == "unknown" + +# --------------- generate manifest ------------------------------------ + +def test_generate_manifest_raises_if_processed_missing(tmp_path): + raw_file = tmp_path / "raw.txt" + raw_file.write_text("dummy") + + missing_file = tmp_path / "missing.txt" + + with pytest.raises(FileNotFoundError): + utils.generate_manifest(raw_file, missing_file) + +def 
test_generate_manifest_runs_if_file_exists(tmp_path): + raw_file = tmp_path / "raw.txt" + raw_file.write_text("dummy") + + processed_file = tmp_path / "processed.txt" + processed_file.write_text("cleaned") + + # Should not raise + utils.generate_manifest(raw_file, processed_file) + +# --------------- compute_sha256 ------------------------------------ + +def test_correct_sha256_output(tmp_path): + # Create a temporary file + file = tmp_path / "sample.txt" + content = "hello wikipedia" + file.write_text(content, encoding="utf-8") + + # Expected hash using standard hashlib + expected = hashlib.sha256(content.encode("utf-8")).hexdigest() + + # Hash using your function + actual = utils.compute_sha256(str(file)) + + # Verify correctness + assert actual == expected + + +def test_different_content_different_hash(tmp_path): + file1 = tmp_path / "content_a.txt" + file2 = tmp_path / "content_b.txt" + + file1.write_text("Content A", encoding="utf-8") + file2.write_text("Content B", encoding="utf-8") + + assert utils.compute_sha256(file1) != utils.compute_sha256(file2) + + +def test_file_not_found(): + with pytest.raises(FileNotFoundError): + utils.compute_sha256("non_existent_file.txt") + +# --------------- extract_text_from_xml tests ------------------------------------ + +def test_extract_text_from_xml_end_to_end(tmp_path, monkeypatch): + + xml_content = """ + + + + Hello [[World]] + + + + """ + + input_file = tmp_path / "simplewiki-20260201-pages.xml.bz2" + + with bz2.open(input_file, "wt", encoding="utf-8") as f: + f.write(xml_content) + + # Redirect project root + monkeypatch.chdir(tmp_path) + + utils.extract_text_from_xml(input_file) + + processed_file = tmp_path / "data/processed/wiki_clean.txt" + assert processed_file.exists() + + assert "Hello World" in processed_file.read_text() + \ No newline at end of file From 5ab32d6b1d5527dbd55889f5bc6c4ff63e774715 Mon Sep 17 00:00:00 2001 From: tanii1125 Date: Sun, 22 Feb 2026 22:58:16 +0530 Subject: [PATCH 5/7] resolved 
coderabbitai reviews --- README.md | 4 ++-- examples/demo_util.py | 2 +- examples/{sample_wiki.xml => sample_wiki.py} | 0 openverifiablellm/utils.py | 2 +- tests/test_util.py | 8 ++++++-- 5 files changed, 10 insertions(+), 6 deletions(-) rename examples/{sample_wiki.xml => sample_wiki.py} (100%) diff --git a/README.md b/README.md index 300a40d..d3fd80b 100644 --- a/README.md +++ b/README.md @@ -95,7 +95,7 @@ OpenVerifiableLLM/ | โ”œโ”€โ”€ examples/ โ”‚ โ”œโ”€โ”€ demo_util.py -โ”‚ โ”œโ”€โ”€ sample_wiki.xml +โ”‚ โ”œโ”€โ”€ sample_wiki.py โ”‚ โ””โ”€โ”€ sample_wiki.xml.bz2 โ”‚ โ”œโ”€โ”€ openverifiablellm/ @@ -146,7 +146,7 @@ pip install -e . #### Step 2 โ€” Place Dump File Move your Wikipedia dump into the project root directory -(the same directory that contains the ```openverifiablellm/``` folder). +(the same directory that contains the `openverifiablellm/` folder). Example: diff --git a/examples/demo_util.py b/examples/demo_util.py index ac818e1..8ca4620 100644 --- a/examples/demo_util.py +++ b/examples/demo_util.py @@ -9,7 +9,7 @@ if __name__ == "__main__": if len(sys.argv) < 2: - print("Usage: python preprocess.py ") + print("Usage: python -m examples.demo_util ") sys.exit(1) logging.basicConfig( diff --git a/examples/sample_wiki.xml b/examples/sample_wiki.py similarity index 100% rename from examples/sample_wiki.xml rename to examples/sample_wiki.py diff --git a/openverifiablellm/utils.py b/openverifiablellm/utils.py index ba7f042..084e1e2 100644 --- a/openverifiablellm/utils.py +++ b/openverifiablellm/utils.py @@ -146,7 +146,7 @@ def clean_wikitext(text: str) -> str: if __name__ == "__main__": if len(sys.argv) < 2: - print("Usage: python preprocess.py ") + print("Usage: python -m openverifiablellm.utils ") sys.exit(1) logging.basicConfig( diff --git a/tests/test_util.py b/tests/test_util.py index f163968..3901f2d 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -47,15 +47,19 @@ def test_generate_manifest_raises_if_processed_missing(tmp_path): with 
pytest.raises(FileNotFoundError): utils.generate_manifest(raw_file, missing_file) -def test_generate_manifest_runs_if_file_exists(tmp_path): +def test_generate_manifest_runs_if_file_exists(tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + raw_file = tmp_path / "raw.txt" raw_file.write_text("dummy") processed_file = tmp_path / "processed.txt" processed_file.write_text("cleaned") - # Should not raise utils.generate_manifest(raw_file, processed_file) + + manifest_file = tmp_path / "data/dataset_manifest.json" + assert manifest_file.exists() # --------------- compute_sha256 ------------------------------------ From bce705a7cc887ba504def179b8409a9b63b4a0bf Mon Sep 17 00:00:00 2001 From: tanii1125 Date: Tue, 24 Feb 2026 10:40:09 +0530 Subject: [PATCH 6/7] Address review feedback: remove __pycache__, delete requirements.txt, update gitignore, revert README --- .gitignore | 1 + README.md | 246 +++++++++--------- pyproject.toml | 9 + requirements.txt | 2 - ...rate_manifest.cpython-313-pytest-9.0.2.pyc | Bin 1229 -> 0 bytes 5 files changed, 137 insertions(+), 121 deletions(-) delete mode 100644 requirements.txt delete mode 100644 tests/__pycache__/test_generate_manifest.cpython-313-pytest-9.0.2.pyc diff --git a/.gitignore b/.gitignore index 14d1412..392cb81 100644 --- a/.gitignore +++ b/.gitignore @@ -331,3 +331,4 @@ __pycache__/ *.pyc *.pyo *.pyd +*.bz2 diff --git a/README.md b/README.md index d3fd80b..d76f8be 100644 --- a/README.md +++ b/README.md @@ -48,184 +48,192 @@ ---
-

OpenVerifiableLLM – Deterministic Dataset Pipeline

+

TODO: Project Name

-OpenVerifiableLLM is a deterministic Wikipedia preprocessing and dataset verification pipeline designed to support fully reproducible LLM training. - -It ensures that: - -- The same Wikipedia dump always produces identical processed output. -- Dataset fingerprints (SHA256 hashes) are generated for verification. -- A manifest file captures dataset identity and environment metadata. +[TODO](https://TODO.stability.nexus/) is a ... TODO: Project Description. --- ## ๐Ÿš€ Features -- **Deterministic Wikipedia preprocessing** -- **Wikitext cleaning (templates, references, links removed)** -- **Stable XML parsing with memory-efficient streaming** -- **SHA256 hashing of raw and processed datasets** -- **Automatic dataset manifest generation** -- **Reproducible data identity tracking** +TODO: List your main features here: + +- **Feature 1**: Description +- **Feature 2**: Description +- **Feature 3**: Description +- **Feature 4**: Description --- ## ๐Ÿ’ป Tech Stack -- Python 3.9+ -- `xml.etree.ElementTree` (stream parsing) -- `bz2` (compressed dump handling) -- `hashlib` (SHA256 hashing) -- `pathlib` -- `re` (deterministic cleaning) +TODO: Update based on your project ---- +### Frontend +- React / Next.js / Flutter / React Native +- TypeScript +- TailwindCSS -## ๐Ÿ“‚ Project Structure - -```text -OpenVerifiableLLM/ -โ”‚ -โ”œโ”€โ”€ data/ โ† created automatically at runtime -โ”‚ โ”œโ”€โ”€ dataset_manifest.json -โ”‚ โ””โ”€โ”€ processed/ -โ”‚ โ””โ”€โ”€ wiki_clean.txt -| -โ”œโ”€โ”€ examples/ -โ”‚ โ”œโ”€โ”€ demo_util.py -โ”‚ โ”œโ”€โ”€ sample_wiki.py -โ”‚ โ””โ”€โ”€ sample_wiki.xml.bz2 -โ”‚ -โ”œโ”€โ”€ openverifiablellm/ -โ”‚ โ”œโ”€โ”€ __init__.py โ† (should exist) -โ”‚ โ””โ”€โ”€ utils.py -โ”‚ -โ”œโ”€โ”€ tests/ -โ”‚ โ””โ”€โ”€ test_util.py -โ”‚ -โ””โ”€โ”€ requirements.txt -``` +### Backend +- Flask / FastAPI / Node.js / Supabase +- Database: PostgreSQL / SQLite / MongoDB + +### AI/ML (if applicable) +- LangChain / LangGraph / LlamaIndex +- Google Gemini / OpenAI / Anthropic Claude 
+- Vector Database: Weaviate / Pinecone / Chroma +- RAG / Prompt Engineering / Agent Frameworks + +### Blockchain (if applicable) +- Solidity / solana / cardano / ergo Smart Contracts +- Hardhat / Truffle / foundry +- Web3.js / Ethers.js / Wagmi +- OpenZeppelin / alchemy / Infura --- -## ๏ฟฝ๐Ÿ€ Getting Started +## โœ… Project Checklist + +TODO: Complete applicable items based on your project type + +- [ ] **The protocol** (if applicable): + - [ ] has been described and formally specified in a paper. + - [ ] has had its main properties mathematically proven. + - [ ] has been formally verified. +- [ ] **The smart contracts** (if applicable): + - [ ] were thoroughly reviewed by at least two knights of The Stable Order. + - [ ] were deployed to: [Add deployment details] +- [ ] **The mobile app** (if applicable): + - [ ] has an _About_ page containing the Stability Nexus's logo and pointing to the social media accounts of the Stability Nexus. + - [ ] is available for download as a release in this repo. + - [ ] is available in the relevant app stores. +- [ ] **The AI/ML components** (if applicable): + - [ ] LLM/model selection and configuration are documented. + - [ ] Prompts and system instructions are version-controlled. + - [ ] Content safety and moderation mechanisms are implemented. + - [ ] API keys and rate limits are properly managed. -### Prerequisites +--- -- Python 3.9+ -- Wikipedia dump from: - +## ๐Ÿ”— Repository Links -Recommended for testing: +TODO: Update with your repository structure -- `simplewiki-YYYYMMDD-pages-articles.xml.bz2` +1. [Main Repository](https://github.com/AOSSIE-Org/TODO) +2. [Frontend](https://github.com/AOSSIE-Org/TODO/tree/main/frontend) (if separate) +3. [Backend](https://github.com/AOSSIE-Org/TODO/tree/main/backend) (if separate) --- -### Installation +## ๐Ÿ—๏ธ Architecture Diagram -#### 1. 
Clone the Repository +TODO: Add your system architecture diagram here -```bash -git clone https://github.com/AOSSIE-Org/OpenVerifiableLLM.git -cd OpenVerifiableLLM +``` +[Architecture Diagram Placeholder] ``` -### โ–ถ Running the Pipeline - -#### Step 1 โ€” Install the Package - -From the project root: +You can create architecture diagrams using: +- [Draw.io](https://draw.io) +- [Excalidraw](https://excalidraw.com) +- [Lucidchart](https://lucidchart.com) +- [Mermaid](https://mermaid.js.org) (for code-based diagrams) -```bash -pip install -e . -``` +Example structure to include: +- Frontend components +- Backend services +- Database architecture +- External APIs/services +- Data flow between components -#### Step 2 โ€” Place Dump File +--- -Move your Wikipedia dump into the project root directory -(the same directory that contains the `openverifiablellm/` folder). +## ๐Ÿ”„ User Flow -Example: +TODO: Add user flow diagrams showing how users interact with your application -```bash -simplewiki-20260201-pages-articles.xml.bz2 +``` +[User Flow Diagram Placeholder] ``` -Copy the file path to use as the argument when running preprocessing. +### Key User Journeys -Example (relative path): +TODO: Document main user flows: -```bash -simplewiki-20260201-pages-articles.xml.bz2 -``` +1. **User Journey 1**: Description + - Step 1 + - Step 2 + - Step 3 -#### Step 3 โ€” Run Preprocessing +2. **User Journey 2**: Description + - Step 1 + - Step 2 + - Step 3 -```bash -python -m openverifiablellm.utils simplewiki-20260201-pages-articles.xml.bz2 -``` +3. 
**User Journey 3**: Description + - Step 1 + - Step 2 + - Step 3 -This will: +--- -- Create `data/processed/wiki_clean.txt` -- Generate `dataset_manifest.json` -- Compute `SHA256` hashes -- Log preprocessing status +## ๏ฟฝ๐Ÿ€ Getting Started -#### ๐Ÿ“œ Example Manifest +### Prerequisites -```json -{ - "wikipedia_dump": "simplewiki-20260201-pages-articles.xml.bz2", - "dump_date": "2026-02-01", - "raw_sha256": "...", - "processed_sha256": "...", - "preprocessing_version": "v1", - "python_version": "3.13.2" -} -``` +TODO: List what developers need installed ---- +- Node.js 18+ / Python 3.9+ / Flutter SDK +- npm / yarn / pnpm +- [Any specific tools or accounts needed] -## ๐Ÿงช Running Tests +### Installation -To verify correctness and reproducibility: +TODO: Provide detailed setup instructions + +#### 1. Clone the Repository ```bash -pytest +git clone https://github.com/AOSSIE-Org/TODO.git +cd TODO ``` -This runs: +#### 2. Install Dependencies -- Unit tests for: +```bash +npm install +# or +yarn install +# or +pnpm install +``` - - `clean_wikitext` - - `compute_sha256` - - `extract_dump_date` - - `generate_manifest` +#### 3. Configure Environment Variables(.env.example) -- Integration test for: +Create a `.env` file in the root directory: - - `extract_text_from_xml` (end-to-end pipeline using a synthetic .bz2 file) +```env +# Add your environment variables here +API_KEY=your_api_key +DATABASE_URL=your_database_url +``` -All tests should pass: +#### 4. Run the Development Server -```text -11 passed in 0.xx s +```bash +npm run dev +# or +yarn dev +# or +pnpm dev ``` ---- +#### 5. Open your Browser -## ๐Ÿ“ˆ Future Extensions +Navigate to [http://localhost:3000](http://localhost:3000) to see the application. 
-- Deterministic tokenization stage -- Token-level hashing -- Multi-GPU training reproducibility -- Environment containerization (Docker) -- Full checkpoint verification protocol +For detailed setup instructions, please refer to our [Installation Guide](./docs/INSTALL_GUIDE.md) (if you have one). --- @@ -269,4 +277,4 @@ Thanks a lot for spending your time helping TODO grow. Keep rocking ๐Ÿฅ‚ [![Contributors](https://contrib.rocks/image?repo=AOSSIE-Org/TODO)](https://github.com/AOSSIE-Org/TODO/graphs/contributors) -ยฉ 2025 AOSSIE +ยฉ 2025 AOSSIE \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 34e03d9..121d3bc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,15 @@ authors = [ ] requires-python = ">=3.9" +dependencies= [ + "defusedxml" +] + +[project.optional-dependencies] +dev = [ + "pytest" +] + [tool.setuptools.packages.find] include = ["openverifiablellm*"] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 634c013..0000000 --- a/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -defusedxml>=0.7.1 -pytest \ No newline at end of file diff --git a/tests/__pycache__/test_generate_manifest.cpython-313-pytest-9.0.2.pyc b/tests/__pycache__/test_generate_manifest.cpython-313-pytest-9.0.2.pyc deleted file mode 100644 index b295c70fab69f451015dfc335bf413a8944f83eb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1229 zcmbVL&1(}u6o31XWb@U~)*7OOu6oH^RUXRh}FxML5J0FFZy6;vU`JQ4jcUKm*5;+f=m}$!0JUL zVifarD8Wr-6pE6e<_<&D+RS=eo*yXGDwXnvy@gT<&+qTuXz>MbS-bg*YJ9?h&Vsiy;yjR19e^Zuid_} zcF!fweM0S`ZLOEd;^N&kUWN&4o)|;ct@{%kaPr9Pe3)q$O_x^lgfUVuyJcwtD{_|E zESd%!B$7>|mAKHjbD+ld)nrReHbc9r{%QDZv(g&A*jW4$PCQ9JPVXi&dtu|fmTG2p zwW~iBh>Y!rjdx+=km11Lb!;Z8yp2X?QlVo4;iZH?D8Br!Kvv{Gib?Is0mZ;;h;>v9 zB`^Xx8o)T^655XU=@6q1<;%o!NWqY4h+{faqr4p&L*ozh@yDh`2l(OG-^i~yj8o62 zZIUe;M@WY;A&w85Lh}#ON7647t&#CY?m!*dSJN#u-L!Yr%#R=hPCwn)Q^(tw4@T72 z)@ydzvmNHgZn<@{@1@L+II|WGoqBfF$xxcUxKh%pwFj=%=QL5q1ev0L(?RqSG 
zLT`c{p&rrAE1;(^p+ki>RjC$gWin0E7{h-ISPIQIMUteC0FU5d`~zJ7u7;$E_K+rB H5vjib@(LrW From 89041a05ec7b8ebbd709a5260fd86731dd54b497 Mon Sep 17 00:00:00 2001 From: tanii1125 Date: Tue, 24 Feb 2026 11:06:24 +0530 Subject: [PATCH 7/7] docs: documentation for usage in tets and examples --- examples/demo_util.py | 10 +++++++--- tests/test_util.py | 11 ++++++++--- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/examples/demo_util.py b/examples/demo_util.py index 8ca4620..9f41446 100644 --- a/examples/demo_util.py +++ b/examples/demo_util.py @@ -1,12 +1,16 @@ -## run via- -## python -m examples.demo_util examples\sample_wiki.xml.bz2 - import sys import logging from openverifiablellm.utils import extract_text_from_xml logger = logging.getLogger(__name__) +""" +Demo for preprocessing pipeline. + +Run with: + python -m examples.demo_util examples\sample_wiki.xml.bz2 +""" + if __name__ == "__main__": if len(sys.argv) < 2: print("Usage: python -m examples.demo_util ") diff --git a/tests/test_util.py b/tests/test_util.py index 3901f2d..c0cede3 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -2,9 +2,14 @@ import hashlib import pytest from openverifiablellm import utils -# run test by-- -## pip install -e . (from project root) -## pytest + +""" +Unit and integration tests for OpenVerifiableLLM preprocessing pipeline. + +Run with: + pip install -e ".[dev]" + pytest +""" # --------------- clean_wikitext tests ------------------------------------