From 4e36e156b51cac2256ff881dff4c34da720081c7 Mon Sep 17 00:00:00 2001
From: tanii1125 <dubeytanisha66@gmail.com>
Date: Fri, 20 Feb 2026 01:14:07 +0530
Subject: [PATCH 1/7] feat: add deterministic preprocessing and dataset
 identity tracking

---
 .gitignore                                    |   3 +
 README.md                                     | 222 ++++++------------
 .../generate_manifest.cpython-313.pyc         | Bin 0 -> 1915 bytes
 .../__pycache__/hash_utils.cpython-313.pyc    | Bin 0 -> 1053 bytes
 scripts/generate_manifest.py                  |  38 +++
 scripts/hash_utils.py                         |  13 +
 scripts/preprocess.py                         | 107 +++++++++
 7 files changed, 238 insertions(+), 145 deletions(-)
 create mode 100644 scripts/__pycache__/generate_manifest.cpython-313.pyc
 create mode 100644 scripts/__pycache__/hash_utils.cpython-313.pyc
 create mode 100644 scripts/generate_manifest.py
 create mode 100644 scripts/hash_utils.py
 create mode 100644 scripts/preprocess.py
diff --git a/.gitignore b/.gitignore
index 9308a4b..3846644 100644
--- a/.gitignore
+++ b/.gitignore
@@ -324,3 +324,6 @@ TSWLatexianTemp*
 # option is specified. Footnotes are the stored in a file with suffix Notes.bib.
 # Uncomment the next line to have this generated file ignored.
 #*Notes.bib
+
+dataset_manifest.json
+data
\ No newline at end of file
diff --git a/README.md b/README.md
index 3c5adf2..365984d 100644
--- a/README.md
+++ b/README.md
@@ -48,192 +48,124 @@
 ---
 
 <div align="center">
-<h1>TODO: Project Name</h1>
+<h1>OpenVerifiableLLM – Deterministic Dataset Pipeline</h1>
 </div>
 
-[TODO](https://TODO.stability.nexus/) is a ... TODO: Project Description.
+OpenVerifiableLLM is a deterministic Wikipedia preprocessing and dataset verification pipeline designed to support fully reproducible LLM training.
 
----
-
-## 🚀 Features
-
-TODO: List your main features here:
-
-- **Feature 1**: Description
-- **Feature 2**: Description
-- **Feature 3**: Description
-- **Feature 4**: Description
-
----
-
-## 💻 Tech Stack
-
-TODO: Update based on your project
-
-### Frontend
-- React / Next.js / Flutter / React Native
-- TypeScript
-- TailwindCSS
+It ensures that:
 
-### Backend
-- Flask / FastAPI / Node.js / Supabase
-- Database: PostgreSQL / SQLite / MongoDB
-
-### AI/ML (if applicable)
-- LangChain / LangGraph / LlamaIndex
-- Google Gemini / OpenAI / Anthropic Claude
-- Vector Database: Weaviate / Pinecone / Chroma
-- RAG / Prompt Engineering / Agent Frameworks
-
-### Blockchain (if applicable)
-- Solidity / solana / cardano / ergo Smart Contracts
-- Hardhat / Truffle / foundry
-- Web3.js / Ethers.js / Wagmi
-- OpenZeppelin / alchemy / Infura
-
----
-
-## ✅ Project Checklist
-
-TODO: Complete applicable items based on your project type
-
-- [ ] **The protocol** (if applicable):
-   - [ ] has been described and formally specified in a paper.
-   - [ ] has had its main properties mathematically proven.
-   - [ ] has been formally verified.
-- [ ] **The smart contracts** (if applicable):
-   - [ ] were thoroughly reviewed by at least two knights of The Stable Order.
-   - [ ] were deployed to: [Add deployment details]
-- [ ] **The mobile app** (if applicable):
-   - [ ] has an _About_ page containing the Stability Nexus's logo and pointing to the social media accounts of the Stability Nexus.
-   - [ ] is available for download as a release in this repo.
-   - [ ] is available in the relevant app stores.
-- [ ] **The AI/ML components** (if applicable):
-   - [ ] LLM/model selection and configuration are documented.
-   - [ ] Prompts and system instructions are version-controlled.
-   - [ ] Content safety and moderation mechanisms are implemented.
-   - [ ] API keys and rate limits are properly managed.
+- The same Wikipedia dump always produces identical processed output.
+- Dataset fingerprints (SHA256 hashes) are generated for verification.
+- A manifest file captures dataset identity and environment metadata.
 
 ---
 
-## 🔗 Repository Links
-
-TODO: Update with your repository structure
+## 🚀 Features
 
-1. [Main Repository](https://github.com/AOSSIE-Org/TODO)
-2. [Frontend](https://github.com/AOSSIE-Org/TODO/tree/main/frontend) (if separate)
-3. [Backend](https://github.com/AOSSIE-Org/TODO/tree/main/backend) (if separate)
+- **Deterministic Wikipedia preprocessing**
+- **Wikitext cleaning (templates, references, links removed)**
+- **Stable XML parsing with memory-efficient streaming**
+- **SHA256 hashing of raw and processed datasets**
+- **Automatic dataset manifest generation**
+- **Reproducible data identity tracking**
 
 ---
 
-## 🏗️ Architecture Diagram
-
-TODO: Add your system architecture diagram here
-
-```
-[Architecture Diagram Placeholder]
-```
-
-You can create architecture diagrams using:
-- [Draw.io](https://draw.io)
-- [Excalidraw](https://excalidraw.com)
-- [Lucidchart](https://lucidchart.com)
-- [Mermaid](https://mermaid.js.org) (for code-based diagrams)
+## 💻 Tech Stack
 
-Example structure to include:
-- Frontend components
-- Backend services
-- Database architecture
-- External APIs/services
-- Data flow between components
+- Python 3.9+
+- `xml.etree.ElementTree` (stream parsing)
+- `bz2` (compressed dump handling)
+- `hashlib` (SHA256 hashing)
+- `pathlib`
+- `re` (deterministic cleaning)
 
 ---
 
-## 🔄 User Flow
-
-TODO: Add user flow diagrams showing how users interact with your application
-
-```
-[User Flow Diagram Placeholder]
+## 📂 Project Structure
+
+```text
+OpenVerifiableLLM/
+│
+├── scripts/
+│ ├── preprocess.py
+│ ├── generate_manifest.py
+│ └── hash_utils.py
+│
+├── data/
+│ ├── raw/
+│ └── processed/
+│
+└── dataset_manifest.json
 ```
 
-### Key User Journeys
-
-TODO: Document main user flows:
-
-1. **User Journey 1**: Description
-   - Step 1
-   - Step 2
-   - Step 3
-
-2. **User Journey 2**: Description
-   - Step 1
-   - Step 2
-   - Step 3
-
-3. **User Journey 3**: Description
-   - Step 1
-   - Step 2
-   - Step 3
-
 ---
 
 ## �🍀 Getting Started
 
 ### Prerequisites
 
-TODO: List what developers need installed
+- Python 3.9+
+- Wikipedia dump from:
+  https://dumps.wikimedia.org/
 
-- Node.js 18+ / Python 3.9+ / Flutter SDK
-- npm / yarn / pnpm
-- [Any specific tools or accounts needed]
+Recommended for testing:
+- `simplewiki-YYYYMMDD-pages-articles.xml.bz2`
 
-### Installation
+---
 
-TODO: Provide detailed setup instructions
+### Installation
 
 #### 1. Clone the Repository
 
 ```bash
-git clone https://github.com/AOSSIE-Org/TODO.git
-cd TODO
+git clone https://github.com/AOSSIE-Org/OpenVerifiableLLM.git
+cd OpenVerifiableLLM
 ```
 
-#### 2. Install Dependencies
+### ▶ Running the Pipeline
+
+#### Step 1 — Place Dump File
+Move your Wikipedia dump into:
 
 ```bash
-npm install
-# or
-yarn install
-# or
-pnpm install
+data/raw/
 ```
 
-#### 3. Configure Environment Variables(.env.example)
+Example:
 
-Create a `.env` file in the root directory:
-
-```env
-# Add your environment variables here
-API_KEY=your_api_key
-DATABASE_URL=your_database_url
+```bash
+data/raw/simplewiki-20260201-pages-articles.xml.bz2
 ```
-
-#### 4. Run the Development Server
+#### Step 2 — Run Preprocessing
 
 ```bash
-npm run dev
-# or
-yarn dev
-# or
-pnpm dev
+python scripts/preprocess.py data/raw/simplewiki-20260201-pages-articles.xml.bz2
 ```
-
-#### 5. Open your Browser
-
-Navigate to [http://localhost:3000](http://localhost:3000) to see the application.
-
-For detailed setup instructions, please refer to our [Installation Guide](./docs/INSTALL_GUIDE.md) (if you have one).
+This will:
+- Create `data/processed/wiki_clean.txt`
+- Generate `dataset_manifest.json`
+- Compute `SHA256` hashes
+
+#### 📜 Example Manifest
+
+```json
+{
+  "wikipedia_dump": "simplewiki-20260201-pages-articles.xml.bz2",
+  "dump_date": "2026-02-01",
+  "raw_sha256": "...",
+  "processed_sha256": "...",
+  "preprocessing_version": "v1",
+  "python_version": "3.13.2"
+}
+```
+## 📈 Future Extensions
+- Deterministic tokenization stage
+- Token-level hashing
+- Multi-GPU training reproducibility
+- Environment containerization (Docker)
+- Full checkpoint verification protocol
 
 ---
 
diff --git a/scripts/__pycache__/generate_manifest.cpython-313.pyc b/scripts/__pycache__/generate_manifest.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..26c58ea894fad36fb8981a06a8faa6bbe404c955
GIT binary patch
literal 1915
zcma(R?@JqJ__-gsTyh$<nyk$_ddAur)O6a}Qs;zG3eL7!@#c`lkzJFExi(&|?_KH-
z=<ws(4)lx6v5jVIunflNm;D3VKVVIZa;7l0PkyslDD2ao_spd#W9$iepC9jY&-?t|
zYi<q#!rrG}>Lw5H7iBzP>%`s|604wq!dwBej~T+aJWE)_>=kaDC%jDWxGTPKfe1+R
z)Bd+v#TSK}JP{Qk3M4SY#Qi(eY%<O`{B_mNBbKbZIyiJbozn}NBhu4QXSEDdf1V-{
z{~Z4Je!*iECSl^>Ey^e;%urowFq2F>jEw>q>?AuuzdQm%b-t&Lvl#5PC_u{RDNeGU
zHH96X0&q2Y0VZj{91kIU4{1)}8#J!~rO){+R)*6cfebUgxR7+1{v8^@4rUm31Uwvn
z$+%?{=ZrY#_$;%a+m2tivib}X9L68#2zA8qnJTd@hj$STLzZP{AUiz$fvFiEX++QI
z>WzYS_3E{0D?@bCwx(w^LnErKrDs(`&uNyOFc+LQZQdqo#!hETvt~Mr*W?&lqfS|$
z;IRZ-O&!b0d&#wP)uvNRm$zHPpYvN%%ks#*k>$|`qu*r8r@z)K1K%23k?7js+IuVc
zm5<kMmpjU8g(;sXOO=7jhmSfR#VS*k@%8o#8%-CU`=BkdD?n3t4<CSx*G-gZl=Dw4
z*eawN#he5$>Ba%+;llo}a&{gprP3&+3@Ms^7bd6|M<9l!^*oSio4M0*SSi))@&A7B
zeV=Oa^ipje^(wyq)M_-cr={>IS}zwSIy~wXj)B+EiZ3A;Z0a@dP7xIUP(Al77+mVO
zcQb+QeiULb)uo6^;3DVFGS0L2EoQDU8cNXf3mY6dp$~fMhO$`695(+cknxv{zRR70
z921w4i*I1lsb@`6%xIRSWzWv(xAb(Tps7Z}p0^h-UM8eShUM#?f=u^-Y!q!dS1cLX
zggj9)WRqxh3%W6b8U(w02cHCoy)zIOoDj8YYFS-%8{G*s+T95fbuPWXXq?uD8=fS5
z$a4CQMl8K(I3aVv&KC`jUkuYAEX{s3TQ@CKW9D{fhH;e%x{=ikdr`htA3oU|$E<9X
zGBoyFsZdx*#6yIRvm<Epx`h+&Gl_2C>{|;s?I^Pyo-RRmN(o&Xcx4kuGz+SoE0S4q
zlFIlZt^tR4o#1J#ao!PWpeAls^xjM6J}-J3!C`V1wTa6_CbT!5HqXsQWF&#|R{UGP
z;;{s^miCpg2cIpCy$~Q0U5{R@hKHAue@G`bCHb)=Z%VPpQmiI6uZP}$B97NYX<584
zuC#q2;R<-Y>tX*_{k8C!^8Al?zq?xv57i=_KXjGz-}hG|Z`Dq|`7>8ZR(sy5o*dcg
z?5;&)o6&)d=s-1k?s>DnRlqloK`^u)>MmPP0=<>nHL+<qa6hopQ&u;`o|ha1`_L<2
za6Q=fl<$A(NB(J`_iw8SJ^$@|<dV$&E(b3q{P7^6jY?wlID?1dr->@)H};DW%&S&D
zU9$Cp<>s7pQ$;#>ZZ>`b(oShNW8%ncF<UBVqht_ev<EG!y~{BSvklTV1h=8(861BG
Vv0cH(gv;CuK(yByVG^#He*hL!r}qE=

literal 0
HcmV?d00001

diff --git a/scripts/__pycache__/hash_utils.cpython-313.pyc b/scripts/__pycache__/hash_utils.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2ded3f215febde19344198700efb08866e33cb47
GIT binary patch
literal 1053
zcmah{%}*0S6o0cj+wT(PD*-Glq!L#N2SW`aHe%F}NYS*`n9#7<cDL>7wluS?5l=l*
zFK{7p#Au@4{1^NWG)2ULn<sA79xxt^Gi^6Ddhl)Ly*Ka8`_1pY*=8{4117)jJ~tN;
zz*oL;MCv%rG;_8g0SR;)NZTTU62dSLAtffnxd#B+C_oCuJx+1gCNKvgQpz;}DS<61
z!fxW?SQ&CChZO0U2bVLQEZ=~OEs?FJhecP#ZCK&amc#|e>)Hv3bT&n_;_!zgV;<?8
z7N8)mxSfimIMeFmp(i0aC;DKqAD}2CQQYgC>)NEyWgQy?@Po`J!ak>723G0sRydx+
zEDYl?1?M_po%NL!WEf*O|F66%jzJbFc$o?$opF%4<wNRYt2xP36m7#jh%@NFg$<$;
z4TB;qn{Wz1K@<~1)0stCo2RSvygrw@Q!<M84Ps_ZEo~V~OR<!lA!ey;r}CPeS1V=H
zvZqVy)H82sg|x0kX~@oN7bBO{tZ5n3^VV8Mv+O8y+G##@JIK_JD(sHGo_sO+rvKGc
zlT##oUW$vlIkIJ@sjH1d@fssRu~{|<byZ5bRyL?NZ#>b>oMD$0Og;Rrl2*<Wj!dOY
zzEXTdT32BfkY)KA;Xfty|G|y02L_(DO{N%~qhSB$>a*2)FjT#D<PC1*ckrG!w0rt!
zU}$f6`klBxFtc&<Ake!RtHpNC?g!*=A`C`OMDUF^ToCB}>>v5)AF2C8A8_cz&7!RO
zpTtj_cllCT2#fE-zU#<MJ*ryJ%%Z9u^QkLt5@3dk_PX6-$n!<5V5lm^8p%DTm^VtL
z5;2SAx9y>|oAb;U*7S;HL`jGta4`EaQw<RzbO=3%;QNjxG}`C^9N2hT7l#h~J)5(&
e*{y}zmFkTz*ju}>rPqBUpYWN61bDdh9=`#vQOp_u

literal 0
HcmV?d00001

diff --git a/scripts/generate_manifest.py b/scripts/generate_manifest.py
new file mode 100644
index 0000000..058098c
--- /dev/null
+++ b/scripts/generate_manifest.py
@@ -0,0 +1,38 @@
+import json
+import sys
+import platform
+from pathlib import Path
+from hash_utils import sha256_file
+
+
+def extract_dump_date(filename: str):
+    parts = filename.split("-")
+    for part in parts:
+        if part.isdigit() and len(part) == 8:
+            return f"{part[:4]}-{part[4:6]}-{part[6:]}"
+    return "unknown"
+
+
+def generate_manifest(raw_path, processed_path):
+    raw_path = Path(raw_path)
+
+    # Automatically infer processed file path
+    processed_path = Path("data/processed/wiki_clean.txt")
+
+    if not processed_path.exists():
+        print("Error: Processed file not found. Run preprocessing first.")
+        sys.exit(1)
+
+    manifest = {
+        "wikipedia_dump": raw_path.name,
+        "dump_date": extract_dump_date(raw_path.name),
+        "raw_sha256": sha256_file(str(raw_path)),
+        "processed_sha256": sha256_file(str(processed_path)),
+        "preprocessing_version": "v1",
+        "python_version": platform.python_version()
+    }
+
+    with open("dataset_manifest.json", "w") as f:
+        json.dump(manifest, f, indent=2)
+
+    print("Manifest generated successfully.")
diff --git a/scripts/hash_utils.py b/scripts/hash_utils.py
new file mode 100644
index 0000000..88a2cb5
--- /dev/null
+++ b/scripts/hash_utils.py
@@ -0,0 +1,13 @@
+import hashlib
+import sys
+
+def sha256_file(filepath):
+    sha256 = hashlib.sha256()
+    with open(filepath, "rb") as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            sha256.update(chunk)
+    return sha256.hexdigest()
+
+if __name__ == "__main__":
+    path = sys.argv[1]
+    print(sha256_file(path))
\ No newline at end of file
diff --git a/scripts/preprocess.py b/scripts/preprocess.py
new file mode 100644
index 0000000..4cf5cba
--- /dev/null
+++ b/scripts/preprocess.py
@@ -0,0 +1,107 @@
+# import bz2
+# import re
+# import xml.etree.ElementTree as ET
+# from pathlib import Path
+# import sys
+
+
+# def clean_wikitext(text: str) -> str:
+#     # Remove templates {{...}}
+#     text = re.sub(r"\{\{.*?\}\}", "", text, flags=re.DOTALL)
+
+#     # Remove references <ref>...</ref>
+#     text = re.sub(r"<ref.*?>.*?</ref>", "", text, flags=re.DOTALL)
+
+#     # Remove HTML tags
+#     text = re.sub(r"<.*?>", "", text)
+
+#     # Convert [[Link|Text]] → Text
+#     text = re.sub(r"\[\[.*?\|(.*?)\]\]", r"\1", text)
+
+#     # Convert [[Link]] → Link
+#     text = re.sub(r"\[\[(.*?)\]\]", r"\1", text)
+
+#     # Remove multiple spaces/newlines
+#     text = re.sub(r"\s+", " ", text)
+
+#     return text.strip()
+
+
+# def extract_text_from_xml(input_path, output_path):
+#     Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+
+#     with bz2.open(input_path, "rb") as f:
+#         context = ET.iterparse(f, events=("end",))
+
+#         with open(output_path, "w", encoding="utf-8") as out:
+#             for event, elem in context:
+#                 if elem.tag.endswith("page"):
+#                     text_elem = elem.find(".//{*}text")
+
+#                     if text_elem is not None and text_elem.text:
+#                         cleaned = clean_wikitext(text_elem.text)
+#                         if cleaned:
+#                             out.write(cleaned + "\n\n")
+
+#                     elem.clear()
+
+
+# if __name__ == "__main__":
+#     input_path = sys.argv[1]
+#     output_path = sys.argv[2]
+#     extract_text_from_xml(input_path, output_path)
+#     print("Preprocessing complete.")
+
+import bz2
+import re
+import xml.etree.ElementTree as ET
+from pathlib import Path
+import sys
+from generate_manifest import generate_manifest
+
+
+def clean_wikitext(text: str) -> str:
+    text = re.sub(r"\{\{.*?\}\}", "", text, flags=re.DOTALL)
+    text = re.sub(r"<ref.*?>.*?</ref>", "", text, flags=re.DOTALL)
+    text = re.sub(r"<.*?>", "", text)
+    text = re.sub(r"\[\[.*?\|(.*?)\]\]", r"\1", text)
+    text = re.sub(r"\[\[(.*?)\]\]", r"\1", text)
+    text = re.sub(r"\s+", " ", text)
+    return text.strip()
+
+
+def extract_text_from_xml(input_path):
+    input_path = Path(input_path)
+
+    # Fixed output path
+    output_dir = Path("data/processed")
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    output_path = output_dir / "wiki_clean.txt"
+
+    with bz2.open(input_path, "rb") as f:
+        context = ET.iterparse(f, events=("end",))
+
+        with open(output_path, "w", encoding="utf-8") as out:
+            for event, elem in context:
+                if elem.tag.endswith("page"):
+                    text_elem = elem.find(".//{*}text")
+
+                    if text_elem is not None and text_elem.text:
+                        cleaned = clean_wikitext(text_elem.text)
+                        if cleaned:
+                            out.write(cleaned + "\n\n")
+
+                    elem.clear()
+
+    print(f"Preprocessing complete. Output saved to {output_path}")
+
+    # 🔥 Automatically generate manifest
+    generate_manifest(input_path, output_path)
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python preprocess.py <input_dump>")
+        sys.exit(1)
+
+    extract_text_from_xml(sys.argv[1])

From 63182df505a913da8681f9a3529fed7e4b3a8b87 Mon Sep 17 00:00:00 2001
From: tanii1125 <dubeytanisha66@gmail.com>
Date: Sat, 21 Feb 2026 13:21:17 +0530
Subject: [PATCH 2/7] update .coderabbit.yaml review instructions

---
 .coderabbit.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.coderabbit.yaml b/.coderabbit.yaml
index 877065c..19d6c04 100644
--- a/.coderabbit.yaml
+++ b/.coderabbit.yaml
@@ -157,6 +157,8 @@ reviews:
     - Confirm that the code meets the project's requirements and objectives
     - Confirm that copyright years are up-to date whenever a file is changed
     - Point out redundant obvious comments that do not add clarity to the code
+    - Ensure that comments are concise and suggest more concise comment statements if possible
+    - Discourage usage of verbose comment styles such as NatSpec
     - Look for code duplication
     - Suggest code completions when:
         - seeing a TODO comment
@@ -275,4 +277,4 @@ reviews:
         - Image optimization (appropriate size and format)
         - Proper @2x and @3x variants for different screen densities
         - SVG assets are optimized
-        - Font files are licensed and optimized
+        - Font files are licensed and optimized
\ No newline at end of file

From 32215759ce1b63a7ad41f75a120beb3b1ae30c31 Mon Sep 17 00:00:00 2001
From: tanii1125 <dubeytanisha66@gmail.com>
Date: Sat, 21 Feb 2026 15:57:41 +0530
Subject: [PATCH 3/7] resolved coderabbitai reviews

---
 .gitignore                                    |   3 +-
 README.md                                     |  13 ++-
 pytest.ini                                    |   2 +
 requirements.txt                              |   2 +
 scripts/__init__.py                           |   0
 scripts/generate_manifest.py                  |  20 +++--
 scripts/hash_utils.py                         |   7 +-
 scripts/preprocess.py                         |  81 +++++-------------
 ...rate_manifest.cpython-313-pytest-9.0.2.pyc | Bin 0 -> 1229 bytes
 tests/test_generate_manifest.py               |  21 +++++
 10 files changed, 76 insertions(+), 73 deletions(-)
 create mode 100644 pytest.ini
 create mode 100644 requirements.txt
 create mode 100644 scripts/__init__.py
 create mode 100644 tests/__pycache__/test_generate_manifest.cpython-313-pytest-9.0.2.pyc
 create mode 100644 tests/test_generate_manifest.py

diff --git a/.gitignore b/.gitignore
index 3846644..1d7036a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -325,5 +325,4 @@ TSWLatexianTemp*
 # Uncomment the next line to have this generated file ignored.
 #*Notes.bib
 
-dataset_manifest.json
-data
\ No newline at end of file
+data/
\ No newline at end of file
diff --git a/README.md b/README.md
index 365984d..0a68f5e 100644
--- a/README.md
+++ b/README.md
@@ -108,9 +108,10 @@ OpenVerifiableLLM/
 
 - Python 3.9+
 - Wikipedia dump from:
-  https://dumps.wikimedia.org/
+  <https://dumps.wikimedia.org/>
 
 Recommended for testing:
+
 - `simplewiki-YYYYMMDD-pages-articles.xml.bz2`
 
 ---
@@ -127,6 +128,7 @@ cd OpenVerifiableLLM
 ### ▶ Running the Pipeline
 
 #### Step 1 — Place Dump File
+
 Move your Wikipedia dump into:
 
 ```bash
@@ -138,12 +140,15 @@ Example:
 ```bash
 data/raw/simplewiki-20260201-pages-articles.xml.bz2
 ```
+
 #### Step 2 — Run Preprocessing
 
 ```bash
-python scripts/preprocess.py data/raw/simplewiki-20260201-pages-articles.xml.bz2
+python -m scripts.preprocess <args>
 ```
+
 This will:
+
 - Create `data/processed/wiki_clean.txt`
 - Generate `dataset_manifest.json`
 - Compute `SHA256` hashes
@@ -160,7 +165,9 @@ This will:
   "python_version": "3.13.2"
 }
 ```
+
 ## 📈 Future Extensions
+
 - Deterministic tokenization stage
 - Token-level hashing
 - Multi-GPU training reproducibility
@@ -209,4 +216,4 @@ Thanks a lot for spending your time helping TODO grow. Keep rocking 🥂
 
 [![Contributors](https://contrib.rocks/image?repo=AOSSIE-Org/TODO)](https://github.com/AOSSIE-Org/TODO/graphs/contributors)
 
-© 2025 AOSSIE 
+© 2025 AOSSIE
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..03f586d
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+pythonpath = .
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..634c013
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+defusedxml>=0.7.1
+pytest
\ No newline at end of file
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/generate_manifest.py b/scripts/generate_manifest.py
index 058098c..8e73c49 100644
--- a/scripts/generate_manifest.py
+++ b/scripts/generate_manifest.py
@@ -1,9 +1,10 @@
 import json
-import sys
 import platform
 from pathlib import Path
-from hash_utils import sha256_file
+from .hash_utils import sha256_file
 
+# Anchor paths to project root (two levels up from this file)
+_PROJECT_ROOT = Path(__file__).resolve().parent.parent
 
 def extract_dump_date(filename: str):
     parts = filename.split("-")
@@ -15,13 +16,12 @@ def extract_dump_date(filename: str):
 
 def generate_manifest(raw_path, processed_path):
     raw_path = Path(raw_path)
-
-    # Automatically infer processed file path
-    processed_path = Path("data/processed/wiki_clean.txt")
+    processed_path = Path(processed_path)
 
     if not processed_path.exists():
-        print("Error: Processed file not found. Run preprocessing first.")
-        sys.exit(1)
+        raise FileNotFoundError(
+            f"Processed file not found at {processed_path}. Run preprocessing first."
+        )
 
     manifest = {
         "wikipedia_dump": raw_path.name,
@@ -32,7 +32,9 @@ def generate_manifest(raw_path, processed_path):
         "python_version": platform.python_version()
     }
 
-    with open("dataset_manifest.json", "w") as f:
+    manifest_path = _PROJECT_ROOT / "dataset_manifest.json"
+
+    with open(manifest_path, "w") as f:
         json.dump(manifest, f, indent=2)
 
-    print("Manifest generated successfully.")
+    print(f"Manifest written to {manifest_path}")
\ No newline at end of file
diff --git a/scripts/hash_utils.py b/scripts/hash_utils.py
index 88a2cb5..6106807 100644
--- a/scripts/hash_utils.py
+++ b/scripts/hash_utils.py
@@ -3,11 +3,16 @@
 
 def sha256_file(filepath):
     sha256 = hashlib.sha256()
+    chunk_size = 1024 * 1024  # 1 MiB for better throughput
+    
     with open(filepath, "rb") as f:
-        for chunk in iter(lambda: f.read(4096), b""):
+        for chunk in iter(lambda: f.read(chunk_size), b""):
             sha256.update(chunk)
     return sha256.hexdigest()
 
 if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python hash_utils.py <filepath>")
+        sys.exit(1)
     path = sys.argv[1]
     print(sha256_file(path))
\ No newline at end of file
diff --git a/scripts/preprocess.py b/scripts/preprocess.py
index 4cf5cba..236a6cc 100644
--- a/scripts/preprocess.py
+++ b/scripts/preprocess.py
@@ -1,66 +1,31 @@
-# import bz2
-# import re
-# import xml.etree.ElementTree as ET
-# from pathlib import Path
-# import sys
-
-
-# def clean_wikitext(text: str) -> str:
-#     # Remove templates {{...}}
-#     text = re.sub(r"\{\{.*?\}\}", "", text, flags=re.DOTALL)
-
-#     # Remove references <ref>...</ref>
-#     text = re.sub(r"<ref.*?>.*?</ref>", "", text, flags=re.DOTALL)
-
-#     # Remove HTML tags
-#     text = re.sub(r"<.*?>", "", text)
-
-#     # Convert [[Link|Text]] → Text
-#     text = re.sub(r"\[\[.*?\|(.*?)\]\]", r"\1", text)
-
-#     # Convert [[Link]] → Link
-#     text = re.sub(r"\[\[(.*?)\]\]", r"\1", text)
-
-#     # Remove multiple spaces/newlines
-#     text = re.sub(r"\s+", " ", text)
-
-#     return text.strip()
-
-
-# def extract_text_from_xml(input_path, output_path):
-#     Path(output_path).parent.mkdir(parents=True, exist_ok=True)
-
-#     with bz2.open(input_path, "rb") as f:
-#         context = ET.iterparse(f, events=("end",))
-
-#         with open(output_path, "w", encoding="utf-8") as out:
-#             for event, elem in context:
-#                 if elem.tag.endswith("page"):
-#                     text_elem = elem.find(".//{*}text")
-
-#                     if text_elem is not None and text_elem.text:
-#                         cleaned = clean_wikitext(text_elem.text)
-#                         if cleaned:
-#                             out.write(cleaned + "\n\n")
-
-#                     elem.clear()
-
-
-# if __name__ == "__main__":
-#     input_path = sys.argv[1]
-#     output_path = sys.argv[2]
-#     extract_text_from_xml(input_path, output_path)
-#     print("Preprocessing complete.")
-
 import bz2
 import re
-import xml.etree.ElementTree as ET
+import defusedxml.ElementTree as ET
 from pathlib import Path
 import sys
-from generate_manifest import generate_manifest
+from .generate_manifest import generate_manifest
 
 
 def clean_wikitext(text: str) -> str:
+    """
+    Basic deterministic wikitext cleaning.
+
+    NOTE:
+    This implementation intentionally uses regex-based approximations
+    for performance and determinism. It does NOT fully parse MediaWiki syntax.
+
+    Known limitations:
+    - Nested templates like {{Infobox | birth={{Date|1990|1|1}}}}
+    are not fully handled. The non-greedy template regex may leave
+    stray closing braces (e.g., "}}") in deeply nested structures.
+    - Self-closing references such as <ref name="foo"/> are only partially
+    handled. While generic tag stripping removes the tag itself,
+    complex edge cases may not be fully normalized.
+    - This is not a complete MediaWiki parser and should not be relied
+    upon for perfectly structured wikitext normalization.
+
+    These trade-offs are acceptable for v1 deterministic preprocessing.
+    """
     text = re.sub(r"\{\{.*?\}\}", "", text, flags=re.DOTALL)
     text = re.sub(r"<ref.*?>.*?</ref>", "", text, flags=re.DOTALL)
     text = re.sub(r"<.*?>", "", text)
@@ -83,7 +48,7 @@ def extract_text_from_xml(input_path):
         context = ET.iterparse(f, events=("end",))
 
         with open(output_path, "w", encoding="utf-8") as out:
-            for event, elem in context:
+            for _, elem in context:
                 if elem.tag.endswith("page"):
                     text_elem = elem.find(".//{*}text")
 
@@ -96,7 +61,7 @@ def extract_text_from_xml(input_path):
 
     print(f"Preprocessing complete. Output saved to {output_path}")
 
-    # 🔥 Automatically generate manifest
+    # Automatically generate manifest
     generate_manifest(input_path, output_path)
 
 if __name__ == "__main__":
diff --git a/tests/__pycache__/test_generate_manifest.cpython-313-pytest-9.0.2.pyc b/tests/__pycache__/test_generate_manifest.cpython-313-pytest-9.0.2.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b295c70fab69f451015dfc335bf413a8944f83eb
GIT binary patch
literal 1229
zcmbVL&1(}u6o31XWb@U~)*7OOu6oH<O|@7Asl|dV($)wq6`_S?n@rM$WV6g{Xwy?b
zPhRw3CEmRH4~SR)gg|S{qA2Lan;1dp$v2Y^TdE!$*f;y${NB8MzxUgAJRSjD&x$YX
z=Q6-&?(`P=VzsAn+ks_RmKK0YVwdL?ChIV-(tzGM3DO{*$P1x)jcOQE&HxQBt7s!7
z$q0PqSLF=JA1o1vP|G7`#d7Q-VP0MmWjZE^n}SdK4VoQT!J?O=0FJaGEgi*m&{%M?
z8`U{h5m<S(_vx^tc7aE>^RUXRh}FxML5J0FFZy6;vU`JQ4jcUKm*5;+f=m}$!0JUL
zVifarD8Wr-6pE6e<_<&D+RS=eo*yXGDwX<nv|=-6J0%`70)AwZ+Q@-N5D|3i9`fJ^
zsbw?5{J}Z9OmbCku3B>nvy@gT<&+qTuXz>MbS-bg*YJ9?h&Vsiy;yjR19e^Zuid_}
zcF!fweM0S`ZLOEd;^N&kUWN&4o)|;ct@{%kaPr9Pe3)q$O_x^lgfUVuyJcwtD{_|E
zESd%!B$7>|mAKHjbD+ld)nrReHbc9r{%QDZv(g&A*jW4$PCQ9JPVXi&dtu|fmTG2p
zwW~iBh>Y!rjdx+=km11Lb!;Z8yp2X?QlVo4;iZH?D8Br!Kvv{Gib?Is0mZ;;h;>v9
zB`^Xx8o)T^655XU=@6q1<;%o!NWqY4h+{faqr4p&L*ozh@yDh`2l(OG-^i~yj8o62
zZIUe;M@WY;A&w85Lh}#ON7647t&#CY?m!*dSJN#u-L!Yr%#R=hPCwn)Q^(tw4@T72
z)@ydzvmNHgZn<@{@1@L+II|WGoqBfF$x<S;>xcUxKh%pwFj=%=QL5q1ev0L(?RqSG
zLT`c{p&rrAE1;(^p+ki>RjC$gWin0E7{h-ISPIQIMUteC0FU5d`~zJ7u7;$E_K+rB
H5vjib@(LrW

literal 0
HcmV?d00001

diff --git a/tests/test_generate_manifest.py b/tests/test_generate_manifest.py
new file mode 100644
index 0000000..987e86d
--- /dev/null
+++ b/tests/test_generate_manifest.py
@@ -0,0 +1,21 @@
+import pytest
+from scripts.generate_manifest import generate_manifest
+
+def test_generate_manifest_raises_if_processed_missing(tmp_path):
+    raw_file = tmp_path / "raw.txt"
+    raw_file.write_text("dummy")
+
+    missing_file = tmp_path / "missing.txt"
+
+    with pytest.raises(FileNotFoundError):
+        generate_manifest(raw_file, missing_file)
+        
+def test_generate_manifest_runs_if_file_exists(tmp_path):
+    raw_file = tmp_path / "raw.txt"
+    raw_file.write_text("dummy")
+
+    processed_file = tmp_path / "processed.txt"
+    processed_file.write_text("cleaned")
+
+    # Should not raise
+    generate_manifest(raw_file, processed_file)
\ No newline at end of file

From c959087a0cac70e63673a6a4bebc14845dfd7dbc Mon Sep 17 00:00:00 2001
From: tanii1125 <dubeytanisha66@gmail.com>
Date: Sun, 22 Feb 2026 22:37:34 +0530
Subject: [PATCH 4/7] Refactor: move helper functions to utils.py, add
 tests, and clean up structure

---
 README.md                         |  81 +++++++++++++---
 examples/demo_util.py             |  19 ++++
 examples/hash_demo.py             |  12 ---
 examples/sample_wiki.txt          |   3 -
 examples/sample_wiki.xml          |  18 ++++
 examples/sample_wiki.xml.bz2      | Bin 0 -> 236 bytes
 openverifiablellm/dataset_hash.py |  31 ------
 openverifiablellm/utils.py        | 156 ++++++++++++++++++++++++++++++
 pytest.ini                        |   2 -
 scripts/__init__.py               |   0
 scripts/generate_manifest.py      |  40 --------
 scripts/hash_utils.py             |  18 ----
 scripts/preprocess.py             |  72 --------------
 tests/test_dataset_hash.py        |  36 -------
 tests/test_generate_manifest.py   |  21 ----
 tests/test_util.py                | 120 +++++++++++++++++++++++
 16 files changed, 380 insertions(+), 249 deletions(-)
 create mode 100644 examples/demo_util.py
 delete mode 100644 examples/hash_demo.py
 delete mode 100644 examples/sample_wiki.txt
 create mode 100644 examples/sample_wiki.xml
 create mode 100644 examples/sample_wiki.xml.bz2
 delete mode 100644 openverifiablellm/dataset_hash.py
 create mode 100644 openverifiablellm/utils.py
 delete mode 100644 pytest.ini
 delete mode 100644 scripts/__init__.py
 delete mode 100644 scripts/generate_manifest.py
 delete mode 100644 scripts/hash_utils.py
 delete mode 100644 scripts/preprocess.py
 delete mode 100644 tests/test_dataset_hash.py
 delete mode 100644 tests/test_generate_manifest.py
 create mode 100644 tests/test_util.py

diff --git a/README.md b/README.md
index 0a68f5e..300a40d 100644
--- a/README.md
+++ b/README.md
@@ -88,16 +88,24 @@ It ensures that:
 ```text
 OpenVerifiableLLM/
 │
-├── scripts/
-│ ├── preprocess.py
-│ ├── generate_manifest.py
-│ └── hash_utils.py
+├── data/              ← created automatically at runtime
+│   ├── dataset_manifest.json
+│   └── processed/
+│       └── wiki_clean.txt
+│
+├── examples/
+│   ├── demo_util.py
+│   ├── sample_wiki.xml
+│   └── sample_wiki.xml.bz2
 │
-├── data/
-│ ├── raw/
-│ └── processed/
+├── openverifiablellm/
+│   ├── __init__.py        ← (should exist)
+│   └── utils.py
 │
-└── dataset_manifest.json
+├── tests/
+│   └── test_util.py
+│
+└── requirements.txt
 ```
 
 ---
@@ -127,24 +135,37 @@ cd OpenVerifiableLLM
 
 ### ▶ Running the Pipeline
 
-#### Step 1 — Place Dump File
+#### Step 1 — Install the Package
 
-Move your Wikipedia dump into:
+From the project root:
 
 ```bash
-data/raw/
+pip install -e .
 ```
 
+#### Step 2 — Place Dump File
+
+Move your Wikipedia dump into the project root directory
+(the same directory that contains the ```openverifiablellm/``` folder).
+
 Example:
 
 ```bash
-data/raw/simplewiki-20260201-pages-articles.xml.bz2
+simplewiki-20260201-pages-articles.xml.bz2
 ```
 
-#### Step 2 — Run Preprocessing
+Copy the file path to use as the argument when running preprocessing.
+
+Example (relative path):
 
 ```bash
-python -m scripts.preprocess <args>
+simplewiki-20260201-pages-articles.xml.bz2
+```
+
+#### Step 3 — Run Preprocessing
+
+```bash
+python -m openverifiablellm.utils simplewiki-20260201-pages-articles.xml.bz2
 ```
 
 This will:
@@ -152,6 +173,7 @@ This will:
 - Create `data/processed/wiki_clean.txt`
 - Generate `dataset_manifest.json`
 - Compute `SHA256` hashes
+- Log preprocessing status
 
 #### 📜 Example Manifest
 
@@ -166,6 +188,37 @@ This will:
 }
 ```
 
+---
+
+## 🧪 Running Tests
+
+To verify correctness and reproducibility:
+
+```bash
+pytest
+```
+
+This runs:
+
+- Unit tests for:
+
+  - `clean_wikitext`
+  - `compute_sha256`
+  - `extract_dump_date`
+  - `generate_manifest`
+
+- Integration test for:
+
+  - `extract_text_from_xml` (end-to-end pipeline using a synthetic .bz2 file)
+
+All tests should pass:
+
+```text
+11 passed in 0.xx s
+```
+
+---
+
 ## 📈 Future Extensions
 
 - Deterministic tokenization stage
diff --git a/examples/demo_util.py b/examples/demo_util.py
new file mode 100644
index 0000000..ac818e1
--- /dev/null
+++ b/examples/demo_util.py
@@ -0,0 +1,19 @@
+## Run from the project root with:
+##   python -m examples.demo_util examples/sample_wiki.xml.bz2
+
+import sys
+import logging
+from openverifiablellm.utils import extract_text_from_xml
+
+logger = logging.getLogger(__name__)
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python preprocess.py <input_dump>")
+        sys.exit(1)
+        
+    logging.basicConfig(
+    level=logging.INFO,
+    format="%(levelname)s - %(message)s"
+    )
+    extract_text_from_xml(sys.argv[1])
\ No newline at end of file
diff --git a/examples/hash_demo.py b/examples/hash_demo.py
deleted file mode 100644
index ad8842c..0000000
--- a/examples/hash_demo.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from pathlib import Path
-from openverifiablellm.dataset_hash import compute_sha256
-
-
-if __name__ == "__main__":
-    current_dir = Path(__file__).parent
-    dataset_path = current_dir / "sample_wiki.txt"
-
-    dataset_hash = compute_sha256(dataset_path)
-
-    print("Dataset Hash:")
-    print(dataset_hash)
\ No newline at end of file
diff --git a/examples/sample_wiki.txt b/examples/sample_wiki.txt
deleted file mode 100644
index c30144b..0000000
--- a/examples/sample_wiki.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-Wikipedia is a free online encyclopedia.
-It is maintained by a community of volunteers.
-This is a small reproducibility sample.
diff --git a/examples/sample_wiki.xml b/examples/sample_wiki.xml
new file mode 100644
index 0000000..a59646b
--- /dev/null
+++ b/examples/sample_wiki.xml
@@ -0,0 +1,18 @@
+import bz2
+
+xml_content = """<?xml version="1.0" encoding="UTF-8"?>
+<mediawiki>
+  <page>
+    <revision>
+      <text>
+        Hello <ref>citation</ref> world.
+        This is [[Python|programming language]]
+        {{Wikipedia }}is a free online encyclopedia.
+      </text>
+    </revision>
+  </page>
+</mediawiki>
+"""
+
+with bz2.open("examples/sample_wiki.xml.bz2", "wt", encoding="utf-8") as f:
+    f.write(xml_content)
\ No newline at end of file
diff --git a/examples/sample_wiki.xml.bz2 b/examples/sample_wiki.xml.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..ba8e6f1f5c469f6e8fbc8b554ded8fb6d4cc6f65
GIT binary patch
literal 236
zcmV<I02BX0T4*^jL0KkKS=9S`5C8x$Uw~8+Py^scfj~xzFYn*(FaWx+ieUiIJwN~e
zXaF@7^*y0Yl6p*<G-PBjCQUsAq#C1YdW?Y0Ks`VMASxn)R6&Cz)E<HqATa6Nj_r<{
zxxsSRp^|*;k^s9Xj*?1nI2gx;F&L!G#H%1fw^xn@SHr(*3V=v)#lociXBPl+b8ASJ
zkxtZ?IHaI_4ha_7dgF+DEy-5$#L8?nkZaW33RlY-tNg$7&H}?sga$GsfY{T`&L%^*
m#E?3B@mZX3@N!i$MD_#?)}V5SyLwQ)h1`)&6eKl1-oyY!ylDdf

literal 0
HcmV?d00001

diff --git a/openverifiablellm/dataset_hash.py b/openverifiablellm/dataset_hash.py
deleted file mode 100644
index 0a6f114..0000000
--- a/openverifiablellm/dataset_hash.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import hashlib
-from pathlib import Path
-from typing import Union
-
-
-def compute_sha256(file_path: Union[str, Path]) -> str:
-    """
-    Compute SHA256 hash of a file.
-
-    This provides a deterministic fingerprint of the dataset,
-    enabling reproducibility and verification.
-
-    Parameters
-    ----------
-    file_path : Union[str, Path]
-        Path to the dataset file (string or Path-like).
-
-    Returns
-    -------
-    str
-        SHA256 hash string.
-    """
-    path = Path(file_path)
-
-    sha256 = hashlib.sha256()
-
-    with path.open("rb") as f:
-        while chunk := f.read(8192):
-            sha256.update(chunk)
-
-    return sha256.hexdigest()
\ No newline at end of file
diff --git a/openverifiablellm/utils.py b/openverifiablellm/utils.py
new file mode 100644
index 0000000..ba7f042
--- /dev/null
+++ b/openverifiablellm/utils.py
@@ -0,0 +1,156 @@
+import bz2
+import re
+import defusedxml.ElementTree as ET
+from pathlib import Path
+import sys
+from typing import Union
+import hashlib
+import logging
+import json
+import platform
+
+logger = logging.getLogger(__name__)
+
+# Extract cleaned plain text from a bz2-compressed Wikipedia XML dump
+def extract_text_from_xml(input_path):
+    """
+    Process a compressed Wikipedia XML dump into cleaned plain text.
+
+    Each <page> element is parsed, its revision text is extracted,
+    cleaned using `clean_wikitext()`, and appended to a single
+    output text file.
+
+    The processed output is saved to:
+        data/processed/wiki_clean.txt
+
+    Parameters
+    ----------
+    input_path : str or Path
+        Path to the compressed Wikipedia XML (.bz2) dump file.
+
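+    Side Effects
+    ------------
+    Writes data/processed/wiki_clean.txt under the current working directory,
+    then calls `generate_manifest()`, which writes data/dataset_manifest.json.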
+    """
+    input_path = Path(input_path)
+
+    # Fixed output path
+    project_root = Path.cwd()
+    output_dir = project_root / "data" / "processed"
+    output_dir.mkdir(parents=True, exist_ok=True)
+    
+    output_path = output_dir / "wiki_clean.txt"
+
+    with bz2.open(input_path, "rb") as f:
+        context = ET.iterparse(f, events=("end",))
+
+        with open(output_path, "w", encoding="utf-8") as out:
+            for _, elem in context:
+                if elem.tag.endswith("page"):
+                    text_elem = elem.find(".//{*}text")
+
+                    if text_elem is not None and text_elem.text:
+                        cleaned = clean_wikitext(text_elem.text)
+                        if cleaned:
+                            out.write(cleaned + "\n\n")
+
+                    elem.clear()
+    logger.info("Preprocessing complete. Output saved to %s", output_path)
+    generate_manifest(input_path,output_path)
+    
+# Write the dataset manifest (dump name/date, SHA256 hashes, Python version) as JSON
+def generate_manifest(raw_path, processed_path):
+    raw_path = Path(raw_path)
+    processed_path = Path(processed_path)
+
+    if not processed_path.exists():
+        raise FileNotFoundError(
+            f"Processed file not found at {processed_path}. Run preprocessing first."
+        )
+
+    manifest = {
+        "wikipedia_dump": raw_path.name,
+        "dump_date": extract_dump_date(raw_path.name),
+        "raw_sha256": compute_sha256(str(raw_path)),
+        "processed_sha256": compute_sha256(str(processed_path)),
+        "preprocessing_version": "v1",
+        "python_version": platform.python_version()
+    }
+    project_root = Path.cwd()
+    manifest_path = project_root / "data" / "dataset_manifest.json"
+    manifest_path.parent.mkdir(parents=True, exist_ok=True)
+
+    with open(manifest_path, "w") as f:
+        json.dump(manifest, f, indent=2)
+
+    logger.info("Manifest written to %s", manifest_path)
+
+# helpers
+def compute_sha256(file_path: Union[str, Path]) -> str:
+    """
+    Compute SHA256 hash of a file.
+
+    This provides a deterministic fingerprint of the dataset,
+    enabling reproducibility and verification.
+
+    Parameters
+    ----------
+    file_path : Union[str, Path]
+        Path to the dataset file (string or Path-like).
+
+    Returns
+    -------
+    str
+        SHA256 hash string.
+    """
+    path = Path(file_path)
+
+    sha256 = hashlib.sha256()
+
+    with path.open("rb") as f:
+        while chunk := f.read(8192):
+            sha256.update(chunk)
+
+    return sha256.hexdigest()
+
+def extract_dump_date(filename: str):
+    parts = filename.split("-")
+    for part in parts:
+        if part.isdigit() and len(part) == 8:
+            return f"{part[:4]}-{part[4:6]}-{part[6:]}"
+    return "unknown"
+
+def clean_wikitext(text: str) -> str:
+    """
+    Basic deterministic wikitext cleaning.
+
+    Note:
+    This uses simple regex-based rules for speed and consistency.
+    It does NOT fully parse MediaWiki syntax.
+
+    Limitations:
+    - Deeply nested templates may not be fully removed.
+    - Some complex <ref /> cases may not be perfectly handled.
+    - This is not a complete MediaWiki parser.
+
+    These limitations are acceptable for lightweight, deterministic preprocessing.
+    """
+    text = re.sub(r"\{\{.*?\}\}", "", text, flags=re.DOTALL)
+    text = re.sub(r"<ref.*?>.*?</ref>", "", text, flags=re.DOTALL)
+    text = re.sub(r"<.*?>", "", text)
+    text = re.sub(r"\[\[.*?\|(.*?)\]\]", r"\1", text)
+    text = re.sub(r"\[\[(.*?)\]\]", r"\1", text)
+    text = re.sub(r"\s+", " ", text)
+    return text.strip()
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python preprocess.py <input_dump>")
+        sys.exit(1)
+        
+    logging.basicConfig(
+    level=logging.INFO,
+    format="%(levelname)s - %(message)s"
+    )
+    extract_text_from_xml(sys.argv[1])
diff --git a/pytest.ini b/pytest.ini
deleted file mode 100644
index 03f586d..0000000
--- a/pytest.ini
+++ /dev/null
@@ -1,2 +0,0 @@
-[pytest]
-pythonpath = .
\ No newline at end of file
diff --git a/scripts/__init__.py b/scripts/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/scripts/generate_manifest.py b/scripts/generate_manifest.py
deleted file mode 100644
index 8e73c49..0000000
--- a/scripts/generate_manifest.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import json
-import platform
-from pathlib import Path
-from .hash_utils import sha256_file
-
-# Anchor paths to project root (two levels up from this file)
-_PROJECT_ROOT = Path(__file__).resolve().parent.parent
-
-def extract_dump_date(filename: str):
-    parts = filename.split("-")
-    for part in parts:
-        if part.isdigit() and len(part) == 8:
-            return f"{part[:4]}-{part[4:6]}-{part[6:]}"
-    return "unknown"
-
-
-def generate_manifest(raw_path, processed_path):
-    raw_path = Path(raw_path)
-    processed_path = Path(processed_path)
-
-    if not processed_path.exists():
-        raise FileNotFoundError(
-            f"Processed file not found at {processed_path}. Run preprocessing first."
-        )
-
-    manifest = {
-        "wikipedia_dump": raw_path.name,
-        "dump_date": extract_dump_date(raw_path.name),
-        "raw_sha256": sha256_file(str(raw_path)),
-        "processed_sha256": sha256_file(str(processed_path)),
-        "preprocessing_version": "v1",
-        "python_version": platform.python_version()
-    }
-
-    manifest_path = _PROJECT_ROOT / "dataset_manifest.json"
-
-    with open(manifest_path, "w") as f:
-        json.dump(manifest, f, indent=2)
-
-    print(f"Manifest written to {manifest_path}")
\ No newline at end of file
diff --git a/scripts/hash_utils.py b/scripts/hash_utils.py
deleted file mode 100644
index 6106807..0000000
--- a/scripts/hash_utils.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import hashlib
-import sys
-
-def sha256_file(filepath):
-    sha256 = hashlib.sha256()
-    chunk_size = 1024 * 1024  # 1 MiB for better throughput
-    
-    with open(filepath, "rb") as f:
-        for chunk in iter(lambda: f.read(chunk_size), b""):
-            sha256.update(chunk)
-    return sha256.hexdigest()
-
-if __name__ == "__main__":
-    if len(sys.argv) < 2:
-        print("Usage: python hash_utils.py <filepath>")
-        sys.exit(1)
-    path = sys.argv[1]
-    print(sha256_file(path))
\ No newline at end of file
diff --git a/scripts/preprocess.py b/scripts/preprocess.py
deleted file mode 100644
index 236a6cc..0000000
--- a/scripts/preprocess.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import bz2
-import re
-import defusedxml.ElementTree as ET
-from pathlib import Path
-import sys
-from .generate_manifest import generate_manifest
-
-
-def clean_wikitext(text: str) -> str:
-    """
-    Basic deterministic wikitext cleaning.
-
-    NOTE:
-    This implementation intentionally uses regex-based approximations
-    for performance and determinism. It does NOT fully parse MediaWiki syntax.
-
-    Known limitations:
-    - Nested templates like {{Infobox | birth={{Date|1990|1|1}}}}
-    are not fully handled. The non-greedy template regex may leave
-    stray closing braces (e.g., "}}") in deeply nested structures.
-    - Self-closing references such as <ref name="foo"/> are only partially
-    handled. While generic tag stripping removes the tag itself,
-    complex edge cases may not be fully normalized.
-    - This is not a complete MediaWiki parser and should not be relied
-    upon for perfectly structured wikitext normalization.
-
-    These trade-offs are acceptable for v1 deterministic preprocessing.
-    """
-    text = re.sub(r"\{\{.*?\}\}", "", text, flags=re.DOTALL)
-    text = re.sub(r"<ref.*?>.*?</ref>", "", text, flags=re.DOTALL)
-    text = re.sub(r"<.*?>", "", text)
-    text = re.sub(r"\[\[.*?\|(.*?)\]\]", r"\1", text)
-    text = re.sub(r"\[\[(.*?)\]\]", r"\1", text)
-    text = re.sub(r"\s+", " ", text)
-    return text.strip()
-
-
-def extract_text_from_xml(input_path):
-    input_path = Path(input_path)
-
-    # Fixed output path
-    output_dir = Path("data/processed")
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-    output_path = output_dir / "wiki_clean.txt"
-
-    with bz2.open(input_path, "rb") as f:
-        context = ET.iterparse(f, events=("end",))
-
-        with open(output_path, "w", encoding="utf-8") as out:
-            for _, elem in context:
-                if elem.tag.endswith("page"):
-                    text_elem = elem.find(".//{*}text")
-
-                    if text_elem is not None and text_elem.text:
-                        cleaned = clean_wikitext(text_elem.text)
-                        if cleaned:
-                            out.write(cleaned + "\n\n")
-
-                    elem.clear()
-
-    print(f"Preprocessing complete. Output saved to {output_path}")
-
-    # Automatically generate manifest
-    generate_manifest(input_path, output_path)
-
-if __name__ == "__main__":
-    if len(sys.argv) < 2:
-        print("Usage: python preprocess.py <input_dump>")
-        sys.exit(1)
-
-    extract_text_from_xml(sys.argv[1])
diff --git a/tests/test_dataset_hash.py b/tests/test_dataset_hash.py
deleted file mode 100644
index b560baf..0000000
--- a/tests/test_dataset_hash.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import os
-import hashlib
-import tempfile
-import pytest
-from openverifiablellm.dataset_hash import compute_sha256
-
-
-def test_correct_sha256_output(tmp_path):
-    # Create a temporary file
-    file = tmp_path / "sample.txt"
-    content = "hello wikipedia"
-    file.write_text(content, encoding="utf-8")
-
-    # Expected hash using standard hashlib
-    expected = hashlib.sha256(content.encode("utf-8")).hexdigest()
-
-    # Hash using your function
-    actual = compute_sha256(str(file))
-
-    # Verify correctness
-    assert actual == expected
-
-
-def test_different_content_different_hash(tmp_path):
-    file1 = tmp_path / "content_a.txt"
-    file2 = tmp_path / "content_b.txt"
-
-    file1.write_text("Content A", encoding="utf-8")
-    file2.write_text("Content B", encoding="utf-8")
-
-    assert compute_sha256(file1) != compute_sha256(file2)
-
-
-def test_file_not_found():
-    with pytest.raises(FileNotFoundError):
-        compute_sha256("non_existent_file.txt")
\ No newline at end of file
diff --git a/tests/test_generate_manifest.py b/tests/test_generate_manifest.py
deleted file mode 100644
index 987e86d..0000000
--- a/tests/test_generate_manifest.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import pytest
-from scripts.generate_manifest import generate_manifest
-
-def test_generate_manifest_raises_if_processed_missing(tmp_path):
-    raw_file = tmp_path / "raw.txt"
-    raw_file.write_text("dummy")
-
-    missing_file = tmp_path / "missing.txt"
-
-    with pytest.raises(FileNotFoundError):
-        generate_manifest(raw_file, missing_file)
-        
-def test_generate_manifest_runs_if_file_exists(tmp_path):
-    raw_file = tmp_path / "raw.txt"
-    raw_file.write_text("dummy")
-
-    processed_file = tmp_path / "processed.txt"
-    processed_file.write_text("cleaned")
-
-    # Should not raise
-    generate_manifest(raw_file, processed_file)
\ No newline at end of file
diff --git a/tests/test_util.py b/tests/test_util.py
new file mode 100644
index 0000000..f163968
--- /dev/null
+++ b/tests/test_util.py
@@ -0,0 +1,120 @@
+import bz2
+import hashlib
+import pytest
+from openverifiablellm import utils
+# To run the tests:
+##   pip install -e .   (from the project root)
+##   pytest
+
+# --------------- clean_wikitext tests ------------------------------------
+
+def test_clean_wikitext_removes_templates_and_refs():
+    text = "Hello {{Infobox}} <ref>cite</ref> world"
+    cleaned = utils.clean_wikitext(text)
+    assert cleaned == "Hello world"
+
+
+def test_clean_wikitext_handles_links():
+    text = "This is [[Python|programming language]] and [[India]]"
+    cleaned = utils.clean_wikitext(text)
+    assert cleaned == "This is programming language and India"
+
+
+def test_clean_wikitext_collapses_whitespace():
+    text = "Hello   world\n\n   test"
+    cleaned = utils.clean_wikitext(text)
+    assert cleaned == "Hello world test"
+    
+# --------------- extract_dump_date tests ------------------------------------
+
+def test_extract_dump_date_valid():
+    filename = "simplewiki-20260201-pages-articles.xml.bz2"
+    assert utils.extract_dump_date(filename) == "2026-02-01"
+
+
+def test_extract_dump_date_invalid():
+    filename = "no-date-file.xml.bz2"
+    assert utils.extract_dump_date(filename) == "unknown"
+
+# --------------- generate manifest ------------------------------------
+
+def test_generate_manifest_raises_if_processed_missing(tmp_path):
+    raw_file = tmp_path / "raw.txt"
+    raw_file.write_text("dummy")
+
+    missing_file = tmp_path / "missing.txt"
+
+    with pytest.raises(FileNotFoundError):
+        utils.generate_manifest(raw_file, missing_file)
+        
+def test_generate_manifest_runs_if_file_exists(tmp_path):
+    raw_file = tmp_path / "raw.txt"
+    raw_file.write_text("dummy")
+
+    processed_file = tmp_path / "processed.txt"
+    processed_file.write_text("cleaned")
+
+    # Should not raise
+    utils.generate_manifest(raw_file, processed_file)
+    
+# --------------- compute_sha256 ------------------------------------
+
+def test_correct_sha256_output(tmp_path):
+    # Create a temporary file
+    file = tmp_path / "sample.txt"
+    content = "hello wikipedia"
+    file.write_text(content, encoding="utf-8")
+
+    # Expected hash using standard hashlib
+    expected = hashlib.sha256(content.encode("utf-8")).hexdigest()
+
+    # Hash using your function
+    actual = utils.compute_sha256(str(file))
+
+    # Verify correctness
+    assert actual == expected
+
+
+def test_different_content_different_hash(tmp_path):
+    file1 = tmp_path / "content_a.txt"
+    file2 = tmp_path / "content_b.txt"
+
+    file1.write_text("Content A", encoding="utf-8")
+    file2.write_text("Content B", encoding="utf-8")
+
+    assert utils.compute_sha256(file1) != utils.compute_sha256(file2)
+
+
+def test_file_not_found():
+    with pytest.raises(FileNotFoundError):
+        utils.compute_sha256("non_existent_file.txt")
+        
+# --------------- extract_text_from_xml tests ------------------------------------
+
+def test_extract_text_from_xml_end_to_end(tmp_path, monkeypatch):
+
+    xml_content = """<?xml version="1.0"?>
+    <mediawiki>
+      <page>
+        <revision>
+          <text>Hello [[World]]</text>
+        </revision>
+      </page>
+    </mediawiki>
+    """
+
+    input_file = tmp_path / "simplewiki-20260201-pages.xml.bz2"
+
+    with bz2.open(input_file, "wt", encoding="utf-8") as f:
+        f.write(xml_content)
+
+    # Redirect project root
+    monkeypatch.chdir(tmp_path)
+
+    utils.extract_text_from_xml(input_file)
+
+    processed_file = tmp_path / "data/processed/wiki_clean.txt"
+    assert processed_file.exists()
+
+    assert "Hello World" in processed_file.read_text()
+    
\ No newline at end of file

From 5ab32d6b1d5527dbd55889f5bc6c4ff63e774715 Mon Sep 17 00:00:00 2001
From: tanii1125 <dubeytanisha66@gmail.com>
Date: Sun, 22 Feb 2026 22:58:16 +0530
Subject: [PATCH 5/7] resolved coderabbitai reviews

---
 README.md                                    | 4 ++--
 examples/demo_util.py                        | 2 +-
 examples/{sample_wiki.xml => sample_wiki.py} | 0
 openverifiablellm/utils.py                   | 2 +-
 tests/test_util.py                           | 8 ++++++--
 5 files changed, 10 insertions(+), 6 deletions(-)
 rename examples/{sample_wiki.xml => sample_wiki.py} (100%)

diff --git a/README.md b/README.md
index 300a40d..d3fd80b 100644
--- a/README.md
+++ b/README.md
@@ -95,7 +95,7 @@ OpenVerifiableLLM/
 |
 ├── examples/
 │   ├── demo_util.py
-│   ├── sample_wiki.xml
+│   ├── sample_wiki.py
 │   └── sample_wiki.xml.bz2
 │
 ├── openverifiablellm/
@@ -146,7 +146,7 @@ pip install -e .
 #### Step 2 — Place Dump File
 
 Move your Wikipedia dump into the project root directory
-(the same directory that contains the ```openverifiablellm/``` folder).
+(the same directory that contains the `openverifiablellm/` folder).
 
 Example:
 
diff --git a/examples/demo_util.py b/examples/demo_util.py
index ac818e1..8ca4620 100644
--- a/examples/demo_util.py
+++ b/examples/demo_util.py
@@ -9,7 +9,7 @@
 
 if __name__ == "__main__":
     if len(sys.argv) < 2:
-        print("Usage: python preprocess.py <input_dump>")
+        print("Usage: python -m examples.demo_util <input_dump>")
         sys.exit(1)
         
     logging.basicConfig(
diff --git a/examples/sample_wiki.xml b/examples/sample_wiki.py
similarity index 100%
rename from examples/sample_wiki.xml
rename to examples/sample_wiki.py
diff --git a/openverifiablellm/utils.py b/openverifiablellm/utils.py
index ba7f042..084e1e2 100644
--- a/openverifiablellm/utils.py
+++ b/openverifiablellm/utils.py
@@ -146,7 +146,7 @@ def clean_wikitext(text: str) -> str:
 
 if __name__ == "__main__":
     if len(sys.argv) < 2:
-        print("Usage: python preprocess.py <input_dump>")
+        print("Usage: python -m openverifiablellm.utils <input_dump>")
         sys.exit(1)
         
     logging.basicConfig(
diff --git a/tests/test_util.py b/tests/test_util.py
index f163968..3901f2d 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -47,15 +47,19 @@ def test_generate_manifest_raises_if_processed_missing(tmp_path):
     with pytest.raises(FileNotFoundError):
         utils.generate_manifest(raw_file, missing_file)
         
-def test_generate_manifest_runs_if_file_exists(tmp_path):
+def test_generate_manifest_runs_if_file_exists(tmp_path, monkeypatch):
+    monkeypatch.chdir(tmp_path)
+
     raw_file = tmp_path / "raw.txt"
     raw_file.write_text("dummy")
 
     processed_file = tmp_path / "processed.txt"
     processed_file.write_text("cleaned")
 
-    # Should not raise
     utils.generate_manifest(raw_file, processed_file)
+
+    manifest_file = tmp_path / "data/dataset_manifest.json"
+    assert manifest_file.exists()
     
 # --------------- compute_sha256 ------------------------------------
 

From bce705a7cc887ba504def179b8409a9b63b4a0bf Mon Sep 17 00:00:00 2001
From: tanii1125 <dubeytanisha66@gmail.com>
Date: Tue, 24 Feb 2026 10:40:09 +0530
Subject: [PATCH 6/7] Address review feedback: remove __pycache__, delete
 requirements.txt, update gitignore, revert README

---
 .gitignore                                    |   1 +
 README.md                                     | 246 +++++++++---------
 pyproject.toml                                |   9 +
 requirements.txt                              |   2 -
 ...rate_manifest.cpython-313-pytest-9.0.2.pyc | Bin 1229 -> 0 bytes
 5 files changed, 137 insertions(+), 121 deletions(-)
 delete mode 100644 requirements.txt
 delete mode 100644 tests/__pycache__/test_generate_manifest.cpython-313-pytest-9.0.2.pyc

diff --git a/.gitignore b/.gitignore
index 14d1412..392cb81 100644
--- a/.gitignore
+++ b/.gitignore
@@ -331,3 +331,4 @@ __pycache__/
 *.pyc
 *.pyo
 *.pyd
+*.bz2
diff --git a/README.md b/README.md
index d3fd80b..d76f8be 100644
--- a/README.md
+++ b/README.md
@@ -48,184 +48,192 @@
 ---
 
 <div align="center">
-<h1>OpenVerifiableLLM – Deterministic Dataset Pipeline</h1>
+<h1>TODO: Project Name</h1>
 </div>
 
-OpenVerifiableLLM is a deterministic Wikipedia preprocessing and dataset verification pipeline designed to support fully reproducible LLM training.
-
-It ensures that:
-
-- The same Wikipedia dump always produces identical processed output.
-- Dataset fingerprints (SHA256 hashes) are generated for verification.
-- A manifest file captures dataset identity and environment metadata.
+[TODO](https://TODO.stability.nexus/) is a ... TODO: Project Description.
 
 ---
 
 ## 🚀 Features
 
-- **Deterministic Wikipedia preprocessing**
-- **Wikitext cleaning (templates, references, links removed)**
-- **Stable XML parsing with memory-efficient streaming**
-- **SHA256 hashing of raw and processed datasets**
-- **Automatic dataset manifest generation**
-- **Reproducible data identity tracking**
+TODO: List your main features here:
+
+- **Feature 1**: Description
+- **Feature 2**: Description
+- **Feature 3**: Description
+- **Feature 4**: Description
 
 ---
 
 ## 💻 Tech Stack
 
-- Python 3.9+
-- `xml.etree.ElementTree` (stream parsing)
-- `bz2` (compressed dump handling)
-- `hashlib` (SHA256 hashing)
-- `pathlib`
-- `re` (deterministic cleaning)
+TODO: Update based on your project
 
----
+### Frontend
+- React / Next.js / Flutter / React Native
+- TypeScript
+- TailwindCSS
 
-## 📂 Project Structure
-
-```text
-OpenVerifiableLLM/
-│
-├── data/              ← created automatically at runtime
-│   ├── dataset_manifest.json
-│   └── processed/
-│       └── wiki_clean.txt
-|
-├── examples/
-│   ├── demo_util.py
-│   ├── sample_wiki.py
-│   └── sample_wiki.xml.bz2
-│
-├── openverifiablellm/
-│   ├── __init__.py        ← (should exist)
-│   └── utils.py
-│
-├── tests/
-│   └── test_util.py
-│
-└── requirements.txt
-```
+### Backend
+- Flask / FastAPI / Node.js / Supabase
+- Database: PostgreSQL / SQLite / MongoDB
+
+### AI/ML (if applicable)
+- LangChain / LangGraph / LlamaIndex
+- Google Gemini / OpenAI / Anthropic Claude
+- Vector Database: Weaviate / Pinecone / Chroma
+- RAG / Prompt Engineering / Agent Frameworks
+
+### Blockchain (if applicable)
+- Solidity / solana / cardano / ergo Smart Contracts
+- Hardhat / Truffle / foundry
+- Web3.js / Ethers.js / Wagmi
+- OpenZeppelin / alchemy / Infura
 
 ---
 
-## �🍀 Getting Started
+## ✅ Project Checklist
+
+TODO: Complete applicable items based on your project type
+
+- [ ] **The protocol** (if applicable):
+   - [ ] has been described and formally specified in a paper.
+   - [ ] has had its main properties mathematically proven.
+   - [ ] has been formally verified.
+- [ ] **The smart contracts** (if applicable):
+   - [ ] were thoroughly reviewed by at least two knights of The Stable Order.
+   - [ ] were deployed to: [Add deployment details]
+- [ ] **The mobile app** (if applicable):
+   - [ ] has an _About_ page containing the Stability Nexus's logo and pointing to the social media accounts of the Stability Nexus.
+   - [ ] is available for download as a release in this repo.
+   - [ ] is available in the relevant app stores.
+- [ ] **The AI/ML components** (if applicable):
+   - [ ] LLM/model selection and configuration are documented.
+   - [ ] Prompts and system instructions are version-controlled.
+   - [ ] Content safety and moderation mechanisms are implemented.
+   - [ ] API keys and rate limits are properly managed.
 
-### Prerequisites
+---
 
-- Python 3.9+
-- Wikipedia dump from:
-  <https://dumps.wikimedia.org/>
+## 🔗 Repository Links
 
-Recommended for testing:
+TODO: Update with your repository structure
 
-- `simplewiki-YYYYMMDD-pages-articles.xml.bz2`
+1. [Main Repository](https://github.com/AOSSIE-Org/TODO)
+2. [Frontend](https://github.com/AOSSIE-Org/TODO/tree/main/frontend) (if separate)
+3. [Backend](https://github.com/AOSSIE-Org/TODO/tree/main/backend) (if separate)
 
 ---
 
-### Installation
+## 🏗️ Architecture Diagram
 
-#### 1. Clone the Repository
+TODO: Add your system architecture diagram here
 
-```bash
-git clone https://github.com/AOSSIE-Org/OpenVerifiableLLM.git
-cd OpenVerifiableLLM
+```
+[Architecture Diagram Placeholder]
 ```
 
-### ▶ Running the Pipeline
-
-#### Step 1 — Install the Package
-
-From the project root:
+You can create architecture diagrams using:
+- [Draw.io](https://draw.io)
+- [Excalidraw](https://excalidraw.com)
+- [Lucidchart](https://lucidchart.com)
+- [Mermaid](https://mermaid.js.org) (for code-based diagrams)
 
-```bash
-pip install -e .
-```
+Example structure to include:
+- Frontend components
+- Backend services
+- Database architecture
+- External APIs/services
+- Data flow between components
 
-#### Step 2 — Place Dump File
+---
 
-Move your Wikipedia dump into the project root directory
-(the same directory that contains the `openverifiablellm/` folder).
+## 🔄 User Flow
 
-Example:
+TODO: Add user flow diagrams showing how users interact with your application
 
-```bash
-simplewiki-20260201-pages-articles.xml.bz2
+```
+[User Flow Diagram Placeholder]
 ```
 
-Copy the file path to use as the argument when running preprocessing.
+### Key User Journeys
 
-Example (relative path):
+TODO: Document main user flows:
 
-```bash
-simplewiki-20260201-pages-articles.xml.bz2
-```
+1. **User Journey 1**: Description
+   - Step 1
+   - Step 2
+   - Step 3
 
-#### Step 3 — Run Preprocessing
+2. **User Journey 2**: Description
+   - Step 1
+   - Step 2
+   - Step 3
 
-```bash
-python -m openverifiablellm.utils simplewiki-20260201-pages-articles.xml.bz2
-```
+3. **User Journey 3**: Description
+   - Step 1
+   - Step 2
+   - Step 3
 
-This will:
+---
 
-- Create `data/processed/wiki_clean.txt`
-- Generate `dataset_manifest.json`
-- Compute `SHA256` hashes
-- Log preprocessing status
+## �🍀 Getting Started
 
-#### 📜 Example Manifest
+### Prerequisites
 
-```json
-{
-  "wikipedia_dump": "simplewiki-20260201-pages-articles.xml.bz2",
-  "dump_date": "2026-02-01",
-  "raw_sha256": "...",
-  "processed_sha256": "...",
-  "preprocessing_version": "v1",
-  "python_version": "3.13.2"
-}
-```
+TODO: List what developers need installed
 
----
+- Node.js 18+ / Python 3.9+ / Flutter SDK
+- npm / yarn / pnpm
+- [Any specific tools or accounts needed]
 
-## 🧪 Running Tests
+### Installation
 
-To verify correctness and reproducibility:
+TODO: Provide detailed setup instructions
+
+#### 1. Clone the Repository
 
 ```bash
-pytest
+git clone https://github.com/AOSSIE-Org/TODO.git
+cd TODO
 ```
 
-This runs:
+#### 2. Install Dependencies
 
-- Unit tests for:
+```bash
+npm install
+# or
+yarn install
+# or
+pnpm install
+```
 
-  - `clean_wikitext`
-  - `compute_sha256`
-  - `extract_dump_date`
-  - `generate_manifest`
+#### 3. Configure Environment Variables(.env.example)
 
-- Integration test for:
+Create a `.env` file in the root directory:
 
-  - `extract_text_from_xml` (end-to-end pipeline using a synthetic .bz2 file)
+```env
+# Add your environment variables here
+API_KEY=your_api_key
+DATABASE_URL=your_database_url
+```
 
-All tests should pass:
+#### 4. Run the Development Server
 
-```text
-11 passed in 0.xx s
+```bash
+npm run dev
+# or
+yarn dev
+# or
+pnpm dev
 ```
 
----
+#### 5. Open your Browser
 
-## 📈 Future Extensions
+Navigate to [http://localhost:3000](http://localhost:3000) to see the application.
 
-- Deterministic tokenization stage
-- Token-level hashing
-- Multi-GPU training reproducibility
-- Environment containerization (Docker)
-- Full checkpoint verification protocol
+For detailed setup instructions, please refer to our [Installation Guide](./docs/INSTALL_GUIDE.md) (if you have one).
 
 ---
 
@@ -269,4 +277,4 @@ Thanks a lot for spending your time helping TODO grow. Keep rocking 🥂
 
 [![Contributors](https://contrib.rocks/image?repo=AOSSIE-Org/TODO)](https://github.com/AOSSIE-Org/TODO/graphs/contributors)
 
-© 2025 AOSSIE
+© 2025 AOSSIE 
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 34e03d9..121d3bc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,6 +11,15 @@ authors = [
 ]
 requires-python = ">=3.9"
 
+dependencies= [
+    "defusedxml"
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest"
+]
+
 [tool.setuptools.packages.find]
 include = ["openverifiablellm*"]
 
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 634c013..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-defusedxml>=0.7.1
-pytest
\ No newline at end of file
diff --git a/tests/__pycache__/test_generate_manifest.cpython-313-pytest-9.0.2.pyc b/tests/__pycache__/test_generate_manifest.cpython-313-pytest-9.0.2.pyc
deleted file mode 100644
index b295c70fab69f451015dfc335bf413a8944f83eb..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1229
zcmbVL&1(}u6o31XWb@U~)*7OOu6oH<O|@7Asl|dV($)wq6`_S?n@rM$WV6g{Xwy?b
zPhRw3CEmRH4~SR)gg|S{qA2Lan;1dp$v2Y^TdE!$*f;y${NB8MzxUgAJRSjD&x$YX
z=Q6-&?(`P=VzsAn+ks_RmKK0YVwdL?ChIV-(tzGM3DO{*$P1x)jcOQE&HxQBt7s!7
z$q0PqSLF=JA1o1vP|G7`#d7Q-VP0MmWjZE^n}SdK4VoQT!J?O=0FJaGEgi*m&{%M?
z8`U{h5m<S(_vx^tc7aE>^RUXRh}FxML5J0FFZy6;vU`JQ4jcUKm*5;+f=m}$!0JUL
zVifarD8Wr-6pE6e<_<&D+RS=eo*yXGDwX<nv|=-6J0%`70)AwZ+Q@-N5D|3i9`fJ^
zsbw?5{J}Z9OmbCku3B>nvy@gT<&+qTuXz>MbS-bg*YJ9?h&Vsiy;yjR19e^Zuid_}
zcF!fweM0S`ZLOEd;^N&kUWN&4o)|;ct@{%kaPr9Pe3)q$O_x^lgfUVuyJcwtD{_|E
zESd%!B$7>|mAKHjbD+ld)nrReHbc9r{%QDZv(g&A*jW4$PCQ9JPVXi&dtu|fmTG2p
zwW~iBh>Y!rjdx+=km11Lb!;Z8yp2X?QlVo4;iZH?D8Br!Kvv{Gib?Is0mZ;;h;>v9
zB`^Xx8o)T^655XU=@6q1<;%o!NWqY4h+{faqr4p&L*ozh@yDh`2l(OG-^i~yj8o62
zZIUe;M@WY;A&w85Lh}#ON7647t&#CY?m!*dSJN#u-L!Yr%#R=hPCwn)Q^(tw4@T72
z)@ydzvmNHgZn<@{@1@L+II|WGoqBfF$x<S;>xcUxKh%pwFj=%=QL5q1ev0L(?RqSG
zLT`c{p&rrAE1;(^p+ki>RjC$gWin0E7{h-ISPIQIMUteC0FU5d`~zJ7u7;$E_K+rB
H5vjib@(LrW


From 89041a05ec7b8ebbd709a5260fd86731dd54b497 Mon Sep 17 00:00:00 2001
From: tanii1125 <dubeytanisha66@gmail.com>
Date: Tue, 24 Feb 2026 11:06:24 +0530
Subject: [PATCH 7/7] docs: documentation for usage in tests and examples

---
 examples/demo_util.py | 10 +++++++---
 tests/test_util.py    | 11 ++++++++---
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/examples/demo_util.py b/examples/demo_util.py
index 8ca4620..9f41446 100644
--- a/examples/demo_util.py
+++ b/examples/demo_util.py
@@ -1,12 +1,16 @@
-## run via-
-## python -m examples.demo_util examples\sample_wiki.xml.bz2
-
 import sys
 import logging
 from openverifiablellm.utils import extract_text_from_xml
 
 logger = logging.getLogger(__name__)
 
+"""
+Demo for preprocessing pipeline.
+
+Run with:
+    python -m examples.demo_util examples/sample_wiki.xml.bz2
+"""
+
 if __name__ == "__main__":
     if len(sys.argv) < 2:
         print("Usage: python -m examples.demo_util <input_dump>")
diff --git a/tests/test_util.py b/tests/test_util.py
index 3901f2d..c0cede3 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -2,9 +2,14 @@
 import hashlib
 import pytest
 from openverifiablellm import utils
-# run test by--
-## pip install -e . (from project root)
-## pytest
+
+"""
+Unit and integration tests for OpenVerifiableLLM preprocessing pipeline.
+
+Run with:
+    pip install -e ".[dev]"
+    pytest
+"""
 
 # --------------- clean_wikitext tests ------------------------------------