diff --git a/.coderabbit.yaml b/.coderabbit.yaml index 877065c..19d6c04 100644 --- a/.coderabbit.yaml +++ b/.coderabbit.yaml @@ -157,6 +157,8 @@ reviews: - Confirm that the code meets the project's requirements and objectives - Confirm that copyright years are up-to date whenever a file is changed - Point out redundant obvious comments that do not add clarity to the code + - Ensure that comments are concise and suggest more concise comment statements if possible + - Discourage usage of verbose comment styles such as NatSpec - Look for code duplication - Suggest code completions when: - seeing a TODO comment @@ -275,4 +277,4 @@ reviews: - Image optimization (appropriate size and format) - Proper @2x and @3x variants for different screen densities - SVG assets are optimized - - Font files are licensed and optimized + - Font files are licensed and optimized \ No newline at end of file diff --git a/.gitignore b/.gitignore index 46f6993..392cb81 100644 --- a/.gitignore +++ b/.gitignore @@ -324,8 +324,11 @@ TSWLatexianTemp* # option is specified. Footnotes are the stored in a file with suffix Notes.bib. # Uncomment the next line to have this generated file ignored. #*Notes.bib + +data/ *.egg-info/ __pycache__/ *.pyc *.pyo *.pyd +*.bz2 diff --git a/README.md b/README.md index 3c5adf2..d76f8be 100644 --- a/README.md +++ b/README.md @@ -277,4 +277,4 @@ Thanks a lot for spending your time helping TODO grow. Keep rocking 🥂 [![Contributors](https://contrib.rocks/image?repo=AOSSIE-Org/TODO)](https://github.com/AOSSIE-Org/TODO/graphs/contributors) -© 2025 AOSSIE +© 2025 AOSSIE \ No newline at end of file diff --git a/examples/demo_util.py b/examples/demo_util.py new file mode 100644 index 0000000..9f41446 --- /dev/null +++ b/examples/demo_util.py @@ -0,0 +1,23 @@ +import sys +import logging +from openverifiablellm.utils import extract_text_from_xml + +logger = logging.getLogger(__name__) + +""" +Demo for preprocessing pipeline. 
+
+Run with:
+    python -m examples.demo_util examples/sample_wiki.xml.bz2
+"""
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python -m examples.demo_util <path/to/dump.xml.bz2>")
+        sys.exit(1)
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(levelname)s - %(message)s"
+    )
+    extract_text_from_xml(sys.argv[1])
\ No newline at end of file
diff --git a/examples/hash_demo.py b/examples/hash_demo.py
deleted file mode 100644
index ad8842c..0000000
--- a/examples/hash_demo.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from pathlib import Path
-from openverifiablellm.dataset_hash import compute_sha256
-
-
-if __name__ == "__main__":
-    current_dir = Path(__file__).parent
-    dataset_path = current_dir / "sample_wiki.txt"
-
-    dataset_hash = compute_sha256(dataset_path)
-
-    print("Dataset Hash:")
-    print(dataset_hash)
\ No newline at end of file
diff --git a/examples/sample_wiki.py b/examples/sample_wiki.py
new file mode 100644
index 0000000..a59646b
--- /dev/null
+++ b/examples/sample_wiki.py
@@ -0,0 +1,18 @@
+import bz2
+
+xml_content = """<mediawiki>
+<page>
+<title>Sample</title>
+<revision>
+<text>
+    Hello <ref>citation</ref> world.
+    This is [[Python|programming language]]
+    {{Wikipedia }}is a free online encyclopedia.
+</text>
+</revision>
+</page>
+</mediawiki>
+"""
+
+with bz2.open("examples/sample_wiki.xml.bz2", "wt", encoding="utf-8") as f:
+    f.write(xml_content)
\ No newline at end of file
diff --git a/examples/sample_wiki.txt b/examples/sample_wiki.txt
deleted file mode 100644
index c30144b..0000000
--- a/examples/sample_wiki.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-Wikipedia is a free online encyclopedia.
-It is maintained by a community of volunteers.
-This is a small reproducibility sample.
diff --git a/examples/sample_wiki.xml.bz2 b/examples/sample_wiki.xml.bz2 new file mode 100644 index 0000000..ba8e6f1 Binary files /dev/null and b/examples/sample_wiki.xml.bz2 differ diff --git a/openverifiablellm/dataset_hash.py b/openverifiablellm/dataset_hash.py deleted file mode 100644 index 0a6f114..0000000 --- a/openverifiablellm/dataset_hash.py +++ /dev/null @@ -1,31 +0,0 @@ -import hashlib -from pathlib import Path -from typing import Union - - -def compute_sha256(file_path: Union[str, Path]) -> str: - """ - Compute SHA256 hash of a file. - - This provides a deterministic fingerprint of the dataset, - enabling reproducibility and verification. - - Parameters - ---------- - file_path : Union[str, Path] - Path to the dataset file (string or Path-like). - - Returns - ------- - str - SHA256 hash string. - """ - path = Path(file_path) - - sha256 = hashlib.sha256() - - with path.open("rb") as f: - while chunk := f.read(8192): - sha256.update(chunk) - - return sha256.hexdigest() \ No newline at end of file diff --git a/openverifiablellm/utils.py b/openverifiablellm/utils.py new file mode 100644 index 0000000..084e1e2 --- /dev/null +++ b/openverifiablellm/utils.py @@ -0,0 +1,156 @@ +import bz2 +import re +import defusedxml.ElementTree as ET +from pathlib import Path +import sys +from typing import Union +import hashlib +import logging +import json +import platform + +logger = logging.getLogger(__name__) + +# extract clean wikipage from actual wikipage +def extract_text_from_xml(input_path): + """ + Process a compressed Wikipedia XML dump into cleaned plain text. + + Each element is parsed, its revision text is extracted, + cleaned using `clean_wikitext()`, and appended to a single + output text file. + + The processed output is saved to: + data/processed/wiki_clean.txt + + Parameters + ---------- + input_path : str or Path + Path to the compressed Wikipedia XML (.bz2) dump file. 
+ + Output + ------ + Creates: + data/processed/wiki_clean.txt + """ + input_path = Path(input_path) + + # Fixed output path + project_root = Path.cwd() + output_dir = project_root / "data" / "processed" + output_dir.mkdir(parents=True, exist_ok=True) + + output_path = output_dir / "wiki_clean.txt" + + with bz2.open(input_path, "rb") as f: + context = ET.iterparse(f, events=("end",)) + + with open(output_path, "w", encoding="utf-8") as out: + for _, elem in context: + if elem.tag.endswith("page"): + text_elem = elem.find(".//{*}text") + + if text_elem is not None and text_elem.text: + cleaned = clean_wikitext(text_elem.text) + if cleaned: + out.write(cleaned + "\n\n") + + elem.clear() + logger.info("Preprocessing complete. Output saved to %s", output_path) + generate_manifest(input_path,output_path) + +# generate data manifest +def generate_manifest(raw_path, processed_path): + raw_path = Path(raw_path) + processed_path = Path(processed_path) + + if not processed_path.exists(): + raise FileNotFoundError( + f"Processed file not found at {processed_path}. Run preprocessing first." + ) + + manifest = { + "wikipedia_dump": raw_path.name, + "dump_date": extract_dump_date(raw_path.name), + "raw_sha256": compute_sha256(str(raw_path)), + "processed_sha256": compute_sha256(str(processed_path)), + "preprocessing_version": "v1", + "python_version": platform.python_version() + } + project_root = Path.cwd() + manifest_path = project_root / "data" / "dataset_manifest.json" + manifest_path.parent.mkdir(parents=True, exist_ok=True) + + with open(manifest_path, "w") as f: + json.dump(manifest, f, indent=2) + + logger.info("Manifest written to %s", manifest_path) + +# helpers +def compute_sha256(file_path: Union[str, Path]) -> str: + """ + Compute SHA256 hash of a file. + + This provides a deterministic fingerprint of the dataset, + enabling reproducibility and verification. + + Parameters + ---------- + file_path : Union[str, Path] + Path to the dataset file (string or Path-like). 
+
+    Returns
+    -------
+    str
+        SHA256 hash string.
+    """
+    path = Path(file_path)
+
+    sha256 = hashlib.sha256()
+
+    with path.open("rb") as f:
+        while chunk := f.read(8192):
+            sha256.update(chunk)
+
+    return sha256.hexdigest()
+
+def extract_dump_date(filename: str):
+    parts = filename.split("-")
+    for part in parts:
+        if part.isdigit() and len(part) == 8:
+            return f"{part[:4]}-{part[4:6]}-{part[6:]}"
+    return "unknown"
+
+def clean_wikitext(text: str) -> str:
+    """
+    Basic deterministic wikitext cleaning.
+
+    Note:
+        This uses simple regex-based rules for speed and consistency.
+        It does NOT fully parse MediaWiki syntax.
+
+    Limitations:
+        - Deeply nested templates may not be fully removed.
+        - Some complex cases may not be perfectly handled.
+        - This is not a complete MediaWiki parser.
+
+    These limitations are acceptable for lightweight, deterministic preprocessing.
+    """
+    text = re.sub(r"\{\{.*?\}\}", "", text, flags=re.DOTALL)
+    text = re.sub(r"<ref.*?</ref>", "", text, flags=re.DOTALL)
+    text = re.sub(r"<.*?>", "", text)
+    text = re.sub(r"\[\[.*?\|(.*?)\]\]", r"\1", text)
+    text = re.sub(r"\[\[(.*?)\]\]", r"\1", text)
+    text = re.sub(r"\s+", " ", text)
+    return text.strip()
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python -m openverifiablellm.utils <path/to/dump.xml.bz2>")
+        sys.exit(1)
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(levelname)s - %(message)s"
+    )
+    extract_text_from_xml(sys.argv[1])
diff --git a/pyproject.toml b/pyproject.toml
index 34e03d9..121d3bc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,6 +11,15 @@ authors = [
 ]
 requires-python = ">=3.9"
 
+dependencies = [
+    "defusedxml"
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest"
+]
+
 [tool.setuptools.packages.find]
 include = ["openverifiablellm*"]
 
diff --git a/tests/test_dataset_hash.py b/tests/test_dataset_hash.py
deleted file mode 100644
index b560baf..0000000
--- a/tests/test_dataset_hash.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import os
-import hashlib
-import tempfile
-import pytest -from openverifiablellm.dataset_hash import compute_sha256 - - -def test_correct_sha256_output(tmp_path): - # Create a temporary file - file = tmp_path / "sample.txt" - content = "hello wikipedia" - file.write_text(content, encoding="utf-8") - - # Expected hash using standard hashlib - expected = hashlib.sha256(content.encode("utf-8")).hexdigest() - - # Hash using your function - actual = compute_sha256(str(file)) - - # Verify correctness - assert actual == expected - - -def test_different_content_different_hash(tmp_path): - file1 = tmp_path / "content_a.txt" - file2 = tmp_path / "content_b.txt" - - file1.write_text("Content A", encoding="utf-8") - file2.write_text("Content B", encoding="utf-8") - - assert compute_sha256(file1) != compute_sha256(file2) - - -def test_file_not_found(): - with pytest.raises(FileNotFoundError): - compute_sha256("non_existent_file.txt") \ No newline at end of file diff --git a/tests/test_util.py b/tests/test_util.py new file mode 100644 index 0000000..c0cede3 --- /dev/null +++ b/tests/test_util.py @@ -0,0 +1,129 @@ +import bz2 +import hashlib +import pytest +from openverifiablellm import utils + +""" +Unit and integration tests for OpenVerifiableLLM preprocessing pipeline. 
+
+Run with:
+    pip install -e ".[dev]"
+    pytest
+"""
+
+# --------------- clean_wikitext tests ------------------------------------
+
+def test_clean_wikitext_removes_templates_and_refs():
+    text = "Hello {{Infobox}} <ref>cite</ref> world"
+    cleaned = utils.clean_wikitext(text)
+    assert cleaned == "Hello world"
+
+
+def test_clean_wikitext_handles_links():
+    text = "This is [[Python|programming language]] and [[India]]"
+    cleaned = utils.clean_wikitext(text)
+    assert cleaned == "This is programming language and India"
+
+
+def test_clean_wikitext_collapses_whitespace():
+    text = "Hello world\n\n test"
+    cleaned = utils.clean_wikitext(text)
+    assert cleaned == "Hello world test"
+
+# --------------- extract_dump_date tests ------------------------------------
+
+def test_extract_dump_date_valid():
+    filename = "simplewiki-20260201-pages-articles.xml.bz2"
+    assert utils.extract_dump_date(filename) == "2026-02-01"
+
+
+def test_extract_dump_date_invalid():
+    filename = "no-date-file.xml.bz2"
+    assert utils.extract_dump_date(filename) == "unknown"
+
+# --------------- generate manifest ------------------------------------
+
+def test_generate_manifest_raises_if_processed_missing(tmp_path):
+    raw_file = tmp_path / "raw.txt"
+    raw_file.write_text("dummy")
+
+    missing_file = tmp_path / "missing.txt"
+
+    with pytest.raises(FileNotFoundError):
+        utils.generate_manifest(raw_file, missing_file)
+
+def test_generate_manifest_runs_if_file_exists(tmp_path, monkeypatch):
+    monkeypatch.chdir(tmp_path)
+
+    raw_file = tmp_path / "raw.txt"
+    raw_file.write_text("dummy")
+
+    processed_file = tmp_path / "processed.txt"
+    processed_file.write_text("cleaned")
+
+    utils.generate_manifest(raw_file, processed_file)
+
+    manifest_file = tmp_path / "data/dataset_manifest.json"
+    assert manifest_file.exists()
+
+# --------------- compute_sha256 ------------------------------------
+
+def test_correct_sha256_output(tmp_path):
+    # Create a temporary file
+    file = tmp_path / "sample.txt"
+    content = "hello 
wikipedia"
+    file.write_text(content, encoding="utf-8")
+
+    # Expected hash using standard hashlib
+    expected = hashlib.sha256(content.encode("utf-8")).hexdigest()
+
+    # Hash using your function
+    actual = utils.compute_sha256(str(file))
+
+    # Verify correctness
+    assert actual == expected
+
+
+def test_different_content_different_hash(tmp_path):
+    file1 = tmp_path / "content_a.txt"
+    file2 = tmp_path / "content_b.txt"
+
+    file1.write_text("Content A", encoding="utf-8")
+    file2.write_text("Content B", encoding="utf-8")
+
+    assert utils.compute_sha256(file1) != utils.compute_sha256(file2)
+
+
+def test_file_not_found():
+    with pytest.raises(FileNotFoundError):
+        utils.compute_sha256("non_existent_file.txt")
+
+# --------------- extract_text_from_xml tests ------------------------------------
+
+def test_extract_text_from_xml_end_to_end(tmp_path, monkeypatch):
+
+    xml_content = """<mediawiki>
+    <page>
+        <revision>
+            <text>Hello [[World]]</text>
+        </revision>
+    </page>
+    </mediawiki>
+    """
+
+    input_file = tmp_path / "simplewiki-20260201-pages.xml.bz2"
+
+    with bz2.open(input_file, "wt", encoding="utf-8") as f:
+        f.write(xml_content)
+
+    # Redirect project root
+    monkeypatch.chdir(tmp_path)
+
+    utils.extract_text_from_xml(input_file)
+
+    processed_file = tmp_path / "data/processed/wiki_clean.txt"
+    assert processed_file.exists()
+
+    assert "Hello World" in processed_file.read_text()
+    
\ No newline at end of file