From d74121468f2cedaaff0831d0b221687c63bb766a Mon Sep 17 00:00:00 2001 From: Shubhamx404 Date: Sun, 8 Mar 2026 02:10:27 +0530 Subject: [PATCH 1/4] Add malformed XML edge case test and fix verify.py regression --- openverifiablellm/verify.py | 13 +++++++------ tests/test_util.py | 25 ++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/openverifiablellm/verify.py b/openverifiablellm/verify.py index fe36aa9..bbdab16 100644 --- a/openverifiablellm/verify.py +++ b/openverifiablellm/verify.py @@ -285,12 +285,6 @@ def verify_preprocessing( actual=raw_merkle_actual, detail=f"Merkle root of raw dump (chunk={chunk_size} bytes)", ) - _check_field( - report, "manifest_chunk_size_bytes", - expected=manifest.get("chunk_size_bytes"), - actual=reproduced_manifest.get("chunk_size_bytes"), - detail="Merkle chunk size used during preprocessing", - ) else: report.add(CheckResult( name="raw_merkle_root", @@ -416,6 +410,13 @@ def verify_preprocessing( actual=reproduced_manifest.get("preprocessing_version"), detail="Preprocessing version tag", ) + if "chunk_size_bytes" in manifest: + _check_field( + report, "manifest_chunk_size_bytes", + expected=manifest["chunk_size_bytes"], + actual=reproduced_manifest.get("chunk_size_bytes"), + detail="Merkle chunk size used during preprocessing", + ) else: report.add(CheckResult( name="manifest_regenerated", diff --git a/tests/test_util.py b/tests/test_util.py index 4dcb989..430e1be 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -269,4 +269,27 @@ def test_export_and_load_merkle_proof(tmp_path): proof_file_path=proof_file, chunk_data=chunk, expected_root=root, - ) \ No newline at end of file + ) + +def test_extract_text_from_xml_malformed_xml(tmp_path, monkeypatch): + import defusedxml.ElementTree as ET + + malformed_xml_content = """ + + + + Hello [[Malformed]] + + + + """ + + input_file = tmp_path / "simplewiki-20260201-pages-malformed.xml" + + with open(input_file, "w", encoding="utf-8") as f: + f.write(malformed_xml_content) + + monkeypatch.chdir(tmp_path) + + with pytest.raises(ET.ParseError): + utils.extract_text_from_xml(input_file) From e923a038852a0c646baf771c01e4d46fccbd34f6 Mon Sep 17 00:00:00 2001 From: Shubham Kumar Sharma Date: Sun, 8 Mar 2026 12:39:13 +0530 Subject: [PATCH 2/4] Update openverifiablellm/verify.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- openverifiablellm/verify.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/openverifiablellm/verify.py b/openverifiablellm/verify.py index bbdab16..880b534 100644 --- a/openverifiablellm/verify.py +++ b/openverifiablellm/verify.py @@ -417,6 +417,12 @@ def verify_preprocessing( actual=reproduced_manifest.get("chunk_size_bytes"), detail="Merkle chunk size used during preprocessing", ) + else: + report.add(CheckResult( + name="manifest_chunk_size_bytes", + status=CheckStatus.SKIP, + detail="Field absent from manifest (older version)", + )) else: report.add(CheckResult( name="manifest_regenerated", From 0d0626a3dcacdc4f98754138b0522cb47f50089f Mon Sep 17 00:00:00 2001 From: Shubhamx404 Date: Sun, 8 Mar 2026 20:21:04 +0530 Subject: [PATCH 3/4] fixed coderabbit issue --- tests/test_verify.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_verify.py b/tests/test_verify.py index 57a9287..132d154 100644 --- a/tests/test_verify.py +++ b/tests/test_verify.py @@ -365,10 +365,11 @@ def setUp(self): def test_merkle_checks_are_skipped(self): r = verify_preprocessing(self.dump, project_root=self.tmp) - for name in ("raw_merkle_root", "processed_merkle_root"): + for name in ("raw_merkle_root", "processed_merkle_root", "manifest_chunk_size_bytes"): c = next((x for x in r.checks if x.name == name), None) self.assertIsNotNone(c, f"check '{name}' not found") self.assertEqual(c.status, CheckStatus.SKIP) + self.assertIn("Field absent from manifest (older version)", c.detail) def test_other_checks_still_pass(self): r = verify_preprocessing(self.dump, project_root=self.tmp) From c1a4ef7b2eec32ab6d71b1aa8518ca458a75153d Mon Sep 17 00:00:00 2001 From: Shubhamx404 Date: Wed, 18 Mar 2026 02:03:15 +0530 Subject: [PATCH 4/4] style: run formatter to fix CI lint issue --- openverifiablellm/verify.py | 12 +++++++----- tests/test_util.py | 1 + 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/openverifiablellm/verify.py b/openverifiablellm/verify.py index f625093..4edfbca 100644 --- a/openverifiablellm/verify.py +++ b/openverifiablellm/verify.py @@ -603,11 +603,13 @@ def verify_preprocessing( detail="Merkle chunk size used during preprocessing", ) else: - report.add(CheckResult( - name="manifest_chunk_size_bytes", - status=CheckStatus.SKIP, - detail="Field absent from manifest (older version)", - )) + report.add( + CheckResult( + name="manifest_chunk_size_bytes", + status=CheckStatus.SKIP, + detail="Field absent from manifest (older version)", + ) + ) else: report.add( CheckResult( diff --git a/tests/test_util.py b/tests/test_util.py index 1182dff..a2b2d02 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -317,6 +317,7 @@ def test_export_and_load_merkle_proof(tmp_path): expected_root=root, ) + def test_extract_text_from_xml_malformed_xml(tmp_path, monkeypatch): import defusedxml.ElementTree as ET