diff --git a/openverifiablellm/verify.py b/openverifiablellm/verify.py index 5ec3891..4edfbca 100644 --- a/openverifiablellm/verify.py +++ b/openverifiablellm/verify.py @@ -593,6 +593,7 @@ def verify_preprocessing( actual=reproduced_manifest.get("preprocessing_version"), detail="Preprocessing version tag", ) + if "chunk_size_bytes" in manifest: _check_field( report, @@ -601,6 +602,14 @@ def verify_preprocessing( actual=reproduced_manifest.get("chunk_size_bytes"), detail="Merkle chunk size used during preprocessing", ) + else: + report.add( + CheckResult( + name="manifest_chunk_size_bytes", + status=CheckStatus.SKIP, + detail="Field absent from manifest (older version)", + ) + ) else: report.add( CheckResult( diff --git a/tests/test_util.py b/tests/test_util.py index 0ea6cac..a2b2d02 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -316,3 +316,27 @@ def test_export_and_load_merkle_proof(tmp_path): chunk_data=chunk, expected_root=root, ) + + +def test_extract_text_from_xml_malformed_xml(tmp_path, monkeypatch): + import defusedxml.ElementTree as ET + + malformed_xml_content = """ + + + + Hello [[Malformed]] + + + + """ + + input_file = tmp_path / "simplewiki-20260201-pages-malformed.xml" + + with open(input_file, "w", encoding="utf-8") as f: + f.write(malformed_xml_content) + + monkeypatch.chdir(tmp_path) + + with pytest.raises(ET.ParseError): + utils.extract_text_from_xml(input_file) diff --git a/tests/test_verify.py b/tests/test_verify.py index 931a66e..ce6f015 100644 --- a/tests/test_verify.py +++ b/tests/test_verify.py @@ -367,10 +367,11 @@ def setUp(self): def test_merkle_checks_are_skipped(self): r = verify_preprocessing(self.dump, project_root=self.tmp) - for name in ("raw_merkle_root", "processed_merkle_root"): + for name in ("raw_merkle_root", "processed_merkle_root", "manifest_chunk_size_bytes"): c = next((x for x in r.checks if x.name == name), None) self.assertIsNotNone(c, f"check '{name}' not found") self.assertEqual(c.status, CheckStatus.SKIP) + self.assertIn("Field absent from manifest (older version)", c.detail) def test_other_checks_still_pass(self): r = verify_preprocessing(self.dump, project_root=self.tmp)