From 65b84648d0e48146ca95ef32a4e013591b6dcf5c Mon Sep 17 00:00:00 2001 From: JK Date: Fri, 13 Mar 2026 21:36:56 +0900 Subject: [PATCH] =?UTF-8?q?confluence-mdx:=20Phase=201=20sidecar=20schema?= =?UTF-8?q?=20v3=20=E2=80=94=20reconstruction=20metadata=20=EB=B0=8F=20ide?= =?UTF-8?q?ntity=20helper=EB=A5=BC=20=EC=B6=94=EA=B0=80=ED=95=A9=EB=8B=88?= =?UTF-8?q?=EB=8B=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - SidecarBlock에 reconstruction 필드 추가 (kind, old_plain_text, anchors, items) - ROUNDTRIP_SCHEMA_VERSION "2" → "3" 승격, v2 하위 호환 로드 유지 - build_sidecar가 block 타입별 reconstruction metadata를 자동 생성 - build_block_identity_index, find_block_by_identity: hash + line_range disambiguation - Phase 1 전용 테스트 25개 추가 (전체 845 pass) Co-Authored-By: Claude Opus 4.6 --- confluence-mdx/bin/reverse_sync/sidecar.py | 163 ++++++- .../tests/test_reverse_sync_sidecar_v2.py | 10 +- .../tests/test_reverse_sync_sidecar_v3.py | 423 ++++++++++++++++++ 3 files changed, 570 insertions(+), 26 deletions(-) create mode 100644 confluence-mdx/tests/test_reverse_sync_sidecar_v3.py diff --git a/confluence-mdx/bin/reverse_sync/sidecar.py b/confluence-mdx/bin/reverse_sync/sidecar.py index 750295df6..5e10af338 100644 --- a/confluence-mdx/bin/reverse_sync/sidecar.py +++ b/confluence-mdx/bin/reverse_sync/sidecar.py @@ -1,9 +1,10 @@ """Sidecar 통합 모듈 — Block-level roundtrip sidecar 스키마/IO + Mapping lookup/인덱스. -Block-level sidecar (schema v2): +Block-level sidecar (schema v3): RoundtripSidecar, SidecarBlock, DocumentEnvelope, build_sidecar, verify_sidecar_integrity, - write_sidecar, load_sidecar, sha256_text + write_sidecar, load_sidecar, sha256_text, + find_block_by_identity Mapping lookup (mapping.yaml v3 기반): SidecarChildEntry, SidecarEntry, load_sidecar_mapping, build_mdx_to_sidecar_index, @@ -28,7 +29,10 @@ # Roundtrip sidecar — block-level fragment + metadata # --------------------------------------------------------------------------- -ROUNDTRIP_SCHEMA_VERSION = "2" +ROUNDTRIP_SCHEMA_VERSION = "3" + +# v2 스키마도 로드 허용 (하위 호환) +_COMPATIBLE_SCHEMA_VERSIONS = frozenset({"2", "3"}) def sha256_text(text: str) -> str: @@ -45,7 +49,12 @@ class DocumentEnvelope: @dataclass class SidecarBlock: - """Individual XHTML block + metadata.""" + """Individual XHTML block + metadata. + + schema v3에서 reconstruction 필드가 추가됨: + - reconstruction: dict | None — block 재구성에 필요한 metadata + kind, old_plain_text, anchors, items(list), child_blocks 등을 포함 + """ block_index: int xhtml_xpath: str @@ -53,6 +62,7 @@ class SidecarBlock: mdx_content_hash: str = "" mdx_line_range: tuple = (0, 0) lost_info: dict = field(default_factory=dict) + reconstruction: Optional[dict] = None @dataclass @@ -79,22 +89,25 @@ def reassemble_xhtml(self) -> str: def to_dict(self) -> dict: """JSON 직렬화.""" + blocks = [] + for b in self.blocks: + d: dict = { + "block_index": b.block_index, + "xhtml_xpath": b.xhtml_xpath, + "xhtml_fragment": b.xhtml_fragment, + "mdx_content_hash": b.mdx_content_hash, + "mdx_line_range": list(b.mdx_line_range), + "lost_info": b.lost_info, + } + if b.reconstruction is not None: + d["reconstruction"] = b.reconstruction + blocks.append(d) return { "schema_version": self.schema_version, "page_id": self.page_id, "mdx_sha256": self.mdx_sha256, "source_xhtml_sha256": self.source_xhtml_sha256, - "blocks": [ - { - "block_index": b.block_index, - "xhtml_xpath": b.xhtml_xpath, - "xhtml_fragment": b.xhtml_fragment, - "mdx_content_hash": b.mdx_content_hash, - "mdx_line_range": list(b.mdx_line_range), - "lost_info": b.lost_info, - } - for b in self.blocks - ], + "blocks": blocks, "separators": self.separators, "document_envelope": { "prefix": self.document_envelope.prefix, @@ -104,7 +117,7 @@ def to_dict(self) -> dict: @staticmethod def from_dict(data: dict) -> "RoundtripSidecar": - """JSON 역직렬화.""" + """JSON 역직렬화. v2/v3 모두 지원.""" blocks = [ SidecarBlock( block_index=b["block_index"], @@ -113,6 +126,7 @@ def from_dict(data: dict) -> "RoundtripSidecar": mdx_content_hash=b.get("mdx_content_hash", ""), mdx_line_range=tuple(b.get("mdx_line_range", (0, 0))), lost_info=b.get("lost_info", {}), + reconstruction=b.get("reconstruction"), ) for b in data.get("blocks", []) ] @@ -156,6 +170,54 @@ def verify_sidecar_integrity( ) +def _build_reconstruction_metadata( + fragment: str, + xhtml_type: str, +) -> Optional[dict]: + """XHTML fragment에서 reconstruction metadata를 생성한다. + + 현재 지원하는 kind: + - paragraph: old_plain_text + anchors + - heading: old_plain_text + - list: old_plain_text + items (placeholder) + - code: (None — clean block) + - table: (None — clean block) + - html_block: kind + old_plain_text + + Phase 3에서 anchor 분석이 추가될 예정. + """ + from reverse_sync.xhtml_normalizer import extract_plain_text + + # code, table은 clean block — reconstruction metadata 불필요 + if xhtml_type in ("code", "table"): + return None + + plain_text = extract_plain_text(fragment) + + kind_map = { + "heading": "heading", + "paragraph": "paragraph", + "list": "list", + "html_block": "container", + } + kind = kind_map.get(xhtml_type, xhtml_type) + + meta: dict = { + "kind": kind, + "old_plain_text": plain_text, + } + + # list는 items placeholder (Phase 3에서 실제 item 분석) + if xhtml_type == "list": + meta["items"] = [] + + # paragraph/heading은 anchors placeholder + if xhtml_type in ("heading", "paragraph"): + meta["anchors"] = [] + + return meta + + def build_sidecar( page_xhtml_text: str, mdx_text: str, @@ -163,7 +225,8 @@ def build_sidecar( ) -> RoundtripSidecar: """Block-level sidecar를 생성한다. - Fragment 추출 → MDX alignment → 무결성 검증 → RoundtripSidecar 반환. + Fragment 추출 → MDX alignment → reconstruction metadata 빌드 → + 무결성 검증 → RoundtripSidecar 반환. """ from reverse_sync.fragment_extractor import extract_block_fragments from reverse_sync.mapping_recorder import record_mapping @@ -187,12 +250,15 @@ def build_sidecar( sidecar_blocks: List[SidecarBlock] = [] for i, fragment in enumerate(frag_result.fragments): xpath = top_mappings[i].xhtml_xpath if i < len(top_mappings) else f"unknown[{i}]" + xhtml_type = top_mappings[i].type if i < len(top_mappings) else "" # 순차 1:1 대응 (향후 block alignment로 개선) mdx_block = mdx_content_blocks[i] if i < len(mdx_content_blocks) else None mdx_hash = sha256_text(mdx_block.content) if mdx_block else "" mdx_range = (mdx_block.line_start, mdx_block.line_end) if mdx_block else (0, 0) + reconstruction = _build_reconstruction_metadata(fragment, xhtml_type) + sidecar_blocks.append( SidecarBlock( block_index=i, @@ -200,6 +266,7 @@ def build_sidecar( xhtml_fragment=fragment, mdx_content_hash=mdx_hash, mdx_line_range=mdx_range, + reconstruction=reconstruction, ) ) @@ -231,14 +298,18 @@ def write_sidecar(sidecar: RoundtripSidecar, path: Path) -> None: def load_sidecar(path: Path) -> RoundtripSidecar: - """JSON 파일에서 RoundtripSidecar를 로드한다.""" + """JSON 파일에서 RoundtripSidecar를 로드한다. + + v2와 v3 스키마를 모두 지원한다. v2 파일은 reconstruction=None으로 로드된다. + """ data: Any = json.loads(path.read_text(encoding="utf-8")) if not isinstance(data, dict): raise ValueError("invalid sidecar payload") - if data.get("schema_version") != ROUNDTRIP_SCHEMA_VERSION: + version = data.get("schema_version") + if version not in _COMPATIBLE_SCHEMA_VERSIONS: raise ValueError( - f"expected schema_version={ROUNDTRIP_SCHEMA_VERSION}, " - f"got {data.get('schema_version')}" + f"expected schema_version in {sorted(_COMPATIBLE_SCHEMA_VERSIONS)}, " + f"got {version}" ) return RoundtripSidecar.from_dict(data) @@ -526,3 +597,53 @@ def find_mapping_by_sidecar( if entry is None: return None return xpath_to_mapping.get(entry.xhtml_xpath) + + +# --------------------------------------------------------------------------- +# Block identity — hash + line_range 기반 disambiguation +# --------------------------------------------------------------------------- + +@dataclass +class _IdentityKey: + """Internal identity lookup key.""" + mdx_content_hash: str + mdx_line_range: tuple + + +def build_block_identity_index( + sidecar: RoundtripSidecar, +) -> Dict[str, List[SidecarBlock]]: + """mdx_content_hash → SidecarBlock 리스트 인덱스를 구축한다. + + 동일 hash를 가진 블록이 여러 개일 때 line_range로 disambiguation할 수 있도록 + 리스트로 저장한다. + """ + index: Dict[str, List[SidecarBlock]] = {} + for block in sidecar.blocks: + if not block.mdx_content_hash: + continue + index.setdefault(block.mdx_content_hash, []).append(block) + return index + + +def find_block_by_identity( + mdx_content_hash: str, + mdx_line_range: tuple, + identity_index: Dict[str, List[SidecarBlock]], +) -> Optional[SidecarBlock]: + """hash + line_range로 SidecarBlock을 찾는다. + + 1. hash가 유일하면 바로 반환 + 2. 같은 hash가 여러 개면 line_range가 일치하는 블록을 반환 + 3. line_range도 일치하지 않으면 None + """ + candidates = identity_index.get(mdx_content_hash) + if not candidates: + return None + if len(candidates) == 1: + return candidates[0] + # line_range로 disambiguation + for block in candidates: + if block.mdx_line_range == mdx_line_range: + return block + return None diff --git a/confluence-mdx/tests/test_reverse_sync_sidecar_v2.py b/confluence-mdx/tests/test_reverse_sync_sidecar_v2.py index ffb0c2ad8..6de14f205 100644 --- a/confluence-mdx/tests/test_reverse_sync_sidecar_v2.py +++ b/confluence-mdx/tests/test_reverse_sync_sidecar_v2.py @@ -33,7 +33,7 @@ def test_create_sidecar(self): separators=[], document_envelope=DocumentEnvelope(prefix="", suffix="\n"), ) - assert sidecar.schema_version == "2" + assert sidecar.schema_version in ("2", "3") assert sidecar.page_id == "test" assert len(sidecar.blocks) == 1 @@ -52,7 +52,7 @@ def test_to_dict_roundtrip(self): d = original.to_dict() restored = RoundtripSidecar.from_dict(d) - assert restored.schema_version == "2" + assert restored.schema_version in ("2", "3") assert restored.page_id == "123" assert len(restored.blocks) == 2 assert restored.blocks[0].xhtml_fragment == "

A

" @@ -145,7 +145,7 @@ def test_roundtrip(self, tmp_path): def test_load_rejects_wrong_version(self, tmp_path): path = tmp_path / "bad.json" path.write_text('{"schema_version": "1"}', encoding="utf-8") - with pytest.raises(ValueError, match="expected schema_version=2"): + with pytest.raises(ValueError, match="expected schema_version in"): load_sidecar(path) @@ -155,7 +155,7 @@ def test_simple_case(self): mdx = "## Title\n\nBody text\n" sidecar = build_sidecar(xhtml, mdx, page_id="test") - assert sidecar.schema_version == "2" + assert sidecar.schema_version in ("2", "3") assert sidecar.page_id == "test" assert sidecar.mdx_sha256 == sha256_text(mdx) assert sidecar.source_xhtml_sha256 == sha256_text(xhtml) @@ -189,7 +189,7 @@ def test_all_testcases_build_and_verify(self, testcases_dir): mdx = mdx_path.read_text(encoding="utf-8") sidecar = build_sidecar(xhtml, mdx, page_id=case_dir.name) - assert sidecar.schema_version == "2" + assert sidecar.schema_version in ("2", "3") assert len(sidecar.blocks) > 0 assert len(sidecar.separators) == len(sidecar.blocks) - 1 ok += 1 diff --git a/confluence-mdx/tests/test_reverse_sync_sidecar_v3.py b/confluence-mdx/tests/test_reverse_sync_sidecar_v3.py new file mode 100644 index 000000000..448b89353 --- /dev/null +++ b/confluence-mdx/tests/test_reverse_sync_sidecar_v3.py @@ -0,0 +1,423 @@ +"""reverse_sync/sidecar.py schema v3 — reconstruction metadata 및 identity helper 테스트. + +Phase 1 게이트: +- SidecarBlock.reconstruction 필드 직렬화/역직렬화 +- build_sidecar가 reconstruction metadata를 생성 +- v2 파일 하위 호환 로드 +- hash + line_range 기반 identity helper +- 기존 21개 testcase build + integrity 유지 +""" + +import json +from pathlib import Path + +import pytest + +from reverse_sync.sidecar import ( + DocumentEnvelope, + ROUNDTRIP_SCHEMA_VERSION, + RoundtripSidecar, + SidecarBlock, + build_block_identity_index, + build_sidecar, + find_block_by_identity, + load_sidecar, + sha256_text, + write_sidecar, +) + +TESTCASES_DIR = Path(__file__).parent / "testcases" + + +# --------------------------------------------------------------------------- +# Schema v3 기본 동작 +# --------------------------------------------------------------------------- + +class TestSchemaV3: + def test_schema_version_is_3(self): + assert ROUNDTRIP_SCHEMA_VERSION == "3" + + def test_sidecar_block_reconstruction_field(self): + block = SidecarBlock( + block_index=0, + xhtml_xpath="p[1]", + xhtml_fragment="

text

", + reconstruction={ + "kind": "paragraph", + "old_plain_text": "text", + "anchors": [], + }, + ) + assert block.reconstruction is not None + assert block.reconstruction["kind"] == "paragraph" + + def test_sidecar_block_reconstruction_none(self): + block = SidecarBlock( + block_index=0, + xhtml_xpath="macro-code[1]", + xhtml_fragment="", + ) + assert block.reconstruction is None + + def test_to_dict_includes_reconstruction(self): + sidecar = RoundtripSidecar( + page_id="test", + blocks=[ + SidecarBlock( + block_index=0, + xhtml_xpath="p[1]", + xhtml_fragment="

A

", + reconstruction={"kind": "paragraph", "old_plain_text": "A", "anchors": []}, + ), + ], + ) + d = sidecar.to_dict() + assert "reconstruction" in d["blocks"][0] + assert d["blocks"][0]["reconstruction"]["kind"] == "paragraph" + + def test_to_dict_omits_reconstruction_when_none(self): + sidecar = RoundtripSidecar( + page_id="test", + blocks=[ + SidecarBlock( + block_index=0, + xhtml_xpath="macro-code[1]", + xhtml_fragment="x", + ), + ], + ) + d = sidecar.to_dict() + assert "reconstruction" not in d["blocks"][0] + + def test_from_dict_with_reconstruction(self): + data = { + "schema_version": "3", + "page_id": "test", + "blocks": [ + { + "block_index": 0, + "xhtml_xpath": "p[1]", + "xhtml_fragment": "

A

", + "reconstruction": { + "kind": "paragraph", + "old_plain_text": "A", + "anchors": [ + { + "anchor_id": "p[1]/ac:image[1]", + "raw_xhtml": "", + "old_plain_offset": 2, + "affinity": "after", + } + ], + }, + } + ], + "separators": [], + "document_envelope": {"prefix": "", "suffix": ""}, + } + sidecar = RoundtripSidecar.from_dict(data) + block = sidecar.blocks[0] + assert block.reconstruction is not None + assert len(block.reconstruction["anchors"]) == 1 + assert block.reconstruction["anchors"][0]["old_plain_offset"] == 2 + + def test_from_dict_without_reconstruction(self): + """v2 형식 데이터는 reconstruction=None으로 로드된다.""" + data = { + "schema_version": "2", + "page_id": "test", + "blocks": [ + { + "block_index": 0, + "xhtml_xpath": "p[1]", + "xhtml_fragment": "

A

", + } + ], + "separators": [], + "document_envelope": {"prefix": "", "suffix": ""}, + } + sidecar = RoundtripSidecar.from_dict(data) + assert sidecar.blocks[0].reconstruction is None + + def test_json_roundtrip_with_reconstruction(self): + sidecar = RoundtripSidecar( + page_id="test", + blocks=[ + SidecarBlock( + block_index=0, + xhtml_xpath="ul[1]", + xhtml_fragment="
  • X
", + reconstruction={ + "kind": "list", + "old_plain_text": "X", + "items": [{"item_xpath": "ul[1]/li[1]", "old_plain_text": "X"}], + }, + ), + ], + ) + json_str = json.dumps(sidecar.to_dict(), ensure_ascii=False) + restored = RoundtripSidecar.from_dict(json.loads(json_str)) + assert restored.blocks[0].reconstruction["kind"] == "list" + assert len(restored.blocks[0].reconstruction["items"]) == 1 + + +# --------------------------------------------------------------------------- +# v2 하위 호환 로드 +# --------------------------------------------------------------------------- + +class TestV2Compatibility: + def test_load_v2_file(self, tmp_path): + """v2 schema 파일이 정상 로드된다.""" + data = { + "schema_version": "2", + "page_id": "compat", + "blocks": [ + { + "block_index": 0, + "xhtml_xpath": "p[1]", + "xhtml_fragment": "

Old

", + "mdx_content_hash": "h", + "mdx_line_range": [1, 1], + "lost_info": {}, + } + ], + "separators": [], + "document_envelope": {"prefix": "", "suffix": ""}, + } + path = tmp_path / "v2.json" + path.write_text(json.dumps(data), encoding="utf-8") + + sidecar = load_sidecar(path) + assert sidecar.schema_version == "2" + assert sidecar.blocks[0].reconstruction is None + + def test_load_v3_file(self, tmp_path): + """v3 schema 파일이 정상 로드된다.""" + data = { + "schema_version": "3", + "page_id": "new", + "blocks": [ + { + "block_index": 0, + "xhtml_xpath": "p[1]", + "xhtml_fragment": "

New

", + "reconstruction": {"kind": "paragraph", "old_plain_text": "New"}, + } + ], + "separators": [], + "document_envelope": {"prefix": "", "suffix": ""}, + } + path = tmp_path / "v3.json" + path.write_text(json.dumps(data), encoding="utf-8") + + sidecar = load_sidecar(path) + assert sidecar.schema_version == "3" + assert sidecar.blocks[0].reconstruction is not None + + def test_load_v1_rejected(self, tmp_path): + """v1은 거부된다.""" + path = tmp_path / "v1.json" + path.write_text('{"schema_version": "1"}', encoding="utf-8") + with pytest.raises(ValueError, match="expected schema_version in"): + load_sidecar(path) + + def test_write_load_roundtrip_v3(self, tmp_path): + sidecar = RoundtripSidecar( + page_id="rt", + blocks=[ + SidecarBlock( + block_index=0, + xhtml_xpath="p[1]", + xhtml_fragment="

RT

", + mdx_content_hash="h", + mdx_line_range=(5, 5), + reconstruction={"kind": "paragraph", "old_plain_text": "RT", "anchors": []}, + ), + ], + separators=[], + document_envelope=DocumentEnvelope(), + ) + path = tmp_path / "sidecar.json" + write_sidecar(sidecar, path) + loaded = load_sidecar(path) + assert loaded.blocks[0].reconstruction == {"kind": "paragraph", "old_plain_text": "RT", "anchors": []} + + +# --------------------------------------------------------------------------- +# Block identity helper +# --------------------------------------------------------------------------- + +class TestBlockIdentity: + @pytest.fixture + def sidecar_with_duplicates(self): + return RoundtripSidecar( + blocks=[ + SidecarBlock(0, "p[1]", "

A

", "hash_a", (1, 1)), + SidecarBlock(1, "p[2]", "

B

", "hash_b", (3, 3)), + SidecarBlock(2, "p[3]", "

A

", "hash_a", (5, 5)), # duplicate hash + SidecarBlock(3, "p[4]", "

C

", "hash_c", (7, 7)), + ], + ) + + def test_unique_hash_found(self, sidecar_with_duplicates): + index = build_block_identity_index(sidecar_with_duplicates) + result = find_block_by_identity("hash_b", (3, 3), index) + assert result is not None + assert result.block_index == 1 + + def test_unique_hash_found_regardless_of_line_range(self, sidecar_with_duplicates): + """hash가 유일하면 line_range가 달라도 찾는다.""" + index = build_block_identity_index(sidecar_with_duplicates) + result = find_block_by_identity("hash_b", (999, 999), index) + assert result is not None + assert result.block_index == 1 + + def test_duplicate_hash_disambiguated_by_line_range(self, sidecar_with_duplicates): + index = build_block_identity_index(sidecar_with_duplicates) + result1 = find_block_by_identity("hash_a", (1, 1), index) + result2 = find_block_by_identity("hash_a", (5, 5), index) + assert result1 is not None and result1.block_index == 0 + assert result2 is not None and result2.block_index == 2 + + def test_duplicate_hash_no_matching_line_range(self, sidecar_with_duplicates): + index = build_block_identity_index(sidecar_with_duplicates) + result = find_block_by_identity("hash_a", (99, 99), index) + assert result is None + + def test_nonexistent_hash(self, sidecar_with_duplicates): + index = build_block_identity_index(sidecar_with_duplicates) + result = find_block_by_identity("nonexistent", (1, 1), index) + assert result is None + + def test_empty_hash_skipped(self): + sidecar = RoundtripSidecar( + blocks=[SidecarBlock(0, "p[1]", "

A

", "", (1, 1))], + ) + index = build_block_identity_index(sidecar) + assert len(index) == 0 + + def test_identity_index_groups_correctly(self, sidecar_with_duplicates): + index = build_block_identity_index(sidecar_with_duplicates) + assert len(index["hash_a"]) == 2 + assert len(index["hash_b"]) == 1 + assert len(index["hash_c"]) == 1 + + +# --------------------------------------------------------------------------- +# build_sidecar reconstruction metadata +# --------------------------------------------------------------------------- + +class TestBuildSidecarReconstructionMetadata: + def test_simple_case_has_reconstruction(self): + xhtml = "

Title

\n

Body text

" + mdx = "## Title\n\nBody text\n" + sidecar = build_sidecar(xhtml, mdx, page_id="test") + + assert sidecar.schema_version == "3" + # heading block + h_block = sidecar.blocks[0] + assert h_block.reconstruction is not None + assert h_block.reconstruction["kind"] == "heading" + assert h_block.reconstruction["old_plain_text"] == "Title" + assert h_block.reconstruction["anchors"] == [] + # paragraph block + p_block = sidecar.blocks[1] + assert p_block.reconstruction is not None + assert p_block.reconstruction["kind"] == "paragraph" + assert p_block.reconstruction["old_plain_text"] == "Body text" + + def test_code_block_no_reconstruction(self): + xhtml = ( + '' + 'python' + '' + '' + ) + mdx = "```python\nx = 1\n```\n" + sidecar = build_sidecar(xhtml, mdx, page_id="test") + assert sidecar.blocks[0].reconstruction is None + + def test_list_block_has_reconstruction(self): + xhtml = "
  • Item 1

  • Item 2

" + mdx = "- Item 1\n- Item 2\n" + sidecar = build_sidecar(xhtml, mdx, page_id="test") + block = sidecar.blocks[0] + assert block.reconstruction is not None + assert block.reconstruction["kind"] == "list" + assert "items" in block.reconstruction + + +# --------------------------------------------------------------------------- +# 실제 testcase에서 build + integrity + reconstruction 검증 +# --------------------------------------------------------------------------- + +class TestBuildSidecarRealTestcasesV3: + @pytest.fixture + def testcases_dir(self): + return TESTCASES_DIR + + def test_all_testcases_build_and_verify(self, testcases_dir): + """21개 testcase 모두 schema v3로 build + integrity pass.""" + if not testcases_dir.is_dir(): + pytest.skip("testcases directory not found") + + ok = 0 + for case_dir in sorted(testcases_dir.iterdir()): + if not case_dir.is_dir(): + continue + xhtml_path = case_dir / "page.xhtml" + mdx_path = case_dir / "expected.mdx" + if not xhtml_path.exists() or not mdx_path.exists(): + continue + + xhtml = xhtml_path.read_text(encoding="utf-8") + mdx = mdx_path.read_text(encoding="utf-8") + sidecar = build_sidecar(xhtml, mdx, page_id=case_dir.name) + + assert sidecar.schema_version == "3" + assert len(sidecar.blocks) > 0 + assert len(sidecar.separators) == len(sidecar.blocks) - 1 + ok += 1 + + assert ok >= 21, f"Expected at least 21 testcases, got {ok}" + + def test_reconstruction_metadata_present(self, testcases_dir): + """실제 testcase에서 reconstruction이 생성되는지 확인.""" + case_dir = testcases_dir / "544113141" + if not case_dir.exists(): + pytest.skip("testcase 544113141 not found") + + xhtml = (case_dir / "page.xhtml").read_text(encoding="utf-8") + mdx = (case_dir / "expected.mdx").read_text(encoding="utf-8") + sidecar = build_sidecar(xhtml, mdx, page_id="544113141") + + # heading block은 reconstruction 있어야 함 + heading_blocks = [b for b in sidecar.blocks if b.xhtml_xpath.startswith("h")] + assert len(heading_blocks) > 0 + for b in heading_blocks: + assert b.reconstruction is not None + assert b.reconstruction["kind"] == "heading" + assert len(b.reconstruction["old_plain_text"]) > 0 + + def test_identity_index_from_real_testcase(self, testcases_dir): + """실제 testcase에서 identity index가 올바르게 구축된다.""" + case_dir = testcases_dir / "544113141" + if not case_dir.exists(): + pytest.skip("testcase 544113141 not found") + + xhtml = (case_dir / "page.xhtml").read_text(encoding="utf-8") + mdx = (case_dir / "expected.mdx").read_text(encoding="utf-8") + sidecar = build_sidecar(xhtml, mdx, page_id="544113141") + + index = build_block_identity_index(sidecar) + + # 모든 hash가 있는 block이 인덱스에 있어야 함 + hashed_blocks = [b for b in sidecar.blocks if b.mdx_content_hash] + total_in_index = sum(len(v) for v in index.values()) + assert total_in_index == len(hashed_blocks) + + # 각 block을 identity로 다시 찾을 수 있어야 함 + for b in hashed_blocks: + found = find_block_by_identity(b.mdx_content_hash, b.mdx_line_range, index) + assert found is not None, f"Failed to find block {b.block_index} by identity" + assert found.block_index == b.block_index