diff --git a/confluence-mdx/bin/reverse_sync/sidecar.py b/confluence-mdx/bin/reverse_sync/sidecar.py index 750295df6..5e10af338 100644 --- a/confluence-mdx/bin/reverse_sync/sidecar.py +++ b/confluence-mdx/bin/reverse_sync/sidecar.py @@ -1,9 +1,10 @@ """Sidecar 통합 모듈 — Block-level roundtrip sidecar 스키마/IO + Mapping lookup/인덱스. -Block-level sidecar (schema v2): +Block-level sidecar (schema v3): RoundtripSidecar, SidecarBlock, DocumentEnvelope, build_sidecar, verify_sidecar_integrity, - write_sidecar, load_sidecar, sha256_text + write_sidecar, load_sidecar, sha256_text, + find_block_by_identity Mapping lookup (mapping.yaml v3 기반): SidecarChildEntry, SidecarEntry, load_sidecar_mapping, build_mdx_to_sidecar_index, @@ -28,7 +29,10 @@ # Roundtrip sidecar — block-level fragment + metadata # --------------------------------------------------------------------------- -ROUNDTRIP_SCHEMA_VERSION = "2" +ROUNDTRIP_SCHEMA_VERSION = "3" + +# v2 스키마도 로드 허용 (하위 호환) +_COMPATIBLE_SCHEMA_VERSIONS = frozenset({"2", "3"}) def sha256_text(text: str) -> str: @@ -45,7 +49,12 @@ class DocumentEnvelope: @dataclass class SidecarBlock: - """Individual XHTML block + metadata.""" + """Individual XHTML block + metadata. + + schema v3에서 reconstruction 필드가 추가됨: + - reconstruction: dict | None — block 재구성에 필요한 metadata + kind, old_plain_text, anchors, items(list), child_blocks 등을 포함 + """ block_index: int xhtml_xpath: str @@ -53,6 +62,7 @@ class SidecarBlock: mdx_content_hash: str = "" mdx_line_range: tuple = (0, 0) lost_info: dict = field(default_factory=dict) + reconstruction: Optional[dict] = None @dataclass @@ -79,22 +89,25 @@ def reassemble_xhtml(self) -> str: def to_dict(self) -> dict: """JSON 직렬화.""" + blocks = [] + for b in self.blocks: + d: dict = { + "block_index": b.block_index, + "xhtml_xpath": b.xhtml_xpath, + "xhtml_fragment": b.xhtml_fragment, + "mdx_content_hash": b.mdx_content_hash, + "mdx_line_range": list(b.mdx_line_range), + "lost_info": b.lost_info, + } + if b.reconstruction is not None: + d["reconstruction"] = b.reconstruction + blocks.append(d) return { "schema_version": self.schema_version, "page_id": self.page_id, "mdx_sha256": self.mdx_sha256, "source_xhtml_sha256": self.source_xhtml_sha256, - "blocks": [ - { - "block_index": b.block_index, - "xhtml_xpath": b.xhtml_xpath, - "xhtml_fragment": b.xhtml_fragment, - "mdx_content_hash": b.mdx_content_hash, - "mdx_line_range": list(b.mdx_line_range), - "lost_info": b.lost_info, - } - for b in self.blocks - ], + "blocks": blocks, "separators": self.separators, "document_envelope": { "prefix": self.document_envelope.prefix, @@ -104,7 +117,7 @@ def to_dict(self) -> dict: @staticmethod def from_dict(data: dict) -> "RoundtripSidecar": - """JSON 역직렬화.""" + """JSON 역직렬화. v2/v3 모두 지원.""" blocks = [ SidecarBlock( block_index=b["block_index"], @@ -113,6 +126,7 @@ def from_dict(data: dict) -> "RoundtripSidecar": mdx_content_hash=b.get("mdx_content_hash", ""), mdx_line_range=tuple(b.get("mdx_line_range", (0, 0))), lost_info=b.get("lost_info", {}), + reconstruction=b.get("reconstruction"), ) for b in data.get("blocks", []) ] @@ -156,6 +170,54 @@ def verify_sidecar_integrity( ) +def _build_reconstruction_metadata( + fragment: str, + xhtml_type: str, +) -> Optional[dict]: + """XHTML fragment에서 reconstruction metadata를 생성한다. + + 현재 지원하는 kind: + - paragraph: old_plain_text + anchors + - heading: old_plain_text + - list: old_plain_text + items (placeholder) + - code: (None — clean block) + - table: (None — clean block) + - html_block: kind + old_plain_text + + Phase 3에서 anchor 분석이 추가될 예정. + """ + from reverse_sync.xhtml_normalizer import extract_plain_text + + # code, table은 clean block — reconstruction metadata 불필요 + if xhtml_type in ("code", "table"): + return None + + plain_text = extract_plain_text(fragment) + + kind_map = { + "heading": "heading", + "paragraph": "paragraph", + "list": "list", + "html_block": "container", + } + kind = kind_map.get(xhtml_type, xhtml_type) + + meta: dict = { + "kind": kind, + "old_plain_text": plain_text, + } + + # list는 items placeholder (Phase 3에서 실제 item 분석) + if xhtml_type == "list": + meta["items"] = [] + + # paragraph/heading은 anchors placeholder + if xhtml_type in ("heading", "paragraph"): + meta["anchors"] = [] + + return meta + + def build_sidecar( page_xhtml_text: str, mdx_text: str, @@ -163,7 +225,8 @@ def build_sidecar( ) -> RoundtripSidecar: """Block-level sidecar를 생성한다. - Fragment 추출 → MDX alignment → 무결성 검증 → RoundtripSidecar 반환. + Fragment 추출 → MDX alignment → reconstruction metadata 빌드 → + 무결성 검증 → RoundtripSidecar 반환. """ from reverse_sync.fragment_extractor import extract_block_fragments from reverse_sync.mapping_recorder import record_mapping @@ -187,12 +250,15 @@ def build_sidecar( sidecar_blocks: List[SidecarBlock] = [] for i, fragment in enumerate(frag_result.fragments): xpath = top_mappings[i].xhtml_xpath if i < len(top_mappings) else f"unknown[{i}]" + xhtml_type = top_mappings[i].type if i < len(top_mappings) else "" # 순차 1:1 대응 (향후 block alignment로 개선) mdx_block = mdx_content_blocks[i] if i < len(mdx_content_blocks) else None mdx_hash = sha256_text(mdx_block.content) if mdx_block else "" mdx_range = (mdx_block.line_start, mdx_block.line_end) if mdx_block else (0, 0) + reconstruction = _build_reconstruction_metadata(fragment, xhtml_type) + sidecar_blocks.append( SidecarBlock( block_index=i, @@ -200,6 +266,7 @@ def build_sidecar( xhtml_fragment=fragment, mdx_content_hash=mdx_hash, mdx_line_range=mdx_range, + reconstruction=reconstruction, ) ) @@ -231,14 +298,18 @@ def write_sidecar(sidecar: RoundtripSidecar, path: Path) -> None: def load_sidecar(path: Path) -> RoundtripSidecar: - """JSON 파일에서 RoundtripSidecar를 로드한다.""" + """JSON 파일에서 RoundtripSidecar를 로드한다. + + v2와 v3 스키마를 모두 지원한다. v2 파일은 reconstruction=None으로 로드된다. + """ data: Any = json.loads(path.read_text(encoding="utf-8")) if not isinstance(data, dict): raise ValueError("invalid sidecar payload") - if data.get("schema_version") != ROUNDTRIP_SCHEMA_VERSION: + version = data.get("schema_version") + if version not in _COMPATIBLE_SCHEMA_VERSIONS: raise ValueError( - f"expected schema_version={ROUNDTRIP_SCHEMA_VERSION}, " - f"got {data.get('schema_version')}" + f"expected schema_version in {sorted(_COMPATIBLE_SCHEMA_VERSIONS)}, " + f"got {version}" ) return RoundtripSidecar.from_dict(data) @@ -526,3 +597,53 @@ def find_mapping_by_sidecar( if entry is None: return None return xpath_to_mapping.get(entry.xhtml_xpath) + + +# --------------------------------------------------------------------------- +# Block identity — hash + line_range 기반 disambiguation +# --------------------------------------------------------------------------- + +@dataclass +class _IdentityKey: + """Internal identity lookup key.""" + mdx_content_hash: str + mdx_line_range: tuple + + +def build_block_identity_index( + sidecar: RoundtripSidecar, +) -> Dict[str, List[SidecarBlock]]: + """mdx_content_hash → SidecarBlock 리스트 인덱스를 구축한다. + + 동일 hash를 가진 블록이 여러 개일 때 line_range로 disambiguation할 수 있도록 + 리스트로 저장한다. + """ + index: Dict[str, List[SidecarBlock]] = {} + for block in sidecar.blocks: + if not block.mdx_content_hash: + continue + index.setdefault(block.mdx_content_hash, []).append(block) + return index + + +def find_block_by_identity( + mdx_content_hash: str, + mdx_line_range: tuple, + identity_index: Dict[str, List[SidecarBlock]], +) -> Optional[SidecarBlock]: + """hash + line_range로 SidecarBlock을 찾는다. + + 1. hash가 유일하면 바로 반환 + 2. 같은 hash가 여러 개면 line_range가 일치하는 블록을 반환 + 3. line_range도 일치하지 않으면 None + """ + candidates = identity_index.get(mdx_content_hash) + if not candidates: + return None + if len(candidates) == 1: + return candidates[0] + # line_range로 disambiguation + for block in candidates: + if block.mdx_line_range == mdx_line_range: + return block + return None diff --git a/confluence-mdx/tests/test_reverse_sync_sidecar_v2.py b/confluence-mdx/tests/test_reverse_sync_sidecar_v2.py index ffb0c2ad8..6de14f205 100644 --- a/confluence-mdx/tests/test_reverse_sync_sidecar_v2.py +++ b/confluence-mdx/tests/test_reverse_sync_sidecar_v2.py @@ -33,7 +33,7 @@ def test_create_sidecar(self): separators=[], document_envelope=DocumentEnvelope(prefix="", suffix="\n"), ) - assert sidecar.schema_version == "2" + assert sidecar.schema_version in ("2", "3") assert sidecar.page_id == "test" assert len(sidecar.blocks) == 1 @@ -52,7 +52,7 @@ def test_to_dict_roundtrip(self): d = original.to_dict() restored = RoundtripSidecar.from_dict(d) - assert restored.schema_version == "2" + assert restored.schema_version in ("2", "3") assert restored.page_id == "123" assert len(restored.blocks) == 2 assert restored.blocks[0].xhtml_fragment == "
text
", + reconstruction={ + "kind": "paragraph", + "old_plain_text": "text", + "anchors": [], + }, + ) + assert block.reconstruction is not None + assert block.reconstruction["kind"] == "paragraph" + + def test_sidecar_block_reconstruction_none(self): + block = SidecarBlock( + block_index=0, + xhtml_xpath="macro-code[1]", + xhtml_fragment="A
", + reconstruction={"kind": "paragraph", "old_plain_text": "A", "anchors": []}, + ), + ], + ) + d = sidecar.to_dict() + assert "reconstruction" in d["blocks"][0] + assert d["blocks"][0]["reconstruction"]["kind"] == "paragraph" + + def test_to_dict_omits_reconstruction_when_none(self): + sidecar = RoundtripSidecar( + page_id="test", + blocks=[ + SidecarBlock( + block_index=0, + xhtml_xpath="macro-code[1]", + xhtml_fragment="x",
+ ),
+ ],
+ )
+ d = sidecar.to_dict()
+ assert "reconstruction" not in d["blocks"][0]
+
+ def test_from_dict_with_reconstruction(self):
+ data = {
+ "schema_version": "3",
+ "page_id": "test",
+ "blocks": [
+ {
+ "block_index": 0,
+ "xhtml_xpath": "p[1]",
+ "xhtml_fragment": "A
", + "reconstruction": { + "kind": "paragraph", + "old_plain_text": "A", + "anchors": [ + { + "anchor_id": "p[1]/ac:image[1]", + "raw_xhtml": "A
", + } + ], + "separators": [], + "document_envelope": {"prefix": "", "suffix": ""}, + } + sidecar = RoundtripSidecar.from_dict(data) + assert sidecar.blocks[0].reconstruction is None + + def test_json_roundtrip_with_reconstruction(self): + sidecar = RoundtripSidecar( + page_id="test", + blocks=[ + SidecarBlock( + block_index=0, + xhtml_xpath="ul[1]", + xhtml_fragment="Old
", + "mdx_content_hash": "h", + "mdx_line_range": [1, 1], + "lost_info": {}, + } + ], + "separators": [], + "document_envelope": {"prefix": "", "suffix": ""}, + } + path = tmp_path / "v2.json" + path.write_text(json.dumps(data), encoding="utf-8") + + sidecar = load_sidecar(path) + assert sidecar.schema_version == "2" + assert sidecar.blocks[0].reconstruction is None + + def test_load_v3_file(self, tmp_path): + """v3 schema 파일이 정상 로드된다.""" + data = { + "schema_version": "3", + "page_id": "new", + "blocks": [ + { + "block_index": 0, + "xhtml_xpath": "p[1]", + "xhtml_fragment": "New
", + "reconstruction": {"kind": "paragraph", "old_plain_text": "New"}, + } + ], + "separators": [], + "document_envelope": {"prefix": "", "suffix": ""}, + } + path = tmp_path / "v3.json" + path.write_text(json.dumps(data), encoding="utf-8") + + sidecar = load_sidecar(path) + assert sidecar.schema_version == "3" + assert sidecar.blocks[0].reconstruction is not None + + def test_load_v1_rejected(self, tmp_path): + """v1은 거부된다.""" + path = tmp_path / "v1.json" + path.write_text('{"schema_version": "1"}', encoding="utf-8") + with pytest.raises(ValueError, match="expected schema_version in"): + load_sidecar(path) + + def test_write_load_roundtrip_v3(self, tmp_path): + sidecar = RoundtripSidecar( + page_id="rt", + blocks=[ + SidecarBlock( + block_index=0, + xhtml_xpath="p[1]", + xhtml_fragment="RT
", + mdx_content_hash="h", + mdx_line_range=(5, 5), + reconstruction={"kind": "paragraph", "old_plain_text": "RT", "anchors": []}, + ), + ], + separators=[], + document_envelope=DocumentEnvelope(), + ) + path = tmp_path / "sidecar.json" + write_sidecar(sidecar, path) + loaded = load_sidecar(path) + assert loaded.blocks[0].reconstruction == {"kind": "paragraph", "old_plain_text": "RT", "anchors": []} + + +# --------------------------------------------------------------------------- +# Block identity helper +# --------------------------------------------------------------------------- + +class TestBlockIdentity: + @pytest.fixture + def sidecar_with_duplicates(self): + return RoundtripSidecar( + blocks=[ + SidecarBlock(0, "p[1]", "A
", "hash_a", (1, 1)), + SidecarBlock(1, "p[2]", "B
", "hash_b", (3, 3)), + SidecarBlock(2, "p[3]", "A
", "hash_a", (5, 5)), # duplicate hash + SidecarBlock(3, "p[4]", "C
", "hash_c", (7, 7)), + ], + ) + + def test_unique_hash_found(self, sidecar_with_duplicates): + index = build_block_identity_index(sidecar_with_duplicates) + result = find_block_by_identity("hash_b", (3, 3), index) + assert result is not None + assert result.block_index == 1 + + def test_unique_hash_found_regardless_of_line_range(self, sidecar_with_duplicates): + """hash가 유일하면 line_range가 달라도 찾는다.""" + index = build_block_identity_index(sidecar_with_duplicates) + result = find_block_by_identity("hash_b", (999, 999), index) + assert result is not None + assert result.block_index == 1 + + def test_duplicate_hash_disambiguated_by_line_range(self, sidecar_with_duplicates): + index = build_block_identity_index(sidecar_with_duplicates) + result1 = find_block_by_identity("hash_a", (1, 1), index) + result2 = find_block_by_identity("hash_a", (5, 5), index) + assert result1 is not None and result1.block_index == 0 + assert result2 is not None and result2.block_index == 2 + + def test_duplicate_hash_no_matching_line_range(self, sidecar_with_duplicates): + index = build_block_identity_index(sidecar_with_duplicates) + result = find_block_by_identity("hash_a", (99, 99), index) + assert result is None + + def test_nonexistent_hash(self, sidecar_with_duplicates): + index = build_block_identity_index(sidecar_with_duplicates) + result = find_block_by_identity("nonexistent", (1, 1), index) + assert result is None + + def test_empty_hash_skipped(self): + sidecar = RoundtripSidecar( + blocks=[SidecarBlock(0, "p[1]", "A
", "", (1, 1))], + ) + index = build_block_identity_index(sidecar) + assert len(index) == 0 + + def test_identity_index_groups_correctly(self, sidecar_with_duplicates): + index = build_block_identity_index(sidecar_with_duplicates) + assert len(index["hash_a"]) == 2 + assert len(index["hash_b"]) == 1 + assert len(index["hash_c"]) == 1 + + +# --------------------------------------------------------------------------- +# build_sidecar reconstruction metadata +# --------------------------------------------------------------------------- + +class TestBuildSidecarReconstructionMetadata: + def test_simple_case_has_reconstruction(self): + xhtml = "Body text
" + mdx = "## Title\n\nBody text\n" + sidecar = build_sidecar(xhtml, mdx, page_id="test") + + assert sidecar.schema_version == "3" + # heading block + h_block = sidecar.blocks[0] + assert h_block.reconstruction is not None + assert h_block.reconstruction["kind"] == "heading" + assert h_block.reconstruction["old_plain_text"] == "Title" + assert h_block.reconstruction["anchors"] == [] + # paragraph block + p_block = sidecar.blocks[1] + assert p_block.reconstruction is not None + assert p_block.reconstruction["kind"] == "paragraph" + assert p_block.reconstruction["old_plain_text"] == "Body text" + + def test_code_block_no_reconstruction(self): + xhtml = ( + 'Item 1
Item 2