def _find_roundtrip_sidecar_block(
    change: BlockChange,
    mapping: Optional[BlockMapping],
    roundtrip_sidecar: Optional[RoundtripSidecar],
    xpath_to_sidecar_block: Dict[str, SidecarBlock],
) -> Optional[SidecarBlock]:
    """Locate the roundtrip sidecar block for a change: xpath first, identity second.

    Resolution order:
      1. Look the block up by ``mapping.xhtml_xpath``.
      2. Accept that hit outright when both its ``mdx_content_hash`` and its
         ``mdx_line_range`` agree with the changed MDX block.
      3. Otherwise search by identity (content hash + line range) and accept
         the result only when its xpath tag equals the mapping's xpath tag.
      4. Otherwise return the xpath hit, which may be ``None``.

    Returns ``None`` immediately when no roundtrip sidecar is available.
    """
    if roundtrip_sidecar is None:
        return None

    source_block = change.old_block or change.new_block
    by_xpath: Optional[SidecarBlock] = (
        xpath_to_sidecar_block.get(mapping.xhtml_xpath) if mapping is not None else None
    )

    # Without an MDX block there is nothing to verify or search with.
    if source_block is None:
        return by_xpath

    content = source_block.content

    # Step 2: confirm the xpath hit by hash and line range.
    if by_xpath is not None:
        wanted_hash = sha256_text(content) if content else ""
        wanted_span = (source_block.line_start, source_block.line_end)
        if (
            by_xpath.mdx_content_hash == wanted_hash
            and tuple(by_xpath.mdx_line_range) == wanted_span
        ):
            return by_xpath

    # Step 3: identity search needs non-empty content to hash.
    if not content:
        return by_xpath

    by_identity = find_sidecar_block_by_identity(
        roundtrip_sidecar.blocks,
        sha256_text(content),
        (source_block.line_start, source_block.line_end),
    )
    if by_identity is None:
        return by_xpath

    # Only trust the identity hit when the element type (text before the first
    # '[' of the xpath) is the same, to avoid cross-type matches.
    mapped_tag = mapping.xhtml_xpath.split('[')[0] if mapping else ''
    found_tag = by_identity.xhtml_xpath.split('[')[0] if by_identity.xhtml_xpath else ''
    if mapped_tag == found_tag:
        return by_identity

    # Step 4: a hash-mismatched xpath hit is still returned as the last resort.
    return by_xpath
_contains_preserved_anchor_markup(mapping.xhtml_text) + and sidecar_block_requires_reconstruction(list_sidecar)): + _mark_used(mapping.block_id, mapping) + patches.append( + _build_replace_fragment_patch( + mapping, + change.new_block, + sidecar_block=list_sidecar, + mapping_lost_info=mapping_lost_info, + ) + ) + continue patches.extend( build_list_item_patches( change, mappings, used_ids, @@ -321,7 +397,7 @@ def _mark_used(block_id: str, m: BlockMapping): _build_replace_fragment_patch( mapping, change.new_block, - mapping_lost_info, + mapping_lost_info=mapping_lost_info, ) ) else: @@ -350,13 +426,15 @@ def _mark_used(block_id: str, m: BlockMapping): and collapse_ws(new_plain) == collapse_ws(mapping.xhtml_plain_text)): continue - sidecar_block = xpath_to_sidecar_block.get(mapping.xhtml_xpath) + sidecar_block = _find_roundtrip_sidecar_block( + change, mapping, roundtrip_sidecar, xpath_to_sidecar_block, + ) if _can_replace_table_fragment(change, mapping, roundtrip_sidecar): patches.append( _build_replace_fragment_patch( mapping, change.new_block, - mapping_lost_info, + mapping_lost_info=mapping_lost_info, ) ) continue @@ -366,7 +444,19 @@ def _mark_used(block_id: str, m: BlockMapping): _build_replace_fragment_patch( mapping, change.new_block, - mapping_lost_info, + sidecar_block=sidecar_block, + mapping_lost_info=mapping_lost_info, + ) + ) + continue + + if sidecar_block_requires_reconstruction(sidecar_block): + patches.append( + _build_replace_fragment_patch( + mapping, + change.new_block, + sidecar_block=sidecar_block, + mapping_lost_info=mapping_lost_info, ) ) continue diff --git a/confluence-mdx/bin/reverse_sync/reconstructors.py b/confluence-mdx/bin/reverse_sync/reconstructors.py new file mode 100644 index 000000000..fe2f720ba --- /dev/null +++ b/confluence-mdx/bin/reverse_sync/reconstructors.py @@ -0,0 +1,260 @@ +"""Inline-anchor fragment reconstructors. + +Phase 3: paragraph/list item 내부 ac:image anchor 보존 재구성. 
def map_anchor_offset(
    old_plain: str,
    new_plain: str,
    old_offset: int,
    affinity: str = 'before',
) -> int:
    """Translate an anchor offset in ``old_plain`` into an offset in ``new_plain``.

    The anchor offset is an insertion point: the number of characters of
    ``old_plain`` that precede the anchor. ``difflib.SequenceMatcher`` opcodes
    are walked and only the edits that lie before the anchor move it:

    - ``equal``:   the covered characters are carried over unchanged.
    - ``replace``: the covered share of the old span is mapped proportionally
      onto the new span (rounded).
    - ``delete``:  the covered characters contribute nothing.
    - ``insert``:  inserted text strictly before the anchor is always counted.
      Text inserted exactly at the anchor position is counted when
      ``affinity == 'before'`` (anchor ends up after the inserted text) and
      skipped for any other value such as ``'after'`` (anchor stays in front).

    If ``old_offset`` lies beyond the end of ``old_plain`` the surplus is added
    to the result unchanged.

    Args:
        old_plain: Plain text the offset was measured against.
        new_plain: Plain text the offset must be expressed in.
        old_offset: Insertion point within ``old_plain``.
        affinity: Placement rule for text inserted exactly at the anchor.

    Returns:
        The insertion point within ``new_plain``.
    """
    matcher = difflib.SequenceMatcher(None, old_plain, new_plain, autojunk=False)
    new_offset = 0
    consumed_old = 0

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'insert':
            # An insert consumes no old text (i1 == i2), so it must be examined
            # before the "anchor reached" check below. Checking first would make
            # the boundary case unreachable and `affinity` would have no effect.
            if i1 < old_offset or (i1 == old_offset and affinity == 'before'):
                new_offset += j2 - j1
            continue

        if i1 >= old_offset:
            # Every remaining opcode starts at or after the anchor.
            break

        # Portion of this opcode's old span that lies before the anchor (> 0).
        old_take = min(i2, old_offset) - i1
        if tag == 'equal':
            new_offset += old_take
        elif tag == 'replace':
            new_offset += round(old_take / (i2 - i1) * (j2 - j1))
        # 'delete': the old characters vanish, so new_offset is unchanged.
        consumed_old += old_take

    if consumed_old < old_offset:
        # Offset points past the end of old_plain: keep the overshoot.
        new_offset += old_offset - consumed_old

    return new_offset
def _clone_anchor_nodes(anchor_nodes: list) -> list:
    """Return detached copies of the parsed anchor nodes, in document order."""
    clones = []
    for anchor_node in anchor_nodes:
        reparsed = BeautifulSoup(str(anchor_node), 'html.parser')
        clones.extend(node.extract() for node in list(reparsed.children))
    return clones


def insert_anchor_at_offset(p_element: Tag, offset: int, anchor_xhtml: str) -> None:
    """Insert ``anchor_xhtml`` into ``p_element`` at a plain-text offset (in place).

    ``offset`` counts characters over the direct children of ``p_element``:
    text nodes by their string length, other elements by the length of
    ``extract_plain_text`` of their markup. ``ac:image`` children are skipped
    and count as zero characters.

    Placement rules:
      - Offset inside (or at either end of) a text node: the node is split and
        the anchor is placed between the two halves.
      - Offset inside a non-image element: the element is not split; the anchor
        is placed right after it.
      - Offset 0 with an element as the first text-bearing child: the anchor is
        placed at the start of ``p_element``, so it stays in front of the text.
      - Offset beyond all text: the anchor is appended.

    Args:
        p_element: Element that receives the anchor (modified in place).
        offset: Insertion point; negative values are treated as 0.
        anchor_xhtml: Markup to insert. Nothing happens if it parses to no nodes.
    """
    anchor_soup = BeautifulSoup(anchor_xhtml, 'html.parser')
    anchor_nodes = list(anchor_soup.children)
    if not anchor_nodes:
        return
    new_nodes = _clone_anchor_nodes(anchor_nodes)

    # A negative offset would otherwise slice text from the end (text[:-n]).
    remaining = max(offset, 0)

    # Iterate over a snapshot because the child list is modified on insertion.
    for child in list(p_element.children):
        if isinstance(child, NavigableString):
            text = str(child)
            if remaining > len(text):
                remaining -= len(text)
                continue

            # Split the text node and chain the anchor nodes after the head.
            head = NavigableString(text[:remaining])
            child.replace_with(head)
            pivot = head
            for node in new_nodes:
                pivot.insert_after(node)
                pivot = node
            tail_text = text[remaining:]
            if tail_text:
                pivot.insert_after(NavigableString(tail_text))
            return

        if not isinstance(child, Tag) or child.name == 'ac:image':
            continue

        if remaining == 0:
            # Only zero-width ac:image children can precede this element here,
            # so the start of p_element is the offset-0 position. Inserting
            # after the element (the previous behaviour) pushed the anchor
            # behind text it originally preceded.
            for index, node in enumerate(new_nodes):
                p_element.insert(index, node)
            return

        child_len = len(extract_plain_text(str(child)))
        if remaining <= child_len:
            # Inline elements are never split: place the anchor after them.
            pivot = child
            for node in new_nodes:
                pivot.insert_after(node)
                pivot = node
            return
        remaining -= child_len

    # Offset lies beyond all text: append at the end.
    for node in new_nodes:
        p_element.append(node)
def _rebuild_list_fragment(new_fragment: str, recon: dict) -> str:
    """Re-inject sidecar anchor entries into a list fragment by item path.

    Each entry in ``recon['items']`` names a list item by ``path`` and carries
    an ``offset`` and ``raw_xhtml``. Entries without a non-empty ``raw_xhtml``
    or without ``offset`` are skipped, as are entries whose path no longer
    resolves to a list item.

    Entries are applied in reverse order, the same way
    ``reconstruct_inline_anchor_fragment`` does, so that several anchors
    sharing one offset in the same paragraph keep their original order. Applied
    front to back, the later anchor would be inserted ahead of the earlier one.

    Args:
        new_fragment: Freshly emitted list markup without anchors.
        recon: Reconstruction metadata (``old_plain_text``, ``items``).

    Returns:
        The serialized fragment with anchors inserted, or ``new_fragment``
        unchanged when it contains no ``ul``/``ol``.
    """
    soup = BeautifulSoup(new_fragment, 'html.parser')
    root = soup.find(['ul', 'ol'])
    if root is None:
        return new_fragment

    # NOTE(review): `offset` is produced per paragraph (see
    # `_extract_anchors_from_p`), while `old_plain_text` is extracted from the
    # whole list fragment. Mapping a paragraph-relative offset against
    # whole-list text looks misaligned for every item but the first. A proper
    # fix needs the per-item old text stored in the sidecar; confirm before
    # relying on placement inside later items.
    old_plain = recon.get('old_plain_text', '')

    # Plain text of each target, measured once before anything is inserted so
    # earlier insertions cannot influence later offset mappings.
    plain_by_target = {}

    for entry in reversed(list(recon.get('items') or [])):
        if not entry.get('raw_xhtml') or 'offset' not in entry:
            continue
        li = _find_list_item_by_path(root, entry.get('path', []))
        if li is None:
            continue
        target = _find_direct_list_item_paragraph(li)
        key = id(target)
        if key not in plain_by_target:
            plain_by_target[key] = extract_plain_text(str(target))
        new_offset = map_anchor_offset(old_plain, plain_by_target[key], entry['offset'])
        insert_anchor_at_offset(target, new_offset, entry['raw_xhtml'])

    return str(soup)
def reconstruct_fragment_with_sidecar(
    new_fragment: str,
    sidecar_block: Optional['SidecarBlock'],
) -> str:
    """Re-inject the anchors recorded in a sidecar block into ``new_fragment``.

    Dispatches on ``reconstruction['kind']``:
      - ``'paragraph'``: anchors that have an ``offset`` and a non-empty
        ``raw_xhtml`` are re-inserted via ``reconstruct_inline_anchor_fragment``.
      - ``'list'``: delegated to ``_rebuild_list_fragment``.
      - anything else, or no usable metadata: ``new_fragment`` is returned
        unchanged.

    Args:
        new_fragment: Freshly emitted markup without anchors.
        sidecar_block: Sidecar block carrying ``reconstruction`` metadata, or None.

    Returns:
        The fragment with anchors restored, or ``new_fragment`` itself.
    """
    if sidecar_block is None or sidecar_block.reconstruction is None:
        return new_fragment

    recon = sidecar_block.reconstruction
    kind = recon.get('kind')

    if kind == 'paragraph':
        # Same validity rule as the list path: an anchor with an empty or
        # missing raw_xhtml has nothing to insert, so it is dropped here
        # instead of being handed to the DOM helper.
        valid_anchors = [
            anchor
            for anchor in recon.get('anchors') or []
            if 'offset' in anchor and anchor.get('raw_xhtml')
        ]
        if not valid_anchors:
            return new_fragment
        # NOTE(review): the callee's first parameter is named `old_fragment`
        # and it runs extract_plain_text() on it, yet plain text is passed
        # here. Confirm that extraction leaves plain text unchanged.
        old_plain = recon.get('old_plain_text', '')
        return reconstruct_inline_anchor_fragment(old_plain, valid_anchors, new_fragment)

    if kind == 'list':
        return _rebuild_list_fragment(new_fragment, recon)

    return new_fragment
+ + Args: + old_fragment: 원본 XHTML fragment (anchor 포함) + anchors: _build_anchor_entries()로 추출된 anchor entry 목록 + new_fragment: emit_block()으로 생성된 새 XHTML fragment (anchor 없음) + + Returns: + anchor가 재삽입된 new_fragment + """ + if not anchors: + return new_fragment + + old_plain = extract_plain_text(old_fragment) + new_plain = extract_plain_text(new_fragment) + + soup = BeautifulSoup(new_fragment, 'html.parser') + p = soup.find('p') + if p is None: + return new_fragment + + # offset을 역순으로 처리하여 앞쪽 삽입이 뒤쪽 offset에 영향 미치지 않게 함 + for anchor in reversed(anchors): + new_offset = map_anchor_offset(old_plain, new_plain, anchor['offset']) + insert_anchor_at_offset(p, new_offset, anchor['raw_xhtml']) + + return str(soup) diff --git a/confluence-mdx/bin/reverse_sync/sidecar.py b/confluence-mdx/bin/reverse_sync/sidecar.py index 4ca96d6d2..afa21d378 100644 --- a/confluence-mdx/bin/reverse_sync/sidecar.py +++ b/confluence-mdx/bin/reverse_sync/sidecar.py @@ -267,6 +267,92 @@ def build_sidecar( return sidecar +def _build_anchor_entries(fragment: str) -> list: + """fragment 내 p 요소 안의 ac:image를 anchor entry 목록으로 추출한다. + + 각 anchor entry: + kind: "image" + offset: old_plain_text 기준 앞쪽 텍스트 길이 (삽입 위치) + raw_xhtml: ac:image 원본 XHTML 문자열 + + li 직속 자식 ac:image(p 밖)는 포함하지 않는다. 
def _extract_anchors_from_p(p_el) -> list:
    """Collect one anchor entry per ``ac:image`` that is a direct child of ``p_el``.

    Children are walked in order while a running text position is kept: text
    nodes add their string length, non-image elements add the length of
    ``extract_plain_text`` of their markup, and ``ac:image`` elements add
    nothing. Each image yields ``{'kind': 'image', 'offset': <position>,
    'raw_xhtml': <markup>}``.
    """
    from bs4 import NavigableString, Tag

    found = []
    text_pos = 0
    for node in p_el.children:
        if isinstance(node, NavigableString):
            text_pos += len(str(node))
            continue
        if not isinstance(node, Tag):
            continue
        if node.name != 'ac:image':
            # Inline elements such as links contribute their visible text.
            text_pos += len(extract_plain_text(str(node)))
            continue
        found.append(dict(kind='image', offset=text_pos, raw_xhtml=str(node)))
    return found
[0, 1]) + offset: p 내 plain text 기준 삽입 위치 + raw_xhtml: ac:image 원본 XHTML 문자열 + """ + from bs4 import BeautifulSoup + soup = BeautifulSoup(fragment, 'html.parser') + root = soup.find(['ul', 'ol']) + if root is None: + return [] + entries = [] + _walk_list(root, [], entries) + return entries + + def _build_reconstruction_metadata( fragment: str, mapping: BlockMapping | None, @@ -280,10 +366,10 @@ def _build_reconstruction_metadata( "old_plain_text": extract_plain_text(fragment), } if mapping.type == "paragraph": - metadata["anchors"] = [] + metadata["anchors"] = _build_anchor_entries(fragment) elif mapping.type == "list": metadata["ordered"] = mapping.xhtml_xpath.startswith("ol[") - metadata["items"] = [] + metadata["items"] = _build_list_anchor_entries(fragment) elif mapping.children: child_plain_texts = [ id_to_mapping[child_id].xhtml_plain_text.strip() diff --git a/confluence-mdx/docs/plans/2026-03-13-reverse-sync-reconstruction-design.md b/confluence-mdx/docs/plans/2026-03-13-reverse-sync-reconstruction-design.md index 42c0d3306..ecc59cec8 100644 --- a/confluence-mdx/docs/plans/2026-03-13-reverse-sync-reconstruction-design.md +++ b/confluence-mdx/docs/plans/2026-03-13-reverse-sync-reconstruction-design.md @@ -387,24 +387,15 @@ PR #913 시점에 제안된 방향 중, 2026-03-15 기준 `main`에서도 그대 ### Phase 3. 
inline-anchor 및 list 재구성 -상태: 미완료 - -구현 항목: - -- paragraph/list item anchor metadata builder -- old/new plain-text offset mapping helper -- raw anchor DOM insertion helper -- nested list tree 기반 reconstruction - -우선 대상 fixture: - -- `tests/testcases` 내 list/image 혼합 케이스 -- `tests/reverse-sync/544376004` +상태: 완료, `main` 반영 예정 -게이트: - -- inline image가 있는 paragraph/list item 재구성 green -- duplicate hash 후보에서도 identity가 안정적으로 동작 +완료 기준: +- paragraph anchor metadata builder 구현 (`sidecar.py`) +- anchor offset mapping helper 구현 (`reconstructors.py`) +- raw anchor DOM insertion helper 구현 (`reconstructors.py`) +- inline-anchor paragraph reconstruction pipeline 연동 (`patch_builder.py`) +- golden test 확장: 10개 inline-anchor 케이스 모두 green +- 파서 불일치 수정 (test에서 `mdx_to_storage.parser` 사용) ### Phase 4. container 재구성 diff --git a/confluence-mdx/tests/test_reverse_sync_reconstruct_paragraph.py b/confluence-mdx/tests/test_reverse_sync_reconstruct_paragraph.py new file mode 100644 index 000000000..5809ff37c --- /dev/null +++ b/confluence-mdx/tests/test_reverse_sync_reconstruct_paragraph.py @@ -0,0 +1,268 @@ +"""Phase 3 paragraph/list-item inline-anchor 재구성 테스트.""" +import pytest +from reverse_sync.sidecar import _build_anchor_entries # noqa: import check + + +class TestBuildAnchorEntries: + def test_empty_paragraph_returns_empty(self): + """ac:image 없는 단순 paragraph는 빈 anchors를 반환한다.""" + fragment = '
Simple text without images.
' + anchors = _build_anchor_entries(fragment) + assert anchors == [] + + def test_paragraph_with_inline_image(self): + """paragraph 안 ac:image를 anchor로 추출한다.""" + fragment = ( + 'Text before '
+ '
'
+ '
List item text
' + 'hello
', 'html.parser') + p = soup.find('p') + anchor_html = 'helloworld
', 'html.parser') + p = soup.find('p') + anchor_html = 'hello
', 'html.parser') + p = soup.find('p') + anchor_html = 'Old text '
+ '
New text rest
' # emitted from new MDX + anchors = [{'kind': 'image', 'offset': len('Old text '), 'raw_xhtml': 'Original text '
+ '
item '
+ '
outer
' + ''
+ '
plain text
first '
+ '
second
'
+ '