From 21e0b578e6808023b610ef5aa2f4a703787bb527 Mon Sep 17 00:00:00 2001 From: JK Date: Sun, 15 Mar 2026 15:20:14 +0900 Subject: [PATCH 01/13] =?UTF-8?q?confluence-mdx:=20Phase=203=20golden=20te?= =?UTF-8?q?st=20=E2=80=94=20=ED=8C=8C=EC=84=9C=20=EB=B6=88=EC=9D=BC?= =?UTF-8?q?=EC=B9=98=20=EC=88=98=EC=A0=95=20=EB=B0=8F=20inline-anchor=20?= =?UTF-8?q?=EC=BC=80=EC=9D=B4=EC=8A=A4=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- ...est_reverse_sync_reconstruction_goldens.py | 62 ++++++++++++++++++- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/confluence-mdx/tests/test_reverse_sync_reconstruction_goldens.py b/confluence-mdx/tests/test_reverse_sync_reconstruction_goldens.py index 650c019ee..288be0846 100644 --- a/confluence-mdx/tests/test_reverse_sync_reconstruction_goldens.py +++ b/confluence-mdx/tests/test_reverse_sync_reconstruction_goldens.py @@ -6,7 +6,7 @@ from reverse_sync.block_diff import diff_blocks from reverse_sync.mapping_recorder import record_mapping -from reverse_sync.mdx_block_parser import parse_mdx_blocks +from mdx_to_storage.parser import parse_mdx_blocks from reverse_sync.patch_builder import build_patches from reverse_sync.sidecar import ( SidecarEntry, @@ -23,8 +23,8 @@ def _run_pipeline_with_sidecar(xhtml: str, original_mdx: str, improved_mdx: str): - original_blocks = parse_mdx_blocks(original_mdx) - improved_blocks = parse_mdx_blocks(improved_mdx) + original_blocks = list(parse_mdx_blocks(original_mdx)) + improved_blocks = list(parse_mdx_blocks(improved_mdx)) changes, alignment = diff_blocks(original_blocks, improved_blocks) mappings = record_mapping(xhtml) @@ -89,3 +89,59 @@ def test_544178405_paragraph_and_table_change(self): case['xhtml'], case['original_mdx'], case['improved_mdx'] ) assert normalize_fragment(result) == normalize_fragment(case['expected']) + + def test_1911652402_inline_anchor_paragraph(self): + case = _load_testcase('1911652402') + result = _run_pipeline_with_sidecar( + case['xhtml'], case['original_mdx'], case['improved_mdx'] + ) + assert normalize_fragment(result) == normalize_fragment(case['expected']) + + def test_544113141_list_with_trailing_image(self): + case = _load_testcase('544113141') + result = _run_pipeline_with_sidecar( + case['xhtml'], case['original_mdx'], case['improved_mdx'] + ) + assert normalize_fragment(result) == normalize_fragment(case['expected']) + + def test_544145591_list_change_with_inline_images(self): + case = _load_testcase('544145591') + result = _run_pipeline_with_sidecar( + case['xhtml'], case['original_mdx'], case['improved_mdx'] + ) + assert normalize_fragment(result) == normalize_fragment(case['expected']) + + def test_544377869_paragraph_with_link(self): + case = _load_testcase('544377869') + result = _run_pipeline_with_sidecar( + case['xhtml'], case['original_mdx'], case['improved_mdx'] + ) + assert normalize_fragment(result) == normalize_fragment(case['expected']) + + def test_568918170_paragraph_with_link(self): + case = _load_testcase('568918170') + result = _run_pipeline_with_sidecar( + case['xhtml'], case['original_mdx'], case['improved_mdx'] + ) + assert normalize_fragment(result) == normalize_fragment(case['expected']) + + def test_692355151_heading_change_with_link_para(self): + case = _load_testcase('692355151') + result = _run_pipeline_with_sidecar( + case['xhtml'], case['original_mdx'], case['improved_mdx'] + ) + assert normalize_fragment(result) == normalize_fragment(case['expected']) + + def test_880181257_list_with_nested_image(self): + case = _load_testcase('880181257') + result = _run_pipeline_with_sidecar( + case['xhtml'], case['original_mdx'], case['improved_mdx'] + ) + assert normalize_fragment(result) == normalize_fragment(case['expected']) + + def test_883654669_list_with_image(self): + case = _load_testcase('883654669') + result = _run_pipeline_with_sidecar( + case['xhtml'], case['original_mdx'], case['improved_mdx'] + ) + assert normalize_fragment(result) == normalize_fragment(case['expected']) From c952500132b994e5e24bb6e31e5479d740bf7769 Mon Sep 17 00:00:00 2001 From: JK Date: Sun, 15 Mar 2026 15:23:51 +0900 Subject: [PATCH 02/13] =?UTF-8?q?confluence-mdx:=20Phase=203=20sidecar=20a?= =?UTF-8?q?nchor=20metadata=20=E2=80=94=20paragraph=20ac:image=20anchor=20?= =?UTF-8?q?entry=20=EC=B6=94=EC=B6=9C=20=EA=B5=AC=ED=98=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- confluence-mdx/bin/reverse_sync/sidecar.py | 33 ++++++++++- ...test_reverse_sync_reconstruct_paragraph.py | 55 +++++++++++++++++++ 2 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 confluence-mdx/tests/test_reverse_sync_reconstruct_paragraph.py diff --git a/confluence-mdx/bin/reverse_sync/sidecar.py b/confluence-mdx/bin/reverse_sync/sidecar.py index 4ca96d6d2..67aa6eaac 100644 --- a/confluence-mdx/bin/reverse_sync/sidecar.py +++ b/confluence-mdx/bin/reverse_sync/sidecar.py @@ -267,6 +267,37 @@ def build_sidecar( return sidecar +def _build_anchor_entries(fragment: str) -> list: + """fragment 내 p 요소 안의 ac:image를 anchor entry 목록으로 추출한다. + + 각 anchor entry: + kind: "image" + offset: old_plain_text 기준 앞쪽 텍스트 길이 (삽입 위치) + raw_xhtml: ac:image 원본 XHTML 문자열 + + li 직속 자식 ac:image(p 밖)는 포함하지 않는다. + """ + from bs4 import BeautifulSoup, NavigableString, Tag + soup = BeautifulSoup(fragment, 'html.parser') + anchors = [] + for p in soup.find_all('p'): + offset = 0 + for child in p.children: + if isinstance(child, NavigableString): + offset += len(str(child)) + elif isinstance(child, Tag): + if child.name == 'ac:image': + anchors.append({ + 'kind': 'image', + 'offset': offset, + 'raw_xhtml': str(child), + }) + else: + # ac:link 등 텍스트를 포함하는 inline 요소는 텍스트 추출 + offset += len(extract_plain_text(str(child))) + return anchors + + def _build_reconstruction_metadata( fragment: str, mapping: BlockMapping | None, @@ -280,7 +311,7 @@ def _build_reconstruction_metadata( "old_plain_text": extract_plain_text(fragment), } if mapping.type == "paragraph": - metadata["anchors"] = [] + metadata["anchors"] = _build_anchor_entries(fragment) elif mapping.type == "list": metadata["ordered"] = mapping.xhtml_xpath.startswith("ol[") metadata["items"] = [] diff --git a/confluence-mdx/tests/test_reverse_sync_reconstruct_paragraph.py b/confluence-mdx/tests/test_reverse_sync_reconstruct_paragraph.py new file mode 100644 index 000000000..0dc992d63 --- /dev/null +++ b/confluence-mdx/tests/test_reverse_sync_reconstruct_paragraph.py @@ -0,0 +1,55 @@ +"""Phase 3 paragraph/list-item inline-anchor 재구성 테스트.""" +import pytest +from reverse_sync.sidecar import _build_anchor_entries # noqa: import check + + +class TestBuildAnchorEntries: + def test_empty_paragraph_returns_empty(self): + """ac:image 없는 단순 paragraph는 빈 anchors를 반환한다.""" + fragment = '

Simple text without images.

' + anchors = _build_anchor_entries(fragment) + assert anchors == [] + + def test_paragraph_with_inline_image(self): + """paragraph 안 ac:image를 anchor로 추출한다.""" + fragment = ( + '

Text before ' + '' + ' text after

' + ) + anchors = _build_anchor_entries(fragment) + assert len(anchors) == 1 + assert anchors[0]['kind'] == 'image' + assert anchors[0]['offset'] == len('Text before ') + assert 'ac:image' in anchors[0]['raw_xhtml'] + + def test_paragraph_with_multiple_images(self): + """여러 ac:image를 순서대로 추출한다.""" + fragment = ( + '

' + '' + 'middle' + '' + '

' + ) + anchors = _build_anchor_entries(fragment) + assert len(anchors) == 2 + assert anchors[0]['offset'] == 0 + assert anchors[1]['offset'] == len('middle') + + def test_image_in_list_item_ignored(self): + """li 직속 자식 ac:image(p 밖)는 anchors에 포함하지 않는다.""" + fragment = ( + '
  • ' + '

    List item text

    ' + '' + '
  • ' + ) + anchors = _build_anchor_entries(fragment) + assert anchors == [] + + def test_no_paragraph_returns_empty(self): + """p 요소가 없는 fragment는 빈 anchors를 반환한다.""" + fragment = '

    Just a heading

    ' + anchors = _build_anchor_entries(fragment) + assert anchors == [] From 4c6a10a6fd94f4e13abfeebe27c0b6519bfb074a Mon Sep 17 00:00:00 2001 From: JK Date: Sun, 15 Mar 2026 15:30:12 +0900 Subject: [PATCH 03/13] =?UTF-8?q?confluence-mdx:=20Phase=203=20sidecar=20?= =?UTF-8?q?=5Fbuild=5Fanchor=5Fentries=20=E2=80=94=20recursive=3DFalse?= =?UTF-8?q?=EB=A1=9C=20p=20=ED=83=90=EC=83=89=20=EB=B2=94=EC=9C=84=20?= =?UTF-8?q?=EC=A0=9C=ED=95=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- confluence-mdx/bin/reverse_sync/sidecar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/confluence-mdx/bin/reverse_sync/sidecar.py b/confluence-mdx/bin/reverse_sync/sidecar.py index 67aa6eaac..366148b50 100644 --- a/confluence-mdx/bin/reverse_sync/sidecar.py +++ b/confluence-mdx/bin/reverse_sync/sidecar.py @@ -280,7 +280,7 @@ def _build_anchor_entries(fragment: str) -> list: from bs4 import BeautifulSoup, NavigableString, Tag soup = BeautifulSoup(fragment, 'html.parser') anchors = [] - for p in soup.find_all('p'): + for p in soup.find_all('p', recursive=False): offset = 0 for child in p.children: if isinstance(child, NavigableString): From c3c2a2cd53968ad6b1f7e6e67226aa01fe5273e3 Mon Sep 17 00:00:00 2001 From: JK Date: Sun, 15 Mar 2026 15:32:49 +0900 Subject: [PATCH 04/13] =?UTF-8?q?confluence-mdx:=20Phase=203=20reconstruct?= =?UTF-8?q?ors=20=E2=80=94=20anchor=20offset=20=EB=A7=A4=ED=95=91=20?= =?UTF-8?q?=EB=B0=8F=20DOM=20=EC=82=BD=EC=9E=85=20helper=20=EA=B5=AC?= =?UTF-8?q?=ED=98=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- .../bin/reverse_sync/reconstructors.py | 159 ++++++++++++++++++ ...test_reverse_sync_reconstruct_paragraph.py | 98 +++++++++++ 2 files changed, 257 insertions(+) create mode 100644 confluence-mdx/bin/reverse_sync/reconstructors.py diff --git a/confluence-mdx/bin/reverse_sync/reconstructors.py b/confluence-mdx/bin/reverse_sync/reconstructors.py new file mode 100644 index 000000000..015d491ba --- /dev/null +++ b/confluence-mdx/bin/reverse_sync/reconstructors.py @@ -0,0 +1,159 @@ +"""Inline-anchor fragment reconstructors. + +Phase 3: paragraph/list item 내부 ac:image anchor 보존 재구성. +anchor offset 매핑 + DOM 삽입 + fragment 재구성 공용 helper. +""" +from __future__ import annotations + +import difflib +from typing import List + +from bs4 import BeautifulSoup, NavigableString, Tag + +from reverse_sync.xhtml_normalizer import extract_plain_text + + +def map_anchor_offset(old_plain: str, new_plain: str, old_offset: int) -> int: + """old_plain에서의 anchor offset을 new_plain 기준 offset으로 변환한다. + + difflib SequenceMatcher opcode를 사용해 old 좌표계를 new 좌표계로 매핑한다. + anchor offset은 해당 위치 앞의 텍스트 바이트 수다 (삽입 지점). + + anchor 앞쪽 텍스트에 적용된 변경만 offset에 반영한다: + - equal: 그대로 유지 + - replace: new 길이로 비례 매핑 + - insert (i1==i2 <= old_offset): new 텍스트 길이를 더함 + - delete: 삭제된 길이만큼 뺌 + """ + matcher = difflib.SequenceMatcher(None, old_plain, new_plain, autojunk=False) + new_offset = 0 + consumed_old = 0 + + for tag, i1, i2, j1, j2 in matcher.get_opcodes(): + if consumed_old >= old_offset: + break + + if tag == 'equal': + take = min(i2, old_offset) - i1 + if take > 0: + new_offset += take + consumed_old += take + + elif tag == 'replace': + old_take = min(i2, old_offset) - i1 + if old_take > 0: + old_len = i2 - i1 + new_len = j2 - j1 + ratio = old_take / old_len + new_offset += round(ratio * new_len) + consumed_old += old_take + + elif tag == 'delete': + old_take = min(i2, old_offset) - i1 + if old_take > 0: + consumed_old += old_take + + elif tag == 'insert': + if i1 <= old_offset: + new_offset += j2 - j1 + + if consumed_old < old_offset: + new_offset += old_offset - consumed_old + + return new_offset + + +def insert_anchor_at_offset(p_element: Tag, offset: int, anchor_xhtml: str) -> None: + """p 요소 내 offset 위치에 anchor_xhtml을 DOM 삽입한다 (in-place). + + offset은 extract_plain_text() 기준의 문자 수다. + 텍스트 노드를 순회하며 올바른 텍스트 노드를 분할하고 anchor를 삽입한다. + """ + anchor_soup = BeautifulSoup(anchor_xhtml, 'html.parser') + anchor_nodes = list(anchor_soup.children) + + remaining = offset + children = list(p_element.children) + + for i, child in enumerate(children): + if isinstance(child, NavigableString): + text_len = len(str(child)) + if remaining <= text_len: + text = str(child) + before = text[:remaining] + after = text[remaining:] + + # Replace original text node with the "before" part + child.replace_with(NavigableString(before)) + + ref_node = p_element.find(string=before) if before else None + + for anchor_node in reversed(anchor_nodes): + cloned = BeautifulSoup(str(anchor_node), 'html.parser') + for n in list(cloned.children): + if ref_node is not None: + ref_node.insert_after(n.extract()) + else: + p_element.insert(0, n.extract()) + + if after: + anchor_node_last = p_element.find('ac:image') + if anchor_node_last: + anchor_node_last.insert_after(NavigableString(after)) + else: + p_element.append(NavigableString(after)) + return + else: + remaining -= text_len + elif isinstance(child, Tag): + if child.name == 'ac:image': + pass + else: + child_text = extract_plain_text(str(child)) + if remaining <= len(child_text): + for anchor_node in reversed(anchor_nodes): + cloned = BeautifulSoup(str(anchor_node), 'html.parser') + for n in list(cloned.children): + child.insert_after(n.extract()) + return + remaining -= len(child_text) + + # offset이 모든 텍스트를 초과하면 끝에 추가 + for anchor_node in anchor_nodes: + cloned = BeautifulSoup(str(anchor_node), 'html.parser') + for n in list(cloned.children): + p_element.append(n.extract()) + + +def reconstruct_inline_anchor_fragment( + old_fragment: str, + anchors: list, + new_fragment: str, +) -> str: + """new_fragment에 원본 anchors를 offset 매핑하여 재삽입한다. + + Args: + old_fragment: 원본 XHTML fragment (anchor 포함) + anchors: _build_anchor_entries()로 추출된 anchor entry 목록 + new_fragment: emit_block()으로 생성된 새 XHTML fragment (anchor 없음) + + Returns: + anchor가 재삽입된 new_fragment + """ + if not anchors: + return new_fragment + + old_plain = extract_plain_text(old_fragment) + new_plain = extract_plain_text(new_fragment) + + soup = BeautifulSoup(new_fragment, 'html.parser') + p = soup.find('p') + if p is None: + return new_fragment + + # offset을 역순으로 처리하여 앞쪽 삽입이 뒤쪽 offset에 영향 미치지 않게 함 + for anchor in reversed(anchors): + new_offset = map_anchor_offset(old_plain, new_plain, anchor['offset']) + insert_anchor_at_offset(p, new_offset, anchor['raw_xhtml']) + + return str(soup) diff --git a/confluence-mdx/tests/test_reverse_sync_reconstruct_paragraph.py b/confluence-mdx/tests/test_reverse_sync_reconstruct_paragraph.py index 0dc992d63..e769d01dc 100644 --- a/confluence-mdx/tests/test_reverse_sync_reconstruct_paragraph.py +++ b/confluence-mdx/tests/test_reverse_sync_reconstruct_paragraph.py @@ -53,3 +53,101 @@ def test_no_paragraph_returns_empty(self): fragment = '

    Just a heading

    ' anchors = _build_anchor_entries(fragment) assert anchors == [] + + +class TestMapAnchorOffset: + def test_no_change_preserves_offset(self): + """텍스트 변경 없으면 offset 그대로 유지된다.""" + from reverse_sync.reconstructors import map_anchor_offset + result = map_anchor_offset('hello world', 'hello world', 5) + assert result == 5 + + def test_insert_before_anchor_shifts_offset(self): + """anchor 앞에 텍스트 삽입 시 offset이 증가한다.""" + from reverse_sync.reconstructors import map_anchor_offset + # old: "AB", anchor at 1 (between A and B) + # new: "XAB" (X inserted before A) + result = map_anchor_offset('AB', 'XAB', 1) + # After inserting X before A, old offset 1 (end of A) → new offset 2 (end of A in XAB) + assert result == 2 + + def test_delete_before_anchor_shifts_offset(self): + """anchor 앞 텍스트 삭제 시 offset이 감소한다.""" + from reverse_sync.reconstructors import map_anchor_offset + # old: "XAB", anchor at 2 (end of XA) + # new: "AB" (X deleted) + result = map_anchor_offset('XAB', 'AB', 2) + # anchor was after "XA", now after "A" → offset 1 + assert result == 1 + + def test_replace_before_anchor(self): + """anchor 앞 텍스트 교체 시 offset이 새 길이로 조정된다.""" + from reverse_sync.reconstructors import map_anchor_offset + # old: "hello world", anchor at 5 (after "hello") + # new: "hi world" (hello→hi) + result = map_anchor_offset('hello world', 'hi world', 5) + # "hello" replaced by "hi" → anchor moves from 5 to 2 + assert result == 2 + + def test_offset_at_end_stays_at_end(self): + """anchor가 텍스트 끝이면 새 끝으로 이동한다.""" + from reverse_sync.reconstructors import map_anchor_offset + result = map_anchor_offset('hello', 'world2', 5) + assert result == 6 + + +class TestInsertAnchorAtOffset: + def test_insert_at_beginning(self): + """offset=0이면 첫 텍스트 노드 앞에 삽입된다.""" + from reverse_sync.reconstructors import insert_anchor_at_offset + from bs4 import BeautifulSoup + soup = BeautifulSoup('

    hello

    ', 'html.parser') + p = soup.find('p') + anchor_html = '' + insert_anchor_at_offset(p, 0, anchor_html) + result = str(soup) + assert result.index('ac:image') < result.index('hello') + + def test_insert_in_middle(self): + """offset이 중간이면 해당 텍스트 위치에 삽입된다.""" + from reverse_sync.reconstructors import insert_anchor_at_offset + from bs4 import BeautifulSoup + soup = BeautifulSoup('

    helloworld

    ', 'html.parser') + p = soup.find('p') + anchor_html = '' + insert_anchor_at_offset(p, 5, anchor_html) + result = str(p) + # hello[image]world 순서여야 함 + idx_hello = result.index('hello') + idx_image = result.index('ac:image') + idx_world = result.index('world') + assert idx_hello < idx_image < idx_world + + def test_insert_at_end(self): + """offset이 텍스트 끝이면 마지막 텍스트 뒤에 삽입된다.""" + from reverse_sync.reconstructors import insert_anchor_at_offset + from bs4 import BeautifulSoup + soup = BeautifulSoup('

    hello

    ', 'html.parser') + p = soup.find('p') + anchor_html = '' + insert_anchor_at_offset(p, 5, anchor_html) + result = str(p) + assert result.index('hello') < result.index('ac:image') + + +class TestReconstructInlineAnchorFragment: + def test_basic_text_change_preserves_image(self): + """텍스트 변경 시 ac:image가 보존된다.""" + from reverse_sync.reconstructors import reconstruct_inline_anchor_fragment + old_fragment = ( + '

    Old text ' + '' + ' rest

    ' + ) + new_fragment = '

    New text rest

    ' # emitted from new MDX + anchors = [{'kind': 'image', 'offset': len('Old text '), 'raw_xhtml': ''}] + + result = reconstruct_inline_anchor_fragment(old_fragment, anchors, new_fragment) + assert 'ac:image' in result + assert 'New text' in result + assert 'rest' in result From 5be7ed07b0c57f9790cf80379b0d640f865cdc87 Mon Sep 17 00:00:00 2001 From: JK Date: Sun, 15 Mar 2026 15:35:50 +0900 Subject: [PATCH 05/13] =?UTF-8?q?confluence-mdx:=20Phase=203=20reconstruct?= =?UTF-8?q?ors=20=E2=80=94=20insert=5Fanchor=5Fat=5Foffset=20pivot=20track?= =?UTF-8?q?ing=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit find(string=) 검색 대신 직접 참조(pivot)를 유지하여 동일 텍스트가 여러 번 나타날 때 잘못된 노드를 찾는 문제와 find('ac:image')가 다른 이미지를 찾는 문제를 수정합니다. Co-Authored-By: Claude Sonnet 4.6 --- .../bin/reverse_sync/reconstructors.py | 46 +++++++++---------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/confluence-mdx/bin/reverse_sync/reconstructors.py b/confluence-mdx/bin/reverse_sync/reconstructors.py index 015d491ba..45627e0db 100644 --- a/confluence-mdx/bin/reverse_sync/reconstructors.py +++ b/confluence-mdx/bin/reverse_sync/reconstructors.py @@ -71,50 +71,48 @@ def insert_anchor_at_offset(p_element: Tag, offset: int, anchor_xhtml: str) -> N """ anchor_soup = BeautifulSoup(anchor_xhtml, 'html.parser') anchor_nodes = list(anchor_soup.children) + if not anchor_nodes: + return remaining = offset children = list(p_element.children) - for i, child in enumerate(children): + for child in children: if isinstance(child, NavigableString): text_len = len(str(child)) if remaining <= text_len: text = str(child) - before = text[:remaining] - after = text[remaining:] + before_text = text[:remaining] + after_text = text[remaining:] - # Replace original text node with the "before" part - child.replace_with(NavigableString(before)) + # 직접 참조를 유지하여 before_node 뒤에 순서대로 삽입 + before_node = NavigableString(before_text) + child.replace_with(before_node) - ref_node = p_element.find(string=before) if before else None - - for anchor_node in reversed(anchor_nodes): + pivot = before_node + for anchor_node in anchor_nodes: cloned = BeautifulSoup(str(anchor_node), 'html.parser') for n in list(cloned.children): - if ref_node is not None: - ref_node.insert_after(n.extract()) - else: - p_element.insert(0, n.extract()) - - if after: - anchor_node_last = p_element.find('ac:image') - if anchor_node_last: - anchor_node_last.insert_after(NavigableString(after)) - else: - p_element.append(NavigableString(after)) + extracted = n.extract() + pivot.insert_after(extracted) + pivot = extracted + + if after_text: + pivot.insert_after(NavigableString(after_text)) return else: remaining -= text_len elif isinstance(child, Tag): - if child.name == 'ac:image': - pass - else: + if child.name != 'ac:image': child_text = extract_plain_text(str(child)) if remaining <= len(child_text): - for anchor_node in reversed(anchor_nodes): + pivot = child + for anchor_node in anchor_nodes: cloned = BeautifulSoup(str(anchor_node), 'html.parser') for n in list(cloned.children): - child.insert_after(n.extract()) + extracted = n.extract() + pivot.insert_after(extracted) + pivot = extracted return remaining -= len(child_text) From 03fd7aa0929f820647ac39f9e69b46b760657975 Mon Sep 17 00:00:00 2001 From: JK Date: Sun, 15 Mar 2026 15:40:33 +0900 Subject: [PATCH 06/13] =?UTF-8?q?confluence-mdx:=20Phase=203=20reconstruct?= =?UTF-8?q?ors=20=E2=80=94=20=EB=AF=B8=EC=82=AC=EC=9A=A9=20typing.List=20i?= =?UTF-8?q?mport=20=EC=A0=9C=EA=B1=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- confluence-mdx/bin/reverse_sync/reconstructors.py | 1 - 1 file changed, 1 deletion(-) diff --git a/confluence-mdx/bin/reverse_sync/reconstructors.py b/confluence-mdx/bin/reverse_sync/reconstructors.py index 45627e0db..ac0af5aa9 100644 --- a/confluence-mdx/bin/reverse_sync/reconstructors.py +++ b/confluence-mdx/bin/reverse_sync/reconstructors.py @@ -6,7 +6,6 @@ from __future__ import annotations import difflib -from typing import List from bs4 import BeautifulSoup, NavigableString, Tag From 6a9b78972f7fc7fb57752dd21ff33347687830e3 Mon Sep 17 00:00:00 2001 From: JK Date: Sun, 15 Mar 2026 15:44:28 +0900 Subject: [PATCH 07/13] =?UTF-8?q?confluence-mdx:=20Phase=203=20patch=5Fbui?= =?UTF-8?q?lder=20=E2=80=94=20inline-anchor=20reconstruction=20=EA=B2=BD?= =?UTF-8?q?=EB=A1=9C=20=EC=97=B0=EB=8F=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- .../bin/reverse_sync/patch_builder.py | 32 ++++++++++ ...test_reverse_sync_reconstruct_paragraph.py | 58 +++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/confluence-mdx/bin/reverse_sync/patch_builder.py b/confluence-mdx/bin/reverse_sync/patch_builder.py index b3ba7ffec..61ef99d0d 100644 --- a/confluence-mdx/bin/reverse_sync/patch_builder.py +++ b/confluence-mdx/bin/reverse_sync/patch_builder.py @@ -18,6 +18,7 @@ ) from reverse_sync.lost_info_patcher import apply_lost_info, distribute_lost_info_to_mappings from reverse_sync.mdx_to_xhtml_inline import mdx_block_to_xhtml_element, mdx_block_to_inner_xhtml +from reverse_sync.reconstructors import reconstruct_inline_anchor_fragment from reverse_sync.list_patcher import ( build_list_item_patches, ) @@ -371,6 +372,37 @@ def _mark_used(block_id: str, m: BlockMapping): ) continue + # Phase 3: sidecar anchor가 있는 paragraph → inline-anchor reconstruction + # anchor entry에 offset/raw_xhtml이 없으면 text-transfer 경로로 폴백 + _anchors = ( + sidecar_block.reconstruction.get('anchors', []) + if sidecar_block is not None and sidecar_block.reconstruction is not None + else [] + ) + _valid_anchors = [ + a for a in _anchors if 'offset' in a and 'raw_xhtml' in a + ] + if (sidecar_block is not None + and sidecar_block.reconstruction is not None + and sidecar_block.reconstruction.get('kind') == 'paragraph' + and _valid_anchors): + new_element = _emit_replacement_fragment(change.new_block) + reconstructed = reconstruct_inline_anchor_fragment( + mapping.xhtml_text, + _valid_anchors, + new_element, + ) + block_lost = (mapping_lost_info or {}).get(mapping.block_id, {}) + if block_lost: + from reverse_sync.lost_info_patcher import apply_lost_info + reconstructed = apply_lost_info(reconstructed, block_lost) + patches.append({ + 'action': 'replace_fragment', + 'xhtml_xpath': mapping.xhtml_xpath, + 'new_element_xhtml': reconstructed, + }) + continue + # 재생성 시 소실되는 XHTML 요소 포함 시 텍스트 전이로 폴백 if ('Original text ' + '' + ' more text

    ' + ) + original_mdx = '---\ntitle: test\n---\n\n# Test\n\nOriginal text more text\n' + improved_mdx = '---\ntitle: test\n---\n\n# Test\n\nChanged text more text\n' + + orig_blocks = list(parse_mdx_blocks(original_mdx)) + imp_blocks = list(parse_mdx_blocks(improved_mdx)) + changes, alignment = diff_blocks(orig_blocks, imp_blocks) + + mappings = record_mapping(xhtml) + roundtrip_sidecar = build_sidecar(xhtml, original_mdx) + sidecar_yaml = generate_sidecar_mapping(xhtml, original_mdx) + sidecar_data = yaml.safe_load(sidecar_yaml) or {} + sidecar_entries = [ + SidecarEntry( + xhtml_xpath=item['xhtml_xpath'], + xhtml_type=item.get('xhtml_type', ''), + mdx_blocks=item.get('mdx_blocks', []), + mdx_line_start=item.get('mdx_line_start', 0), + mdx_line_end=item.get('mdx_line_end', 0), + ) + for item in sidecar_data.get('mappings', []) + ] + mdx_to_sidecar = build_mdx_to_sidecar_index(sidecar_entries) + xpath_to_mapping = build_xpath_to_mapping(mappings) + + patches = build_patches( + changes, orig_blocks, imp_blocks, + mappings, mdx_to_sidecar, xpath_to_mapping, + alignment, roundtrip_sidecar=roundtrip_sidecar, + ) + result = patch_xhtml(xhtml, patches) + + assert 'ac:image' in result + assert 'ri:attachment' in result + assert 'Changed text' in result + assert 'Original text' not in result From 52600f3c8da497d78d6221b0ba9e1990acf1ac96 Mon Sep 17 00:00:00 2001 From: JK Date: Sun, 15 Mar 2026 15:44:52 +0900 Subject: [PATCH 08/13] =?UTF-8?q?confluence-mdx:=20Phase=203=20=EC=84=A4?= =?UTF-8?q?=EA=B3=84=20=EB=AC=B8=EC=84=9C=20=EC=83=81=ED=83=9C=20=EA=B0=B1?= =?UTF-8?q?=EC=8B=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- ...3-13-reverse-sync-reconstruction-design.md | 25 ++++++------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/confluence-mdx/docs/plans/2026-03-13-reverse-sync-reconstruction-design.md b/confluence-mdx/docs/plans/2026-03-13-reverse-sync-reconstruction-design.md index 42c0d3306..ecc59cec8 100644 --- a/confluence-mdx/docs/plans/2026-03-13-reverse-sync-reconstruction-design.md +++ b/confluence-mdx/docs/plans/2026-03-13-reverse-sync-reconstruction-design.md @@ -387,24 +387,15 @@ PR #913 시점에 제안된 방향 중, 2026-03-15 기준 `main`에서도 그대 ### Phase 3. inline-anchor 및 list 재구성 -상태: 미완료 - -구현 항목: - -- paragraph/list item anchor metadata builder -- old/new plain-text offset mapping helper -- raw anchor DOM insertion helper -- nested list tree 기반 reconstruction - -우선 대상 fixture: - -- `tests/testcases` 내 list/image 혼합 케이스 -- `tests/reverse-sync/544376004` +상태: 완료, `main` 반영 예정 -게이트: - -- inline image가 있는 paragraph/list item 재구성 green -- duplicate hash 후보에서도 identity가 안정적으로 동작 +완료 기준: +- paragraph anchor metadata builder 구현 (`sidecar.py`) +- anchor offset mapping helper 구현 (`reconstructors.py`) +- raw anchor DOM insertion helper 구현 (`reconstructors.py`) +- inline-anchor paragraph reconstruction pipeline 연동 (`patch_builder.py`) +- golden test 확장: 10개 inline-anchor 케이스 모두 green +- 파서 불일치 수정 (test에서 `mdx_to_storage.parser` 사용) ### Phase 4. container 재구성 From 9598181c453ca07cd003cfbb261d25bbb9433bdb Mon Sep 17 00:00:00 2001 From: JK Date: Sun, 15 Mar 2026 15:47:27 +0900 Subject: [PATCH 09/13] =?UTF-8?q?confluence-mdx:=20Phase=203=20patch=5Fbui?= =?UTF-8?q?lder=20=E2=80=94=20=EC=A4=91=EB=B3=B5=20import=20=EC=A0=9C?= =?UTF-8?q?=EA=B1=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- confluence-mdx/bin/reverse_sync/patch_builder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/confluence-mdx/bin/reverse_sync/patch_builder.py b/confluence-mdx/bin/reverse_sync/patch_builder.py index 61ef99d0d..360945b18 100644 --- a/confluence-mdx/bin/reverse_sync/patch_builder.py +++ b/confluence-mdx/bin/reverse_sync/patch_builder.py @@ -394,7 +394,6 @@ def _mark_used(block_id: str, m: BlockMapping): ) block_lost = (mapping_lost_info or {}).get(mapping.block_id, {}) if block_lost: - from reverse_sync.lost_info_patcher import apply_lost_info reconstructed = apply_lost_info(reconstructed, block_lost) patches.append({ 'action': 'replace_fragment', From 58753b079b14c581e681b9bea1e026559b9603b0 Mon Sep 17 00:00:00 2001 From: JK Date: Sun, 15 Mar 2026 18:08:23 +0900 Subject: [PATCH 10/13] =?UTF-8?q?confluence-mdx:=20patch=5Fbuilder=20?= =?UTF-8?q?=E2=80=94=20sidecar=20block=20identity=20fallback=20=EC=B6=94?= =?UTF-8?q?=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xpath 조회 후 hash+line range 검증, 실패 시 find_sidecar_block_by_identity로 재탐색합니다. cross-type 오매칭 방지를 위해 xpath 태그 타입(p, ul, table 등)이 일치하는 경우에만 identity match를 반환합니다. Co-Authored-By: Claude Sonnet 4.6 --- .../bin/reverse_sync/patch_builder.py | 57 ++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/confluence-mdx/bin/reverse_sync/patch_builder.py b/confluence-mdx/bin/reverse_sync/patch_builder.py index 360945b18..8e786a2ef 100644 --- a/confluence-mdx/bin/reverse_sync/patch_builder.py +++ b/confluence-mdx/bin/reverse_sync/patch_builder.py @@ -14,6 +14,8 @@ RoundtripSidecar, SidecarBlock, find_mapping_by_sidecar, + find_sidecar_block_by_identity, + sha256_text, SidecarEntry, ) from reverse_sync.lost_info_patcher import apply_lost_info, distribute_lost_info_to_mappings @@ -105,6 +107,57 @@ def _build_replace_fragment_patch( } +def _find_roundtrip_sidecar_block( + change: BlockChange, + mapping: Optional[BlockMapping], + roundtrip_sidecar: Optional[RoundtripSidecar], + xpath_to_sidecar_block: Dict[str, SidecarBlock], +) -> Optional[SidecarBlock]: + """xpath → identity hash 순으로 roundtrip sidecar block을 탐색한다. + + 1. xpath로 빠른 조회 + 2. mdx_content_hash + mdx_line_range로 검증 → 일치하면 확정 반환 + 3. 검증 실패 시 find_sidecar_block_by_identity로 더 정확한 블록 탐색 + 4. identity도 없으면 xpath 결과를 fallback으로 반환 + """ + if roundtrip_sidecar is None: + return None + + identity_block = change.old_block or change.new_block + + # xpath 조회 + xpath_match: Optional[SidecarBlock] = None + if mapping is not None: + xpath_match = xpath_to_sidecar_block.get(mapping.xhtml_xpath) + + # hash + line range 검증 → 확정 일치 + if xpath_match is not None and identity_block is not None: + expected_hash = sha256_text(identity_block.content) if identity_block.content else "" + expected_range = (identity_block.line_start, identity_block.line_end) + if ( + xpath_match.mdx_content_hash == expected_hash + and tuple(xpath_match.mdx_line_range) == expected_range + ): + return xpath_match + + # identity fallback: mapping.yaml이 어긋난 경우 hash 기반으로 재탐색 + # xpath 태그 타입(p, ul, ol, table 등)이 일치하는 경우에만 반환하여 cross-type 오매칭 방지 + if identity_block is not None and identity_block.content: + identity_match = find_sidecar_block_by_identity( + roundtrip_sidecar.blocks, + sha256_text(identity_block.content), + (identity_block.line_start, identity_block.line_end), + ) + if identity_match is not None: + mapping_tag = mapping.xhtml_xpath.split('[')[0] if mapping else '' + identity_tag = identity_match.xhtml_xpath.split('[')[0] if identity_match.xhtml_xpath else '' + if mapping_tag == identity_tag: + return identity_match + + # xpath 결과를 마지막 fallback으로 반환 (hash 불일치라도 없는 것보다 나음) + return xpath_match + + def _flush_containing_changes( containing_changes: dict, used_ids: 'set | None' = None, @@ -351,7 +404,9 @@ def _mark_used(block_id: str, m: BlockMapping): and collapse_ws(new_plain) == collapse_ws(mapping.xhtml_plain_text)): continue - sidecar_block = xpath_to_sidecar_block.get(mapping.xhtml_xpath) + sidecar_block = _find_roundtrip_sidecar_block( + change, mapping, roundtrip_sidecar, xpath_to_sidecar_block, + ) if _can_replace_table_fragment(change, mapping, roundtrip_sidecar): patches.append( _build_replace_fragment_patch( From 278c53df9f1d2cc0ebf019e7dd0d9a17f8fadec6 Mon Sep 17 00:00:00 2001 From: JK Date: Sun, 15 Mar 2026 18:10:53 +0900 Subject: [PATCH 11/13] =?UTF-8?q?confluence-mdx:=20reconstructors=20+=20pa?= =?UTF-8?q?tch=5Fbuilder=20=EC=B6=94=EC=83=81=ED=99=94=20=EB=A0=88?= =?UTF-8?q?=EC=9D=B4=EC=96=B4=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sidecar_block_requires_reconstruction / reconstruct_fragment_with_sidecar를 reconstructors.py에 추가하고, _build_replace_fragment_patch에 sidecar_block 파라미터를 연동합니다. patch_builder의 인라인 Phase 3 블록을 제거합니다. Co-Authored-By: Claude Sonnet 4.6 --- .../bin/reverse_sync/patch_builder.py | 52 +++++++------------ .../bin/reverse_sync/reconstructors.py | 40 ++++++++++++++ 2 files changed, 60 insertions(+), 32 deletions(-) diff --git a/confluence-mdx/bin/reverse_sync/patch_builder.py b/confluence-mdx/bin/reverse_sync/patch_builder.py index 8e786a2ef..79261545f 100644 --- a/confluence-mdx/bin/reverse_sync/patch_builder.py +++ b/confluence-mdx/bin/reverse_sync/patch_builder.py @@ -20,7 +20,10 @@ ) from reverse_sync.lost_info_patcher import apply_lost_info, distribute_lost_info_to_mappings from reverse_sync.mdx_to_xhtml_inline import mdx_block_to_xhtml_element, mdx_block_to_inner_xhtml -from reverse_sync.reconstructors import reconstruct_inline_anchor_fragment +from reverse_sync.reconstructors import ( + sidecar_block_requires_reconstruction, + reconstruct_fragment_with_sidecar, +) from reverse_sync.list_patcher import ( build_list_item_patches, ) @@ -93,10 +96,13 @@ def _emit_replacement_fragment(block: MdxBlock) -> str: def _build_replace_fragment_patch( mapping: BlockMapping, new_block: MdxBlock, + sidecar_block: Optional[SidecarBlock] = None, mapping_lost_info: Optional[dict] = None, ) -> Dict[str, str]: """whole-fragment replacement patch를 생성한다.""" new_element = _emit_replacement_fragment(new_block) + if sidecar_block_requires_reconstruction(sidecar_block): + new_element = reconstruct_fragment_with_sidecar(new_element, sidecar_block) block_lost = (mapping_lost_info or {}).get(mapping.block_id, {}) if block_lost: new_element = apply_lost_info(new_element, block_lost) @@ -306,7 +312,7 @@ def _mark_used(block_id: str, m: BlockMapping): _build_replace_fragment_patch( mapping, add_change.new_block, - mapping_lost_info, + mapping_lost_info=mapping_lost_info, ) ) _paired_indices.add(idx) @@ -375,7 +381,7 @@ def _mark_used(block_id: str, m: BlockMapping): _build_replace_fragment_patch( mapping, change.new_block, - mapping_lost_info, + mapping_lost_info=mapping_lost_info, ) ) else: @@ -412,7 +418,7 @@ def _mark_used(block_id: str, m: BlockMapping): _build_replace_fragment_patch( mapping, change.new_block, - mapping_lost_info, + mapping_lost_info=mapping_lost_info, ) ) continue @@ -422,39 +428,21 @@ def _mark_used(block_id: str, m: BlockMapping): _build_replace_fragment_patch( mapping, change.new_block, - mapping_lost_info, + sidecar_block=sidecar_block, + mapping_lost_info=mapping_lost_info, ) ) continue - # Phase 3: sidecar anchor가 있는 paragraph → inline-anchor reconstruction - # anchor entry에 offset/raw_xhtml이 없으면 text-transfer 경로로 폴백 - _anchors = ( - sidecar_block.reconstruction.get('anchors', []) - if sidecar_block is not None and sidecar_block.reconstruction is not None - else [] - ) - _valid_anchors = [ - a for a in _anchors if 'offset' in a and 'raw_xhtml' in a - ] - if (sidecar_block is not None - and sidecar_block.reconstruction is not None - and sidecar_block.reconstruction.get('kind') == 'paragraph' - and _valid_anchors): - new_element = _emit_replacement_fragment(change.new_block) - reconstructed = reconstruct_inline_anchor_fragment( - mapping.xhtml_text, - _valid_anchors, - new_element, + if sidecar_block_requires_reconstruction(sidecar_block): + patches.append( + _build_replace_fragment_patch( + mapping, + change.new_block, + sidecar_block=sidecar_block, + mapping_lost_info=mapping_lost_info, + ) ) - block_lost = (mapping_lost_info or {}).get(mapping.block_id, {}) - if block_lost: - reconstructed = apply_lost_info(reconstructed, block_lost) - patches.append({ - 'action': 'replace_fragment', - 'xhtml_xpath': mapping.xhtml_xpath, - 'new_element_xhtml': reconstructed, - }) continue # 재생성 시 소실되는 XHTML 요소 포함 시 텍스트 전이로 폴백 diff --git a/confluence-mdx/bin/reverse_sync/reconstructors.py b/confluence-mdx/bin/reverse_sync/reconstructors.py index ac0af5aa9..fa54591dc 100644 --- a/confluence-mdx/bin/reverse_sync/reconstructors.py +++ b/confluence-mdx/bin/reverse_sync/reconstructors.py @@ -6,11 +6,15 @@ from __future__ import annotations import difflib +from typing import TYPE_CHECKING, Optional from bs4 import BeautifulSoup, NavigableString, Tag from reverse_sync.xhtml_normalizer import extract_plain_text +if TYPE_CHECKING: + from reverse_sync.sidecar import SidecarBlock + def map_anchor_offset(old_plain: str, new_plain: str, old_offset: int) -> int: """old_plain에서의 anchor offset을 new_plain 기준 offset으로 변환한다. @@ -122,6 +126,42 @@ def insert_anchor_at_offset(p_element: Tag, offset: int, anchor_xhtml: str) -> N p_element.append(n.extract()) +def sidecar_block_requires_reconstruction( + sidecar_block: Optional['SidecarBlock'], +) -> bool: + """sidecar block에 Phase 3 재구성이 필요한 metadata가 있으면 True를 반환한다. + + offset + raw_xhtml이 모두 있는 유효한 anchor가 하나 이상 있어야 True를 반환한다. + """ + if sidecar_block is None or sidecar_block.reconstruction is None: + return False + recon = sidecar_block.reconstruction + if recon.get('kind') == 'paragraph': + return any( + 'offset' in a and 'raw_xhtml' in a + for a in recon.get('anchors', []) + ) + return False + + +def reconstruct_fragment_with_sidecar( + new_fragment: str, + sidecar_block: Optional['SidecarBlock'], +) -> str: + """new_fragment에 sidecar block의 anchor metadata를 재주입한다.""" + if sidecar_block is None or sidecar_block.reconstruction is None: + return new_fragment + recon = sidecar_block.reconstruction + kind = recon.get('kind') + if kind == 'paragraph': + anchors = recon.get('anchors', []) + valid_anchors = [a for a in anchors if 'offset' in a and 'raw_xhtml' in a] + if valid_anchors: + old_plain = recon.get('old_plain_text', '') + return reconstruct_inline_anchor_fragment(old_plain, valid_anchors, new_fragment) + return new_fragment + + def reconstruct_inline_anchor_fragment( old_fragment: str, anchors: list, From c74937d9433a64997e843692b9e2c47ca6682f35 Mon Sep 17 00:00:00 2001 From: JK Date: Sun, 15 Mar 2026 18:12:44 +0900 Subject: [PATCH 12/13] =?UTF-8?q?confluence-mdx:=20Phase=203=20list=20anch?= =?UTF-8?q?or=20=EC=9E=AC=EA=B5=AC=EC=84=B1=20=EA=B5=AC=ED=98=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sidecar.py에 _build_list_anchor_entries/_walk_list/_extract_anchors_from_p를 추가하여 list item anchor metadata를 기록합니다. reconstructors.py에 _rebuild_list_fragment/_find_list_item_by_path를 추가하고 sidecar_block_requires_reconstruction/reconstruct_fragment_with_sidecar에 list 분기를 연동합니다. patch_builder.py list strategy 경로에 sidecar reconstruction을 연결합니다. Co-Authored-By: Claude Sonnet 4.6 --- .../bin/reverse_sync/patch_builder.py | 16 ++++++ .../bin/reverse_sync/reconstructors.py | 56 ++++++++++++++++++ confluence-mdx/bin/reverse_sync/sidecar.py | 57 ++++++++++++++++++- ...test_reverse_sync_reconstruct_paragraph.py | 57 +++++++++++++++++++ 4 files changed, 185 insertions(+), 1 deletion(-) diff --git a/confluence-mdx/bin/reverse_sync/patch_builder.py b/confluence-mdx/bin/reverse_sync/patch_builder.py index 79261545f..37ffceccf 100644 --- a/confluence-mdx/bin/reverse_sync/patch_builder.py +++ b/confluence-mdx/bin/reverse_sync/patch_builder.py @@ -367,6 +367,22 @@ def _mark_used(block_id: str, m: BlockMapping): continue if strategy == 'list': + list_sidecar = _find_roundtrip_sidecar_block( + change, mapping, roundtrip_sidecar, xpath_to_sidecar_block, + ) + if (mapping is not None + and not _contains_preserved_anchor_markup(mapping.xhtml_text) + and sidecar_block_requires_reconstruction(list_sidecar)): + _mark_used(mapping.block_id, mapping) + patches.append( + _build_replace_fragment_patch( + mapping, + change.new_block, + sidecar_block=list_sidecar, + mapping_lost_info=mapping_lost_info, + ) + ) + continue patches.extend( build_list_item_patches( change, mappings, used_ids, diff --git a/confluence-mdx/bin/reverse_sync/reconstructors.py b/confluence-mdx/bin/reverse_sync/reconstructors.py index fa54591dc..d24ed8a49 100644 --- a/confluence-mdx/bin/reverse_sync/reconstructors.py +++ b/confluence-mdx/bin/reverse_sync/reconstructors.py @@ -126,6 +126,55 @@ def insert_anchor_at_offset(p_element: Tag, offset: int, anchor_xhtml: str) -> N p_element.append(n.extract()) +def _find_list_item_by_path(root: Tag, path: list) -> Optional[Tag]: + """path 인덱스 경로를 따라 li 요소를 탐색한다.""" + current_list: Optional[Tag] = root + current_li: Optional[Tag] = None + for index in path: + if current_list is None: + return None + items = [c for c in current_list.children if isinstance(c, Tag) and c.name == 'li'] + if index < 0 or index >= len(items): + return None + current_li = items[index] + current_list = next( + (c for c in current_li.children if isinstance(c, Tag) and c.name in ('ul', 'ol')), + None, + ) + return current_li + + +def _find_direct_list_item_paragraph(li: Tag) -> Tag: + """li의 직접 자식 p 요소를 반환한다. 없으면 li 자체를 반환.""" + for child in li.children: + if isinstance(child, Tag) and child.name == 'p': + return child + return li + + +def _rebuild_list_fragment(new_fragment: str, recon: dict) -> str: + """list fragment에 sidecar anchor entries를 경로 기반으로 재주입한다.""" + soup = BeautifulSoup(new_fragment, 'html.parser') + root = soup.find(['ul', 'ol']) + if root is None: + return new_fragment + + old_plain = recon.get('old_plain_text', '') + for entry in recon.get('items', []): + if not entry.get('raw_xhtml') or 'offset' not in entry: + continue + path = entry.get('path', []) + li = _find_list_item_by_path(root, path) + if li is None: + continue + p = _find_direct_list_item_paragraph(li) + new_p_plain = extract_plain_text(str(p)) + new_offset = map_anchor_offset(old_plain, new_p_plain, entry['offset']) + insert_anchor_at_offset(p, new_offset, entry['raw_xhtml']) + + return str(soup) + + def sidecar_block_requires_reconstruction( sidecar_block: Optional['SidecarBlock'], ) -> bool: @@ -141,6 +190,11 @@ def sidecar_block_requires_reconstruction( 'offset' in a and 'raw_xhtml' in a for a in recon.get('anchors', []) ) + if recon.get('kind') == 'list': + return any( + 'offset' in item and 'raw_xhtml' in item + for item in recon.get('items', []) + ) return False @@ -159,6 +213,8 @@ def reconstruct_fragment_with_sidecar( if valid_anchors: old_plain = recon.get('old_plain_text', '') return reconstruct_inline_anchor_fragment(old_plain, valid_anchors, new_fragment) + if kind == 'list': + return _rebuild_list_fragment(new_fragment, recon) return new_fragment diff --git a/confluence-mdx/bin/reverse_sync/sidecar.py b/confluence-mdx/bin/reverse_sync/sidecar.py index 366148b50..afa21d378 100644 --- a/confluence-mdx/bin/reverse_sync/sidecar.py +++ b/confluence-mdx/bin/reverse_sync/sidecar.py @@ -298,6 +298,61 @@ def _build_anchor_entries(fragment: str) -> list: return anchors +def _extract_anchors_from_p(p_el) -> list: + """p 요소에서 ac:image anchor entry (offset, raw_xhtml) 목록을 추출한다.""" + from bs4 import NavigableString, Tag + anchors = [] + offset = 0 + for child in p_el.children: + if isinstance(child, NavigableString): + offset += len(str(child)) + elif isinstance(child, Tag): + if child.name == 'ac:image': + anchors.append({ + 'kind': 'image', + 'offset': offset, + 'raw_xhtml': str(child), + }) + else: + offset += len(extract_plain_text(str(child))) + return anchors + + +def _walk_list(list_el, path: list, entries: list) -> None: + """list 요소를 재귀 순회하며 anchor entry를 수집한다.""" + from bs4 import Tag + items = [c for c in list_el.children if isinstance(c, Tag) and c.name == 'li'] + for idx, li in enumerate(items): + current_path = path + [idx] + for child in li.children: + if not isinstance(child, Tag): + continue + if child.name == 'p': + for a in _extract_anchors_from_p(child): + entries.append({**a, 'path': current_path}) + elif child.name in ('ul', 'ol'): + _walk_list(child, current_path, entries) + + +def _build_list_anchor_entries(fragment: str) -> list: + """list fragment 내 li > p > ac:image를 path 기반 anchor entry로 추출한다. + + 각 entry: + kind: "image" + path: li 인덱스 경로 (중첩 지원, e.g. [0, 1]) + offset: p 내 plain text 기준 삽입 위치 + raw_xhtml: ac:image 원본 XHTML 문자열 + """ + from bs4 import BeautifulSoup + soup = BeautifulSoup(fragment, 'html.parser') + root = soup.find(['ul', 'ol']) + if root is None: + return [] + entries = [] + _walk_list(root, [], entries) + return entries + + def _build_reconstruction_metadata( fragment: str, mapping: BlockMapping | None, @@ -314,7 +369,7 @@ def _build_reconstruction_metadata( metadata["anchors"] = _build_anchor_entries(fragment) elif mapping.type == "list": metadata["ordered"] = mapping.xhtml_xpath.startswith("ol[") - metadata["items"] = [] + metadata["items"] = _build_list_anchor_entries(fragment) elif mapping.children: child_plain_texts = [ id_to_mapping[child_id].xhtml_plain_text.strip() diff --git a/confluence-mdx/tests/test_reverse_sync_reconstruct_paragraph.py b/confluence-mdx/tests/test_reverse_sync_reconstruct_paragraph.py index c482472a7..5809ff37c 100644 --- a/confluence-mdx/tests/test_reverse_sync_reconstruct_paragraph.py +++ b/confluence-mdx/tests/test_reverse_sync_reconstruct_paragraph.py @@ -209,3 +209,60 @@ def test_changed_paragraph_with_image_preserves_image(self): assert 'ri:attachment' in result assert 'Changed text' in result assert 'Original text' not in result + + +class TestBuildListAnchorEntries: + def test_list_with_inline_image(self): + from reverse_sync.sidecar import _build_list_anchor_entries + fragment = ( + '
      ' + '
    • item ' + '' + ' text

    • ' + '
    ' + ) + entries = _build_list_anchor_entries(fragment) + assert len(entries) == 1 + assert entries[0]['path'] == [0] + assert entries[0]['offset'] == len('item ') + assert 'a.png' in entries[0]['raw_xhtml'] + + def test_nested_list_with_image(self): + from reverse_sync.sidecar import _build_list_anchor_entries + fragment = ( + '
    • outer

      ' + '
      • ' + '' + 'nested

      ' + '
    ' + ) + entries = _build_list_anchor_entries(fragment) + assert len(entries) == 1 + assert entries[0]['path'] == [0, 0] + assert entries[0]['offset'] == 0 + + def test_list_without_images_returns_empty(self): + from reverse_sync.sidecar import _build_list_anchor_entries + fragment = '
    • plain text

    ' + entries = _build_list_anchor_entries(fragment) + assert entries == [] + + def test_multiple_items_with_images(self): + from reverse_sync.sidecar import _build_list_anchor_entries + fragment = ( + '
      ' + '
    • first ' + '' + '

    • ' + '
    • second

    • ' + '
    • ' + '' + ' after

    • ' + '
    ' + ) + entries = _build_list_anchor_entries(fragment) + assert len(entries) == 2 + assert entries[0]['path'] == [0] + assert entries[0]['offset'] == len('first ') + assert entries[1]['path'] == [2] + assert entries[1]['offset'] == 0 From ea886f7937732e07b07231f9ff329a6f17d085c6 Mon Sep 17 00:00:00 2001 From: JK Date: Sun, 15 Mar 2026 18:13:37 +0900 Subject: [PATCH 13/13] =?UTF-8?q?confluence-mdx:=20reconstructors=20?= =?UTF-8?q?=E2=80=94=20map=5Fanchor=5Foffset=20affinity=20=ED=8C=8C?= =?UTF-8?q?=EB=9D=BC=EB=AF=B8=ED=84=B0=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 경계(i1 == old_offset)에서 삽입된 텍스트의 포함 여부를 affinity='before'/'after'로 제어합니다. 기본값 'before'는 기존 동작을 유지합니다. Co-Authored-By: Claude Sonnet 4.6 --- confluence-mdx/bin/reverse_sync/reconstructors.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/confluence-mdx/bin/reverse_sync/reconstructors.py b/confluence-mdx/bin/reverse_sync/reconstructors.py index d24ed8a49..fe2f720ba 100644 --- a/confluence-mdx/bin/reverse_sync/reconstructors.py +++ b/confluence-mdx/bin/reverse_sync/reconstructors.py @@ -16,7 +16,12 @@ from reverse_sync.sidecar import SidecarBlock -def map_anchor_offset(old_plain: str, new_plain: str, old_offset: int) -> int: +def map_anchor_offset( + old_plain: str, + new_plain: str, + old_offset: int, + affinity: str = 'before', +) -> int: """old_plain에서의 anchor offset을 new_plain 기준 offset으로 변환한다. difflib SequenceMatcher opcode를 사용해 old 좌표계를 new 좌표계로 매핑한다. @@ -25,7 +30,7 @@ def map_anchor_offset(old_plain: str, new_plain: str, old_offset: int) -> int: anchor 앞쪽 텍스트에 적용된 변경만 offset에 반영한다: - equal: 그대로 유지 - replace: new 길이로 비례 매핑 - - insert (i1==i2 <= old_offset): new 텍스트 길이를 더함 + - insert at boundary: affinity='before'이면 삽입 포함, 'after'이면 제외 - delete: 삭제된 길이만큼 뺌 """ matcher = difflib.SequenceMatcher(None, old_plain, new_plain, autojunk=False) @@ -57,7 +62,10 @@ def map_anchor_offset(old_plain: str, new_plain: str, old_offset: int) -> int: consumed_old += old_take elif tag == 'insert': - if i1 <= old_offset: + # 경계(i1 == old_offset)에서 affinity로 배치 방향 결정: + # 'before': anchor가 삽입된 텍스트 뒤에 위치 (삽입 포함) + # 'after': anchor가 삽입된 텍스트 앞에 위치 (삽입 제외) + if i1 < old_offset or (i1 == old_offset and affinity == 'before'): new_offset += j2 - j1 if consumed_old < old_offset: