From 6e1f75c89aa8c2b88d40a4aeb6a75ce88f6965f6 Mon Sep 17 00:00:00 2001
From: JK <jk@chequer.io>
Date: Fri, 13 Mar 2026 21:32:22 +0900
Subject: [PATCH 1/4] =?UTF-8?q?confluence-mdx:=20Phase=200=20=EA=B3=B5?=
 =?UTF-8?q?=EC=9A=A9=20helper=20=EC=B6=94=EC=B6=9C=20=E2=80=94=20xhtml=5Fn?=
 =?UTF-8?q?ormalizer=20=EB=B0=8F=20list=20tree=20public=20API=EB=A5=BC=20?=
 =?UTF-8?q?=EC=B6=94=EA=B0=80=ED=95=A9=EB=8B=88=EB=8B=A4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- xhtml_normalizer.py: extract_plain_text, normalize_fragment, extract_fragment_by_xpath 구현
- emitter.py: ListNode, parse_list_tree public API 승격
- Level 0 helper tests 43개 추가 (전체 820 pass)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 confluence-mdx/bin/mdx_to_storage/__init__.py |   4 +-
 confluence-mdx/bin/mdx_to_storage/emitter.py  |  22 +-
 .../bin/reverse_sync/xhtml_normalizer.py      | 225 ++++++++++++
 .../tests/test_reverse_sync_list_tree.py      |  75 ++++
 .../test_reverse_sync_xhtml_normalizer.py     | 323 ++++++++++++++++++
 5 files changed, 646 insertions(+), 3 deletions(-)
 create mode 100644 confluence-mdx/bin/reverse_sync/xhtml_normalizer.py
 create mode 100644 confluence-mdx/tests/test_reverse_sync_list_tree.py
 create mode 100644 confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py

diff --git a/confluence-mdx/bin/mdx_to_storage/__init__.py b/confluence-mdx/bin/mdx_to_storage/__init__.py
index ee914b48d..2c4abde96 100644
--- a/confluence-mdx/bin/mdx_to_storage/__init__.py
+++ b/confluence-mdx/bin/mdx_to_storage/__init__.py
@@ -1,6 +1,6 @@
 """MDX -> Confluence Storage XHTML conversion package."""
 
-from .emitter import emit_block, emit_document
+from .emitter import ListNode, emit_block, emit_document, parse_list_tree
 from .inline import convert_heading_inline, convert_inline
 from .link_resolver import LinkResolver, PageEntry, load_pages_yaml
 from .parser import Block, parse_mdx, parse_mdx_blocks
@@ -8,12 +8,14 @@
 __all__ = [
     "Block",
     "LinkResolver",
+    "ListNode",
     "PageEntry",
     "convert_heading_inline",
     "convert_inline",
     "emit_block",
     "emit_document",
     "load_pages_yaml",
+    "parse_list_tree",
     "parse_mdx",
     "parse_mdx_blocks",
 ]
diff --git a/confluence-mdx/bin/mdx_to_storage/emitter.py b/confluence-mdx/bin/mdx_to_storage/emitter.py
index bbb099872..ba4340f70 100644
--- a/confluence-mdx/bin/mdx_to_storage/emitter.py
+++ b/confluence-mdx/bin/mdx_to_storage/emitter.py
@@ -37,12 +37,21 @@
 _IMG_ATTR_RE = re.compile(r'(\w[\w-]*)=(?:"([^"]*)"|\'([^\']*)\')')
 
 
-class _ListNode:
+class ListNode:
+    """List item node for tree-based list representation.
+
+    Public API for reconstruction pipeline.
+    """
+
     def __init__(self, ordered: bool, text: str, depth: int) -> None:
         self.ordered = ordered
         self.text = text
         self.depth = depth
-        self.children: list["_ListNode"] = []
+        self.children: list["ListNode"] = []
+
+
+# backward compat alias (internal)
+_ListNode = ListNode
 
 
 def emit_block(block: Block, context: Optional[dict] = None) -> str:
@@ -159,6 +168,15 @@ def _emit_single_depth_list(content: str, link_resolver: Optional[LinkResolver]
     return _render_list_nodes(roots, link_resolver=link_resolver)
 
 
+def parse_list_tree(content: str) -> list[ListNode]:
+    """MDX list content를 파싱하여 tree 구조의 ListNode 리스트를 반환한다.
+
+    Public API — reverse-sync reconstruction pipeline에서 사용한다.
+    """
+    items = _parse_list_items(content)
+    return _build_list_tree(items)
+
+
 def _parse_list_items(content: str) -> list[_ListNode]:
     items: list[_ListNode] = []
     for line in content.splitlines():
diff --git a/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py b/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py
new file mode 100644
index 000000000..6b7377180
--- /dev/null
+++ b/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py
@@ -0,0 +1,225 @@
+"""XHTML Normalizer — 공용 XHTML 정규화 및 plain-text 추출 유틸리티.
+
+reverse-sync 재구성 파이프라인의 공용 helper 모듈.
+BeautifulSoup 기반으로 fragment 비교, plain-text 추출, xpath 기반 fragment 추출을 제공한다.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Optional
+
+from bs4 import BeautifulSoup, NavigableString, Tag
+
+
+# ---------------------------------------------------------------------------
+# Ignored attributes — 비교 시 무시하는 Confluence 메타데이터 속성
+# ---------------------------------------------------------------------------
+
+IGNORED_ATTRIBUTES: frozenset[str] = frozenset({
+    "ac:macro-id",
+    "ac:local-id",
+    "local-id",
+    "ac:schema-version",
+    "ri:version-at-save",
+    "ac:original-height",
+    "ac:original-width",
+    "ac:custom-width",
+    "ac:alt",
+    "ac:layout",
+    "data-table-width",
+    "data-layout",
+    "data-highlight-colour",
+    "data-card-appearance",
+    "ac:breakout-mode",
+    "ac:breakout-width",
+    "ri:space-key",
+    "style",
+    "class",
+})
+
+
+# ---------------------------------------------------------------------------
+# Plain-text extraction
+# ---------------------------------------------------------------------------
+
+def extract_plain_text(fragment: str) -> str:
+    """XHTML fragment에서 plain text를 추출한다.
+
+    ac:plain-text-body(코드 블록 본문)는 제외하고,
+    ac:emoticon의 fallback 텍스트는 포함한다.
+
+    이 함수의 출력은 reconstruction에서 anchor offset 좌표의 기준이 된다.
+    """
+    soup = BeautifulSoup(fragment, "html.parser")
+    return _extract_text_from_element(soup)
+
+
+def _extract_text_from_element(element) -> str:
+    """재귀적으로 텍스트를 추출한다."""
+    parts: list[str] = []
+    for child in element.children:
+        if isinstance(child, NavigableString):
+            parts.append(str(child))
+        elif isinstance(child, Tag):
+            # 코드 블록 본문은 제외
+            if child.name == "ac:plain-text-body":
+                continue
+            # emoticon은 fallback 텍스트 사용
+            if child.name == "ac:emoticon":
+                fallback = child.get("ac:emoji-fallback", "")
+                if fallback:
+                    parts.append(fallback)
+                continue
+            # ac:image, ac:link 등 preservation unit은 텍스트 없음 (anchor로 처리)
+            if child.name in ("ac:image", "ac:link"):
+                continue
+            parts.append(_extract_text_from_element(child))
+    return "".join(parts)
+
+
+# ---------------------------------------------------------------------------
+# Fragment normalization
+# ---------------------------------------------------------------------------
+
+def normalize_fragment(fragment: str, strip_ignored_attrs: bool = True) -> str:
+    """XHTML fragment를 비교 가능한 정규화된 형태로 변환한다.
+
+    - layout section unwrap
+    - non-reversible macro 제거
+    - decoration unwrap
+    - ignored attribute 제거 (선택)
+    - BeautifulSoup prettify로 노드별 줄바꿈
+    """
+    soup = BeautifulSoup(fragment, "html.parser")
+    _strip_layout_sections(soup)
+    _strip_nonreversible_macros(soup)
+    _strip_decorations(soup)
+    if strip_ignored_attrs:
+        _strip_ignored_attributes(soup)
+    return soup.prettify(formatter="minimal").strip()
+
+
+def _strip_layout_sections(soup: BeautifulSoup) -> None:
+    for tag_name in ("ac:layout", "ac:layout-section", "ac:layout-cell"):
+        for tag in soup.find_all(tag_name):
+            tag.unwrap()
+
+
+def _strip_nonreversible_macros(soup: BeautifulSoup) -> None:
+    for macro in soup.find_all("ac:structured-macro"):
+        if macro.get("ac:name") in {"toc", "view-file"}:
+            macro.decompose()
+
+
+def _strip_decorations(soup: BeautifulSoup) -> None:
+    for tag_name in ("ac:adf-mark", "ac:inline-comment-marker"):
+        for tag in soup.find_all(tag_name):
+            tag.unwrap()
+    for colgroup in soup.find_all("colgroup"):
+        colgroup.decompose()
+
+
+def _strip_ignored_attributes(
+    soup: BeautifulSoup,
+    extra: Optional[frozenset[str]] = None,
+) -> None:
+    ignored = IGNORED_ATTRIBUTES | extra if extra else IGNORED_ATTRIBUTES
+    for tag in soup.find_all(True):
+        for attr in list(tag.attrs.keys()):
+            if attr in ignored:
+                del tag.attrs[attr]
+
+
+# ---------------------------------------------------------------------------
+# Fragment extraction by XPath
+# ---------------------------------------------------------------------------
+
+def extract_fragment_by_xpath(page_xhtml: str, xpath: str) -> Optional[str]:
+    """page XHTML에서 간이 XPath로 요소를 찾아 outerHTML을 반환한다.
+
+    xpath 형식: "p[1]", "ul[2]", "macro-info[1]/p[1]"
+    """
+    soup = BeautifulSoup(page_xhtml, "html.parser")
+    element = _find_element_by_xpath(soup, xpath)
+    if element is None:
+        return None
+    return str(element)
+
+
+def _find_element_by_xpath(soup, xpath: str):
+    """간이 XPath로 요소를 찾는다."""
+    parts = xpath.split("/")
+    if len(parts) == 1:
+        return _find_element_by_simple_xpath(soup, xpath)
+
+    current = _find_element_by_simple_xpath(soup, parts[0])
+    if current is None:
+        return None
+
+    for part in parts[1:]:
+        container = _find_content_container(current)
+        if container is None:
+            if ":" in (current.name or ""):
+                return None
+            container = current
+        current = _find_element_by_simple_xpath(container, part)
+        if current is None:
+            return None
+
+    return current
+
+
+_XPATH_PATTERN = re.compile(r"([a-z0-9:-]+)\[(\d+)\]")
+
+
+def _find_element_by_simple_xpath(parent, xpath: str):
+    """단일 XPath 파트로 요소를 찾는다."""
+    match = _XPATH_PATTERN.match(xpath)
+    if not match:
+        return None
+    tag_name = match.group(1)
+    index = int(match.group(2))  # 1-based
+
+    macro_name = None
+    if tag_name.startswith("macro-"):
+        macro_name = tag_name[len("macro-"):]
+
+    count = 0
+    for child in _iter_block_children(parent):
+        if not isinstance(child, Tag):
+            continue
+        if macro_name:
+            if child.name == "ac:structured-macro" and child.get("ac:name") == macro_name:
+                count += 1
+                if count == index:
+                    return child
+        elif child.name == tag_name:
+            count += 1
+            if count == index:
+                return child
+    return None
+
+
+def _iter_block_children(parent):
+    """블록 레벨 자식을 순회한다. ac:layout은 cell 내부로 진입한다."""
+    for child in parent.children:
+        if isinstance(child, Tag) and child.name == "ac:layout":
+            for section in child.find_all("ac:layout-section", recursive=False):
+                for cell in section.find_all("ac:layout-cell", recursive=False):
+                    yield from cell.children
+        else:
+            yield child
+
+
+def _find_content_container(parent: Tag):
+    """복합 xpath의 부모에서 콘텐츠 컨테이너를 찾는다."""
+    rich_body = parent.find("ac:rich-text-body")
+    if rich_body is not None:
+        return rich_body
+    node = parent.find("ac:adf-node")
+    if node is not None:
+        content = node.find("ac:adf-content")
+        if content is not None:
+            return content
+    return None
diff --git a/confluence-mdx/tests/test_reverse_sync_list_tree.py b/confluence-mdx/tests/test_reverse_sync_list_tree.py
new file mode 100644
index 000000000..25016f8b4
--- /dev/null
+++ b/confluence-mdx/tests/test_reverse_sync_list_tree.py
@@ -0,0 +1,75 @@
+"""Level 0 helper tests — parse_list_tree() public API 검증.
+
+Phase 0 게이트: list tree helper가 public API로 정상 동작하는지 확인한다.
+"""
+
+import pytest
+
+from mdx_to_storage import ListNode, parse_list_tree
+
+
+class TestParseListTree:
+    """parse_list_tree() public API 검증."""
+
+    def test_simple_unordered_list(self):
+        content = "- Item 1\n- Item 2\n- Item 3"
+        roots = parse_list_tree(content)
+        assert len(roots) == 3
+        assert all(not node.ordered for node in roots)
+        assert roots[0].text == "Item 1"
+        assert roots[1].text == "Item 2"
+        assert roots[2].text == "Item 3"
+
+    def test_simple_ordered_list(self):
+        content = "1. First\n2. Second\n3. Third"
+        roots = parse_list_tree(content)
+        assert len(roots) == 3
+        assert all(node.ordered for node in roots)
+        assert roots[0].text == "First"
+
+    def test_nested_list(self):
+        content = "- Parent\n    - Child 1\n    - Child 2"
+        roots = parse_list_tree(content)
+        assert len(roots) == 1
+        assert roots[0].text == "Parent"
+        assert len(roots[0].children) == 2
+        assert roots[0].children[0].text == "Child 1"
+        assert roots[0].children[1].text == "Child 2"
+
+    def test_mixed_ordered_unordered(self):
+        content = "- Unordered\n1. Ordered"
+        roots = parse_list_tree(content)
+        assert len(roots) == 2
+        assert not roots[0].ordered
+        assert roots[1].ordered
+
+    def test_deeply_nested(self):
+        content = "- L0\n    - L1\n        - L2"
+        roots = parse_list_tree(content)
+        assert len(roots) == 1
+        assert len(roots[0].children) == 1
+        assert len(roots[0].children[0].children) == 1
+        assert roots[0].children[0].children[0].text == "L2"
+
+    def test_continuation_line(self):
+        content = "- Item with\n  continuation"
+        roots = parse_list_tree(content)
+        assert len(roots) == 1
+        assert "continuation" in roots[0].text
+
+    def test_empty_content(self):
+        roots = parse_list_tree("")
+        assert roots == []
+
+    def test_list_node_type(self):
+        """반환값이 ListNode 인스턴스인지 확인."""
+        roots = parse_list_tree("- test")
+        assert isinstance(roots[0], ListNode)
+
+    def test_nested_ordered_under_unordered(self):
+        content = "- Parent\n    1. Child ordered"
+        roots = parse_list_tree(content)
+        assert len(roots) == 1
+        assert not roots[0].ordered
+        assert len(roots[0].children) == 1
+        assert roots[0].children[0].ordered
diff --git a/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py b/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py
new file mode 100644
index 000000000..2c08201a6
--- /dev/null
+++ b/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py
@@ -0,0 +1,323 @@
+"""Level 0 helper tests — xhtml_normalizer 모듈 검증.
+
+Phase 0 게이트:
+- extract_plain_text: 다양한 XHTML fragment에서 plain text 추출
+- normalize_fragment: fragment 비교 정규화
+- extract_fragment_by_xpath: 간이 XPath 기반 fragment 추출
+"""
+
+import json
+from pathlib import Path
+
+import pytest
+
+from reverse_sync.xhtml_normalizer import (
+    extract_fragment_by_xpath,
+    extract_plain_text,
+    normalize_fragment,
+)
+
+TESTCASES_DIR = Path(__file__).parent / "testcases"
+
+
+# ---------------------------------------------------------------------------
+# extract_plain_text
+# ---------------------------------------------------------------------------
+
+class TestExtractPlainText:
+    """extract_plain_text() 기본 동작 검증."""
+
+    def test_simple_paragraph(self):
+        fragment = "<p>Hello world</p>"
+        assert extract_plain_text(fragment) == "Hello world"
+
+    def test_paragraph_with_bold(self):
+        fragment = "<p>A <strong>bold</strong> text</p>"
+        assert extract_plain_text(fragment) == "A bold text"
+
+    def test_paragraph_with_link(self):
+        fragment = '<p>See <a href="http://example.com">example</a> here</p>'
+        assert extract_plain_text(fragment) == "See example here"
+
+    def test_paragraph_with_inline_image_excluded(self):
+        """ac:image는 preservation unit이므로 plain text에서 제외된다."""
+        fragment = (
+            '<p>A '
+            '<ac:image ac:align="center"><ri:attachment ri:filename="test.png" /></ac:image>'
+            ' B</p>'
+        )
+        assert extract_plain_text(fragment) == "A  B"
+
+    def test_paragraph_with_inline_link_excluded(self):
+        """ac:link는 preservation unit이므로 plain text에서 제외된다."""
+        fragment = (
+            '<p>Before '
+            '<ac:link><ri:page ri:content-title="Page" /></ac:link>'
+            ' After</p>'
+        )
+        assert extract_plain_text(fragment) == "Before  After"
+
+    def test_paragraph_with_emoticon(self):
+        """ac:emoticon의 fallback 텍스트가 포함된다."""
+        fragment = (
+            '<p>'
+            '<ac:emoticon ac:name="tick" ac:emoji-fallback=":check_mark:" />'
+            ' Success</p>'
+        )
+        assert extract_plain_text(fragment) == ":check_mark: Success"
+
+    def test_code_macro_body_excluded(self):
+        """ac:plain-text-body(코드 블록 본문)는 제외된다."""
+        fragment = (
+            '<ac:structured-macro ac:name="code">'
+            '<ac:parameter ac:name="language">python</ac:parameter>'
+            '<ac:plain-text-body><![CDATA[print("hello")]]></ac:plain-text-body>'
+            '</ac:structured-macro>'
+        )
+        assert extract_plain_text(fragment).strip() == "python"
+
+    def test_list_plain_text(self):
+        fragment = "<ul><li><p>Item 1</p></li><li><p>Item 2</p></li></ul>"
+        text = extract_plain_text(fragment)
+        assert "Item 1" in text
+        assert "Item 2" in text
+
+    def test_heading(self):
+        fragment = "<h2>Section Title</h2>"
+        assert extract_plain_text(fragment) == "Section Title"
+
+    def test_nested_formatting(self):
+        fragment = "<p>A <strong><em>bold italic</em></strong> text</p>"
+        assert extract_plain_text(fragment) == "A bold italic text"
+
+    def test_empty_fragment(self):
+        assert extract_plain_text("") == ""
+        assert extract_plain_text("<p />") == ""
+
+    def test_callout_with_rich_body(self):
+        """callout macro 내부의 rich-text-body에서 텍스트를 추출한다."""
+        fragment = (
+            '<ac:structured-macro ac:name="info">'
+            '<ac:rich-text-body><p>Info text</p></ac:rich-text-body>'
+            '</ac:structured-macro>'
+        )
+        text = extract_plain_text(fragment)
+        assert "Info text" in text
+
+
+# ---------------------------------------------------------------------------
+# extract_plain_text — real testcase fixtures
+# ---------------------------------------------------------------------------
+
+class TestExtractPlainTextFromFixtures:
+    """실제 testcase fixture에서 extract_plain_text 동작 검증."""
+
+    @pytest.fixture
+    def sidecar_blocks(self):
+        """544113141 testcase의 sidecar blocks를 로드한다."""
+        path = TESTCASES_DIR / "544113141" / "expected.roundtrip.json"
+        if not path.exists():
+            pytest.skip("testcase fixture not found")
+        data = json.loads(path.read_text(encoding="utf-8"))
+        return data["blocks"]
+
+    def test_heading_fragment(self, sidecar_blocks):
+        """heading fragment의 plain text가 정확히 추출된다."""
+        block = sidecar_blocks[0]  # h2[1] "Overview"
+        assert block["xhtml_xpath"] == "h2[1]"
+        text = extract_plain_text(block["xhtml_fragment"])
+        assert text == "Overview"
+
+    def test_paragraph_fragment(self, sidecar_blocks):
+        """paragraph fragment의 plain text가 정확히 추출된다."""
+        block = sidecar_blocks[1]  # p[1]
+        assert block["xhtml_xpath"] == "p[1]"
+        text = extract_plain_text(block["xhtml_fragment"])
+        assert "조직에서 관리하는 DB 커넥션" in text
+
+    def test_list_with_image_fragment(self, sidecar_blocks):
+        """list + inline image fragment에서 image가 제외된다."""
+        block = sidecar_blocks[4]  # ol[1]
+        assert block["xhtml_xpath"] == "ol[1]"
+        text = extract_plain_text(block["xhtml_fragment"])
+        # ac:image는 제외되므로 파일명이 없어야 함
+        assert "image-20240730" not in text
+        # 텍스트 내용은 포함
+        assert "DB Access History" in text
+
+
+# ---------------------------------------------------------------------------
+# normalize_fragment
+# ---------------------------------------------------------------------------
+
+class TestNormalizeFragment:
+    """normalize_fragment() 정규화 검증."""
+
+    def test_attribute_order_irrelevant(self):
+        """속성 순서가 달라도 정규화 결과가 같다."""
+        a = '<p id="x" class="y">text</p>'
+        b = '<p class="y" id="x">text</p>'
+        # class는 ignored attribute이므로 제거됨
+        norm_a = normalize_fragment(a)
+        norm_b = normalize_fragment(b)
+        assert norm_a == norm_b
+
+    def test_ignored_attributes_stripped(self):
+        """IGNORED_ATTRIBUTES에 해당하는 속성이 제거된다."""
+        fragment = '<ac:image ac:macro-id="123" ac:align="center"><ri:attachment ri:filename="test.png" /></ac:image>'
+        result = normalize_fragment(fragment)
+        assert "ac:macro-id" not in result
+        assert 'ac:align="center"' in result
+
+    def test_layout_sections_unwrapped(self):
+        fragment = '<ac:layout><ac:layout-section><ac:layout-cell><p>content</p></ac:layout-cell></ac:layout-section></ac:layout>'
+        result = normalize_fragment(fragment)
+        assert "ac:layout" not in result
+        assert "content" in result
+
+    def test_nonreversible_macros_removed(self):
+        fragment = '<ac:structured-macro ac:name="toc"></ac:structured-macro><p>keep</p>'
+        result = normalize_fragment(fragment)
+        assert "toc" not in result
+        assert "keep" in result
+
+    def test_decorations_unwrapped(self):
+        fragment = '<p><ac:inline-comment-marker ac:ref="x">text</ac:inline-comment-marker></p>'
+        result = normalize_fragment(fragment)
+        assert "ac:inline-comment-marker" not in result
+        assert "text" in result
+
+    def test_same_content_normalizes_equal(self):
+        """내용이 동일한 두 fragment는 정규화 후 동일하다."""
+        a = "<p>Hello   <strong>world</strong></p>"
+        b = "<p>Hello   <strong>world</strong></p>"
+        assert normalize_fragment(a) == normalize_fragment(b)
+
+    def test_strip_ignored_attrs_option(self):
+        """strip_ignored_attrs=False면 속성을 유지한다."""
+        fragment = '<ac:image ac:macro-id="123" ac:align="center" />'
+        with_strip = normalize_fragment(fragment, strip_ignored_attrs=True)
+        without_strip = normalize_fragment(fragment, strip_ignored_attrs=False)
+        assert "ac:macro-id" not in with_strip
+        assert "ac:macro-id" in without_strip
+
+
+# ---------------------------------------------------------------------------
+# normalize_fragment — real testcase round-trip
+# ---------------------------------------------------------------------------
+
+class TestNormalizeFragmentRoundtrip:
+    """실제 testcase의 fragment를 정규화해서 자기 자신과 비교."""
+
+    @pytest.mark.parametrize("case_id", [
+        "544113141", "544381877", "544112828",
+    ])
+    def test_fragment_self_normalize_equal(self, case_id):
+        """같은 fragment를 두 번 정규화하면 결과가 동일하다 (idempotent)."""
+        path = TESTCASES_DIR / case_id / "expected.roundtrip.json"
+        if not path.exists():
+            pytest.skip(f"testcase {case_id} not found")
+        data = json.loads(path.read_text(encoding="utf-8"))
+        for block in data["blocks"]:
+            frag = block["xhtml_fragment"]
+            first = normalize_fragment(frag)
+            second = normalize_fragment(first)
+            assert first == second, (
+                f"normalize_fragment is not idempotent for "
+                f"{case_id} block {block['block_index']} ({block['xhtml_xpath']})"
+            )
+
+
+# ---------------------------------------------------------------------------
+# extract_fragment_by_xpath
+# ---------------------------------------------------------------------------
+
+class TestExtractFragmentByXpath:
+    """extract_fragment_by_xpath() 검증."""
+
+    def test_simple_xpath(self):
+        xhtml = "<h2>Title</h2><p>Para 1</p><p>Para 2</p>"
+        result = extract_fragment_by_xpath(xhtml, "p[2]")
+        assert result is not None
+        assert "Para 2" in result
+
+    def test_heading_xpath(self):
+        xhtml = "<h2>First</h2><h2>Second</h2>"
+        result = extract_fragment_by_xpath(xhtml, "h2[2]")
+        assert result is not None
+        assert "Second" in result
+
+    def test_list_xpath(self):
+        xhtml = "<p>text</p><ul><li><p>item</p></li></ul>"
+        result = extract_fragment_by_xpath(xhtml, "ul[1]")
+        assert result is not None
+        assert "item" in result
+
+    def test_macro_xpath(self):
+        xhtml = (
+            '<ac:structured-macro ac:name="info">'
+            '<ac:rich-text-body><p>info body</p></ac:rich-text-body>'
+            '</ac:structured-macro>'
+        )
+        result = extract_fragment_by_xpath(xhtml, "macro-info[1]")
+        assert result is not None
+        assert "info body" in result
+
+    def test_compound_xpath(self):
+        xhtml = (
+            '<ac:structured-macro ac:name="note">'
+            '<ac:rich-text-body><p>P1</p><p>P2</p></ac:rich-text-body>'
+            '</ac:structured-macro>'
+        )
+        result = extract_fragment_by_xpath(xhtml, "macro-note[1]/p[2]")
+        assert result is not None
+        assert "P2" in result
+
+    def test_nonexistent_xpath_returns_none(self):
+        xhtml = "<p>only one</p>"
+        assert extract_fragment_by_xpath(xhtml, "p[2]") is None
+        assert extract_fragment_by_xpath(xhtml, "h2[1]") is None
+
+    def test_multi_level_xpath(self):
+        """ul[1]/li[2] 같은 다단계 xpath."""
+        xhtml = "<ul><li><p>A</p></li><li><p>B</p></li></ul>"
+        result = extract_fragment_by_xpath(xhtml, "ul[1]/li[2]")
+        assert result is not None
+        assert "B" in result
+
+
+# ---------------------------------------------------------------------------
+# extract_fragment_by_xpath — real testcase fixtures
+# ---------------------------------------------------------------------------
+
+class TestExtractFragmentByXpathFromFixtures:
+    """실제 testcase page.xhtml에서 xpath 추출 검증."""
+
+    @pytest.mark.parametrize("case_id", [
+        "544113141", "544381877",
+    ])
+    def test_sidecar_xpath_matches_page(self, case_id):
+        """sidecar의 xhtml_xpath로 page.xhtml에서 fragment를 추출할 수 있다."""
+        sidecar_path = TESTCASES_DIR / case_id / "expected.roundtrip.json"
+        page_path = TESTCASES_DIR / case_id / "page.xhtml"
+        if not sidecar_path.exists() or not page_path.exists():
+            pytest.skip(f"testcase {case_id} not found")
+
+        data = json.loads(sidecar_path.read_text(encoding="utf-8"))
+        page_xhtml = page_path.read_text(encoding="utf-8")
+
+        for block in data["blocks"]:
+            xpath = block["xhtml_xpath"]
+            # compound xpath(child xpath)는 top-level만 테스트
+            if "/" in xpath:
+                continue
+            extracted = extract_fragment_by_xpath(page_xhtml, xpath)
+            assert extracted is not None, (
+                f"Failed to extract {xpath} from {case_id}"
+            )
+            # 추출된 fragment의 plain text가 sidecar fragment와 일치
+            expected_text = extract_plain_text(block["xhtml_fragment"])
+            actual_text = extract_plain_text(extracted)
+            assert expected_text.strip() == actual_text.strip(), (
+                f"Plain text mismatch for {case_id} {xpath}"
+            )

From 4d98766f41e6aea0354a035c603ddf978e20400b Mon Sep 17 00:00:00 2001
From: JK <jk@chequer.io>
Date: Fri, 13 Mar 2026 22:05:27 +0900
Subject: [PATCH 2/4] =?UTF-8?q?refactor(reverse=5Fsync):=20xhtml=5Fnormali?=
 =?UTF-8?q?zer=20DRY=20=EA=B0=9C=EC=84=A0=20=EB=B0=8F=20=EA=B3=B5=EC=9A=A9?=
 =?UTF-8?q?=20helper=20=EC=8A=B9=EA=B2=A9=ED=95=A9=EB=8B=88=EB=8B=A4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- mapping_recorder의 _iter_block_children, _get_text_with_emoticons 등을
  public API로 승격하고 backward-compat alias를 추가합니다
- xhtml_normalizer에서 중복 정의를 제거하고 mapping_recorder에서 import합니다
- normalize_fragment에 ignore_ri_filename 파라미터를 추가합니다
- _strip_decorations에 빈 <p> 제거 로직을 추가합니다
- mdx_to_storage_xhtml_verify의 중복 정규화 코드를 normalize_soup()으로 교체합니다
- 새 테스트 2건 추가 (ignore_ri_filename, empty_paragraph_removed)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../bin/reverse_sync/fragment_extractor.py    |  4 +-
 .../bin/reverse_sync/mapping_recorder.py      | 43 ++++++++-----
 .../mdx_to_storage_xhtml_verify.py            | 62 +------------------
 .../bin/reverse_sync/xhtml_normalizer.py      | 59 ++++++++++++------
 .../bin/reverse_sync/xhtml_patcher.py         | 12 ++--
 .../test_reverse_sync_xhtml_normalizer.py     | 15 +++++
 6 files changed, 92 insertions(+), 103 deletions(-)

diff --git a/confluence-mdx/bin/reverse_sync/fragment_extractor.py b/confluence-mdx/bin/reverse_sync/fragment_extractor.py
index 8f839246c..b249dad40 100644
--- a/confluence-mdx/bin/reverse_sync/fragment_extractor.py
+++ b/confluence-mdx/bin/reverse_sync/fragment_extractor.py
@@ -14,7 +14,7 @@
 
 from bs4 import BeautifulSoup, NavigableString, Tag
 
-from reverse_sync.mapping_recorder import _iter_block_children
+from reverse_sync.mapping_recorder import iter_block_children
 
 
 @dataclass
@@ -43,7 +43,7 @@ def extract_block_fragments(xhtml_text: str) -> FragmentExtractionResult:
 
     # Top-level element 순서 파악
     top_elements: List[Tuple[str, str]] = []
-    for child in _iter_block_children(soup):
+    for child in iter_block_children(soup):
         if isinstance(child, Tag):
             top_elements.append(("tag", child.name))
         elif isinstance(child, NavigableString):
diff --git a/confluence-mdx/bin/reverse_sync/mapping_recorder.py b/confluence-mdx/bin/reverse_sync/mapping_recorder.py
index 9ef485e37..275a2fe6f 100644
--- a/confluence-mdx/bin/reverse_sync/mapping_recorder.py
+++ b/confluence-mdx/bin/reverse_sync/mapping_recorder.py
@@ -17,10 +17,13 @@ class BlockMapping:
 
 HEADING_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
 
-_CALLOUT_MACRO_NAMES = frozenset({'tip', 'info', 'note', 'warning', 'panel'})
+CALLOUT_MACRO_NAMES = frozenset({'tip', 'info', 'note', 'warning', 'panel'})
 
+# backward-compat aliases
+_CALLOUT_MACRO_NAMES = CALLOUT_MACRO_NAMES
 
-def _get_text_with_emoticons(element) -> str:
+
+def get_text_with_emoticons(element) -> str:
     """get_text()와 동일하지만 ac:emoticon의 fallback 텍스트를 포함한다.
 
     Confluence의 <ac:emoticon> 태그는 self-closing으로 텍스트 노드가 없어서
@@ -38,11 +41,14 @@ def _get_text_with_emoticons(element) -> str:
                 if fallback:
                     parts.append(fallback)
             else:
-                parts.append(_get_text_with_emoticons(item))
+                parts.append(get_text_with_emoticons(item))
     return ''.join(parts)
 
+# backward-compat alias
+_get_text_with_emoticons = get_text_with_emoticons
+
 
-def _iter_block_children(parent):
+def iter_block_children(parent):
     """블록 레벨 자식을 순회한다. ac:layout은 cell 내부로 진입한다."""
     for child in parent.children:
         if isinstance(child, Tag) and child.name == 'ac:layout':
@@ -52,6 +58,9 @@ def _iter_block_children(parent):
         else:
             yield child
 
+# backward-compat alias
+_iter_block_children = iter_block_children
+
 
 def record_mapping(xhtml: str) -> List[BlockMapping]:
     """XHTML에서 블록 레벨 요소를 추출하여 매핑 레코드를 생성한다."""
@@ -59,7 +68,7 @@ def record_mapping(xhtml: str) -> List[BlockMapping]:
     mappings: List[BlockMapping] = []
     counters: dict = {}
 
-    for child in _iter_block_children(soup):
+    for child in iter_block_children(soup):
         if isinstance(child, NavigableString):
             if child.strip():
                 _add_mapping(mappings, counters, 'p', child.strip(), child.strip())
@@ -93,24 +102,24 @@ def record_mapping(xhtml: str) -> List[BlockMapping]:
                              block_type='code')
             else:
                 # Callout 매크로: body 텍스트만 추출 (파라미터 메타데이터 제외)
-                if macro_name in _CALLOUT_MACRO_NAMES:
+                if macro_name in CALLOUT_MACRO_NAMES:
                     rich_body = child.find('ac:rich-text-body')
-                    plain = _get_text_with_emoticons(rich_body) if rich_body else child.get_text()
+                    plain = get_text_with_emoticons(rich_body) if rich_body else child.get_text()
                 else:
                     plain = child.get_text()
                 _add_mapping(mappings, counters, f'macro-{macro_name}', str(child), plain,
                              block_type='html_block')
                 # Callout 매크로: 자식 요소 개별 매핑 추가
-                if macro_name in _CALLOUT_MACRO_NAMES:
+                if macro_name in CALLOUT_MACRO_NAMES:
                     parent_mapping = mappings[-1]
                     _add_rich_text_body_children(
                         child, parent_mapping, mappings, counters)
         elif tag_name == 'ac:adf-extension':
-            panel_type = _get_adf_panel_type(child)
+            panel_type = get_adf_panel_type(child)
             plain = child.get_text()
             _add_mapping(mappings, counters, tag_name, str(child), plain,
                          block_type='html_block')
-            if panel_type in _CALLOUT_MACRO_NAMES:
+            if panel_type in CALLOUT_MACRO_NAMES:
                 parent_mapping = mappings[-1]
                 _add_adf_content_children(
                     child, parent_mapping, mappings, counters)
@@ -172,7 +181,7 @@ def _add_container_children(
         child_counters[tag] = child_counters.get(tag, 0) + 1
         child_xpath = f"{parent_xpath}/{tag}[{child_counters[tag]}]"
 
-        plain = _get_text_with_emoticons(child)
+        plain = get_text_with_emoticons(child)
         if tag in ('ul', 'ol', 'table'):
             inner = str(child)
         else:
@@ -206,7 +215,7 @@ def _add_rich_text_body_children(
     _add_container_children(rich_body, parent_mapping, mappings, counters)
 
 
-def _get_adf_panel_type(element: Tag) -> str:
+def get_adf_panel_type(element: Tag) -> str:
     """ac:adf-extension 요소에서 panel-type을 추출한다."""
     node = element.find('ac:adf-node')
     if node is None:
@@ -216,14 +225,20 @@ def _get_adf_panel_type(element: Tag) -> str:
         return ''
     return attr.get_text().strip()
 
+# backward-compat alias
+_get_adf_panel_type = get_adf_panel_type
 
-def _get_adf_content_body(element: Tag):
+
+def get_adf_content_body(element: Tag):
     """ac:adf-extension 요소에서 ac:adf-content를 찾는다."""
     node = element.find('ac:adf-node')
     if node is None:
         return None
     return node.find('ac:adf-content')
 
+# backward-compat alias
+_get_adf_content_body = get_adf_content_body
+
 
 def _add_adf_content_children(
     adf_element: Tag,
@@ -232,5 +247,5 @@ def _add_adf_content_children(
     counters: dict,
 ):
     """ac:adf-extension의 ac:adf-content 내 자식 요소를 개별 매핑으로 추가한다."""
-    content_body = _get_adf_content_body(adf_element)
+    content_body = get_adf_content_body(adf_element)
     _add_container_children(content_body, parent_mapping, mappings, counters)
diff --git a/confluence-mdx/bin/reverse_sync/mdx_to_storage_xhtml_verify.py b/confluence-mdx/bin/reverse_sync/mdx_to_storage_xhtml_verify.py
index 78adde9e4..d1074c109 100644
--- a/confluence-mdx/bin/reverse_sync/mdx_to_storage_xhtml_verify.py
+++ b/confluence-mdx/bin/reverse_sync/mdx_to_storage_xhtml_verify.py
@@ -14,32 +14,10 @@
 from bs4 import BeautifulSoup
 from mdx_to_storage import emit_document, parse_mdx
 from mdx_to_storage.link_resolver import LinkResolver
+from reverse_sync.xhtml_normalizer import normalize_soup
 from xhtml_beautify_diff import beautify_xhtml, xhtml_diff
 
 
-_IGNORED_ATTRIBUTES = {
-    "ac:macro-id",
-    "ac:local-id",
-    "local-id",
-    "ac:schema-version",
-    "ri:version-at-save",
-    "ac:original-height",
-    "ac:original-width",
-    "ac:custom-width",
-    "ac:alt",
-    "ac:layout",
-    "data-table-width",
-    "data-layout",
-    "data-highlight-colour",
-    "data-card-appearance",
-    "ac:breakout-mode",
-    "ac:breakout-width",
-    "ri:space-key",
-    "style",
-    "class",
-}
-
-
 @dataclass
 class CaseVerification:
     case_id: str
@@ -77,10 +55,7 @@ def mdx_to_storage_xhtml_fragment(
 
 def _normalize_xhtml(xhtml: str, ignore_ri_filename: bool = False) -> str:
     soup = BeautifulSoup(xhtml, "html.parser")
-    _strip_layout_sections(soup)
-    _strip_nonreversible_macros(soup)
-    _strip_decorations(soup)
-    _strip_ignored_attributes(soup, ignore_ri_filename=ignore_ri_filename)
+    normalize_soup(soup, ignore_ri_filename=ignore_ri_filename)
     return beautify_xhtml(str(soup)).strip()
 
 
@@ -106,39 +81,6 @@ def verify_expected_mdx_against_page_xhtml(
     return False, generated, "\n".join(diff_lines)
 
 
-def _strip_ignored_attributes(soup: BeautifulSoup, ignore_ri_filename: bool = False) -> None:
-    ignored_attrs = set(_IGNORED_ATTRIBUTES)
-    if ignore_ri_filename:
-        ignored_attrs.add("ri:filename")
-    for tag in soup.find_all(True):
-        for attr in list(tag.attrs.keys()):
-            if attr in ignored_attrs:
-                del tag.attrs[attr]
-
-
-def _strip_layout_sections(soup: BeautifulSoup) -> None:
-    for tag_name in ("ac:layout", "ac:layout-section", "ac:layout-cell"):
-        for tag in soup.find_all(tag_name):
-            tag.unwrap()
-
-
-def _strip_nonreversible_macros(soup: BeautifulSoup) -> None:
-    for macro in soup.find_all("ac:structured-macro"):
-        if macro.get("ac:name") in {"toc", "view-file"}:
-            macro.decompose()
-
-
-def _strip_decorations(soup: BeautifulSoup) -> None:
-    for tag_name in ("ac:adf-mark", "ac:inline-comment-marker"):
-        for tag in soup.find_all(tag_name):
-            tag.unwrap()
-    for colgroup in soup.find_all("colgroup"):
-        colgroup.decompose()
-    for p in soup.find_all("p"):
-        if not p.get_text(strip=True) and not p.find_all(True):
-            p.decompose()
-
-
 def iter_testcase_dirs(testcases_dir: Path) -> Iterable[Path]:
     """`page.xhtml`과 `expected.mdx`가 있는 테스트케이스 디렉토리를 순회한다."""
     for child in sorted(testcases_dir.iterdir()):
diff --git a/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py b/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py
index 6b7377180..ffd4da881 100644
--- a/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py
+++ b/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py
@@ -11,6 +11,8 @@
 
 from bs4 import BeautifulSoup, NavigableString, Tag
 
+from reverse_sync.mapping_recorder import iter_block_children
+
 
 # ---------------------------------------------------------------------------
 # Ignored attributes — 비교 시 무시하는 Confluence 메타데이터 속성
@@ -82,21 +84,42 @@ def _extract_text_from_element(element) -> str:
 # Fragment normalization
 # ---------------------------------------------------------------------------
 
-def normalize_fragment(fragment: str, strip_ignored_attrs: bool = True) -> str:
+def normalize_soup(
+    soup: BeautifulSoup,
+    *,
+    strip_ignored_attrs: bool = True,
+    ignore_ri_filename: bool = False,
+) -> None:
+    """Normalize a BeautifulSoup tree in place; nothing is returned.
+
+    Core normalization logic shared by normalize_fragment() and the verify module.
+    """
+    _strip_layout_sections(soup)
+    _strip_nonreversible_macros(soup)
+    _strip_decorations(soup)
+    if strip_ignored_attrs:
+        _strip_ignored_attributes(soup, ignore_ri_filename=ignore_ri_filename)
+
+
+def normalize_fragment(
+    fragment: str,
+    strip_ignored_attrs: bool = True,
+    ignore_ri_filename: bool = False,
+) -> str:
     """XHTML fragment를 비교 가능한 정규화된 형태로 변환한다.
 
     - layout section unwrap
     - non-reversible macro 제거
-    - decoration unwrap
+    - decoration unwrap + 빈 <p> 제거
     - ignored attribute 제거 (선택)
     - BeautifulSoup prettify로 노드별 줄바꿈
     """
     soup = BeautifulSoup(fragment, "html.parser")
-    _strip_layout_sections(soup)
-    _strip_nonreversible_macros(soup)
-    _strip_decorations(soup)
-    if strip_ignored_attrs:
-        _strip_ignored_attributes(soup)
+    normalize_soup(
+        soup,
+        strip_ignored_attrs=strip_ignored_attrs,
+        ignore_ri_filename=ignore_ri_filename,
+    )
     return soup.prettify(formatter="minimal").strip()
 
 
@@ -118,13 +141,20 @@ def _strip_decorations(soup: BeautifulSoup) -> None:
             tag.unwrap()
     for colgroup in soup.find_all("colgroup"):
         colgroup.decompose()
+    # 빈 <p> 제거 (decoration unwrap 후 남는 빈 요소)
+    for p in soup.find_all("p"):
+        if not p.get_text(strip=True) and not p.find_all(True):
+            p.decompose()
 
 
 def _strip_ignored_attributes(
     soup: BeautifulSoup,
     extra: Optional[frozenset[str]] = None,
+    ignore_ri_filename: bool = False,
 ) -> None:
-    ignored = IGNORED_ATTRIBUTES | extra if extra else IGNORED_ATTRIBUTES
+    ignored = set(IGNORED_ATTRIBUTES).union(extra or ())  # one mutable copy; extra may be None
+    if ignore_ri_filename:
+        ignored.add("ri:filename")
     for tag in soup.find_all(True):
         for attr in list(tag.attrs.keys()):
             if attr in ignored:
@@ -186,7 +216,7 @@ def _find_element_by_simple_xpath(parent, xpath: str):
         macro_name = tag_name[len("macro-"):]
 
     count = 0
-    for child in _iter_block_children(parent):
+    for child in iter_block_children(parent):
         if not isinstance(child, Tag):
             continue
         if macro_name:
@@ -201,17 +231,6 @@ def _find_element_by_simple_xpath(parent, xpath: str):
     return None
 
 
-def _iter_block_children(parent):
-    """블록 레벨 자식을 순회한다. ac:layout은 cell 내부로 진입한다."""
-    for child in parent.children:
-        if isinstance(child, Tag) and child.name == "ac:layout":
-            for section in child.find_all("ac:layout-section", recursive=False):
-                for cell in section.find_all("ac:layout-cell", recursive=False):
-                    yield from cell.children
-        else:
-            yield child
-
-
 def _find_content_container(parent: Tag):
     """복합 xpath의 부모에서 콘텐츠 컨테이너를 찾는다."""
     rich_body = parent.find("ac:rich-text-body")
diff --git a/confluence-mdx/bin/reverse_sync/xhtml_patcher.py b/confluence-mdx/bin/reverse_sync/xhtml_patcher.py
index bfb1f10c1..116ecf9f6 100644
--- a/confluence-mdx/bin/reverse_sync/xhtml_patcher.py
+++ b/confluence-mdx/bin/reverse_sync/xhtml_patcher.py
@@ -3,9 +3,7 @@
 from bs4 import BeautifulSoup, NavigableString, Tag
 import difflib
 import re
-from reverse_sync.mapping_recorder import _iter_block_children
-
-from reverse_sync.mapping_recorder import _get_text_with_emoticons
+from reverse_sync.mapping_recorder import get_text_with_emoticons, iter_block_children
 
 
 def patch_xhtml(xhtml: str, patches: List[Dict[str, str]]) -> str:
@@ -69,7 +67,7 @@ def patch_xhtml(xhtml: str, patches: List[Dict[str, str]]) -> str:
             # patch 적용 시에는 기본 비교를 get_text()로 수행하고, 필요 시 emoticon fallback 텍스트 비교를 허용한다.
             current_plain = element.get_text()
             if old_text and current_plain.strip() != old_text.strip():
-                current_plain_with_emoticons = _get_text_with_emoticons(element)
+                current_plain_with_emoticons = get_text_with_emoticons(element)
                 if current_plain_with_emoticons.strip() != old_text.strip():
                     continue
             _replace_inner_html(element, patch['new_inner_xhtml'])
@@ -86,7 +84,7 @@ def patch_xhtml(xhtml: str, patches: List[Dict[str, str]]) -> str:
             # mapping plain(old_text)과의 비교는 get_text() 우선, 실패 시 emoticon fallback 포함 텍스트로 재확인한다.
             current_plain = element.get_text()
             if current_plain.strip() != old_text.strip():
-                current_plain_with_emoticons = _get_text_with_emoticons(element)
+                current_plain_with_emoticons = get_text_with_emoticons(element)
                 if current_plain_with_emoticons.strip() != old_text.strip():
                     continue
             _apply_text_changes(element, old_text, new_text)
@@ -130,7 +128,7 @@ def _insert_element_resolved(soup: BeautifulSoup, anchor, new_html: str):
 
 def _find_first_block_element(soup: BeautifulSoup):
     """soup의 첫 번째 블록 레벨 요소를 찾는다."""
-    for child in _iter_block_children(soup):
+    for child in iter_block_children(soup):
         if isinstance(child, Tag):
             return child
     return None
@@ -212,7 +210,7 @@ def _find_element_by_simple_xpath(soup: BeautifulSoup, xpath: str):
         macro_name = tag_name[len('macro-'):]
 
     count = 0
-    for child in _iter_block_children(soup):
+    for child in iter_block_children(soup):
         if not isinstance(child, Tag):
             continue
         if macro_name:
diff --git a/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py b/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py
index 2c08201a6..d8ca14be1 100644
--- a/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py
+++ b/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py
@@ -201,6 +201,21 @@ def test_strip_ignored_attrs_option(self):
         assert "ac:macro-id" not in with_strip
         assert "ac:macro-id" in without_strip
 
+    def test_ignore_ri_filename_option(self):
+        """With ignore_ri_filename=True the ri:filename attribute is stripped as well."""
+        fragment = '<ri:attachment ri:filename="test.png" />'
+        normal = normalize_fragment(fragment)
+        ignored = normalize_fragment(fragment, ignore_ri_filename=True)
+        assert 'ri:filename' in normal
+        assert 'ri:filename' not in ignored
+
+    def test_empty_paragraph_removed(self):
+        """A <p> left empty after decoration unwrap is removed; the non-empty <p> stays."""
+        fragment = '<p><ac:inline-comment-marker ac:ref="x"></ac:inline-comment-marker></p><p>keep</p>'
+        result = normalize_fragment(fragment)
+        assert "keep" in result
+        assert result.count("<p>") == 1  # fails if the emptied <p> is still present
+
 
 # ---------------------------------------------------------------------------
 # normalize_fragment — real testcase round-trip

From 46faf69ac801cb441269ddf11e4ed8310d54df16 Mon Sep 17 00:00:00 2001
From: JK <jk@chequer.io>
Date: Fri, 13 Mar 2026 22:09:16 +0900
Subject: [PATCH 3/4] =?UTF-8?q?fix(reverse=5Fsync):=20=EB=A6=AC=EB=B7=B0?=
 =?UTF-8?q?=20=EB=B0=98=EC=98=81=20=E2=80=94=20ac:link=20plain=20text=20?=
 =?UTF-8?q?=ED=8F=AC=ED=95=A8=20=EB=B0=8F=20ListNode=20start=20=EB=B3=B4?=
 =?UTF-8?q?=EC=A1=B4=ED=95=A9=EB=8B=88=EB=8B=A4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- extract_plain_text에서 ac:link를 skip하지 않고 visible label(ac:link-body)
  텍스트를 포함하도록 수정합니다 (기존 mapping_recorder contract와 일치)
- ListNode에 start 필드를 추가하여 ordered list marker number를 보존합니다
- _parse_list_items에서 marker number를 파싱하여 start에 저장합니다
- _render_list_nodes에서 첫 항목의 start를 <ol start="N">에 반영합니다
- 관련 테스트 추가/수정 (ac:link with body, start number preservation)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 confluence-mdx/bin/mdx_to_storage/emitter.py       | 11 +++++++----
 .../bin/reverse_sync/xhtml_normalizer.py           |  4 ++--
 .../tests/test_reverse_sync_list_tree.py           | 14 ++++++++++++++
 .../tests/test_reverse_sync_xhtml_normalizer.py    | 13 +++++++++++--
 4 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/confluence-mdx/bin/mdx_to_storage/emitter.py b/confluence-mdx/bin/mdx_to_storage/emitter.py
index ba4340f70..952e1a198 100644
--- a/confluence-mdx/bin/mdx_to_storage/emitter.py
+++ b/confluence-mdx/bin/mdx_to_storage/emitter.py
@@ -11,7 +11,7 @@
 from .parser import Block, HEADING_PATTERN
 
 
-_ORDERED_LIST_PATTERN = re.compile(r"^\d+\.\s+(.*)$")
+_ORDERED_LIST_PATTERN = re.compile(r"^(\d+)\.\s+(.*)$")
 _UNORDERED_LIST_PATTERN = re.compile(r"^[-*+]\s+(.*)$")
 _HEADING_LINE_PATTERN = HEADING_PATTERN
 _CALLOUT_TYPE_TO_MACRO = {
@@ -43,10 +43,11 @@ class ListNode:
     Public API for reconstruction pipeline.
     """
 
-    def __init__(self, ordered: bool, text: str, depth: int) -> None:
+    def __init__(self, ordered: bool, text: str, depth: int, start: int | None = None) -> None:
         self.ordered = ordered
         self.text = text
         self.depth = depth
+        self.start = start  # ordered list marker number (e.g. 2 for "2. item")
         self.children: list["ListNode"] = []
 
 
@@ -189,7 +190,8 @@ def _parse_list_items(content: str) -> list[_ListNode]:
 
         ordered_match = _ORDERED_LIST_PATTERN.match(stripped)
         if ordered_match:
-            items.append(_ListNode(True, ordered_match.group(1), depth))
+            marker_num = int(ordered_match.group(1))
+            items.append(_ListNode(True, ordered_match.group(2), depth, start=marker_num))
             continue
 
         unordered_match = _UNORDERED_LIST_PATTERN.match(stripped)
@@ -234,7 +236,8 @@ def _render_list_nodes(
 
         body = "".join(_render_list_item(node, link_resolver=link_resolver) for node in group)
         if tag == "ol":
-            parts.append(f'<ol start="1">{body}</ol>')
+            start = group[0].start if group[0].start is not None else 1
+            parts.append(f'<ol start="{start}">{body}</ol>')
         else:
             parts.append(f"<ul>{body}</ul>")
     return "".join(parts)
diff --git a/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py b/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py
index ffd4da881..6962a849c 100644
--- a/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py
+++ b/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py
@@ -73,8 +73,8 @@ def _extract_text_from_element(element) -> str:
                 if fallback:
                     parts.append(fallback)
                 continue
-            # ac:image, ac:link 등 preservation unit은 텍스트 없음 (anchor로 처리)
-            if child.name in ("ac:image", "ac:link"):
+            # ac:image는 preservation unit — 텍스트 없음 (anchor로 처리)
+            if child.name == "ac:image":
                 continue
             parts.append(_extract_text_from_element(child))
     return "".join(parts)
diff --git a/confluence-mdx/tests/test_reverse_sync_list_tree.py b/confluence-mdx/tests/test_reverse_sync_list_tree.py
index 25016f8b4..1cc5d79f9 100644
--- a/confluence-mdx/tests/test_reverse_sync_list_tree.py
+++ b/confluence-mdx/tests/test_reverse_sync_list_tree.py
@@ -26,6 +26,9 @@ def test_simple_ordered_list(self):
         assert len(roots) == 3
         assert all(node.ordered for node in roots)
         assert roots[0].text == "First"
+        assert roots[0].start == 1
+        assert roots[1].start == 2
+        assert roots[2].start == 3
 
     def test_nested_list(self):
         content = "- Parent\n    - Child 1\n    - Child 2"
@@ -71,5 +74,16 @@ def test_nested_ordered_under_unordered(self):
         roots = parse_list_tree(content)
         assert len(roots) == 1
         assert not roots[0].ordered
+        assert roots[0].start is None  # unordered → no start
         assert len(roots[0].children) == 1
         assert roots[0].children[0].ordered
+        assert roots[0].children[0].start == 1
+
+    def test_ordered_list_start_number_preserved(self):
+        """Marker numbers are preserved for an ordered list that starts mid-sequence."""
+        content = "2. Second\n3. Third\n4. Fourth"
+        roots = parse_list_tree(content)
+        assert len(roots) == 3
+        assert roots[0].start == 2
+        assert roots[1].start == 3
+        assert roots[2].start == 4
diff --git a/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py b/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py
index d8ca14be1..c61900fb6 100644
--- a/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py
+++ b/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py
@@ -48,8 +48,8 @@ def test_paragraph_with_inline_image_excluded(self):
         )
         assert extract_plain_text(fragment) == "A  B"
 
-    def test_paragraph_with_inline_link_excluded(self):
-        """ac:link는 preservation unit이므로 plain text에서 제외된다."""
+    def test_paragraph_with_inline_link_no_body(self):
+        """ac:link에 link-body가 없으면 텍스트가 비어있다."""
         fragment = (
             '<p>Before '
             '<ac:link><ri:page ri:content-title="Page" /></ac:link>'
@@ -57,6 +57,15 @@ def test_paragraph_with_inline_link_excluded(self):
         )
         assert extract_plain_text(fragment) == "Before  After"
 
+    def test_paragraph_with_inline_link_with_body(self):
+        """When ac:link has an ac:link-body, its visible label is included in the plain text."""
+        fragment = (
+            '<p>참조: '
+            '<ac:link><ri:page ri:content-title="Page" />'
+            '<ac:link-body>Click here</ac:link-body></ac:link></p>'
+        )
+        assert extract_plain_text(fragment) == "참조: Click here"
+
     def test_paragraph_with_emoticon(self):
         """ac:emoticon의 fallback 텍스트가 포함된다."""
         fragment = (

From 35fdb0c4c2854c26ff32726a4e321953d31dc7f4 Mon Sep 17 00:00:00 2001
From: JK <jk@chequer.io>
Date: Fri, 13 Mar 2026 22:55:23 +0900
Subject: [PATCH 4/4] =?UTF-8?q?fix(reverse=5Fsync):=20extract=5Fplain=5Fte?=
 =?UTF-8?q?xt=EC=97=90=EC=84=9C=20code=20macro=20=EB=B3=B8=EB=AC=B8?=
 =?UTF-8?q?=EC=9D=84=20=ED=8F=AC=ED=95=A8=ED=95=A9=EB=8B=88=EB=8B=A4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- ac:plain-text-body 제외 로직을 제거하여 코드 블록 본문이 plain text에
  포함되도록 수정합니다
- 기존 mapping_recorder의 plain text contract와 일치시킵니다
- reconstruction anchor offset 좌표계에서 코드 본문 누락 방지

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 confluence-mdx/bin/reverse_sync/xhtml_normalizer.py       | 8 +++-----
 .../tests/test_reverse_sync_xhtml_normalizer.py           | 7 ++++---
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py b/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py
index 6962a849c..f0ea77e48 100644
--- a/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py
+++ b/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py
@@ -48,8 +48,9 @@
 def extract_plain_text(fragment: str) -> str:
     """XHTML fragment에서 plain text를 추출한다.
 
-    ac:plain-text-body(코드 블록 본문)는 제외하고,
-    ac:emoticon의 fallback 텍스트는 포함한다.
+    ac:emoticon의 fallback 텍스트를 포함하고,
+    ac:image만 preservation unit으로 제외한다.
+    코드 블록 본문(ac:plain-text-body)과 링크 label(ac:link-body)은 포함한다.
 
     이 함수의 출력은 reconstruction에서 anchor offset 좌표의 기준이 된다.
     """
@@ -64,9 +65,6 @@ def _extract_text_from_element(element) -> str:
         if isinstance(child, NavigableString):
             parts.append(str(child))
         elif isinstance(child, Tag):
-            # 코드 블록 본문은 제외
-            if child.name == "ac:plain-text-body":
-                continue
             # emoticon은 fallback 텍스트 사용
             if child.name == "ac:emoticon":
                 fallback = child.get("ac:emoji-fallback", "")
diff --git a/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py b/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py
index c61900fb6..61fdef040 100644
--- a/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py
+++ b/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py
@@ -75,15 +75,16 @@ def test_paragraph_with_emoticon(self):
         )
         assert extract_plain_text(fragment) == ":check_mark: Success"
 
-    def test_code_macro_body_excluded(self):
-        """ac:plain-text-body(코드 블록 본문)는 제외된다."""
+    def test_code_macro_body_included(self):
+        """코드 블록 본문(ac:plain-text-body)이 plain text에 포함된다."""
         fragment = (
             '<ac:structured-macro ac:name="code">'
             '<ac:parameter ac:name="language">python</ac:parameter>'
             '<ac:plain-text-body><![CDATA[print("hello")]]></ac:plain-text-body>'
             '</ac:structured-macro>'
         )
-        assert extract_plain_text(fragment).strip() == "python"
+        text = extract_plain_text(fragment)
+        assert 'print("hello")' in text
 
     def test_list_plain_text(self):
         fragment = "<ul><li><p>Item 1</p></li><li><p>Item 2</p></li></ul>"