querypie · jk-kim0 · Mar 13, 2026 · Mar 13, 2026 · Mar 13, 2026 · Mar 13, 2026
@@ -1,19 +1,21 @@
 """MDX -> Confluence Storage XHTML conversion package."""
 
-from .emitter import emit_block, emit_document
+from .emitter import ListNode, emit_block, emit_document, parse_list_tree
 from .inline import convert_heading_inline, convert_inline
 from .link_resolver import LinkResolver, PageEntry, load_pages_yaml
 from .parser import Block, parse_mdx, parse_mdx_blocks
 
 __all__ = [
     "Block",
     "LinkResolver",
+    "ListNode",
     "PageEntry",
     "convert_heading_inline",
     "convert_inline",
     "emit_block",
     "emit_document",
     "load_pages_yaml",
+    "parse_list_tree",
     "parse_mdx",
     "parse_mdx_blocks",
 ]
@@ -11,7 +11,7 @@
 from .parser import Block, HEADING_PATTERN
 
 
-_ORDERED_LIST_PATTERN = re.compile(r"^\d+\.\s+(.*)$")
+_ORDERED_LIST_PATTERN = re.compile(r"^(\d+)\.\s+(.*)$")
 _UNORDERED_LIST_PATTERN = re.compile(r"^[-*+]\s+(.*)$")
 _HEADING_LINE_PATTERN = HEADING_PATTERN
 _CALLOUT_TYPE_TO_MACRO = {
@@ -37,12 +37,22 @@
 _IMG_ATTR_RE = re.compile(r'(\w[\w-]*)=(?:"([^"]*)"|\'([^\']*)\')')
 
 
-class _ListNode:
-    def __init__(self, ordered: bool, text: str, depth: int) -> None:
+class ListNode:
+    """A list item node in the tree-based list representation.
+
+    Public API used by the reconstruction pipeline.
+    """
+
+    def __init__(self, ordered: bool, text: str, depth: int, start: int | None = None) -> None:
         self.ordered = ordered
         self.text = text
         self.depth = depth
-        self.children: list["_ListNode"] = []
+        self.start = start  # ordered list marker number (e.g. 2 for "2. item")
+        self.children: list["ListNode"] = []
+
+
+# backward-compat alias (internal)
+_ListNode = ListNode
 
 
 def emit_block(block: Block, context: Optional[dict] = None) -> str:
@@ -159,6 +169,15 @@ def _emit_single_depth_list(content: str, link_resolver: Optional[LinkResolver]
     return _render_list_nodes(roots, link_resolver=link_resolver)
 
 
+def parse_list_tree(content: str) -> list[ListNode]:
+    """Parse MDX list content and return it as a tree-structured list of ListNode objects.
+
+    Public API — used by the reverse-sync reconstruction pipeline.
+    """
+    items = _parse_list_items(content)
+    return _build_list_tree(items)
+
+
 def _parse_list_items(content: str) -> list[_ListNode]:
     items: list[_ListNode] = []
     for line in content.splitlines():
@@ -171,7 +190,8 @@ def _parse_list_items(content: str) -> list[_ListNode]:
 
         ordered_match = _ORDERED_LIST_PATTERN.match(stripped)
         if ordered_match:
-            items.append(_ListNode(True, ordered_match.group(1), depth))
+            marker_num = int(ordered_match.group(1))
+            items.append(_ListNode(True, ordered_match.group(2), depth, start=marker_num))
             continue
 
         unordered_match = _UNORDERED_LIST_PATTERN.match(stripped)
@@ -216,7 +236,8 @@ def _render_list_nodes(
 
         body = "".join(_render_list_item(node, link_resolver=link_resolver) for node in group)
         if tag == "ol":
-            parts.append(f'<ol start="1">{body}</ol>')
+            start = group[0].start if group[0].start is not None else 1
+            parts.append(f'<ol start="{start}">{body}</ol>')
         else:
             parts.append(f"<ul>{body}</ul>")
     return "".join(parts)

@@ -14,7 +14,7 @@
 
 from bs4 import BeautifulSoup, NavigableString, Tag
 
-from reverse_sync.mapping_recorder import _iter_block_children
+from reverse_sync.mapping_recorder import iter_block_children
 
 
 @dataclass
@@ -43,7 +43,7 @@ def extract_block_fragments(xhtml_text: str) -> FragmentExtractionResult:
 
     # Top-level element 순서 파악
     top_elements: List[Tuple[str, str]] = []
-    for child in _iter_block_children(soup):
+    for child in iter_block_children(soup):
         if isinstance(child, Tag):
             top_elements.append(("tag", child.name))
         elif isinstance(child, NavigableString):

@@ -17,10 +17,13 @@ class BlockMapping:
 
 HEADING_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
 
-_CALLOUT_MACRO_NAMES = frozenset({'tip', 'info', 'note', 'warning', 'panel'})
+CALLOUT_MACRO_NAMES = frozenset({'tip', 'info', 'note', 'warning', 'panel'})
 
+# backward-compat alias
+_CALLOUT_MACRO_NAMES = CALLOUT_MACRO_NAMES
 
-def _get_text_with_emoticons(element) -> str:
+
+def get_text_with_emoticons(element) -> str:
     """get_text()와 동일하지만 ac:emoticon의 fallback 텍스트를 포함한다.
 
     Confluence의 <ac:emoticon> 태그는 self-closing으로 텍스트 노드가 없어서
@@ -38,11 +41,14 @@ def _get_text_with_emoticons(element) -> str:
                 if fallback:
                     parts.append(fallback)
             else:
-                parts.append(_get_text_with_emoticons(item))
+                parts.append(get_text_with_emoticons(item))
     return ''.join(parts)
 
+# backward-compat alias
+_get_text_with_emoticons = get_text_with_emoticons
+
 
-def _iter_block_children(parent):
+def iter_block_children(parent):
     """블록 레벨 자식을 순회한다. ac:layout은 cell 내부로 진입한다."""
     for child in parent.children:
         if isinstance(child, Tag) and child.name == 'ac:layout':
@@ -52,14 +58,17 @@ def _iter_block_children(parent):
         else:
             yield child
 
+# backward-compat alias
+_iter_block_children = iter_block_children
+
 
 def record_mapping(xhtml: str) -> List[BlockMapping]:
     """XHTML에서 블록 레벨 요소를 추출하여 매핑 레코드를 생성한다."""
     soup = BeautifulSoup(xhtml, 'html.parser')
     mappings: List[BlockMapping] = []
     counters: dict = {}
 
-    for child in _iter_block_children(soup):
+    for child in iter_block_children(soup):
         if isinstance(child, NavigableString):
             if child.strip():
                 _add_mapping(mappings, counters, 'p', child.strip(), child.strip())
@@ -93,24 +102,24 @@ def record_mapping(xhtml: str) -> List[BlockMapping]:
                              block_type='code')
             else:
                 # Callout 매크로: body 텍스트만 추출 (파라미터 메타데이터 제외)
-                if macro_name in _CALLOUT_MACRO_NAMES:
+                if macro_name in CALLOUT_MACRO_NAMES:
                     rich_body = child.find('ac:rich-text-body')
-                    plain = _get_text_with_emoticons(rich_body) if rich_body else child.get_text()
+                    plain = get_text_with_emoticons(rich_body) if rich_body else child.get_text()
                 else:
                     plain = child.get_text()
                 _add_mapping(mappings, counters, f'macro-{macro_name}', str(child), plain,
                              block_type='html_block')
                 # Callout 매크로: 자식 요소 개별 매핑 추가
-                if macro_name in _CALLOUT_MACRO_NAMES:
+                if macro_name in CALLOUT_MACRO_NAMES:
                     parent_mapping = mappings[-1]
                     _add_rich_text_body_children(
                         child, parent_mapping, mappings, counters)
         elif tag_name == 'ac:adf-extension':
-            panel_type = _get_adf_panel_type(child)
+            panel_type = get_adf_panel_type(child)
             plain = child.get_text()
             _add_mapping(mappings, counters, tag_name, str(child), plain,
                          block_type='html_block')
-            if panel_type in _CALLOUT_MACRO_NAMES:
+            if panel_type in CALLOUT_MACRO_NAMES:
                 parent_mapping = mappings[-1]
                 _add_adf_content_children(
                     child, parent_mapping, mappings, counters)
@@ -172,7 +181,7 @@ def _add_container_children(
         child_counters[tag] = child_counters.get(tag, 0) + 1
         child_xpath = f"{parent_xpath}/{tag}[{child_counters[tag]}]"
 
-        plain = _get_text_with_emoticons(child)
+        plain = get_text_with_emoticons(child)
         if tag in ('ul', 'ol', 'table'):
             inner = str(child)
         else:
@@ -206,7 +215,7 @@ def _add_rich_text_body_children(
     _add_container_children(rich_body, parent_mapping, mappings, counters)
 
 
-def _get_adf_panel_type(element: Tag) -> str:
+def get_adf_panel_type(element: Tag) -> str:
     """ac:adf-extension 요소에서 panel-type을 추출한다."""
     node = element.find('ac:adf-node')
     if node is None:
@@ -216,14 +225,20 @@ def _get_adf_panel_type(element: Tag) -> str:
         return ''
     return attr.get_text().strip()
 
+# backward-compat alias
+_get_adf_panel_type = get_adf_panel_type
 
-def _get_adf_content_body(element: Tag):
+
+def get_adf_content_body(element: Tag):
     """ac:adf-extension 요소에서 ac:adf-content를 찾는다."""
     node = element.find('ac:adf-node')
     if node is None:
         return None
     return node.find('ac:adf-content')
 
+# backward-compat alias
+_get_adf_content_body = get_adf_content_body
+
 
 def _add_adf_content_children(
     adf_element: Tag,
@@ -232,5 +247,5 @@ def _add_adf_content_children(
     counters: dict,
 ):
     """ac:adf-extension의 ac:adf-content 내 자식 요소를 개별 매핑으로 추가한다."""
-    content_body = _get_adf_content_body(adf_element)
+    content_body = get_adf_content_body(adf_element)
     _add_container_children(content_body, parent_mapping, mappings, counters)
@@ -14,32 +14,10 @@
 from bs4 import BeautifulSoup
 from mdx_to_storage import emit_document, parse_mdx
 from mdx_to_storage.link_resolver import LinkResolver
+from reverse_sync.xhtml_normalizer import normalize_soup
 from xhtml_beautify_diff import beautify_xhtml, xhtml_diff
 
 
-_IGNORED_ATTRIBUTES = {
-    "ac:macro-id",
-    "ac:local-id",
-    "local-id",
-    "ac:schema-version",
-    "ri:version-at-save",
-    "ac:original-height",
-    "ac:original-width",
-    "ac:custom-width",
-    "ac:alt",
-    "ac:layout",
-    "data-table-width",
-    "data-layout",
-    "data-highlight-colour",
-    "data-card-appearance",
-    "ac:breakout-mode",
-    "ac:breakout-width",
-    "ri:space-key",
-    "style",
-    "class",
-}
-
-
 @dataclass
 class CaseVerification:
     case_id: str
@@ -77,10 +55,7 @@ def mdx_to_storage_xhtml_fragment(
 
 def _normalize_xhtml(xhtml: str, ignore_ri_filename: bool = False) -> str:
     soup = BeautifulSoup(xhtml, "html.parser")
-    _strip_layout_sections(soup)
-    _strip_nonreversible_macros(soup)
-    _strip_decorations(soup)
-    _strip_ignored_attributes(soup, ignore_ri_filename=ignore_ri_filename)
+    normalize_soup(soup, ignore_ri_filename=ignore_ri_filename)
     return beautify_xhtml(str(soup)).strip()
 
 
@@ -106,39 +81,6 @@ def verify_expected_mdx_against_page_xhtml(
     return False, generated, "\n".join(diff_lines)
 
 
-def _strip_ignored_attributes(soup: BeautifulSoup, ignore_ri_filename: bool = False) -> None:
-    ignored_attrs = set(_IGNORED_ATTRIBUTES)
-    if ignore_ri_filename:
-        ignored_attrs.add("ri:filename")
-    for tag in soup.find_all(True):
-        for attr in list(tag.attrs.keys()):
-            if attr in ignored_attrs:
-                del tag.attrs[attr]
-
-
-def _strip_layout_sections(soup: BeautifulSoup) -> None:
-    for tag_name in ("ac:layout", "ac:layout-section", "ac:layout-cell"):
-        for tag in soup.find_all(tag_name):
-            tag.unwrap()
-
-
-def _strip_nonreversible_macros(soup: BeautifulSoup) -> None:
-    for macro in soup.find_all("ac:structured-macro"):
-        if macro.get("ac:name") in {"toc", "view-file"}:
-            macro.decompose()
-
-
-def _strip_decorations(soup: BeautifulSoup) -> None:
-    for tag_name in ("ac:adf-mark", "ac:inline-comment-marker"):
-        for tag in soup.find_all(tag_name):
-            tag.unwrap()
-    for colgroup in soup.find_all("colgroup"):
-        colgroup.decompose()
-    for p in soup.find_all("p"):
-        if not p.get_text(strip=True) and not p.find_all(True):
-            p.decompose()
-
-
 def iter_testcase_dirs(testcases_dir: Path) -> Iterable[Path]:
     """`page.xhtml`과 `expected.mdx`가 있는 테스트케이스 디렉토리를 순회한다."""
     for child in sorted(testcases_dir.iterdir()):