diff --git a/confluence-mdx/bin/mdx_to_storage/__init__.py b/confluence-mdx/bin/mdx_to_storage/__init__.py
index ee914b48d..2c4abde96 100644
--- a/confluence-mdx/bin/mdx_to_storage/__init__.py
+++ b/confluence-mdx/bin/mdx_to_storage/__init__.py
@@ -1,6 +1,6 @@
"""MDX -> Confluence Storage XHTML conversion package."""
-from .emitter import emit_block, emit_document
+from .emitter import ListNode, emit_block, emit_document, parse_list_tree
from .inline import convert_heading_inline, convert_inline
from .link_resolver import LinkResolver, PageEntry, load_pages_yaml
from .parser import Block, parse_mdx, parse_mdx_blocks
@@ -8,12 +8,14 @@
__all__ = [
"Block",
"LinkResolver",
+ "ListNode",
"PageEntry",
"convert_heading_inline",
"convert_inline",
"emit_block",
"emit_document",
"load_pages_yaml",
+ "parse_list_tree",
"parse_mdx",
"parse_mdx_blocks",
]
diff --git a/confluence-mdx/bin/mdx_to_storage/emitter.py b/confluence-mdx/bin/mdx_to_storage/emitter.py
index bbb099872..952e1a198 100644
--- a/confluence-mdx/bin/mdx_to_storage/emitter.py
+++ b/confluence-mdx/bin/mdx_to_storage/emitter.py
@@ -11,7 +11,7 @@
from .parser import Block, HEADING_PATTERN
-_ORDERED_LIST_PATTERN = re.compile(r"^\d+\.\s+(.*)$")
+_ORDERED_LIST_PATTERN = re.compile(r"^(\d+)\.\s+(.*)$")
_UNORDERED_LIST_PATTERN = re.compile(r"^[-*+]\s+(.*)$")
_HEADING_LINE_PATTERN = HEADING_PATTERN
_CALLOUT_TYPE_TO_MACRO = {
@@ -37,12 +37,22 @@
_IMG_ATTR_RE = re.compile(r'(\w[\w-]*)=(?:"([^"]*)"|\'([^\']*)\')')
-class _ListNode:
- def __init__(self, ordered: bool, text: str, depth: int) -> None:
+class ListNode:
+ """List item node for tree-based list representation.
+
+ Public API for reconstruction pipeline.
+ """
+
+ def __init__(self, ordered: bool, text: str, depth: int, start: int | None = None) -> None:
self.ordered = ordered
self.text = text
self.depth = depth
- self.children: list["_ListNode"] = []
+ self.start = start # ordered list marker number (e.g. 2 for "2. item")
+ self.children: list["ListNode"] = []
+
+
+# backward compat alias (internal)
+_ListNode = ListNode
def emit_block(block: Block, context: Optional[dict] = None) -> str:
@@ -159,6 +169,15 @@ def _emit_single_depth_list(content: str, link_resolver: Optional[LinkResolver]
return _render_list_nodes(roots, link_resolver=link_resolver)
+def parse_list_tree(content: str) -> list[ListNode]:
+ """MDX list content를 파싱하여 tree 구조의 ListNode 리스트를 반환한다.
+
+ Public API — reverse-sync reconstruction pipeline에서 사용한다.
+ """
+ items = _parse_list_items(content)
+ return _build_list_tree(items)
+
+
def _parse_list_items(content: str) -> list[_ListNode]:
items: list[_ListNode] = []
for line in content.splitlines():
@@ -171,7 +190,8 @@ def _parse_list_items(content: str) -> list[_ListNode]:
ordered_match = _ORDERED_LIST_PATTERN.match(stripped)
if ordered_match:
- items.append(_ListNode(True, ordered_match.group(1), depth))
+ marker_num = int(ordered_match.group(1))
+ items.append(_ListNode(True, ordered_match.group(2), depth, start=marker_num))
continue
unordered_match = _UNORDERED_LIST_PATTERN.match(stripped)
@@ -216,7 +236,8 @@ def _render_list_nodes(
body = "".join(_render_list_item(node, link_resolver=link_resolver) for node in group)
if tag == "ol":
-            parts.append(f'<ol>{body}</ol>')
+            start = group[0].start if group[0].start is not None else 1
+            parts.append(f'<ol start="{start}">{body}</ol>')
         else:
             parts.append(f"<ul>{body}</ul>")
return "".join(parts)
diff --git a/confluence-mdx/bin/reverse_sync/fragment_extractor.py b/confluence-mdx/bin/reverse_sync/fragment_extractor.py
index 8f839246c..b249dad40 100644
--- a/confluence-mdx/bin/reverse_sync/fragment_extractor.py
+++ b/confluence-mdx/bin/reverse_sync/fragment_extractor.py
@@ -14,7 +14,7 @@
from bs4 import BeautifulSoup, NavigableString, Tag
-from reverse_sync.mapping_recorder import _iter_block_children
+from reverse_sync.mapping_recorder import iter_block_children
@dataclass
@@ -43,7 +43,7 @@ def extract_block_fragments(xhtml_text: str) -> FragmentExtractionResult:
# Top-level element 순서 파악
top_elements: List[Tuple[str, str]] = []
- for child in _iter_block_children(soup):
+ for child in iter_block_children(soup):
if isinstance(child, Tag):
top_elements.append(("tag", child.name))
elif isinstance(child, NavigableString):
diff --git a/confluence-mdx/bin/reverse_sync/mapping_recorder.py b/confluence-mdx/bin/reverse_sync/mapping_recorder.py
index 9ef485e37..275a2fe6f 100644
--- a/confluence-mdx/bin/reverse_sync/mapping_recorder.py
+++ b/confluence-mdx/bin/reverse_sync/mapping_recorder.py
@@ -17,10 +17,13 @@ class BlockMapping:
HEADING_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
-_CALLOUT_MACRO_NAMES = frozenset({'tip', 'info', 'note', 'warning', 'panel'})
+CALLOUT_MACRO_NAMES = frozenset({'tip', 'info', 'note', 'warning', 'panel'})
+# backward-compat aliases
+_CALLOUT_MACRO_NAMES = CALLOUT_MACRO_NAMES
-def _get_text_with_emoticons(element) -> str:
+
+def get_text_with_emoticons(element) -> str:
"""get_text()와 동일하지만 ac:emoticon의 fallback 텍스트를 포함한다.
Confluence의 태그는 self-closing으로 텍스트 노드가 없어서
@@ -38,11 +41,14 @@ def _get_text_with_emoticons(element) -> str:
if fallback:
parts.append(fallback)
else:
- parts.append(_get_text_with_emoticons(item))
+ parts.append(get_text_with_emoticons(item))
return ''.join(parts)
+# backward-compat alias
+_get_text_with_emoticons = get_text_with_emoticons
+
-def _iter_block_children(parent):
+def iter_block_children(parent):
"""블록 레벨 자식을 순회한다. ac:layout은 cell 내부로 진입한다."""
for child in parent.children:
if isinstance(child, Tag) and child.name == 'ac:layout':
@@ -52,6 +58,9 @@ def _iter_block_children(parent):
else:
yield child
+# backward-compat alias
+_iter_block_children = iter_block_children
+
def record_mapping(xhtml: str) -> List[BlockMapping]:
"""XHTML에서 블록 레벨 요소를 추출하여 매핑 레코드를 생성한다."""
@@ -59,7 +68,7 @@ def record_mapping(xhtml: str) -> List[BlockMapping]:
mappings: List[BlockMapping] = []
counters: dict = {}
- for child in _iter_block_children(soup):
+ for child in iter_block_children(soup):
if isinstance(child, NavigableString):
if child.strip():
_add_mapping(mappings, counters, 'p', child.strip(), child.strip())
@@ -93,24 +102,24 @@ def record_mapping(xhtml: str) -> List[BlockMapping]:
block_type='code')
else:
# Callout 매크로: body 텍스트만 추출 (파라미터 메타데이터 제외)
- if macro_name in _CALLOUT_MACRO_NAMES:
+ if macro_name in CALLOUT_MACRO_NAMES:
rich_body = child.find('ac:rich-text-body')
- plain = _get_text_with_emoticons(rich_body) if rich_body else child.get_text()
+ plain = get_text_with_emoticons(rich_body) if rich_body else child.get_text()
else:
plain = child.get_text()
_add_mapping(mappings, counters, f'macro-{macro_name}', str(child), plain,
block_type='html_block')
# Callout 매크로: 자식 요소 개별 매핑 추가
- if macro_name in _CALLOUT_MACRO_NAMES:
+ if macro_name in CALLOUT_MACRO_NAMES:
parent_mapping = mappings[-1]
_add_rich_text_body_children(
child, parent_mapping, mappings, counters)
elif tag_name == 'ac:adf-extension':
- panel_type = _get_adf_panel_type(child)
+ panel_type = get_adf_panel_type(child)
plain = child.get_text()
_add_mapping(mappings, counters, tag_name, str(child), plain,
block_type='html_block')
- if panel_type in _CALLOUT_MACRO_NAMES:
+ if panel_type in CALLOUT_MACRO_NAMES:
parent_mapping = mappings[-1]
_add_adf_content_children(
child, parent_mapping, mappings, counters)
@@ -172,7 +181,7 @@ def _add_container_children(
child_counters[tag] = child_counters.get(tag, 0) + 1
child_xpath = f"{parent_xpath}/{tag}[{child_counters[tag]}]"
- plain = _get_text_with_emoticons(child)
+ plain = get_text_with_emoticons(child)
if tag in ('ul', 'ol', 'table'):
inner = str(child)
else:
@@ -206,7 +215,7 @@ def _add_rich_text_body_children(
_add_container_children(rich_body, parent_mapping, mappings, counters)
-def _get_adf_panel_type(element: Tag) -> str:
+def get_adf_panel_type(element: Tag) -> str:
"""ac:adf-extension 요소에서 panel-type을 추출한다."""
node = element.find('ac:adf-node')
if node is None:
@@ -216,14 +225,20 @@ def _get_adf_panel_type(element: Tag) -> str:
return ''
return attr.get_text().strip()
+# backward-compat alias
+_get_adf_panel_type = get_adf_panel_type
-def _get_adf_content_body(element: Tag):
+
+def get_adf_content_body(element: Tag):
"""ac:adf-extension 요소에서 ac:adf-content를 찾는다."""
node = element.find('ac:adf-node')
if node is None:
return None
return node.find('ac:adf-content')
+# backward-compat alias
+_get_adf_content_body = get_adf_content_body
+
def _add_adf_content_children(
adf_element: Tag,
@@ -232,5 +247,5 @@ def _add_adf_content_children(
counters: dict,
):
"""ac:adf-extension의 ac:adf-content 내 자식 요소를 개별 매핑으로 추가한다."""
- content_body = _get_adf_content_body(adf_element)
+ content_body = get_adf_content_body(adf_element)
_add_container_children(content_body, parent_mapping, mappings, counters)
diff --git a/confluence-mdx/bin/reverse_sync/mdx_to_storage_xhtml_verify.py b/confluence-mdx/bin/reverse_sync/mdx_to_storage_xhtml_verify.py
index 78adde9e4..d1074c109 100644
--- a/confluence-mdx/bin/reverse_sync/mdx_to_storage_xhtml_verify.py
+++ b/confluence-mdx/bin/reverse_sync/mdx_to_storage_xhtml_verify.py
@@ -14,32 +14,10 @@
from bs4 import BeautifulSoup
from mdx_to_storage import emit_document, parse_mdx
from mdx_to_storage.link_resolver import LinkResolver
+from reverse_sync.xhtml_normalizer import normalize_soup
from xhtml_beautify_diff import beautify_xhtml, xhtml_diff
-_IGNORED_ATTRIBUTES = {
- "ac:macro-id",
- "ac:local-id",
- "local-id",
- "ac:schema-version",
- "ri:version-at-save",
- "ac:original-height",
- "ac:original-width",
- "ac:custom-width",
- "ac:alt",
- "ac:layout",
- "data-table-width",
- "data-layout",
- "data-highlight-colour",
- "data-card-appearance",
- "ac:breakout-mode",
- "ac:breakout-width",
- "ri:space-key",
- "style",
- "class",
-}
-
-
@dataclass
class CaseVerification:
case_id: str
@@ -77,10 +55,7 @@ def mdx_to_storage_xhtml_fragment(
def _normalize_xhtml(xhtml: str, ignore_ri_filename: bool = False) -> str:
soup = BeautifulSoup(xhtml, "html.parser")
- _strip_layout_sections(soup)
- _strip_nonreversible_macros(soup)
- _strip_decorations(soup)
- _strip_ignored_attributes(soup, ignore_ri_filename=ignore_ri_filename)
+ normalize_soup(soup, ignore_ri_filename=ignore_ri_filename)
return beautify_xhtml(str(soup)).strip()
@@ -106,39 +81,6 @@ def verify_expected_mdx_against_page_xhtml(
return False, generated, "\n".join(diff_lines)
-def _strip_ignored_attributes(soup: BeautifulSoup, ignore_ri_filename: bool = False) -> None:
- ignored_attrs = set(_IGNORED_ATTRIBUTES)
- if ignore_ri_filename:
- ignored_attrs.add("ri:filename")
- for tag in soup.find_all(True):
- for attr in list(tag.attrs.keys()):
- if attr in ignored_attrs:
- del tag.attrs[attr]
-
-
-def _strip_layout_sections(soup: BeautifulSoup) -> None:
- for tag_name in ("ac:layout", "ac:layout-section", "ac:layout-cell"):
- for tag in soup.find_all(tag_name):
- tag.unwrap()
-
-
-def _strip_nonreversible_macros(soup: BeautifulSoup) -> None:
- for macro in soup.find_all("ac:structured-macro"):
- if macro.get("ac:name") in {"toc", "view-file"}:
- macro.decompose()
-
-
-def _strip_decorations(soup: BeautifulSoup) -> None:
- for tag_name in ("ac:adf-mark", "ac:inline-comment-marker"):
- for tag in soup.find_all(tag_name):
- tag.unwrap()
- for colgroup in soup.find_all("colgroup"):
- colgroup.decompose()
- for p in soup.find_all("p"):
- if not p.get_text(strip=True) and not p.find_all(True):
- p.decompose()
-
-
def iter_testcase_dirs(testcases_dir: Path) -> Iterable[Path]:
"""`page.xhtml`과 `expected.mdx`가 있는 테스트케이스 디렉토리를 순회한다."""
for child in sorted(testcases_dir.iterdir()):
diff --git a/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py b/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py
new file mode 100644
index 000000000..f0ea77e48
--- /dev/null
+++ b/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py
@@ -0,0 +1,242 @@
+"""XHTML Normalizer — 공용 XHTML 정규화 및 plain-text 추출 유틸리티.
+
+reverse-sync 재구성 파이프라인의 공용 helper 모듈.
+BeautifulSoup 기반으로 fragment 비교, plain-text 추출, xpath 기반 fragment 추출을 제공한다.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Optional
+
+from bs4 import BeautifulSoup, NavigableString, Tag
+
+from reverse_sync.mapping_recorder import iter_block_children
+
+
+# ---------------------------------------------------------------------------
+# Ignored attributes — 비교 시 무시하는 Confluence 메타데이터 속성
+# ---------------------------------------------------------------------------
+
+IGNORED_ATTRIBUTES: frozenset[str] = frozenset({
+ "ac:macro-id",
+ "ac:local-id",
+ "local-id",
+ "ac:schema-version",
+ "ri:version-at-save",
+ "ac:original-height",
+ "ac:original-width",
+ "ac:custom-width",
+ "ac:alt",
+ "ac:layout",
+ "data-table-width",
+ "data-layout",
+ "data-highlight-colour",
+ "data-card-appearance",
+ "ac:breakout-mode",
+ "ac:breakout-width",
+ "ri:space-key",
+ "style",
+ "class",
+})
+
+
+# ---------------------------------------------------------------------------
+# Plain-text extraction
+# ---------------------------------------------------------------------------
+
+def extract_plain_text(fragment: str) -> str:
+ """XHTML fragment에서 plain text를 추출한다.
+
+ ac:emoticon의 fallback 텍스트를 포함하고,
+ ac:image만 preservation unit으로 제외한다.
+ 코드 블록 본문(ac:plain-text-body)과 링크 label(ac:link-body)은 포함한다.
+
+ 이 함수의 출력은 reconstruction에서 anchor offset 좌표의 기준이 된다.
+ """
+ soup = BeautifulSoup(fragment, "html.parser")
+ return _extract_text_from_element(soup)
+
+
+def _extract_text_from_element(element) -> str:
+ """재귀적으로 텍스트를 추출한다."""
+ parts: list[str] = []
+ for child in element.children:
+ if isinstance(child, NavigableString):
+ parts.append(str(child))
+ elif isinstance(child, Tag):
+ # emoticon은 fallback 텍스트 사용
+ if child.name == "ac:emoticon":
+ fallback = child.get("ac:emoji-fallback", "")
+ if fallback:
+ parts.append(fallback)
+ continue
+ # ac:image는 preservation unit — 텍스트 없음 (anchor로 처리)
+ if child.name == "ac:image":
+ continue
+ parts.append(_extract_text_from_element(child))
+ return "".join(parts)
+
+
+# ---------------------------------------------------------------------------
+# Fragment normalization
+# ---------------------------------------------------------------------------
+
+def normalize_soup(
+ soup: BeautifulSoup,
+ *,
+ strip_ignored_attrs: bool = True,
+ ignore_ri_filename: bool = False,
+) -> None:
+ """BeautifulSoup 객체를 in-place로 정규화한다.
+
+ normalize_fragment()와 verify 모듈이 공유하는 핵심 정규화 로직.
+ """
+ _strip_layout_sections(soup)
+ _strip_nonreversible_macros(soup)
+ _strip_decorations(soup)
+ if strip_ignored_attrs:
+ _strip_ignored_attributes(soup, ignore_ri_filename=ignore_ri_filename)
+
+
+def normalize_fragment(
+ fragment: str,
+ strip_ignored_attrs: bool = True,
+ ignore_ri_filename: bool = False,
+) -> str:
+ """XHTML fragment를 비교 가능한 정규화된 형태로 변환한다.
+
+ - layout section unwrap
+ - non-reversible macro 제거
+ - decoration unwrap + 빈 제거
+ - ignored attribute 제거 (선택)
+ - BeautifulSoup prettify로 노드별 줄바꿈
+ """
+ soup = BeautifulSoup(fragment, "html.parser")
+ normalize_soup(
+ soup,
+ strip_ignored_attrs=strip_ignored_attrs,
+ ignore_ri_filename=ignore_ri_filename,
+ )
+ return soup.prettify(formatter="minimal").strip()
+
+
+def _strip_layout_sections(soup: BeautifulSoup) -> None:
+ for tag_name in ("ac:layout", "ac:layout-section", "ac:layout-cell"):
+ for tag in soup.find_all(tag_name):
+ tag.unwrap()
+
+
+def _strip_nonreversible_macros(soup: BeautifulSoup) -> None:
+ for macro in soup.find_all("ac:structured-macro"):
+ if macro.get("ac:name") in {"toc", "view-file"}:
+ macro.decompose()
+
+
+def _strip_decorations(soup: BeautifulSoup) -> None:
+ for tag_name in ("ac:adf-mark", "ac:inline-comment-marker"):
+ for tag in soup.find_all(tag_name):
+ tag.unwrap()
+ for colgroup in soup.find_all("colgroup"):
+ colgroup.decompose()
+    # 빈 <p> 제거 (decoration unwrap 후 남는 빈 <p> 요소)
+ for p in soup.find_all("p"):
+ if not p.get_text(strip=True) and not p.find_all(True):
+ p.decompose()
+
+
+def _strip_ignored_attributes(
+ soup: BeautifulSoup,
+ extra: Optional[frozenset[str]] = None,
+ ignore_ri_filename: bool = False,
+) -> None:
+ ignored = IGNORED_ATTRIBUTES | extra if extra else set(IGNORED_ATTRIBUTES)
+ if ignore_ri_filename:
+ ignored = set(ignored) | {"ri:filename"}
+ for tag in soup.find_all(True):
+ for attr in list(tag.attrs.keys()):
+ if attr in ignored:
+ del tag.attrs[attr]
+
+
+# ---------------------------------------------------------------------------
+# Fragment extraction by XPath
+# ---------------------------------------------------------------------------
+
+def extract_fragment_by_xpath(page_xhtml: str, xpath: str) -> Optional[str]:
+ """page XHTML에서 간이 XPath로 요소를 찾아 outerHTML을 반환한다.
+
+ xpath 형식: "p[1]", "ul[2]", "macro-info[1]/p[1]"
+ """
+ soup = BeautifulSoup(page_xhtml, "html.parser")
+ element = _find_element_by_xpath(soup, xpath)
+ if element is None:
+ return None
+ return str(element)
+
+
+def _find_element_by_xpath(soup, xpath: str):
+ """간이 XPath로 요소를 찾는다."""
+ parts = xpath.split("/")
+ if len(parts) == 1:
+ return _find_element_by_simple_xpath(soup, xpath)
+
+ current = _find_element_by_simple_xpath(soup, parts[0])
+ if current is None:
+ return None
+
+ for part in parts[1:]:
+ container = _find_content_container(current)
+ if container is None:
+ if ":" in (current.name or ""):
+ return None
+ container = current
+ current = _find_element_by_simple_xpath(container, part)
+ if current is None:
+ return None
+
+ return current
+
+
+_XPATH_PATTERN = re.compile(r"([a-z0-9:-]+)\[(\d+)\]")
+
+
+def _find_element_by_simple_xpath(parent, xpath: str):
+ """단일 XPath 파트로 요소를 찾는다."""
+ match = _XPATH_PATTERN.match(xpath)
+ if not match:
+ return None
+ tag_name = match.group(1)
+ index = int(match.group(2)) # 1-based
+
+ macro_name = None
+ if tag_name.startswith("macro-"):
+ macro_name = tag_name[len("macro-"):]
+
+ count = 0
+ for child in iter_block_children(parent):
+ if not isinstance(child, Tag):
+ continue
+ if macro_name:
+ if child.name == "ac:structured-macro" and child.get("ac:name") == macro_name:
+ count += 1
+ if count == index:
+ return child
+ elif child.name == tag_name:
+ count += 1
+ if count == index:
+ return child
+ return None
+
+
+def _find_content_container(parent: Tag):
+ """복합 xpath의 부모에서 콘텐츠 컨테이너를 찾는다."""
+ rich_body = parent.find("ac:rich-text-body")
+ if rich_body is not None:
+ return rich_body
+ node = parent.find("ac:adf-node")
+ if node is not None:
+ content = node.find("ac:adf-content")
+ if content is not None:
+ return content
+ return None
diff --git a/confluence-mdx/bin/reverse_sync/xhtml_patcher.py b/confluence-mdx/bin/reverse_sync/xhtml_patcher.py
index bfb1f10c1..116ecf9f6 100644
--- a/confluence-mdx/bin/reverse_sync/xhtml_patcher.py
+++ b/confluence-mdx/bin/reverse_sync/xhtml_patcher.py
@@ -3,9 +3,7 @@
from bs4 import BeautifulSoup, NavigableString, Tag
import difflib
import re
-from reverse_sync.mapping_recorder import _iter_block_children
-
-from reverse_sync.mapping_recorder import _get_text_with_emoticons
+from reverse_sync.mapping_recorder import get_text_with_emoticons, iter_block_children
def patch_xhtml(xhtml: str, patches: List[Dict[str, str]]) -> str:
@@ -69,7 +67,7 @@ def patch_xhtml(xhtml: str, patches: List[Dict[str, str]]) -> str:
# patch 적용 시에는 기본 비교를 get_text()로 수행하고, 필요 시 emoticon fallback 텍스트 비교를 허용한다.
current_plain = element.get_text()
if old_text and current_plain.strip() != old_text.strip():
- current_plain_with_emoticons = _get_text_with_emoticons(element)
+ current_plain_with_emoticons = get_text_with_emoticons(element)
if current_plain_with_emoticons.strip() != old_text.strip():
continue
_replace_inner_html(element, patch['new_inner_xhtml'])
@@ -86,7 +84,7 @@ def patch_xhtml(xhtml: str, patches: List[Dict[str, str]]) -> str:
# mapping plain(old_text)과의 비교는 get_text() 우선, 실패 시 emoticon fallback 포함 텍스트로 재확인한다.
current_plain = element.get_text()
if current_plain.strip() != old_text.strip():
- current_plain_with_emoticons = _get_text_with_emoticons(element)
+ current_plain_with_emoticons = get_text_with_emoticons(element)
if current_plain_with_emoticons.strip() != old_text.strip():
continue
_apply_text_changes(element, old_text, new_text)
@@ -130,7 +128,7 @@ def _insert_element_resolved(soup: BeautifulSoup, anchor, new_html: str):
def _find_first_block_element(soup: BeautifulSoup):
"""soup의 첫 번째 블록 레벨 요소를 찾는다."""
- for child in _iter_block_children(soup):
+ for child in iter_block_children(soup):
if isinstance(child, Tag):
return child
return None
@@ -212,7 +210,7 @@ def _find_element_by_simple_xpath(soup: BeautifulSoup, xpath: str):
macro_name = tag_name[len('macro-'):]
count = 0
- for child in _iter_block_children(soup):
+ for child in iter_block_children(soup):
if not isinstance(child, Tag):
continue
if macro_name:
diff --git a/confluence-mdx/tests/test_reverse_sync_list_tree.py b/confluence-mdx/tests/test_reverse_sync_list_tree.py
new file mode 100644
index 000000000..1cc5d79f9
--- /dev/null
+++ b/confluence-mdx/tests/test_reverse_sync_list_tree.py
@@ -0,0 +1,89 @@
+"""Level 0 helper tests — parse_list_tree() public API 검증.
+
+Phase 0 게이트: list tree helper가 public API로 정상 동작하는지 확인한다.
+"""
+
+import pytest
+
+from mdx_to_storage import ListNode, parse_list_tree
+
+
+class TestParseListTree:
+ """parse_list_tree() public API 검증."""
+
+ def test_simple_unordered_list(self):
+ content = "- Item 1\n- Item 2\n- Item 3"
+ roots = parse_list_tree(content)
+ assert len(roots) == 3
+ assert all(not node.ordered for node in roots)
+ assert roots[0].text == "Item 1"
+ assert roots[1].text == "Item 2"
+ assert roots[2].text == "Item 3"
+
+ def test_simple_ordered_list(self):
+ content = "1. First\n2. Second\n3. Third"
+ roots = parse_list_tree(content)
+ assert len(roots) == 3
+ assert all(node.ordered for node in roots)
+ assert roots[0].text == "First"
+ assert roots[0].start == 1
+ assert roots[1].start == 2
+ assert roots[2].start == 3
+
+ def test_nested_list(self):
+ content = "- Parent\n - Child 1\n - Child 2"
+ roots = parse_list_tree(content)
+ assert len(roots) == 1
+ assert roots[0].text == "Parent"
+ assert len(roots[0].children) == 2
+ assert roots[0].children[0].text == "Child 1"
+ assert roots[0].children[1].text == "Child 2"
+
+ def test_mixed_ordered_unordered(self):
+ content = "- Unordered\n1. Ordered"
+ roots = parse_list_tree(content)
+ assert len(roots) == 2
+ assert not roots[0].ordered
+ assert roots[1].ordered
+
+ def test_deeply_nested(self):
+ content = "- L0\n - L1\n - L2"
+ roots = parse_list_tree(content)
+ assert len(roots) == 1
+ assert len(roots[0].children) == 1
+ assert len(roots[0].children[0].children) == 1
+ assert roots[0].children[0].children[0].text == "L2"
+
+ def test_continuation_line(self):
+ content = "- Item with\n continuation"
+ roots = parse_list_tree(content)
+ assert len(roots) == 1
+ assert "continuation" in roots[0].text
+
+ def test_empty_content(self):
+ roots = parse_list_tree("")
+ assert roots == []
+
+ def test_list_node_type(self):
+ """반환값이 ListNode 인스턴스인지 확인."""
+ roots = parse_list_tree("- test")
+ assert isinstance(roots[0], ListNode)
+
+ def test_nested_ordered_under_unordered(self):
+ content = "- Parent\n 1. Child ordered"
+ roots = parse_list_tree(content)
+ assert len(roots) == 1
+ assert not roots[0].ordered
+ assert roots[0].start is None # unordered → no start
+ assert len(roots[0].children) == 1
+ assert roots[0].children[0].ordered
+ assert roots[0].children[0].start == 1
+
+ def test_ordered_list_start_number_preserved(self):
+ """중간부터 시작하는 ordered list의 marker number가 보존된다."""
+ content = "2. Second\n3. Third\n4. Fourth"
+ roots = parse_list_tree(content)
+ assert len(roots) == 3
+ assert roots[0].start == 2
+ assert roots[1].start == 3
+ assert roots[2].start == 4
diff --git a/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py b/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py
new file mode 100644
index 000000000..61fdef040
--- /dev/null
+++ b/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py
@@ -0,0 +1,348 @@
+"""Level 0 helper tests — xhtml_normalizer 모듈 검증.
+
+Phase 0 게이트:
+- extract_plain_text: 다양한 XHTML fragment에서 plain text 추출
+- normalize_fragment: fragment 비교 정규화
+- extract_fragment_by_xpath: 간이 XPath 기반 fragment 추출
+"""
+
+import json
+from pathlib import Path
+
+import pytest
+
+from reverse_sync.xhtml_normalizer import (
+ extract_fragment_by_xpath,
+ extract_plain_text,
+ normalize_fragment,
+)
+
+TESTCASES_DIR = Path(__file__).parent / "testcases"
+
+
+# ---------------------------------------------------------------------------
+# extract_plain_text
+# ---------------------------------------------------------------------------
+
+class TestExtractPlainText:
+ """extract_plain_text() 기본 동작 검증."""
+
+ def test_simple_paragraph(self):
+ fragment = "
Hello world
"
+ assert extract_plain_text(fragment) == "Hello world"
+
+ def test_paragraph_with_bold(self):
+ fragment = "A bold text
"
+ assert extract_plain_text(fragment) == "A bold text"
+
+ def test_paragraph_with_link(self):
+ fragment = 'See example here
'
+ assert extract_plain_text(fragment) == "See example here"
+
+ def test_paragraph_with_inline_image_excluded(self):
+ """ac:image는 preservation unit이므로 plain text에서 제외된다."""
+ fragment = (
+ 'A '
+ ''
+ ' B
'
+ )
+ assert extract_plain_text(fragment) == "A B"
+
+ def test_paragraph_with_inline_link_no_body(self):
+ """ac:link에 link-body가 없으면 텍스트가 비어있다."""
+ fragment = (
+ 'Before '
+ ''
+ ' After
'
+ )
+ assert extract_plain_text(fragment) == "Before After"
+
+ def test_paragraph_with_inline_link_with_body(self):
+ """ac:link에 link-body가 있으면 visible label이 plain text에 포함된다."""
+ fragment = (
+ '참조: '
+ ''
+ 'Click here
'
+ )
+ assert extract_plain_text(fragment) == "참조: Click here"
+
+ def test_paragraph_with_emoticon(self):
+ """ac:emoticon의 fallback 텍스트가 포함된다."""
+ fragment = (
+ ''
+ ''
+ ' Success
'
+ )
+ assert extract_plain_text(fragment) == ":check_mark: Success"
+
+ def test_code_macro_body_included(self):
+ """코드 블록 본문(ac:plain-text-body)이 plain text에 포함된다."""
+ fragment = (
+ ''
+ 'python'
+ ''
+ ''
+ )
+ text = extract_plain_text(fragment)
+ assert 'print("hello")' in text
+
+ def test_list_plain_text(self):
+ fragment = ""
+ text = extract_plain_text(fragment)
+ assert "Item 1" in text
+ assert "Item 2" in text
+
+ def test_heading(self):
+ fragment = "Section Title
"
+ assert extract_plain_text(fragment) == "Section Title"
+
+ def test_nested_formatting(self):
+ fragment = "A bold italic text
"
+ assert extract_plain_text(fragment) == "A bold italic text"
+
+ def test_empty_fragment(self):
+ assert extract_plain_text("") == ""
+        assert extract_plain_text("<p></p>") == ""
+
+ def test_callout_with_rich_body(self):
+ """callout macro 내부의 rich-text-body에서 텍스트를 추출한다."""
+ fragment = (
+            '<ac:structured-macro ac:name="info">'
+            '<ac:rich-text-body><p>Info text</p></ac:rich-text-body>'
+            '</ac:structured-macro>'
+ )
+ text = extract_plain_text(fragment)
+ assert "Info text" in text
+
+
+# ---------------------------------------------------------------------------
+# extract_plain_text — real testcase fixtures
+# ---------------------------------------------------------------------------
+
+class TestExtractPlainTextFromFixtures:
+ """실제 testcase fixture에서 extract_plain_text 동작 검증."""
+
+ @pytest.fixture
+ def sidecar_blocks(self):
+ """544113141 testcase의 sidecar blocks를 로드한다."""
+ path = TESTCASES_DIR / "544113141" / "expected.roundtrip.json"
+ if not path.exists():
+ pytest.skip("testcase fixture not found")
+ data = json.loads(path.read_text(encoding="utf-8"))
+ return data["blocks"]
+
+ def test_heading_fragment(self, sidecar_blocks):
+ """heading fragment의 plain text가 정확히 추출된다."""
+ block = sidecar_blocks[0] # h2[1] "Overview"
+ assert block["xhtml_xpath"] == "h2[1]"
+ text = extract_plain_text(block["xhtml_fragment"])
+ assert text == "Overview"
+
+ def test_paragraph_fragment(self, sidecar_blocks):
+ """paragraph fragment의 plain text가 정확히 추출된다."""
+ block = sidecar_blocks[1] # p[1]
+ assert block["xhtml_xpath"] == "p[1]"
+ text = extract_plain_text(block["xhtml_fragment"])
+ assert "조직에서 관리하는 DB 커넥션" in text
+
+ def test_list_with_image_fragment(self, sidecar_blocks):
+ """list + inline image fragment에서 image가 제외된다."""
+ block = sidecar_blocks[4] # ol[1]
+ assert block["xhtml_xpath"] == "ol[1]"
+ text = extract_plain_text(block["xhtml_fragment"])
+ # ac:image는 제외되므로 파일명이 없어야 함
+ assert "image-20240730" not in text
+ # 텍스트 내용은 포함
+ assert "DB Access History" in text
+
+
+# ---------------------------------------------------------------------------
+# normalize_fragment
+# ---------------------------------------------------------------------------
+
+class TestNormalizeFragment:
+ """normalize_fragment() 정규화 검증."""
+
+ def test_attribute_order_irrelevant(self):
+ """속성 순서가 달라도 정규화 결과가 같다."""
+ a = 'text
'
+ b = 'text
'
+ # class는 ignored attribute이므로 제거됨
+ norm_a = normalize_fragment(a)
+ norm_b = normalize_fragment(b)
+ assert norm_a == norm_b
+
+ def test_ignored_attributes_stripped(self):
+ """IGNORED_ATTRIBUTES에 해당하는 속성이 제거된다."""
+ fragment = ''
+ result = normalize_fragment(fragment)
+ assert "ac:macro-id" not in result
+ assert 'ac:align="center"' in result
+
+ def test_layout_sections_unwrapped(self):
+        fragment = '<ac:layout><ac:layout-section><ac:layout-cell><p>content</p></ac:layout-cell></ac:layout-section></ac:layout>'
+ result = normalize_fragment(fragment)
+ assert "ac:layout" not in result
+ assert "content" in result
+
+ def test_nonreversible_macros_removed(self):
+        fragment = '<ac:structured-macro ac:name="toc"></ac:structured-macro><p>keep</p>'
+ result = normalize_fragment(fragment)
+ assert "toc" not in result
+ assert "keep" in result
+
+ def test_decorations_unwrapped(self):
+        fragment = '<p><ac:inline-comment-marker>text</ac:inline-comment-marker></p>'
+ result = normalize_fragment(fragment)
+ assert "ac:inline-comment-marker" not in result
+ assert "text" in result
+
+ def test_same_content_normalizes_equal(self):
+ """내용이 동일한 두 fragment는 정규화 후 동일하다."""
+ a = "Hello world
"
+ b = "Hello world
"
+ assert normalize_fragment(a) == normalize_fragment(b)
+
+ def test_strip_ignored_attrs_option(self):
+ """strip_ignored_attrs=False면 속성을 유지한다."""
+ fragment = ''
+ with_strip = normalize_fragment(fragment, strip_ignored_attrs=True)
+ without_strip = normalize_fragment(fragment, strip_ignored_attrs=False)
+ assert "ac:macro-id" not in with_strip
+ assert "ac:macro-id" in without_strip
+
+ def test_ignore_ri_filename_option(self):
+ """ignore_ri_filename=True면 ri:filename 속성도 제거된다."""
+ fragment = ''
+ normal = normalize_fragment(fragment)
+ ignored = normalize_fragment(fragment, ignore_ri_filename=True)
+ assert 'ri:filename' in normal
+ assert 'ri:filename' not in ignored
+
+ def test_empty_paragraph_removed(self):
+ """빈 요소가 decoration unwrap 후 제거된다."""
+ fragment = '
keep
'
+ result = normalize_fragment(fragment)
+ # 빈 는 제거되고 keep만 남음
+ assert "keep" in result
+
+
+# ---------------------------------------------------------------------------
+# normalize_fragment — real testcase round-trip
+# ---------------------------------------------------------------------------
+
+class TestNormalizeFragmentRoundtrip:
+ """실제 testcase의 fragment를 정규화해서 자기 자신과 비교."""
+
+ @pytest.mark.parametrize("case_id", [
+ "544113141", "544381877", "544112828",
+ ])
+ def test_fragment_self_normalize_equal(self, case_id):
+ """같은 fragment를 두 번 정규화하면 결과가 동일하다 (idempotent)."""
+ path = TESTCASES_DIR / case_id / "expected.roundtrip.json"
+ if not path.exists():
+ pytest.skip(f"testcase {case_id} not found")
+ data = json.loads(path.read_text(encoding="utf-8"))
+ for block in data["blocks"]:
+ frag = block["xhtml_fragment"]
+ first = normalize_fragment(frag)
+ second = normalize_fragment(first)
+ assert first == second, (
+ f"normalize_fragment is not idempotent for "
+ f"{case_id} block {block['block_index']} ({block['xhtml_xpath']})"
+ )
+
+
+# ---------------------------------------------------------------------------
+# extract_fragment_by_xpath
+# ---------------------------------------------------------------------------
+
+class TestExtractFragmentByXpath:
+ """extract_fragment_by_xpath() 검증."""
+
+ def test_simple_xpath(self):
+ xhtml = "
Title
Para 1
Para 2
"
+ result = extract_fragment_by_xpath(xhtml, "p[2]")
+ assert result is not None
+ assert "Para 2" in result
+
+ def test_heading_xpath(self):
+ xhtml = "First
Second
"
+ result = extract_fragment_by_xpath(xhtml, "h2[2]")
+ assert result is not None
+ assert "Second" in result
+
+ def test_list_xpath(self):
+ xhtml = "text
"
+ result = extract_fragment_by_xpath(xhtml, "ul[1]")
+ assert result is not None
+ assert "item" in result
+
+ def test_macro_xpath(self):
+ xhtml = (
+            '<ac:structured-macro ac:name="info">'
+            '<ac:rich-text-body><p>info body</p></ac:rich-text-body>'
+            '</ac:structured-macro>'
+ )
+ result = extract_fragment_by_xpath(xhtml, "macro-info[1]")
+ assert result is not None
+ assert "info body" in result
+
+ def test_compound_xpath(self):
+ xhtml = (
+            '<ac:structured-macro ac:name="note">'
+            '<ac:rich-text-body><p>P1</p><p>P2</p></ac:rich-text-body>'
+            '</ac:structured-macro>'
+ )
+ result = extract_fragment_by_xpath(xhtml, "macro-note[1]/p[2]")
+ assert result is not None
+ assert "P2" in result
+
+ def test_nonexistent_xpath_returns_none(self):
+ xhtml = "only one
"
+ assert extract_fragment_by_xpath(xhtml, "p[2]") is None
+ assert extract_fragment_by_xpath(xhtml, "h2[1]") is None
+
+ def test_multi_level_xpath(self):
+ """ul[1]/li[2] 같은 다단계 xpath."""
+ xhtml = ""
+ result = extract_fragment_by_xpath(xhtml, "ul[1]/li[2]")
+ assert result is not None
+ assert "B" in result
+
+
+# ---------------------------------------------------------------------------
+# extract_fragment_by_xpath — real testcase fixtures
+# ---------------------------------------------------------------------------
+
+class TestExtractFragmentByXpathFromFixtures:
+ """실제 testcase page.xhtml에서 xpath 추출 검증."""
+
+ @pytest.mark.parametrize("case_id", [
+ "544113141", "544381877",
+ ])
+ def test_sidecar_xpath_matches_page(self, case_id):
+ """sidecar의 xhtml_xpath로 page.xhtml에서 fragment를 추출할 수 있다."""
+ sidecar_path = TESTCASES_DIR / case_id / "expected.roundtrip.json"
+ page_path = TESTCASES_DIR / case_id / "page.xhtml"
+ if not sidecar_path.exists() or not page_path.exists():
+ pytest.skip(f"testcase {case_id} not found")
+
+ data = json.loads(sidecar_path.read_text(encoding="utf-8"))
+ page_xhtml = page_path.read_text(encoding="utf-8")
+
+ for block in data["blocks"]:
+ xpath = block["xhtml_xpath"]
+ # compound xpath(child xpath)는 top-level만 테스트
+ if "/" in xpath:
+ continue
+ extracted = extract_fragment_by_xpath(page_xhtml, xpath)
+ assert extracted is not None, (
+ f"Failed to extract {xpath} from {case_id}"
+ )
+ # 추출된 fragment의 plain text가 sidecar fragment와 일치
+ expected_text = extract_plain_text(block["xhtml_fragment"])
+ actual_text = extract_plain_text(extracted)
+ assert expected_text.strip() == actual_text.strip(), (
+ f"Plain text mismatch for {case_id} {xpath}"
+ )