diff --git a/confluence-mdx/bin/mdx_to_storage/__init__.py b/confluence-mdx/bin/mdx_to_storage/__init__.py index ee914b48d..2c4abde96 100644 --- a/confluence-mdx/bin/mdx_to_storage/__init__.py +++ b/confluence-mdx/bin/mdx_to_storage/__init__.py @@ -1,6 +1,6 @@ """MDX -> Confluence Storage XHTML conversion package.""" -from .emitter import emit_block, emit_document +from .emitter import ListNode, emit_block, emit_document, parse_list_tree from .inline import convert_heading_inline, convert_inline from .link_resolver import LinkResolver, PageEntry, load_pages_yaml from .parser import Block, parse_mdx, parse_mdx_blocks @@ -8,12 +8,14 @@ __all__ = [ "Block", "LinkResolver", + "ListNode", "PageEntry", "convert_heading_inline", "convert_inline", "emit_block", "emit_document", "load_pages_yaml", + "parse_list_tree", "parse_mdx", "parse_mdx_blocks", ] diff --git a/confluence-mdx/bin/mdx_to_storage/emitter.py b/confluence-mdx/bin/mdx_to_storage/emitter.py index bbb099872..952e1a198 100644 --- a/confluence-mdx/bin/mdx_to_storage/emitter.py +++ b/confluence-mdx/bin/mdx_to_storage/emitter.py @@ -11,7 +11,7 @@ from .parser import Block, HEADING_PATTERN -_ORDERED_LIST_PATTERN = re.compile(r"^\d+\.\s+(.*)$") +_ORDERED_LIST_PATTERN = re.compile(r"^(\d+)\.\s+(.*)$") _UNORDERED_LIST_PATTERN = re.compile(r"^[-*+]\s+(.*)$") _HEADING_LINE_PATTERN = HEADING_PATTERN _CALLOUT_TYPE_TO_MACRO = { @@ -37,12 +37,22 @@ _IMG_ATTR_RE = re.compile(r'(\w[\w-]*)=(?:"([^"]*)"|\'([^\']*)\')') -class _ListNode: - def __init__(self, ordered: bool, text: str, depth: int) -> None: +class ListNode: + """List item node for tree-based list representation. + + Public API for reconstruction pipeline. + """ + + def __init__(self, ordered: bool, text: str, depth: int, start: int | None = None) -> None: self.ordered = ordered self.text = text self.depth = depth - self.children: list["_ListNode"] = [] + self.start = start # ordered list marker number (e.g. 2 for "2. item") + self.children: list["ListNode"] = [] + + +# backward compat alias (internal) +_ListNode = ListNode def emit_block(block: Block, context: Optional[dict] = None) -> str: @@ -159,6 +169,15 @@ def _emit_single_depth_list(content: str, link_resolver: Optional[LinkResolver] return _render_list_nodes(roots, link_resolver=link_resolver) +def parse_list_tree(content: str) -> list[ListNode]: + """MDX list content를 파싱하여 tree 구조의 ListNode 리스트를 반환한다. + + Public API — reverse-sync reconstruction pipeline에서 사용한다. + """ + items = _parse_list_items(content) + return _build_list_tree(items) + + def _parse_list_items(content: str) -> list[_ListNode]: items: list[_ListNode] = [] for line in content.splitlines(): @@ -171,7 +190,8 @@ def _parse_list_items(content: str) -> list[_ListNode]: ordered_match = _ORDERED_LIST_PATTERN.match(stripped) if ordered_match: - items.append(_ListNode(True, ordered_match.group(1), depth)) + marker_num = int(ordered_match.group(1)) + items.append(_ListNode(True, ordered_match.group(2), depth, start=marker_num)) continue unordered_match = _UNORDERED_LIST_PATTERN.match(stripped) @@ -216,7 +236,8 @@ def _render_list_nodes( body = "".join(_render_list_item(node, link_resolver=link_resolver) for node in group) if tag == "ol": - parts.append(f'
    {body}
') + start = group[0].start if group[0].start is not None else 1 + parts.append(f'
    {body}
') else: parts.append(f"") return "".join(parts) diff --git a/confluence-mdx/bin/reverse_sync/fragment_extractor.py b/confluence-mdx/bin/reverse_sync/fragment_extractor.py index 8f839246c..b249dad40 100644 --- a/confluence-mdx/bin/reverse_sync/fragment_extractor.py +++ b/confluence-mdx/bin/reverse_sync/fragment_extractor.py @@ -14,7 +14,7 @@ from bs4 import BeautifulSoup, NavigableString, Tag -from reverse_sync.mapping_recorder import _iter_block_children +from reverse_sync.mapping_recorder import iter_block_children @dataclass @@ -43,7 +43,7 @@ def extract_block_fragments(xhtml_text: str) -> FragmentExtractionResult: # Top-level element 순서 파악 top_elements: List[Tuple[str, str]] = [] - for child in _iter_block_children(soup): + for child in iter_block_children(soup): if isinstance(child, Tag): top_elements.append(("tag", child.name)) elif isinstance(child, NavigableString): diff --git a/confluence-mdx/bin/reverse_sync/mapping_recorder.py b/confluence-mdx/bin/reverse_sync/mapping_recorder.py index 9ef485e37..275a2fe6f 100644 --- a/confluence-mdx/bin/reverse_sync/mapping_recorder.py +++ b/confluence-mdx/bin/reverse_sync/mapping_recorder.py @@ -17,10 +17,13 @@ class BlockMapping: HEADING_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'} -_CALLOUT_MACRO_NAMES = frozenset({'tip', 'info', 'note', 'warning', 'panel'}) +CALLOUT_MACRO_NAMES = frozenset({'tip', 'info', 'note', 'warning', 'panel'}) +# backward-compat aliases +_CALLOUT_MACRO_NAMES = CALLOUT_MACRO_NAMES -def _get_text_with_emoticons(element) -> str: + +def get_text_with_emoticons(element) -> str: """get_text()와 동일하지만 ac:emoticon의 fallback 텍스트를 포함한다. Confluence의 태그는 self-closing으로 텍스트 노드가 없어서 @@ -38,11 +41,14 @@ def _get_text_with_emoticons(element) -> str: if fallback: parts.append(fallback) else: - parts.append(_get_text_with_emoticons(item)) + parts.append(get_text_with_emoticons(item)) return ''.join(parts) +# backward-compat alias +_get_text_with_emoticons = get_text_with_emoticons + -def _iter_block_children(parent): +def iter_block_children(parent): """블록 레벨 자식을 순회한다. ac:layout은 cell 내부로 진입한다.""" for child in parent.children: if isinstance(child, Tag) and child.name == 'ac:layout': @@ -52,6 +58,9 @@ def _iter_block_children(parent): else: yield child +# backward-compat alias +_iter_block_children = iter_block_children + def record_mapping(xhtml: str) -> List[BlockMapping]: """XHTML에서 블록 레벨 요소를 추출하여 매핑 레코드를 생성한다.""" @@ -59,7 +68,7 @@ def record_mapping(xhtml: str) -> List[BlockMapping]: mappings: List[BlockMapping] = [] counters: dict = {} - for child in _iter_block_children(soup): + for child in iter_block_children(soup): if isinstance(child, NavigableString): if child.strip(): _add_mapping(mappings, counters, 'p', child.strip(), child.strip()) @@ -93,24 +102,24 @@ def record_mapping(xhtml: str) -> List[BlockMapping]: block_type='code') else: # Callout 매크로: body 텍스트만 추출 (파라미터 메타데이터 제외) - if macro_name in _CALLOUT_MACRO_NAMES: + if macro_name in CALLOUT_MACRO_NAMES: rich_body = child.find('ac:rich-text-body') - plain = _get_text_with_emoticons(rich_body) if rich_body else child.get_text() + plain = get_text_with_emoticons(rich_body) if rich_body else child.get_text() else: plain = child.get_text() _add_mapping(mappings, counters, f'macro-{macro_name}', str(child), plain, block_type='html_block') # Callout 매크로: 자식 요소 개별 매핑 추가 - if macro_name in _CALLOUT_MACRO_NAMES: + if macro_name in CALLOUT_MACRO_NAMES: parent_mapping = mappings[-1] _add_rich_text_body_children( child, parent_mapping, mappings, counters) elif tag_name == 'ac:adf-extension': - panel_type = _get_adf_panel_type(child) + panel_type = get_adf_panel_type(child) plain = child.get_text() _add_mapping(mappings, counters, tag_name, str(child), plain, block_type='html_block') - if panel_type in _CALLOUT_MACRO_NAMES: + if panel_type in CALLOUT_MACRO_NAMES: parent_mapping = mappings[-1] _add_adf_content_children( child, parent_mapping, mappings, counters) @@ -172,7 +181,7 @@ def _add_container_children( child_counters[tag] = child_counters.get(tag, 0) + 1 child_xpath = f"{parent_xpath}/{tag}[{child_counters[tag]}]" - plain = _get_text_with_emoticons(child) + plain = get_text_with_emoticons(child) if tag in ('ul', 'ol', 'table'): inner = str(child) else: @@ -206,7 +215,7 @@ def _add_rich_text_body_children( _add_container_children(rich_body, parent_mapping, mappings, counters) -def _get_adf_panel_type(element: Tag) -> str: +def get_adf_panel_type(element: Tag) -> str: """ac:adf-extension 요소에서 panel-type을 추출한다.""" node = element.find('ac:adf-node') if node is None: @@ -216,14 +225,20 @@ def _get_adf_panel_type(element: Tag) -> str: return '' return attr.get_text().strip() +# backward-compat alias +_get_adf_panel_type = get_adf_panel_type -def _get_adf_content_body(element: Tag): + +def get_adf_content_body(element: Tag): """ac:adf-extension 요소에서 ac:adf-content를 찾는다.""" node = element.find('ac:adf-node') if node is None: return None return node.find('ac:adf-content') +# backward-compat alias +_get_adf_content_body = get_adf_content_body + def _add_adf_content_children( adf_element: Tag, @@ -232,5 +247,5 @@ def _add_adf_content_children( counters: dict, ): """ac:adf-extension의 ac:adf-content 내 자식 요소를 개별 매핑으로 추가한다.""" - content_body = _get_adf_content_body(adf_element) + content_body = get_adf_content_body(adf_element) _add_container_children(content_body, parent_mapping, mappings, counters) diff --git a/confluence-mdx/bin/reverse_sync/mdx_to_storage_xhtml_verify.py b/confluence-mdx/bin/reverse_sync/mdx_to_storage_xhtml_verify.py index 78adde9e4..d1074c109 100644 --- a/confluence-mdx/bin/reverse_sync/mdx_to_storage_xhtml_verify.py +++ b/confluence-mdx/bin/reverse_sync/mdx_to_storage_xhtml_verify.py @@ -14,32 +14,10 @@ from bs4 import BeautifulSoup from mdx_to_storage import emit_document, parse_mdx from mdx_to_storage.link_resolver import LinkResolver +from reverse_sync.xhtml_normalizer import normalize_soup from xhtml_beautify_diff import beautify_xhtml, xhtml_diff -_IGNORED_ATTRIBUTES = { - "ac:macro-id", - "ac:local-id", - "local-id", - "ac:schema-version", - "ri:version-at-save", - "ac:original-height", - "ac:original-width", - "ac:custom-width", - "ac:alt", - "ac:layout", - "data-table-width", - "data-layout", - "data-highlight-colour", - "data-card-appearance", - "ac:breakout-mode", - "ac:breakout-width", - "ri:space-key", - "style", - "class", -} - - @dataclass class CaseVerification: case_id: str @@ -77,10 +55,7 @@ def mdx_to_storage_xhtml_fragment( def _normalize_xhtml(xhtml: str, ignore_ri_filename: bool = False) -> str: soup = BeautifulSoup(xhtml, "html.parser") - _strip_layout_sections(soup) - _strip_nonreversible_macros(soup) - _strip_decorations(soup) - _strip_ignored_attributes(soup, ignore_ri_filename=ignore_ri_filename) + normalize_soup(soup, ignore_ri_filename=ignore_ri_filename) return beautify_xhtml(str(soup)).strip() @@ -106,39 +81,6 @@ def verify_expected_mdx_against_page_xhtml( return False, generated, "\n".join(diff_lines) -def _strip_ignored_attributes(soup: BeautifulSoup, ignore_ri_filename: bool = False) -> None: - ignored_attrs = set(_IGNORED_ATTRIBUTES) - if ignore_ri_filename: - ignored_attrs.add("ri:filename") - for tag in soup.find_all(True): - for attr in list(tag.attrs.keys()): - if attr in ignored_attrs: - del tag.attrs[attr] - - -def _strip_layout_sections(soup: BeautifulSoup) -> None: - for tag_name in ("ac:layout", "ac:layout-section", "ac:layout-cell"): - for tag in soup.find_all(tag_name): - tag.unwrap() - - -def _strip_nonreversible_macros(soup: BeautifulSoup) -> None: - for macro in soup.find_all("ac:structured-macro"): - if macro.get("ac:name") in {"toc", "view-file"}: - macro.decompose() - - -def _strip_decorations(soup: BeautifulSoup) -> None: - for tag_name in ("ac:adf-mark", "ac:inline-comment-marker"): - for tag in soup.find_all(tag_name): - tag.unwrap() - for colgroup in soup.find_all("colgroup"): - colgroup.decompose() - for p in soup.find_all("p"): - if not p.get_text(strip=True) and not p.find_all(True): - p.decompose() - - def iter_testcase_dirs(testcases_dir: Path) -> Iterable[Path]: """`page.xhtml`과 `expected.mdx`가 있는 테스트케이스 디렉토리를 순회한다.""" for child in sorted(testcases_dir.iterdir()): diff --git a/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py b/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py new file mode 100644 index 000000000..f0ea77e48 --- /dev/null +++ b/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py @@ -0,0 +1,242 @@ +"""XHTML Normalizer — 공용 XHTML 정규화 및 plain-text 추출 유틸리티. + +reverse-sync 재구성 파이프라인의 공용 helper 모듈. +BeautifulSoup 기반으로 fragment 비교, plain-text 추출, xpath 기반 fragment 추출을 제공한다. +""" + +from __future__ import annotations + +import re +from typing import Optional + +from bs4 import BeautifulSoup, NavigableString, Tag + +from reverse_sync.mapping_recorder import iter_block_children + + +# --------------------------------------------------------------------------- +# Ignored attributes — 비교 시 무시하는 Confluence 메타데이터 속성 +# --------------------------------------------------------------------------- + +IGNORED_ATTRIBUTES: frozenset[str] = frozenset({ + "ac:macro-id", + "ac:local-id", + "local-id", + "ac:schema-version", + "ri:version-at-save", + "ac:original-height", + "ac:original-width", + "ac:custom-width", + "ac:alt", + "ac:layout", + "data-table-width", + "data-layout", + "data-highlight-colour", + "data-card-appearance", + "ac:breakout-mode", + "ac:breakout-width", + "ri:space-key", + "style", + "class", +}) + + +# --------------------------------------------------------------------------- +# Plain-text extraction +# --------------------------------------------------------------------------- + +def extract_plain_text(fragment: str) -> str: + """XHTML fragment에서 plain text를 추출한다. + + ac:emoticon의 fallback 텍스트를 포함하고, + ac:image만 preservation unit으로 제외한다. + 코드 블록 본문(ac:plain-text-body)과 링크 label(ac:link-body)은 포함한다. + + 이 함수의 출력은 reconstruction에서 anchor offset 좌표의 기준이 된다. + """ + soup = BeautifulSoup(fragment, "html.parser") + return _extract_text_from_element(soup) + + +def _extract_text_from_element(element) -> str: + """재귀적으로 텍스트를 추출한다.""" + parts: list[str] = [] + for child in element.children: + if isinstance(child, NavigableString): + parts.append(str(child)) + elif isinstance(child, Tag): + # emoticon은 fallback 텍스트 사용 + if child.name == "ac:emoticon": + fallback = child.get("ac:emoji-fallback", "") + if fallback: + parts.append(fallback) + continue + # ac:image는 preservation unit — 텍스트 없음 (anchor로 처리) + if child.name == "ac:image": + continue + parts.append(_extract_text_from_element(child)) + return "".join(parts) + + +# --------------------------------------------------------------------------- +# Fragment normalization +# --------------------------------------------------------------------------- + +def normalize_soup( + soup: BeautifulSoup, + *, + strip_ignored_attrs: bool = True, + ignore_ri_filename: bool = False, +) -> None: + """BeautifulSoup 객체를 in-place로 정규화한다. + + normalize_fragment()와 verify 모듈이 공유하는 핵심 정규화 로직. + """ + _strip_layout_sections(soup) + _strip_nonreversible_macros(soup) + _strip_decorations(soup) + if strip_ignored_attrs: + _strip_ignored_attributes(soup, ignore_ri_filename=ignore_ri_filename) + + +def normalize_fragment( + fragment: str, + strip_ignored_attrs: bool = True, + ignore_ri_filename: bool = False, +) -> str: + """XHTML fragment를 비교 가능한 정규화된 형태로 변환한다. + + - layout section unwrap + - non-reversible macro 제거 + - decoration unwrap + 빈

제거 + - ignored attribute 제거 (선택) + - BeautifulSoup prettify로 노드별 줄바꿈 + """ + soup = BeautifulSoup(fragment, "html.parser") + normalize_soup( + soup, + strip_ignored_attrs=strip_ignored_attrs, + ignore_ri_filename=ignore_ri_filename, + ) + return soup.prettify(formatter="minimal").strip() + + +def _strip_layout_sections(soup: BeautifulSoup) -> None: + for tag_name in ("ac:layout", "ac:layout-section", "ac:layout-cell"): + for tag in soup.find_all(tag_name): + tag.unwrap() + + +def _strip_nonreversible_macros(soup: BeautifulSoup) -> None: + for macro in soup.find_all("ac:structured-macro"): + if macro.get("ac:name") in {"toc", "view-file"}: + macro.decompose() + + +def _strip_decorations(soup: BeautifulSoup) -> None: + for tag_name in ("ac:adf-mark", "ac:inline-comment-marker"): + for tag in soup.find_all(tag_name): + tag.unwrap() + for colgroup in soup.find_all("colgroup"): + colgroup.decompose() + # 빈

제거 (decoration unwrap 후 남는 빈 요소) + for p in soup.find_all("p"): + if not p.get_text(strip=True) and not p.find_all(True): + p.decompose() + + +def _strip_ignored_attributes( + soup: BeautifulSoup, + extra: Optional[frozenset[str]] = None, + ignore_ri_filename: bool = False, +) -> None: + ignored = IGNORED_ATTRIBUTES | extra if extra else set(IGNORED_ATTRIBUTES) + if ignore_ri_filename: + ignored = set(ignored) | {"ri:filename"} + for tag in soup.find_all(True): + for attr in list(tag.attrs.keys()): + if attr in ignored: + del tag.attrs[attr] + + +# --------------------------------------------------------------------------- +# Fragment extraction by XPath +# --------------------------------------------------------------------------- + +def extract_fragment_by_xpath(page_xhtml: str, xpath: str) -> Optional[str]: + """page XHTML에서 간이 XPath로 요소를 찾아 outerHTML을 반환한다. + + xpath 형식: "p[1]", "ul[2]", "macro-info[1]/p[1]" + """ + soup = BeautifulSoup(page_xhtml, "html.parser") + element = _find_element_by_xpath(soup, xpath) + if element is None: + return None + return str(element) + + +def _find_element_by_xpath(soup, xpath: str): + """간이 XPath로 요소를 찾는다.""" + parts = xpath.split("/") + if len(parts) == 1: + return _find_element_by_simple_xpath(soup, xpath) + + current = _find_element_by_simple_xpath(soup, parts[0]) + if current is None: + return None + + for part in parts[1:]: + container = _find_content_container(current) + if container is None: + if ":" in (current.name or ""): + return None + container = current + current = _find_element_by_simple_xpath(container, part) + if current is None: + return None + + return current + + +_XPATH_PATTERN = re.compile(r"([a-z0-9:-]+)\[(\d+)\]") + + +def _find_element_by_simple_xpath(parent, xpath: str): + """단일 XPath 파트로 요소를 찾는다.""" + match = _XPATH_PATTERN.match(xpath) + if not match: + return None + tag_name = match.group(1) + index = int(match.group(2)) # 1-based + + macro_name = None + if tag_name.startswith("macro-"): + macro_name = tag_name[len("macro-"):] + + count = 0 + for child in iter_block_children(parent): + if not isinstance(child, Tag): + continue + if macro_name: + if child.name == "ac:structured-macro" and child.get("ac:name") == macro_name: + count += 1 + if count == index: + return child + elif child.name == tag_name: + count += 1 + if count == index: + return child + return None + + +def _find_content_container(parent: Tag): + """복합 xpath의 부모에서 콘텐츠 컨테이너를 찾는다.""" + rich_body = parent.find("ac:rich-text-body") + if rich_body is not None: + return rich_body + node = parent.find("ac:adf-node") + if node is not None: + content = node.find("ac:adf-content") + if content is not None: + return content + return None diff --git a/confluence-mdx/bin/reverse_sync/xhtml_patcher.py b/confluence-mdx/bin/reverse_sync/xhtml_patcher.py index bfb1f10c1..116ecf9f6 100644 --- a/confluence-mdx/bin/reverse_sync/xhtml_patcher.py +++ b/confluence-mdx/bin/reverse_sync/xhtml_patcher.py @@ -3,9 +3,7 @@ from bs4 import BeautifulSoup, NavigableString, Tag import difflib import re -from reverse_sync.mapping_recorder import _iter_block_children - -from reverse_sync.mapping_recorder import _get_text_with_emoticons +from reverse_sync.mapping_recorder import get_text_with_emoticons, iter_block_children def patch_xhtml(xhtml: str, patches: List[Dict[str, str]]) -> str: @@ -69,7 +67,7 @@ def patch_xhtml(xhtml: str, patches: List[Dict[str, str]]) -> str: # patch 적용 시에는 기본 비교를 get_text()로 수행하고, 필요 시 emoticon fallback 텍스트 비교를 허용한다. current_plain = element.get_text() if old_text and current_plain.strip() != old_text.strip(): - current_plain_with_emoticons = _get_text_with_emoticons(element) + current_plain_with_emoticons = get_text_with_emoticons(element) if current_plain_with_emoticons.strip() != old_text.strip(): continue _replace_inner_html(element, patch['new_inner_xhtml']) @@ -86,7 +84,7 @@ def patch_xhtml(xhtml: str, patches: List[Dict[str, str]]) -> str: # mapping plain(old_text)과의 비교는 get_text() 우선, 실패 시 emoticon fallback 포함 텍스트로 재확인한다. current_plain = element.get_text() if current_plain.strip() != old_text.strip(): - current_plain_with_emoticons = _get_text_with_emoticons(element) + current_plain_with_emoticons = get_text_with_emoticons(element) if current_plain_with_emoticons.strip() != old_text.strip(): continue _apply_text_changes(element, old_text, new_text) @@ -130,7 +128,7 @@ def _insert_element_resolved(soup: BeautifulSoup, anchor, new_html: str): def _find_first_block_element(soup: BeautifulSoup): """soup의 첫 번째 블록 레벨 요소를 찾는다.""" - for child in _iter_block_children(soup): + for child in iter_block_children(soup): if isinstance(child, Tag): return child return None @@ -212,7 +210,7 @@ def _find_element_by_simple_xpath(soup: BeautifulSoup, xpath: str): macro_name = tag_name[len('macro-'):] count = 0 - for child in _iter_block_children(soup): + for child in iter_block_children(soup): if not isinstance(child, Tag): continue if macro_name: diff --git a/confluence-mdx/tests/test_reverse_sync_list_tree.py b/confluence-mdx/tests/test_reverse_sync_list_tree.py new file mode 100644 index 000000000..1cc5d79f9 --- /dev/null +++ b/confluence-mdx/tests/test_reverse_sync_list_tree.py @@ -0,0 +1,89 @@ +"""Level 0 helper tests — parse_list_tree() public API 검증. + +Phase 0 게이트: list tree helper가 public API로 정상 동작하는지 확인한다. +""" + +import pytest + +from mdx_to_storage import ListNode, parse_list_tree + + +class TestParseListTree: + """parse_list_tree() public API 검증.""" + + def test_simple_unordered_list(self): + content = "- Item 1\n- Item 2\n- Item 3" + roots = parse_list_tree(content) + assert len(roots) == 3 + assert all(not node.ordered for node in roots) + assert roots[0].text == "Item 1" + assert roots[1].text == "Item 2" + assert roots[2].text == "Item 3" + + def test_simple_ordered_list(self): + content = "1. First\n2. Second\n3. Third" + roots = parse_list_tree(content) + assert len(roots) == 3 + assert all(node.ordered for node in roots) + assert roots[0].text == "First" + assert roots[0].start == 1 + assert roots[1].start == 2 + assert roots[2].start == 3 + + def test_nested_list(self): + content = "- Parent\n - Child 1\n - Child 2" + roots = parse_list_tree(content) + assert len(roots) == 1 + assert roots[0].text == "Parent" + assert len(roots[0].children) == 2 + assert roots[0].children[0].text == "Child 1" + assert roots[0].children[1].text == "Child 2" + + def test_mixed_ordered_unordered(self): + content = "- Unordered\n1. Ordered" + roots = parse_list_tree(content) + assert len(roots) == 2 + assert not roots[0].ordered + assert roots[1].ordered + + def test_deeply_nested(self): + content = "- L0\n - L1\n - L2" + roots = parse_list_tree(content) + assert len(roots) == 1 + assert len(roots[0].children) == 1 + assert len(roots[0].children[0].children) == 1 + assert roots[0].children[0].children[0].text == "L2" + + def test_continuation_line(self): + content = "- Item with\n continuation" + roots = parse_list_tree(content) + assert len(roots) == 1 + assert "continuation" in roots[0].text + + def test_empty_content(self): + roots = parse_list_tree("") + assert roots == [] + + def test_list_node_type(self): + """반환값이 ListNode 인스턴스인지 확인.""" + roots = parse_list_tree("- test") + assert isinstance(roots[0], ListNode) + + def test_nested_ordered_under_unordered(self): + content = "- Parent\n 1. Child ordered" + roots = parse_list_tree(content) + assert len(roots) == 1 + assert not roots[0].ordered + assert roots[0].start is None # unordered → no start + assert len(roots[0].children) == 1 + assert roots[0].children[0].ordered + assert roots[0].children[0].start == 1 + + def test_ordered_list_start_number_preserved(self): + """중간부터 시작하는 ordered list의 marker number가 보존된다.""" + content = "2. Second\n3. Third\n4. Fourth" + roots = parse_list_tree(content) + assert len(roots) == 3 + assert roots[0].start == 2 + assert roots[1].start == 3 + assert roots[2].start == 4 diff --git a/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py b/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py new file mode 100644 index 000000000..61fdef040 --- /dev/null +++ b/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py @@ -0,0 +1,348 @@ +"""Level 0 helper tests — xhtml_normalizer 모듈 검증. + +Phase 0 게이트: +- extract_plain_text: 다양한 XHTML fragment에서 plain text 추출 +- normalize_fragment: fragment 비교 정규화 +- extract_fragment_by_xpath: 간이 XPath 기반 fragment 추출 +""" + +import json +from pathlib import Path + +import pytest + +from reverse_sync.xhtml_normalizer import ( + extract_fragment_by_xpath, + extract_plain_text, + normalize_fragment, +) + +TESTCASES_DIR = Path(__file__).parent / "testcases" + + +# --------------------------------------------------------------------------- +# extract_plain_text +# --------------------------------------------------------------------------- + +class TestExtractPlainText: + """extract_plain_text() 기본 동작 검증.""" + + def test_simple_paragraph(self): + fragment = "

Hello world

" + assert extract_plain_text(fragment) == "Hello world" + + def test_paragraph_with_bold(self): + fragment = "

A bold text

" + assert extract_plain_text(fragment) == "A bold text" + + def test_paragraph_with_link(self): + fragment = '

See example here

' + assert extract_plain_text(fragment) == "See example here" + + def test_paragraph_with_inline_image_excluded(self): + """ac:image는 preservation unit이므로 plain text에서 제외된다.""" + fragment = ( + '

A ' + '' + ' B

' + ) + assert extract_plain_text(fragment) == "A B" + + def test_paragraph_with_inline_link_no_body(self): + """ac:link에 link-body가 없으면 텍스트가 비어있다.""" + fragment = ( + '

Before ' + '' + ' After

' + ) + assert extract_plain_text(fragment) == "Before After" + + def test_paragraph_with_inline_link_with_body(self): + """ac:link에 link-body가 있으면 visible label이 plain text에 포함된다.""" + fragment = ( + '

참조: ' + '' + 'Click here

' + ) + assert extract_plain_text(fragment) == "참조: Click here" + + def test_paragraph_with_emoticon(self): + """ac:emoticon의 fallback 텍스트가 포함된다.""" + fragment = ( + '

' + '' + ' Success

' + ) + assert extract_plain_text(fragment) == ":check_mark: Success" + + def test_code_macro_body_included(self): + """코드 블록 본문(ac:plain-text-body)이 plain text에 포함된다.""" + fragment = ( + '' + 'python' + '' + '' + ) + text = extract_plain_text(fragment) + assert 'print("hello")' in text + + def test_list_plain_text(self): + fragment = "" + text = extract_plain_text(fragment) + assert "Item 1" in text + assert "Item 2" in text + + def test_heading(self): + fragment = "

Section Title

" + assert extract_plain_text(fragment) == "Section Title" + + def test_nested_formatting(self): + fragment = "

A bold italic text

" + assert extract_plain_text(fragment) == "A bold italic text" + + def test_empty_fragment(self): + assert extract_plain_text("") == "" + assert extract_plain_text("

") == "" + + def test_callout_with_rich_body(self): + """callout macro 내부의 rich-text-body에서 텍스트를 추출한다.""" + fragment = ( + '' + '

Info text

' + '' + ) + text = extract_plain_text(fragment) + assert "Info text" in text + + +# --------------------------------------------------------------------------- +# extract_plain_text — real testcase fixtures +# --------------------------------------------------------------------------- + +class TestExtractPlainTextFromFixtures: + """실제 testcase fixture에서 extract_plain_text 동작 검증.""" + + @pytest.fixture + def sidecar_blocks(self): + """544113141 testcase의 sidecar blocks를 로드한다.""" + path = TESTCASES_DIR / "544113141" / "expected.roundtrip.json" + if not path.exists(): + pytest.skip("testcase fixture not found") + data = json.loads(path.read_text(encoding="utf-8")) + return data["blocks"] + + def test_heading_fragment(self, sidecar_blocks): + """heading fragment의 plain text가 정확히 추출된다.""" + block = sidecar_blocks[0] # h2[1] "Overview" + assert block["xhtml_xpath"] == "h2[1]" + text = extract_plain_text(block["xhtml_fragment"]) + assert text == "Overview" + + def test_paragraph_fragment(self, sidecar_blocks): + """paragraph fragment의 plain text가 정확히 추출된다.""" + block = sidecar_blocks[1] # p[1] + assert block["xhtml_xpath"] == "p[1]" + text = extract_plain_text(block["xhtml_fragment"]) + assert "조직에서 관리하는 DB 커넥션" in text + + def test_list_with_image_fragment(self, sidecar_blocks): + """list + inline image fragment에서 image가 제외된다.""" + block = sidecar_blocks[4] # ol[1] + assert block["xhtml_xpath"] == "ol[1]" + text = extract_plain_text(block["xhtml_fragment"]) + # ac:image는 제외되므로 파일명이 없어야 함 + assert "image-20240730" not in text + # 텍스트 내용은 포함 + assert "DB Access History" in text + + +# --------------------------------------------------------------------------- +# normalize_fragment +# --------------------------------------------------------------------------- + +class TestNormalizeFragment: + """normalize_fragment() 정규화 검증.""" + + def test_attribute_order_irrelevant(self): + """속성 순서가 달라도 정규화 결과가 같다.""" + a = '

text

' + b = '

text

' + # class는 ignored attribute이므로 제거됨 + norm_a = normalize_fragment(a) + norm_b = normalize_fragment(b) + assert norm_a == norm_b + + def test_ignored_attributes_stripped(self): + """IGNORED_ATTRIBUTES에 해당하는 속성이 제거된다.""" + fragment = '' + result = normalize_fragment(fragment) + assert "ac:macro-id" not in result + assert 'ac:align="center"' in result + + def test_layout_sections_unwrapped(self): + fragment = '

content

' + result = normalize_fragment(fragment) + assert "ac:layout" not in result + assert "content" in result + + def test_nonreversible_macros_removed(self): + fragment = '

keep

' + result = normalize_fragment(fragment) + assert "toc" not in result + assert "keep" in result + + def test_decorations_unwrapped(self): + fragment = '

text

' + result = normalize_fragment(fragment) + assert "ac:inline-comment-marker" not in result + assert "text" in result + + def test_same_content_normalizes_equal(self): + """내용이 동일한 두 fragment는 정규화 후 동일하다.""" + a = "

Hello world

" + b = "

Hello world

" + assert normalize_fragment(a) == normalize_fragment(b) + + def test_strip_ignored_attrs_option(self): + """strip_ignored_attrs=False면 속성을 유지한다.""" + fragment = '' + with_strip = normalize_fragment(fragment, strip_ignored_attrs=True) + without_strip = normalize_fragment(fragment, strip_ignored_attrs=False) + assert "ac:macro-id" not in with_strip + assert "ac:macro-id" in without_strip + + def test_ignore_ri_filename_option(self): + """ignore_ri_filename=True면 ri:filename 속성도 제거된다.""" + fragment = '' + normal = normalize_fragment(fragment) + ignored = normalize_fragment(fragment, ignore_ri_filename=True) + assert 'ri:filename' in normal + assert 'ri:filename' not in ignored + + def test_empty_paragraph_removed(self): + """빈

요소가 decoration unwrap 후 제거된다.""" + fragment = '

keep

' + result = normalize_fragment(fragment) + # 빈

는 제거되고 keep만 남음 + assert "keep" in result + + +# --------------------------------------------------------------------------- +# normalize_fragment — real testcase round-trip +# --------------------------------------------------------------------------- + +class TestNormalizeFragmentRoundtrip: + """실제 testcase의 fragment를 정규화해서 자기 자신과 비교.""" + + @pytest.mark.parametrize("case_id", [ + "544113141", "544381877", "544112828", + ]) + def test_fragment_self_normalize_equal(self, case_id): + """같은 fragment를 두 번 정규화하면 결과가 동일하다 (idempotent).""" + path = TESTCASES_DIR / case_id / "expected.roundtrip.json" + if not path.exists(): + pytest.skip(f"testcase {case_id} not found") + data = json.loads(path.read_text(encoding="utf-8")) + for block in data["blocks"]: + frag = block["xhtml_fragment"] + first = normalize_fragment(frag) + second = normalize_fragment(first) + assert first == second, ( + f"normalize_fragment is not idempotent for " + f"{case_id} block {block['block_index']} ({block['xhtml_xpath']})" + ) + + +# --------------------------------------------------------------------------- +# extract_fragment_by_xpath +# --------------------------------------------------------------------------- + +class TestExtractFragmentByXpath: + """extract_fragment_by_xpath() 검증.""" + + def test_simple_xpath(self): + xhtml = "

Title

Para 1

Para 2

" + result = extract_fragment_by_xpath(xhtml, "p[2]") + assert result is not None + assert "Para 2" in result + + def test_heading_xpath(self): + xhtml = "

First

Second

" + result = extract_fragment_by_xpath(xhtml, "h2[2]") + assert result is not None + assert "Second" in result + + def test_list_xpath(self): + xhtml = "

text

  • item

" + result = extract_fragment_by_xpath(xhtml, "ul[1]") + assert result is not None + assert "item" in result + + def test_macro_xpath(self): + xhtml = ( + '' + '

info body

' + '
' + ) + result = extract_fragment_by_xpath(xhtml, "macro-info[1]") + assert result is not None + assert "info body" in result + + def test_compound_xpath(self): + xhtml = ( + '' + '

P1

P2

' + '
' + ) + result = extract_fragment_by_xpath(xhtml, "macro-note[1]/p[2]") + assert result is not None + assert "P2" in result + + def test_nonexistent_xpath_returns_none(self): + xhtml = "

only one

" + assert extract_fragment_by_xpath(xhtml, "p[2]") is None + assert extract_fragment_by_xpath(xhtml, "h2[1]") is None + + def test_multi_level_xpath(self): + """ul[1]/li[2] 같은 다단계 xpath.""" + xhtml = "
  • A

  • B

" + result = extract_fragment_by_xpath(xhtml, "ul[1]/li[2]") + assert result is not None + assert "B" in result + + +# --------------------------------------------------------------------------- +# extract_fragment_by_xpath — real testcase fixtures +# --------------------------------------------------------------------------- + +class TestExtractFragmentByXpathFromFixtures: + """실제 testcase page.xhtml에서 xpath 추출 검증.""" + + @pytest.mark.parametrize("case_id", [ + "544113141", "544381877", + ]) + def test_sidecar_xpath_matches_page(self, case_id): + """sidecar의 xhtml_xpath로 page.xhtml에서 fragment를 추출할 수 있다.""" + sidecar_path = TESTCASES_DIR / case_id / "expected.roundtrip.json" + page_path = TESTCASES_DIR / case_id / "page.xhtml" + if not sidecar_path.exists() or not page_path.exists(): + pytest.skip(f"testcase {case_id} not found") + + data = json.loads(sidecar_path.read_text(encoding="utf-8")) + page_xhtml = page_path.read_text(encoding="utf-8") + + for block in data["blocks"]: + xpath = block["xhtml_xpath"] + # compound xpath(child xpath)는 top-level만 테스트 + if "/" in xpath: + continue + extracted = extract_fragment_by_xpath(page_xhtml, xpath) + assert extracted is not None, ( + f"Failed to extract {xpath} from {case_id}" + ) + # 추출된 fragment의 plain text가 sidecar fragment와 일치 + expected_text = extract_plain_text(block["xhtml_fragment"]) + actual_text = extract_plain_text(extracted) + assert expected_text.strip() == actual_text.strip(), ( + f"Plain text mismatch for {case_id} {xpath}" + )