From 6e1f75c89aa8c2b88d40a4aeb6a75ce88f6965f6 Mon Sep 17 00:00:00 2001 From: JK Date: Fri, 13 Mar 2026 21:32:22 +0900 Subject: [PATCH 1/4] =?UTF-8?q?confluence-mdx:=20Phase=200=20=EA=B3=B5?= =?UTF-8?q?=EC=9A=A9=20helper=20=EC=B6=94=EC=B6=9C=20=E2=80=94=20xhtml=5Fn?= =?UTF-8?q?ormalizer=20=EB=B0=8F=20list=20tree=20public=20API=EB=A5=BC=20?= =?UTF-8?q?=EC=B6=94=EA=B0=80=ED=95=A9=EB=8B=88=EB=8B=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - xhtml_normalizer.py: extract_plain_text, normalize_fragment, extract_fragment_by_xpath 구현 - emitter.py: ListNode, parse_list_tree public API 승격 - Level 0 helper tests 43개 추가 (전체 820 pass) Co-Authored-By: Claude Opus 4.6 --- confluence-mdx/bin/mdx_to_storage/__init__.py | 4 +- confluence-mdx/bin/mdx_to_storage/emitter.py | 22 +- .../bin/reverse_sync/xhtml_normalizer.py | 225 ++++++++++++ .../tests/test_reverse_sync_list_tree.py | 75 ++++ .../test_reverse_sync_xhtml_normalizer.py | 323 ++++++++++++++++++ 5 files changed, 646 insertions(+), 3 deletions(-) create mode 100644 confluence-mdx/bin/reverse_sync/xhtml_normalizer.py create mode 100644 confluence-mdx/tests/test_reverse_sync_list_tree.py create mode 100644 confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py diff --git a/confluence-mdx/bin/mdx_to_storage/__init__.py b/confluence-mdx/bin/mdx_to_storage/__init__.py index ee914b48d..2c4abde96 100644 --- a/confluence-mdx/bin/mdx_to_storage/__init__.py +++ b/confluence-mdx/bin/mdx_to_storage/__init__.py @@ -1,6 +1,6 @@ """MDX -> Confluence Storage XHTML conversion package.""" -from .emitter import emit_block, emit_document +from .emitter import ListNode, emit_block, emit_document, parse_list_tree from .inline import convert_heading_inline, convert_inline from .link_resolver import LinkResolver, PageEntry, load_pages_yaml from .parser import Block, parse_mdx, parse_mdx_blocks @@ -8,12 +8,14 @@ __all__ = [ "Block", "LinkResolver", + "ListNode", "PageEntry", 
"convert_heading_inline", "convert_inline", "emit_block", "emit_document", "load_pages_yaml", + "parse_list_tree", "parse_mdx", "parse_mdx_blocks", ] diff --git a/confluence-mdx/bin/mdx_to_storage/emitter.py b/confluence-mdx/bin/mdx_to_storage/emitter.py index bbb099872..ba4340f70 100644 --- a/confluence-mdx/bin/mdx_to_storage/emitter.py +++ b/confluence-mdx/bin/mdx_to_storage/emitter.py @@ -37,12 +37,21 @@ _IMG_ATTR_RE = re.compile(r'(\w[\w-]*)=(?:"([^"]*)"|\'([^\']*)\')') -class _ListNode: +class ListNode: + """List item node for tree-based list representation. + + Public API for reconstruction pipeline. + """ + def __init__(self, ordered: bool, text: str, depth: int) -> None: self.ordered = ordered self.text = text self.depth = depth - self.children: list["_ListNode"] = [] + self.children: list["ListNode"] = [] + + +# backward compat alias (internal) +_ListNode = ListNode def emit_block(block: Block, context: Optional[dict] = None) -> str: @@ -159,6 +168,15 @@ def _emit_single_depth_list(content: str, link_resolver: Optional[LinkResolver] return _render_list_nodes(roots, link_resolver=link_resolver) +def parse_list_tree(content: str) -> list[ListNode]: + """MDX list content를 파싱하여 tree 구조의 ListNode 리스트를 반환한다. + + Public API — reverse-sync reconstruction pipeline에서 사용한다. + """ + items = _parse_list_items(content) + return _build_list_tree(items) + + def _parse_list_items(content: str) -> list[_ListNode]: items: list[_ListNode] = [] for line in content.splitlines(): diff --git a/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py b/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py new file mode 100644 index 000000000..6b7377180 --- /dev/null +++ b/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py @@ -0,0 +1,225 @@ +"""XHTML Normalizer — 공용 XHTML 정규화 및 plain-text 추출 유틸리티. + +reverse-sync 재구성 파이프라인의 공용 helper 모듈. +BeautifulSoup 기반으로 fragment 비교, plain-text 추출, xpath 기반 fragment 추출을 제공한다. 
+""" + +from __future__ import annotations + +import re +from typing import Optional + +from bs4 import BeautifulSoup, NavigableString, Tag + + +# --------------------------------------------------------------------------- +# Ignored attributes — 비교 시 무시하는 Confluence 메타데이터 속성 +# --------------------------------------------------------------------------- + +IGNORED_ATTRIBUTES: frozenset[str] = frozenset({ + "ac:macro-id", + "ac:local-id", + "local-id", + "ac:schema-version", + "ri:version-at-save", + "ac:original-height", + "ac:original-width", + "ac:custom-width", + "ac:alt", + "ac:layout", + "data-table-width", + "data-layout", + "data-highlight-colour", + "data-card-appearance", + "ac:breakout-mode", + "ac:breakout-width", + "ri:space-key", + "style", + "class", +}) + + +# --------------------------------------------------------------------------- +# Plain-text extraction +# --------------------------------------------------------------------------- + +def extract_plain_text(fragment: str) -> str: + """XHTML fragment에서 plain text를 추출한다. + + ac:plain-text-body(코드 블록 본문)는 제외하고, + ac:emoticon의 fallback 텍스트는 포함한다. + + 이 함수의 출력은 reconstruction에서 anchor offset 좌표의 기준이 된다. 
+ """ + soup = BeautifulSoup(fragment, "html.parser") + return _extract_text_from_element(soup) + + +def _extract_text_from_element(element) -> str: + """재귀적으로 텍스트를 추출한다.""" + parts: list[str] = [] + for child in element.children: + if isinstance(child, NavigableString): + parts.append(str(child)) + elif isinstance(child, Tag): + # 코드 블록 본문은 제외 + if child.name == "ac:plain-text-body": + continue + # emoticon은 fallback 텍스트 사용 + if child.name == "ac:emoticon": + fallback = child.get("ac:emoji-fallback", "") + if fallback: + parts.append(fallback) + continue + # ac:image, ac:link 등 preservation unit은 텍스트 없음 (anchor로 처리) + if child.name in ("ac:image", "ac:link"): + continue + parts.append(_extract_text_from_element(child)) + return "".join(parts) + + +# --------------------------------------------------------------------------- +# Fragment normalization +# --------------------------------------------------------------------------- + +def normalize_fragment(fragment: str, strip_ignored_attrs: bool = True) -> str: + """XHTML fragment를 비교 가능한 정규화된 형태로 변환한다. 
+ + - layout section unwrap + - non-reversible macro 제거 + - decoration unwrap + - ignored attribute 제거 (선택) + - BeautifulSoup prettify로 노드별 줄바꿈 + """ + soup = BeautifulSoup(fragment, "html.parser") + _strip_layout_sections(soup) + _strip_nonreversible_macros(soup) + _strip_decorations(soup) + if strip_ignored_attrs: + _strip_ignored_attributes(soup) + return soup.prettify(formatter="minimal").strip() + + +def _strip_layout_sections(soup: BeautifulSoup) -> None: + for tag_name in ("ac:layout", "ac:layout-section", "ac:layout-cell"): + for tag in soup.find_all(tag_name): + tag.unwrap() + + +def _strip_nonreversible_macros(soup: BeautifulSoup) -> None: + for macro in soup.find_all("ac:structured-macro"): + if macro.get("ac:name") in {"toc", "view-file"}: + macro.decompose() + + +def _strip_decorations(soup: BeautifulSoup) -> None: + for tag_name in ("ac:adf-mark", "ac:inline-comment-marker"): + for tag in soup.find_all(tag_name): + tag.unwrap() + for colgroup in soup.find_all("colgroup"): + colgroup.decompose() + + +def _strip_ignored_attributes( + soup: BeautifulSoup, + extra: Optional[frozenset[str]] = None, +) -> None: + ignored = IGNORED_ATTRIBUTES | extra if extra else IGNORED_ATTRIBUTES + for tag in soup.find_all(True): + for attr in list(tag.attrs.keys()): + if attr in ignored: + del tag.attrs[attr] + + +# --------------------------------------------------------------------------- +# Fragment extraction by XPath +# --------------------------------------------------------------------------- + +def extract_fragment_by_xpath(page_xhtml: str, xpath: str) -> Optional[str]: + """page XHTML에서 간이 XPath로 요소를 찾아 outerHTML을 반환한다. 
+ + xpath 형식: "p[1]", "ul[2]", "macro-info[1]/p[1]" + """ + soup = BeautifulSoup(page_xhtml, "html.parser") + element = _find_element_by_xpath(soup, xpath) + if element is None: + return None + return str(element) + + +def _find_element_by_xpath(soup, xpath: str): + """간이 XPath로 요소를 찾는다.""" + parts = xpath.split("/") + if len(parts) == 1: + return _find_element_by_simple_xpath(soup, xpath) + + current = _find_element_by_simple_xpath(soup, parts[0]) + if current is None: + return None + + for part in parts[1:]: + container = _find_content_container(current) + if container is None: + if ":" in (current.name or ""): + return None + container = current + current = _find_element_by_simple_xpath(container, part) + if current is None: + return None + + return current + + +_XPATH_PATTERN = re.compile(r"([a-z0-9:-]+)\[(\d+)\]") + + +def _find_element_by_simple_xpath(parent, xpath: str): + """단일 XPath 파트로 요소를 찾는다.""" + match = _XPATH_PATTERN.match(xpath) + if not match: + return None + tag_name = match.group(1) + index = int(match.group(2)) # 1-based + + macro_name = None + if tag_name.startswith("macro-"): + macro_name = tag_name[len("macro-"):] + + count = 0 + for child in _iter_block_children(parent): + if not isinstance(child, Tag): + continue + if macro_name: + if child.name == "ac:structured-macro" and child.get("ac:name") == macro_name: + count += 1 + if count == index: + return child + elif child.name == tag_name: + count += 1 + if count == index: + return child + return None + + +def _iter_block_children(parent): + """블록 레벨 자식을 순회한다. 
ac:layout은 cell 내부로 진입한다.""" + for child in parent.children: + if isinstance(child, Tag) and child.name == "ac:layout": + for section in child.find_all("ac:layout-section", recursive=False): + for cell in section.find_all("ac:layout-cell", recursive=False): + yield from cell.children + else: + yield child + + +def _find_content_container(parent: Tag): + """복합 xpath의 부모에서 콘텐츠 컨테이너를 찾는다.""" + rich_body = parent.find("ac:rich-text-body") + if rich_body is not None: + return rich_body + node = parent.find("ac:adf-node") + if node is not None: + content = node.find("ac:adf-content") + if content is not None: + return content + return None diff --git a/confluence-mdx/tests/test_reverse_sync_list_tree.py b/confluence-mdx/tests/test_reverse_sync_list_tree.py new file mode 100644 index 000000000..25016f8b4 --- /dev/null +++ b/confluence-mdx/tests/test_reverse_sync_list_tree.py @@ -0,0 +1,75 @@ +"""Level 0 helper tests — parse_list_tree() public API 검증. + +Phase 0 게이트: list tree helper가 public API로 정상 동작하는지 확인한다. +""" + +import pytest + +from mdx_to_storage import ListNode, parse_list_tree + + +class TestParseListTree: + """parse_list_tree() public API 검증.""" + + def test_simple_unordered_list(self): + content = "- Item 1\n- Item 2\n- Item 3" + roots = parse_list_tree(content) + assert len(roots) == 3 + assert all(not node.ordered for node in roots) + assert roots[0].text == "Item 1" + assert roots[1].text == "Item 2" + assert roots[2].text == "Item 3" + + def test_simple_ordered_list(self): + content = "1. First\n2. Second\n3. 
Third" + roots = parse_list_tree(content) + assert len(roots) == 3 + assert all(node.ordered for node in roots) + assert roots[0].text == "First" + + def test_nested_list(self): + content = "- Parent\n - Child 1\n - Child 2" + roots = parse_list_tree(content) + assert len(roots) == 1 + assert roots[0].text == "Parent" + assert len(roots[0].children) == 2 + assert roots[0].children[0].text == "Child 1" + assert roots[0].children[1].text == "Child 2" + + def test_mixed_ordered_unordered(self): + content = "- Unordered\n1. Ordered" + roots = parse_list_tree(content) + assert len(roots) == 2 + assert not roots[0].ordered + assert roots[1].ordered + + def test_deeply_nested(self): + content = "- L0\n - L1\n - L2" + roots = parse_list_tree(content) + assert len(roots) == 1 + assert len(roots[0].children) == 1 + assert len(roots[0].children[0].children) == 1 + assert roots[0].children[0].children[0].text == "L2" + + def test_continuation_line(self): + content = "- Item with\n continuation" + roots = parse_list_tree(content) + assert len(roots) == 1 + assert "continuation" in roots[0].text + + def test_empty_content(self): + roots = parse_list_tree("") + assert roots == [] + + def test_list_node_type(self): + """반환값이 ListNode 인스턴스인지 확인.""" + roots = parse_list_tree("- test") + assert isinstance(roots[0], ListNode) + + def test_nested_ordered_under_unordered(self): + content = "- Parent\n 1. Child ordered" + roots = parse_list_tree(content) + assert len(roots) == 1 + assert not roots[0].ordered + assert len(roots[0].children) == 1 + assert roots[0].children[0].ordered diff --git a/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py b/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py new file mode 100644 index 000000000..2c08201a6 --- /dev/null +++ b/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py @@ -0,0 +1,323 @@ +"""Level 0 helper tests — xhtml_normalizer 모듈 검증. 
+ +Phase 0 게이트: +- extract_plain_text: 다양한 XHTML fragment에서 plain text 추출 +- normalize_fragment: fragment 비교 정규화 +- extract_fragment_by_xpath: 간이 XPath 기반 fragment 추출 +""" + +import json +from pathlib import Path + +import pytest + +from reverse_sync.xhtml_normalizer import ( + extract_fragment_by_xpath, + extract_plain_text, + normalize_fragment, +) + +TESTCASES_DIR = Path(__file__).parent / "testcases" + + +# --------------------------------------------------------------------------- +# extract_plain_text +# --------------------------------------------------------------------------- + +class TestExtractPlainText: + """extract_plain_text() 기본 동작 검증.""" + + def test_simple_paragraph(self): + fragment = "

Hello world

" + assert extract_plain_text(fragment) == "Hello world" + + def test_paragraph_with_bold(self): + fragment = "

A bold text

" + assert extract_plain_text(fragment) == "A bold text" + + def test_paragraph_with_link(self): + fragment = '

See example here

' + assert extract_plain_text(fragment) == "See example here" + + def test_paragraph_with_inline_image_excluded(self): + """ac:image는 preservation unit이므로 plain text에서 제외된다.""" + fragment = ( + '

A ' + '' + ' B

' + ) + assert extract_plain_text(fragment) == "A B" + + def test_paragraph_with_inline_link_excluded(self): + """ac:link는 preservation unit이므로 plain text에서 제외된다.""" + fragment = ( + '

Before ' + '' + ' After

' + ) + assert extract_plain_text(fragment) == "Before After" + + def test_paragraph_with_emoticon(self): + """ac:emoticon의 fallback 텍스트가 포함된다.""" + fragment = ( + '

' + '' + ' Success

' + ) + assert extract_plain_text(fragment) == ":check_mark: Success" + + def test_code_macro_body_excluded(self): + """ac:plain-text-body(코드 블록 본문)는 제외된다.""" + fragment = ( + '' + 'python' + '' + '' + ) + assert extract_plain_text(fragment).strip() == "python" + + def test_list_plain_text(self): + fragment = "" + text = extract_plain_text(fragment) + assert "Item 1" in text + assert "Item 2" in text + + def test_heading(self): + fragment = "

Section Title

" + assert extract_plain_text(fragment) == "Section Title" + + def test_nested_formatting(self): + fragment = "

A bold italic text

" + assert extract_plain_text(fragment) == "A bold italic text" + + def test_empty_fragment(self): + assert extract_plain_text("") == "" + assert extract_plain_text("

") == "" + + def test_callout_with_rich_body(self): + """callout macro 내부의 rich-text-body에서 텍스트를 추출한다.""" + fragment = ( + '' + '

Info text

' + '' + ) + text = extract_plain_text(fragment) + assert "Info text" in text + + +# --------------------------------------------------------------------------- +# extract_plain_text — real testcase fixtures +# --------------------------------------------------------------------------- + +class TestExtractPlainTextFromFixtures: + """실제 testcase fixture에서 extract_plain_text 동작 검증.""" + + @pytest.fixture + def sidecar_blocks(self): + """544113141 testcase의 sidecar blocks를 로드한다.""" + path = TESTCASES_DIR / "544113141" / "expected.roundtrip.json" + if not path.exists(): + pytest.skip("testcase fixture not found") + data = json.loads(path.read_text(encoding="utf-8")) + return data["blocks"] + + def test_heading_fragment(self, sidecar_blocks): + """heading fragment의 plain text가 정확히 추출된다.""" + block = sidecar_blocks[0] # h2[1] "Overview" + assert block["xhtml_xpath"] == "h2[1]" + text = extract_plain_text(block["xhtml_fragment"]) + assert text == "Overview" + + def test_paragraph_fragment(self, sidecar_blocks): + """paragraph fragment의 plain text가 정확히 추출된다.""" + block = sidecar_blocks[1] # p[1] + assert block["xhtml_xpath"] == "p[1]" + text = extract_plain_text(block["xhtml_fragment"]) + assert "조직에서 관리하는 DB 커넥션" in text + + def test_list_with_image_fragment(self, sidecar_blocks): + """list + inline image fragment에서 image가 제외된다.""" + block = sidecar_blocks[4] # ol[1] + assert block["xhtml_xpath"] == "ol[1]" + text = extract_plain_text(block["xhtml_fragment"]) + # ac:image는 제외되므로 파일명이 없어야 함 + assert "image-20240730" not in text + # 텍스트 내용은 포함 + assert "DB Access History" in text + + +# --------------------------------------------------------------------------- +# normalize_fragment +# --------------------------------------------------------------------------- + +class TestNormalizeFragment: + """normalize_fragment() 정규화 검증.""" + + def test_attribute_order_irrelevant(self): + """속성 순서가 달라도 정규화 결과가 같다.""" + a = '

text

' + b = '

text

' + # class는 ignored attribute이므로 제거됨 + norm_a = normalize_fragment(a) + norm_b = normalize_fragment(b) + assert norm_a == norm_b + + def test_ignored_attributes_stripped(self): + """IGNORED_ATTRIBUTES에 해당하는 속성이 제거된다.""" + fragment = '' + result = normalize_fragment(fragment) + assert "ac:macro-id" not in result + assert 'ac:align="center"' in result + + def test_layout_sections_unwrapped(self): + fragment = '

content

' + result = normalize_fragment(fragment) + assert "ac:layout" not in result + assert "content" in result + + def test_nonreversible_macros_removed(self): + fragment = '

keep

' + result = normalize_fragment(fragment) + assert "toc" not in result + assert "keep" in result + + def test_decorations_unwrapped(self): + fragment = '

text

' + result = normalize_fragment(fragment) + assert "ac:inline-comment-marker" not in result + assert "text" in result + + def test_same_content_normalizes_equal(self): + """내용이 동일한 두 fragment는 정규화 후 동일하다.""" + a = "

Hello world

" + b = "

Hello world

" + assert normalize_fragment(a) == normalize_fragment(b) + + def test_strip_ignored_attrs_option(self): + """strip_ignored_attrs=False면 속성을 유지한다.""" + fragment = '' + with_strip = normalize_fragment(fragment, strip_ignored_attrs=True) + without_strip = normalize_fragment(fragment, strip_ignored_attrs=False) + assert "ac:macro-id" not in with_strip + assert "ac:macro-id" in without_strip + + +# --------------------------------------------------------------------------- +# normalize_fragment — real testcase round-trip +# --------------------------------------------------------------------------- + +class TestNormalizeFragmentRoundtrip: + """실제 testcase의 fragment를 정규화해서 자기 자신과 비교.""" + + @pytest.mark.parametrize("case_id", [ + "544113141", "544381877", "544112828", + ]) + def test_fragment_self_normalize_equal(self, case_id): + """같은 fragment를 두 번 정규화하면 결과가 동일하다 (idempotent).""" + path = TESTCASES_DIR / case_id / "expected.roundtrip.json" + if not path.exists(): + pytest.skip(f"testcase {case_id} not found") + data = json.loads(path.read_text(encoding="utf-8")) + for block in data["blocks"]: + frag = block["xhtml_fragment"] + first = normalize_fragment(frag) + second = normalize_fragment(first) + assert first == second, ( + f"normalize_fragment is not idempotent for " + f"{case_id} block {block['block_index']} ({block['xhtml_xpath']})" + ) + + +# --------------------------------------------------------------------------- +# extract_fragment_by_xpath +# --------------------------------------------------------------------------- + +class TestExtractFragmentByXpath: + """extract_fragment_by_xpath() 검증.""" + + def test_simple_xpath(self): + xhtml = "

Title

Para 1

Para 2

" + result = extract_fragment_by_xpath(xhtml, "p[2]") + assert result is not None + assert "Para 2" in result + + def test_heading_xpath(self): + xhtml = "

First

Second

" + result = extract_fragment_by_xpath(xhtml, "h2[2]") + assert result is not None + assert "Second" in result + + def test_list_xpath(self): + xhtml = "

text

  • item

" + result = extract_fragment_by_xpath(xhtml, "ul[1]") + assert result is not None + assert "item" in result + + def test_macro_xpath(self): + xhtml = ( + '' + '

info body

' + '
' + ) + result = extract_fragment_by_xpath(xhtml, "macro-info[1]") + assert result is not None + assert "info body" in result + + def test_compound_xpath(self): + xhtml = ( + '' + '

P1

P2

' + '
' + ) + result = extract_fragment_by_xpath(xhtml, "macro-note[1]/p[2]") + assert result is not None + assert "P2" in result + + def test_nonexistent_xpath_returns_none(self): + xhtml = "

only one

" + assert extract_fragment_by_xpath(xhtml, "p[2]") is None + assert extract_fragment_by_xpath(xhtml, "h2[1]") is None + + def test_multi_level_xpath(self): + """ul[1]/li[2] 같은 다단계 xpath.""" + xhtml = "
  • A

  • B

" + result = extract_fragment_by_xpath(xhtml, "ul[1]/li[2]") + assert result is not None + assert "B" in result + + +# --------------------------------------------------------------------------- +# extract_fragment_by_xpath — real testcase fixtures +# --------------------------------------------------------------------------- + +class TestExtractFragmentByXpathFromFixtures: + """실제 testcase page.xhtml에서 xpath 추출 검증.""" + + @pytest.mark.parametrize("case_id", [ + "544113141", "544381877", + ]) + def test_sidecar_xpath_matches_page(self, case_id): + """sidecar의 xhtml_xpath로 page.xhtml에서 fragment를 추출할 수 있다.""" + sidecar_path = TESTCASES_DIR / case_id / "expected.roundtrip.json" + page_path = TESTCASES_DIR / case_id / "page.xhtml" + if not sidecar_path.exists() or not page_path.exists(): + pytest.skip(f"testcase {case_id} not found") + + data = json.loads(sidecar_path.read_text(encoding="utf-8")) + page_xhtml = page_path.read_text(encoding="utf-8") + + for block in data["blocks"]: + xpath = block["xhtml_xpath"] + # compound xpath(child xpath)는 top-level만 테스트 + if "/" in xpath: + continue + extracted = extract_fragment_by_xpath(page_xhtml, xpath) + assert extracted is not None, ( + f"Failed to extract {xpath} from {case_id}" + ) + # 추출된 fragment의 plain text가 sidecar fragment와 일치 + expected_text = extract_plain_text(block["xhtml_fragment"]) + actual_text = extract_plain_text(extracted) + assert expected_text.strip() == actual_text.strip(), ( + f"Plain text mismatch for {case_id} {xpath}" + ) From 4d98766f41e6aea0354a035c603ddf978e20400b Mon Sep 17 00:00:00 2001 From: JK Date: Fri, 13 Mar 2026 22:05:27 +0900 Subject: [PATCH 2/4] =?UTF-8?q?refactor(reverse=5Fsync):=20xhtml=5Fnormali?= =?UTF-8?q?zer=20DRY=20=EA=B0=9C=EC=84=A0=20=EB=B0=8F=20=EA=B3=B5=EC=9A=A9?= =?UTF-8?q?=20helper=20=EC=8A=B9=EA=B2=A9=ED=95=A9=EB=8B=88=EB=8B=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - mapping_recorder의 _iter_block_children, 
_get_text_with_emoticons 등을 public API로 승격하고 backward-compat alias를 추가합니다 - xhtml_normalizer에서 중복 정의를 제거하고 mapping_recorder에서 import합니다 - normalize_fragment에 ignore_ri_filename 파라미터를 추가합니다 - _strip_decorations에 빈

제거 로직을 추가합니다 - mdx_to_storage_xhtml_verify의 중복 정규화 코드를 normalize_soup()으로 교체합니다 - 새 테스트 2건 추가 (ignore_ri_filename, empty_paragraph_removed) Co-Authored-By: Claude Opus 4.6 --- .../bin/reverse_sync/fragment_extractor.py | 4 +- .../bin/reverse_sync/mapping_recorder.py | 43 ++++++++----- .../mdx_to_storage_xhtml_verify.py | 62 +------------------ .../bin/reverse_sync/xhtml_normalizer.py | 59 ++++++++++++------ .../bin/reverse_sync/xhtml_patcher.py | 12 ++-- .../test_reverse_sync_xhtml_normalizer.py | 15 +++++ 6 files changed, 92 insertions(+), 103 deletions(-) diff --git a/confluence-mdx/bin/reverse_sync/fragment_extractor.py b/confluence-mdx/bin/reverse_sync/fragment_extractor.py index 8f839246c..b249dad40 100644 --- a/confluence-mdx/bin/reverse_sync/fragment_extractor.py +++ b/confluence-mdx/bin/reverse_sync/fragment_extractor.py @@ -14,7 +14,7 @@ from bs4 import BeautifulSoup, NavigableString, Tag -from reverse_sync.mapping_recorder import _iter_block_children +from reverse_sync.mapping_recorder import iter_block_children @dataclass @@ -43,7 +43,7 @@ def extract_block_fragments(xhtml_text: str) -> FragmentExtractionResult: # Top-level element 순서 파악 top_elements: List[Tuple[str, str]] = [] - for child in _iter_block_children(soup): + for child in iter_block_children(soup): if isinstance(child, Tag): top_elements.append(("tag", child.name)) elif isinstance(child, NavigableString): diff --git a/confluence-mdx/bin/reverse_sync/mapping_recorder.py b/confluence-mdx/bin/reverse_sync/mapping_recorder.py index 9ef485e37..275a2fe6f 100644 --- a/confluence-mdx/bin/reverse_sync/mapping_recorder.py +++ b/confluence-mdx/bin/reverse_sync/mapping_recorder.py @@ -17,10 +17,13 @@ class BlockMapping: HEADING_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'} -_CALLOUT_MACRO_NAMES = frozenset({'tip', 'info', 'note', 'warning', 'panel'}) +CALLOUT_MACRO_NAMES = frozenset({'tip', 'info', 'note', 'warning', 'panel'}) +# backward-compat aliases +_CALLOUT_MACRO_NAMES = CALLOUT_MACRO_NAMES -def 
_get_text_with_emoticons(element) -> str: + +def get_text_with_emoticons(element) -> str: """get_text()와 동일하지만 ac:emoticon의 fallback 텍스트를 포함한다. Confluence의 태그는 self-closing으로 텍스트 노드가 없어서 @@ -38,11 +41,14 @@ def _get_text_with_emoticons(element) -> str: if fallback: parts.append(fallback) else: - parts.append(_get_text_with_emoticons(item)) + parts.append(get_text_with_emoticons(item)) return ''.join(parts) +# backward-compat alias +_get_text_with_emoticons = get_text_with_emoticons + -def _iter_block_children(parent): +def iter_block_children(parent): """블록 레벨 자식을 순회한다. ac:layout은 cell 내부로 진입한다.""" for child in parent.children: if isinstance(child, Tag) and child.name == 'ac:layout': @@ -52,6 +58,9 @@ def _iter_block_children(parent): else: yield child +# backward-compat alias +_iter_block_children = iter_block_children + def record_mapping(xhtml: str) -> List[BlockMapping]: """XHTML에서 블록 레벨 요소를 추출하여 매핑 레코드를 생성한다.""" @@ -59,7 +68,7 @@ def record_mapping(xhtml: str) -> List[BlockMapping]: mappings: List[BlockMapping] = [] counters: dict = {} - for child in _iter_block_children(soup): + for child in iter_block_children(soup): if isinstance(child, NavigableString): if child.strip(): _add_mapping(mappings, counters, 'p', child.strip(), child.strip()) @@ -93,24 +102,24 @@ def record_mapping(xhtml: str) -> List[BlockMapping]: block_type='code') else: # Callout 매크로: body 텍스트만 추출 (파라미터 메타데이터 제외) - if macro_name in _CALLOUT_MACRO_NAMES: + if macro_name in CALLOUT_MACRO_NAMES: rich_body = child.find('ac:rich-text-body') - plain = _get_text_with_emoticons(rich_body) if rich_body else child.get_text() + plain = get_text_with_emoticons(rich_body) if rich_body else child.get_text() else: plain = child.get_text() _add_mapping(mappings, counters, f'macro-{macro_name}', str(child), plain, block_type='html_block') # Callout 매크로: 자식 요소 개별 매핑 추가 - if macro_name in _CALLOUT_MACRO_NAMES: + if macro_name in CALLOUT_MACRO_NAMES: parent_mapping = mappings[-1] _add_rich_text_body_children( 
child, parent_mapping, mappings, counters) elif tag_name == 'ac:adf-extension': - panel_type = _get_adf_panel_type(child) + panel_type = get_adf_panel_type(child) plain = child.get_text() _add_mapping(mappings, counters, tag_name, str(child), plain, block_type='html_block') - if panel_type in _CALLOUT_MACRO_NAMES: + if panel_type in CALLOUT_MACRO_NAMES: parent_mapping = mappings[-1] _add_adf_content_children( child, parent_mapping, mappings, counters) @@ -172,7 +181,7 @@ def _add_container_children( child_counters[tag] = child_counters.get(tag, 0) + 1 child_xpath = f"{parent_xpath}/{tag}[{child_counters[tag]}]" - plain = _get_text_with_emoticons(child) + plain = get_text_with_emoticons(child) if tag in ('ul', 'ol', 'table'): inner = str(child) else: @@ -206,7 +215,7 @@ def _add_rich_text_body_children( _add_container_children(rich_body, parent_mapping, mappings, counters) -def _get_adf_panel_type(element: Tag) -> str: +def get_adf_panel_type(element: Tag) -> str: """ac:adf-extension 요소에서 panel-type을 추출한다.""" node = element.find('ac:adf-node') if node is None: @@ -216,14 +225,20 @@ def _get_adf_panel_type(element: Tag) -> str: return '' return attr.get_text().strip() +# backward-compat alias +_get_adf_panel_type = get_adf_panel_type -def _get_adf_content_body(element: Tag): + +def get_adf_content_body(element: Tag): """ac:adf-extension 요소에서 ac:adf-content를 찾는다.""" node = element.find('ac:adf-node') if node is None: return None return node.find('ac:adf-content') +# backward-compat alias +_get_adf_content_body = get_adf_content_body + def _add_adf_content_children( adf_element: Tag, @@ -232,5 +247,5 @@ def _add_adf_content_children( counters: dict, ): """ac:adf-extension의 ac:adf-content 내 자식 요소를 개별 매핑으로 추가한다.""" - content_body = _get_adf_content_body(adf_element) + content_body = get_adf_content_body(adf_element) _add_container_children(content_body, parent_mapping, mappings, counters) diff --git a/confluence-mdx/bin/reverse_sync/mdx_to_storage_xhtml_verify.py 
b/confluence-mdx/bin/reverse_sync/mdx_to_storage_xhtml_verify.py index 78adde9e4..d1074c109 100644 --- a/confluence-mdx/bin/reverse_sync/mdx_to_storage_xhtml_verify.py +++ b/confluence-mdx/bin/reverse_sync/mdx_to_storage_xhtml_verify.py @@ -14,32 +14,10 @@ from bs4 import BeautifulSoup from mdx_to_storage import emit_document, parse_mdx from mdx_to_storage.link_resolver import LinkResolver +from reverse_sync.xhtml_normalizer import normalize_soup from xhtml_beautify_diff import beautify_xhtml, xhtml_diff -_IGNORED_ATTRIBUTES = { - "ac:macro-id", - "ac:local-id", - "local-id", - "ac:schema-version", - "ri:version-at-save", - "ac:original-height", - "ac:original-width", - "ac:custom-width", - "ac:alt", - "ac:layout", - "data-table-width", - "data-layout", - "data-highlight-colour", - "data-card-appearance", - "ac:breakout-mode", - "ac:breakout-width", - "ri:space-key", - "style", - "class", -} - - @dataclass class CaseVerification: case_id: str @@ -77,10 +55,7 @@ def mdx_to_storage_xhtml_fragment( def _normalize_xhtml(xhtml: str, ignore_ri_filename: bool = False) -> str: soup = BeautifulSoup(xhtml, "html.parser") - _strip_layout_sections(soup) - _strip_nonreversible_macros(soup) - _strip_decorations(soup) - _strip_ignored_attributes(soup, ignore_ri_filename=ignore_ri_filename) + normalize_soup(soup, ignore_ri_filename=ignore_ri_filename) return beautify_xhtml(str(soup)).strip() @@ -106,39 +81,6 @@ def verify_expected_mdx_against_page_xhtml( return False, generated, "\n".join(diff_lines) -def _strip_ignored_attributes(soup: BeautifulSoup, ignore_ri_filename: bool = False) -> None: - ignored_attrs = set(_IGNORED_ATTRIBUTES) - if ignore_ri_filename: - ignored_attrs.add("ri:filename") - for tag in soup.find_all(True): - for attr in list(tag.attrs.keys()): - if attr in ignored_attrs: - del tag.attrs[attr] - - -def _strip_layout_sections(soup: BeautifulSoup) -> None: - for tag_name in ("ac:layout", "ac:layout-section", "ac:layout-cell"): - for tag in 
soup.find_all(tag_name): - tag.unwrap() - - -def _strip_nonreversible_macros(soup: BeautifulSoup) -> None: - for macro in soup.find_all("ac:structured-macro"): - if macro.get("ac:name") in {"toc", "view-file"}: - macro.decompose() - - -def _strip_decorations(soup: BeautifulSoup) -> None: - for tag_name in ("ac:adf-mark", "ac:inline-comment-marker"): - for tag in soup.find_all(tag_name): - tag.unwrap() - for colgroup in soup.find_all("colgroup"): - colgroup.decompose() - for p in soup.find_all("p"): - if not p.get_text(strip=True) and not p.find_all(True): - p.decompose() - - def iter_testcase_dirs(testcases_dir: Path) -> Iterable[Path]: """`page.xhtml`과 `expected.mdx`가 있는 테스트케이스 디렉토리를 순회한다.""" for child in sorted(testcases_dir.iterdir()): diff --git a/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py b/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py index 6b7377180..ffd4da881 100644 --- a/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py +++ b/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py @@ -11,6 +11,8 @@ from bs4 import BeautifulSoup, NavigableString, Tag +from reverse_sync.mapping_recorder import iter_block_children + # --------------------------------------------------------------------------- # Ignored attributes — 비교 시 무시하는 Confluence 메타데이터 속성 @@ -82,21 +84,42 @@ def _extract_text_from_element(element) -> str: # Fragment normalization # --------------------------------------------------------------------------- -def normalize_fragment(fragment: str, strip_ignored_attrs: bool = True) -> str: +def normalize_soup( + soup: BeautifulSoup, + *, + strip_ignored_attrs: bool = True, + ignore_ri_filename: bool = False, +) -> None: + """BeautifulSoup 객체를 in-place로 정규화한다. + + normalize_fragment()와 verify 모듈이 공유하는 핵심 정규화 로직. 
+ """ + _strip_layout_sections(soup) + _strip_nonreversible_macros(soup) + _strip_decorations(soup) + if strip_ignored_attrs: + _strip_ignored_attributes(soup, ignore_ri_filename=ignore_ri_filename) + + +def normalize_fragment( + fragment: str, + strip_ignored_attrs: bool = True, + ignore_ri_filename: bool = False, +) -> str: """XHTML fragment를 비교 가능한 정규화된 형태로 변환한다. - layout section unwrap - non-reversible macro 제거 - - decoration unwrap + - decoration unwrap + 빈

제거 - ignored attribute 제거 (선택) - BeautifulSoup prettify로 노드별 줄바꿈 """ soup = BeautifulSoup(fragment, "html.parser") - _strip_layout_sections(soup) - _strip_nonreversible_macros(soup) - _strip_decorations(soup) - if strip_ignored_attrs: - _strip_ignored_attributes(soup) + normalize_soup( + soup, + strip_ignored_attrs=strip_ignored_attrs, + ignore_ri_filename=ignore_ri_filename, + ) return soup.prettify(formatter="minimal").strip() @@ -118,13 +141,20 @@ def _strip_decorations(soup: BeautifulSoup) -> None: tag.unwrap() for colgroup in soup.find_all("colgroup"): colgroup.decompose() + # 빈

제거 (decoration unwrap 후 남는 빈 요소) + for p in soup.find_all("p"): + if not p.get_text(strip=True) and not p.find_all(True): + p.decompose() def _strip_ignored_attributes( soup: BeautifulSoup, extra: Optional[frozenset[str]] = None, + ignore_ri_filename: bool = False, ) -> None: - ignored = IGNORED_ATTRIBUTES | extra if extra else IGNORED_ATTRIBUTES + ignored = IGNORED_ATTRIBUTES | extra if extra else set(IGNORED_ATTRIBUTES) + if ignore_ri_filename: + ignored = set(ignored) | {"ri:filename"} for tag in soup.find_all(True): for attr in list(tag.attrs.keys()): if attr in ignored: @@ -186,7 +216,7 @@ def _find_element_by_simple_xpath(parent, xpath: str): macro_name = tag_name[len("macro-"):] count = 0 - for child in _iter_block_children(parent): + for child in iter_block_children(parent): if not isinstance(child, Tag): continue if macro_name: @@ -201,17 +231,6 @@ def _find_element_by_simple_xpath(parent, xpath: str): return None -def _iter_block_children(parent): - """블록 레벨 자식을 순회한다. ac:layout은 cell 내부로 진입한다.""" - for child in parent.children: - if isinstance(child, Tag) and child.name == "ac:layout": - for section in child.find_all("ac:layout-section", recursive=False): - for cell in section.find_all("ac:layout-cell", recursive=False): - yield from cell.children - else: - yield child - - def _find_content_container(parent: Tag): """복합 xpath의 부모에서 콘텐츠 컨테이너를 찾는다.""" rich_body = parent.find("ac:rich-text-body") diff --git a/confluence-mdx/bin/reverse_sync/xhtml_patcher.py b/confluence-mdx/bin/reverse_sync/xhtml_patcher.py index bfb1f10c1..116ecf9f6 100644 --- a/confluence-mdx/bin/reverse_sync/xhtml_patcher.py +++ b/confluence-mdx/bin/reverse_sync/xhtml_patcher.py @@ -3,9 +3,7 @@ from bs4 import BeautifulSoup, NavigableString, Tag import difflib import re -from reverse_sync.mapping_recorder import _iter_block_children - -from reverse_sync.mapping_recorder import _get_text_with_emoticons +from reverse_sync.mapping_recorder import get_text_with_emoticons, iter_block_children 
def patch_xhtml(xhtml: str, patches: List[Dict[str, str]]) -> str: @@ -69,7 +67,7 @@ def patch_xhtml(xhtml: str, patches: List[Dict[str, str]]) -> str: # patch 적용 시에는 기본 비교를 get_text()로 수행하고, 필요 시 emoticon fallback 텍스트 비교를 허용한다. current_plain = element.get_text() if old_text and current_plain.strip() != old_text.strip(): - current_plain_with_emoticons = _get_text_with_emoticons(element) + current_plain_with_emoticons = get_text_with_emoticons(element) if current_plain_with_emoticons.strip() != old_text.strip(): continue _replace_inner_html(element, patch['new_inner_xhtml']) @@ -86,7 +84,7 @@ def patch_xhtml(xhtml: str, patches: List[Dict[str, str]]) -> str: # mapping plain(old_text)과의 비교는 get_text() 우선, 실패 시 emoticon fallback 포함 텍스트로 재확인한다. current_plain = element.get_text() if current_plain.strip() != old_text.strip(): - current_plain_with_emoticons = _get_text_with_emoticons(element) + current_plain_with_emoticons = get_text_with_emoticons(element) if current_plain_with_emoticons.strip() != old_text.strip(): continue _apply_text_changes(element, old_text, new_text) @@ -130,7 +128,7 @@ def _insert_element_resolved(soup: BeautifulSoup, anchor, new_html: str): def _find_first_block_element(soup: BeautifulSoup): """soup의 첫 번째 블록 레벨 요소를 찾는다.""" - for child in _iter_block_children(soup): + for child in iter_block_children(soup): if isinstance(child, Tag): return child return None @@ -212,7 +210,7 @@ def _find_element_by_simple_xpath(soup: BeautifulSoup, xpath: str): macro_name = tag_name[len('macro-'):] count = 0 - for child in _iter_block_children(soup): + for child in iter_block_children(soup): if not isinstance(child, Tag): continue if macro_name: diff --git a/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py b/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py index 2c08201a6..d8ca14be1 100644 --- a/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py +++ b/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py @@ -201,6 +201,21 @@ def 
test_strip_ignored_attrs_option(self): assert "ac:macro-id" not in with_strip assert "ac:macro-id" in without_strip + def test_ignore_ri_filename_option(self): + """ignore_ri_filename=True면 ri:filename 속성도 제거된다.""" + fragment = '' + normal = normalize_fragment(fragment) + ignored = normalize_fragment(fragment, ignore_ri_filename=True) + assert 'ri:filename' in normal + assert 'ri:filename' not in ignored + + def test_empty_paragraph_removed(self): + """빈

<p> 요소가 decoration unwrap 후 제거된다.""" + fragment = '<p></p><p>keep</p>' + result = normalize_fragment(fragment) + # 빈 <p>

는 제거되고 keep만 남음 + assert "keep" in result + # --------------------------------------------------------------------------- # normalize_fragment — real testcase round-trip From 46faf69ac801cb441269ddf11e4ed8310d54df16 Mon Sep 17 00:00:00 2001 From: JK Date: Fri, 13 Mar 2026 22:09:16 +0900 Subject: [PATCH 3/4] =?UTF-8?q?fix(reverse=5Fsync):=20=EB=A6=AC=EB=B7=B0?= =?UTF-8?q?=20=EB=B0=98=EC=98=81=20=E2=80=94=20ac:link=20plain=20text=20?= =?UTF-8?q?=ED=8F=AC=ED=95=A8=20=EB=B0=8F=20ListNode=20start=20=EB=B3=B4?= =?UTF-8?q?=EC=A1=B4=ED=95=A9=EB=8B=88=EB=8B=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - extract_plain_text에서 ac:link를 skip하지 않고 visible label(ac:link-body) 텍스트를 포함하도록 수정합니다 (기존 mapping_recorder contract와 일치) - ListNode에 start 필드를 추가하여 ordered list marker number를 보존합니다 - _parse_list_items에서 marker number를 파싱하여 start에 저장합니다 - _render_list_nodes에서 첫 항목의 start를

    에 반영합니다 - 관련 테스트 추가/수정 (ac:link with body, start number preservation) Co-Authored-By: Claude Opus 4.6 --- confluence-mdx/bin/mdx_to_storage/emitter.py | 11 +++++++---- .../bin/reverse_sync/xhtml_normalizer.py | 4 ++-- .../tests/test_reverse_sync_list_tree.py | 14 ++++++++++++++ .../tests/test_reverse_sync_xhtml_normalizer.py | 13 +++++++++++-- 4 files changed, 34 insertions(+), 8 deletions(-) diff --git a/confluence-mdx/bin/mdx_to_storage/emitter.py b/confluence-mdx/bin/mdx_to_storage/emitter.py index ba4340f70..952e1a198 100644 --- a/confluence-mdx/bin/mdx_to_storage/emitter.py +++ b/confluence-mdx/bin/mdx_to_storage/emitter.py @@ -11,7 +11,7 @@ from .parser import Block, HEADING_PATTERN -_ORDERED_LIST_PATTERN = re.compile(r"^\d+\.\s+(.*)$") +_ORDERED_LIST_PATTERN = re.compile(r"^(\d+)\.\s+(.*)$") _UNORDERED_LIST_PATTERN = re.compile(r"^[-*+]\s+(.*)$") _HEADING_LINE_PATTERN = HEADING_PATTERN _CALLOUT_TYPE_TO_MACRO = { @@ -43,10 +43,11 @@ class ListNode: Public API for reconstruction pipeline. """ - def __init__(self, ordered: bool, text: str, depth: int) -> None: + def __init__(self, ordered: bool, text: str, depth: int, start: int | None = None) -> None: self.ordered = ordered self.text = text self.depth = depth + self.start = start # ordered list marker number (e.g. 2 for "2. item") self.children: list["ListNode"] = [] @@ -189,7 +190,8 @@ def _parse_list_items(content: str) -> list[_ListNode]: ordered_match = _ORDERED_LIST_PATTERN.match(stripped) if ordered_match: - items.append(_ListNode(True, ordered_match.group(1), depth)) + marker_num = int(ordered_match.group(1)) + items.append(_ListNode(True, ordered_match.group(2), depth, start=marker_num)) continue unordered_match = _UNORDERED_LIST_PATTERN.match(stripped) @@ -234,7 +236,8 @@ def _render_list_nodes( body = "".join(_render_list_item(node, link_resolver=link_resolver) for node in group) if tag == "ol": - parts.append(f'
<ol start="1">{body}</ol>') + start = group[0].start if group[0].start is not None else 1 + parts.append(f'<ol start="{start}">{body}</ol>') else: parts.append(f"<ul>{body}</ul>
    ") return "".join(parts) diff --git a/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py b/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py index ffd4da881..6962a849c 100644 --- a/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py +++ b/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py @@ -73,8 +73,8 @@ def _extract_text_from_element(element) -> str: if fallback: parts.append(fallback) continue - # ac:image, ac:link 등 preservation unit은 텍스트 없음 (anchor로 처리) - if child.name in ("ac:image", "ac:link"): + # ac:image는 preservation unit — 텍스트 없음 (anchor로 처리) + if child.name == "ac:image": continue parts.append(_extract_text_from_element(child)) return "".join(parts) diff --git a/confluence-mdx/tests/test_reverse_sync_list_tree.py b/confluence-mdx/tests/test_reverse_sync_list_tree.py index 25016f8b4..1cc5d79f9 100644 --- a/confluence-mdx/tests/test_reverse_sync_list_tree.py +++ b/confluence-mdx/tests/test_reverse_sync_list_tree.py @@ -26,6 +26,9 @@ def test_simple_ordered_list(self): assert len(roots) == 3 assert all(node.ordered for node in roots) assert roots[0].text == "First" + assert roots[0].start == 1 + assert roots[1].start == 2 + assert roots[2].start == 3 def test_nested_list(self): content = "- Parent\n - Child 1\n - Child 2" @@ -71,5 +74,16 @@ def test_nested_ordered_under_unordered(self): roots = parse_list_tree(content) assert len(roots) == 1 assert not roots[0].ordered + assert roots[0].start is None # unordered → no start assert len(roots[0].children) == 1 assert roots[0].children[0].ordered + assert roots[0].children[0].start == 1 + + def test_ordered_list_start_number_preserved(self): + """중간부터 시작하는 ordered list의 marker number가 보존된다.""" + content = "2. Second\n3. Third\n4. 
Fourth" + roots = parse_list_tree(content) + assert len(roots) == 3 + assert roots[0].start == 2 + assert roots[1].start == 3 + assert roots[2].start == 4 diff --git a/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py b/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py index d8ca14be1..c61900fb6 100644 --- a/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py +++ b/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py @@ -48,8 +48,8 @@ def test_paragraph_with_inline_image_excluded(self): ) assert extract_plain_text(fragment) == "A B" - def test_paragraph_with_inline_link_excluded(self): - """ac:link는 preservation unit이므로 plain text에서 제외된다.""" + def test_paragraph_with_inline_link_no_body(self): + """ac:link에 link-body가 없으면 텍스트가 비어있다.""" fragment = ( '

    Before ' '' @@ -57,6 +57,15 @@ def test_paragraph_with_inline_link_excluded(self): ) assert extract_plain_text(fragment) == "Before After" + def test_paragraph_with_inline_link_with_body(self): + """ac:link에 link-body가 있으면 visible label이 plain text에 포함된다.""" + fragment = ( + '

    참조: ' + '' + 'Click here

    ' + ) + assert extract_plain_text(fragment) == "참조: Click here" + def test_paragraph_with_emoticon(self): """ac:emoticon의 fallback 텍스트가 포함된다.""" fragment = ( From 35fdb0c4c2854c26ff32726a4e321953d31dc7f4 Mon Sep 17 00:00:00 2001 From: JK Date: Fri, 13 Mar 2026 22:55:23 +0900 Subject: [PATCH 4/4] =?UTF-8?q?fix(reverse=5Fsync):=20extract=5Fplain=5Fte?= =?UTF-8?q?xt=EC=97=90=EC=84=9C=20code=20macro=20=EB=B3=B8=EB=AC=B8?= =?UTF-8?q?=EC=9D=84=20=ED=8F=AC=ED=95=A8=ED=95=A9=EB=8B=88=EB=8B=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ac:plain-text-body 제외 로직을 제거하여 코드 블록 본문이 plain text에 포함되도록 수정합니다 - 기존 mapping_recorder의 plain text contract와 일치시킵니다 - reconstruction anchor offset 좌표계에서 코드 본문 누락 방지 Co-Authored-By: Claude Opus 4.6 --- confluence-mdx/bin/reverse_sync/xhtml_normalizer.py | 8 +++----- .../tests/test_reverse_sync_xhtml_normalizer.py | 7 ++++--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py b/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py index 6962a849c..f0ea77e48 100644 --- a/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py +++ b/confluence-mdx/bin/reverse_sync/xhtml_normalizer.py @@ -48,8 +48,9 @@ def extract_plain_text(fragment: str) -> str: """XHTML fragment에서 plain text를 추출한다. - ac:plain-text-body(코드 블록 본문)는 제외하고, - ac:emoticon의 fallback 텍스트는 포함한다. + ac:emoticon의 fallback 텍스트를 포함하고, + ac:image만 preservation unit으로 제외한다. + 코드 블록 본문(ac:plain-text-body)과 링크 label(ac:link-body)은 포함한다. 이 함수의 출력은 reconstruction에서 anchor offset 좌표의 기준이 된다. 
""" @@ -64,9 +65,6 @@ def _extract_text_from_element(element) -> str: if isinstance(child, NavigableString): parts.append(str(child)) elif isinstance(child, Tag): - # 코드 블록 본문은 제외 - if child.name == "ac:plain-text-body": - continue # emoticon은 fallback 텍스트 사용 if child.name == "ac:emoticon": fallback = child.get("ac:emoji-fallback", "") diff --git a/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py b/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py index c61900fb6..61fdef040 100644 --- a/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py +++ b/confluence-mdx/tests/test_reverse_sync_xhtml_normalizer.py @@ -75,15 +75,16 @@ def test_paragraph_with_emoticon(self): ) assert extract_plain_text(fragment) == ":check_mark: Success" - def test_code_macro_body_excluded(self): - """ac:plain-text-body(코드 블록 본문)는 제외된다.""" + def test_code_macro_body_included(self): + """코드 블록 본문(ac:plain-text-body)이 plain text에 포함된다.""" fragment = ( '' 'python' '' '' ) - assert extract_plain_text(fragment).strip() == "python" + text = extract_plain_text(fragment) + assert 'print("hello")' in text def test_list_plain_text(self): fragment = "
<ul><li><p>Item 1</p></li><li><p>Item 2</p></li></ul>"