Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion confluence-mdx/bin/mdx_to_storage/__init__.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,21 @@
"""MDX -> Confluence Storage XHTML conversion package."""

from .emitter import emit_block, emit_document
from .emitter import ListNode, emit_block, emit_document, parse_list_tree
from .inline import convert_heading_inline, convert_inline
from .link_resolver import LinkResolver, PageEntry, load_pages_yaml
from .parser import Block, parse_mdx, parse_mdx_blocks

# Names re-exported as the package's public API; entries are kept in ASCII sort order.
__all__ = [
"Block",
"LinkResolver",
"ListNode",
"PageEntry",
"convert_heading_inline",
"convert_inline",
"emit_block",
"emit_document",
"load_pages_yaml",
"parse_list_tree",
"parse_mdx",
"parse_mdx_blocks",
]
33 changes: 27 additions & 6 deletions confluence-mdx/bin/mdx_to_storage/emitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from .parser import Block, HEADING_PATTERN


# Ordered list item such as "2. text": group 1 is the marker number, group 2 is the item text.
_ORDERED_LIST_PATTERN = re.compile(r"^(\d+)\.\s+(.*)$")
# Unordered list item introduced by "-", "*" or "+": group 1 is the item text.
_UNORDERED_LIST_PATTERN = re.compile(r"^[-*+]\s+(.*)$")
_HEADING_LINE_PATTERN = HEADING_PATTERN
_CALLOUT_TYPE_TO_MACRO = {
Expand All @@ -37,12 +37,22 @@
_IMG_ATTR_RE = re.compile(r'(\w[\w-]*)=(?:"([^"]*)"|\'([^\']*)\')')


class ListNode:
    """List item node for the tree-based list representation.

    Public API for the reconstruction pipeline.

    Attributes:
        ordered: True when the item came from an ordered ("1.") marker.
        text: The item text that follows the list marker.
        depth: Depth value supplied by the caller for this item.
        start: Ordered-list marker number (e.g. 2 for "2. item"), or None
            when no number was given.
        children: Child nodes; always starts as a fresh empty list.
    """

    def __init__(self, ordered: bool, text: str, depth: int, start: Optional[int] = None) -> None:
        self.ordered = ordered
        self.text = text
        self.depth = depth
        self.start = start  # ordered list marker number (e.g. 2 for "2. item")
        self.children: list["ListNode"] = []


# Backward-compatible alias for the former private class name.
_ListNode = ListNode


def emit_block(block: Block, context: Optional[dict] = None) -> str:
Expand Down Expand Up @@ -159,6 +169,15 @@ def _emit_single_depth_list(content: str, link_resolver: Optional[LinkResolver]
return _render_list_nodes(roots, link_resolver=link_resolver)


def parse_list_tree(content: str) -> list[ListNode]:
    """Parse MDX list content and return it as a tree of ``ListNode`` objects.

    Public API — used by the reverse-sync reconstruction pipeline.
    """
    return _build_list_tree(_parse_list_items(content))


def _parse_list_items(content: str) -> list[_ListNode]:
items: list[_ListNode] = []
for line in content.splitlines():
Expand All @@ -171,7 +190,8 @@ def _parse_list_items(content: str) -> list[_ListNode]:

ordered_match = _ORDERED_LIST_PATTERN.match(stripped)
if ordered_match:
items.append(_ListNode(True, ordered_match.group(1), depth))
marker_num = int(ordered_match.group(1))
items.append(_ListNode(True, ordered_match.group(2), depth, start=marker_num))
continue

unordered_match = _UNORDERED_LIST_PATTERN.match(stripped)
Expand Down Expand Up @@ -216,7 +236,8 @@ def _render_list_nodes(

body = "".join(_render_list_item(node, link_resolver=link_resolver) for node in group)
if tag == "ol":
parts.append(f'<ol start="1">{body}</ol>')
start = group[0].start if group[0].start is not None else 1
parts.append(f'<ol start="{start}">{body}</ol>')
else:
parts.append(f"<ul>{body}</ul>")
return "".join(parts)
Expand Down
4 changes: 2 additions & 2 deletions confluence-mdx/bin/reverse_sync/fragment_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from bs4 import BeautifulSoup, NavigableString, Tag

from reverse_sync.mapping_recorder import _iter_block_children
from reverse_sync.mapping_recorder import iter_block_children


@dataclass
Expand Down Expand Up @@ -43,7 +43,7 @@ def extract_block_fragments(xhtml_text: str) -> FragmentExtractionResult:

# Top-level element 순서 파악
top_elements: List[Tuple[str, str]] = []
for child in _iter_block_children(soup):
for child in iter_block_children(soup):
if isinstance(child, Tag):
top_elements.append(("tag", child.name))
elif isinstance(child, NavigableString):
Expand Down
43 changes: 29 additions & 14 deletions confluence-mdx/bin/reverse_sync/mapping_recorder.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,13 @@ class BlockMapping:

HEADING_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}

# Macro names that are treated as callouts.
CALLOUT_MACRO_NAMES = frozenset({'tip', 'info', 'note', 'warning', 'panel'})

# Backward-compat alias for the former private name.
_CALLOUT_MACRO_NAMES = CALLOUT_MACRO_NAMES

def _get_text_with_emoticons(element) -> str:

def get_text_with_emoticons(element) -> str:
"""get_text()와 동일하지만 ac:emoticon의 fallback 텍스트를 포함한다.

Confluence의 <ac:emoticon> 태그는 self-closing으로 텍스트 노드가 없어서
Expand All @@ -38,11 +41,14 @@ def _get_text_with_emoticons(element) -> str:
if fallback:
parts.append(fallback)
else:
parts.append(_get_text_with_emoticons(item))
parts.append(get_text_with_emoticons(item))
return ''.join(parts)

# backward-compat alias
_get_text_with_emoticons = get_text_with_emoticons


def _iter_block_children(parent):
def iter_block_children(parent):
"""블록 레벨 자식을 순회한다. ac:layout은 cell 내부로 진입한다."""
for child in parent.children:
if isinstance(child, Tag) and child.name == 'ac:layout':
Expand All @@ -52,14 +58,17 @@ def _iter_block_children(parent):
else:
yield child

# backward-compat alias
_iter_block_children = iter_block_children


def record_mapping(xhtml: str) -> List[BlockMapping]:
"""XHTML에서 블록 레벨 요소를 추출하여 매핑 레코드를 생성한다."""
soup = BeautifulSoup(xhtml, 'html.parser')
mappings: List[BlockMapping] = []
counters: dict = {}

for child in _iter_block_children(soup):
for child in iter_block_children(soup):
if isinstance(child, NavigableString):
if child.strip():
_add_mapping(mappings, counters, 'p', child.strip(), child.strip())
Expand Down Expand Up @@ -93,24 +102,24 @@ def record_mapping(xhtml: str) -> List[BlockMapping]:
block_type='code')
else:
# Callout 매크로: body 텍스트만 추출 (파라미터 메타데이터 제외)
if macro_name in _CALLOUT_MACRO_NAMES:
if macro_name in CALLOUT_MACRO_NAMES:
rich_body = child.find('ac:rich-text-body')
plain = _get_text_with_emoticons(rich_body) if rich_body else child.get_text()
plain = get_text_with_emoticons(rich_body) if rich_body else child.get_text()
else:
plain = child.get_text()
_add_mapping(mappings, counters, f'macro-{macro_name}', str(child), plain,
block_type='html_block')
# Callout 매크로: 자식 요소 개별 매핑 추가
if macro_name in _CALLOUT_MACRO_NAMES:
if macro_name in CALLOUT_MACRO_NAMES:
parent_mapping = mappings[-1]
_add_rich_text_body_children(
child, parent_mapping, mappings, counters)
elif tag_name == 'ac:adf-extension':
panel_type = _get_adf_panel_type(child)
panel_type = get_adf_panel_type(child)
plain = child.get_text()
_add_mapping(mappings, counters, tag_name, str(child), plain,
block_type='html_block')
if panel_type in _CALLOUT_MACRO_NAMES:
if panel_type in CALLOUT_MACRO_NAMES:
parent_mapping = mappings[-1]
_add_adf_content_children(
child, parent_mapping, mappings, counters)
Expand Down Expand Up @@ -172,7 +181,7 @@ def _add_container_children(
child_counters[tag] = child_counters.get(tag, 0) + 1
child_xpath = f"{parent_xpath}/{tag}[{child_counters[tag]}]"

plain = _get_text_with_emoticons(child)
plain = get_text_with_emoticons(child)
if tag in ('ul', 'ol', 'table'):
inner = str(child)
else:
Expand Down Expand Up @@ -206,7 +215,7 @@ def _add_rich_text_body_children(
_add_container_children(rich_body, parent_mapping, mappings, counters)


def _get_adf_panel_type(element: Tag) -> str:
def get_adf_panel_type(element: Tag) -> str:
"""ac:adf-extension 요소에서 panel-type을 추출한다."""
node = element.find('ac:adf-node')
if node is None:
Expand All @@ -216,14 +225,20 @@ def _get_adf_panel_type(element: Tag) -> str:
return ''
return attr.get_text().strip()

# backward-compat alias
_get_adf_panel_type = get_adf_panel_type

def get_adf_content_body(element: Tag):
    """Find the ``ac:adf-content`` element inside an ``ac:adf-extension`` element.

    Args:
        element: The ``ac:adf-extension`` tag to search.

    Returns:
        The result of looking up ``ac:adf-content`` under the first
        ``ac:adf-node`` found in ``element``, or ``None`` when ``element``
        contains no ``ac:adf-node``.
    """
    node = element.find('ac:adf-node')
    if node is None:
        return None
    return node.find('ac:adf-content')


# Backward-compat alias for the former private name.
_get_adf_content_body = get_adf_content_body


def _add_adf_content_children(
adf_element: Tag,
Expand All @@ -232,5 +247,5 @@ def _add_adf_content_children(
counters: dict,
):
"""ac:adf-extension의 ac:adf-content 내 자식 요소를 개별 매핑으로 추가한다."""
content_body = _get_adf_content_body(adf_element)
content_body = get_adf_content_body(adf_element)
_add_container_children(content_body, parent_mapping, mappings, counters)
62 changes: 2 additions & 60 deletions confluence-mdx/bin/reverse_sync/mdx_to_storage_xhtml_verify.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,32 +14,10 @@
from bs4 import BeautifulSoup
from mdx_to_storage import emit_document, parse_mdx
from mdx_to_storage.link_resolver import LinkResolver
from reverse_sync.xhtml_normalizer import normalize_soup
from xhtml_beautify_diff import beautify_xhtml, xhtml_diff


# Attribute names that _strip_ignored_attributes deletes from every tag
# while normalizing XHTML.
_IGNORED_ATTRIBUTES = {
"ac:macro-id",
"ac:local-id",
"local-id",
"ac:schema-version",
"ri:version-at-save",
"ac:original-height",
"ac:original-width",
"ac:custom-width",
"ac:alt",
"ac:layout",
"data-table-width",
"data-layout",
"data-highlight-colour",
"data-card-appearance",
"ac:breakout-mode",
"ac:breakout-width",
"ri:space-key",
"style",
"class",
}


@dataclass
class CaseVerification:
case_id: str
Expand Down Expand Up @@ -77,10 +55,7 @@ def mdx_to_storage_xhtml_fragment(

def _normalize_xhtml(xhtml: str, ignore_ri_filename: bool = False) -> str:
    """Return a normalized, beautified form of ``xhtml``.

    The markup is parsed with ``html.parser``, run through the ``_strip_*``
    helpers and ``normalize_soup``, then beautified and stripped of
    surrounding whitespace. ``ignore_ri_filename`` is forwarded to the
    attribute-stripping step and to ``normalize_soup``.
    """
    tree = BeautifulSoup(xhtml, "html.parser")
    # NOTE(review): the four _strip_* passes and normalize_soup may overlap —
    # confirm whether both are still required.
    _strip_layout_sections(tree)
    _strip_nonreversible_macros(tree)
    _strip_decorations(tree)
    _strip_ignored_attributes(tree, ignore_ri_filename=ignore_ri_filename)
    normalize_soup(tree, ignore_ri_filename=ignore_ri_filename)
    pretty = beautify_xhtml(str(tree))
    return pretty.strip()


Expand All @@ -106,39 +81,6 @@ def verify_expected_mdx_against_page_xhtml(
return False, generated, "\n".join(diff_lines)


def _strip_ignored_attributes(soup: BeautifulSoup, ignore_ri_filename: bool = False) -> None:
    """Delete every attribute named in ``_IGNORED_ATTRIBUTES`` from all tags, in place.

    When ``ignore_ri_filename`` is true, ``ri:filename`` is deleted as well.
    """
    drop = set(_IGNORED_ATTRIBUTES)
    if ignore_ri_filename:
        drop.add("ri:filename")
    for element in soup.find_all(True):
        # Collect the names first so the attribute dict is not mutated while iterating.
        doomed = [name for name in element.attrs if name in drop]
        for name in doomed:
            del element.attrs[name]


def _strip_layout_sections(soup: BeautifulSoup) -> None:
    """Unwrap ``ac:layout``, ``ac:layout-section`` and ``ac:layout-cell`` tags in place."""
    layout_tag_names = ("ac:layout", "ac:layout-section", "ac:layout-cell")
    for name in layout_tag_names:
        wrappers = soup.find_all(name)
        for wrapper in wrappers:
            wrapper.unwrap()


def _strip_nonreversible_macros(soup: BeautifulSoup) -> None:
    """Remove ``ac:structured-macro`` tags named ``toc`` or ``view-file``, in place."""
    removable = {"toc", "view-file"}
    for element in soup.find_all("ac:structured-macro"):
        if element.get("ac:name") not in removable:
            continue
        element.decompose()


def _strip_decorations(soup: BeautifulSoup) -> None:
    """Strip decorative markup from ``soup`` in place.

    Unwraps ``ac:adf-mark`` and ``ac:inline-comment-marker`` tags, removes
    every ``colgroup``, and removes ``p`` tags that have neither text nor
    child tags.
    """
    marker_names = ("ac:adf-mark", "ac:inline-comment-marker")
    for name in marker_names:
        for marker in soup.find_all(name):
            marker.unwrap()

    for column_group in soup.find_all("colgroup"):
        column_group.decompose()

    for paragraph in soup.find_all("p"):
        if paragraph.get_text(strip=True):
            continue
        if paragraph.find_all(True):
            continue
        paragraph.decompose()


def iter_testcase_dirs(testcases_dir: Path) -> Iterable[Path]:
"""`page.xhtml`과 `expected.mdx`가 있는 테스트케이스 디렉토리를 순회한다."""
for child in sorted(testcases_dir.iterdir()):
Expand Down
Loading
Loading