Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 140 additions & 5 deletions confluence-mdx/bin/reverse_sync/patch_builder.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,21 @@
"""패치 빌더 — MDX diff 변경과 XHTML 매핑을 결합하여 XHTML 패치를 생성."""
from typing import Dict, List, Optional

from mdx_to_storage.emitter import emit_block
from mdx_to_storage.parser import parse_mdx
from reverse_sync.block_diff import BlockChange, NON_CONTENT_TYPES
from reverse_sync.mapping_recorder import BlockMapping
from mdx_to_storage.parser import Block as MdxBlock
from text_utils import (
normalize_mdx_to_plain, collapse_ws,
)
from reverse_sync.text_transfer import transfer_text_changes
from reverse_sync.sidecar import find_mapping_by_sidecar, SidecarEntry
from reverse_sync.sidecar import (
RoundtripSidecar,
SidecarBlock,
find_mapping_by_sidecar,
SidecarEntry,
)
from reverse_sync.lost_info_patcher import apply_lost_info, distribute_lost_info_to_mappings
from reverse_sync.mdx_to_xhtml_inline import mdx_block_to_xhtml_element, mdx_block_to_inner_xhtml
from reverse_sync.list_patcher import (
Expand All @@ -22,6 +29,81 @@
)


# Block types that _is_clean_block() accepts as soon as a mapping exists,
# without consulting sidecar data or the mapped XHTML text.
_CLEAN_BLOCK_TYPES = frozenset(("heading", "code_block", "hr"))


def _contains_preserved_anchor_markup(xhtml_text: str) -> bool:
"""preservation unit이 있으면 clean whole-fragment replacement 대상이 아니다."""
return "<ac:" in xhtml_text or "<ri:" in xhtml_text


def _is_clean_block(
    block_type: str,
    mapping: Optional[BlockMapping],
    sidecar_block: Optional[SidecarBlock],
) -> bool:
    """Decide whether a block qualifies as a Phase 2 clean block.

    Rules, evaluated in order:

    * no mapping -> not clean;
    * block type listed in ``_CLEAN_BLOCK_TYPES`` -> clean;
    * a sidecar block is available -> clean only when its reconstruction
      record has kind ``"paragraph"`` and an empty ``"anchors"`` list;
    * otherwise -> clean only for a ``"paragraph"`` block whose mapped XHTML
      contains no preserved anchor markup.
    """
    if mapping is None:
        return False
    if block_type in _CLEAN_BLOCK_TYPES:
        return True

    if sidecar_block is None:
        # Without sidecar data, fall back to inspecting the mapped XHTML.
        return block_type == "paragraph" and not _contains_preserved_anchor_markup(
            mapping.xhtml_text
        )

    reconstruction = sidecar_block.reconstruction
    if reconstruction is None or reconstruction.get("kind") != "paragraph":
        return False
    return len(reconstruction.get("anchors", [])) == 0


def _can_replace_table_fragment(
    change: BlockChange,
    mapping: Optional[BlockMapping],
    roundtrip_sidecar: Optional[RoundtripSidecar],
) -> bool:
    """Decide whether a table-like block can be patched by whole-fragment replacement.

    Args:
        change: The diff entry being processed; either of its ``old_block`` /
            ``new_block`` may be missing.
        mapping: The XHTML mapping matched to the change, if any.
        roundtrip_sidecar: The roundtrip sidecar; replacement is only
            considered when one is supplied.

    Returns:
        False when there is no sidecar, no mapping, or the mapped XHTML
        contains preserved anchor markup. Otherwise a truthy value when the
        block (new block preferred, old block as fallback) is an
        ``html_block`` starting with ``<table``, or when the old block's
        content is a markdown table.
    """
    if roundtrip_sidecar is None or mapping is None:
        return False
    if _contains_preserved_anchor_markup(mapping.xhtml_text):
        return False

    block = change.new_block or change.old_block
    if block is None:
        # Neither side of the change carries a block; nothing to replace.
        return False
    if block.type == "html_block" and block.content.lstrip().startswith("<table"):
        return True

    # The markdown-table check reads the old block, which is absent for a
    # change that only has a new block; guard instead of raising.
    old_block = change.old_block
    return old_block is not None and is_markdown_table(old_block.content)


def _emit_replacement_fragment(block: MdxBlock) -> str:
    """Render the block's content as an XHTML fragment.

    The content is re-parsed with ``parse_mdx`` and blocks of type ``"empty"``
    are discarded. When exactly one block remains it is rendered with
    ``emit_block``; in every other case the original *block* is rendered with
    ``mdx_block_to_xhtml_element``.
    """
    meaningful = [
        candidate
        for candidate in parse_mdx(block.content)
        if candidate.type != "empty"
    ]
    if len(meaningful) != 1:
        return mdx_block_to_xhtml_element(block)
    (only_block,) = meaningful
    return emit_block(only_block)


def _build_replace_fragment_patch(
    mapping: BlockMapping,
    new_block: MdxBlock,
    mapping_lost_info: Optional[dict] = None,
) -> Dict[str, str]:
    """Build a ``replace_fragment`` patch for the mapped XHTML element.

    Args:
        mapping: Mapping whose ``xhtml_xpath`` identifies the element to replace.
        new_block: Block rendered into the replacement fragment.
        mapping_lost_info: Optional dict keyed by mapping ``block_id``; a
            non-empty entry for this mapping is applied to the fragment via
            ``apply_lost_info``.

    Returns:
        A patch dict with ``action``, ``xhtml_xpath`` and ``new_element_xhtml``.
    """
    fragment = _emit_replacement_fragment(new_block)

    lost_info_by_block = mapping_lost_info or {}
    lost_for_block = lost_info_by_block.get(mapping.block_id, {})
    if lost_for_block:
        fragment = apply_lost_info(fragment, lost_for_block)

    patch = {
        "action": "replace_fragment",
        "xhtml_xpath": mapping.xhtml_xpath,
        "new_element_xhtml": fragment,
    }
    return patch


def _flush_containing_changes(
containing_changes: dict,
used_ids: 'set | None' = None,
Expand Down Expand Up @@ -105,12 +187,18 @@ def build_patches(
xpath_to_mapping: Dict[str, 'BlockMapping'],
alignment: Optional[Dict[int, int]] = None,
page_lost_info: Optional[dict] = None,
roundtrip_sidecar: Optional[RoundtripSidecar] = None,
) -> List[Dict[str, str]]:
"""diff 변경과 매핑을 결합하여 XHTML 패치 목록을 구성한다.

sidecar 인덱스를 사용하여 O(1) 직접 조회를 수행한다.
"""
patches = []
xpath_to_sidecar_block: Dict[str, SidecarBlock] = {}
if roundtrip_sidecar is not None:
xpath_to_sidecar_block = {
block.xhtml_xpath: block for block in roundtrip_sidecar.blocks
}
used_ids: set = set() # 이미 매칭된 mapping block_id (중복 매칭 방지)
# child → parent 역참조 맵 (부모-자식 간 중복 매칭 방지)
child_to_parent: dict = {}
Expand Down Expand Up @@ -154,6 +242,22 @@ def _mark_used(block_id: str, m: BlockMapping):
idx, mdx_to_sidecar, xpath_to_mapping)
if mapping is None:
continue
sidecar_block = xpath_to_sidecar_block.get(mapping.xhtml_xpath)
if _is_clean_block(
add_change.new_block.type,
mapping,
sidecar_block,
) or _can_replace_table_fragment(del_change, mapping, roundtrip_sidecar):
patches.append(
_build_replace_fragment_patch(
mapping,
add_change.new_block,
mapping_lost_info,
)
)
_paired_indices.add(idx)
_mark_used(mapping.block_id, mapping)
continue
old_plain = normalize_mdx_to_plain(
del_change.old_block.content, del_change.old_block.type)
new_plain = normalize_mdx_to_plain(
Expand Down Expand Up @@ -211,10 +315,20 @@ def _mark_used(block_id: str, m: BlockMapping):
continue

if strategy == 'table':
patches.extend(
build_table_row_patches(
change, mappings, used_ids,
mdx_to_sidecar, xpath_to_mapping))
if _can_replace_table_fragment(change, mapping, roundtrip_sidecar):
_mark_used(mapping.block_id, mapping)
patches.append(
_build_replace_fragment_patch(
mapping,
change.new_block,
mapping_lost_info,
)
)
else:
patches.extend(
build_table_row_patches(
change, mappings, used_ids,
mdx_to_sidecar, xpath_to_mapping))
continue

new_plain = normalize_mdx_to_plain(
Expand All @@ -236,6 +350,27 @@ def _mark_used(block_id: str, m: BlockMapping):
and collapse_ws(new_plain) == collapse_ws(mapping.xhtml_plain_text)):
continue

sidecar_block = xpath_to_sidecar_block.get(mapping.xhtml_xpath)
if _can_replace_table_fragment(change, mapping, roundtrip_sidecar):
patches.append(
_build_replace_fragment_patch(
mapping,
change.new_block,
mapping_lost_info,
)
)
continue

if _is_clean_block(change.old_block.type, mapping, sidecar_block):
patches.append(
_build_replace_fragment_patch(
mapping,
change.new_block,
mapping_lost_info,
)
)
continue

# 재생성 시 소실되는 XHTML 요소 포함 시 텍스트 전이로 폴백
if ('<ac:link' in mapping.xhtml_text
or '<ri:attachment' in mapping.xhtml_text):
Expand Down
34 changes: 31 additions & 3 deletions confluence-mdx/bin/reverse_sync/xhtml_patcher.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""XHTML Patcher — 매핑과 diff를 이용해 XHTML의 텍스트를 패치한다."""
"""XHTML Patcher — fragment 단위 DOM patch를 적용한다."""
from typing import List, Dict
from bs4 import BeautifulSoup, NavigableString, Tag
import difflib
Expand All @@ -12,10 +12,11 @@ def patch_xhtml(xhtml: str, patches: List[Dict[str, str]]) -> str:
Args:
xhtml: 원본 XHTML 문자열
patches: 패치 목록. 각 패치는 dict:
- action: "modify" (기본) | "delete" | "insert"
- action: "modify" (기본) | "delete" | "insert" | "replace_fragment"
- modify: xhtml_xpath, old_plain_text, new_plain_text 또는 new_inner_xhtml
- delete: xhtml_xpath
- insert: after_xpath (None이면 맨 앞), new_element_xhtml
- replace_fragment: xhtml_xpath, new_element_xhtml

Returns:
패치된 XHTML 문자열
Expand All @@ -25,6 +26,7 @@ def patch_xhtml(xhtml: str, patches: List[Dict[str, str]]) -> str:
# 패치를 action별로 분류
delete_patches = [p for p in patches if p.get('action') == 'delete']
insert_patches = [p for p in patches if p.get('action') == 'insert']
replace_patches = [p for p in patches if p.get('action') == 'replace_fragment']
modify_patches = [p for p in patches
if p.get('action', 'modify') == 'modify']

Expand All @@ -51,6 +53,12 @@ def patch_xhtml(xhtml: str, patches: List[Dict[str, str]]) -> str:
if el is not None:
resolved_modifies.append((el, p))

resolved_replacements = []
for p in replace_patches:
el = _find_element_by_xpath(soup, p['xhtml_xpath'])
if el is not None:
resolved_replacements.append((el, p))

# 1단계: delete
for element in resolved_deletes:
element.decompose()
Expand All @@ -59,7 +67,11 @@ def patch_xhtml(xhtml: str, patches: List[Dict[str, str]]) -> str:
for anchor, patch in resolved_inserts:
_insert_element_resolved(soup, anchor, patch['new_element_xhtml'])

# 3단계: modify
# 3단계: replace fragment
for element, patch in resolved_replacements:
_replace_element_resolved(element, patch['new_element_xhtml'])

# 4단계: modify
for element, patch in resolved_modifies:
if 'new_inner_xhtml' in patch:
old_text = patch.get('old_plain_text', '')
Expand Down Expand Up @@ -160,6 +172,22 @@ def _replace_inner_html(element: Tag, new_inner_xhtml: str):
element.append(child.extract())


def _replace_element_resolved(element: Tag, new_html: str):
    """Swap *element* for the top-level nodes parsed from *new_html*.

    The nodes take the element's position in document order. When *new_html*
    parses to no nodes at all, the element is simply removed.
    """
    fragment = BeautifulSoup(new_html, 'html.parser')
    # Snapshot the children first: extract() detaches each node from the
    # parsed fragment while we are walking it.
    nodes = [node.extract() for node in list(fragment.children)]
    if len(nodes) == 0:
        element.decompose()
        return

    element.replace_with(nodes[0])
    # Chain every remaining node directly after its predecessor.
    for anchor, node in zip(nodes, nodes[1:]):
        anchor.insert_after(node)


def _find_element_by_xpath(soup: BeautifulSoup, xpath: str):
"""간이 XPath로 요소를 찾는다.

Expand Down
5 changes: 4 additions & 1 deletion confluence-mdx/bin/reverse_sync_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,7 @@ def run_verify(
from reverse_sync.sidecar import (
SidecarEntry, SidecarChildEntry, generate_sidecar_mapping,
build_mdx_to_sidecar_index, build_xpath_to_mapping,
build_sidecar,
)
# forward converter가 생성한 mapping.yaml에서 lost_info를 보존
existing_mapping = var_dir / 'mapping.yaml'
Expand Down Expand Up @@ -386,12 +387,14 @@ def run_verify(
children=children,
))
mdx_to_sidecar = build_mdx_to_sidecar_index(sidecar_entries)
roundtrip_sidecar = build_sidecar(xhtml, original_mdx, page_id=page_id)
xpath_to_mapping = build_xpath_to_mapping(original_mappings)

# Step 4: XHTML 패치 → patched.xhtml 저장
patches = build_patches(changes, original_blocks, improved_blocks,
original_mappings, mdx_to_sidecar, xpath_to_mapping,
alignment, page_lost_info=page_lost_info)
alignment, page_lost_info=page_lost_info,
roundtrip_sidecar=roundtrip_sidecar)
patched_xhtml = patch_xhtml(xhtml, patches)
(var_dir / 'reverse-sync.patched.xhtml').write_text(patched_xhtml)

Expand Down
29 changes: 19 additions & 10 deletions confluence-mdx/tests/test_reverse_sync_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,8 +567,8 @@ def testbuild_patches_index_mapping():

assert len(patches) == 1
assert patches[0]['xhtml_xpath'] == 'p[1]'
assert patches[0]['old_plain_text'] == 'Old text.'
assert patches[0]['new_inner_xhtml'] == 'New text.'
assert patches[0]['action'] == 'replace_fragment'
assert patches[0]['new_element_xhtml'] == '<p>New text.</p>'


def testbuild_patches_skips_non_content():
Expand Down Expand Up @@ -787,7 +787,12 @@ def testbuild_patches_table_block():
from reverse_sync.mdx_block_parser import MdxBlock
from reverse_sync.block_diff import BlockChange
from reverse_sync.mapping_recorder import BlockMapping
from reverse_sync.sidecar import SidecarEntry
from reverse_sync.sidecar import (
DocumentEnvelope,
RoundtripSidecar,
SidecarBlock,
SidecarEntry,
)

old_table = '<table>\n<th>\n**Databased Access Control**\n</th>\n</table>\n'
new_table = '<table>\n<th>\n**Database Access Control**\n</th>\n</table>\n'
Expand All @@ -809,19 +814,23 @@ def testbuild_patches_table_block():
mdx_to_sidecar = {
0: SidecarEntry(xhtml_xpath='table[1]', xhtml_type='table', mdx_blocks=[0]),
}
roundtrip_sidecar = RoundtripSidecar(
page_id='test',
blocks=[SidecarBlock(0, 'table[1]', '<table>...</table>', 'hash1', (1, 5))],
separators=[],
document_envelope=DocumentEnvelope(prefix='', suffix='\n'),
)
xpath_to_mapping = {m.xhtml_xpath: m for m in mappings}

patches = build_patches(changes, original_blocks, improved_blocks, mappings,
mdx_to_sidecar, xpath_to_mapping)
mdx_to_sidecar, xpath_to_mapping,
roundtrip_sidecar=roundtrip_sidecar)

assert len(patches) == 1
assert patches[0]['xhtml_xpath'] == 'table[1]'
assert patches[0]['old_plain_text'] == 'Databased Access Control'
# bold content가 변경되어 has_inline_format_change()가 True →
# new_inner_xhtml 패치가 생성됨 (outer <table> 없이 innerHTML만 포함)
assert 'new_inner_xhtml' in patches[0]
assert '<strong>Database Access Control</strong>' in patches[0]['new_inner_xhtml']
assert not patches[0]['new_inner_xhtml'].startswith('<table')
assert patches[0]['action'] == 'replace_fragment'
assert '<strong>Database Access Control</strong>' in patches[0]['new_element_xhtml']
assert patches[0]['new_element_xhtml'].startswith('<table')


# --- sidecar 전용 매칭 코드 경로 테스트 ---
Expand Down
Loading
Loading