Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 142 additions & 21 deletions confluence-mdx/bin/reverse_sync/sidecar.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
"""Sidecar 통합 모듈 — Block-level roundtrip sidecar 스키마/IO + Mapping lookup/인덱스.

Block-level sidecar (schema v2):
Block-level sidecar (schema v3):
RoundtripSidecar, SidecarBlock, DocumentEnvelope,
build_sidecar, verify_sidecar_integrity,
write_sidecar, load_sidecar, sha256_text
write_sidecar, load_sidecar, sha256_text,
find_block_by_identity

Mapping lookup (mapping.yaml v3 기반):
SidecarChildEntry, SidecarEntry, load_sidecar_mapping, build_mdx_to_sidecar_index,
Expand All @@ -28,7 +29,10 @@
# Roundtrip sidecar — block-level fragment + metadata
# ---------------------------------------------------------------------------

ROUNDTRIP_SCHEMA_VERSION = "2"
ROUNDTRIP_SCHEMA_VERSION = "3"

# v2 스키마도 로드 허용 (하위 호환)
_COMPATIBLE_SCHEMA_VERSIONS = frozenset({"2", "3"})


def sha256_text(text: str) -> str:
Expand All @@ -45,14 +49,20 @@ class DocumentEnvelope:

@dataclass
class SidecarBlock:
    """A single XHTML block together with its metadata.

    Schema v3 adds the optional ``reconstruction`` field: a dict (or
    ``None``) holding the metadata needed to rebuild the block, e.g.
    ``kind``, ``old_plain_text``, ``anchors``, ``items`` (a list) and
    ``child_blocks``.
    """

    # Index of the block within the sidecar's block list.
    block_index: int
    # XPath of the block in the source XHTML.
    xhtml_xpath: str
    # The XHTML fragment for this block.
    xhtml_fragment: str
    # SHA-256 of the aligned MDX block content; "" when no MDX block aligned.
    mdx_content_hash: str = ""
    # (line_start, line_end) of the aligned MDX block; (0, 0) when unaligned.
    mdx_line_range: tuple = (0, 0)
    # Fresh dict per instance (default_factory), never shared between blocks.
    lost_info: dict = field(default_factory=dict)
    # Reconstruction metadata (schema v3); None when absent.
    reconstruction: Optional[dict] = None


@dataclass
Expand All @@ -79,22 +89,25 @@ def reassemble_xhtml(self) -> str:

def to_dict(self) -> dict:
"""JSON 직렬화."""
blocks = []
for b in self.blocks:
d: dict = {
"block_index": b.block_index,
"xhtml_xpath": b.xhtml_xpath,
"xhtml_fragment": b.xhtml_fragment,
"mdx_content_hash": b.mdx_content_hash,
"mdx_line_range": list(b.mdx_line_range),
"lost_info": b.lost_info,
}
if b.reconstruction is not None:
d["reconstruction"] = b.reconstruction
blocks.append(d)
return {
"schema_version": self.schema_version,
"page_id": self.page_id,
"mdx_sha256": self.mdx_sha256,
"source_xhtml_sha256": self.source_xhtml_sha256,
"blocks": [
{
"block_index": b.block_index,
"xhtml_xpath": b.xhtml_xpath,
"xhtml_fragment": b.xhtml_fragment,
"mdx_content_hash": b.mdx_content_hash,
"mdx_line_range": list(b.mdx_line_range),
"lost_info": b.lost_info,
}
for b in self.blocks
],
"blocks": blocks,
"separators": self.separators,
"document_envelope": {
"prefix": self.document_envelope.prefix,
Expand All @@ -104,7 +117,7 @@ def to_dict(self) -> dict:

@staticmethod
def from_dict(data: dict) -> "RoundtripSidecar":
"""JSON 역직렬화."""
"""JSON 역직렬화. v2/v3 모두 지원."""
blocks = [
SidecarBlock(
block_index=b["block_index"],
Expand All @@ -113,6 +126,7 @@ def from_dict(data: dict) -> "RoundtripSidecar":
mdx_content_hash=b.get("mdx_content_hash", ""),
mdx_line_range=tuple(b.get("mdx_line_range", (0, 0))),
lost_info=b.get("lost_info", {}),
reconstruction=b.get("reconstruction"),
)
for b in data.get("blocks", [])
]
Expand Down Expand Up @@ -156,14 +170,63 @@ def verify_sidecar_integrity(
)


def _build_reconstruction_metadata(
fragment: str,
xhtml_type: str,
) -> Optional[dict]:
"""XHTML fragment에서 reconstruction metadata를 생성한다.

현재 지원하는 kind:
- paragraph: old_plain_text + anchors
- heading: old_plain_text
- list: old_plain_text + items (placeholder)
- code: (None — clean block)
- table: (None — clean block)
- html_block: kind + old_plain_text

Phase 3에서 anchor 분석이 추가될 예정.
"""
from reverse_sync.xhtml_normalizer import extract_plain_text

# code, table은 clean block — reconstruction metadata 불필요
if xhtml_type in ("code", "table"):
return None

plain_text = extract_plain_text(fragment)

kind_map = {
"heading": "heading",
"paragraph": "paragraph",
"list": "list",
"html_block": "container",
}
kind = kind_map.get(xhtml_type, xhtml_type)

meta: dict = {
"kind": kind,
"old_plain_text": plain_text,
}

# list는 items placeholder (Phase 3에서 실제 item 분석)
if xhtml_type == "list":
meta["items"] = []

# paragraph/heading은 anchors placeholder
if xhtml_type in ("heading", "paragraph"):
meta["anchors"] = []

return meta


def build_sidecar(
page_xhtml_text: str,
mdx_text: str,
page_id: str = "",
) -> RoundtripSidecar:
"""Block-level sidecar를 생성한다.

Fragment 추출 → MDX alignment → 무결성 검증 → RoundtripSidecar 반환.
Fragment 추출 → MDX alignment → reconstruction metadata 빌드 →
무결성 검증 → RoundtripSidecar 반환.
"""
from reverse_sync.fragment_extractor import extract_block_fragments
from reverse_sync.mapping_recorder import record_mapping
Expand All @@ -187,19 +250,23 @@ def build_sidecar(
sidecar_blocks: List[SidecarBlock] = []
for i, fragment in enumerate(frag_result.fragments):
xpath = top_mappings[i].xhtml_xpath if i < len(top_mappings) else f"unknown[{i}]"
xhtml_type = top_mappings[i].type if i < len(top_mappings) else ""

# 순차 1:1 대응 (향후 block alignment로 개선)
mdx_block = mdx_content_blocks[i] if i < len(mdx_content_blocks) else None
mdx_hash = sha256_text(mdx_block.content) if mdx_block else ""
mdx_range = (mdx_block.line_start, mdx_block.line_end) if mdx_block else (0, 0)

reconstruction = _build_reconstruction_metadata(fragment, xhtml_type)

sidecar_blocks.append(
SidecarBlock(
block_index=i,
xhtml_xpath=xpath,
xhtml_fragment=fragment,
mdx_content_hash=mdx_hash,
mdx_line_range=mdx_range,
reconstruction=reconstruction,
)
)

Expand Down Expand Up @@ -231,14 +298,18 @@ def write_sidecar(sidecar: RoundtripSidecar, path: Path) -> None:


def load_sidecar(path: Path) -> RoundtripSidecar:
    """Load a RoundtripSidecar from a JSON file.

    Every schema version listed in ``_COMPATIBLE_SCHEMA_VERSIONS`` is
    accepted. Blocks without a ``reconstruction`` key (v2 files) load with
    ``reconstruction=None``.

    Args:
        path: Location of the sidecar JSON file (read as UTF-8).

    Returns:
        The deserialized sidecar.

    Raises:
        ValueError: If the payload is not a JSON object, or its
            ``schema_version`` is missing or unsupported.
    """
    data: Any = json.loads(path.read_text(encoding="utf-8"))
    if not isinstance(data, dict):
        raise ValueError("invalid sidecar payload")
    version = data.get("schema_version")
    # The isinstance guard keeps a malformed, unhashable version (a JSON
    # list or object) on the ValueError path; a bare frozenset membership
    # test would raise TypeError for it.
    if not isinstance(version, str) or version not in _COMPATIBLE_SCHEMA_VERSIONS:
        raise ValueError(
            f"expected schema_version in {sorted(_COMPATIBLE_SCHEMA_VERSIONS)}, "
            f"got {version}"
        )
    return RoundtripSidecar.from_dict(data)

Expand Down Expand Up @@ -526,3 +597,53 @@ def find_mapping_by_sidecar(
if entry is None:
return None
return xpath_to_mapping.get(entry.xhtml_xpath)


# ---------------------------------------------------------------------------
# Block identity — hash + line_range 기반 disambiguation
# ---------------------------------------------------------------------------

@dataclass
class _IdentityKey:
"""Internal identity lookup key."""
mdx_content_hash: str
mdx_line_range: tuple


def build_block_identity_index(
    sidecar: RoundtripSidecar,
) -> Dict[str, List[SidecarBlock]]:
    """Group the sidecar's blocks by their ``mdx_content_hash``.

    Each hash maps to a list, in the order the blocks appear in
    ``sidecar.blocks``. A list is used because several blocks can share the
    same hash; callers can then tell them apart by line range. Blocks whose
    hash is empty are left out of the index.
    """
    by_hash: Dict[str, List[SidecarBlock]] = {}
    for candidate in sidecar.blocks:
        content_hash = candidate.mdx_content_hash
        if content_hash:
            bucket = by_hash.get(content_hash)
            if bucket is None:
                by_hash[content_hash] = [candidate]
            else:
                bucket.append(candidate)
    return by_hash


def find_block_by_identity(
    mdx_content_hash: str,
    mdx_line_range: tuple,
    identity_index: Dict[str, List[SidecarBlock]],
) -> Optional[SidecarBlock]:
    """Look up a SidecarBlock by content hash, using line range as tie-breaker.

    Resolution rules:
      1. No block has this hash: return ``None``.
      2. Exactly one block has this hash: return it without checking the
         line range.
      3. Several blocks share the hash: return the first one whose
         ``mdx_line_range`` equals ``mdx_line_range``, or ``None`` when
         none of them matches.
    """
    same_hash = identity_index.get(mdx_content_hash) or ()
    if len(same_hash) == 1:
        return same_hash[0]
    # Zero candidates fall through to the default; several candidates are
    # disambiguated by line range.
    return next(
        (entry for entry in same_hash if entry.mdx_line_range == mdx_line_range),
        None,
    )
10 changes: 5 additions & 5 deletions confluence-mdx/tests/test_reverse_sync_sidecar_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def test_create_sidecar(self):
separators=[],
document_envelope=DocumentEnvelope(prefix="", suffix="\n"),
)
assert sidecar.schema_version == "2"
assert sidecar.schema_version in ("2", "3")
assert sidecar.page_id == "test"
assert len(sidecar.blocks) == 1

Expand All @@ -52,7 +52,7 @@ def test_to_dict_roundtrip(self):
d = original.to_dict()
restored = RoundtripSidecar.from_dict(d)

assert restored.schema_version == "2"
assert restored.schema_version in ("2", "3")
assert restored.page_id == "123"
assert len(restored.blocks) == 2
assert restored.blocks[0].xhtml_fragment == "<h2>A</h2>"
Expand Down Expand Up @@ -145,7 +145,7 @@ def test_roundtrip(self, tmp_path):
def test_load_rejects_wrong_version(self, tmp_path):
    """load_sidecar raises ValueError for an unsupported schema_version."""
    bad_file = tmp_path / "bad.json"
    bad_file.write_text('{"schema_version": "1"}', encoding="utf-8")
    # Only the stable prefix of the message is matched.
    with pytest.raises(ValueError, match="expected schema_version in"):
        load_sidecar(bad_file)


Expand All @@ -155,7 +155,7 @@ def test_simple_case(self):
mdx = "## Title\n\nBody text\n"
sidecar = build_sidecar(xhtml, mdx, page_id="test")

assert sidecar.schema_version == "2"
assert sidecar.schema_version in ("2", "3")
assert sidecar.page_id == "test"
assert sidecar.mdx_sha256 == sha256_text(mdx)
assert sidecar.source_xhtml_sha256 == sha256_text(xhtml)
Expand Down Expand Up @@ -189,7 +189,7 @@ def test_all_testcases_build_and_verify(self, testcases_dir):
mdx = mdx_path.read_text(encoding="utf-8")
sidecar = build_sidecar(xhtml, mdx, page_id=case_dir.name)

assert sidecar.schema_version == "2"
assert sidecar.schema_version in ("2", "3")
assert len(sidecar.blocks) > 0
assert len(sidecar.separators) == len(sidecar.blocks) - 1
ok += 1
Expand Down
Loading
Loading