Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions scielo_classic_website/htmlbody/html_merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ def process_single_link(self, link_element: etree.Element, base_path: str) -> Op
# Erro ao processar ou circular ou já processado - cria xref
return self.create_xref_element(file_path, anchor, link_element)

def process_html_internal(self, html_content: str, base_path: str = None) -> etree.Element:
def process_html_internal(self, html_content: str, base_path: str = None) -> Optional[etree.Element]:
"""Processa HTML internamente (para recursão)."""
try:
# Parse do HTML
Expand All @@ -224,7 +224,10 @@ def process_html_internal(self, html_content: str, base_path: str = None) -> etr

except Exception as e:
print(f"Erro ao processar HTML: {e}")
return html_content
# Retorna None em caso de falha; o conteúdo original (string) não
# é um elemento etree e quebraria os chamadores que esperam um
# Element (ex.: parent.replace, .tag, .set).
return None

def process_html(self, html_content: str, base_path: str = None) -> str:
"""Processa HTML incorporando referências locais (API pública)."""
Expand Down
53 changes: 44 additions & 9 deletions scielo_classic_website/spsxml/sps_xml_body_pipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,21 @@ def precond(data):
@plumber.precondition(precond)
def transform(self, data):
raw = data
xml = ET.fromstring(raw.xml_body_and_back[-1])
content = raw.xml_body_and_back[-1]
try:
xml = ET.fromstring(content)
except ET.XMLSyntaxError:
# Tenta recuperar XML malformado (ex: comentários inválidos,
# condicionais MS Office residuais) usando o parser de recuperação
# do lxml para evitar falha total da etapa.
if isinstance(content, str):
content_bytes = content.encode("utf-8")
else:
content_bytes = content
recover_parser = ET.XMLParser(recover=True)
xml = ET.fromstring(content_bytes, parser=recover_parser)
if xml is None:
raise
_report(xml, func_name=type(self))
return data, xml

Expand Down Expand Up @@ -1858,8 +1872,8 @@ def transform(self, data):
return data

def merge(self, journal_acron_folder, html_reader, xml):
try:
for body in xml.xpath(".//body"):
for body in xml.xpath(".//body"):
try:
body_str = ET.tostring(body, encoding="iso-8859-1").decode("iso-8859-1")
input_html = f"<html>{body_str}</html>"
new_body = merge_html(
Expand All @@ -1868,10 +1882,19 @@ def merge(self, journal_acron_folder, html_reader, xml):
encoding="iso-8859-1",
content_reader=html_reader
)
if not isinstance(new_body, ET._Element):
continue
parent = body.getparent()
if parent is None:
continue
parent.replace(body, new_body)

for back in xml.xpath(".//back"):
except Exception as e:
logging.error(f"MarkHTMLFileToEmbedPipe - error processing body: {e}")
logging.exception(e)

for back in xml.xpath(".//back"):
original_tag = back.tag
try:
back.tag = "body"
back_str = ET.tostring(back, encoding="iso-8859-1").decode("iso-8859-1")
input_html = f"<html>{back_str}</html>"
Expand All @@ -1881,12 +1904,24 @@ def merge(self, journal_acron_folder, html_reader, xml):
encoding="iso-8859-1",
content_reader=html_reader
)
new_back.tag = "back"
if not isinstance(new_back, ET._Element):
# Restaura tag original se a mesclagem falhou
back.tag = original_tag
continue
new_back.tag = original_tag
parent = back.getparent()
if parent is None:
back.tag = original_tag
continue
parent.replace(back, new_back)
except Exception as e:
logging.error(f"MarkHTMLFileToEmbedPipe - error processing html embedding: {e}")
logging.exception(e)
except Exception as e:
# Garante que a tag original seja restaurada mesmo em caso de erro
try:
back.tag = original_tag
except Exception:
pass
logging.error(f"MarkHTMLFileToEmbedPipe - error processing back: {e}")
logging.exception(e)


class XMLBodyCenterPipe(plumber.Pipe):
Expand Down
126 changes: 126 additions & 0 deletions tests/test_sps_xml_body_step_30.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
"""
Testes para a etapa `convert_html_to_xml_step_30_embed_html` do pipeline
de conversão de HTML para XML (terceira chamada da sequência em
`convert_html_to_xml`).

Cobre os comportamentos de robustez introduzidos para evitar que a etapa
falhe com `XMLSyntaxError` (originando o erro reportado no Article Proc do
serviço migrador) e para preservar o estado do documento quando a mesclagem
de HTML embutido falhar parcialmente.
"""
from unittest import TestCase
from unittest import mock

from lxml import etree as ET

from scielo_classic_website.spsxml import sps_xml_body_pipes
from scielo_classic_website.spsxml.sps_xml_body_pipes import (
MarkHTMLFileToEmbedPipe,
StartPipe,
)


class _Journal:
    """Minimal journal stand-in exposing only the ``acronym`` attribute."""

    def __init__(self, acronym: str = "abc") -> None:
        # Stored as a plain attribute; the test doubles need nothing more.
        setattr(self, "acronym", acronym)


class _Raw:
    """Minimal raw-document stand-in consumed by the pipes under test.

    Attributes set:
        xml_body_and_back: the given value, or a new empty list when the
            given value is falsy.
        journal: the given journal object (may be ``None``).
        html_reader: only defined when a non-``None`` reader is supplied.
    """

    def __init__(self, xml_body_and_back=None, journal=None, html_reader=None):
        if xml_body_and_back:
            versions = xml_body_and_back
        else:
            # Any falsy value (None, empty list, ...) becomes a fresh list.
            versions = []
        self.xml_body_and_back = versions
        self.journal = journal
        # The attribute is deliberately left undefined when no reader is given.
        if html_reader is None:
            return
        self.html_reader = html_reader


class TestStartPipeRecover(TestCase):
    """Checks ``StartPipe.transform`` on well-formed and on malformed XML."""

    def _run_start_pipe(self, source):
        """Feed ``source`` as the latest body/back version and return the parsed tree."""
        document = _Raw(xml_body_and_back=[source])
        _, tree = StartPipe().transform(document)
        return tree

    def _assert_article_with_ok_paragraph(self, tree):
        """Assert the root is <article> and its first <p> holds the text "ok"."""
        self.assertEqual(tree.tag, "article")
        self.assertEqual(tree.find(".//p").text, "ok")

    def test_parses_valid_xml(self):
        tree = self._run_start_pipe("<article><body><p>ok</p></body></article>")
        self._assert_article_with_ok_paragraph(tree)

    def test_recovers_from_invalid_comment_with_double_hyphen(self):
        # The comment below is invalid XML because it contains an inner "--";
        # content pasted from the MS clipboard can introduce this pattern.
        # Before the fix this made step 30 fail with XMLSyntaxError.
        malformed = (
            "<article><body><p>ok</p>"
            "<!--EndF>><!--EndFragment--></body></article>"
        )
        tree = self._run_start_pipe(malformed)
        self._assert_article_with_ok_paragraph(tree)


class TestMarkHTMLFileToEmbedPipeMerge(TestCase):
    """Robustness tests for the <body>/<back> merge of ``MarkHTMLFileToEmbedPipe``."""

    def _build_xml(self):
        """Return a minimal article tree holding one <body> and one <back>."""
        return ET.fromstring(
            "<article>"
            "<body><p>body content</p></body>"
            "<back><ref>r1</ref></back>"
            "</article>"
        )

    def test_back_tag_preserved_when_merge_html_returns_none(self):
        """If merge_html fails and returns None, the <back> tag must be
        restored; before the fix it was left renamed as <body>."""
        raw = _Raw(journal=_Journal())
        xml = self._build_xml()
        with mock.patch.object(sps_xml_body_pipes, "merge_html", return_value=None):
            MarkHTMLFileToEmbedPipe().transform((raw, xml))
        self.assertIsNotNone(xml.find(".//back"))
        self.assertIsNotNone(xml.find(".//body"))

    def test_back_tag_preserved_when_merge_html_raises(self):
        """If merge_html raises while processing <back>, the original tag
        must be restored and the transformation must finish without
        propagating the error."""
        raw = _Raw(journal=_Journal())
        xml = self._build_xml()
        with mock.patch.object(
            sps_xml_body_pipes, "merge_html", side_effect=RuntimeError("boom")
        ):
            # Must not raise.
            MarkHTMLFileToEmbedPipe().transform((raw, xml))
        self.assertIsNotNone(xml.find(".//back"))
        self.assertIsNotNone(xml.find(".//body"))

    def test_back_failure_does_not_block_body_processing(self):
        """A failure while merging <back> must not prevent the successful
        replacement of <body>."""
        raw = _Raw(journal=_Journal())
        xml = self._build_xml()

        def fake_merge_html(input_html, *args, **kwargs):
            # merge() retags <back> as <body> before serializing it, so the
            # payload never contains "<back"; the back section can only be
            # recognised by its content.
            if "<ref>r1</ref>" in input_html:
                raise RuntimeError("back failure")
            # For the real <body>, return a valid replacement element.
            return ET.fromstring("<body><p>merged body</p></body>")

        with mock.patch.object(
            sps_xml_body_pipes, "merge_html", side_effect=fake_merge_html
        ):
            MarkHTMLFileToEmbedPipe().transform((raw, xml))

        # <body> was replaced successfully.
        self.assertEqual(xml.find(".//body/p").text, "merged body")
        # <back> kept its tag and its original content, which proves the
        # failure path ran instead of a silent replacement.
        back = xml.find(".//back")
        self.assertIsNotNone(back)
        self.assertEqual(back.find("ref").text, "r1")
        self.assertIsNone(back.find("p"))


class TestHTMLMergerInternalReturnsNoneOnError(TestCase):
    """Checks that ``HTMLMerger.process_html_internal`` yields None on failure."""

    def test_process_html_internal_returns_none_on_parse_failure(self):
        from scielo_classic_website.htmlbody.html_merger import HTMLMerger

        merger = HTMLMerger()
        # Force the parser entry point used by the merger module to blow up.
        exploding_parser = mock.patch(
            "scielo_classic_website.htmlbody.html_merger.html.fromstring",
            side_effect=RuntimeError("boom"),
        )
        with exploding_parser:
            outcome = merger.process_html_internal("<html><body/></html>")
        self.assertIsNone(outcome)