diff --git a/scielo_classic_website/htmlbody/html_merger.py b/scielo_classic_website/htmlbody/html_merger.py
index a2b8667..b84d46d 100644
--- a/scielo_classic_website/htmlbody/html_merger.py
+++ b/scielo_classic_website/htmlbody/html_merger.py
@@ -201,7 +201,7 @@ def process_single_link(self, link_element: etree.Element, base_path: str) -> Op
# Erro ao processar ou circular ou já processado - cria xref
return self.create_xref_element(file_path, anchor, link_element)
- def process_html_internal(self, html_content: str, base_path: str = None) -> etree.Element:
+ def process_html_internal(self, html_content: str, base_path: str = None) -> Optional[etree.Element]:
"""Processa HTML internamente (para recursão)."""
try:
# Parse do HTML
@@ -224,7 +224,10 @@ def process_html_internal(self, html_content: str, base_path: str = None) -> etr
except Exception as e:
print(f"Erro ao processar HTML: {e}")
- return html_content
+ # Retorna None em caso de falha; o conteúdo original (string) não
+ # é um elemento etree e quebraria os chamadores que esperam um
+ # Element (ex.: parent.replace, .tag, .set).
+ return None
def process_html(self, html_content: str, base_path: str = None) -> str:
"""Processa HTML incorporando referências locais (API pública)."""
diff --git a/scielo_classic_website/spsxml/sps_xml_body_pipes.py b/scielo_classic_website/spsxml/sps_xml_body_pipes.py
index 0a42fad..5a40e0d 100644
--- a/scielo_classic_website/spsxml/sps_xml_body_pipes.py
+++ b/scielo_classic_website/spsxml/sps_xml_body_pipes.py
@@ -480,7 +480,21 @@ def precond(data):
@plumber.precondition(precond)
def transform(self, data):
raw = data
- xml = ET.fromstring(raw.xml_body_and_back[-1])
+ content = raw.xml_body_and_back[-1]
+ try:
+ xml = ET.fromstring(content)
+ except ET.XMLSyntaxError:
+ # Tenta recuperar XML malformado (ex: comentários inválidos,
+ # condicionais MS Office residuais) usando o parser de recuperação
+ # do lxml para evitar falha total da etapa.
+ if isinstance(content, str):
+ content_bytes = content.encode("utf-8")
+ else:
+ content_bytes = content
+ recover_parser = ET.XMLParser(recover=True)
+ xml = ET.fromstring(content_bytes, parser=recover_parser)
+ if xml is None:
+ raise
_report(xml, func_name=type(self))
return data, xml
@@ -1858,8 +1872,8 @@ def transform(self, data):
return data
def merge(self, journal_acron_folder, html_reader, xml):
- try:
- for body in xml.xpath(".//body"):
+ for body in xml.xpath(".//body"):
+ try:
body_str = ET.tostring(body, encoding="iso-8859-1").decode("iso-8859-1")
input_html = f"{body_str}"
new_body = merge_html(
@@ -1868,10 +1882,19 @@ def merge(self, journal_acron_folder, html_reader, xml):
encoding="iso-8859-1",
content_reader=html_reader
)
+ if not isinstance(new_body, ET._Element):
+ continue
parent = body.getparent()
+ if parent is None:
+ continue
parent.replace(body, new_body)
-
- for back in xml.xpath(".//back"):
+ except Exception as e:
+ logging.error(f"MarkHTMLFileToEmbedPipe - error processing body: {e}")
+ logging.exception(e)
+
+ for back in xml.xpath(".//back"):
+ original_tag = back.tag
+ try:
back.tag = "body"
back_str = ET.tostring(back, encoding="iso-8859-1").decode("iso-8859-1")
input_html = f"{back_str}"
@@ -1881,12 +1904,24 @@ def merge(self, journal_acron_folder, html_reader, xml):
encoding="iso-8859-1",
content_reader=html_reader
)
- new_back.tag = "back"
+ if not isinstance(new_back, ET._Element):
+ # Restaura tag original se a mesclagem falhou
+ back.tag = original_tag
+ continue
+ new_back.tag = original_tag
parent = back.getparent()
+ if parent is None:
+ back.tag = original_tag
+ continue
parent.replace(back, new_back)
- except Exception as e:
- logging.error(f"MarkHTMLFileToEmbedPipe - error processing html embedding: {e}")
- logging.exception(e)
+ except Exception as e:
+ # Garante que a tag original seja restaurada mesmo em caso de erro
+ try:
+ back.tag = original_tag
+ except Exception:
+ pass
+ logging.error(f"MarkHTMLFileToEmbedPipe - error processing back: {e}")
+ logging.exception(e)
class XMLBodyCenterPipe(plumber.Pipe):
diff --git a/tests/test_sps_xml_body_step_30.py b/tests/test_sps_xml_body_step_30.py
new file mode 100644
index 0000000..2f93462
--- /dev/null
+++ b/tests/test_sps_xml_body_step_30.py
@@ -0,0 +1,126 @@
+"""
+Testes para a etapa `convert_html_to_xml_step_30_embed_html` do pipeline
+de conversão de HTML para XML (terceira chamada da sequência em
+`convert_html_to_xml`).
+
+Cobre os comportamentos de robustez introduzidos para evitar que a etapa
+falhe com `XMLSyntaxError` (originando o erro reportado no Article Proc do
+serviço migrador) e para preservar o estado do documento quando a mesclagem
+de HTML embutido falhar parcialmente.
+"""
+from unittest import TestCase
+from unittest import mock
+
+from lxml import etree as ET
+
+from scielo_classic_website.spsxml import sps_xml_body_pipes
+from scielo_classic_website.spsxml.sps_xml_body_pipes import (
+ MarkHTMLFileToEmbedPipe,
+ StartPipe,
+)
+
+
+class _Journal:
+ def __init__(self, acronym="abc"):
+ self.acronym = acronym
+
+
+class _Raw:
+ def __init__(self, xml_body_and_back=None, journal=None, html_reader=None):
+ self.xml_body_and_back = xml_body_and_back or []
+ self.journal = journal
+ if html_reader is not None:
+ self.html_reader = html_reader
+
+
+class TestStartPipeRecover(TestCase):
+ def test_parses_valid_xml(self):
+ raw = _Raw(xml_body_and_back=["<article><body><p>ok</p></body></article>"])
+ _, xml = StartPipe().transform(raw)
+ self.assertEqual(xml.tag, "article")
+ self.assertEqual(xml.find(".//p").text, "ok")
+
+ def test_recovers_from_invalid_comment_with_double_hyphen(self):
+ # Comentário inválido para XML (contém '--' interno) — clipboard do MS
+ # pode introduzir esse padrão. Antes da correção, isso quebrava o
+ # passo 30 com XMLSyntaxError.
+ raw = _Raw(
+ xml_body_and_back=[
+ "<article><body><p>ok</p></body>"
+ "<!-- invalid -- comment --></article>"
+ ]
+ )
+ _, xml = StartPipe().transform(raw)
+ self.assertEqual(xml.tag, "article")
+ self.assertEqual(xml.find(".//p").text, "ok")
+
+
+class TestMarkHTMLFileToEmbedPipeMerge(TestCase):
+ def _build_xml(self):
+ return ET.fromstring(
+ ""
+ "body content
"
+ "[r1]"
+ ""
+ )
+
+ def test_back_tag_preserved_when_merge_html_returns_none(self):
+ """Se merge_html falhar e retornar None, a tag deve ser
+ restaurada — antes da correção, ela permanecia como ."""
+ raw = _Raw(journal=_Journal())
+ xml = self._build_xml()
+ with mock.patch.object(sps_xml_body_pipes, "merge_html", return_value=None):
+ MarkHTMLFileToEmbedPipe().transform((raw, xml))
+ self.assertIsNotNone(xml.find(".//back"))
+ self.assertIsNotNone(xml.find(".//body"))
+
+ def test_back_tag_preserved_when_merge_html_raises(self):
+ """Se merge_html lançar exceção ao processar , a tag original
+ deve ser restaurada e a transformação deve concluir sem propagar."""
+ raw = _Raw(journal=_Journal())
+ xml = self._build_xml()
+ with mock.patch.object(
+ sps_xml_body_pipes, "merge_html", side_effect=RuntimeError("boom")
+ ):
+ # Não deve levantar exceção
+ MarkHTMLFileToEmbedPipe().transform((raw, xml))
+ self.assertIsNotNone(xml.find(".//back"))
+ self.assertIsNotNone(xml.find(".//body"))
+
+ def test_back_failure_does_not_block_body_processing(self):
+ """Falha em não deve impedir a substituição bem-sucedida em
+ , e vice-versa."""
+ raw = _Raw(journal=_Journal())
+ xml = self._build_xml()
+
+ new_body = ET.fromstring("merged body
")
+
+ def fake_merge_html(input_html, **kwargs):
+ if ", retorna um elemento válido
+ return ET.fromstring("merged body
")
+
+ with mock.patch.object(
+ sps_xml_body_pipes, "merge_html", side_effect=fake_merge_html
+ ):
+ MarkHTMLFileToEmbedPipe().transform((raw, xml))
+
+ # body foi substituído com sucesso
+ self.assertEqual(xml.find(".//body/p").text, "merged body")
+ # back foi preservado
+ self.assertIsNotNone(xml.find(".//back"))
+
+
+class TestHTMLMergerInternalReturnsNoneOnError(TestCase):
+ def test_process_html_internal_returns_none_on_parse_failure(self):
+ from scielo_classic_website.htmlbody.html_merger import HTMLMerger
+
+ merger = HTMLMerger()
+ # Faz a função interna do parser explodir
+ with mock.patch(
+ "scielo_classic_website.htmlbody.html_merger.html.fromstring",
+ side_effect=RuntimeError("boom"),
+ ):
+ result = merger.process_html_internal("")
+ self.assertIsNone(result)