diff --git a/scielo_classic_website/htmlbody/html_merger.py b/scielo_classic_website/htmlbody/html_merger.py index a2b8667..b84d46d 100644 --- a/scielo_classic_website/htmlbody/html_merger.py +++ b/scielo_classic_website/htmlbody/html_merger.py @@ -201,7 +201,7 @@ def process_single_link(self, link_element: etree.Element, base_path: str) -> Op # Erro ao processar ou circular ou já processado - cria xref return self.create_xref_element(file_path, anchor, link_element) - def process_html_internal(self, html_content: str, base_path: str = None) -> etree.Element: + def process_html_internal(self, html_content: str, base_path: str = None) -> Optional[etree.Element]: """Processa HTML internamente (para recursão).""" try: # Parse do HTML @@ -224,7 +224,10 @@ def process_html_internal(self, html_content: str, base_path: str = None) -> etr except Exception as e: print(f"Erro ao processar HTML: {e}") - return html_content + # Retorna None em caso de falha; o conteúdo original (string) não + # é um elemento etree e quebraria os chamadores que esperam um + # Element (ex.: parent.replace, .tag, .set). + return None def process_html(self, html_content: str, base_path: str = None) -> str: """Processa HTML incorporando referências locais (API pública).""" diff --git a/scielo_classic_website/spsxml/sps_xml_body_pipes.py b/scielo_classic_website/spsxml/sps_xml_body_pipes.py index 0a42fad..5a40e0d 100644 --- a/scielo_classic_website/spsxml/sps_xml_body_pipes.py +++ b/scielo_classic_website/spsxml/sps_xml_body_pipes.py @@ -480,7 +480,21 @@ def precond(data): @plumber.precondition(precond) def transform(self, data): raw = data - xml = ET.fromstring(raw.xml_body_and_back[-1]) + content = raw.xml_body_and_back[-1] + try: + xml = ET.fromstring(content) + except ET.XMLSyntaxError: + # Tenta recuperar XML malformado (ex: comentários inválidos, + # condicionais MS Office residuais) usando o parser de recuperação + # do lxml para evitar falha total da etapa. 
+ if isinstance(content, str): + content_bytes = content.encode("utf-8") + else: + content_bytes = content + recover_parser = ET.XMLParser(recover=True) + xml = ET.fromstring(content_bytes, parser=recover_parser) + if xml is None: + raise _report(xml, func_name=type(self)) return data, xml @@ -1858,8 +1872,8 @@ def transform(self, data): return data def merge(self, journal_acron_folder, html_reader, xml): - try: - for body in xml.xpath(".//body"): + for body in xml.xpath(".//body"): + try: body_str = ET.tostring(body, encoding="iso-8859-1").decode("iso-8859-1") input_html = f"{body_str}" new_body = merge_html( @@ -1868,10 +1882,19 @@ def merge(self, journal_acron_folder, html_reader, xml): encoding="iso-8859-1", content_reader=html_reader ) + if not isinstance(new_body, ET._Element): + continue parent = body.getparent() + if parent is None: + continue parent.replace(body, new_body) - - for back in xml.xpath(".//back"): + except Exception as e: + logging.error(f"MarkHTMLFileToEmbedPipe - error processing body: {e}") + logging.exception(e) + + for back in xml.xpath(".//back"): + original_tag = back.tag + try: back.tag = "body" back_str = ET.tostring(back, encoding="iso-8859-1").decode("iso-8859-1") input_html = f"{back_str}" @@ -1881,12 +1904,24 @@ def merge(self, journal_acron_folder, html_reader, xml): encoding="iso-8859-1", content_reader=html_reader ) - new_back.tag = "back" + if not isinstance(new_back, ET._Element): + # Restaura tag original se a mesclagem falhou + back.tag = original_tag + continue + new_back.tag = original_tag parent = back.getparent() + if parent is None: + back.tag = original_tag + continue parent.replace(back, new_back) - except Exception as e: - logging.error(f"MarkHTMLFileToEmbedPipe - error processing html embedding: {e}") - logging.exception(e) + except Exception as e: + # Garante que a tag original seja restaurada mesmo em caso de erro + try: + back.tag = original_tag + except Exception: + pass + 
logging.error(f"MarkHTMLFileToEmbedPipe - error processing back: {e}") + logging.exception(e) class XMLBodyCenterPipe(plumber.Pipe): diff --git a/tests/test_sps_xml_body_step_30.py b/tests/test_sps_xml_body_step_30.py new file mode 100644 index 0000000..2f93462 --- /dev/null +++ b/tests/test_sps_xml_body_step_30.py @@ -0,0 +1,126 @@ +""" +Testes para a etapa `convert_html_to_xml_step_30_embed_html` do pipeline +de conversão de HTML para XML (terceira chamada da sequência em +`convert_html_to_xml`). + +Cobre os comportamentos de robustez introduzidos para evitar que a etapa +falhe com `XMLSyntaxError` (originando o erro reportado no Article Proc do +serviço migrador) e para preservar o estado do documento quando a mesclagem +de HTML embutido falhar parcialmente. +""" +from unittest import TestCase +from unittest import mock + +from lxml import etree as ET + +from scielo_classic_website.spsxml import sps_xml_body_pipes +from scielo_classic_website.spsxml.sps_xml_body_pipes import ( + MarkHTMLFileToEmbedPipe, + StartPipe, +) + + +class _Journal: + def __init__(self, acronym="abc"): + self.acronym = acronym + + +class _Raw: + def __init__(self, xml_body_and_back=None, journal=None, html_reader=None): + self.xml_body_and_back = xml_body_and_back or [] + self.journal = journal + if html_reader is not None: + self.html_reader = html_reader + + +class TestStartPipeRecover(TestCase): + def test_parses_valid_xml(self): + raw = _Raw(xml_body_and_back=["

<article><body><p>ok</p></body></article>

"]) + _, xml = StartPipe().transform(raw) + self.assertEqual(xml.tag, "article") + self.assertEqual(xml.find(".//p").text, "ok") + + def test_recovers_from_invalid_comment_with_double_hyphen(self): + # Comentário inválido para XML (contém '--' interno) — clipboard do MS + # pode introduzir esse padrão. Antes da correção, isso quebrava o + # passo 30 com XMLSyntaxError. + raw = _Raw( + xml_body_and_back=[ + "

<article><body><p>ok</p><!-- invalid comment -- double hyphen --></body></article>

" + "
" + ] + ) + _, xml = StartPipe().transform(raw) + self.assertEqual(xml.tag, "article") + self.assertEqual(xml.find(".//p").text, "ok") + + +class TestMarkHTMLFileToEmbedPipeMerge(TestCase): + def _build_xml(self): + return ET.fromstring( + "
" + "

<article><body><p>body content</p></body><back><ref>r1</ref></back></article>
" + ) + + def test_back_tag_preserved_when_merge_html_returns_none(self): + """Se merge_html falhar e retornar None, a tag deve ser + restaurada — antes da correção, ela permanecia como .""" + raw = _Raw(journal=_Journal()) + xml = self._build_xml() + with mock.patch.object(sps_xml_body_pipes, "merge_html", return_value=None): + MarkHTMLFileToEmbedPipe().transform((raw, xml)) + self.assertIsNotNone(xml.find(".//back")) + self.assertIsNotNone(xml.find(".//body")) + + def test_back_tag_preserved_when_merge_html_raises(self): + """Se merge_html lançar exceção ao processar , a tag original + deve ser restaurada e a transformação deve concluir sem propagar.""" + raw = _Raw(journal=_Journal()) + xml = self._build_xml() + with mock.patch.object( + sps_xml_body_pipes, "merge_html", side_effect=RuntimeError("boom") + ): + # Não deve levantar exceção + MarkHTMLFileToEmbedPipe().transform((raw, xml)) + self.assertIsNotNone(xml.find(".//back")) + self.assertIsNotNone(xml.find(".//body")) + + def test_back_failure_does_not_block_body_processing(self): + """Falha em não deve impedir a substituição bem-sucedida em + , e vice-versa.""" + raw = _Raw(journal=_Journal()) + xml = self._build_xml() + + new_body = ET.fromstring("

<body><p>merged body</p></body>

") + + def fake_merge_html(input_html, **kwargs): + if ", retorna um elemento válido + return ET.fromstring("

<body><p>merged body</p></body>

") + + with mock.patch.object( + sps_xml_body_pipes, "merge_html", side_effect=fake_merge_html + ): + MarkHTMLFileToEmbedPipe().transform((raw, xml)) + + # body foi substituído com sucesso + self.assertEqual(xml.find(".//body/p").text, "merged body") + # back foi preservado + self.assertIsNotNone(xml.find(".//back")) + + +class TestHTMLMergerInternalReturnsNoneOnError(TestCase): + def test_process_html_internal_returns_none_on_parse_failure(self): + from scielo_classic_website.htmlbody.html_merger import HTMLMerger + + merger = HTMLMerger() + # Faz a função interna do parser explodir + with mock.patch( + "scielo_classic_website.htmlbody.html_merger.html.fromstring", + side_effect=RuntimeError("boom"), + ): + result = merger.process_html_internal("") + self.assertIsNone(result)