scieloorg · Copilot · Apr 28, 2026 · Apr 28, 2026 · Apr 29, 2026 · robertatakenaka
diff --git a/scielo_classic_website/htmlbody/html_fixer.py b/scielo_classic_website/htmlbody/html_fixer.py
@@ -39,7 +39,9 @@ def get_best_choice_between_original_and_fixed(score, original, fixed_html, min_
 
 
 def load_html(content):
-    return fromstring(wrap_html(content))
+    tree = fromstring(wrap_html(content))  # parse the wrapped markup
+    remove_invalid_namespace_attributes(tree)  # in place: drop bad-prefix attrs
+    return tree
 
 
 def get_fixed_html(content, style_mappings=None, tags_to_fix=None, remove_namespaces=True):
@@ -60,6 +62,7 @@ def get_fixed_html(content, style_mappings=None, tags_to_fix=None, remove_namesp
     fixed_content = fix(content, style_mappings, tags_to_fix)
     wrapped = wrap_html(fixed_content)
     tree = fromstring(wrapped)
+    remove_invalid_namespace_attributes(tree)
     return html2xml(tree)
 
 
@@ -98,6 +101,7 @@ def fix(content, style_mappings=None, tags_to_fix=None):
     # Pipeline de processamento
     content = remove_invalid_xml_comments(content)
     content = remove_ms_office_conditionals(content)
+    content = remove_invalid_namespace_prefix_attributes(content)
     content = avoid_mismatched_styles(content, style_mappings)
     content = avoid_mismatched_tags(content, tags_to_fix)
     content = remove_namespaces_from_content(content)
@@ -366,6 +370,91 @@ def remove_invalid_xml_comments(html):
     return re.sub(r'<!--.*?-->', _filter_invalid_xml_comment, html, flags=re.DOTALL)
 
 
+_VALID_NAMESPACE_PREFIXES = frozenset({"xml", "xlink"})  # never stripped
+
+
+# Matches ONE whole attribute: its name plus an optional "...", '...' or bare
+# value. Group 1 is the namespace prefix of the name (None when it has none).
+_INVALID_NAMESPACE_ATTR_RE = re.compile(
+    r'''\s+(?:([A-Za-z_][\w.-]*):)?[^\s"'<>/=]+'''
+    r'''(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>]+))?'''
+)
+
+
+def _strip_invalid_namespace_attributes_in_tag(tag_match):
+    """Return the matched tag minus attributes with a non-allowed prefix."""
+    def _maybe_strip(attr_match):
+        # Attributes are consumed whole, so ``name:value`` text inside a quoted
+        # value (e.g. style="a:b; c:d") is never mistaken for an attribute.
+        prefix = attr_match.group(1)
+        keep = prefix is None or prefix in _VALID_NAMESPACE_PREFIXES
+        return attr_match.group(0) if keep else ""
+
+    return _INVALID_NAMESPACE_ATTR_RE.sub(_maybe_strip, tag_match.group(0))
+
+
+def remove_invalid_namespace_prefix_attributes(content):
+    """
+    Strip attributes with a non-allowed namespace prefix from raw HTML.
+
+    Source HTML occasionally carries malformed attributes such as
+    ``<a mailto:dade="...">``; a strict XML parser reads the colon as a
+    namespace separator and fails on the undeclared prefix. Each start or
+    self-closing tag is rewritten by
+    ``_strip_invalid_namespace_attributes_in_tag``. Quoted values are
+    skipped whole when locating the end of a tag, so ``>`` inside a value
+    does not cut it short; a tag with unbalanced quotes ends at its first ``>``.
+
+    Falsy input (``None`` or an empty string) is returned unchanged.
+    """
+    if not content:
+        return content
+    return re.sub(
+        r'''<[A-Za-z](?:(?:[^>"']|"[^"]*"|'[^']*')*|[^>]*)>''',
+        _strip_invalid_namespace_attributes_in_tag,
+        content,
+    )
+
+
+def remove_invalid_namespace_attributes(tree):
+    """
+    Delete attributes whose literal name carries a non-allowed prefix.
+
+    Every element yielded by ``tree.iter()`` is visited (or ``tree`` alone
+    when it has no ``iter``) and the tree is modified in place. An
+    attribute is deleted when its name is a ``str`` containing ``:``, does
+    not start with ``{`` (Clark notation ``{uri}localname``, i.e. already
+    bound to a namespace) and the text before the first ``:`` is not in
+    ``_VALID_NAMESPACE_PREFIXES``.
+
+    Motivation: source HTML sometimes has malformed attributes such as
+    ``<a mailto:dade="...">``; once serialized and re-parsed as XML they
+    raise ``XMLSyntaxError`` ("Namespace prefix X for Y on Z is not
+    defined").
+
+    Returns the same ``tree`` object; ``None`` is returned as ``None``.
+    """
+    if tree is None:
+        return tree
+
+    def _is_invalid(name):
+        # Only plain-string names with an unknown ``prefix:`` qualify.
+        if not isinstance(name, str) or name.startswith("{"):
+            return False
+        prefix, colon, _ = name.partition(":")
+        return bool(colon) and prefix not in _VALID_NAMESPACE_PREFIXES
+
+    nodes = tree.iter() if hasattr(tree, "iter") else [tree]
+    for node in nodes:
+        attributes = getattr(node, "attrib", None)
+        if not attributes:
+            continue
+        # Snapshot the doomed keys first: the mapping is mutated below.
+        for name in [key for key in attributes.keys() if _is_invalid(key)]:
+            del attributes[name]
+    return tree
+
+
 def remove_ms_office_conditionals(xml_str):
     """
     Remove blocos condicionais do MS Office que causam erros de parsing XML.

diff --git a/tests/test_html_fixer.py b/tests/test_html_fixer.py
@@ -1,8 +1,16 @@
 from unittest import TestCase
 
 from lxml import etree as ET
+from lxml import html as lxml_html
 
-from scielo_classic_website.htmlbody.html_fixer import remove_invalid_xml_comments
+from scielo_classic_website.htmlbody.html_fixer import (
+    fix,
+    get_fixed_html,
+    load_html,
+    remove_invalid_namespace_attributes,
+    remove_invalid_namespace_prefix_attributes,
+    remove_invalid_xml_comments,
+)
 
 
 class TestRemoveInvalidXmlComments(TestCase):
@@ -71,3 +79,123 @@ def test_multiline_invalid_comment(self):
         html = "<p>text</p><!--EndF>>\n<!--EndFragment--><p>more</p>"
         result = remove_invalid_xml_comments(html)
         self.assertEqual(result, "<p>text</p><p>more</p>")
+
+
+class TestRemoveInvalidNamespaceAttributes(TestCase):  # tree-level cleanup
+    def test_removes_undefined_namespace_attribute(self):
+        tree = lxml_html.fromstring(
+            '<html><body><a mailto:dade="x" href="y">link</a></body></html>'
+        )
+        remove_invalid_namespace_attributes(tree)
+        a = tree.find(".//a")
+        self.assertNotIn("mailto:dade", a.attrib)
+        self.assertEqual(a.get("href"), "y")  # unrelated attribute survives
+
+    def test_serialized_tree_is_valid_xml(self):
+        tree = lxml_html.fromstring(
+            '<html><body><a mailto:dade="x" href="y">link</a></body></html>'
+        )
+        remove_invalid_namespace_attributes(tree)
+        serialized = ET.tostring(tree, method="xml").decode("utf-8")
+        # Re-parsing as XML must not raise XMLSyntaxError
+        ET.fromstring(serialized)
+
+    def test_preserves_xml_and_xlink_prefixes(self):
+        tree = lxml_html.fromstring(
+            '<html><body>'
+            '<a xml:lang="pt" xlink:href="x" mailto:foo="y">link</a>'
+            '</body></html>'
+        )
+        remove_invalid_namespace_attributes(tree)
+        a = tree.find(".//a")
+        self.assertEqual(a.get("xml:lang"), "pt")  # allowed prefix kept
+        self.assertEqual(a.get("xlink:href"), "x")  # allowed prefix kept
+        self.assertNotIn("mailto:foo", a.attrib)
+
+    def test_preserves_attributes_without_colon(self):
+        tree = lxml_html.fromstring(
+            '<html><body><p id="x" class="y">text</p></body></html>'
+        )
+        remove_invalid_namespace_attributes(tree)
+        p = tree.find(".//p")
+        self.assertEqual(p.get("id"), "x")
+        self.assertEqual(p.get("class"), "y")
+
+    def test_handles_none_tree(self):
+        self.assertIsNone(remove_invalid_namespace_attributes(None))  # None in, None out
+
+    def test_load_html_strips_invalid_namespace_attributes(self):
+        tree = load_html('<p>foo <a mailto:dade="z" href="y">link</a> bar</p>')
+        a = tree.find(".//a")
+        self.assertNotIn("mailto:dade", a.attrib)
+        # Tree must serialize to valid XML
+        serialized = ET.tostring(tree, method="xml").decode("utf-8")
+        ET.fromstring(serialized)
+
+    def test_get_fixed_html_output_is_valid_xml(self):
+        # Attribute value contains '>' so the regex-based
+        # ``remove_namespaces_from_content`` step (used inside ``fix()``)
+        # cannot reliably strip the bad attribute. The tree-level cleanup
+        # must still produce XML that re-parses without errors.
+        content = '<p>Hello <a mailto:dade="a>b" href="x">world</a></p>'
+        result = get_fixed_html(content)
+        wrapped = f"<root>{result}</root>"  # one root element for the strict parse
+        ET.fromstring(wrapped)  # raises XMLSyntaxError if the output is not well-formed
+
+
+class TestRemoveInvalidNamespacePrefixAttributes(TestCase):  # string-level cleanup
+    def test_removes_basic_undefined_prefix_attribute(self):
+        content = '<p>Hello <a mailto:dade="x@y.com" href="z">link</a></p>'
+        self.assertEqual(
+            remove_invalid_namespace_prefix_attributes(content),
+            '<p>Hello <a href="z">link</a></p>',
+        )
+
+    def test_handles_attribute_value_with_lt(self):
+        # This is the case where remove_namespaces_from_content is fooled
+        # because '<' inside the value breaks the text-based tag detection.
+        content = '<p>Hello <a mailto:dade="a<b" href="z">link</a> end</p>'
+        self.assertEqual(
+            remove_invalid_namespace_prefix_attributes(content),
+            '<p>Hello <a href="z">link</a> end</p>',
+        )
+
+    def test_handles_single_quoted_value(self):
+        content = "<p><a mailto:dade='a\"b' href='z'>link</a></p>"  # value holds a double quote
+        self.assertEqual(
+            remove_invalid_namespace_prefix_attributes(content),
+            "<p><a href='z'>link</a></p>",
+        )
+
+    def test_preserves_xml_and_xlink_prefixes(self):
+        content = (
+            '<p><a xml:lang="pt" xlink:href="x" mailto:foo="y" href="h">l</a></p>'
+        )
+        self.assertEqual(
+            remove_invalid_namespace_prefix_attributes(content),
+            '<p><a xml:lang="pt" xlink:href="x" href="h">l</a></p>',
+        )
+
+    def test_handles_attribute_without_value(self):
+        content = '<p><a mailto:dade>l</a></p>'  # bare attribute, no "=value" part
+        self.assertEqual(
+            remove_invalid_namespace_prefix_attributes(content),
+            '<p><a>l</a></p>',
+        )
+
+    def test_does_not_change_plain_content(self):
+        content = '<p>Plain text without bad attrs</p>'
+        self.assertEqual(
+            remove_invalid_namespace_prefix_attributes(content), content
+        )
+
+    def test_handles_none_and_empty(self):
+        self.assertIsNone(remove_invalid_namespace_prefix_attributes(None))  # falsy input is returned as-is
+        self.assertEqual(remove_invalid_namespace_prefix_attributes(""), "")
+
+    def test_called_inside_fix(self):
+        # Through the full fix() pipeline, the bad attribute is gone but
+        # the surrounding <a> tag is preserved (unlike the old behaviour
+        # of remove_namespaces_from_content which dropped the whole tag).
+        content = '<p>Hello <a mailto:dade="a<b" href="z">link</a> end</p>'
+        self.assertIn('<a href="z">link</a>', fix(content))