From 8bb3234f1c43d1551c7df0a5f095168be0d05b5a Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 28 Apr 2026 19:46:46 +0000
Subject: [PATCH 1/3] Initial plan
From 80d9c3364bf51749b46675b2975b6dafbac222ca Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 28 Apr 2026 19:59:21 +0000
Subject: [PATCH 2/3] Strip attributes with undefined namespace prefixes from
parsed HTML trees
Agent-Logs-Url: https://github.com/scieloorg/scielo_migration/sessions/807d21a0-f07d-4b7f-9102-fcce21174c55
Co-authored-by: robertatakenaka <505143+robertatakenaka@users.noreply.github.com>
---
scielo_classic_website/htmlbody/html_fixer.py | 47 ++++++++++++-
tests/test_html_fixer.py | 70 ++++++++++++++++++-
2 files changed, 115 insertions(+), 2 deletions(-)
diff --git a/scielo_classic_website/htmlbody/html_fixer.py b/scielo_classic_website/htmlbody/html_fixer.py
index 860a6f7..4c2e702 100644
--- a/scielo_classic_website/htmlbody/html_fixer.py
+++ b/scielo_classic_website/htmlbody/html_fixer.py
@@ -39,7 +39,9 @@ def get_best_choice_between_original_and_fixed(score, original, fixed_html, min_
def load_html(content):
- return fromstring(wrap_html(content))
+ tree = fromstring(wrap_html(content))
+ remove_invalid_namespace_attributes(tree)
+ return tree
def get_fixed_html(content, style_mappings=None, tags_to_fix=None, remove_namespaces=True):
@@ -60,6 +62,7 @@ def get_fixed_html(content, style_mappings=None, tags_to_fix=None, remove_namesp
fixed_content = fix(content, style_mappings, tags_to_fix)
wrapped = wrap_html(fixed_content)
tree = fromstring(wrapped)
+ remove_invalid_namespace_attributes(tree)
return html2xml(tree)
@@ -366,6 +369,48 @@ def remove_invalid_xml_comments(html):
return re.sub(r'', _filter_invalid_xml_comment, html, flags=re.DOTALL)
+_VALID_NAMESPACE_PREFIXES = frozenset({"xml", "xlink"})  # attribute prefixes that are kept, never stripped
+
+
+def remove_invalid_namespace_attributes(tree):
+ """
+ Remove attributes whose name contains an undeclared namespace prefix.
+
+ Source HTML occasionally contains malformed attributes, for example an
+ ``<a>`` element carrying a ``mailto:dade`` attribute. lxml's HTML parser
+ keeps the literal name, colon included. When the tree is serialized as
+ XML and parsed again, lxml reads the colon as a namespace separator
+ and raises ``XMLSyntaxError`` ("Namespace prefix X for Y on Z is not
+ defined").
+
+ This function walks the tree in place and deletes such attributes; it
+ returns the same ``tree`` (``None`` is returned unchanged). Only the
+ ``xml`` and ``xlink`` prefixes and names lxml already stores in Clark
+ notation (``{uri}localname``) are kept, so ``xmlns:*`` names are removed too.
+ """
+ if tree is None:
+ return tree
+
+ elements = tree.iter() if hasattr(tree, "iter") else [tree]
+ for elem in elements:
+ attrib = getattr(elem, "attrib", None)
+ if not attrib:
+ continue
+ for attr_name in list(attrib.keys()):
+ if not isinstance(attr_name, str):
+ continue
+ # Attributes already mapped to a namespace are stored in Clark notation
+ if attr_name.startswith("{"):
+ continue
+ if ":" not in attr_name:
+ continue
+ prefix = attr_name.split(":", 1)[0]
+ if prefix in _VALID_NAMESPACE_PREFIXES:
+ continue
+ del attrib[attr_name]
+ return tree
+
+
def remove_ms_office_conditionals(xml_str):
"""
Remove blocos condicionais do MS Office que causam erros de parsing XML.
diff --git a/tests/test_html_fixer.py b/tests/test_html_fixer.py
index 80a0625..8e71a97 100644
--- a/tests/test_html_fixer.py
+++ b/tests/test_html_fixer.py
@@ -1,8 +1,14 @@
from unittest import TestCase
from lxml import etree as ET
+from lxml import html as lxml_html
-from scielo_classic_website.htmlbody.html_fixer import remove_invalid_xml_comments
+from scielo_classic_website.htmlbody.html_fixer import (
+ get_fixed_html,
+ load_html,
+ remove_invalid_namespace_attributes,
+ remove_invalid_xml_comments,
+)
class TestRemoveInvalidXmlComments(TestCase):
@@ -71,3 +77,65 @@ def test_multiline_invalid_comment(self):
html = " text more text more
text
' + ) + remove_invalid_namespace_attributes(tree) + p = tree.find(".//p") + self.assertEqual(p.get("id"), "x") + self.assertEqual(p.get("class"), "y") + + def test_handles_none_tree(self): + self.assertIsNone(remove_invalid_namespace_attributes(None)) + + def test_load_html_strips_invalid_namespace_attributes(self): + tree = load_html('foo link bar
') + a = tree.find(".//a") + self.assertNotIn("mailto:dade", a.attrib) + # Tree must serialize to valid XML + serialized = ET.tostring(tree, method="xml").decode("utf-8") + ET.fromstring(serialized) + + def test_get_fixed_html_output_is_valid_xml(self): + # Attribute value contains '>' so the regex-based + # ``remove_namespaces_from_content`` step (used inside ``fix()``) + # cannot reliably strip the bad attribute. The tree-level cleanup + # must still produce XML that re-parses without errors. + content = 'Hello world
' + result = get_fixed_html(content) + wrapped = f"Hello link
' + self.assertEqual( + remove_invalid_namespace_prefix_attributes(content), + 'Hello link
', + ) + + def test_handles_attribute_value_with_lt(self): + # This is the case where remove_namespaces_from_content is fooled + # because '<' inside the value breaks the text-based tag detection. + content = 'Hello link end
' + self.assertEqual( + remove_invalid_namespace_prefix_attributes(content), + 'Hello link end
', + ) + + def test_handles_single_quoted_value(self): + content = "" + self.assertEqual( + remove_invalid_namespace_prefix_attributes(content), + "", + ) + + def test_preserves_xml_and_xlink_prefixes(self): + content = ( + '' + ) + self.assertEqual( + remove_invalid_namespace_prefix_attributes(content), + '', + ) + + def test_handles_attribute_without_value(self): + content = '' + self.assertEqual( + remove_invalid_namespace_prefix_attributes(content), + '', + ) + + def test_does_not_change_plain_content(self): + content = 'Plain text without bad attrs
' + self.assertEqual( + remove_invalid_namespace_prefix_attributes(content), content + ) + + def test_handles_none_and_empty(self): + self.assertIsNone(remove_invalid_namespace_prefix_attributes(None)) + self.assertEqual(remove_invalid_namespace_prefix_attributes(""), "") + + def test_called_inside_fix(self): + # Through the full fix() pipeline, the bad attribute is gone but + # the surrounding tag is preserved (unlike the old behaviour + # of remove_namespaces_from_content which dropped the whole tag). + content = 'Hello link end
' + self.assertIn('link', fix(content))