From 8bb3234f1c43d1551c7df0a5f095168be0d05b5a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 28 Apr 2026 19:46:46 +0000 Subject: [PATCH 1/3] Initial plan From 80d9c3364bf51749b46675b2975b6dafbac222ca Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 28 Apr 2026 19:59:21 +0000 Subject: [PATCH 2/3] Strip attributes with undefined namespace prefixes from parsed HTML trees Agent-Logs-Url: https://github.com/scieloorg/scielo_migration/sessions/807d21a0-f07d-4b7f-9102-fcce21174c55 Co-authored-by: robertatakenaka <505143+robertatakenaka@users.noreply.github.com> --- scielo_classic_website/htmlbody/html_fixer.py | 47 ++++++++++++- tests/test_html_fixer.py | 70 ++++++++++++++++++- 2 files changed, 115 insertions(+), 2 deletions(-) diff --git a/scielo_classic_website/htmlbody/html_fixer.py b/scielo_classic_website/htmlbody/html_fixer.py index 860a6f7..4c2e702 100644 --- a/scielo_classic_website/htmlbody/html_fixer.py +++ b/scielo_classic_website/htmlbody/html_fixer.py @@ -39,7 +39,9 @@ def get_best_choice_between_original_and_fixed(score, original, fixed_html, min_ def load_html(content): - return fromstring(wrap_html(content)) + tree = fromstring(wrap_html(content)) + remove_invalid_namespace_attributes(tree) + return tree def get_fixed_html(content, style_mappings=None, tags_to_fix=None, remove_namespaces=True): @@ -60,6 +62,7 @@ def get_fixed_html(content, style_mappings=None, tags_to_fix=None, remove_namesp fixed_content = fix(content, style_mappings, tags_to_fix) wrapped = wrap_html(fixed_content) tree = fromstring(wrapped) + remove_invalid_namespace_attributes(tree) return html2xml(tree) @@ -366,6 +369,48 @@ def remove_invalid_xml_comments(html): return re.sub(r'', _filter_invalid_xml_comment, html, flags=re.DOTALL) +_VALID_NAMESPACE_PREFIXES = frozenset({"xml", "xlink"}) + + +def remove_invalid_namespace_attributes(tree): + """ + Remove 
atributos cujo nome contém prefixo de namespace não declarado. + + HTML de origem ocasionalmente contém atributos malformados como + ````. O parser HTML do lxml mantém o nome literal + com dois pontos. Quando a árvore é serializada como XML e novamente + parseada, o lxml interpreta os dois pontos como separador de namespace + e levanta ``XMLSyntaxError`` ("Namespace prefix X for Y on Z is not + defined"). + + Esta função percorre a árvore e remove tais atributos. Os prefixos + padrão (``xml``, ``xlink``) são preservados; atributos já mapeados + em namespace pelo lxml (armazenados na notação Clark + ``{uri}localname``) também são preservados. + """ + if tree is None: + return tree + + elements = tree.iter() if hasattr(tree, "iter") else [tree] + for elem in elements: + attrib = getattr(elem, "attrib", None) + if not attrib: + continue + for attr_name in list(attrib.keys()): + if not isinstance(attr_name, str): + continue + # Atributos já mapeados em namespace ficam em notação Clark + if attr_name.startswith("{"): + continue + if ":" not in attr_name: + continue + prefix = attr_name.split(":", 1)[0] + if prefix in _VALID_NAMESPACE_PREFIXES: + continue + del attrib[attr_name] + return tree + + def remove_ms_office_conditionals(xml_str): """ Remove blocos condicionais do MS Office que causam erros de parsing XML. diff --git a/tests/test_html_fixer.py b/tests/test_html_fixer.py index 80a0625..8e71a97 100644 --- a/tests/test_html_fixer.py +++ b/tests/test_html_fixer.py @@ -1,8 +1,14 @@ from unittest import TestCase from lxml import etree as ET +from lxml import html as lxml_html -from scielo_classic_website.htmlbody.html_fixer import remove_invalid_xml_comments +from scielo_classic_website.htmlbody.html_fixer import ( + get_fixed_html, + load_html, + remove_invalid_namespace_attributes, + remove_invalid_xml_comments, +) class TestRemoveInvalidXmlComments(TestCase): @@ -71,3 +77,65 @@ def test_multiline_invalid_comment(self): html = "

text

more

" result = remove_invalid_xml_comments(html) self.assertEqual(result, "

text

more

") + + +class TestRemoveInvalidNamespaceAttributes(TestCase): + def test_removes_undefined_namespace_attribute(self): + tree = lxml_html.fromstring( + '
link' + ) + remove_invalid_namespace_attributes(tree) + a = tree.find(".//a") + self.assertNotIn("mailto:dade", a.attrib) + self.assertEqual(a.get("href"), "y") + + def test_serialized_tree_is_valid_xml(self): + tree = lxml_html.fromstring( + 'link' + ) + remove_invalid_namespace_attributes(tree) + serialized = ET.tostring(tree, method="xml").decode("utf-8") + # Re-parsing as XML must not raise XMLSyntaxError + ET.fromstring(serialized) + + def test_preserves_xml_and_xlink_prefixes(self): + tree = lxml_html.fromstring( + '' + 'link' + '' + ) + remove_invalid_namespace_attributes(tree) + a = tree.find(".//a") + self.assertEqual(a.get("xml:lang"), "pt") + self.assertEqual(a.get("xlink:href"), "x") + self.assertNotIn("mailto:foo", a.attrib) + + def test_preserves_attributes_without_colon(self): + tree = lxml_html.fromstring( + '

text

' + ) + remove_invalid_namespace_attributes(tree) + p = tree.find(".//p") + self.assertEqual(p.get("id"), "x") + self.assertEqual(p.get("class"), "y") + + def test_handles_none_tree(self): + self.assertIsNone(remove_invalid_namespace_attributes(None)) + + def test_load_html_strips_invalid_namespace_attributes(self): + tree = load_html('

foo link bar

') + a = tree.find(".//a") + self.assertNotIn("mailto:dade", a.attrib) + # Tree must serialize to valid XML + serialized = ET.tostring(tree, method="xml").decode("utf-8") + ET.fromstring(serialized) + + def test_get_fixed_html_output_is_valid_xml(self): + # Attribute value contains '>' so the regex-based + # ``remove_namespaces_from_content`` step (used inside ``fix()``) + # cannot reliably strip the bad attribute. The tree-level cleanup + # must still produce XML that re-parses without errors. + content = '

Hello world

' + result = get_fixed_html(content) + wrapped = f"{result}" + ET.fromstring(wrapped) From 6d9b31fd1cb1bdcebbaaf9ab330fd8f2c963c694 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 29 Apr 2026 19:50:22 +0000 Subject: [PATCH 3/3] Add string-level cleanup in fix() to correct original content before HTML generation Agent-Logs-Url: https://github.com/scieloorg/scielo_migration/sessions/3cb06e98-1827-4e85-96a2-b2ed33363233 Co-authored-by: robertatakenaka <505143+robertatakenaka@users.noreply.github.com> --- scielo_classic_website/htmlbody/html_fixer.py | 44 ++++++++++++++ tests/test_html_fixer.py | 60 +++++++++++++++++++ 2 files changed, 104 insertions(+) diff --git a/scielo_classic_website/htmlbody/html_fixer.py b/scielo_classic_website/htmlbody/html_fixer.py index 4c2e702..0310c92 100644 --- a/scielo_classic_website/htmlbody/html_fixer.py +++ b/scielo_classic_website/htmlbody/html_fixer.py @@ -101,6 +101,7 @@ def fix(content, style_mappings=None, tags_to_fix=None): # Pipeline de processamento content = remove_invalid_xml_comments(content) content = remove_ms_office_conditionals(content) + content = remove_invalid_namespace_prefix_attributes(content) content = avoid_mismatched_styles(content, style_mappings) content = avoid_mismatched_tags(content, tags_to_fix) content = remove_namespaces_from_content(content) @@ -372,6 +373,49 @@ def remove_invalid_xml_comments(html): _VALID_NAMESPACE_PREFIXES = frozenset({"xml", "xlink"}) +# Atributos cujo nome tem prefixo de namespace (ex.: ``mailto:dade``). +# Captura o nome do prefixo e o valor opcional ("...", '...' ou sem aspas). 
# Matches an opening or self-closing tag (``<name ...>``).  Closing tags,
# comments and doctype declarations do not start with ``<`` + letter and are
# therefore left untouched.
_OPENING_TAG_RE = re.compile(r"<[A-Za-z][^>]*>")

# Matches ONE attribute inside a tag: the leading whitespace, the attribute
# name (group 1) and its optional value ("...", '...' or unquoted).
# The value is consumed together with its attribute, so text that merely
# *looks* like a prefixed attribute inside a quoted value -- for instance
# ``style="color: red; font-size:12px"`` -- is never matched on its own.
_TAG_ATTRIBUTE_RE = re.compile(
    r'''\s+([^\s"'<>/=]+)'''
    r'''(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>]+))?'''
)


def _strip_invalid_namespace_attributes_in_tag(tag_match, valid_prefixes=None):
    """
    Return the matched tag without attributes that carry an unknown prefix.

    Parameters
    ----------
    tag_match : re.Match
        Match whose ``group(0)`` is the full text of one opening tag.
    valid_prefixes : collection of str, optional
        Prefixes that must be preserved. Defaults to the module-level
        ``_VALID_NAMESPACE_PREFIXES``.

    Returns
    -------
    str
        The tag text with every ``prefix:name[=value]`` attribute removed
        when ``prefix`` is not in ``valid_prefixes``. Every other attribute
        is returned exactly as it was written.
    """
    if valid_prefixes is None:
        valid_prefixes = _VALID_NAMESPACE_PREFIXES

    def _maybe_strip(attr_match):
        # Same rule as the tree-level cleaner: any name containing a colon
        # is dropped unless the text before the first colon is allowed.
        prefix, colon, _local = attr_match.group(1).partition(":")
        if colon and prefix not in valid_prefixes:
            return ""
        return attr_match.group(0)

    return _TAG_ATTRIBUTE_RE.sub(_maybe_strip, tag_match.group(0))


def remove_invalid_namespace_prefix_attributes(content, valid_prefixes=None):
    """
    Remove attributes with an undeclared namespace prefix from HTML text.

    Source HTML occasionally contains malformed attributes whose name looks
    like ``prefix:name`` (for example ``mailto:dade="..."``). When the
    generated content is later parsed again as XML, the colon is read as a
    namespace separator and the parser raises ``XMLSyntaxError``. This
    cleanup works on the string, before any tree is built.

    Only attribute *names* are inspected. Quoted attribute values are
    skipped as a whole, so inline CSS such as ``style="font-size:12px"`` or
    free text such as ``title="see foo:bar"`` is left intact.

    Parameters
    ----------
    content : str or None
        HTML text. Falsy input (``None`` or ``""``) is returned unchanged.
    valid_prefixes : collection of str, optional
        Prefixes to preserve. Defaults to ``_VALID_NAMESPACE_PREFIXES``.
        Any other prefix, including ``xmlns``, causes the attribute to be
        removed.

    Returns
    -------
    str or None
        ``content`` with the offending attributes removed.

    Notes
    -----
    A tag is considered to end at the first ``>``. A ``>`` inside an
    attribute value therefore truncates the tag as seen by this function.
    """
    if not content:
        return content
    if valid_prefixes is None:
        valid_prefixes = _VALID_NAMESPACE_PREFIXES

    def _clean_tag(tag_match):
        return _strip_invalid_namespace_attributes_in_tag(
            tag_match, valid_prefixes
        )

    # Only text inside opening/self-closing tags is ever rewritten.
    return _OPENING_TAG_RE.sub(_clean_tag, content)
diff --git a/tests/test_html_fixer.py b/tests/test_html_fixer.py index 8e71a97..a35026e 100644 --- a/tests/test_html_fixer.py +++ b/tests/test_html_fixer.py @@ -4,9 +4,11 @@ from lxml import html as lxml_html from scielo_classic_website.htmlbody.html_fixer import ( + fix, get_fixed_html, load_html, remove_invalid_namespace_attributes, + remove_invalid_namespace_prefix_attributes, remove_invalid_xml_comments, ) @@ -139,3 +141,61 @@ def test_get_fixed_html_output_is_valid_xml(self): result = get_fixed_html(content) wrapped = f"{result}" ET.fromstring(wrapped) + + +class TestRemoveInvalidNamespacePrefixAttributes(TestCase): + def test_removes_basic_undefined_prefix_attribute(self): + content = '

Hello link

' + self.assertEqual( + remove_invalid_namespace_prefix_attributes(content), + '

Hello link

', + ) + + def test_handles_attribute_value_with_lt(self): + # This is the case where remove_namespaces_from_content is fooled + # because '<' inside the value breaks the text-based tag detection. + content = '

Hello link end

' + self.assertEqual( + remove_invalid_namespace_prefix_attributes(content), + '

Hello link end

', + ) + + def test_handles_single_quoted_value(self): + content = "

link

" + self.assertEqual( + remove_invalid_namespace_prefix_attributes(content), + "

link

", + ) + + def test_preserves_xml_and_xlink_prefixes(self): + content = ( + '

l

' + ) + self.assertEqual( + remove_invalid_namespace_prefix_attributes(content), + '

l

', + ) + + def test_handles_attribute_without_value(self): + content = '

l

' + self.assertEqual( + remove_invalid_namespace_prefix_attributes(content), + '

l

', + ) + + def test_does_not_change_plain_content(self): + content = '

Plain text without bad attrs

' + self.assertEqual( + remove_invalid_namespace_prefix_attributes(content), content + ) + + def test_handles_none_and_empty(self): + self.assertIsNone(remove_invalid_namespace_prefix_attributes(None)) + self.assertEqual(remove_invalid_namespace_prefix_attributes(""), "") + + def test_called_inside_fix(self): + # Through the full fix() pipeline, the bad attribute is gone but + # the surrounding tag is preserved (unlike the old behaviour + # of remove_namespaces_from_content which dropped the whole tag). + content = '

Hello link end

' + self.assertIn('link', fix(content))