diff --git a/lib/markbridge/parsers/html/parser.rb b/lib/markbridge/parsers/html/parser.rb index 61a718c..ad43622 100644 --- a/lib/markbridge/parsers/html/parser.rb +++ b/lib/markbridge/parsers/html/parser.rb @@ -5,6 +5,11 @@ module Parsers module HTML # Parses HTML into an AST using Nokogiri class Parser + # Tags whose contents should be dropped entirely (not emitted as text). + # These are raw-text/metadata elements whose children are either CSS, + # JavaScript, or document metadata that shouldn't appear in output. + IGNORED_TAGS = %w[style script head title noscript template].freeze + attr_reader :unknown_tags # Create a new parser with optional custom handlers @@ -72,6 +77,8 @@ def process_text_node(node, parent) # @param parent [AST::Element] def process_element_node(node, parent) tag_name = node.name.downcase + return if IGNORED_TAGS.include?(tag_name) + handler = @handlers[tag_name] if handler diff --git a/spec/unit/markbridge/parsers/html/parser_spec.rb b/spec/unit/markbridge/parsers/html/parser_spec.rb index b3d7e13..2624229 100644 --- a/spec/unit/markbridge/parsers/html/parser_spec.rb +++ b/spec/unit/markbridge/parsers/html/parser_spec.rb @@ -132,6 +132,39 @@ expect(doc.children[1].children[0].text).to eq("Two") end + it "drops style tag contents entirely" do + doc = parser.parse("hello") + + expect(doc.children.size).to eq(1) + expect(doc.children[0]).to be_a(Markbridge::AST::Text) + expect(doc.children[0].text).to eq("hello") + end + + it "drops script tag contents entirely" do + doc = parser.parse("hello") + + expect(doc.children.size).to eq(1) + expect(doc.children[0]).to be_a(Markbridge::AST::Text) + expect(doc.children[0].text).to eq("hello") + end + + it "drops head subtree including nested style/title/meta" do + html = + "