Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions lib/markbridge/parsers/html/parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ module Parsers
module HTML
# Parses HTML into an AST using Nokogiri
class Parser
# Tags whose contents should be dropped entirely (not emitted as text).
# Their children are CSS (style), JavaScript (script), document metadata
# (head, title), or fallback/inert markup (noscript, template), none of
# which should appear in output.
IGNORED_TAGS = %w[style script head title noscript template].freeze

attr_reader :unknown_tags

# Create a new parser with optional custom handlers
Expand Down Expand Up @@ -72,6 +77,8 @@ def process_text_node(node, parent)
# @param parent [AST::Element]
def process_element_node(node, parent)
tag_name = node.name.downcase
return if IGNORED_TAGS.include?(tag_name)

handler = @handlers[tag_name]

if handler
Expand Down
33 changes: 33 additions & 0 deletions spec/unit/markbridge/parsers/html/parser_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,39 @@
expect(doc.children[1].children[0].text).to eq("Two")
end

# A <style> element and its CSS payload must vanish; only the trailing
# text survives as a single Text node.
it "drops style tag contents entirely" do
  nodes = parser.parse("<style>.foo { color: red; }</style>hello").children

  expect(nodes.size).to eq(1)

  survivor = nodes[0]
  expect(survivor).to be_a(Markbridge::AST::Text)
  expect(survivor.text).to eq("hello")
end

# A <script> element and its JavaScript payload must vanish; only the
# trailing text survives as a single Text node.
it "drops script tag contents entirely" do
  nodes = parser.parse("<script>alert('xss')</script>hello").children

  expect(nodes.size).to eq(1)

  survivor = nodes[0]
  expect(survivor).to be_a(Markbridge::AST::Text)
  expect(survivor.text).to eq("hello")
end

# The whole <head> subtree (here: a <title> and a <style>) must be dropped,
# leaving only the body text as a single Text node.
# The description names exactly the elements present in the fixture; the
# fixture contains no <meta> element, so it is not mentioned.
it "drops head subtree including nested style/title" do
  html =
    "<html><head><title>T</title><style>.a{}</style></head>" \
      "<body>body text</body></html>"
  doc = parser.parse(html)

  expect(doc.children.size).to eq(1)
  expect(doc.children[0]).to be_a(Markbridge::AST::Text)
  expect(doc.children[0].text).to eq("body text")
end

# Ignored tags are skipped before handler lookup, so they must not be
# recorded in the parser's unknown-tag tracking.
it "does not count ignored tags as unknown" do
  markup = "<style>.a{}</style><script>x</script>"
  parser.parse(markup)

  expect(parser.unknown_tags).to be_empty
end

it "tracks unknown tags" do
parser.parse("<unknown>text</unknown>")

Expand Down