Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,15 @@ jobs:
runs-on: ubuntu-latest
name: Ruby ${{ matrix.ruby }}
strategy:
fail-fast: false
matrix:
ruby:
- '3.2'
- '3.3'
- '3.4'
- '4.0'
- 'jruby'
- 'truffleruby'

steps:
- name: Checkout
Expand Down
2 changes: 1 addition & 1 deletion Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ source "https://rubygems.org"

gemspec

gem "commonmarker"
gem "commonmarker", install_if: -> { RUBY_ENGINE == "ruby" }
gem "lefthook"
gem "nokogiri"
gem "puma"
Expand Down
25 changes: 25 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,13 @@ GEM
thread_safe (~> 0.3, >= 0.3.1)
base64 (0.3.0)
bigdecimal (4.1.2)
bigdecimal (4.1.2-java)
childprocess (5.1.0)
logger (~> 1.5)
coercible (1.0.0)
descendants_tracker (~> 0.0.1)
commonmarker (2.8.1)
rb_sys (~> 0.9)
commonmarker (2.8.1-aarch64-linux)
commonmarker (2.8.1-aarch64-linux-musl)
commonmarker (2.8.1-arm-linux)
Expand Down Expand Up @@ -72,6 +75,7 @@ GEM
sexp_processor (~> 4.8)
ice_nine (0.11.2)
json (2.19.4)
json (2.19.4-java)
language_server-protocol (3.17.0.5)
launchy (3.1.1)
addressable (~> 2.8)
Expand All @@ -82,6 +86,7 @@ GEM
logger (1.7.0)
mustermann (3.1.1)
nio4r (2.7.5)
nio4r (2.7.5-java)
nokogiri (1.19.2-aarch64-linux-gnu)
racc (~> 1.4)
nokogiri (1.19.2-aarch64-linux-musl)
Expand All @@ -92,6 +97,8 @@ GEM
racc (~> 1.4)
nokogiri (1.19.2-arm64-darwin)
racc (~> 1.4)
nokogiri (1.19.2-java)
racc (~> 1.4)
nokogiri (1.19.2-x86_64-darwin)
racc (~> 1.4)
nokogiri (1.19.2-x86_64-linux-gnu)
Expand All @@ -109,7 +116,10 @@ GEM
public_suffix (7.0.5)
puma (8.0.0)
nio4r (~> 2.0)
puma (8.0.0-java)
nio4r (~> 2.0)
racc (1.8.1)
racc (1.8.1-java)
rack (3.2.6)
rack-protection (4.2.1)
base64 (>= 0.1.0)
Expand All @@ -122,6 +132,9 @@ GEM
rack (>= 3)
rainbow (3.1.1)
rake (13.4.2)
rake-compiler-dock (1.12.0)
rb_sys (0.9.127)
rake-compiler-dock (= 1.12.0)
reek (6.5.0)
dry-schema (~> 1.13)
logger (~> 1.6)
Expand Down Expand Up @@ -197,6 +210,7 @@ GEM
syntax_tree (6.3.0)
prettier_print (>= 1.2.0)
thread_safe (0.3.6)
thread_safe (0.3.6-java)
tilt (2.7.0)
tty-which (0.5.0)
unicode-display_width (3.2.0)
Expand All @@ -214,6 +228,7 @@ PLATFORMS
arm-linux-gnu
arm-linux-musl
arm64-darwin
universal-java
x86_64-darwin
x86_64-linux-gnu
x86_64-linux-musl
Expand All @@ -240,8 +255,10 @@ CHECKSUMS
axiom-types (0.1.1) sha256=c1ff113f3de516fa195b2db7e0a9a95fd1b08475a502ff660d04507a09980383
base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b
bigdecimal (4.1.2) sha256=53d217666027eab4280346fba98e7d5b66baaae1b9c3c1c0ffe89d48188a3fbd
bigdecimal (4.1.2-java) sha256=ccc836eab720a525529f70ed0de26a206fdbc9a9e8ac67b3b4ac7318b03e114d
childprocess (5.1.0) sha256=9a8d484be2fd4096a0e90a0cd3e449a05bc3aa33f8ac9e4d6dcef6ac1455b6ec
coercible (1.0.0) sha256=5081ad24352cc8435ce5472bc2faa30260c7ea7f2102cc6a9f167c4d9bffaadc
commonmarker (2.8.1) sha256=f78caf3cfc671fc64e85c9f49eaab0fb4ed3a96e5489be6cc7f59065768cf63e
commonmarker (2.8.1-aarch64-linux) sha256=f855599cc6855f4137d72dcacae9571451075afe6e6c8522eba353df9b81d0bf
commonmarker (2.8.1-aarch64-linux-musl) sha256=bbc2b3d361403431a5aac737dc86997d3b2843276ef395e7499cf17ad48e1b2c
commonmarker (2.8.1-arm-linux) sha256=ede48564a9c2e29e003361fd7b0b158b3b85bcbd5a15b963fa2b3c78eef3993f
Expand All @@ -265,6 +282,7 @@ CHECKSUMS
flog (4.9.4) sha256=12cc054fab7a2cbd2a906514397c4d7788954d530564782d6f14939dc2dfbcbb
ice_nine (0.11.2) sha256=5d506a7d2723d5592dc121b9928e4931742730131f22a1a37649df1c1e2e63db
json (2.19.4) sha256=670a7d333fb3b18ca5b29cb255eb7bef099e40d88c02c80bd42a3f30fe5239ac
json (2.19.4-java) sha256=f7f0fe701e2bef648497b0eb59422f5b453e5038cfbaf9cde09af20e22241efb
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
launchy (3.1.1) sha256=72b847b5cc961589dde2c395af0108c86ff0119f42d4648d25b5440ebb10059e
lefthook (2.1.6) sha256=e5c4c46f789c54961de030bf324f3fb38989a6adc944a6e8df50b4d432ebaf1d
Expand All @@ -273,11 +291,13 @@ CHECKSUMS
markbridge (0.1.0)
mustermann (3.1.1) sha256=4c6170c7234d5499c345562ba7c7dfe73e1754286dcc1abb053064d66a127198
nio4r (2.7.5) sha256=6c90168e48fb5f8e768419c93abb94ba2b892a1d0602cb06eef16d8b7df1dca1
nio4r (2.7.5-java) sha256=d14779d2a9b012ec0148a53344fbb2ed2a3c4d90c5dd923bf281135ab983b2c9
nokogiri (1.19.2-aarch64-linux-gnu) sha256=c34d5c8208025587554608e98fd88ab125b29c80f9352b821964e9a5d5cfbd19
nokogiri (1.19.2-aarch64-linux-musl) sha256=7f6b4b0202d507326841a4f790294bf75098aef50c7173443812e3ac5cb06515
nokogiri (1.19.2-arm-linux-gnu) sha256=b7fa1139016f3dc850bda1260988f0d749934a939d04ef2da13bec060d7d5081
nokogiri (1.19.2-arm-linux-musl) sha256=61114d44f6742ff72194a1b3020967201e2eb982814778d130f6471c11f9828c
nokogiri (1.19.2-arm64-darwin) sha256=58d8ea2e31a967b843b70487a44c14c8ba1866daa1b9da9be9dbdf1b43dee205
nokogiri (1.19.2-java) sha256=e9d67034bc80ca71043040beea8a91be5dc99b662daa38a2bfb361b7a2cc8717
nokogiri (1.19.2-x86_64-darwin) sha256=7d9af11fda72dfaa2961d8c4d5380ca0b51bc389dc5f8d4b859b9644f195e7a4
nokogiri (1.19.2-x86_64-linux-gnu) sha256=fa8feca882b73e871a9845f3817a72e9734c8e974bdc4fbad6e4bc6e8076b94f
nokogiri (1.19.2-x86_64-linux-musl) sha256=93128448e61a9383a30baef041bf1f5817e22f297a1d400521e90294445069a8
Expand All @@ -289,13 +309,17 @@ CHECKSUMS
prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
public_suffix (7.0.5) sha256=1a8bb08f1bbea19228d3bed6e5ed908d1cb4f7c2726d18bd9cadf60bc676f623
puma (8.0.0) sha256=1681050b8b60fab1d3033255ab58b6aec64cd063e43fc6f8204bcb8bf9364b88
puma (8.0.0-java) sha256=494ccc69aa368cf271e2612fcb88cb50653727c160519c95b0b3d34a856fcd81
racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
racc (1.8.1-java) sha256=54f2e6d1e1b91c154013277d986f52a90e5ececbe91465d29172e49342732b98
rack (3.2.6) sha256=5ed78e1f73b2e25679bec7d45ee2d4483cc4146eb1be0264fc4d94cb5ef212c2
rack-protection (4.2.1) sha256=cf6e2842df8c55f5e4d1a4be015e603e19e9bc3a7178bae58949ccbb58558bac
rack-session (2.1.2) sha256=595434f8c0c3473ae7d7ac56ecda6cc6dfd9d37c0b2b5255330aa1576967ffe8
rackup (2.3.1) sha256=6c79c26753778e90983761d677a48937ee3192b3ffef6bc963c0950f94688868
rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
rake (13.4.2) sha256=cb825b2bd5f1f8e91ca37bddb4b9aaf345551b4731da62949be002fa89283701
rake-compiler-dock (1.12.0) sha256=f13205c2738f3d2053afcd03491a9e4541b22a59a0bfc53fc8bc883bd8188023
rb_sys (0.9.127) sha256=e9f90df3bb0577472d26d96127d5b5774b98f44de881e7d36aeefd28d6337847
reek (6.5.0) sha256=d26d3a492773b2bbc228888067a21afe33ac07954a17dbd64cdeae42c4c69be1
regexp_parser (2.12.0) sha256=35a916a1d63190ab5c9009457136ae5f3c0c7512d60291d0d1378ba18ce08ebb
rexml (3.4.4) sha256=19e0a2c3425dfbf2d4fc1189747bdb2f849b6c5e74180401b15734bc97b5d142
Expand All @@ -318,6 +342,7 @@ CHECKSUMS
sinatra (4.2.1) sha256=b7aeb9b11d046b552972ade834f1f9be98b185fa8444480688e3627625377080
syntax_tree (6.3.0) sha256=56e25a9692c798ec94c5442fe94c5e94af76bef91edc8bb02052cbdecf35f13d
thread_safe (0.3.6) sha256=9ed7072821b51c57e8d6b7011a8e282e25aeea3a4065eab326e43f66f063b05a
thread_safe (0.3.6-java) sha256=bb28394cd0924c068981adee71f36a81c85c92e7d74d3f62372bd51489a0e0c2
tilt (2.7.0) sha256=0d5b9ba69f6a36490c64b0eee9f6e9aad517e20dcc848800a06eb116f08c6ab3
tty-which (0.5.0) sha256=5824055f0d6744c97e7c4426544f01d519c40d1806ef2ef47d9854477993f466
unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
Expand Down
12 changes: 6 additions & 6 deletions docs/parsers/comparison.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ Markbridge includes three parsers, each designed for different input formats:
**Location:** `Markbridge::Parsers::HTML::Parser`

**Key features:**
- Leverages Nokogiri's HTML5 parser
- Leverages Nokogiri's HTML parser
- DOM tree traversal
- Handles malformed HTML gracefully
- Void element support (self-closing tags)
Expand Down Expand Up @@ -91,14 +91,14 @@ Input → Scanner → Tokens → Handler (via Registry) → AST
### HTML Parser Architecture

```
Input → Nokogiri HTML5 → DOM Tree → Handler → AST
Input → Nokogiri HTML → DOM Tree → Handler → AST
↓ ↓
Element Nodes process()
Text Nodes (stateless)
```

**Components:**
- **Nokogiri::HTML5:** External HTML parser
- **Nokogiri::HTML:** External HTML parser
- **HandlerRegistry:** Simple tag-to-handler mapping
- **Handlers:** Stateless, receive entire element at once
- **Parser:** Walks DOM tree, dispatches to handlers
Expand Down Expand Up @@ -369,7 +369,7 @@ All three parsers have linear complexity, but differ in implementation:
### HTML Performance Characteristics

**Advantages:**
- ✓ Mature HTML5 parser (Nokogiri)
- ✓ Mature HTML parser (Nokogiri)
- ✓ Handles malformed input well
- ✓ Simple handler API (no state)
- ✓ Battle-tested parsing
Expand Down Expand Up @@ -413,7 +413,7 @@ Memory Usage (10 KB input):

CPU Time (10 KB input):
BBCode: 3-5 ms (custom scanner + state)
HTML: 2-4 ms (Nokogiri HTML5)
HTML: 2-4 ms (Nokogiri HTML)
TextFormatter: 2-4 ms (Nokogiri XML)

Dependencies:
Expand Down Expand Up @@ -678,7 +678,7 @@ end
- ✓ Web scraping → Markdown conversion
- ✓ HTML email → Markdown
- ✓ Handling malformed HTML
- ✓ Leveraging HTML5 standards
- ✓ Leveraging standard HTML parsing
- ✓ Simple handler requirements

**Examples:**
Expand Down
8 changes: 4 additions & 4 deletions docs/parsers/html.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# HTML Parser Guide

This guide explains how the HTML parser converts standard HTML into the Markbridge AST using Nokogiri's HTML5 parser.
This guide explains how the HTML parser converts standard HTML into the Markbridge AST using Nokogiri's HTML parser.

## Table of Contents

Expand All @@ -14,10 +14,10 @@ This guide explains how the HTML parser converts standard HTML into the Markbrid

## Overview

The HTML parser (`Markbridge::Parsers::HTML::Parser`) uses Nokogiri's HTML5 parser to convert HTML markup into AST. It provides a simpler alternative to the BBCode parser when working with HTML content.
The HTML parser (`Markbridge::Parsers::HTML::Parser`) uses Nokogiri to convert HTML markup into AST. It provides a simpler alternative to the BBCode parser when working with HTML content.

**Key Features:**
- Leverages Nokogiri's battle-tested HTML5 parser
- Leverages Nokogiri's battle-tested HTML parser (libxml2 on MRI/TruffleRuby, Xerces/NekoHTML on JRuby)
- Handles malformed HTML gracefully
- Stateless handler API (simpler than BBCode)
- Lambda handler support for quick customization
Expand Down Expand Up @@ -121,7 +121,7 @@ parser.process_children(nokogiri_element, ast_parent)
```

**Parsing Flow:**
1. Parse HTML with Nokogiri::HTML5.fragment
1. Parse HTML with Nokogiri::HTML.fragment
2. Walk DOM tree
3. Dispatch each element to registered handlers
4. Return completed AST::Document
Expand Down
8 changes: 6 additions & 2 deletions lib/markbridge/parsers/html/parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,12 @@ def initialize(handlers: nil, &block)
def parse(input)
@unknown_tags.clear

# Parse HTML with Nokogiri
doc = Nokogiri::HTML5.fragment(input)
# Parse HTML with Nokogiri. Using the generic HTML (HTML4) parser rather
# than HTML5 because Nokogiri::HTML5 is not available on JRuby
# (see sparklemotion/nokogiri#2227). Table support treats thead/tbody/tfoot
# as transparent, so the parse-tree difference (HTML5 auto-inserts tbody,
# HTML4 does not) has no effect on the AST.
doc = Nokogiri::HTML.fragment(input)

# Create root AST document
document = AST::Document.new
Expand Down
14 changes: 11 additions & 3 deletions spec/unit/markbridge/parsers/html/parser_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,17 @@
it "handles malformed HTML gracefully" do
doc = parser.parse("<b>bold <i>italic</b></i>")

# Nokogiri fixes the nesting
expect(doc.children.size).to eq(1)
expect(doc.children[0]).to be_a(Markbridge::AST::Bold)
# Nokogiri recovers from the mismatched tags. The exact tree shape is
# parser-dependent (libxml2 reparents into <b><i>…</i></b>; JRuby's
# NekoHTML leaves <b> and <i> as siblings), but the content survives
# and the first top-level node is always the <b>.
expect(doc.children.first).to be_a(Markbridge::AST::Bold)

# Recursively concatenates the text of every AST::Text node at or below
# the given node; a node that does not respond to #children contributes
# an empty string. This keeps the assertion independent of tree shape.
collect_text = ->(node) do
return node.text if node.is_a?(Markbridge::AST::Text)
node.respond_to?(:children) ? node.children.map(&collect_text).join : ""
end
expect(collect_text.call(doc)).to include("bold", "italic")
end

it "handles empty input" do
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
# frozen_string_literal: true

require "json"
require "commonmarker"

# commonmarker's Rust/Magnus native extension only builds on MRI.
require "commonmarker" if RUBY_ENGINE == "ruby"

# Validates the MarkdownEscaper against all CommonMark Spec 0.31.2 examples.
#
Expand All @@ -15,8 +17,10 @@
# 2. **Content preservation** — every word that appears in the spec's expected
# HTML output must still appear after escaping and re-rendering. This catches
# bugs where the escaper silently drops or corrupts text.
skip_reason = "commonmarker not available on #{RUBY_ENGINE}" unless RUBY_ENGINE == "ruby"

RSpec.describe Markbridge::Renderers::Discourse::MarkdownEscaper do
context "with CommonMark Spec 0.31.2 examples" do
context "with CommonMark Spec 0.31.2 examples", skip: skip_reason do
let(:escaper) { described_class.new(escape_hard_line_breaks: true) }

SPEC_EXAMPLES = JSON.parse(SPEC_ROOT.join("fixtures/commonmark_spec_0.31.2.json").read)
Expand Down