diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 8d5f973..15d3e12 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -34,12 +34,15 @@ jobs:
runs-on: ubuntu-latest
name: Ruby ${{ matrix.ruby }}
strategy:
+ fail-fast: false
matrix:
ruby:
- '3.2'
- '3.3'
- '3.4'
- '4.0'
+ - 'jruby'
+ - 'truffleruby'
steps:
- name: Checkout
diff --git a/Gemfile b/Gemfile
index 67159f5..6e3fcf9 100644
--- a/Gemfile
+++ b/Gemfile
@@ -4,7 +4,7 @@ source "https://rubygems.org"
gemspec
-gem "commonmarker"
+gem "commonmarker", install_if: -> { RUBY_ENGINE == "ruby" }
gem "lefthook"
gem "nokogiri"
gem "puma"
diff --git a/Gemfile.lock b/Gemfile.lock
index 7254406..d3f457a 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -15,10 +15,13 @@ GEM
thread_safe (~> 0.3, >= 0.3.1)
base64 (0.3.0)
bigdecimal (4.1.2)
+ bigdecimal (4.1.2-java)
childprocess (5.1.0)
logger (~> 1.5)
coercible (1.0.0)
descendants_tracker (~> 0.0.1)
+ commonmarker (2.8.1)
+ rb_sys (~> 0.9)
commonmarker (2.8.1-aarch64-linux)
commonmarker (2.8.1-aarch64-linux-musl)
commonmarker (2.8.1-arm-linux)
@@ -72,6 +75,7 @@ GEM
sexp_processor (~> 4.8)
ice_nine (0.11.2)
json (2.19.4)
+ json (2.19.4-java)
language_server-protocol (3.17.0.5)
launchy (3.1.1)
addressable (~> 2.8)
@@ -82,6 +86,7 @@ GEM
logger (1.7.0)
mustermann (3.1.1)
nio4r (2.7.5)
+ nio4r (2.7.5-java)
nokogiri (1.19.2-aarch64-linux-gnu)
racc (~> 1.4)
nokogiri (1.19.2-aarch64-linux-musl)
@@ -92,6 +97,8 @@ GEM
racc (~> 1.4)
nokogiri (1.19.2-arm64-darwin)
racc (~> 1.4)
+ nokogiri (1.19.2-java)
+ racc (~> 1.4)
nokogiri (1.19.2-x86_64-darwin)
racc (~> 1.4)
nokogiri (1.19.2-x86_64-linux-gnu)
@@ -109,7 +116,10 @@ GEM
public_suffix (7.0.5)
puma (8.0.0)
nio4r (~> 2.0)
+ puma (8.0.0-java)
+ nio4r (~> 2.0)
racc (1.8.1)
+ racc (1.8.1-java)
rack (3.2.6)
rack-protection (4.2.1)
base64 (>= 0.1.0)
@@ -122,6 +132,9 @@ GEM
rack (>= 3)
rainbow (3.1.1)
rake (13.4.2)
+ rake-compiler-dock (1.12.0)
+ rb_sys (0.9.127)
+ rake-compiler-dock (= 1.12.0)
reek (6.5.0)
dry-schema (~> 1.13)
logger (~> 1.6)
@@ -197,6 +210,7 @@ GEM
syntax_tree (6.3.0)
prettier_print (>= 1.2.0)
thread_safe (0.3.6)
+ thread_safe (0.3.6-java)
tilt (2.7.0)
tty-which (0.5.0)
unicode-display_width (3.2.0)
@@ -214,6 +228,7 @@ PLATFORMS
arm-linux-gnu
arm-linux-musl
arm64-darwin
+ universal-java
x86_64-darwin
x86_64-linux-gnu
x86_64-linux-musl
@@ -240,8 +255,10 @@ CHECKSUMS
axiom-types (0.1.1) sha256=c1ff113f3de516fa195b2db7e0a9a95fd1b08475a502ff660d04507a09980383
base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b
bigdecimal (4.1.2) sha256=53d217666027eab4280346fba98e7d5b66baaae1b9c3c1c0ffe89d48188a3fbd
+ bigdecimal (4.1.2-java) sha256=ccc836eab720a525529f70ed0de26a206fdbc9a9e8ac67b3b4ac7318b03e114d
childprocess (5.1.0) sha256=9a8d484be2fd4096a0e90a0cd3e449a05bc3aa33f8ac9e4d6dcef6ac1455b6ec
coercible (1.0.0) sha256=5081ad24352cc8435ce5472bc2faa30260c7ea7f2102cc6a9f167c4d9bffaadc
+ commonmarker (2.8.1) sha256=f78caf3cfc671fc64e85c9f49eaab0fb4ed3a96e5489be6cc7f59065768cf63e
commonmarker (2.8.1-aarch64-linux) sha256=f855599cc6855f4137d72dcacae9571451075afe6e6c8522eba353df9b81d0bf
commonmarker (2.8.1-aarch64-linux-musl) sha256=bbc2b3d361403431a5aac737dc86997d3b2843276ef395e7499cf17ad48e1b2c
commonmarker (2.8.1-arm-linux) sha256=ede48564a9c2e29e003361fd7b0b158b3b85bcbd5a15b963fa2b3c78eef3993f
@@ -265,6 +282,7 @@ CHECKSUMS
flog (4.9.4) sha256=12cc054fab7a2cbd2a906514397c4d7788954d530564782d6f14939dc2dfbcbb
ice_nine (0.11.2) sha256=5d506a7d2723d5592dc121b9928e4931742730131f22a1a37649df1c1e2e63db
json (2.19.4) sha256=670a7d333fb3b18ca5b29cb255eb7bef099e40d88c02c80bd42a3f30fe5239ac
+ json (2.19.4-java) sha256=f7f0fe701e2bef648497b0eb59422f5b453e5038cfbaf9cde09af20e22241efb
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
launchy (3.1.1) sha256=72b847b5cc961589dde2c395af0108c86ff0119f42d4648d25b5440ebb10059e
lefthook (2.1.6) sha256=e5c4c46f789c54961de030bf324f3fb38989a6adc944a6e8df50b4d432ebaf1d
@@ -273,11 +291,13 @@ CHECKSUMS
markbridge (0.1.0)
mustermann (3.1.1) sha256=4c6170c7234d5499c345562ba7c7dfe73e1754286dcc1abb053064d66a127198
nio4r (2.7.5) sha256=6c90168e48fb5f8e768419c93abb94ba2b892a1d0602cb06eef16d8b7df1dca1
+ nio4r (2.7.5-java) sha256=d14779d2a9b012ec0148a53344fbb2ed2a3c4d90c5dd923bf281135ab983b2c9
nokogiri (1.19.2-aarch64-linux-gnu) sha256=c34d5c8208025587554608e98fd88ab125b29c80f9352b821964e9a5d5cfbd19
nokogiri (1.19.2-aarch64-linux-musl) sha256=7f6b4b0202d507326841a4f790294bf75098aef50c7173443812e3ac5cb06515
nokogiri (1.19.2-arm-linux-gnu) sha256=b7fa1139016f3dc850bda1260988f0d749934a939d04ef2da13bec060d7d5081
nokogiri (1.19.2-arm-linux-musl) sha256=61114d44f6742ff72194a1b3020967201e2eb982814778d130f6471c11f9828c
nokogiri (1.19.2-arm64-darwin) sha256=58d8ea2e31a967b843b70487a44c14c8ba1866daa1b9da9be9dbdf1b43dee205
+ nokogiri (1.19.2-java) sha256=e9d67034bc80ca71043040beea8a91be5dc99b662daa38a2bfb361b7a2cc8717
nokogiri (1.19.2-x86_64-darwin) sha256=7d9af11fda72dfaa2961d8c4d5380ca0b51bc389dc5f8d4b859b9644f195e7a4
nokogiri (1.19.2-x86_64-linux-gnu) sha256=fa8feca882b73e871a9845f3817a72e9734c8e974bdc4fbad6e4bc6e8076b94f
nokogiri (1.19.2-x86_64-linux-musl) sha256=93128448e61a9383a30baef041bf1f5817e22f297a1d400521e90294445069a8
@@ -289,13 +309,17 @@ CHECKSUMS
prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
public_suffix (7.0.5) sha256=1a8bb08f1bbea19228d3bed6e5ed908d1cb4f7c2726d18bd9cadf60bc676f623
puma (8.0.0) sha256=1681050b8b60fab1d3033255ab58b6aec64cd063e43fc6f8204bcb8bf9364b88
+ puma (8.0.0-java) sha256=494ccc69aa368cf271e2612fcb88cb50653727c160519c95b0b3d34a856fcd81
racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
+ racc (1.8.1-java) sha256=54f2e6d1e1b91c154013277d986f52a90e5ececbe91465d29172e49342732b98
rack (3.2.6) sha256=5ed78e1f73b2e25679bec7d45ee2d4483cc4146eb1be0264fc4d94cb5ef212c2
rack-protection (4.2.1) sha256=cf6e2842df8c55f5e4d1a4be015e603e19e9bc3a7178bae58949ccbb58558bac
rack-session (2.1.2) sha256=595434f8c0c3473ae7d7ac56ecda6cc6dfd9d37c0b2b5255330aa1576967ffe8
rackup (2.3.1) sha256=6c79c26753778e90983761d677a48937ee3192b3ffef6bc963c0950f94688868
rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
rake (13.4.2) sha256=cb825b2bd5f1f8e91ca37bddb4b9aaf345551b4731da62949be002fa89283701
+ rake-compiler-dock (1.12.0) sha256=f13205c2738f3d2053afcd03491a9e4541b22a59a0bfc53fc8bc883bd8188023
+ rb_sys (0.9.127) sha256=e9f90df3bb0577472d26d96127d5b5774b98f44de881e7d36aeefd28d6337847
reek (6.5.0) sha256=d26d3a492773b2bbc228888067a21afe33ac07954a17dbd64cdeae42c4c69be1
regexp_parser (2.12.0) sha256=35a916a1d63190ab5c9009457136ae5f3c0c7512d60291d0d1378ba18ce08ebb
rexml (3.4.4) sha256=19e0a2c3425dfbf2d4fc1189747bdb2f849b6c5e74180401b15734bc97b5d142
@@ -318,6 +342,7 @@ CHECKSUMS
sinatra (4.2.1) sha256=b7aeb9b11d046b552972ade834f1f9be98b185fa8444480688e3627625377080
syntax_tree (6.3.0) sha256=56e25a9692c798ec94c5442fe94c5e94af76bef91edc8bb02052cbdecf35f13d
thread_safe (0.3.6) sha256=9ed7072821b51c57e8d6b7011a8e282e25aeea3a4065eab326e43f66f063b05a
+ thread_safe (0.3.6-java) sha256=bb28394cd0924c068981adee71f36a81c85c92e7d74d3f62372bd51489a0e0c2
tilt (2.7.0) sha256=0d5b9ba69f6a36490c64b0eee9f6e9aad517e20dcc848800a06eb116f08c6ab3
tty-which (0.5.0) sha256=5824055f0d6744c97e7c4426544f01d519c40d1806ef2ef47d9854477993f466
unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
diff --git a/docs/parsers/comparison.md b/docs/parsers/comparison.md
index 38938e8..323da1d 100644
--- a/docs/parsers/comparison.md
+++ b/docs/parsers/comparison.md
@@ -44,7 +44,7 @@ Markbridge includes three parsers, each designed for different input formats:
**Location:** `Markbridge::Parsers::HTML::Parser`
**Key features:**
-- Leverages Nokogiri's HTML5 parser
+- Leverages Nokogiri's HTML parser
- DOM tree traversal
- Handles malformed HTML gracefully
- Void element support (self-closing tags)
@@ -91,14 +91,14 @@ Input → Scanner → Tokens → Handler (via Registry) → AST
### HTML Parser Architecture
```
-Input → Nokogiri HTML5 → DOM Tree → Handler → AST
+Input → Nokogiri HTML → DOM Tree → Handler → AST
↓ ↓
Element Nodes process()
Text Nodes (stateless)
```
**Components:**
-- **Nokogiri::HTML5:** External HTML parser
+- **Nokogiri::HTML:** External HTML parser
- **HandlerRegistry:** Simple tag-to-handler mapping
- **Handlers:** Stateless, receive entire element at once
- **Parser:** Walks DOM tree, dispatches to handlers
@@ -369,7 +369,7 @@ All three parsers have linear complexity, but differ in implementation:
### HTML Performance Characteristics
**Advantages:**
-- ✓ Mature HTML5 parser (Nokogiri)
+- ✓ Mature HTML parser (Nokogiri)
- ✓ Handles malformed input well
- ✓ Simple handler API (no state)
- ✓ Battle-tested parsing
@@ -413,7 +413,7 @@ Memory Usage (10 KB input):
CPU Time (10 KB input):
BBCode: 3-5 ms (custom scanner + state)
- HTML: 2-4 ms (Nokogiri HTML5)
+ HTML: 2-4 ms (Nokogiri HTML)
TextFormatter: 2-4 ms (Nokogiri XML)
Dependencies:
@@ -678,7 +678,7 @@ end
- ✓ Web scraping → Markdown conversion
- ✓ HTML email → Markdown
- ✓ Handling malformed HTML
-- ✓ Leveraging HTML5 standards
+- ✓ Leveraging standard HTML parsing
- ✓ Simple handler requirements
**Examples:**
diff --git a/docs/parsers/html.md b/docs/parsers/html.md
index bc3df31..5e752b2 100644
--- a/docs/parsers/html.md
+++ b/docs/parsers/html.md
@@ -1,6 +1,6 @@
# HTML Parser Guide
-This guide explains how the HTML parser converts standard HTML into the Markbridge AST using Nokogiri's HTML5 parser.
+This guide explains how the HTML parser converts standard HTML into the Markbridge AST using Nokogiri's HTML parser.
## Table of Contents
@@ -14,10 +14,10 @@ This guide explains how the HTML parser converts standard HTML into the Markbrid
## Overview
-The HTML parser (`Markbridge::Parsers::HTML::Parser`) uses Nokogiri's HTML5 parser to convert HTML markup into AST. It provides a simpler alternative to the BBCode parser when working with HTML content.
+The HTML parser (`Markbridge::Parsers::HTML::Parser`) uses Nokogiri to convert HTML markup into an AST. It provides a simpler alternative to the BBCode parser when working with HTML content.
**Key Features:**
-- Leverages Nokogiri's battle-tested HTML5 parser
+- Leverages Nokogiri's battle-tested HTML parser (libxml2 on MRI/TruffleRuby, Xerces/NekoHTML on JRuby)
- Handles malformed HTML gracefully
- Stateless handler API (simpler than BBCode)
- Lambda handler support for quick customization
@@ -121,7 +121,7 @@ parser.process_children(nokogiri_element, ast_parent)
```
**Parsing Flow:**
-1. Parse HTML with Nokogiri::HTML5.fragment
+1. Parse HTML with Nokogiri::HTML.fragment
2. Walk DOM tree
3. Dispatch each element to registered handlers
4. Return completed AST::Document
diff --git a/lib/markbridge/parsers/html/parser.rb b/lib/markbridge/parsers/html/parser.rb
index 61a718c..04718c9 100644
--- a/lib/markbridge/parsers/html/parser.rb
+++ b/lib/markbridge/parsers/html/parser.rb
@@ -26,8 +26,12 @@ def initialize(handlers: nil, &block)
def parse(input)
@unknown_tags.clear
- # Parse HTML with Nokogiri
- doc = Nokogiri::HTML5.fragment(input)
+ # Parse HTML with Nokogiri. Using the generic HTML (HTML4) parser rather
+ # than HTML5 because Nokogiri::HTML5 is not available on JRuby
+ # (see sparklemotion/nokogiri#2227). Table support treats thead/tbody/tfoot
+ # as transparent, so the parse-tree difference (HTML5 auto-inserts tbody,
+ # HTML4 does not) has no effect on the AST.
+ doc = Nokogiri::HTML.fragment(input)
# Create root AST document
document = AST::Document.new
diff --git a/spec/unit/markbridge/parsers/html/parser_spec.rb b/spec/unit/markbridge/parsers/html/parser_spec.rb
index b3d7e13..9a2d748 100644
--- a/spec/unit/markbridge/parsers/html/parser_spec.rb
+++ b/spec/unit/markbridge/parsers/html/parser_spec.rb
@@ -150,9 +150,17 @@
it "handles malformed HTML gracefully" do
doc = parser.parse("bold italic")
- # Nokogiri fixes the nesting
- expect(doc.children.size).to eq(1)
- expect(doc.children[0]).to be_a(Markbridge::AST::Bold)
+ # Nokogiri recovers from the mismatched tags. The exact tree shape is
+ # parser-dependent (libxml2 nests the italic element inside the bold one;
+ # JRuby's NekoHTML leaves the bold and italic elements as siblings), but the
+ # content survives and the top-level node is always the bold element.
+ expect(doc.children.first).to be_a(Markbridge::AST::Bold)
+
+ collect_text = ->(node) do
+ return node.text if node.is_a?(Markbridge::AST::Text)
+ node.respond_to?(:children) ? node.children.map(&collect_text).join : ""
+ end
+ expect(collect_text.call(doc)).to include("bold", "italic")
end
it "handles empty input" do
diff --git a/spec/unit/markbridge/renderers/discourse/markdown_escaper/commonmark_spec_spec.rb b/spec/unit/markbridge/renderers/discourse/markdown_escaper/commonmark_spec_spec.rb
index d8a71a0..f6c5452 100644
--- a/spec/unit/markbridge/renderers/discourse/markdown_escaper/commonmark_spec_spec.rb
+++ b/spec/unit/markbridge/renderers/discourse/markdown_escaper/commonmark_spec_spec.rb
@@ -1,7 +1,9 @@
# frozen_string_literal: true
require "json"
-require "commonmarker"
+
+# commonmarker's Rust/Magnus native extension only builds on MRI.
+require "commonmarker" if RUBY_ENGINE == "ruby"
# Validates the MarkdownEscaper against all CommonMark Spec 0.31.2 examples.
#
@@ -15,8 +17,10 @@
# 2. **Content preservation** — every word that appears in the spec's expected
# HTML output must still appear after escaping and re-rendering. This catches
# bugs where the escaper silently drops or corrupts text.
+skip_reason = "commonmarker not available on #{RUBY_ENGINE}" unless RUBY_ENGINE == "ruby"
+
RSpec.describe Markbridge::Renderers::Discourse::MarkdownEscaper do
- context "with CommonMark Spec 0.31.2 examples" do
+ context "with CommonMark Spec 0.31.2 examples", skip: skip_reason do
let(:escaper) { described_class.new(escape_hard_line_breaks: true) }
SPEC_EXAMPLES = JSON.parse(SPEC_ROOT.join("fixtures/commonmark_spec_0.31.2.json").read)