From 6cf1bd2c4d910e6c29310cec12230c1bbd1a9e14 Mon Sep 17 00:00:00 2001 From: Gerhard Schlager Date: Thu, 23 Apr 2026 00:28:12 +0200 Subject: [PATCH 1/4] Add JRuby and TruffleRuby to CI test matrix Also set fail-fast: false so one implementation's failure doesn't cancel the others. --- .github/workflows/main.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 8d5f973..15d3e12 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -34,12 +34,15 @@ jobs: runs-on: ubuntu-latest name: Ruby ${{ matrix.ruby }} strategy: + fail-fast: false matrix: ruby: - '3.2' - '3.3' - '3.4' - '4.0' + - 'jruby' + - 'truffleruby' steps: - name: Checkout From 9a83e00af9d3511e4f30932544fe15de794f5a41 Mon Sep 17 00:00:00 2001 From: Gerhard Schlager Date: Thu, 23 Apr 2026 00:29:02 +0200 Subject: [PATCH 2/4] Make commonmarker MRI-only for JRuby/TruffleRuby compatibility commonmarker 2.x uses a Rust/Magnus native extension that cannot build on JRuby (no universal-java variant) or TruffleRuby (magnus 0.8.2 is incompatible with TruffleRuby's rb-sys bindings). Since commonmarker is only used in one spec (validating MarkdownEscaper against CommonMark Spec 0.31.2 fixtures), gate it with install_if so it's resolved into the lockfile but only installed on MRI, and skip the spec on non-MRI engines. Add universal-java to the lockfile platforms so JRuby can resolve the bundle. --- Gemfile | 2 +- Gemfile.lock | 25 +++++++++++++++++++ .../markdown_escaper/commonmark_spec_spec.rb | 8 ++++-- 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/Gemfile b/Gemfile index 67159f5..6e3fcf9 100644 --- a/Gemfile +++ b/Gemfile @@ -4,7 +4,7 @@ source "https://rubygems.org" gemspec -gem "commonmarker" +gem "commonmarker", install_if: -> { RUBY_ENGINE == "ruby" } gem "lefthook" gem "nokogiri" gem "puma" diff --git a/Gemfile.lock b/Gemfile.lock index 7254406..d3f457a 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -15,10 +15,13 @@ GEM thread_safe (~> 0.3, >= 0.3.1) base64 (0.3.0) bigdecimal (4.1.2) + bigdecimal (4.1.2-java) childprocess (5.1.0) logger (~> 1.5) coercible (1.0.0) descendants_tracker (~> 0.0.1) + commonmarker (2.8.1) + rb_sys (~> 0.9) commonmarker (2.8.1-aarch64-linux) commonmarker (2.8.1-aarch64-linux-musl) commonmarker (2.8.1-arm-linux) @@ -72,6 +75,7 @@ GEM sexp_processor (~> 4.8) ice_nine (0.11.2) json (2.19.4) + json (2.19.4-java) language_server-protocol (3.17.0.5) launchy (3.1.1) addressable (~> 2.8) @@ -82,6 +86,7 @@ GEM logger (1.7.0) mustermann (3.1.1) nio4r (2.7.5) + nio4r (2.7.5-java) nokogiri (1.19.2-aarch64-linux-gnu) racc (~> 1.4) nokogiri (1.19.2-aarch64-linux-musl) @@ -92,6 +97,8 @@ GEM racc (~> 1.4) nokogiri (1.19.2-arm64-darwin) racc (~> 1.4) + nokogiri (1.19.2-java) + racc (~> 1.4) nokogiri (1.19.2-x86_64-darwin) racc (~> 1.4) nokogiri (1.19.2-x86_64-linux-gnu) @@ -109,7 +116,10 @@ GEM public_suffix (7.0.5) puma (8.0.0) nio4r (~> 2.0) + puma (8.0.0-java) + nio4r (~> 2.0) racc (1.8.1) + racc (1.8.1-java) rack (3.2.6) rack-protection (4.2.1) base64 (>= 0.1.0) @@ -122,6 +132,9 @@ GEM rack (>= 3) rainbow (3.1.1) rake (13.4.2) + rake-compiler-dock (1.12.0) + rb_sys (0.9.127) + rake-compiler-dock (= 1.12.0) reek (6.5.0) dry-schema (~> 1.13) logger (~> 1.6) @@ -197,6 +210,7 @@ GEM syntax_tree (6.3.0) prettier_print (>= 1.2.0) thread_safe (0.3.6) + thread_safe (0.3.6-java) tilt (2.7.0) tty-which (0.5.0) unicode-display_width (3.2.0) @@ -214,6 +228,7 @@ PLATFORMS arm-linux-gnu arm-linux-musl arm64-darwin + universal-java x86_64-darwin x86_64-linux-gnu x86_64-linux-musl @@ -240,8 +255,10 @@ CHECKSUMS axiom-types (0.1.1) sha256=c1ff113f3de516fa195b2db7e0a9a95fd1b08475a502ff660d04507a09980383 base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b bigdecimal (4.1.2) sha256=53d217666027eab4280346fba98e7d5b66baaae1b9c3c1c0ffe89d48188a3fbd + bigdecimal (4.1.2-java) sha256=ccc836eab720a525529f70ed0de26a206fdbc9a9e8ac67b3b4ac7318b03e114d childprocess (5.1.0) sha256=9a8d484be2fd4096a0e90a0cd3e449a05bc3aa33f8ac9e4d6dcef6ac1455b6ec coercible (1.0.0) sha256=5081ad24352cc8435ce5472bc2faa30260c7ea7f2102cc6a9f167c4d9bffaadc + commonmarker (2.8.1) sha256=f78caf3cfc671fc64e85c9f49eaab0fb4ed3a96e5489be6cc7f59065768cf63e commonmarker (2.8.1-aarch64-linux) sha256=f855599cc6855f4137d72dcacae9571451075afe6e6c8522eba353df9b81d0bf commonmarker (2.8.1-aarch64-linux-musl) sha256=bbc2b3d361403431a5aac737dc86997d3b2843276ef395e7499cf17ad48e1b2c commonmarker (2.8.1-arm-linux) sha256=ede48564a9c2e29e003361fd7b0b158b3b85bcbd5a15b963fa2b3c78eef3993f @@ -265,6 +282,7 @@ CHECKSUMS flog (4.9.4) sha256=12cc054fab7a2cbd2a906514397c4d7788954d530564782d6f14939dc2dfbcbb ice_nine (0.11.2) sha256=5d506a7d2723d5592dc121b9928e4931742730131f22a1a37649df1c1e2e63db json (2.19.4) sha256=670a7d333fb3b18ca5b29cb255eb7bef099e40d88c02c80bd42a3f30fe5239ac + json (2.19.4-java) sha256=f7f0fe701e2bef648497b0eb59422f5b453e5038cfbaf9cde09af20e22241efb language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc launchy (3.1.1) sha256=72b847b5cc961589dde2c395af0108c86ff0119f42d4648d25b5440ebb10059e lefthook (2.1.6) sha256=e5c4c46f789c54961de030bf324f3fb38989a6adc944a6e8df50b4d432ebaf1d @@ -273,11 +291,13 @@ CHECKSUMS markbridge (0.1.0) mustermann (3.1.1) sha256=4c6170c7234d5499c345562ba7c7dfe73e1754286dcc1abb053064d66a127198 nio4r (2.7.5) sha256=6c90168e48fb5f8e768419c93abb94ba2b892a1d0602cb06eef16d8b7df1dca1 + nio4r (2.7.5-java) sha256=d14779d2a9b012ec0148a53344fbb2ed2a3c4d90c5dd923bf281135ab983b2c9 nokogiri (1.19.2-aarch64-linux-gnu) sha256=c34d5c8208025587554608e98fd88ab125b29c80f9352b821964e9a5d5cfbd19 nokogiri (1.19.2-aarch64-linux-musl) sha256=7f6b4b0202d507326841a4f790294bf75098aef50c7173443812e3ac5cb06515 nokogiri (1.19.2-arm-linux-gnu) sha256=b7fa1139016f3dc850bda1260988f0d749934a939d04ef2da13bec060d7d5081 nokogiri (1.19.2-arm-linux-musl) sha256=61114d44f6742ff72194a1b3020967201e2eb982814778d130f6471c11f9828c nokogiri (1.19.2-arm64-darwin) sha256=58d8ea2e31a967b843b70487a44c14c8ba1866daa1b9da9be9dbdf1b43dee205 + nokogiri (1.19.2-java) sha256=e9d67034bc80ca71043040beea8a91be5dc99b662daa38a2bfb361b7a2cc8717 nokogiri (1.19.2-x86_64-darwin) sha256=7d9af11fda72dfaa2961d8c4d5380ca0b51bc389dc5f8d4b859b9644f195e7a4 nokogiri (1.19.2-x86_64-linux-gnu) sha256=fa8feca882b73e871a9845f3817a72e9734c8e974bdc4fbad6e4bc6e8076b94f nokogiri (1.19.2-x86_64-linux-musl) sha256=93128448e61a9383a30baef041bf1f5817e22f297a1d400521e90294445069a8 @@ -289,13 +309,17 @@ CHECKSUMS prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85 public_suffix (7.0.5) sha256=1a8bb08f1bbea19228d3bed6e5ed908d1cb4f7c2726d18bd9cadf60bc676f623 puma (8.0.0) sha256=1681050b8b60fab1d3033255ab58b6aec64cd063e43fc6f8204bcb8bf9364b88 + puma (8.0.0-java) sha256=494ccc69aa368cf271e2612fcb88cb50653727c160519c95b0b3d34a856fcd81 racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f + racc (1.8.1-java) sha256=54f2e6d1e1b91c154013277d986f52a90e5ececbe91465d29172e49342732b98 rack (3.2.6) sha256=5ed78e1f73b2e25679bec7d45ee2d4483cc4146eb1be0264fc4d94cb5ef212c2 rack-protection (4.2.1) sha256=cf6e2842df8c55f5e4d1a4be015e603e19e9bc3a7178bae58949ccbb58558bac rack-session (2.1.2) sha256=595434f8c0c3473ae7d7ac56ecda6cc6dfd9d37c0b2b5255330aa1576967ffe8 rackup (2.3.1) sha256=6c79c26753778e90983761d677a48937ee3192b3ffef6bc963c0950f94688868 rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a rake (13.4.2) sha256=cb825b2bd5f1f8e91ca37bddb4b9aaf345551b4731da62949be002fa89283701 + rake-compiler-dock (1.12.0) sha256=f13205c2738f3d2053afcd03491a9e4541b22a59a0bfc53fc8bc883bd8188023 + rb_sys (0.9.127) sha256=e9f90df3bb0577472d26d96127d5b5774b98f44de881e7d36aeefd28d6337847 reek (6.5.0) sha256=d26d3a492773b2bbc228888067a21afe33ac07954a17dbd64cdeae42c4c69be1 regexp_parser (2.12.0) sha256=35a916a1d63190ab5c9009457136ae5f3c0c7512d60291d0d1378ba18ce08ebb rexml (3.4.4) sha256=19e0a2c3425dfbf2d4fc1189747bdb2f849b6c5e74180401b15734bc97b5d142 @@ -318,6 +342,7 @@ CHECKSUMS sinatra (4.2.1) sha256=b7aeb9b11d046b552972ade834f1f9be98b185fa8444480688e3627625377080 syntax_tree (6.3.0) sha256=56e25a9692c798ec94c5442fe94c5e94af76bef91edc8bb02052cbdecf35f13d thread_safe (0.3.6) sha256=9ed7072821b51c57e8d6b7011a8e282e25aeea3a4065eab326e43f66f063b05a + thread_safe (0.3.6-java) sha256=bb28394cd0924c068981adee71f36a81c85c92e7d74d3f62372bd51489a0e0c2 tilt (2.7.0) sha256=0d5b9ba69f6a36490c64b0eee9f6e9aad517e20dcc848800a06eb116f08c6ab3 tty-which (0.5.0) sha256=5824055f0d6744c97e7c4426544f01d519c40d1806ef2ef47d9854477993f466 unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42 diff --git a/spec/unit/markbridge/renderers/discourse/markdown_escaper/commonmark_spec_spec.rb b/spec/unit/markbridge/renderers/discourse/markdown_escaper/commonmark_spec_spec.rb index d8a71a0..f6c5452 100644 --- a/spec/unit/markbridge/renderers/discourse/markdown_escaper/commonmark_spec_spec.rb +++ b/spec/unit/markbridge/renderers/discourse/markdown_escaper/commonmark_spec_spec.rb @@ -1,7 +1,9 @@ # frozen_string_literal: true require "json" -require "commonmarker" + +# commonmarker's Rust/Magnus native extension only builds on MRI. +require "commonmarker" if RUBY_ENGINE == "ruby" # Validates the MarkdownEscaper against all CommonMark Spec 0.31.2 examples. # @@ -15,8 +17,10 @@ # 2. **Content preservation** — every word that appears in the spec's expected # HTML output must still appear after escaping and re-rendering. This catches # bugs where the escaper silently drops or corrupts text. +skip_reason = "commonmarker not available on #{RUBY_ENGINE}" unless RUBY_ENGINE == "ruby" + RSpec.describe Markbridge::Renderers::Discourse::MarkdownEscaper do - context "with CommonMark Spec 0.31.2 examples" do + context "with CommonMark Spec 0.31.2 examples", skip: skip_reason do let(:escaper) { described_class.new(escape_hard_line_breaks: true) } SPEC_EXAMPLES = JSON.parse(SPEC_ROOT.join("fixtures/commonmark_spec_0.31.2.json").read) From 7eb0d9e9b96dc67c472c1613a3a4724ff34950dd Mon Sep 17 00:00:00 2001 From: Gerhard Schlager Date: Thu, 23 Apr 2026 00:30:02 +0200 Subject: [PATCH 3/4] Switch HTML parser from Nokogiri::HTML5 to Nokogiri::HTML Nokogiri::HTML5 is not available on JRuby, which blocked running the test suite and using the HTML parser on JRuby. The generic Nokogiri::HTML parser works on all engines. The AST output is identical in practice: the registered handlers only cover basic formatting, lists, links, images, code, quotes, and tables. Table support treats thead/tbody/tfoot as transparent wrappers, so the one notable HTML5/HTML4 difference (implicit tbody insertion) has no effect on the resulting AST. --- docs/parsers/comparison.md | 12 ++++++------ docs/parsers/html.md | 8 ++++---- lib/markbridge/parsers/html/parser.rb | 8 ++++++-- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/docs/parsers/comparison.md b/docs/parsers/comparison.md index 38938e8..323da1d 100644 --- a/docs/parsers/comparison.md +++ b/docs/parsers/comparison.md @@ -44,7 +44,7 @@ Markbridge includes three parsers, each designed for different input formats: **Location:** `Markbridge::Parsers::HTML::Parser` **Key features:** -- Leverages Nokogiri's HTML5 parser +- Leverages Nokogiri's HTML parser - DOM tree traversal - Handles malformed HTML gracefully - Void element support (self-closing tags) @@ -91,14 +91,14 @@ Input → Scanner → Tokens → Handler (via Registry) → AST ### HTML Parser Architecture ``` -Input → Nokogiri HTML5 → DOM Tree → Handler → AST +Input → Nokogiri HTML → DOM Tree → Handler → AST ↓ ↓ Element Nodes process() Text Nodes (stateless) ``` **Components:** -- **Nokogiri::HTML5:** External HTML parser +- **Nokogiri::HTML:** External HTML parser - **HandlerRegistry:** Simple tag-to-handler mapping - **Handlers:** Stateless, receive entire element at once - **Parser:** Walks DOM tree, dispatches to handlers @@ -369,7 +369,7 @@ All three parsers have linear complexity, but differ in implementation: ### HTML Performance Characteristics **Advantages:** -- ✓ Mature HTML5 parser (Nokogiri) +- ✓ Mature HTML parser (Nokogiri) - ✓ Handles malformed input well - ✓ Simple handler API (no state) - ✓ Battle-tested parsing @@ -413,7 +413,7 @@ Memory Usage (10 KB input): CPU Time (10 KB input): BBCode: 3-5 ms (custom scanner + state) - HTML: 2-4 ms (Nokogiri HTML5) + HTML: 2-4 ms (Nokogiri HTML) TextFormatter: 2-4 ms (Nokogiri XML) Dependencies: @@ -678,7 +678,7 @@ end - ✓ Web scraping → Markdown conversion - ✓ HTML email → Markdown - ✓ Handling malformed HTML -- ✓ Leveraging HTML5 standards +- ✓ Leveraging standard HTML parsing - ✓ Simple handler requirements **Examples:** diff --git a/docs/parsers/html.md b/docs/parsers/html.md index bc3df31..5e752b2 100644 --- a/docs/parsers/html.md +++ b/docs/parsers/html.md @@ -1,6 +1,6 @@ # HTML Parser Guide -This guide explains how the HTML parser converts standard HTML into the Markbridge AST using Nokogiri's HTML5 parser. +This guide explains how the HTML parser converts standard HTML into the Markbridge AST using Nokogiri's HTML parser. ## Table of Contents @@ -14,10 +14,10 @@ This guide explains how the HTML parser converts standard HTML into the Markbrid ## Overview -The HTML parser (`Markbridge::Parsers::HTML::Parser`) uses Nokogiri's HTML5 parser to convert HTML markup into AST. It provides a simpler alternative to the BBCode parser when working with HTML content. +The HTML parser (`Markbridge::Parsers::HTML::Parser`) uses Nokogiri to convert HTML markup into AST. It provides a simpler alternative to the BBCode parser when working with HTML content. **Key Features:** -- Leverages Nokogiri's battle-tested HTML5 parser +- Leverages Nokogiri's battle-tested HTML parser (libxml2 on MRI/TruffleRuby, Xerces/NekoHTML on JRuby) - Handles malformed HTML gracefully - Stateless handler API (simpler than BBCode) - Lambda handler support for quick customization @@ -121,7 +121,7 @@ parser.process_children(nokogiri_element, ast_parent) ``` **Parsing Flow:** -1. Parse HTML with Nokogiri::HTML5.fragment +1. Parse HTML with Nokogiri::HTML.fragment 2. Walk DOM tree 3. Dispatch each element to registered handlers 4. Return completed AST::Document diff --git a/lib/markbridge/parsers/html/parser.rb b/lib/markbridge/parsers/html/parser.rb index 61a718c..04718c9 100644 --- a/lib/markbridge/parsers/html/parser.rb +++ b/lib/markbridge/parsers/html/parser.rb @@ -26,8 +26,12 @@ def initialize(handlers: nil, &block) def parse(input) @unknown_tags.clear - # Parse HTML with Nokogiri - doc = Nokogiri::HTML5.fragment(input) + # Parse HTML with Nokogiri. Using the generic HTML (HTML4) parser rather + # than HTML5 because Nokogiri::HTML5 is not available on JRuby + # (see sparklemotion/nokogiri#2227). Table support treats thead/tbody/tfoot + # as transparent, so the parse-tree difference (HTML5 auto-inserts tbody, + # HTML4 does not) has no effect on the AST. + doc = Nokogiri::HTML.fragment(input) # Create root AST document document = AST::Document.new From c6cfabfb2ea41b2aeaa9bcaca95902d08597a185 Mon Sep 17 00:00:00 2001 From: Gerhard Schlager Date: Thu, 23 Apr 2026 00:30:02 +0200 Subject: [PATCH 4/4] Loosen malformed-HTML spec to accept parser-specific recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing assertion required the recovered tree to be a single nested italic. That holds on libxml2 (MRI/TruffleRuby) but not on JRuby's NekoHTML, which leaves and as siblings. Both are valid recoveries — nothing crashes, and all text survives. Assert only that the first child is still a Bold and that both words appear somewhere in the tree, which is what "gracefully" actually means. --- spec/unit/markbridge/parsers/html/parser_spec.rb | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/spec/unit/markbridge/parsers/html/parser_spec.rb b/spec/unit/markbridge/parsers/html/parser_spec.rb index b3d7e13..9a2d748 100644 --- a/spec/unit/markbridge/parsers/html/parser_spec.rb +++ b/spec/unit/markbridge/parsers/html/parser_spec.rb @@ -150,9 +150,17 @@ it "handles malformed HTML gracefully" do doc = parser.parse("bold italic") - # Nokogiri fixes the nesting - expect(doc.children.size).to eq(1) - expect(doc.children[0]).to be_a(Markbridge::AST::Bold) + # Nokogiri recovers from the mismatched tags. The exact tree shape is + # parser-dependent (libxml2 reparents into ; JRuby's + # NekoHTML leaves and as siblings), but the content survives + # and the top-level node is always the . + expect(doc.children.first).to be_a(Markbridge::AST::Bold) + + collect_text = ->(node) do + return node.text if node.is_a?(Markbridge::AST::Text) + node.respond_to?(:children) ? node.children.map(&collect_text).join : "" + end + expect(collect_text.call(doc)).to include("bold", "italic") end it "handles empty input" do