From 46cc1f82eabffef0d3098f721c5e1036226095ed Mon Sep 17 00:00:00 2001
From: tompng
Date: Wed, 26 Nov 2025 01:55:35 +0900
Subject: [PATCH] Tokenizer for syntax highlighting using Prism

---
 lib/rdoc/markup/to_html.rb          |  38 +++--
 lib/rdoc/parser/prism_ruby.rb       |  46 +++---
 lib/rdoc/parser/tokenizer.rb        | 243 ++++++++++++++++++++++++++++
 lib/rdoc/token_stream.rb            |   2 +
 test/rdoc/parser/prism_ruby_test.rb |  18 +++
 test/rdoc/parser/tokenizer_test.rb  | 141 ++++++++++++++++
 6 files changed, 448 insertions(+), 40 deletions(-)
 create mode 100644 lib/rdoc/parser/tokenizer.rb
 create mode 100644 test/rdoc/parser/tokenizer_test.rb

diff --git a/lib/rdoc/markup/to_html.rb b/lib/rdoc/markup/to_html.rb
index 8d019bef46..f0ad1bf848 100644
--- a/lib/rdoc/markup/to_html.rb
+++ b/lib/rdoc/markup/to_html.rb
@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 require 'cgi/escape'
 require 'cgi/util' unless defined?(CGI::EscapeExt)
+require 'rdoc/parser/ripper_state_lex'

 ##
 # Outputs RDoc markup as HTML.
@@ -216,6 +217,23 @@ def accept_paragraph(paragraph)
     @res << "</p>\n"
   end

+  # Generate syntax-highlighted HTML for Ruby-like text.
+
+  def parsable_text_to_html(text)
+    if defined?(RDoc::Parser::PrismRuby) && RDoc::Parser::Ruby == RDoc::Parser::PrismRuby
+      tokens = RDoc::Parser::Tokenizer.tokenize(text).map do |type, text|
+        RDoc::TokenStream::RipperStateLexCompatToken.new(type, text)
+      end
+    else
+      # RipperStateLex.parse is assumed to fail on some inputs,
+      # but which inputs make it fail is unknown.
+      tokens = RDoc::Parser::RipperStateLex.parse(text) rescue return
+    end
+    result = RDoc::TokenStream.to_html tokens
+    result = result + "\n" unless "\n" == result[-1]
+    result
+  end
+
   ##
   # Adds +verbatim+ to the output

@@ -224,20 +242,12 @@ def accept_verbatim(verbatim)

     klass = nil

-    content = if verbatim.ruby? or parseable? text then
-                begin
-                  tokens = RDoc::Parser::RipperStateLex.parse text
-                  klass = ' class="ruby"'
-
-                  result = RDoc::TokenStream.to_html tokens
-                  result = result + "\n" unless "\n" == result[-1]
-                  result
-                rescue
-                  CGI.escapeHTML text
-                end
-              else
-                CGI.escapeHTML text
-              end
+    if verbatim.ruby? || parseable?(text)
+      content = parsable_text_to_html(text)
+      klass = ' class="ruby"' if content # RDoc::Parser::RipperStateLex.parse may fail
+    end
+
+    content ||= CGI.escapeHTML text

     if @options.pipe then
       @res << "\n<pre><code>#{CGI.escapeHTML text}\n</code></pre>\n"
diff --git a/lib/rdoc/parser/prism_ruby.rb b/lib/rdoc/parser/prism_ruby.rb
index 56da6ac227..c18b37e8b3 100644
--- a/lib/rdoc/parser/prism_ruby.rb
+++ b/lib/rdoc/parser/prism_ruby.rb
@@ -1,7 +1,7 @@
 # frozen_string_literal: true

 require 'prism'
-require_relative 'ripper_state_lex'
+require_relative 'tokenizer'

 # Unlike lib/rdoc/parser/ruby.rb, this file is not based on rtags and does not contain code from
 # rtags.rb
@@ -89,10 +89,13 @@ def record_location(container) # :nodoc:

   # Scans this Ruby file for Ruby constructs

   def scan
-    @tokens = RDoc::Parser::RipperStateLex.parse(@content)
     @lines = @content.lines
-    result = Prism.parse(@content)
-    @program_node = result.value
+    result = Prism.parse_lex(@content)
+    @prism_comments = result.comments
+    @program_node, unordered_tokens = result.value
+    # Heredoc tokens are not in start_offset order.
+    # They must be sorted so that bsearch can find tokens by location.
+    @prism_tokens = unordered_tokens.map(&:first).sort_by { |t| t.location.start_offset }
     @line_nodes = {}
     prepare_line_nodes(@program_node)
     prepare_comments(result.comments)
@@ -205,7 +208,7 @@ def parse_comment_tomdoc(container, comment, line_no, start_line)
     meth.start_collecting_tokens(:ruby)
     node = @line_nodes[line_no]
-    tokens = node ? visible_tokens_from_location(node.location) : [file_line_comment_token(start_line)]
+    tokens = node ? visible_tokens_from_node(node) : [file_line_comment_token(start_line)]
     tokens.each { |token| meth.token_stream << token }

     container.add_method meth
@@ -273,7 +276,7 @@ def handle_meta_method_comment(comment, directives, node)
     elsif line_no || node
       method_name ||= call_node_name_arguments(node).first if is_call_node
       if node
-        tokens = visible_tokens_from_location(node.location)
+        tokens = visible_tokens_from_node(node)
         line_no = node.location.start_line
       else
         tokens = [file_line_comment_token(line_no)]
@@ -368,30 +371,21 @@ def parse_comment_text_to_directives(comment_text, start_line) # :nodoc:
     [comment, directives]
   end

-  def slice_tokens(start_pos, end_pos) # :nodoc:
-    start_index = @tokens.bsearch_index { |t| ([t.line_no, t.char_no] <=> start_pos) >= 0 }
-    end_index = @tokens.bsearch_index { |t| ([t.line_no, t.char_no] <=> end_pos) >= 0 }
-    tokens = @tokens[start_index...end_index]
-    tokens.pop if tokens.last&.kind == :on_nl
-    tokens
-  end
-
   def file_line_comment_token(line_no) # :nodoc:
-    position_comment = RDoc::Parser::RipperStateLex::Token.new(line_no - 1, 0, :on_comment)
-    position_comment[:text] = "# File #{@top_level.relative_name}, line #{line_no}"
-    position_comment
+    text = "# File #{@top_level.relative_name}, line #{line_no}"
+    RDoc::TokenStream::RipperStateLexCompatToken.new(:on_comment, text)
   end

-  # Returns tokens from the given location
+  # Returns tokens of the given node's location for syntax highlighting

-  def visible_tokens_from_location(location)
+  def visible_tokens_from_node(node)
+    location = node.location
     position_comment = file_line_comment_token(location.start_line)
-    newline_token = RDoc::Parser::RipperStateLex::Token.new(0, 0, :on_nl, "\n")
-    indent_token = RDoc::Parser::RipperStateLex::Token.new(location.start_line, 0, :on_sp, ' ' * location.start_character_column)
-    tokens = slice_tokens(
-      [location.start_line, location.start_character_column],
-      [location.end_line, location.end_character_column]
-    )
+    newline_token = RDoc::TokenStream::RipperStateLexCompatToken.new(:on_nl, "\n")
+    indent_token = RDoc::TokenStream::RipperStateLexCompatToken.new(:on_sp, ' ' * location.start_character_column)
+    tokens = RDoc::Parser::Tokenizer.partial_tokenize(@content, node, @prism_tokens, @prism_comments).map do |type, text|
+      RDoc::TokenStream::RipperStateLexCompatToken.new(type, text)
+    end
     [position_comment, newline_token, indent_token, *tokens]
   end

@@ -894,7 +888,7 @@ def visit_def_node(node)
     end
     name = node.name.to_s
     params, block_params, calls_super = MethodSignatureVisitor.scan_signature(node)
-    tokens = @scanner.visible_tokens_from_location(node.location)
+    tokens = @scanner.visible_tokens_from_node(node)

     @scanner.add_method(
       name,
diff --git a/lib/rdoc/parser/tokenizer.rb b/lib/rdoc/parser/tokenizer.rb
new file mode 100644
index 0000000000..cc8dd181e1
--- /dev/null
+++ b/lib/rdoc/parser/tokenizer.rb
@@ -0,0 +1,243 @@
+require 'prism'
+require 'set'
+
+# Tokenizes Ruby code into RDoc::Parser::RipperStateLex-style token types, with token squashing.
+# Token squashing is required by RDoc::TokenStream's syntax highlighting.
+module RDoc::Parser::Tokenizer
+  # These constants and the token type map are for compatibility with RDoc::Parser::RipperStateLex.
+  OTHER = :other
+  SPACE = :on_sp
+  NEWLINE = :on_nl
+  KEYWORD = :on_kw
+  OP = :on_op
+  HEREDOC_BEG = :on_heredoc_beg
+  HEREDOC_CONTENT = :on_heredoc
+  HEREDOC_END = :on_heredoc_end
+  COMMENT = :on_comment
+  INTEGER = :on_int
+  FLOAT = :on_float
+  RATIONAL = :on_rational
+  IMAGINARY = :on_imaginary
+  SYMBOL = :on_symbol
+  REGEXP = :on_regexp
+  STRING = :on_tstring
+  WORDS = :on_dstring
+  DEF_METHOD_NAME = :on_ident
+  DSTRING = :on_dstring
+
+  OP_TOKENS = %i[
+    AMPERSAND AMPERSAND_AMPERSAND
+    BANG BANG_EQUAL BANG_TILDE CARET COLON COLON_COLON
+    EQUAL EQUAL_EQUAL EQUAL_GREATER EQUAL_TILDE
+    GREATER GREATER_GREATER
+    LESS LESS_EQUAL LESS_EQUAL_GREATER LESS_LESS
+    MINUS MINUS_GREATER PERCENT PIPE PIPE_PIPE PLUS
+    QUESTION_MARK SLASH STAR STAR_STAR TILDE
+    UAMPERSAND UMINUS UPLUS USTAR USTAR_STAR
+  ].to_set
+
+  TOKEN_TYPE_MAP = {
+    IDENTIFIER: :on_ident,
+    METHOD_NAME: :on_ident,
+    INSTANCE_VARIABLE: :on_ivar,
+    CLASS_VARIABLE: :on_cvar,
+    GLOBAL_VARIABLE: :on_gvar,
+    BACK_REFERENCE: :on_backref,
+    NUMBERED_REFERENCE: :on_backref,
+    CONSTANT: :on_const,
+    LABEL: :on_label,
+    INTEGER: :on_int,
+    FLOAT: :on_float,
+    RATIONAL: :on_rational,
+    IMAGINARY: :on_imaginary,
+  }
+
+  class << self
+    def tokenize(code)
+      result = Prism.parse_lex(code)
+      program_node, unordered_tokens = result.value
+      prism_tokens = unordered_tokens.map(&:first).sort_by! { |token| token.location.start_offset }
+      partial_tokenize(code, program_node, prism_tokens, result.comments, 0, code.bytesize)
+    end
+
+    def partial_tokenize(whole_code, node, prism_tokens, prism_comments, start_offset = nil, end_offset = nil)
+      start_offset ||= node.location.start_offset
+      end_offset ||= node.location.end_offset
+      visitor = SquashTokenVisitor.new
+      node.accept(visitor)
+      squashed_tokens = visitor.tokens
+      comment_tokens = comment_tokens(slice_by_location(prism_comments, start_offset, end_offset))
+      normal_tokens = normal_tokens(slice_by_location(prism_tokens, start_offset, end_offset))
+      prior_tokens = (squashed_tokens + comment_tokens).sort_by { |_, start_offset, _| start_offset }
+      unify_tokens(whole_code, prior_tokens, normal_tokens, start_offset, end_offset)
+    end
+
+    private
+
+    def slice_by_location(items, start_offset, end_offset)
+      start_index = items.bsearch_index { |item| item.location.end_offset > start_offset } || items.size
+      end_index = items.bsearch_index { |item| item.location.start_offset >= end_offset } || items.size
+      items[start_index...end_index]
+    end
+
+    # Unify prior tokens and normal tokens into a single token stream.
+    # Prior tokens have higher priority than normal tokens.
+    # Missing text (spaces, newlines, etc.) is also added as separate tokens
+    # so that the entire code is covered.
+    def unify_tokens(code, prior_tokens, normal_tokens, start_offset, end_offset)
+      tokens = []
+      offset = start_offset

+      # Add missing text such as spaces and newlines as separate tokens
+      flush = -> next_offset {
+        return if offset == next_offset
+
+        code.byteslice(offset...next_offset).scan(/\n|\s+|[^\s]+/) do |text|
+          type =
+            if text == "\n"
+              NEWLINE
+            elsif /\A\s+\z/.match?(text)
+              SPACE
+            else
+              OTHER
+            end
+          tokens << [type, text]
+        end
+      }
+
+      until prior_tokens.empty? && normal_tokens.empty?
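+        # Emit whichever pending token starts first: a normal token is taken
+        # only when it ends at or before the start of the next prior token;
+        # otherwise the prior (squashed or comment) token wins, and the normal
+        # tokens it overlaps are skipped by the `start_pos < offset` check below.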
+        ptok = prior_tokens.first
+        ntok = normal_tokens.first
+        if ntok && (!ptok || ntok[2] <= ptok[1])
+          token = normal_tokens.shift
+        else
+          token = prior_tokens.shift
+        end
+        type, start_pos, end_pos = token
+        next if start_pos < offset
+
+        flush.call(start_pos)
+        tokens << [type, code.byteslice(start_pos...end_pos)]
+        offset = end_pos
+      end
+      flush.call(end_offset)
+      tokens
+    end
+
+    # Extract each normal comment and embdoc comment (which consists of multiple tokens) as a single token
+    def comment_tokens(comments)
+      comments.map do |comment|
+        [COMMENT, comment.location.start_offset, comment.location.end_offset]
+      end
+    end
+
+    # Convert normal Prism tokens to [type, start_offset, end_offset]
+    def normal_tokens(tokens)
+      tokens.map do |token|
+        type =
+          if token.type.start_with?('KEYWORD_')
+            KEYWORD
+          elsif OP_TOKENS.include?(token.type.to_sym)
+            OP
+          else
+            TOKEN_TYPE_MAP[token.type] || OTHER
+          end
+        [type, token.location.start_offset, token.location.end_offset]
+      end
+    end
+  end
+
+  # Visitor to squash the several tokens that constitute a single node into a single token
+  class SquashTokenVisitor < Prism::Visitor
+    attr_reader :tokens
+
+    def initialize
+      @tokens = []
+    end
+
+    # Squash UMINUS and its operand (integer, float, rational, imaginary) token into a single token
+    def visit_integer_node(node)
+      push_location(node.location, INTEGER)
+    end
+
+    def visit_float_node(node)
+      push_location(node.location, FLOAT)
+    end
+
+    def visit_rational_node(node)
+      push_location(node.location, RATIONAL)
+    end
+
+    def visit_imaginary_node(node)
+      push_location(node.location, IMAGINARY)
+    end
+
+    def visit_symbol_node(node)
+      push_location(node.location, SYMBOL)
+    end
+    alias visit_interpolated_symbol_node visit_symbol_node
+
+    def visit_regular_expression_node(node)
+      push_location(node.location, REGEXP)
+    end
+    alias visit_match_last_line_node visit_regular_expression_node
+    alias visit_interpolated_regular_expression_node visit_regular_expression_node
+    alias visit_interpolated_match_last_line_node visit_regular_expression_node
+
+    def visit_string_node(node)
+      # The opening of a StringNode inside an InterpolatedStringNode might be nil
+      if node.opening&.start_with?('<<')
+        push_location(node.opening_loc, HEREDOC_BEG)
+        push_location(node.content_loc, HEREDOC_CONTENT)
+        push_location(node.closing_loc, HEREDOC_END)
+      else
+        push_location(node.location, STRING)
+      end
+    end
+    alias visit_x_string_node visit_string_node
+
+    def visit_array_node(node)
+      # The right-hand side of `a = 1, 2` is an array node without an opening
+      if node.opening&.start_with?('%')
+        # Percent array: squash the entire node into a single token.
+        # We don't handle embedded expressions inside it yet.
+        push_location(node.location, WORDS)
+      else
+        super
+      end
+    end
+
+    def push_location(location, type)
+      @tokens << [type, location.start_offset, location.end_offset]
+    end
+
+    def visit_def_node(node)
+      # For special colorizing of the method name in a def node
+      push_location(node.name_loc, DEF_METHOD_NAME)
+      super
+    end
+
+    def visit_interpolated_string_node(node)
+      # `"a" "b"` is an interpolated string node without an opening
+      if node.opening&.start_with?('<<')
+        # Heredocs: squash the content into a single token.
+        # We don't tokenize embedded expressions inside, and don't handle nested heredocs yet.
+        push_location(node.opening_loc, HEREDOC_BEG)
+        unless node.parts.empty?
+          # Squash the heredoc content into a single token
+          part_locations = node.parts.map(&:location)
+          @tokens << [
+            HEREDOC_CONTENT,
+            part_locations.map(&:start_offset).min,
+            part_locations.map(&:end_offset).max
+          ]
+        end
+        # An incomplete heredoc might not have a closing_loc
+        push_location(node.closing_loc, HEREDOC_END) if node.closing_loc
+      else
+        # Squash the entire node into a single token
+        push_location(node.location, DSTRING)
+      end
+    end
+    alias visit_interpolated_x_string_node visit_interpolated_string_node
+  end
+end
diff --git a/lib/rdoc/token_stream.rb b/lib/rdoc/token_stream.rb
index 5a4ca82a67..07f83862c1 100644
--- a/lib/rdoc/token_stream.rb
+++ b/lib/rdoc/token_stream.rb
@@ -9,6 +9,8 @@

 module RDoc::TokenStream

+  RipperStateLexCompatToken = Struct.new(:kind, :text, :line_no, :char_no, :state)
+
   ##
   # Converts +token_stream+ to HTML wrapping various tokens with
   # <tt><span></tt> elements.  Some token types are wrapped in spans
diff --git a/test/rdoc/parser/prism_ruby_test.rb b/test/rdoc/parser/prism_ruby_test.rb
index e393f70e3e..e12a9751bf 100644
--- a/test/rdoc/parser/prism_ruby_test.rb
+++ b/test/rdoc/parser/prism_ruby_test.rb
@@ -2065,6 +2065,24 @@ def test_read_directive_linear_performance
     end
   end

+  def test_code_object_token_stream
+    util_parser <<~RUBY
+      class Foo
+        def foo
+          42
+        end
+
+        private def bar
+          :bar
+        end
+      end
+    RUBY
+
+    foo, bar = @top_level.classes.first.method_list
+    # Skip the first two tokens: the location comment and the newline
+    assert_equal(['  ', 'def', ' ', 'foo', "\n", '    ', '42', "\n", '  ', 'end'], foo.token_stream.drop(2).map(&:text))
+    assert_equal(['          ', 'def', ' ', 'bar', "\n", '    ', ':bar', "\n", '  ', 'end'], bar.token_stream.drop(2).map(&:text))
+  end

   def test_markup_first_comment

     util_parser <<~RUBY
diff --git a/test/rdoc/parser/tokenizer_test.rb b/test/rdoc/parser/tokenizer_test.rb
new file mode 100644
index 0000000000..07cbcb1a8a
--- /dev/null
+++ b/test/rdoc/parser/tokenizer_test.rb
@@ -0,0 +1,141 @@
+# frozen_string_literal: true
+require_relative '../helper'
+require 'rdoc/parser/tokenizer'
+
+class RDocParserTokenizerTest < RDoc::TestCase
+  def test_partial_tokenize
+    code = <<~RUBY
+      class A
+        def m
+          # comment
+          42
+        end
+      end
+    RUBY
+    parse_result = Prism.parse_lex(code)
+    program_node, unordered_tokens = parse_result.value
+    prism_tokens = unordered_tokens.map(&:first).sort_by! { |token| token.location.start_offset }
+    def_node = program_node.statements.body[0].body.body[0]
+    tokens = RDoc::Parser::Tokenizer.partial_tokenize(code, def_node, prism_tokens, parse_result.comments)
+    expected = ['def', ' ', 'm', "\n", '    ', '# comment', "\n", '    ', '42', "\n", '  ', 'end']
+    assert_equal(expected, tokens.map(&:last))
+  end
+
+  def test_comment
+    code = <<~RUBY
+      # comment1
+      class A
+      =begin
+      comment2
+      =end
+        def m
+          42 # comment3
+        end
+      end
+    RUBY
+    tokens = RDoc::Parser::Tokenizer.tokenize(code)
+    assert_equal(code, tokens.map(&:last).join)
+    assert_include(tokens, [:on_comment, '# comment1'])
+    assert_include(tokens, [:on_comment, "=begin\ncomment2\n=end\n"])
+    assert_include(tokens, [:on_comment, '# comment3'])
+  end
+
+  def test_squash_uminus
+    code = <<~RUBY
+      def m
+        -42; -4.2; -42i; -42r
+      end
+    RUBY
+    tokens = RDoc::Parser::Tokenizer.tokenize(code)
+    assert_equal(code, tokens.map(&:last).join)
+    assert_include(tokens, [:on_int, '-42'])
+    assert_include(tokens, [:on_float, '-4.2'])
+    assert_include(tokens, [:on_imaginary, '-42i'])
+    assert_include(tokens, [:on_rational, '-42r'])
+  end
+
+  def test_squash_interpolated_node
+    code = <<~'RUBY'
+      def m
+        "string#{interpolation}example"
+        /regexp#{interpolation}example/
+        :"symbol#{interpolation}example"
+      end
+    RUBY
+    tokens = RDoc::Parser::Tokenizer.tokenize(code)
+    assert_equal(code, tokens.map(&:last).join)
+    assert_include(tokens, [:on_dstring, '"string#{interpolation}example"'])
+    assert_include(tokens, [:on_regexp, '/regexp#{interpolation}example/'])
+    assert_include(tokens, [:on_symbol, ':"symbol#{interpolation}example"'])
+  end
+
+  def test_squash_words
+    code = <<~RUBY
+      def m
+        a = 1, 2 # array without opening. %w[] squashing should not fail with this input
+        %w[one two three]
+        %W[one \#{two} three]
+        %i[one two three]
+        %I[one \#{two} three]
+      end
+    RUBY
+    tokens = RDoc::Parser::Tokenizer.tokenize(code)
+    assert_equal(code, tokens.map(&:last).join)
+    assert_include(tokens, [:on_dstring, '%w[one two three]'])
+    assert_include(tokens, [:on_dstring, '%W[one #{two} three]'])
+    assert_include(tokens, [:on_dstring, '%i[one two three]'])
+    assert_include(tokens, [:on_dstring, '%I[one #{two} three]'])
+  end
+
+  def test_multibyte
+    code = <<~RUBY
+      def f(s = '💎')
+        # comment 💎
+        puts '💎' + s
+      end
+    RUBY
+    tokens = RDoc::Parser::Tokenizer.tokenize(code)
+    assert_equal(code, tokens.map(&:last).join)
+  end
+
+  def test_string_concat_node
+    # A concatenated string node has no opening
+    code = <<~'RUBY'
+      def f
+        %[hello] 'HELLO'\
+          "world"
+      end
+    RUBY
+    tokens = RDoc::Parser::Tokenizer.tokenize(code)
+    assert_equal(code, tokens.map(&:last).join)
+  end
+
+  def test_squash_heredoc
+    code = <<~'RUBY'
+      def f
+        str1 = <<~AA
+          single-line-heredoc
+        AA
+        str2 = <<~`BB` # comment
+          x-string-heredoc
+        BB
+        str3 = <<~CC.itself
+          multi-line
+          #{embed}
+          heredoc
+        CC
+      end
+    RUBY
+    tokens = RDoc::Parser::Tokenizer.tokenize(code)
+    assert_equal(code, tokens.map(&:last).join)
+    assert_include(tokens, [:on_heredoc_beg, '<<~AA'])
+    assert_include(tokens, [:on_heredoc_beg, '<<~`BB`'])
+    assert_include(tokens, [:on_heredoc_beg, '<<~CC'])
+    assert_include(tokens, [:on_heredoc_end, "  AA\n"])
+    assert_include(tokens, [:on_heredoc_end, "  BB\n"])
+    assert_include(tokens, [:on_heredoc_end, "  CC\n"])
+    assert_include(tokens, [:on_heredoc, "    single-line-heredoc\n"])
+    assert_include(tokens, [:on_heredoc, "    x-string-heredoc\n"])
+    assert_include(tokens, [:on_heredoc, "    multi-line\n    \#{embed}\n    heredoc\n"])
+  end
+end
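
Usage sketch (reviewer note, not part of the patch): RDoc::Parser::Tokenizer.tokenize
returns an array of [type, text] pairs whose texts join back to the input, as the
tests above assert. The sample source string and variable names below are made up
for illustration.

    require 'rdoc'
    require 'rdoc/parser/tokenizer'

    tokens = RDoc::Parser::Tokenizer.tokenize(<<~RUBY)
      def add(a, b) # sum
        a + b
      end
    RUBY

    # Each token is a [type, text] pair; concatenating the texts restores the input.
    tokens.take(4)          # => [[:on_kw, "def"], [:on_sp, " "], [:on_ident, "add"], [:other, "("]]
    tokens.map(&:last).join # == the source passed to tokenize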
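
The to_html.rb and prism_ruby.rb changes consume these pairs by wrapping them in the
new RDoc::TokenStream::RipperStateLexCompatToken struct, whose #kind and #text are
what RDoc::TokenStream.to_html reads; line_no, char_no and state are left nil. A
minimal sketch of that pipeline, using the same calls as parsable_text_to_html above
(only the input string is made up):

    require 'rdoc'
    require 'rdoc/parser/tokenizer'

    code = "x = 1 # answer"
    tokens = RDoc::Parser::Tokenizer.tokenize(code).map do |type, text|
      # Wrap each pair so it responds to #kind/#text like a RipperStateLex token
      RDoc::TokenStream::RipperStateLexCompatToken.new(type, text)
    end
    html = RDoc::TokenStream.to_html(tokens)
    # => highlighted HTML, e.g. the comment wrapped in a span such as <span class="ruby-comment">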