From 46cc1f82eabffef0d3098f721c5e1036226095ed Mon Sep 17 00:00:00 2001
From: tompng
Date: Wed, 26 Nov 2025 01:55:35 +0900
Subject: [PATCH] Tokenizer for syntax highlighting using Prism

---
 lib/rdoc/markup/to_html.rb          |  38 +++--
 lib/rdoc/parser/prism_ruby.rb       |  46 +++---
 lib/rdoc/parser/tokenizer.rb        | 243 ++++++++++++++++++++++++++++
 lib/rdoc/token_stream.rb            |   2 +
 test/rdoc/parser/prism_ruby_test.rb |  18 +++
 test/rdoc/parser/tokenizer_test.rb  | 141 ++++++++++++++++
 6 files changed, 448 insertions(+), 40 deletions(-)
 create mode 100644 lib/rdoc/parser/tokenizer.rb
 create mode 100644 test/rdoc/parser/tokenizer_test.rb

diff --git a/lib/rdoc/markup/to_html.rb b/lib/rdoc/markup/to_html.rb
index 8d019bef46..f0ad1bf848 100644
--- a/lib/rdoc/markup/to_html.rb
+++ b/lib/rdoc/markup/to_html.rb
@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 require 'cgi/escape'
 require 'cgi/util' unless defined?(CGI::EscapeExt)
+require 'rdoc/parser/ripper_state_lex'

 ##
 # Outputs RDoc markup as HTML.
@@ -216,6 +217,23 @@ def accept_paragraph(paragraph)
     @res << "</p>\n"
   end

+  # Generate syntax-highlighted HTML for Ruby-like text.
+
+  def parsable_text_to_html(text)
+    if defined?(RDoc::Parser::PrismRuby) && RDoc::Parser::Ruby == RDoc::Parser::PrismRuby
+      tokens = RDoc::Parser::Tokenizer.tokenize(text).map do |type, text|
+        RDoc::TokenStream::RipperStateLexCompatToken.new(type, text)
+      end
+    else
+      # RipperStateLex.parse is assumed to fail on some inputs,
+      # but which inputs make it fail is unknown.
+      tokens = RDoc::Parser::RipperStateLex.parse(text) rescue return
+    end
+    result = RDoc::TokenStream.to_html tokens
+    result = result + "\n" unless "\n" == result[-1]
+    result
+  end
+
   ##
   # Adds +verbatim+ to the output

@@ -224,20 +242,12 @@ def accept_verbatim(verbatim)

     klass = nil

-    content = if verbatim.ruby? or parseable? text then
-                begin
-                  tokens = RDoc::Parser::RipperStateLex.parse text
-                  klass = ' class="ruby"'
-
-                  result = RDoc::TokenStream.to_html tokens
-                  result = result + "\n" unless "\n" == result[-1]
-                  result
-                rescue
-                  CGI.escapeHTML text
-                end
-              else
-                CGI.escapeHTML text
-              end
+    if verbatim.ruby? || parseable?(text)
+      content = parsable_text_to_html(text)
+      klass = ' class="ruby"' if content # RDoc::Parser::RipperStateLex.parse may fail
+    end
+
+    content ||= CGI.escapeHTML text

     if @options.pipe then
       @res << "\n<pre><code>#{CGI.escapeHTML text}\n</code></pre>\n"
diff --git a/lib/rdoc/parser/prism_ruby.rb b/lib/rdoc/parser/prism_ruby.rb
index 56da6ac227..c18b37e8b3 100644
--- a/lib/rdoc/parser/prism_ruby.rb
+++ b/lib/rdoc/parser/prism_ruby.rb
@@ -1,7 +1,7 @@
 # frozen_string_literal: true

 require 'prism'
-require_relative 'ripper_state_lex'
+require_relative 'tokenizer'

 # Unlike lib/rdoc/parser/ruby.rb, this file is not based on rtags and does not contain code from
 # rtags.rb
@@ -89,10 +89,13 @@ def record_location(container) # :nodoc:

   # Scans this Ruby file for Ruby constructs

   def scan
-    @tokens = RDoc::Parser::RipperStateLex.parse(@content)
     @lines = @content.lines
-    result = Prism.parse(@content)
-    @program_node = result.value
+    result = Prism.parse_lex(@content)
+    @prism_comments = result.comments
+    @program_node, unordered_tokens = result.value
+    # Heredoc tokens are not in start_offset order.
+    # They must be sorted so that bsearch can find tokens by location.
+    @prism_tokens = unordered_tokens.map(&:first).sort_by { |t| t.location.start_offset }
     @line_nodes = {}
     prepare_line_nodes(@program_node)
     prepare_comments(result.comments)
@@ -205,7 +208,7 @@ def parse_comment_tomdoc(container, comment, line_no, start_line)
     meth.start_collecting_tokens(:ruby)
     node = @line_nodes[line_no]
-    tokens = node ? visible_tokens_from_location(node.location) : [file_line_comment_token(start_line)]
+    tokens = node ? visible_tokens_from_node(node) : [file_line_comment_token(start_line)]
     tokens.each { |token| meth.token_stream << token }

     container.add_method meth
@@ -273,7 +276,7 @@ def handle_meta_method_comment(comment, directives, node)
     elsif line_no || node
       method_name ||= call_node_name_arguments(node).first if is_call_node
       if node
-        tokens = visible_tokens_from_location(node.location)
+        tokens = visible_tokens_from_node(node)
         line_no = node.location.start_line
       else
         tokens = [file_line_comment_token(line_no)]
@@ -368,30 +371,21 @@ def parse_comment_text_to_directives(comment_text, start_line) # :nodoc:
     [comment, directives]
   end

-  def slice_tokens(start_pos, end_pos) # :nodoc:
-    start_index = @tokens.bsearch_index { |t| ([t.line_no, t.char_no] <=> start_pos) >= 0 }
-    end_index = @tokens.bsearch_index { |t| ([t.line_no, t.char_no] <=> end_pos) >= 0 }
-    tokens = @tokens[start_index...end_index]
-    tokens.pop if tokens.last&.kind == :on_nl
-    tokens
-  end
-
   def file_line_comment_token(line_no) # :nodoc:
-    position_comment = RDoc::Parser::RipperStateLex::Token.new(line_no - 1, 0, :on_comment)
-    position_comment[:text] = "# File #{@top_level.relative_name}, line #{line_no}"
-    position_comment
+    text = "# File #{@top_level.relative_name}, line #{line_no}"
+    RDoc::TokenStream::RipperStateLexCompatToken.new(:on_comment, text)
   end

-  # Returns tokens from the given location
+  # Returns tokens of the given node's location for syntax highlighting

-  def visible_tokens_from_location(location)
+  def visible_tokens_from_node(node)
+    location = node.location
     position_comment = file_line_comment_token(location.start_line)
-    newline_token = RDoc::Parser::RipperStateLex::Token.new(0, 0, :on_nl, "\n")
-    indent_token = RDoc::Parser::RipperStateLex::Token.new(location.start_line, 0, :on_sp, ' ' * location.start_character_column)
-    tokens = slice_tokens(
-      [location.start_line, location.start_character_column],
-      [location.end_line, location.end_character_column]
-    )
+    newline_token = RDoc::TokenStream::RipperStateLexCompatToken.new(:on_nl, "\n")
+    indent_token = RDoc::TokenStream::RipperStateLexCompatToken.new(:on_sp, ' ' * location.start_character_column)
+    tokens = RDoc::Parser::Tokenizer.partial_tokenize(@content, node, @prism_tokens, @prism_comments).map do |type, text|
+      RDoc::TokenStream::RipperStateLexCompatToken.new(type, text)
+    end
     [position_comment, newline_token, indent_token, *tokens]
   end

@@ -894,7 +888,7 @@ def visit_def_node(node)
     end
     name = node.name.to_s
     params, block_params, calls_super = MethodSignatureVisitor.scan_signature(node)
-    tokens = @scanner.visible_tokens_from_location(node.location)
+    tokens = @scanner.visible_tokens_from_node(node)

     @scanner.add_method(
       name,
diff --git a/lib/rdoc/parser/tokenizer.rb b/lib/rdoc/parser/tokenizer.rb
new file mode 100644
index 0000000000..cc8dd181e1
--- /dev/null
+++ b/lib/rdoc/parser/tokenizer.rb
@@ -0,0 +1,243 @@
+require 'prism'
+require 'set'
+
+# Tokenizes Ruby code into RDoc::Parser::RipperStateLex-style token types, with token squashing.
+# Token squashing is required by RDoc::TokenStream's syntax highlighting.
+module RDoc::Parser::Tokenizer
+  # These constants and the token type map are for compatibility with RDoc::Parser::RipperStateLex.
+  OTHER = :other
+  SPACE = :on_sp
+  NEWLINE = :on_nl
+  KEYWORD = :on_kw
+  OP = :on_op
+  HEREDOC_BEG = :on_heredoc_beg
+  HEREDOC_CONTENT = :on_heredoc
+  HEREDOC_END = :on_heredoc_end
+  COMMENT = :on_comment
+  INTEGER = :on_int
+  FLOAT = :on_float
+  RATIONAL = :on_rational
+  IMAGINARY = :on_imaginary
+  SYMBOL = :on_symbol
+  REGEXP = :on_regexp
+  STRING = :on_tstring
+  WORDS = :on_dstring
+  DEF_METHOD_NAME = :on_ident
+  DSTRING = :on_dstring
+
+  OP_TOKENS = %i[
+    AMPERSAND AMPERSAND_AMPERSAND
+    BANG BANG_EQUAL BANG_TILDE CARET COLON COLON_COLON
+    EQUAL EQUAL_EQUAL EQUAL_GREATER EQUAL_TILDE
+    GREATER GREATER_GREATER
+    LESS LESS_EQUAL LESS_EQUAL_GREATER LESS_LESS
+    MINUS MINUS_GREATER PERCENT PIPE PIPE_PIPE PLUS
+    QUESTION_MARK SLASH STAR STAR_STAR TILDE
+    UAMPERSAND UMINUS UPLUS USTAR USTAR_STAR
+  ].to_set
+
+  TOKEN_TYPE_MAP = {
+    IDENTIFIER: :on_ident,
+    METHOD_NAME: :on_ident,
+    INSTANCE_VARIABLE: :on_ivar,
+    CLASS_VARIABLE: :on_cvar,
+    GLOBAL_VARIABLE: :on_gvar,
+    BACK_REFERENCE: :on_backref,
+    NUMBERED_REFERENCE: :on_backref,
+    CONSTANT: :on_const,
+    LABEL: :on_label,
+    INTEGER: :on_int,
+    FLOAT: :on_float,
+    RATIONAL: :on_rational,
+    IMAGINARY: :on_imaginary,
+  }
+
+  class << self
+    def tokenize(code)
+      result = Prism.parse_lex(code)
+      program_node, unordered_tokens = result.value
+      prism_tokens = unordered_tokens.map(&:first).sort_by! { |token| token.location.start_offset }
+      partial_tokenize(code, program_node, prism_tokens, result.comments, 0, code.bytesize)
+    end
+
+    def partial_tokenize(whole_code, node, prism_tokens, prism_comments, start_offset = nil, end_offset = nil)
+      start_offset ||= node.location.start_offset
+      end_offset ||= node.location.end_offset
+      visitor = SquashTokenVisitor.new
+      node.accept(visitor)
+      squashed_tokens = visitor.tokens
+      comment_tokens = comment_tokens(slice_by_location(prism_comments, start_offset, end_offset))
+      normal_tokens = normal_tokens(slice_by_location(prism_tokens, start_offset, end_offset))
+      prior_tokens = (squashed_tokens + comment_tokens).sort_by { |_, start_offset, _| start_offset }
+      unify_tokens(whole_code, prior_tokens, normal_tokens, start_offset, end_offset)
+    end
+
+    private
+
+    def slice_by_location(items, start_offset, end_offset)
+      start_index = items.bsearch_index { |item| item.location.end_offset > start_offset } || items.size
+      end_index = items.bsearch_index { |item| item.location.start_offset >= end_offset } || items.size
+      items[start_index...end_index]
+    end
+
+    # Unify prior tokens and normal tokens into a single token stream.
+    # Prior tokens have higher priority than normal tokens.
+    # Missing text (spaces, newlines, etc.) is also added as separate tokens
+    # so that the entire code is covered.
+    def unify_tokens(code, prior_tokens, normal_tokens, start_offset, end_offset)
+      tokens = []
+      offset = start_offset

+      # Add missing text such as spaces and newlines as separate tokens
+      flush = -> next_offset {
+        return if offset == next_offset
+
+        code.byteslice(offset...next_offset).scan(/\n|\s+|[^\s]+/) do |text|
+          type =
+            if text == "\n"
+              NEWLINE
+            elsif /\A\s+\z/.match?(text)
+              SPACE
+            else
+              OTHER
+            end
+          tokens << [type, text]
+        end
+      }
+
+      until prior_tokens.empty? && normal_tokens.empty?
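+        # Emit whichever pending token starts first: a normal token is taken
+        # only when it ends at or before the start of the next prior token;
+        # otherwise the prior (squashed or comment) token wins, and the normal
+        # tokens it overlaps are skipped by the `start_pos < offset` check below.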
+        ptok = prior_tokens.first
+        ntok = normal_tokens.first
+        if ntok && (!ptok || ntok[2] <= ptok[1])
+          token = normal_tokens.shift
+        else
+          token = prior_tokens.shift
+        end
+        type, start_pos, end_pos = token
+        next if start_pos < offset
+
+        flush.call(start_pos)
+        tokens << [type, code.byteslice(start_pos...end_pos)]
+        offset = end_pos
+      end
+      flush.call(end_offset)
+      tokens
+    end
+
+    # Extract each normal comment and embdoc comment (which consists of multiple tokens) as a single token
+    def comment_tokens(comments)
+      comments.map do |comment|
+        [COMMENT, comment.location.start_offset, comment.location.end_offset]
+      end
+    end
+
+    # Convert normal Prism tokens to [type, start_offset, end_offset]
+    def normal_tokens(tokens)
+      tokens.map do |token|
+        type =
+          if token.type.start_with?('KEYWORD_')
+            KEYWORD
+          elsif OP_TOKENS.include?(token.type.to_sym)
+            OP
+          else
+            TOKEN_TYPE_MAP[token.type] || OTHER
+          end
+        [type, token.location.start_offset, token.location.end_offset]
+      end
+    end
+  end
+
+  # Visitor to squash the several tokens that constitute a single node into a single token
+  class SquashTokenVisitor < Prism::Visitor
+    attr_reader :tokens
+
+    def initialize
+      @tokens = []
+    end
+
+    # Squash UMINUS and its operand (integer, float, rational, imaginary) token into a single token
+    def visit_integer_node(node)
+      push_location(node.location, INTEGER)
+    end
+
+    def visit_float_node(node)
+      push_location(node.location, FLOAT)
+    end
+
+    def visit_rational_node(node)
+      push_location(node.location, RATIONAL)
+    end
+
+    def visit_imaginary_node(node)
+      push_location(node.location, IMAGINARY)
+    end
+
+    def visit_symbol_node(node)
+      push_location(node.location, SYMBOL)
+    end
+    alias visit_interpolated_symbol_node visit_symbol_node
+
+    def visit_regular_expression_node(node)
+      push_location(node.location, REGEXP)
+    end
+    alias visit_match_last_line_node visit_regular_expression_node
+    alias visit_interpolated_regular_expression_node visit_regular_expression_node
+    alias visit_interpolated_match_last_line_node visit_regular_expression_node
+
+    def visit_string_node(node)
+      # The opening of a StringNode inside an InterpolatedStringNode might be nil
+      if node.opening&.start_with?('<<')
+        push_location(node.opening_loc, HEREDOC_BEG)
+        push_location(node.content_loc, HEREDOC_CONTENT)
+        push_location(node.closing_loc, HEREDOC_END)
+      else
+        push_location(node.location, STRING)
+      end
+    end
+    alias visit_x_string_node visit_string_node
+
+    def visit_array_node(node)
+      # The right-hand side of `a = 1, 2` is an array node without an opening
+      if node.opening&.start_with?('%')
+        # Percent array: squash the entire node into a single token.
+        # We don't handle embedded expressions inside it yet.
+        push_location(node.location, WORDS)
+      else
+        super
+      end
+    end
+
+    def push_location(location, type)
+      @tokens << [type, location.start_offset, location.end_offset]
+    end
+
+    def visit_def_node(node)
+      # For special colorizing of the method name in a def node
+      push_location(node.name_loc, DEF_METHOD_NAME)
+      super
+    end
+
+    def visit_interpolated_string_node(node)
+      # `"a" "b"` is an interpolated string node without an opening
+      if node.opening&.start_with?('<<')
+        # Heredocs: squash the content into a single token.
+        # We don't tokenize embedded expressions inside, and don't handle nested heredocs yet.
+        push_location(node.opening_loc, HEREDOC_BEG)
+        unless node.parts.empty?
+          # Squash the heredoc content into a single token
+          part_locations = node.parts.map(&:location)
+          @tokens << [
+            HEREDOC_CONTENT,
+            part_locations.map(&:start_offset).min,
+            part_locations.map(&:end_offset).max
+          ]
+        end
+        # An incomplete heredoc might not have a closing_loc
+        push_location(node.closing_loc, HEREDOC_END) if node.closing_loc
+      else
+        # Squash the entire node into a single token
+        push_location(node.location, DSTRING)
+      end
+    end
+    alias visit_interpolated_x_string_node visit_interpolated_string_node
+  end
+end
diff --git a/lib/rdoc/token_stream.rb b/lib/rdoc/token_stream.rb
index 5a4ca82a67..07f83862c1 100644
--- a/lib/rdoc/token_stream.rb
+++ b/lib/rdoc/token_stream.rb
@@ -9,6 +9,8 @@

 module RDoc::TokenStream

+  RipperStateLexCompatToken = Struct.new(:kind, :text, :line_no, :char_no, :state)
+
   ##
   # Converts +token_stream+ to HTML wrapping various tokens with
   # <tt><span></tt> elements.  Some token types are wrapped in spans
diff --git a/test/rdoc/parser/prism_ruby_test.rb b/test/rdoc/parser/prism_ruby_test.rb
index e393f70e3e..e12a9751bf 100644
--- a/test/rdoc/parser/prism_ruby_test.rb
+++ b/test/rdoc/parser/prism_ruby_test.rb
@@ -2065,6 +2065,24 @@ def test_read_directive_linear_performance
     end
   end

+  def test_code_object_token_stream
+    util_parser <<~RUBY
+      class Foo
+        def foo
+          42
+        end
+
+        private def bar
+          :bar
+        end
+      end
+    RUBY
+
+    foo, bar = @top_level.classes.first.method_list
+    # Skip the first two tokens: the location comment and the newline
+    assert_equal(['  ', 'def', ' ', 'foo', "\n", '    ', '42', "\n", '  ', 'end'], foo.token_stream.drop(2).map(&:text))
+    assert_equal(['          ', 'def', ' ', 'bar', "\n", '    ', ':bar', "\n", '  ', 'end'], bar.token_stream.drop(2).map(&:text))
+  end

   def test_markup_first_comment

     util_parser <<~RUBY
diff --git a/test/rdoc/parser/tokenizer_test.rb b/test/rdoc/parser/tokenizer_test.rb
new file mode 100644
index 0000000000..07cbcb1a8a
--- /dev/null
+++ b/test/rdoc/parser/tokenizer_test.rb
@@ -0,0 +1,141 @@
+# frozen_string_literal: true
+require_relative '../helper'
+require 'rdoc/parser/tokenizer'
+
+class RDocParserTokenizerTest < RDoc::TestCase
+  def test_partial_tokenize
+    code = <<~RUBY
+      class A
+        def m
+          # comment
+          42
+        end
+      end
+    RUBY
+    parse_result = Prism.parse_lex(code)
+    program_node, unordered_tokens = parse_result.value
+    prism_tokens = unordered_tokens.map(&:first).sort_by! { |token| token.location.start_offset }
+    def_node = program_node.statements.body[0].body.body[0]
+    tokens = RDoc::Parser::Tokenizer.partial_tokenize(code, def_node, prism_tokens, parse_result.comments)
+    expected = ['def', ' ', 'm', "\n", '    ', '# comment', "\n", '    ', '42', "\n", '  ', 'end']
+    assert_equal(expected, tokens.map(&:last))
+  end
+
+  def test_comment
+    code = <<~RUBY
+      # comment1
+      class A
+      =begin
+      comment2
+      =end
+        def m
+          42 # comment3
+        end
+      end
+    RUBY
+    tokens = RDoc::Parser::Tokenizer.tokenize(code)
+    assert_equal(code, tokens.map(&:last).join)
+    assert_include(tokens, [:on_comment, '# comment1'])
+    assert_include(tokens, [:on_comment, "=begin\ncomment2\n=end\n"])
+    assert_include(tokens, [:on_comment, '# comment3'])
+  end
+
+  def test_squash_uminus
+    code = <<~RUBY
+      def m
+        -42; -4.2; -42i; -42r
+      end
+    RUBY
+    tokens = RDoc::Parser::Tokenizer.tokenize(code)
+    assert_equal(code, tokens.map(&:last).join)
+    assert_include(tokens, [:on_int, '-42'])
+    assert_include(tokens, [:on_float, '-4.2'])
+    assert_include(tokens, [:on_imaginary, '-42i'])
+    assert_include(tokens, [:on_rational, '-42r'])
+  end
+
+  def test_squash_interpolated_node
+    code = <<~'RUBY'
+      def m
+        "string#{interpolation}example"
+        /regexp#{interpolation}example/
+        :"symbol#{interpolation}example"
+      end
+    RUBY
+    tokens = RDoc::Parser::Tokenizer.tokenize(code)
+    assert_equal(code, tokens.map(&:last).join)
+    assert_include(tokens, [:on_dstring, '"string#{interpolation}example"'])
+    assert_include(tokens, [:on_regexp, '/regexp#{interpolation}example/'])
+    assert_include(tokens, [:on_symbol, ':"symbol#{interpolation}example"'])
+  end
+
+  def test_squash_words
+    code = <<~RUBY
+      def m
+        a = 1, 2 # array without opening. %w[] squashing should not fail with this input
+        %w[one two three]
+        %W[one \#{two} three]
+        %i[one two three]
+        %I[one \#{two} three]
+      end
+    RUBY
+    tokens = RDoc::Parser::Tokenizer.tokenize(code)
+    assert_equal(code, tokens.map(&:last).join)
+    assert_include(tokens, [:on_dstring, '%w[one two three]'])
+    assert_include(tokens, [:on_dstring, '%W[one #{two} three]'])
+    assert_include(tokens, [:on_dstring, '%i[one two three]'])
+    assert_include(tokens, [:on_dstring, '%I[one #{two} three]'])
+  end
+
+  def test_multibyte
+    code = <<~RUBY
+      def f(s = '💎')
+        # comment 💎
+        puts '💎' + s
+      end
+    RUBY
+    tokens = RDoc::Parser::Tokenizer.tokenize(code)
+    assert_equal(code, tokens.map(&:last).join)
+  end
+
+  def test_string_concat_node
+    # A concatenated string node has no opening
+    code = <<~'RUBY'
+      def f
+        %[hello] 'HELLO'\
+          "world"
+      end
+    RUBY
+    tokens = RDoc::Parser::Tokenizer.tokenize(code)
+    assert_equal(code, tokens.map(&:last).join)
+  end
+
+  def test_squash_heredoc
+    code = <<~'RUBY'
+      def f
+        str1 = <<~AA
+          single-line-heredoc
+        AA
+        str2 = <<~`BB` # comment
+          x-string-heredoc
+        BB
+        str3 = <<~CC.itself
+          multi-line
+          #{embed}
+          heredoc
+        CC
+      end
+    RUBY
+    tokens = RDoc::Parser::Tokenizer.tokenize(code)
+    assert_equal(code, tokens.map(&:last).join)
+    assert_include(tokens, [:on_heredoc_beg, '<<~AA'])
+    assert_include(tokens, [:on_heredoc_beg, '<<~`BB`'])
+    assert_include(tokens, [:on_heredoc_beg, '<<~CC'])
+    assert_include(tokens, [:on_heredoc_end, "  AA\n"])
+    assert_include(tokens, [:on_heredoc_end, "  BB\n"])
+    assert_include(tokens, [:on_heredoc_end, "  CC\n"])
+    assert_include(tokens, [:on_heredoc, "    single-line-heredoc\n"])
+    assert_include(tokens, [:on_heredoc, "    x-string-heredoc\n"])
+    assert_include(tokens, [:on_heredoc, "    multi-line\n    \#{embed}\n    heredoc\n"])
+  end
+end
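
Usage sketch (reviewer note, not part of the patch): RDoc::Parser::Tokenizer.tokenize
returns an array of [type, text] pairs whose texts join back to the input, as the
tests above assert. The sample source string and variable names below are made up
for illustration.

    require 'rdoc'
    require 'rdoc/parser/tokenizer'

    tokens = RDoc::Parser::Tokenizer.tokenize(<<~RUBY)
      def add(a, b) # sum
        a + b
      end
    RUBY

    # Each token is a [type, text] pair; concatenating the texts restores the input.
    tokens.take(4)          # => [[:on_kw, "def"], [:on_sp, " "], [:on_ident, "add"], [:other, "("]]
    tokens.map(&:last).join # == the source passed to tokenize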
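
The to_html.rb and prism_ruby.rb changes consume these pairs by wrapping them in the
new RDoc::TokenStream::RipperStateLexCompatToken struct, whose #kind and #text are
what RDoc::TokenStream.to_html reads; line_no, char_no and state are left nil. A
minimal sketch of that pipeline, using the same calls as parsable_text_to_html above
(only the input string is made up):

    require 'rdoc'
    require 'rdoc/parser/tokenizer'

    code = "x = 1 # answer"
    tokens = RDoc::Parser::Tokenizer.tokenize(code).map do |type, text|
      # Wrap each pair so it responds to #kind/#text like a RipperStateLex token
      RDoc::TokenStream::RipperStateLexCompatToken.new(type, text)
    end
    html = RDoc::TokenStream.to_html(tokens)
    # => highlighted HTML, e.g. the comment wrapped in a span such as <span class="ruby-comment">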