[ruby/prism] Further refine string handling in the parser translator

Mostly around newlines and line continuations.

* Percent arrays need special backslash handling in the AST
* Fix offset issue for heredocs with many line continuations (the wrong variable was used as the index)
* More refined rules on when to simplify string tokens
* Handle line continuations in squiggly heredocs
* Correctly dedent squiggly heredocs with interpolation
* Consider `:'foo'` and `%s[foo]` to not be interpolation

https://github.com/ruby/prism/commit/4edfe9d981
author: Earlopain <[email protected]> 2025-01-15 23:24:05 +0100
committer: Kevin Newton <[email protected]> 2025-03-18 13:36:53 -0400
commit: bc506295a30a5806b3346ed09cd679f3b8ee6f64 (patch)
tree: d37cb1f1b816eaa63be60f55b99eebe0a970e9eb /lib
parent: 9e5e3f1bede46ed499a809975c663ba32c34ffff (diff)
2 files changed, 224 insertions, 107 deletions
diff --git a/lib/prism/translation/parser/compiler.rb b/lib/prism/translation/parser/compiler.rb
index 4eec8205c8..1459b53c48 100644
--- a/lib/prism/translation/parser/compiler.rb
+++ b/lib/prism/translation/parser/compiler.rb
@@ -74,7 +74,29 @@ module Prism
         # []
         # ^^
         def visit_array_node(node)
-          builder.array(token(node.opening_loc), visit_all(node.elements), token(node.closing_loc))
+          if node.opening&.start_with?("%w", "%W", "%i", "%I")
+            elements = node.elements.flat_map do |element|
+              if element.is_a?(StringNode)
+                if element.content.include?("\n")
+                  string_nodes_from_line_continuations(element.unescaped, element.content, element.content_loc.start_offset, node.opening)
+                else
+                  [builder.string_internal([element.unescaped, srange(element.content_loc)])]
+                end
+              elsif element.is_a?(InterpolatedStringNode)
+                builder.string_compose(
+                  token(element.opening_loc),
+                  string_nodes_from_interpolation(element, node.opening),
+                  token(element.closing_loc)
+                )
+              else
+                [visit(element)]
+              end
+            end
+          else
+            elements = visit_all(node.elements)
+          end
+
+          builder.array(token(node.opening_loc), elements, token(node.closing_loc))
         end
 
         # foo => [bar]
@@ -1088,19 +1110,9 @@ module Prism
             return visit_heredoc(node) { |children, closing| builder.string_compose(token(node.opening_loc), children, closing) }
           end
 
-          parts = node.parts.flat_map do |part|
-            # When the content of a string node is split across multiple lines, the
-            # parser gem creates individual string nodes for each line the content is part of.
-            if part.type == :string_node && part.content.include?("\n") && part.opening_loc.nil?
-              string_nodes_from_line_continuations(part.unescaped, part.content, part.content_loc.start_offset, node.opening)
-            else
-              visit(part)
-            end
-          end
-
           builder.string_compose(
             token(node.opening_loc),
-            parts,
+            string_nodes_from_interpolation(node, node.opening),
             token(node.closing_loc)
           )
         end
@@ -1119,14 +1131,14 @@ module Prism
         # ^^^^^^^^^^^^
         def visit_interpolated_x_string_node(node)
           if node.heredoc?
-            visit_heredoc(node) { |children, closing| builder.xstring_compose(token(node.opening_loc), children, closing) }
-          else
-            builder.xstring_compose(
-              token(node.opening_loc),
-              visit_all(node.parts),
-              token(node.closing_loc)
-            )
+            return visit_heredoc(node) { |children, closing| builder.xstring_compose(token(node.opening_loc), children, closing) }
           end
+
+          builder.xstring_compose(
+            token(node.opening_loc),
+            string_nodes_from_interpolation(node, node.opening),
+            token(node.closing_loc)
+          )
         end
 
         # -> { it }
@@ -2024,13 +2036,6 @@ module Prism
           end
         end
 
-        # The parser gem automatically converts \r\n to \n, meaning our offsets
-        # need to be adjusted to always subtract 1 from the length.
-        def chomped_bytesize(line)
-          chomped = line.chomp
-          chomped.bytesize + (chomped == line ? 0 : 1)
-        end
-
         # Visit a heredoc that can be either a string or an xstring.
         def visit_heredoc(node)
           children = Array.new
@@ -2099,55 +2104,88 @@ module Prism
           end
         end
 
+        # When the content of a string node is split across multiple lines, the
+        # parser gem creates individual string nodes for each line the content is part of.
+        def string_nodes_from_interpolation(node, opening)
+          node.parts.flat_map do |part|
+            if part.type == :string_node && part.content.include?("\n") && part.opening_loc.nil?
+              string_nodes_from_line_continuations(part.unescaped, part.content, part.content_loc.start_offset, opening)
+            else
+              visit(part)
+            end
+          end
+        end
+
         # Create parser string nodes from a single prism node. The parser gem
         # "glues" strings together when a line continuation is encountered.
         def string_nodes_from_line_continuations(unescaped, escaped, start_offset, opening)
           unescaped = unescaped.lines
           escaped = escaped.lines
+          percent_array = opening&.start_with?("%w", "%W", "%i", "%I")
+
+          # Non-interpolating strings
+          if opening&.end_with?("'") || opening&.start_with?("%q", "%s", "%w", "%i")
+            current_length = 0
+            current_line = +""
+
+            escaped.filter_map.with_index do |escaped_line, index|
+              unescaped_line = unescaped.fetch(index, "")
+              current_length += escaped_line.bytesize
+              current_line << unescaped_line
 
-          escaped_lengths = []
-          normalized_lengths = []
-          # Keeps track of where an unescaped line should start a new token. An unescaped
-          # \n would otherwise be indistinguishable from the actual newline at the end of
-          # of the line. The parser gem only emits a new string node at "real" newlines,
-          # line continuations don't start a new node as well.
-          do_next_tokens = []
-
-          if opening&.end_with?("'")
-            escaped.each do |line|
-              escaped_lengths << line.bytesize
-              normalized_lengths << chomped_bytesize(line)
-              do_next_tokens << true
+              # Glue line continuations together. Only %w and %i arrays can contain these.
+              if percent_array && escaped_line[/(\\)*\n$/, 1]&.length&.odd?
+                next unless index == escaped.count - 1
+              end
+              s = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_length)])
+              start_offset += escaped_line.bytesize
+              current_line = +""
+              current_length = 0
+              s
             end
           else
+            escaped_lengths = []
+            normalized_lengths = []
+            # Keeps track of where an unescaped line should start a new token. An unescaped
+            # \n would otherwise be indistinguishable from the actual newline at the end
+            # of the line. The parser gem only emits a new string node at "real" newlines;
+            # line continuations don't start a new node either.
+            do_next_tokens = []
+
             escaped
               .chunk_while { |before, after| before[/(\\*)\r?\n$/, 1]&.length&.odd? || false }
               .each do |lines|
                 escaped_lengths << lines.sum(&:bytesize)
-                normalized_lengths << lines.sum { |line| chomped_bytesize(line) }
                 unescaped_lines_count = lines.sum do |line|
                   line.scan(/(\\*)n/).count { |(backslashes)| backslashes&.length&.odd? || false }
                 end
-                do_next_tokens.concat(Array.new(unescaped_lines_count + 1, false))
+                extra = 1
+                extra = lines.count if percent_array # Account for line continuations in percent arrays
+
+                normalized_lengths.concat(Array.new(unescaped_lines_count + extra, 0))
+                normalized_lengths[-1] = lines.sum { |line| line.bytesize }
+                do_next_tokens.concat(Array.new(unescaped_lines_count + extra, false))
                 do_next_tokens[-1] = true
               end
-          end
-
-          current_line = +""
-          current_normalized_length = 0
 
-          unescaped.filter_map.with_index do |unescaped_line, index|
-            current_line << unescaped_line
-            current_normalized_length += normalized_lengths.fetch(index, 0)
-
-            if do_next_tokens[index]
-              inner_part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_normalized_length)])
-              start_offset += escaped_lengths.fetch(index, 0)
-              current_line = +""
-              current_normalized_length = 0
-              inner_part
-            else
-              nil
+            current_line = +""
+            current_normalized_length = 0
+
+            emitted_count = 0
+            unescaped.filter_map.with_index do |unescaped_line, index|
+              current_line << unescaped_line
+              current_normalized_length += normalized_lengths.fetch(index, 0)
+
+              if do_next_tokens[index]
+                inner_part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_normalized_length)])
+                start_offset += escaped_lengths.fetch(emitted_count, 0)
+                current_line = +""
+                current_normalized_length = 0
+                emitted_count += 1
+                inner_part
+              else
+                nil
+              end
             end
           end
         end
diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb
index f7187b1724..7db519499f 100644
--- a/lib/prism/translation/parser/lexer.rb
+++ b/lib/prism/translation/parser/lexer.rb
@@ -341,6 +341,7 @@ module Prism
             when :tRATIONAL
               value = parse_rational(value)
             when :tSPACE
+              location = range(token.location.start_offset, token.location.start_offset + percent_array_leading_whitespace(value))
               value = nil
             when :tSTRING_BEG
               next_token = lexed[index][0]
@@ -354,11 +355,15 @@ module Prism
                 location = range(next_location.start_offset, next_location.end_offset)
                 index += 1
               elsif value.start_with?("'", '"', "%")
-                if next_token&.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && next_next_token&.type == :STRING_END
-                  # the parser gem doesn't simplify strings when its value ends in a newline
-                  if !(string_value = next_token.value).end_with?("\n") && basic_quotes
+                if next_token&.type == :STRING_CONTENT && next_next_token&.type == :STRING_END
+                  string_value = next_token.value
+                  if simplify_string?(string_value, value)
                     next_location = token.location.join(next_next_token.location)
-                    value = unescape_string(string_value, value)
+                    if percent_array?(value)
+                      value = percent_array_unescape(string_value)
+                    else
+                      value = unescape_string(string_value, value)
+                    end
                     type = :tSTRING
                     location = range(next_location.start_offset, next_location.end_offset)
                     index += 2
@@ -397,16 +402,34 @@ module Prism
                 quote_stack.push(value)
               end
             when :tSTRING_CONTENT
+              is_percent_array = percent_array?(quote_stack.last)
+
               if (lines = token.value.lines).one?
-                # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
-                is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
-                # The parser gem only removes indentation when the heredoc is not nested
-                not_nested = heredoc_stack.size == 1
-                if is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
-                  value = trim_heredoc_whitespace(value, current_heredoc)
-                end
+                # Prism usually emits a single token for strings with line continuations.
+                # For squiggly heredocs they are not joined so we do that manually here.
+                current_string = +""
+                current_length = 0
+                start_offset = token.location.start_offset
+                while token.type == :STRING_CONTENT
+                  current_length += token.value.bytesize
+                  # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
+                  is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
+                  # The parser gem only removes indentation when the heredoc is not nested
+                  not_nested = heredoc_stack.size == 1
+                  if is_percent_array
+                    value = percent_array_unescape(token.value)
+                  elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
+                    value = trim_heredoc_whitespace(token.value, current_heredoc)
+                  end
 
-                value = unescape_string(value, quote_stack.last)
+                  current_string << unescape_string(value, quote_stack.last)
+                  if (backslash_count = token.value[/(\\{1,})\n/, 1]&.length).nil? || backslash_count.even? || !interpolation?(quote_stack.last)
+                    tokens << [:tSTRING_CONTENT, [current_string, range(start_offset, start_offset + current_length)]]
+                    break
+                  end
+                  token = lexed[index][0]
+                  index += 1
+                end
               else
                 # When the parser gem encounters a line continuation inside of a multiline string,
                 # it emits a single string node. The backslash (and remaining newline) is removed.
@@ -419,12 +442,10 @@ module Prism
                   chomped_line = line.chomp
                   backslash_count = chomped_line[/\\{1,}\z/]&.length || 0
                   is_interpolation = interpolation?(quote_stack.last)
-                  is_percent_array = percent_array?(quote_stack.last)
 
                   if backslash_count.odd? && (is_interpolation || is_percent_array)
                     if is_percent_array
-                      # Remove the last backslash, keep potential newlines
-                      current_line << line.sub(/(\\)(\r?\n)\z/, '\2')
+                      current_line << percent_array_unescape(line)
                       adjustment += 1
                     else
                       chomped_line.delete_suffix!("\\")
@@ -446,8 +467,8 @@ module Prism
                     adjustment = 0
                   end
                 end
-                next
               end
+              next
             when :tSTRING_DVAR
               value = nil
             when :tSTRING_END
@@ -570,12 +591,13 @@ module Prism
           while (lexed[next_token_index] && next_token = lexed[next_token_index][0])
             next_token_index += 1
             next_next_token = lexed[next_token_index] && lexed[next_token_index][0]
+            first_token_on_line = next_token.location.start_column == 0
 
             # String content inside nested heredocs and interpolation is ignored
             if next_token.type == :HEREDOC_START || next_token.type == :EMBEXPR_BEGIN
               # When interpolation is the first token of a line there is no string
               # content to check against. There will be no common whitespace.
-              if nesting_level == 0 && next_token.location.start_column == 0
+              if nesting_level == 0 && first_token_on_line
                 result = 0
               end
               nesting_level += 1
@@ -583,7 +605,7 @@ module Prism
               nesting_level -= 1
               # When we encountered the matching heredoc end, we can exit
               break if nesting_level == -1
-            elsif next_token.type == :STRING_CONTENT && nesting_level == 0
+            elsif next_token.type == :STRING_CONTENT && nesting_level == 0 && first_token_on_line
               common_whitespace = 0
               next_token.value[/^\s*/].each_char do |char|
                 if char == "\t"
@@ -672,50 +694,107 @@ module Prism
             while (skipped = scanner.skip_until(/\\/))
               # Append what was just skipped over, excluding the found backslash.
               result.append_as_bytes(string.byteslice(scanner.pos - skipped, skipped - 1))
-
-              # Simple single-character escape sequences like \n
-              if (replacement = ESCAPES[scanner.peek(1)])
-                result.append_as_bytes(replacement)
-                scanner.pos += 1
-              elsif (octal = scanner.check(/[0-7]{1,3}/))
-                # \nnn
-                result.append_as_bytes(octal.to_i(8).chr)
-                scanner.pos += octal.bytesize
-              elsif (hex = scanner.check(/x([0-9a-fA-F]{1,2})/))
-                # \xnn
-                result.append_as_bytes(hex[1..].to_i(16).chr)
-                scanner.pos += hex.bytesize
-              elsif (unicode = scanner.check(/u([0-9a-fA-F]{4})/))
-                # \unnnn
-                result.append_as_bytes(unicode[1..].hex.chr(Encoding::UTF_8))
-                scanner.pos += unicode.bytesize
-              elsif scanner.peek(3) == "u{}"
-                # https://github.com/whitequark/parser/issues/856
-                scanner.pos += 3
-              elsif (unicode_parts = scanner.check(/u{.*}/))
-                # \u{nnnn ...}
-                unicode_parts[2..-2].split.each do |unicode|
-                  result.append_as_bytes(unicode.hex.chr(Encoding::UTF_8))
-                end
-                scanner.pos += unicode_parts.bytesize
-              end
+              escape_read(result, scanner, false, false)
             end
 
-            # Add remainging chars
+            # Add remaining chars
             result.append_as_bytes(string.byteslice(scanner.pos..))
-
             result.force_encoding(source_buffer.source.encoding)
-
-            result
           else
             delimiters = Regexp.escape("#{delimiter}#{DELIMITER_SYMETRY[delimiter]}")
             string.gsub(/\\([\\#{delimiters}])/, '\1')
           end
         end
 
+        # Certain strings are merged into a single string token.
+        def simplify_string?(value, quote)
+          case quote
+          when "'"
+            # Only simplify 'foo'
+            !value.include?("\n")
+          when '"'
+            # Simplify when every line ends with a line continuation, or it is the last line
+            value.lines.all? do |line|
+              !line.end_with?("\n") || line[/(\\*)$/, 1]&.length&.odd?
+            end
+          else
+            # %q and similar are never simplified
+            false
+          end
+        end
+
+        # Escape a byte value, given the control and meta flags.
+        def escape_build(value, control, meta)
+          value &= 0x9f if control
+          value |= 0x80 if meta
+          value
+        end
+
+        # Read an escape out of the string scanner, given the control and meta
+        # flags, and push the unescaped value into the result.
+        def escape_read(result, scanner, control, meta)
+          if scanner.skip("\n")
+            # Line continuation
+          elsif (value = ESCAPES[scanner.peek(1)])
+            # Simple single-character escape sequences like \n
+            result.append_as_bytes(value)
+            scanner.pos += 1
+          elsif (value = scanner.scan(/[0-7]{1,3}/))
+            # \nnn
+            result.append_as_bytes(escape_build(value.to_i(8), control, meta))
+          elsif (value = scanner.scan(/x[0-9a-fA-F]{1,2}/))
+            # \xnn
+            result.append_as_bytes(escape_build(value[1..].to_i(16), control, meta))
+          elsif (value = scanner.scan(/u[0-9a-fA-F]{4}/))
+            # \unnnn
+            result.append_as_bytes(value[1..].hex.chr(Encoding::UTF_8))
+          elsif scanner.skip("u{}")
+            # https://github.com/whitequark/parser/issues/856
+          elsif (value = scanner.scan(/u{.*?}/))
+            # \u{nnnn ...}
+            value[2..-2].split.each do |unicode|
+              result.append_as_bytes(unicode.hex.chr(Encoding::UTF_8))
+            end
+          elsif (value = scanner.scan(/c\\?(?=[[:print:]])|C-\\?(?=[[:print:]])/))
+            # \cx or \C-x where x is an ASCII printable character
+            escape_read(result, scanner, true, meta)
+          elsif (value = scanner.scan(/M-\\?(?=[[:print:]])/))
+            # \M-x where x is an ASCII printable character
+            escape_read(result, scanner, control, true)
+          elsif (byte = scanner.get_byte)
+            # Something else after an escape.
+            if control && byte == "?"
+              result.append_as_bytes(escape_build(0x7f, false, meta))
+            else
+              result.append_as_bytes(escape_build(byte.ord, control, meta))
+            end
+          end
+        end
+
+        # In a percent array, certain whitespace can be preceded by a backslash,
+        # causing the following characters to be part of the previous element.
+        def percent_array_unescape(string)
+          string.gsub(/(\\)+[ \f\n\r\t\v]/) do |full_match|
+            full_match.delete_prefix!("\\") if Regexp.last_match[1].length.odd?
+            full_match
+          end
+        end
+
+        # For %-arrays whitespace, the parser gem only considers whitespace before the newline.
+        def percent_array_leading_whitespace(string)
+          return 1 if string.start_with?("\n")
+
+          leading_whitespace = 0
+          string.each_char do |c|
+            break if c == "\n"
+            leading_whitespace += 1
+          end
+          leading_whitespace
+        end
+
         # Determine if characters preceeded by a backslash should be escaped or not
         def interpolation?(quote)
-          quote != "'" && !quote.start_with?("%q", "%w", "%i")
+          !quote.end_with?("'") && !quote.start_with?("%q", "%w", "%i", "%s")
         end
 
         # Regexp allow interpolation but are handled differently during unescaping
author	Earlopain <[email protected]>	2025-01-15 23:24:05 +0100
committer	Kevin Newton <[email protected]>	2025-03-18 13:36:53 -0400
commit	bc506295a30a5806b3346ed09cd679f3b8ee6f64 (patch)
tree	d37cb1f1b816eaa63be60f55b99eebe0a970e9eb /lib
parent	9e5e3f1bede46ed499a809975c663ba32c34ffff (diff)