[ruby/prism] Further refine string handling in the parser translator

Mostly around newlines and line continuations.

* Percent arrays need special backslash handling in the AST
* Fix an offset issue for heredocs with many line continuations (the wrong variable was used as the index)
* More refined rules on when to simplify string tokens
* Handle line continuations in squiggly heredocs
* Correctly dedent squiggly heredocs with interpolation
* Consider `:'foo'` and `%s[foo]` to not be interpolation

https://github.com/ruby/prism/commit/4edfe9d981
author: Earlopain <[email protected]> 2025-01-15 23:24:05 +0100
committer: Kevin Newton <[email protected]> 2025-03-18 13:36:53 -0400
commit: fd7a10cf4a73f27a0113a6bc2a65c4c274ee11ec (patch)
tree: 788c85bfbff8be7d04934cd3ed54500038da252b /lib
parent: 5d138f2b436dc84b1efed86ac3328e67638887cb (diff)
2 files changed, 75 insertions, 0 deletions
diff --git a/lib/prism/translation/parser/compiler.rb b/lib/prism/translation/parser/compiler.rb
index 83cbe6e0c9..8f6a602c42 100644
--- a/lib/prism/translation/parser/compiler.rb
+++ b/lib/prism/translation/parser/compiler.rb
@@ -1119,6 +1119,7 @@ module Prism
 
 <<<<<<< HEAD
 <<<<<<< HEAD
+<<<<<<< HEAD
 =======
           parts = if node.parts.one? { |part| part.type == :string_node }
             node.parts.flat_map do |node|
@@ -1150,6 +1151,8 @@ module Prism
           end
 
 >>>>>>> a651126458 (Fix an incompatibility with the parser translator)
+=======
+>>>>>>> 4edfe9d981 (Further refine string handling in the parser translator)
           builder.string_compose(
             token(node.opening_loc),
             string_nodes_from_interpolation(node, node.opening),
@@ -2214,9 +2217,13 @@ module Prism
           unescaped = unescaped.lines
           escaped = escaped.lines
 <<<<<<< HEAD
+<<<<<<< HEAD
           percent_array = opening&.start_with?("%w", "%W", "%i", "%I")
 =======
 >>>>>>> 2637007929 (Better handle all kinds of multiline strings in the parser translator)
+=======
+          percent_array = opening&.start_with?("%w", "%W", "%i", "%I")
+>>>>>>> 4edfe9d981 (Further refine string handling in the parser translator)
 
           # Non-interpolating strings
           if opening&.end_with?("'") || opening&.start_with?("%q", "%s", "%w", "%i")
@@ -2224,6 +2231,9 @@ module Prism
             current_line = +""
 
 <<<<<<< HEAD
+<<<<<<< HEAD
+=======
+>>>>>>> 4edfe9d981 (Further refine string handling in the parser translator)
             escaped.filter_map.with_index do |escaped_line, index|
               unescaped_line = unescaped.fetch(index, "")
               current_length += escaped_line.bytesize
@@ -2238,6 +2248,7 @@ module Prism
               current_line = +""
               current_length = 0
               s
+<<<<<<< HEAD
 =======
           if opening&.end_with?("'")
             escaped.each do |line|
@@ -2245,6 +2256,8 @@ module Prism
               normalized_lengths << chomped_bytesize(line)
               do_next_tokens << true
 >>>>>>> 2637007929 (Better handle all kinds of multiline strings in the parser translator)
+=======
+>>>>>>> 4edfe9d981 (Further refine string handling in the parser translator)
             end
           else
             escaped_lengths = []
diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb
index 687b4f6043..4b7ced7afa 100644
--- a/lib/prism/translation/parser/lexer.rb
+++ b/lib/prism/translation/parser/lexer.rb
@@ -407,6 +407,7 @@ module Prism
 =======
 
               if (lines = token.value.lines).one?
+<<<<<<< HEAD
                 # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
                 is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
                 # The parser gem only removes indentation when the heredoc is not nested
@@ -437,6 +438,25 @@ module Prism
                   end
 
 <<<<<<< HEAD
+=======
+                # Prism usually emits a single token for strings with line continuations.
+                # For squiggly heredocs they are not joined so we do that manually here.
+                current_string = +""
+                current_length = 0
+                start_offset = token.location.start_offset
+                while token.type == :STRING_CONTENT
+                  current_length += token.value.bytesize
+                  # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
+                  is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
+                  # The parser gem only removes indentation when the heredoc is not nested
+                  not_nested = heredoc_stack.size == 1
+                  if is_percent_array
+                    value = percent_array_unescape(token.value)
+                  elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
+                    value = trim_heredoc_whitespace(token.value, current_heredoc)
+                  end
+
+>>>>>>> 4edfe9d981 (Further refine string handling in the parser translator)
                   current_string << unescape_string(value, quote_stack.last)
                   if (backslash_count = token.value[/(\\{1,})\n/, 1]&.length).nil? || backslash_count.even? || !interpolation?(quote_stack.last)
                     tokens << [:tSTRING_CONTENT, [current_string, range(start_offset, start_offset + current_length)]]
@@ -714,7 +734,40 @@ module Prism
             while (skipped = scanner.skip_until(/\\/))
               # Append what was just skipped over, excluding the found backslash.
               result.append_as_bytes(string.byteslice(scanner.pos - skipped, skipped - 1))
+<<<<<<< HEAD
               escape_read(result, scanner, false, false)
+=======
+
+              if scanner.peek(1) == "\n"
+                # Line continuation
+                scanner.pos += 1
+              elsif (replacement = ESCAPES[scanner.peek(1)])
+                # Simple single-character escape sequences like \n
+                result.append_as_bytes(replacement)
+                scanner.pos += 1
+              elsif (octal = scanner.check(/[0-7]{1,3}/))
+                # \nnn
+                result.append_as_bytes(octal.to_i(8).chr)
+                scanner.pos += octal.bytesize
+              elsif (hex = scanner.check(/x([0-9a-fA-F]{1,2})/))
+                # \xnn
+                result.append_as_bytes(hex[1..].to_i(16).chr)
+                scanner.pos += hex.bytesize
+              elsif (unicode = scanner.check(/u([0-9a-fA-F]{4})/))
+                # \unnnn
+                result.append_as_bytes(unicode[1..].hex.chr(Encoding::UTF_8))
+                scanner.pos += unicode.bytesize
+              elsif scanner.peek(3) == "u{}"
+                # https://github.com/whitequark/parser/issues/856
+                scanner.pos += 3
+              elsif (unicode_parts = scanner.check(/u{.*}/))
+                # \u{nnnn ...}
+                unicode_parts[2..-2].split.each do |unicode|
+                  result.append_as_bytes(unicode.hex.chr(Encoding::UTF_8))
+                end
+                scanner.pos += unicode_parts.bytesize
+              end
+>>>>>>> 4edfe9d981 (Further refine string handling in the parser translator)
             end
 
             # Add remaining chars
@@ -727,6 +780,9 @@ module Prism
         end
 
 <<<<<<< HEAD
+<<<<<<< HEAD
+=======
+>>>>>>> 4edfe9d981 (Further refine string handling in the parser translator)
         # Certain strings are merged into a single string token.
         def simplify_string?(value, quote)
           case quote
@@ -744,6 +800,7 @@ module Prism
           end
         end
 
+<<<<<<< HEAD
         # Escape a byte value, given the control and meta flags.
         def escape_build(value, control, meta)
           value &= 0x9f if control
@@ -794,6 +851,8 @@ module Prism
 
 =======
 >>>>>>> bd3dd2b62a (Fix parser translator tokens for %-arrays with whitespace escapes)
+=======
+>>>>>>> 4edfe9d981 (Further refine string handling in the parser translator)
         # In a percent array, certain whitespace can be preceeded with a backslash,
         # causing the following characters to be part of the previous element.
         def percent_array_unescape(string)
@@ -818,11 +877,14 @@ module Prism
         # Determine if characters preceeded by a backslash should be escaped or not
         def interpolation?(quote)
           !quote.end_with?("'") && !quote.start_with?("%q", "%w", "%i", "%s")
+<<<<<<< HEAD
         end
 
         # Regexp allow interpolation but are handled differently during unescaping
         def regexp?(quote)
           quote == "/" || quote.start_with?("%r")
+=======
+>>>>>>> 4edfe9d981 (Further refine string handling in the parser translator)
         end
 
         # Regexp allow interpolation but are handled differently during unescaping
author	Earlopain <[email protected]>	2025-01-15 23:24:05 +0100
committer	Kevin Newton <[email protected]>	2025-03-18 13:36:53 -0400
commit	fd7a10cf4a73f27a0113a6bc2a65c4c274ee11ec (patch)
tree	788c85bfbff8be7d04934cd3ed54500038da252b /lib
parent	5d138f2b436dc84b1efed86ac3328e67638887cb (diff)