author     Earlopain <[email protected]>  2025-01-06 20:18:03 +0100
committer  Kevin Newton <[email protected]>  2025-01-11 19:09:05 -0500
commit     a234fd516f82702f9efd67e3f67de129702a6801 (patch)
tree       5cb7b38794458953bcf4cd70887833eb734356f4
parent     d1a70014f9a1ee411c41338d0929443bab004cda (diff)
[ruby/prism] Fix parser translator ast for regex with line continuation
Turns out, the vast majority of the work was already done when handling the same thing for heredocs. I'm confident this should also apply to actual string nodes (there's even a TODO for it), but no tests change if I apply it there too, so I can't say for sure whether the logic would be correct. The individual test files are a bit too large; maybe something else that currently passes would break. Leaving it for later to look into that more closely.

https://github.com/ruby/prism/commit/6bba1c54e1
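
Not part of the commit: a quick way to eyeball the behavior this targets is to compare the parser gem's own AST for a regex containing a line continuation with prism's translation of the same source. This sketch assumes the parser gem and a recent prism are both installed, and that Parser33 (targeting Ruby 3.3 syntax) is the right translator class for your Ruby version.

    require "parser/current"
    require "prism"

    # A regex spanning three lines; the backslash at the end of the second
    # line is a line continuation, so "b" and "c" belong to the same part.
    source = <<~'RUBY'
      /a
      b\
      c/
    RUBY

    buffer = Parser::Source::Buffer.new("(regex)", source: source)

    p Parser::CurrentRuby.parse(source)              # reference AST from the parser gem
    p Prism::Translation::Parser33.new.parse(buffer) # prism's translation should now match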
-rw-r--r--  lib/prism/translation/parser/compiler.rb  111
-rw-r--r--  test/prism/fixtures/regex.txt                8
-rw-r--r--  test/prism/ruby/parser_test.rb               1
3 files changed, 64 insertions, 56 deletions
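
The continuation test used throughout this patch (the chunk_while call in the diff below) treats an escaped line as continued when the run of backslashes directly before the newline has odd length: an even run means every backslash is itself escaped and the newline is real. A standalone illustration of just that check, using lines from the new fixture, not code taken from the patch:

    # Each sample line mirrors a line of the regex added to test/prism/fixtures/regex.txt.
    ["b\\\n", "d\\\\\\\n", "e\\\\\n"].each do |line|
      continuation = line[/(\\*)\r?\n$/, 1]&.length&.odd? || false
      puts "#{line.inspect} continuation? #{continuation}"
    end
    # => "b\\\n"     continuation? true   (one backslash escapes the newline)
    # => "d\\\\\\\n" continuation? true   (three backslashes; the last escapes the newline)
    # => "e\\\\\n"   continuation? false  (two backslashes are one escaped backslash; the newline is real)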
diff --git a/lib/prism/translation/parser/compiler.rb b/lib/prism/translation/parser/compiler.rb
index 54e08eb991..c6a7154625 100644
--- a/lib/prism/translation/parser/compiler.rb
+++ b/lib/prism/translation/parser/compiler.rb
@@ -1511,13 +1511,9 @@ module Prism
# /foo/
# ^^^^^
def visit_regular_expression_node(node)
- content = node.content
parts =
- if content.include?("\n")
- offset = node.content_loc.start_offset
- content.lines.map do |line|
- builder.string_internal([line, srange_offsets(offset, offset += line.bytesize)])
- end
+ if node.content.include?("\n")
+ string_nodes_from_line_continuations(node, node.content_loc.start_offset, node.opening)
else
[builder.string_internal(token(node.content_loc))]
end
@@ -2074,55 +2070,7 @@ module Prism
node.parts.each do |part|
pushing =
if part.is_a?(StringNode) && part.unescaped.include?("\n")
- unescaped = part.unescaped.lines
- escaped = part.content.lines
-
- escaped_lengths = []
- normalized_lengths = []
- # Keeps track of where an unescaped line should start a new token. An unescaped
- # \n would otherwise be indistinguishable from the actual newline at the end of
- # of the line. The parser gem only emits a new string node at "real" newlines,
- # line continuations don't start a new node as well.
- do_next_tokens = []
-
- if node.opening.end_with?("'")
- escaped.each do |line|
- escaped_lengths << line.bytesize
- normalized_lengths << chomped_bytesize(line)
- do_next_tokens << true
- end
- else
- escaped
- .chunk_while { |before, after| before[/(\\*)\r?\n$/, 1]&.length&.odd? || false }
- .each do |lines|
- escaped_lengths << lines.sum(&:bytesize)
- normalized_lengths << lines.sum { |line| chomped_bytesize(line) }
- unescaped_lines_count = lines.sum do |line|
- line.scan(/(\\*)n/).count { |(backslashes)| backslashes&.length&.odd? || false }
- end
- do_next_tokens.concat(Array.new(unescaped_lines_count + 1, false))
- do_next_tokens[-1] = true
- end
- end
-
- start_offset = part.location.start_offset
- current_line = +""
- current_normalized_length = 0
-
- unescaped.filter_map.with_index do |unescaped_line, index|
- current_line << unescaped_line
- current_normalized_length += normalized_lengths.fetch(index, 0)
-
- if do_next_tokens[index]
- inner_part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_normalized_length)])
- start_offset += escaped_lengths.fetch(index, 0)
- current_line = +""
- current_normalized_length = 0
- inner_part
- else
- nil
- end
- end
+ string_nodes_from_line_continuations(part, part.location.start_offset, node.opening)
else
[visit(part)]
end
@@ -2172,6 +2120,59 @@ module Prism
parser.pattern_variables.pop
end
end
+
+ # Create parser string nodes from a single prism node. The parser gem
+ # "glues" strings together when a line continuation is encountered.
+ def string_nodes_from_line_continuations(node, start_offset, opening)
+ unescaped = node.unescaped.lines
+ escaped = node.content.lines
+
+ escaped_lengths = []
+ normalized_lengths = []
+ # Keeps track of where an unescaped line should start a new token. An unescaped
+ # \n would otherwise be indistinguishable from the actual newline at the end
+ # of the line. The parser gem only emits a new string node at "real" newlines;
+ # line continuations don't start a new node either.
+ do_next_tokens = []
+
+ if opening.end_with?("'")
+ escaped.each do |line|
+ escaped_lengths << line.bytesize
+ normalized_lengths << chomped_bytesize(line)
+ do_next_tokens << true
+ end
+ else
+ escaped
+ .chunk_while { |before, after| before[/(\\*)\r?\n$/, 1]&.length&.odd? || false }
+ .each do |lines|
+ escaped_lengths << lines.sum(&:bytesize)
+ normalized_lengths << lines.sum { |line| chomped_bytesize(line) }
+ unescaped_lines_count = lines.sum do |line|
+ line.scan(/(\\*)n/).count { |(backslashes)| backslashes&.length&.odd? || false }
+ end
+ do_next_tokens.concat(Array.new(unescaped_lines_count + 1, false))
+ do_next_tokens[-1] = true
+ end
+ end
+
+ current_line = +""
+ current_normalized_length = 0
+
+ unescaped.filter_map.with_index do |unescaped_line, index|
+ current_line << unescaped_line
+ current_normalized_length += normalized_lengths.fetch(index, 0)
+
+ if do_next_tokens[index]
+ inner_part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_normalized_length)])
+ start_offset += escaped_lengths.fetch(index, 0)
+ current_line = +""
+ current_normalized_length = 0
+ inner_part
+ else
+ nil
+ end
+ end
+ end
end
end
end
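
As a standalone sketch (again, not code from the patch), this is how the grouping step in the new helper buckets the escaped lines of the regex added to regex.txt below: lines that end in a continuation stay in the same group, and each group becomes a single parser string node.

    # Escaped lines of the fixture regex, as node.content.lines would return them.
    escaped = ["a\n", "b\\\n", "c\\\n", "d\\\\\\\n", "e\\\\\n", "f\\\n"]

    groups = escaped.chunk_while do |before, _after|
      before[/(\\*)\r?\n$/, 1]&.length&.odd? || false
    end

    p groups.to_a
    # => [["a\n"], ["b\\\n", "c\\\n", "d\\\\\\\n", "e\\\\\n"], ["f\\\n"]]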
diff --git a/test/prism/fixtures/regex.txt b/test/prism/fixtures/regex.txt
index 4623733f58..85e600fbdd 100644
--- a/test/prism/fixtures/regex.txt
+++ b/test/prism/fixtures/regex.txt
@@ -46,3 +46,11 @@ tap { /(?<a>)/ =~ to_s }
def foo(nil:) = /(?<nil>)/ =~ ""
/(?-x:#)/x
+
+/a
+b\
+c\
+d\\\
+e\\
+f\
+/
diff --git a/test/prism/ruby/parser_test.rb b/test/prism/ruby/parser_test.rb
index 5536a3e6ee..d28acd527f 100644
--- a/test/prism/ruby/parser_test.rb
+++ b/test/prism/ruby/parser_test.rb
@@ -62,7 +62,6 @@ module Prism
# These files are either failing to parse or failing to translate, so we'll
# skip them for now.
skip_all = skip_incorrect | [
- "regex.txt",
"unescaping.txt",
"seattlerb/bug190.txt",
"seattlerb/heredoc_with_extra_carriage_returns_windows.txt",