author     Earlopain <[email protected]>  2025-01-06 20:18:03 +0100
committer  Kevin Newton <[email protected]>  2025-01-11 19:09:05 -0500
commit     a234fd516f82702f9efd67e3f67de129702a6801 (patch)
tree       5cb7b38794458953bcf4cd70887833eb734356f4
parent     d1a70014f9a1ee411c41338d0929443bab004cda (diff)
[ruby/prism] Fix parser translator ast for regex with line continuation
Turns out, the vast majority of the work was already done when handling the same thing for heredocs. I'm confident this should also apply to actual string nodes (there's even a TODO for it), but no tests change if I apply it there too, so I can't say for sure whether the logic would be correct. The individual test files are a bit too large; maybe something else that currently passes would break. Leaving it for later to look into that more closely.

https://github.com/ruby/prism/commit/6bba1c54e1
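
Not part of the commit: a quick way to eyeball the behavior this targets is to compare the parser gem's own AST for a regex containing a line continuation with prism's translation of the same source. This sketch assumes the parser gem and a recent prism are both installed, and that Parser33 (targeting Ruby 3.3 syntax) is the right translator class for your Ruby version.

    require "parser/current"
    require "prism"

    # A regex spanning three lines; the backslash at the end of the second
    # line is a line continuation, so "b" and "c" belong to the same part.
    source = <<~'RUBY'
      /a
      b\
      c/
    RUBY

    buffer = Parser::Source::Buffer.new("(regex)", source: source)

    p Parser::CurrentRuby.parse(source)              # reference AST from the parser gem
    p Prism::Translation::Parser33.new.parse(buffer) # prism's translation should now match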
-rw-r--r--  lib/prism/translation/parser/compiler.rb  111
-rw-r--r--  test/prism/fixtures/regex.txt                8
-rw-r--r--  test/prism/ruby/parser_test.rb               1
3 files changed, 64 insertions, 56 deletions
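
The continuation test used throughout this patch (the chunk_while call in the diff below) treats an escaped line as continued when the run of backslashes directly before the newline has odd length: an even run means every backslash is itself escaped and the newline is real. A standalone illustration of just that check, using lines from the new fixture, not code taken from the patch:

    # Each sample line mirrors a line of the regex added to test/prism/fixtures/regex.txt.
    ["b\\\n", "d\\\\\\\n", "e\\\\\n"].each do |line|
      continuation = line[/(\\*)\r?\n$/, 1]&.length&.odd? || false
      puts "#{line.inspect} continuation? #{continuation}"
    end
    # => "b\\\n"     continuation? true   (one backslash escapes the newline)
    # => "d\\\\\\\n" continuation? true   (three backslashes; the last escapes the newline)
    # => "e\\\\\n"   continuation? false  (two backslashes are one escaped backslash; the newline is real)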
diff --git a/lib/prism/translation/parser/compiler.rb b/lib/prism/translation/parser/compiler.rb
index 54e08eb991..c6a7154625 100644
--- a/lib/prism/translation/parser/compiler.rb
+++ b/lib/prism/translation/parser/compiler.rb
@@ -1511,13 +1511,9 @@ module Prism
# /foo/
# ^^^^^
def visit_regular_expression_node(node)
- content = node.content
parts =
- if content.include?("\n")
- offset = node.content_loc.start_offset
- content.lines.map do |line|
- builder.string_internal([line, srange_offsets(offset, offset += line.bytesize)])
- end
+ if node.content.include?("\n")
+ string_nodes_from_line_continuations(node, node.content_loc.start_offset, node.opening)
else
[builder.string_internal(token(node.content_loc))]
end
@@ -2074,55 +2070,7 @@ module Prism
node.parts.each do |part|
pushing =
if part.is_a?(StringNode) && part.unescaped.include?("\n")
- unescaped = part.unescaped.lines
- escaped = part.content.lines
-
- escaped_lengths = []
- normalized_lengths = []
- # Keeps track of where an unescaped line should start a new token. An unescaped
- # \n would otherwise be indistinguishable from the actual newline at the end of
- # of the line. The parser gem only emits a new string node at "real" newlines,
- # line continuations don't start a new node as well.
- do_next_tokens = []
-
- if node.opening.end_with?("'")
- escaped.each do |line|
- escaped_lengths << line.bytesize
- normalized_lengths << chomped_bytesize(line)
- do_next_tokens << true
- end
- else
- escaped
- .chunk_while { |before, after| before[/(\\*)\r?\n$/, 1]&.length&.odd? || false }
- .each do |lines|
- escaped_lengths << lines.sum(&:bytesize)
- normalized_lengths << lines.sum { |line| chomped_bytesize(line) }
- unescaped_lines_count = lines.sum do |line|
- line.scan(/(\\*)n/).count { |(backslashes)| backslashes&.length&.odd? || false }
- end
- do_next_tokens.concat(Array.new(unescaped_lines_count + 1, false))
- do_next_tokens[-1] = true
- end
- end
-
- start_offset = part.location.start_offset
- current_line = +""
- current_normalized_length = 0
-
- unescaped.filter_map.with_index do |unescaped_line, index|
- current_line << unescaped_line
- current_normalized_length += normalized_lengths.fetch(index, 0)
-
- if do_next_tokens[index]
- inner_part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_normalized_length)])
- start_offset += escaped_lengths.fetch(index, 0)
- current_line = +""
- current_normalized_length = 0
- inner_part
- else
- nil
- end
- end
+ string_nodes_from_line_continuations(part, part.location.start_offset, node.opening)
else
[visit(part)]
end
@@ -2172,6 +2120,59 @@ module Prism
parser.pattern_variables.pop
end
end
+
+ # Create parser string nodes from a single prism node. The parser gem
+ # "glues" strings together when a line continuation is encountered.
+ def string_nodes_from_line_continuations(node, start_offset, opening)
+ unescaped = node.unescaped.lines
+ escaped = node.content.lines
+
+ escaped_lengths = []
+ normalized_lengths = []
+ # Keeps track of where an unescaped line should start a new token. An unescaped
+ # \n would otherwise be indistinguishable from the actual newline at the end
+ # of the line. The parser gem only emits a new string node at "real" newlines;
+ # line continuations don't start a new node either.
+ do_next_tokens = []
+
+ if opening.end_with?("'")
+ escaped.each do |line|
+ escaped_lengths << line.bytesize
+ normalized_lengths << chomped_bytesize(line)
+ do_next_tokens << true
+ end
+ else
+ escaped
+ .chunk_while { |before, after| before[/(\\*)\r?\n$/, 1]&.length&.odd? || false }
+ .each do |lines|
+ escaped_lengths << lines.sum(&:bytesize)
+ normalized_lengths << lines.sum { |line| chomped_bytesize(line) }
+ unescaped_lines_count = lines.sum do |line|
+ line.scan(/(\\*)n/).count { |(backslashes)| backslashes&.length&.odd? || false }
+ end
+ do_next_tokens.concat(Array.new(unescaped_lines_count + 1, false))
+ do_next_tokens[-1] = true
+ end
+ end
+
+ current_line = +""
+ current_normalized_length = 0
+
+ unescaped.filter_map.with_index do |unescaped_line, index|
+ current_line << unescaped_line
+ current_normalized_length += normalized_lengths.fetch(index, 0)
+
+ if do_next_tokens[index]
+ inner_part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_normalized_length)])
+ start_offset += escaped_lengths.fetch(index, 0)
+ current_line = +""
+ current_normalized_length = 0
+ inner_part
+ else
+ nil
+ end
+ end
+ end
end
end
end
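
As a standalone sketch (again, not code from the patch), this is how the grouping step in the new helper buckets the escaped lines of the regex added to regex.txt below: lines that end in a continuation stay in the same group, and each group becomes a single parser string node.

    # Escaped lines of the fixture regex, as node.content.lines would return them.
    escaped = ["a\n", "b\\\n", "c\\\n", "d\\\\\\\n", "e\\\\\n", "f\\\n"]

    groups = escaped.chunk_while do |before, _after|
      before[/(\\*)\r?\n$/, 1]&.length&.odd? || false
    end

    p groups.to_a
    # => [["a\n"], ["b\\\n", "c\\\n", "d\\\\\\\n", "e\\\\\n"], ["f\\\n"]]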
diff --git a/test/prism/fixtures/regex.txt b/test/prism/fixtures/regex.txt
index 4623733f58..85e600fbdd 100644
--- a/test/prism/fixtures/regex.txt
+++ b/test/prism/fixtures/regex.txt
@@ -46,3 +46,11 @@ tap { /(?<a>)/ =~ to_s }
def foo(nil:) = /(?<nil>)/ =~ ""
/(?-x:#)/x
+
+/a
+b\
+c\
+d\\\
+e\\
+f\
+/
diff --git a/test/prism/ruby/parser_test.rb b/test/prism/ruby/parser_test.rb
index 5536a3e6ee..d28acd527f 100644
--- a/test/prism/ruby/parser_test.rb
+++ b/test/prism/ruby/parser_test.rb
@@ -62,7 +62,6 @@ module Prism
# These files are either failing to parse or failing to translate, so we'll
# skip them for now.
skip_all = skip_incorrect | [
- "regex.txt",
"unescaping.txt",
"seattlerb/bug190.txt",
"seattlerb/heredoc_with_extra_carriage_returns_windows.txt",