diff options
author | Earlopain <[email protected]> | 2025-01-15 23:24:05 +0100 |
---|---|---|
committer | Kevin Newton <[email protected]> | 2025-03-18 13:36:53 -0400 |
commit | fd7a10cf4a73f27a0113a6bc2a65c4c274ee11ec (patch) | |
tree | 788c85bfbff8be7d04934cd3ed54500038da252b /lib | |
parent | 5d138f2b436dc84b1efed86ac3328e67638887cb (diff) |
[ruby/prism] Further refine string handling in the parser translator
Mostly around newlines and line continuation.
* Percent arrays need special backslash handling in the AST
* Fix offset issue for heredocs with many line continuations (the wrong variable was used as the index)
* More refined rules on when to simplify string tokens
* Handle line continuations in squiggly heredocs
* Correctly dedent squiggly heredocs with interpolation
* Consider `:'foo'` and `%s[foo]` to not be interpolation
https://github.com/ruby/prism/commit/4edfe9d981
Diffstat (limited to 'lib')
-rw-r--r-- | lib/prism/translation/parser/compiler.rb | 13 | ||||
-rw-r--r-- | lib/prism/translation/parser/lexer.rb | 62 |
2 files changed, 75 insertions, 0 deletions
diff --git a/lib/prism/translation/parser/compiler.rb b/lib/prism/translation/parser/compiler.rb index 83cbe6e0c9..8f6a602c42 100644 --- a/lib/prism/translation/parser/compiler.rb +++ b/lib/prism/translation/parser/compiler.rb @@ -1119,6 +1119,7 @@ module Prism <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD ======= parts = if node.parts.one? { |part| part.type == :string_node } node.parts.flat_map do |node| @@ -1150,6 +1151,8 @@ module Prism end >>>>>>> a651126458 (Fix an incompatibility with the parser translator) +======= +>>>>>>> 4edfe9d981 (Further refine string handling in the parser translator) builder.string_compose( token(node.opening_loc), string_nodes_from_interpolation(node, node.opening), @@ -2214,9 +2217,13 @@ module Prism unescaped = unescaped.lines escaped = escaped.lines <<<<<<< HEAD +<<<<<<< HEAD percent_array = opening&.start_with?("%w", "%W", "%i", "%I") ======= >>>>>>> 2637007929 (Better handle all kinds of multiline strings in the parser translator) +======= + percent_array = opening&.start_with?("%w", "%W", "%i", "%I") +>>>>>>> 4edfe9d981 (Further refine string handling in the parser translator) # Non-interpolating strings if opening&.end_with?("'") || opening&.start_with?("%q", "%s", "%w", "%i") @@ -2224,6 +2231,9 @@ module Prism current_line = +"" <<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> 4edfe9d981 (Further refine string handling in the parser translator) escaped.filter_map.with_index do |escaped_line, index| unescaped_line = unescaped.fetch(index, "") current_length += escaped_line.bytesize @@ -2238,6 +2248,7 @@ module Prism current_line = +"" current_length = 0 s +<<<<<<< HEAD ======= if opening&.end_with?("'") escaped.each do |line| @@ -2245,6 +2256,8 @@ module Prism normalized_lengths << chomped_bytesize(line) do_next_tokens << true >>>>>>> 2637007929 (Better handle all kinds of multiline strings in the parser translator) +======= +>>>>>>> 4edfe9d981 (Further refine string handling in the parser translator) end else escaped_lengths = [] 
diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb index 687b4f6043..4b7ced7afa 100644 --- a/lib/prism/translation/parser/lexer.rb +++ b/lib/prism/translation/parser/lexer.rb @@ -407,6 +407,7 @@ module Prism ======= if (lines = token.value.lines).one? +<<<<<<< HEAD # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line. is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line # The parser gem only removes indentation when the heredoc is not nested @@ -437,6 +438,25 @@ module Prism end <<<<<<< HEAD +======= + # Prism usually emits a single token for strings with line continuations. + # For squiggly heredocs they are not joined so we do that manually here. + current_string = +"" + current_length = 0 + start_offset = token.location.start_offset + while token.type == :STRING_CONTENT + current_length += token.value.bytesize + # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line. + is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line + # The parser gem only removes indentation when the heredoc is not nested + not_nested = heredoc_stack.size == 1 + if is_percent_array + value = percent_array_unescape(token.value) + elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0 + value = trim_heredoc_whitespace(token.value, current_heredoc) + end + +>>>>>>> 4edfe9d981 (Further refine string handling in the parser translator) current_string << unescape_string(value, quote_stack.last) if (backslash_count = token.value[/(\\{1,})\n/, 1]&.length).nil? || backslash_count.even? 
|| !interpolation?(quote_stack.last) tokens << [:tSTRING_CONTENT, [current_string, range(start_offset, start_offset + current_length)]] @@ -714,7 +734,40 @@ module Prism while (skipped = scanner.skip_until(/\\/)) # Append what was just skipped over, excluding the found backslash. result.append_as_bytes(string.byteslice(scanner.pos - skipped, skipped - 1)) +<<<<<<< HEAD escape_read(result, scanner, false, false) +======= + + if scanner.peek(1) == "\n" + # Line continuation + scanner.pos += 1 + elsif (replacement = ESCAPES[scanner.peek(1)]) + # Simple single-character escape sequences like \n + result.append_as_bytes(replacement) + scanner.pos += 1 + elsif (octal = scanner.check(/[0-7]{1,3}/)) + # \nnn + result.append_as_bytes(octal.to_i(8).chr) + scanner.pos += octal.bytesize + elsif (hex = scanner.check(/x([0-9a-fA-F]{1,2})/)) + # \xnn + result.append_as_bytes(hex[1..].to_i(16).chr) + scanner.pos += hex.bytesize + elsif (unicode = scanner.check(/u([0-9a-fA-F]{4})/)) + # \unnnn + result.append_as_bytes(unicode[1..].hex.chr(Encoding::UTF_8)) + scanner.pos += unicode.bytesize + elsif scanner.peek(3) == "u{}" + # https://github.com/whitequark/parser/issues/856 + scanner.pos += 3 + elsif (unicode_parts = scanner.check(/u{.*}/)) + # \u{nnnn ...} + unicode_parts[2..-2].split.each do |unicode| + result.append_as_bytes(unicode.hex.chr(Encoding::UTF_8)) + end + scanner.pos += unicode_parts.bytesize + end +>>>>>>> 4edfe9d981 (Further refine string handling in the parser translator) end # Add remaining chars @@ -727,6 +780,9 @@ module Prism end <<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> 4edfe9d981 (Further refine string handling in the parser translator) # Certain strings are merged into a single string token. def simplify_string?(value, quote) case quote @@ -744,6 +800,7 @@ module Prism end end +<<<<<<< HEAD # Escape a byte value, given the control and meta flags. 
def escape_build(value, control, meta) value &= 0x9f if control @@ -794,6 +851,8 @@ module Prism ======= >>>>>>> bd3dd2b62a (Fix parser translator tokens for %-arrays with whitespace escapes) +======= +>>>>>>> 4edfe9d981 (Further refine string handling in the parser translator) # In a percent array, certain whitespace can be preceeded with a backslash, # causing the following characters to be part of the previous element. def percent_array_unescape(string) @@ -818,11 +877,14 @@ module Prism # Determine if characters preceeded by a backslash should be escaped or not def interpolation?(quote) !quote.end_with?("'") && !quote.start_with?("%q", "%w", "%i", "%s") +<<<<<<< HEAD end # Regexp allow interpolation but are handled differently during unescaping def regexp?(quote) quote == "/" || quote.start_with?("%r") +======= +>>>>>>> 4edfe9d981 (Further refine string handling in the parser translator) end # Regexp allow interpolation but are handled differently during unescaping |