diff options
author | Earlopain <[email protected]> | 2025-01-15 23:24:05 +0100 |
---|---|---|
committer | Kevin Newton <[email protected]> | 2025-03-18 13:36:53 -0400 |
commit | bc506295a30a5806b3346ed09cd679f3b8ee6f64 (patch) | |
tree | d37cb1f1b816eaa63be60f55b99eebe0a970e9eb /lib | |
parent | 9e5e3f1bede46ed499a809975c663ba32c34ffff (diff) |
[ruby/prism] Further refine string handling in the parser translator
Mostly around newlines and line continuation.
* Percent arrays need special backslash handling in the AST
* Fix offset issue for heredocs with many line continuations (the wrong variable was used as the index)
* More refined rules on when to simplify string tokens
* Handle line continuations in squiggly heredocs
* Correctly dedent squiggly heredocs with interpolation
* Consider `:'foo'` and `%s[foo]` to be non-interpolating
https://github.com/ruby/prism/commit/4edfe9d981
Diffstat (limited to 'lib')
-rw-r--r-- | lib/prism/translation/parser/compiler.rb | 152 | ||||
-rw-r--r-- | lib/prism/translation/parser/lexer.rb | 179 |
2 files changed, 224 insertions, 107 deletions
diff --git a/lib/prism/translation/parser/compiler.rb b/lib/prism/translation/parser/compiler.rb index 4eec8205c8..1459b53c48 100644 --- a/lib/prism/translation/parser/compiler.rb +++ b/lib/prism/translation/parser/compiler.rb @@ -74,7 +74,29 @@ module Prism # [] # ^^ def visit_array_node(node) - builder.array(token(node.opening_loc), visit_all(node.elements), token(node.closing_loc)) + if node.opening&.start_with?("%w", "%W", "%i", "%I") + elements = node.elements.flat_map do |element| + if element.is_a?(StringNode) + if element.content.include?("\n") + string_nodes_from_line_continuations(element.unescaped, element.content, element.content_loc.start_offset, node.opening) + else + [builder.string_internal([element.unescaped, srange(element.content_loc)])] + end + elsif element.is_a?(InterpolatedStringNode) + builder.string_compose( + token(element.opening_loc), + string_nodes_from_interpolation(element, node.opening), + token(element.closing_loc) + ) + else + [visit(element)] + end + end + else + elements = visit_all(node.elements) + end + + builder.array(token(node.opening_loc), elements, token(node.closing_loc)) end # foo => [bar] @@ -1088,19 +1110,9 @@ module Prism return visit_heredoc(node) { |children, closing| builder.string_compose(token(node.opening_loc), children, closing) } end - parts = node.parts.flat_map do |part| - # When the content of a string node is split across multiple lines, the - # parser gem creates individual string nodes for each line the content is part of. - if part.type == :string_node && part.content.include?("\n") && part.opening_loc.nil? - string_nodes_from_line_continuations(part.unescaped, part.content, part.content_loc.start_offset, node.opening) - else - visit(part) - end - end - builder.string_compose( token(node.opening_loc), - parts, + string_nodes_from_interpolation(node, node.opening), token(node.closing_loc) ) end @@ -1119,14 +1131,14 @@ module Prism # ^^^^^^^^^^^^ def visit_interpolated_x_string_node(node) if node.heredoc? 
- visit_heredoc(node) { |children, closing| builder.xstring_compose(token(node.opening_loc), children, closing) } - else - builder.xstring_compose( - token(node.opening_loc), - visit_all(node.parts), - token(node.closing_loc) - ) + return visit_heredoc(node) { |children, closing| builder.xstring_compose(token(node.opening_loc), children, closing) } end + + builder.xstring_compose( + token(node.opening_loc), + string_nodes_from_interpolation(node, node.opening), + token(node.closing_loc) + ) end # -> { it } @@ -2024,13 +2036,6 @@ module Prism end end - # The parser gem automatically converts \r\n to \n, meaning our offsets - # need to be adjusted to always subtract 1 from the length. - def chomped_bytesize(line) - chomped = line.chomp - chomped.bytesize + (chomped == line ? 0 : 1) - end - # Visit a heredoc that can be either a string or an xstring. def visit_heredoc(node) children = Array.new @@ -2099,55 +2104,88 @@ module Prism end end + # When the content of a string node is split across multiple lines, the + # parser gem creates individual string nodes for each line the content is part of. + def string_nodes_from_interpolation(node, opening) + node.parts.flat_map do |part| + if part.type == :string_node && part.content.include?("\n") && part.opening_loc.nil? + string_nodes_from_line_continuations(part.unescaped, part.content, part.content_loc.start_offset, opening) + else + visit(part) + end + end + end + # Create parser string nodes from a single prism node. The parser gem # "glues" strings together when a line continuation is encountered. 
def string_nodes_from_line_continuations(unescaped, escaped, start_offset, opening) unescaped = unescaped.lines escaped = escaped.lines + percent_array = opening&.start_with?("%w", "%W", "%i", "%I") + + # Non-interpolating strings + if opening&.end_with?("'") || opening&.start_with?("%q", "%s", "%w", "%i") + current_length = 0 + current_line = +"" + + escaped.filter_map.with_index do |escaped_line, index| + unescaped_line = unescaped.fetch(index, "") + current_length += escaped_line.bytesize + current_line << unescaped_line - escaped_lengths = [] - normalized_lengths = [] - # Keeps track of where an unescaped line should start a new token. An unescaped - # \n would otherwise be indistinguishable from the actual newline at the end of - # of the line. The parser gem only emits a new string node at "real" newlines, - # line continuations don't start a new node as well. - do_next_tokens = [] - - if opening&.end_with?("'") - escaped.each do |line| - escaped_lengths << line.bytesize - normalized_lengths << chomped_bytesize(line) - do_next_tokens << true + # Glue line continuations together. Only %w and %i arrays can contain these. + if percent_array && escaped_line[/(\\)*\n$/, 1]&.length&.odd? + next unless index == escaped.count - 1 + end + s = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_length)]) + start_offset += escaped_line.bytesize + current_line = +"" + current_length = 0 + s end else + escaped_lengths = [] + normalized_lengths = [] + # Keeps track of where an unescaped line should start a new token. An unescaped + # \n would otherwise be indistinguishable from the actual newline at the end of + # of the line. The parser gem only emits a new string node at "real" newlines, + # line continuations don't start a new node as well. + do_next_tokens = [] + escaped .chunk_while { |before, after| before[/(\\*)\r?\n$/, 1]&.length&.odd? 
|| false } .each do |lines| escaped_lengths << lines.sum(&:bytesize) - normalized_lengths << lines.sum { |line| chomped_bytesize(line) } unescaped_lines_count = lines.sum do |line| line.scan(/(\\*)n/).count { |(backslashes)| backslashes&.length&.odd? || false } end - do_next_tokens.concat(Array.new(unescaped_lines_count + 1, false)) + extra = 1 + extra = lines.count if percent_array # Account for line continuations in percent arrays + + normalized_lengths.concat(Array.new(unescaped_lines_count + extra, 0)) + normalized_lengths[-1] = lines.sum { |line| line.bytesize } + do_next_tokens.concat(Array.new(unescaped_lines_count + extra, false)) do_next_tokens[-1] = true end - end - - current_line = +"" - current_normalized_length = 0 - unescaped.filter_map.with_index do |unescaped_line, index| - current_line << unescaped_line - current_normalized_length += normalized_lengths.fetch(index, 0) - - if do_next_tokens[index] - inner_part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_normalized_length)]) - start_offset += escaped_lengths.fetch(index, 0) - current_line = +"" - current_normalized_length = 0 - inner_part - else - nil + current_line = +"" + current_normalized_length = 0 + + emitted_count = 0 + unescaped.filter_map.with_index do |unescaped_line, index| + current_line << unescaped_line + current_normalized_length += normalized_lengths.fetch(index, 0) + + if do_next_tokens[index] + inner_part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_normalized_length)]) + start_offset += escaped_lengths.fetch(emitted_count, 0) + current_line = +"" + current_normalized_length = 0 + emitted_count += 1 + inner_part + else + nil + end end end end diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb index f7187b1724..7db519499f 100644 --- a/lib/prism/translation/parser/lexer.rb +++ b/lib/prism/translation/parser/lexer.rb @@ -341,6 +341,7 @@ module Prism 
when :tRATIONAL value = parse_rational(value) when :tSPACE + location = range(token.location.start_offset, token.location.start_offset + percent_array_leading_whitespace(value)) value = nil when :tSTRING_BEG next_token = lexed[index][0] @@ -354,11 +355,15 @@ module Prism location = range(next_location.start_offset, next_location.end_offset) index += 1 elsif value.start_with?("'", '"', "%") - if next_token&.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && next_next_token&.type == :STRING_END - # the parser gem doesn't simplify strings when its value ends in a newline - if !(string_value = next_token.value).end_with?("\n") && basic_quotes + if next_token&.type == :STRING_CONTENT && next_next_token&.type == :STRING_END + string_value = next_token.value + if simplify_string?(string_value, value) next_location = token.location.join(next_next_token.location) - value = unescape_string(string_value, value) + if percent_array?(value) + value = percent_array_unescape(string_value) + else + value = unescape_string(string_value, value) + end type = :tSTRING location = range(next_location.start_offset, next_location.end_offset) index += 2 @@ -397,16 +402,34 @@ module Prism quote_stack.push(value) end when :tSTRING_CONTENT + is_percent_array = percent_array?(quote_stack.last) + if (lines = token.value.lines).one? - # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line. - is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line - # The parser gem only removes indentation when the heredoc is not nested - not_nested = heredoc_stack.size == 1 - if is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0 - value = trim_heredoc_whitespace(value, current_heredoc) - end + # Prism usually emits a single token for strings with line continuations. + # For squiggly heredocs they are not joined so we do that manually here. 
+ current_string = +"" + current_length = 0 + start_offset = token.location.start_offset + while token.type == :STRING_CONTENT + current_length += token.value.bytesize + # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line. + is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line + # The parser gem only removes indentation when the heredoc is not nested + not_nested = heredoc_stack.size == 1 + if is_percent_array + value = percent_array_unescape(token.value) + elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0 + value = trim_heredoc_whitespace(token.value, current_heredoc) + end - value = unescape_string(value, quote_stack.last) + current_string << unescape_string(value, quote_stack.last) + if (backslash_count = token.value[/(\\{1,})\n/, 1]&.length).nil? || backslash_count.even? || !interpolation?(quote_stack.last) + tokens << [:tSTRING_CONTENT, [current_string, range(start_offset, start_offset + current_length)]] + break + end + token = lexed[index][0] + index += 1 + end else # When the parser gem encounters a line continuation inside of a multiline string, # it emits a single string node. The backslash (and remaining newline) is removed. @@ -419,12 +442,10 @@ module Prism chomped_line = line.chomp backslash_count = chomped_line[/\\{1,}\z/]&.length || 0 is_interpolation = interpolation?(quote_stack.last) - is_percent_array = percent_array?(quote_stack.last) if backslash_count.odd? 
&& (is_interpolation || is_percent_array) if is_percent_array - # Remove the last backslash, keep potential newlines - current_line << line.sub(/(\\)(\r?\n)\z/, '\2') + current_line << percent_array_unescape(line) adjustment += 1 else chomped_line.delete_suffix!("\\") @@ -446,8 +467,8 @@ module Prism adjustment = 0 end end - next end + next when :tSTRING_DVAR value = nil when :tSTRING_END @@ -570,12 +591,13 @@ module Prism while (lexed[next_token_index] && next_token = lexed[next_token_index][0]) next_token_index += 1 next_next_token = lexed[next_token_index] && lexed[next_token_index][0] + first_token_on_line = next_token.location.start_column == 0 # String content inside nested heredocs and interpolation is ignored if next_token.type == :HEREDOC_START || next_token.type == :EMBEXPR_BEGIN # When interpolation is the first token of a line there is no string # content to check against. There will be no common whitespace. - if nesting_level == 0 && next_token.location.start_column == 0 + if nesting_level == 0 && first_token_on_line result = 0 end nesting_level += 1 @@ -583,7 +605,7 @@ module Prism nesting_level -= 1 # When we encountered the matching heredoc end, we can exit break if nesting_level == -1 - elsif next_token.type == :STRING_CONTENT && nesting_level == 0 + elsif next_token.type == :STRING_CONTENT && nesting_level == 0 && first_token_on_line common_whitespace = 0 next_token.value[/^\s*/].each_char do |char| if char == "\t" @@ -672,50 +694,107 @@ module Prism while (skipped = scanner.skip_until(/\\/)) # Append what was just skipped over, excluding the found backslash. 
result.append_as_bytes(string.byteslice(scanner.pos - skipped, skipped - 1)) - - # Simple single-character escape sequences like \n - if (replacement = ESCAPES[scanner.peek(1)]) - result.append_as_bytes(replacement) - scanner.pos += 1 - elsif (octal = scanner.check(/[0-7]{1,3}/)) - # \nnn - result.append_as_bytes(octal.to_i(8).chr) - scanner.pos += octal.bytesize - elsif (hex = scanner.check(/x([0-9a-fA-F]{1,2})/)) - # \xnn - result.append_as_bytes(hex[1..].to_i(16).chr) - scanner.pos += hex.bytesize - elsif (unicode = scanner.check(/u([0-9a-fA-F]{4})/)) - # \unnnn - result.append_as_bytes(unicode[1..].hex.chr(Encoding::UTF_8)) - scanner.pos += unicode.bytesize - elsif scanner.peek(3) == "u{}" - # https://github.com/whitequark/parser/issues/856 - scanner.pos += 3 - elsif (unicode_parts = scanner.check(/u{.*}/)) - # \u{nnnn ...} - unicode_parts[2..-2].split.each do |unicode| - result.append_as_bytes(unicode.hex.chr(Encoding::UTF_8)) - end - scanner.pos += unicode_parts.bytesize - end + escape_read(result, scanner, false, false) end - # Add remainging chars + # Add remaining chars result.append_as_bytes(string.byteslice(scanner.pos..)) - result.force_encoding(source_buffer.source.encoding) - - result else delimiters = Regexp.escape("#{delimiter}#{DELIMITER_SYMETRY[delimiter]}") string.gsub(/\\([\\#{delimiters}])/, '\1') end end + # Certain strings are merged into a single string token. + def simplify_string?(value, quote) + case quote + when "'" + # Only simplify 'foo' + !value.include?("\n") + when '"' + # Simplify when every line ends with a line continuation, or it is the last line + value.lines.all? do |line| + !line.end_with?("\n") || line[/(\\*)$/, 1]&.length&.odd? + end + else + # %q and similar are never simplified + false + end + end + + # Escape a byte value, given the control and meta flags. 
+ def escape_build(value, control, meta) + value &= 0x9f if control + value |= 0x80 if meta + value + end + + # Read an escape out of the string scanner, given the control and meta + # flags, and push the unescaped value into the result. + def escape_read(result, scanner, control, meta) + if scanner.skip("\n") + # Line continuation + elsif (value = ESCAPES[scanner.peek(1)]) + # Simple single-character escape sequences like \n + result.append_as_bytes(value) + scanner.pos += 1 + elsif (value = scanner.scan(/[0-7]{1,3}/)) + # \nnn + result.append_as_bytes(escape_build(value.to_i(8), control, meta)) + elsif (value = scanner.scan(/x[0-9a-fA-F]{1,2}/)) + # \xnn + result.append_as_bytes(escape_build(value[1..].to_i(16), control, meta)) + elsif (value = scanner.scan(/u[0-9a-fA-F]{4}/)) + # \unnnn + result.append_as_bytes(value[1..].hex.chr(Encoding::UTF_8)) + elsif scanner.skip("u{}") + # https://github.com/whitequark/parser/issues/856 + elsif (value = scanner.scan(/u{.*?}/)) + # \u{nnnn ...} + value[2..-2].split.each do |unicode| + result.append_as_bytes(unicode.hex.chr(Encoding::UTF_8)) + end + elsif (value = scanner.scan(/c\\?(?=[[:print:]])|C-\\?(?=[[:print:]])/)) + # \cx or \C-x where x is an ASCII printable character + escape_read(result, scanner, true, meta) + elsif (value = scanner.scan(/M-\\?(?=[[:print:]])/)) + # \M-x where x is an ASCII printable character + escape_read(result, scanner, control, true) + elsif (byte = scanner.get_byte) + # Something else after an escape. + if control && byte == "?" + result.append_as_bytes(escape_build(0x7f, false, meta)) + else + result.append_as_bytes(escape_build(byte.ord, control, meta)) + end + end + end + + # In a percent array, certain whitespace can be preceeded with a backslash, + # causing the following characters to be part of the previous element. + def percent_array_unescape(string) + string.gsub(/(\\)+[ \f\n\r\t\v]/) do |full_match| + full_match.delete_prefix!("\\") if Regexp.last_match[1].length.odd? 
+ full_match + end + end + + # For %-arrays whitespace, the parser gem only considers whitespace before the newline. + def percent_array_leading_whitespace(string) + return 1 if string.start_with?("\n") + + leading_whitespace = 0 + string.each_char do |c| + break if c == "\n" + leading_whitespace += 1 + end + leading_whitespace + end + # Determine if characters preceeded by a backslash should be escaped or not def interpolation?(quote) - quote != "'" && !quote.start_with?("%q", "%w", "%i") + !quote.end_with?("'") && !quote.start_with?("%q", "%w", "%i", "%s") end # Regexp allow interpolation but are handled differently during unescaping |