diff options
-rw-r--r-- | lib/prism/translation/parser/lexer.rb | 154 | ||||
-rw-r--r-- | test/prism/ruby/parser_test.rb | 50 |
2 files changed, 154 insertions, 50 deletions
diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb index 61e22159a1..71eafe5a1a 100644 --- a/lib/prism/translation/parser/lexer.rb +++ b/lib/prism/translation/parser/lexer.rb @@ -1,5 +1,7 @@ # frozen_string_literal: true +require "strscan" + module Prism module Translation class Parser @@ -251,6 +253,8 @@ module Prism end when :tCHARACTER value.delete_prefix!("?") + # Character literals behave similarly to double-quoted strings, so the same escaping mechanism can be used. + value = unescape_string(value, "?") when :tCOMMENT if token.type == :EMBDOC_BEGIN start_index = index @@ -432,6 +436,156 @@ module Prism rescue ArgumentError 0r end + + # Wonky heredoc tab/spaces rules. + # https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L10548-L10558 + def calculate_heredoc_whitespace(heredoc_token_index) + next_token_index = heredoc_token_index + nesting_level = 0 + previous_line = -1 + result = Float::MAX + + while (lexed[next_token_index] && next_token = lexed[next_token_index][0]) + next_token_index += 1 + next_next_token = lexed[next_token_index] && lexed[next_token_index][0] + + # String content inside nested heredocs and interpolation is ignored + if next_token.type == :HEREDOC_START || next_token.type == :EMBEXPR_BEGIN + nesting_level += 1 + elsif next_token.type == :HEREDOC_END || next_token.type == :EMBEXPR_END + nesting_level -= 1 + # When we encountered the matching heredoc end, we can exit + break if nesting_level == -1 + elsif next_token.type == :STRING_CONTENT && nesting_level == 0 + common_whitespace = 0 + next_token.value[/^\s*/].each_char do |char| + if char == "\t" + common_whitespace = (common_whitespace / 8 + 1) * 8; + else + common_whitespace += 1 + end + end + + is_first_token_on_line = next_token.location.start_line != previous_line + # Whitespace is significant if followed by interpolation + whitespace_only = common_whitespace == next_token.value.length && next_next_token&.location&.start_line != 
next_token.location.start_line + if is_first_token_on_line && !whitespace_only && common_whitespace < result + result = common_whitespace + previous_line = next_token.location.start_line + end + end + end + result + end + + # Wonky heredoc tab/spaces rules. + # https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L16528-L16545 + def trim_heredoc_whitespace(string, heredoc) + trimmed_whitespace = 0 + trimmed_characters = 0 + while (string[trimmed_characters] == "\t" || string[trimmed_characters] == " ") && trimmed_whitespace < heredoc.common_whitespace + if string[trimmed_characters] == "\t" + trimmed_whitespace = (trimmed_whitespace / 8 + 1) * 8; + break if trimmed_whitespace > heredoc.common_whitespace + else + trimmed_whitespace += 1 + end + trimmed_characters += 1 + end + + string[trimmed_characters..] + end + + # Escape sequences that have special meaning and should appear unescaped in the resulting string. + ESCAPES = { + "a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f", + "n" => "\n", "r" => "\r", "s" => "\s", "t" => "\t", + "v" => "\v", "\\" => "\\" + }.freeze + private_constant :ESCAPES + + # When one of these delimiters is encountered, then the other + # one is allowed to be escaped as well. + DELIMITER_SYMETRY = { "[" => "]", "(" => ")", "{" => "}", "<" => ">" }.freeze + private_constant :DELIMITER_SYMETRY + + # Apply Ruby string escaping rules + def unescape_string(string, quote) + # In single-quoted heredocs, everything is taken literally. + return string if quote == "<<'" + + # TODO: Implement regexp escaping + return string if quote == "/" || quote.start_with?("%r") + + # OPTIMIZATION: Assume that few strings need escaping to speed up the common case. + return string unless string.include?("\\") + + if interpolation?(quote) + # Appending individual escape sequences may force the string out of its intended + # encoding. Start out with binary and force it back later. 
+ result = "".b + + scanner = StringScanner.new(string) + while (skipped = scanner.skip_until(/\\/)) + # Append what was just skipped over, excluding the found backslash. + result << string.byteslice(scanner.pos - skipped, skipped - 1) + + # Simple single-character escape sequences like \n + if (replacement = ESCAPES[scanner.peek(1)]) + result << replacement + scanner.pos += 1 + elsif (octal = scanner.check(/[0-7]{1,3}/)) + # \nnn (NOTE(review): up to three octal digits can exceed 255, where Integer#chr raises RangeError; confirm whether e.g. \400 must be handled) + # NOTE: When Ruby 3.4 is required, this can become result.append_as_bytes(chr) + result << octal.to_i(8).chr.b + scanner.pos += octal.bytesize + elsif (hex = scanner.check(/x([0-9a-fA-F]{1,2})/)) + # \xnn + result << hex[1..].to_i(16).chr.b + scanner.pos += hex.bytesize + elsif (unicode = scanner.check(/u([0-9a-fA-F]{4})/)) + # \unnnn + result << unicode[1..].hex.chr(Encoding::UTF_8).b + scanner.pos += unicode.bytesize + elsif scanner.peek(3) == "u{}" + # https://github.com/whitequark/parser/issues/856 + scanner.pos += 3 + elsif (unicode_parts = scanner.check(/u{.*}/)) + # \u{nnnn ...} (NOTE(review): the greedy .* extends to the last } left on the line, so a string holding several \u{...} escapes looks like it is consumed as one; confirm) + unicode_parts[2..-2].split.each do |unicode| + result << unicode.hex.chr(Encoding::UTF_8).b + end + scanner.pos += unicode_parts.bytesize + end + end + + # Add remaining chars + result << string.byteslice(scanner.pos..) + + result.force_encoding(source_buffer.source.encoding) + + result + else + if quote == "'" + delimiter = "'" + else + delimiter = quote[2] + end + + delimiters = Regexp.escape("#{delimiter}#{DELIMITER_SYMETRY[delimiter]}") + string.gsub(/\\([\\#{delimiters}])/, '\1') + end + end + + # Determine if characters preceded by a backslash should be escaped or not + def interpolation?(quote) + quote != "'" && !quote.start_with?("%q", "%w", "%i") + end + + # Determine if the string is part of a %-style array. 
+ def percent_array?(quote) + quote.start_with?("%w", "%W", "%i", "%I") + end end end end diff --git a/test/prism/ruby/parser_test.rb b/test/prism/ruby/parser_test.rb index e972142672..4ba38bd0c0 100644 --- a/test/prism/ruby/parser_test.rb +++ b/test/prism/ruby/parser_test.rb @@ -81,78 +81,28 @@ module Prism # These files are failing to translate their lexer output into the lexer # output expected by the parser gem, so we'll skip them for now. skip_tokens = [ - "comments.txt", "dash_heredocs.txt", - "dos_endings.txt", "embdoc_no_newline_at_end.txt", - "heredoc_with_comment.txt", "heredocs_with_ignored_newlines.txt", - "indented_file_end.txt", "methods.txt", "strings.txt", "tilde_heredocs.txt", - "xstring_with_backslash.txt", "seattlerb/backticks_interpolation_line.txt", "seattlerb/bug169.txt", "seattlerb/case_in.txt", - "seattlerb/class_comments.txt", "seattlerb/difficult4__leading_dots2.txt", "seattlerb/difficult6__7.txt", "seattlerb/difficult6__8.txt", - "seattlerb/dsym_esc_to_sym.txt", - "seattlerb/heredoc__backslash_dos_format.txt", - "seattlerb/heredoc_backslash_nl.txt", - "seattlerb/heredoc_comma_arg.txt", - "seattlerb/heredoc_squiggly_blank_line_plus_interpolation.txt", - "seattlerb/heredoc_squiggly_blank_lines.txt", - "seattlerb/heredoc_squiggly_interp.txt", - "seattlerb/heredoc_squiggly_tabs_extra.txt", - "seattlerb/heredoc_squiggly_tabs.txt", - "seattlerb/heredoc_squiggly_visually_blank_lines.txt", - "seattlerb/heredoc_squiggly.txt", "seattlerb/heredoc_unicode.txt", - "seattlerb/heredoc_with_carriage_return_escapes_windows.txt", - "seattlerb/heredoc_with_carriage_return_escapes.txt", - "seattlerb/heredoc_with_interpolation_and_carriage_return_escapes_windows.txt", - "seattlerb/heredoc_with_interpolation_and_carriage_return_escapes.txt", - "seattlerb/interpolated_symbol_array_line_breaks.txt", - "seattlerb/interpolated_word_array_line_breaks.txt", - "seattlerb/label_vs_string.txt", - "seattlerb/module_comments.txt", - 
"seattlerb/non_interpolated_symbol_array_line_breaks.txt", - "seattlerb/non_interpolated_word_array_line_breaks.txt", - "seattlerb/parse_line_block_inline_comment_leading_newlines.txt", - "seattlerb/parse_line_block_inline_comment.txt", - "seattlerb/parse_line_block_inline_multiline_comment.txt", - "seattlerb/parse_line_dstr_escaped_newline.txt", "seattlerb/parse_line_heredoc.txt", - "seattlerb/parse_line_multiline_str_literal_n.txt", - "seattlerb/parse_line_str_with_newline_escape.txt", "seattlerb/pct_w_heredoc_interp_nested.txt", - "seattlerb/qsymbols_empty_space.txt", - "seattlerb/qw_escape_term.txt", - "seattlerb/qWords_space.txt", - "seattlerb/read_escape_unicode_curlies.txt", - "seattlerb/read_escape_unicode_h4.txt", "seattlerb/required_kwarg_no_value.txt", "seattlerb/slashy_newlines_within_string.txt", - "seattlerb/str_double_escaped_newline.txt", - "seattlerb/str_double_newline.txt", - "seattlerb/str_evstr_escape.txt", - "seattlerb/str_newline_hash_line_number.txt", - "seattlerb/str_single_newline.txt", - "seattlerb/symbols_empty_space.txt", "seattlerb/TestRubyParserShared.txt", "unparser/corpus/literal/assignment.txt", - "unparser/corpus/literal/dstr.txt", - "unparser/corpus/semantic/opasgn.txt", "whitequark/args.txt", "whitequark/beginless_erange_after_newline.txt", "whitequark/beginless_irange_after_newline.txt", - "whitequark/bug_ascii_8bit_in_literal.txt", - "whitequark/bug_def_no_paren_eql_begin.txt", - "whitequark/dedenting_heredoc.txt", - "whitequark/dedenting_non_interpolating_heredoc_line_continuation.txt", "whitequark/forward_arg_with_open_args.txt", "whitequark/kwarg_no_paren.txt", "whitequark/lbrace_arg_after_command_args.txt", |