summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/prism/translation/parser/lexer.rb154
-rw-r--r--test/prism/ruby/parser_test.rb50
2 files changed, 154 insertions, 50 deletions
diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb
index 61e22159a1..71eafe5a1a 100644
--- a/lib/prism/translation/parser/lexer.rb
+++ b/lib/prism/translation/parser/lexer.rb
@@ -1,5 +1,7 @@
# frozen_string_literal: true
+require "strscan"
+
module Prism
module Translation
class Parser
@@ -251,6 +253,8 @@ module Prism
end
when :tCHARACTER
value.delete_prefix!("?")
+ # Character literals behave similarly to double-quoted strings, so we can use the same escaping mechanism.
+ value = unescape_string(value, "?")
when :tCOMMENT
if token.type == :EMBDOC_BEGIN
start_index = index
@@ -432,6 +436,156 @@ module Prism
rescue ArgumentError
0r
end
+
# Computes the common leading whitespace width (in columns) of a heredoc
# body, following prism's wonky heredoc tab/spaces rules:
# https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L10548-L10558
#
# Scanning starts at +heredoc_token_index+ in +lexed+ (presumably the first
# token of the heredoc body; confirm against the caller) and stops at the
# matching HEREDOC_END or when the tokens run out.
#
# Only STRING_CONTENT tokens outside nested heredocs/interpolation are
# considered, and only when they are the first content token on their line
# and not made up solely of whitespace. A tab advances the width to the next
# multiple of 8; every other whitespace character counts as one column.
#
# Returns the smallest width found, or Float::MAX when no token qualifies.
def calculate_heredoc_whitespace(heredoc_token_index)
  next_token_index = heredoc_token_index
  nesting_level = 0
  previous_line = -1
  result = Float::MAX

  while (entry = lexed[next_token_index]) && (next_token = entry[0])
    next_token_index += 1
    next_next_token = lexed[next_token_index] && lexed[next_token_index][0]

    case next_token.type
    when :HEREDOC_START, :EMBEXPR_BEGIN
      # String content inside nested heredocs and interpolation is ignored
      nesting_level += 1
    when :HEREDOC_END, :EMBEXPR_END
      nesting_level -= 1
      # When we encountered the matching heredoc end, we can exit
      break if nesting_level == -1
    when :STRING_CONTENT
      next unless nesting_level == 0

      common_whitespace = 0
      next_token.value[/^\s*/].each_char do |char|
        if char == "\t"
          common_whitespace = (common_whitespace / 8 + 1) * 8
        else
          common_whitespace += 1
        end
      end

      current_line = next_token.location.start_line
      is_first_token_on_line = current_line != previous_line
      # Remember the line of every content token, not only of those that
      # lower the minimum. Otherwise content following an interpolation on
      # the same line would be mistaken for the start of a new line.
      previous_line = current_line

      # Whitespace is significant if followed by interpolation
      whitespace_only = common_whitespace == next_token.value.length &&
        next_next_token&.location&.start_line != current_line

      if is_first_token_on_line && !whitespace_only && common_whitespace < result
        result = common_whitespace
      end
    end
  end

  result
end
+
# Strips up to +heredoc.common_whitespace+ columns of leading tabs/spaces
# from +string+ and returns the remainder, following prism's wonky heredoc
# tab/spaces rules:
# https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L16528-L16545
#
# A space is one column wide; a tab advances to the next multiple of 8. A tab
# that would overshoot the common whitespace is left in place.
def trim_heredoc_whitespace(string, heredoc)
  width = 0
  offset = 0

  loop do
    break unless width < heredoc.common_whitespace

    case string[offset]
    when "\t"
      width = (width / 8 + 1) * 8
      # Removing this tab would strip more than the common indentation.
      break if width > heredoc.common_whitespace
    when " "
      width += 1
    else
      break
    end

    offset += 1
  end

  string[offset..]
end
+
+ # Escape sequences that have special meaning and should appear unescaped in the resulting string.
+ ESCAPES = {
+ "a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f",
+ "n" => "\n", "r" => "\r", "s" => "\s", "t" => "\t",
+ "v" => "\v", "\\" => "\\"
+ }.freeze
+ private_constant :ESCAPES
+
+ # When one of these delimiters is encountered, then the other
+ # one is allowed to be escaped as well.
+ DELIMITER_SYMETRY = { "[" => "]", "(" => ")", "{" => "}", "<" => ">" }.freeze
+ private_constant :DELIMITER_SYMETRY
+
# Applies Ruby's string escaping rules to +string+.
#
# +string+ is the raw text of a string-like token and +quote+ is the text
# that opened the literal (e.g. <tt>"</tt>, <tt>'</tt>, <tt>%q(</tt>,
# <tt><<'</tt>, <tt>?</tt>).
#
# * Single-quoted heredocs and regexps are returned untouched.
# * For interpolating literals (see +interpolation?+) the sequences in
#   ESCAPES, octal (\nnn), hex (\xnn) and unicode (\unnnn, \u{nnnn ...})
#   escapes are replaced. Any other escaped character simply loses its
#   backslash.
# * For non-interpolating literals only an escaped backslash or an escaped
#   delimiter (and its counterpart from DELIMITER_SYMETRY) is unescaped.
#
# Returns +string+ itself when nothing had to be done, otherwise a new string.
def unescape_string(string, quote)
  # In single-quoted heredocs, everything is taken literally.
  return string if quote == "<<'"

  # TODO: Implement regexp escaping
  return string if quote == "/" || quote.start_with?("%r")

  # OPTIMIZATION: Assume that few strings need escaping to speed up the common case.
  return string unless string.include?("\\")

  if interpolation?(quote)
    # Appending individual escape sequences may force the string out of its intended
    # encoding. Start out with binary and force it back later.
    result = "".b

    scanner = StringScanner.new(string)
    while (skipped = scanner.skip_until(/\\/))
      # Append what was just skipped over, excluding the found backslash.
      result << string.byteslice(scanner.pos - skipped, skipped - 1)

      # Simple single-character escape sequences like \n
      if (replacement = ESCAPES[scanner.peek(1)])
        result << replacement
        scanner.pos += 1
      elsif (octal = scanner.check(/[0-7]{1,3}/))
        # \nnn
        # Ruby only keeps the low byte of an octal escape ("\777" is "\xFF").
        # Masking also avoids a RangeError from Integer#chr for values > 255.
        # NOTE: When Ruby 3.4 is required, this can become result.append_as_bytes(chr)
        result << (octal.to_i(8) & 0xFF).chr.b
        scanner.pos += octal.bytesize
      elsif (hex = scanner.check(/x([0-9a-fA-F]{1,2})/))
        # \xnn
        result << hex[1..].to_i(16).chr.b
        scanner.pos += hex.bytesize
      elsif (unicode = scanner.check(/u([0-9a-fA-F]{4})/))
        # \unnnn
        result << unicode[1..].hex.chr(Encoding::UTF_8).b
        scanner.pos += unicode.bytesize
      elsif scanner.peek(3) == "u{}"
        # https://github.com/whitequark/parser/issues/856
        scanner.pos += 3
      elsif (unicode_parts = scanner.check(/u\{.*?\}/))
        # \u{nnnn ...}
        # Match lazily so that only this escape is consumed, not everything
        # up to the last closing brace in the string.
        unicode_parts[2..-2].split.each do |codepoint|
          result << codepoint.hex.chr(Encoding::UTF_8).b
        end
        scanner.pos += unicode_parts.bytesize
      end
    end

    # Add remaining chars
    result << string.byteslice(scanner.pos..)

    result.force_encoding(source_buffer.source.encoding)
  else
    delimiter = quote == "'" ? "'" : quote[2]

    delimiters = Regexp.escape("#{delimiter}#{DELIMITER_SYMETRY[delimiter]}")
    string.gsub(/\\([\\#{delimiters}])/, '\1')
  end
end
+
# Determine if characters preceded by a backslash should be escaped or not.
#
# Returns false for single-quoted strings and for literals opened with
# %q, %w or %i; true for every other +quote+.
def interpolation?(quote)
  return false if quote == "'"

  !quote.start_with?("%q", "%w", "%i")
end
+
# Determine if the string is part of a %-style array, i.e. whether +quote+
# begins with %w, %W, %i or %I.
def percent_array?(quote)
  %w[%w %W %i %I].any? { |prefix| quote.start_with?(prefix) }
end
end
end
end
diff --git a/test/prism/ruby/parser_test.rb b/test/prism/ruby/parser_test.rb
index e972142672..4ba38bd0c0 100644
--- a/test/prism/ruby/parser_test.rb
+++ b/test/prism/ruby/parser_test.rb
@@ -81,78 +81,28 @@ module Prism
# These files are failing to translate their lexer output into the lexer
# output expected by the parser gem, so we'll skip them for now.
skip_tokens = [
- "comments.txt",
"dash_heredocs.txt",
- "dos_endings.txt",
"embdoc_no_newline_at_end.txt",
- "heredoc_with_comment.txt",
"heredocs_with_ignored_newlines.txt",
- "indented_file_end.txt",
"methods.txt",
"strings.txt",
"tilde_heredocs.txt",
- "xstring_with_backslash.txt",
"seattlerb/backticks_interpolation_line.txt",
"seattlerb/bug169.txt",
"seattlerb/case_in.txt",
- "seattlerb/class_comments.txt",
"seattlerb/difficult4__leading_dots2.txt",
"seattlerb/difficult6__7.txt",
"seattlerb/difficult6__8.txt",
- "seattlerb/dsym_esc_to_sym.txt",
- "seattlerb/heredoc__backslash_dos_format.txt",
- "seattlerb/heredoc_backslash_nl.txt",
- "seattlerb/heredoc_comma_arg.txt",
- "seattlerb/heredoc_squiggly_blank_line_plus_interpolation.txt",
- "seattlerb/heredoc_squiggly_blank_lines.txt",
- "seattlerb/heredoc_squiggly_interp.txt",
- "seattlerb/heredoc_squiggly_tabs_extra.txt",
- "seattlerb/heredoc_squiggly_tabs.txt",
- "seattlerb/heredoc_squiggly_visually_blank_lines.txt",
- "seattlerb/heredoc_squiggly.txt",
"seattlerb/heredoc_unicode.txt",
- "seattlerb/heredoc_with_carriage_return_escapes_windows.txt",
- "seattlerb/heredoc_with_carriage_return_escapes.txt",
- "seattlerb/heredoc_with_interpolation_and_carriage_return_escapes_windows.txt",
- "seattlerb/heredoc_with_interpolation_and_carriage_return_escapes.txt",
- "seattlerb/interpolated_symbol_array_line_breaks.txt",
- "seattlerb/interpolated_word_array_line_breaks.txt",
- "seattlerb/label_vs_string.txt",
- "seattlerb/module_comments.txt",
- "seattlerb/non_interpolated_symbol_array_line_breaks.txt",
- "seattlerb/non_interpolated_word_array_line_breaks.txt",
- "seattlerb/parse_line_block_inline_comment_leading_newlines.txt",
- "seattlerb/parse_line_block_inline_comment.txt",
- "seattlerb/parse_line_block_inline_multiline_comment.txt",
- "seattlerb/parse_line_dstr_escaped_newline.txt",
"seattlerb/parse_line_heredoc.txt",
- "seattlerb/parse_line_multiline_str_literal_n.txt",
- "seattlerb/parse_line_str_with_newline_escape.txt",
"seattlerb/pct_w_heredoc_interp_nested.txt",
- "seattlerb/qsymbols_empty_space.txt",
- "seattlerb/qw_escape_term.txt",
- "seattlerb/qWords_space.txt",
- "seattlerb/read_escape_unicode_curlies.txt",
- "seattlerb/read_escape_unicode_h4.txt",
"seattlerb/required_kwarg_no_value.txt",
"seattlerb/slashy_newlines_within_string.txt",
- "seattlerb/str_double_escaped_newline.txt",
- "seattlerb/str_double_newline.txt",
- "seattlerb/str_evstr_escape.txt",
- "seattlerb/str_newline_hash_line_number.txt",
- "seattlerb/str_single_newline.txt",
- "seattlerb/symbols_empty_space.txt",
"seattlerb/TestRubyParserShared.txt",
"unparser/corpus/literal/assignment.txt",
- "unparser/corpus/literal/dstr.txt",
- "unparser/corpus/semantic/opasgn.txt",
"whitequark/args.txt",
"whitequark/beginless_erange_after_newline.txt",
"whitequark/beginless_irange_after_newline.txt",
- "whitequark/bug_ascii_8bit_in_literal.txt",
- "whitequark/bug_def_no_paren_eql_begin.txt",
- "whitequark/dedenting_heredoc.txt",
- "whitequark/dedenting_non_interpolating_heredoc_line_continuation.txt",
"whitequark/forward_arg_with_open_args.txt",
"whitequark/kwarg_no_paren.txt",
"whitequark/lbrace_arg_after_command_args.txt",