summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/prism/translation/parser/compiler.rb2
-rw-r--r--lib/prism/translation/parser/lexer.rb35
-rw-r--r--test/prism/ruby/parser_test.rb4
3 files changed, 29 insertions, 12 deletions
diff --git a/lib/prism/translation/parser/compiler.rb b/lib/prism/translation/parser/compiler.rb
index 26c5c174f4..a7e29d9dfc 100644
--- a/lib/prism/translation/parser/compiler.rb
+++ b/lib/prism/translation/parser/compiler.rb
@@ -1507,7 +1507,7 @@ module Prism
elsif node.content.include?("\n")
string_nodes_from_line_continuations(node.unescaped, node.content, node.content_loc.start_offset, node.opening)
else
- [builder.string_internal(token(node.content_loc))]
+ [builder.string_internal([node.unescaped, srange(node.content_loc)])]
end
builder.regexp_compose(
diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb
index a54d355652..1fa2723f03 100644
--- a/lib/prism/translation/parser/lexer.rb
+++ b/lib/prism/translation/parser/lexer.rb
@@ -633,18 +633,34 @@ module Prism
DELIMITER_SYMETRY = { "[" => "]", "(" => ")", "{" => "}", "<" => ">" }.freeze
private_constant :DELIMITER_SYMETRY
+
+ # https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/lexer-strings.rl#L14
+ REGEXP_META_CHARACTERS = ["\\", "$", "(", ")", "*", "+", ".", "<", ">", "?", "[", "]", "^", "{", "|", "}"]
+ private_constant :REGEXP_META_CHARACTERS
+
# Apply Ruby string escaping rules
def unescape_string(string, quote)
# In single-quoted heredocs, everything is taken literally.
return string if quote == "<<'"
- # TODO: Implement regexp escaping
- return string if quote == "/" || quote.start_with?("%r")
-
# OPTIMIZATION: Assume that few strings need escaping to speed up the common case.
return string unless string.include?("\\")
- if interpolation?(quote)
+ # Enclosing character for the string. `"` for `"foo"`, `{` for `%w{foo}`, etc.
+ delimiter = quote[-1]
+
+ if regexp?(quote)
+ # Should be escaped handled to single-quoted heredocs. The only character that is
+ # allowed to be escaped is the delimiter, except when that also has special meaning
+ # in the regexp. Since all the symetry delimiters have special meaning, they don't need
+ # to be considered separately.
+ if REGEXP_META_CHARACTERS.include?(delimiter)
+ string
+ else
+ # There can never be an even amount of backslashes. It would be a syntax error.
+ string.gsub(/\\(#{Regexp.escape(delimiter)})/, '\1')
+ end
+ elsif interpolation?(quote)
# Appending individual escape sequences may force the string out of its intended
# encoding. Start out with binary and force it back later.
result = "".b
@@ -690,12 +706,6 @@ module Prism
result
else
- if quote == "'"
- delimiter = "'"
- else
- delimiter = quote[2]
- end
-
delimiters = Regexp.escape("#{delimiter}#{DELIMITER_SYMETRY[delimiter]}")
string.gsub(/\\([\\#{delimiters}])/, '\1')
end
@@ -706,6 +716,11 @@ module Prism
quote != "'" && !quote.start_with?("%q", "%w", "%i")
end
+ # Regexp allow interpolation but are handled differently during unescaping
+ def regexp?(quote)
+ quote == "/" || quote.start_with?("%r")
+ end
+
# Determine if the string is part of a %-style array.
def percent_array?(quote)
quote.start_with?("%w", "%W", "%i", "%I")
diff --git a/test/prism/ruby/parser_test.rb b/test/prism/ruby/parser_test.rb
index 8d791da369..fae8ec8dec 100644
--- a/test/prism/ruby/parser_test.rb
+++ b/test/prism/ruby/parser_test.rb
@@ -69,13 +69,15 @@ module Prism
# https://github.com/whitequark/parser/issues/950
"whitequark/dedenting_interpolating_heredoc_fake_line_continuation.txt",
+
+ # Contains an escaped multibyte character. This is supposed to drop to backslash
+ "seattlerb/regexp_escape_extended.txt",
]
# These files are either failing to parse or failing to translate, so we'll
# skip them for now.
skip_all = skip_incorrect | [
"unescaping.txt",
- "seattlerb/bug190.txt",
"seattlerb/heredoc_with_extra_carriage_returns_windows.txt",
"seattlerb/heredoc_with_only_carriage_returns_windows.txt",
"seattlerb/heredoc_with_only_carriage_returns.txt",