diff options
author | Kevin Newton <[email protected]> | 2024-03-08 13:49:23 -0500 |
---|---|---|
committer | git <[email protected]> | 2024-03-08 19:18:42 +0000 |
commit | cfcbbbd87c16e9501eebbfb2bd2d714e3c90596c (patch) | |
tree | e2b28786d80b0d3e1805b45a8f10589425a12f3c | |
parent | dd24d88473d432f166894bd6d187a6db2c27bfc5 (diff) |
[ruby/prism] Simplify the ASCII-only check in regexp
https://github.com/ruby/prism/commit/b0935be727
-rw-r--r-- | prism/prism.c | 47 |
1 files changed, 22 insertions, 25 deletions
diff --git a/prism/prism.c b/prism/prism.c index 903c7511de..9eb8dd9a67 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -5950,7 +5950,7 @@ parse_symbol_encoding(const pm_parser_t *parser, const pm_string_t *contents) { } static pm_node_flags_t -parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, const pm_string_t *source, const pm_string_t *contents, pm_node_flags_t flags, char modifier, const pm_encoding_t *modifier_encoding) { +parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, const pm_string_t *source, bool ascii_only, pm_node_flags_t flags, char modifier, const pm_encoding_t *modifier_encoding) { assert ((modifier == 'n' && modifier_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) || (modifier == 'u' && modifier_encoding == PM_ENCODING_UTF_8_ENTRY) || (modifier == 'e' && modifier_encoding == PM_ENCODING_EUC_JP_ENTRY) || @@ -5961,7 +5961,6 @@ parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, con // If an ASCII-only string without character escapes is used with an encoding modifier, then resulting Regexp // has the modifier encoding, unless the ASCII-8BIT modifier is used, in which case the Regexp "downgrades" to // the US-ASCII encoding. - bool ascii_only = pm_ascii_only_p(contents); if (ascii_only) { return modifier == 'n' ? PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING : flags; } @@ -6011,7 +6010,7 @@ parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, con * may be explicitly set with an escape sequence. */ static pm_node_flags_t -parse_and_validate_regular_expression_encoding(pm_parser_t *parser, const pm_string_t *source, const pm_string_t *contents, pm_node_flags_t flags) { +parse_and_validate_regular_expression_encoding(pm_parser_t *parser, const pm_string_t *source, bool ascii_only, pm_node_flags_t flags) { // TODO (nirvdrum 22-Feb-2024): CRuby reports a special Regexp-specific error for invalid Unicode ranges. We either need to scan again or modify the "invalid Unicode escape sequence" message we already report. bool valid_unicode_range = true; if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY && !valid_unicode_range) { @@ -6021,7 +6020,7 @@ parse_and_validate_regular_expression_encoding(pm_parser_t *parser, const pm_str // US-ASCII strings do not admit multi-byte character literals. However, character escape sequences corresponding // to multi-byte characters are allowed. - if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY && parser->explicit_encoding == NULL && !pm_ascii_only_p(contents)) { + if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY && parser->explicit_encoding == NULL && !ascii_only) { // CRuby will continue processing even though a SyntaxError has already been detected. It may result in the // following error message appearing twice. We do the same for compatibility. PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); @@ -6036,25 +6035,25 @@ parse_and_validate_regular_expression_encoding(pm_parser_t *parser, const pm_str */ if (flags & PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) { - return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 'n', PM_ENCODING_ASCII_8BIT_ENTRY); + return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 'n', PM_ENCODING_ASCII_8BIT_ENTRY); } if (flags & PM_REGULAR_EXPRESSION_FLAGS_UTF_8) { - return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 'u', PM_ENCODING_UTF_8_ENTRY); + return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 'u', PM_ENCODING_UTF_8_ENTRY); } if (flags & PM_REGULAR_EXPRESSION_FLAGS_EUC_JP) { - return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 'e', PM_ENCODING_EUC_JP_ENTRY); + return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 'e', PM_ENCODING_EUC_JP_ENTRY); } if (flags & PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) { - return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 's', PM_ENCODING_WINDOWS_31J_ENTRY); + return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 's', PM_ENCODING_WINDOWS_31J_ENTRY); } // At this point no encoding modifiers will be present on the regular expression as they would have already // been processed. Ruby stipulates that all source files must use an ASCII-compatible encoding. Thus, all // regular expressions without an encoding modifier appearing in source are eligible for "downgrading" to US-ASCII. - if (pm_ascii_only_p(contents)) { + if (ascii_only) { return PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING; } @@ -10594,10 +10593,10 @@ parser_lex(pm_parser_t *parser) { parser->current.end++; if (peek(parser) != '\n') { if (lex_mode->as.regexp.terminator != '\r') { - pm_regexp_token_buffer_push_byte(&token_buffer, '\\'); + pm_token_buffer_push_byte(&token_buffer.base, '\\'); } - pm_token_buffer_push_byte(&token_buffer.base, '\r'); pm_regexp_token_buffer_push_byte(&token_buffer, '\r'); + pm_token_buffer_push_byte(&token_buffer.base, '\r'); break; } /* fallthrough */ @@ -10621,7 +10620,7 @@ parser_lex(pm_parser_t *parser) { case 'M': case 'u': case 'x': - escape_read(parser, &token_buffer.base.buffer, &token_buffer.regexp_buffer, PM_ESCAPE_FLAG_REGEXP); + escape_read(parser, &token_buffer.regexp_buffer, &token_buffer.base.buffer, PM_ESCAPE_FLAG_REGEXP); break; default: if (lex_mode->as.regexp.terminator == peeked) { @@ -10632,19 +10631,19 @@ parser_lex(pm_parser_t *parser) { case '$': case ')': case '*': case '+': case '.': case '>': case '?': case ']': case '^': case '|': case '}': - pm_regexp_token_buffer_push_byte(&token_buffer, '\\'); + pm_token_buffer_push_byte(&token_buffer.base, '\\'); break; default: break; } - pm_token_buffer_push_byte(&token_buffer.base, peeked); pm_regexp_token_buffer_push_byte(&token_buffer, peeked); + pm_token_buffer_push_byte(&token_buffer.base, peeked); parser->current.end++; break; } - if (peeked < 0x80) pm_regexp_token_buffer_push_byte(&token_buffer, '\\'); + if (peeked < 0x80) pm_token_buffer_push_byte(&token_buffer.base, '\\'); pm_regexp_token_buffer_push_escaped(&token_buffer, parser); break; } @@ -17159,17 +17158,17 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_token_t content = parser->current; pm_string_t source = parser->current_regular_expression_source; + bool ascii_only = pm_ascii_only_p(&source); pm_string_constant_init(&parser->current_regular_expression_source, "", 0); - parser_lex(parser); // If we hit an end, then we can create a regular expression node // without interpolation, which can be represented more succinctly and // more easily compiled. if (accept1(parser, PM_TOKEN_REGEXP_END)) { - pm_node_t *node = (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &source); - pm_node_flag_set(node, parse_and_validate_regular_expression_encoding(parser, &source, &unescaped, node->flags)); - pm_string_free(&unescaped); + pm_node_t *node = (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped); + pm_node_flag_set(node, parse_and_validate_regular_expression_encoding(parser, &unescaped, ascii_only, node->flags)); + pm_string_free(&source); return node; } @@ -17179,9 +17178,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_token_t opening = not_provided(parser); pm_token_t closing = not_provided(parser); - pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &source); + pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped); pm_interpolated_regular_expression_node_append(interpolated, part); - pm_string_free(&unescaped); + pm_string_free(&source); } else { // If the first part of the body of the regular expression is not a // string content, then we have interpolation and we need to create an @@ -17197,11 +17196,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_token_t opening = not_provided(parser); pm_token_t closing = not_provided(parser); - pm_node_t *node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->current, &closing, &parser->current_regular_expression_source); + pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); pm_node_flag_set(node, parse_unescaped_encoding(parser)); - - pm_string_free(&parser->current_string); - pm_string_constant_init(&parser->current_regular_expression_source, "", 0); + pm_string_free(&parser->current_regular_expression_source); parser_lex(parser); pm_interpolated_regular_expression_node_append(interpolated, node); |