diff options
author | Kevin Newton <[email protected]> | 2024-06-04 12:05:48 -0400 |
---|---|---|
committer | Kevin Newton <[email protected]> | 2024-06-05 14:40:03 -0400 |
commit | ad438623e8f4e90b7d6567c6c552be89d080dcca (patch) | |
tree | 49919529999290e3e56b47afb37c07b3d6bfeeae | |
parent | aa61d4237dad54e01c00f8cef4ba7f7cdffb8a80 (diff) | |
[ruby/prism] Switch regexp parsing to use a callback from named capture groups
https://github.com/ruby/prism/commit/29d80e486e
-rw-r--r-- | prism/prism.c | 148 |
-rw-r--r-- | prism/regexp.c | 22 |
-rw-r--r-- | prism/regexp.h | 13 |
3 files changed, 105 insertions, 78 deletions
diff --git a/prism/prism.c b/prism/prism.c index 12bb55c490..5e37a39952 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -20003,89 +20003,107 @@ parse_call_operator_write(pm_parser_t *parser, pm_call_node_t *call_node, const } /** - * Potentially change a =~ with a regular expression with named captures into a - * match write node. + * This struct is used to pass information between the regular expression parser + * and the named capture callback. */ -static pm_node_t * -parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *content, pm_call_node_t *call) { - pm_string_list_t named_captures = { 0 }; - pm_node_t *result; - - if (pm_regexp_named_capture_group_names(pm_string_source(content), pm_string_length(content), &named_captures, parser->encoding_changed, parser->encoding) && (named_captures.length > 0)) { - // Since we should not create a MatchWriteNode when all capture names - // are invalid, creating a MatchWriteNode is delaid here. - pm_match_write_node_t *match = NULL; - pm_constant_id_list_t names = { 0 }; +typedef struct { + pm_parser_t *parser; + const pm_string_t *content; + pm_call_node_t *call; + pm_match_write_node_t *match; + pm_constant_id_list_t names; +} parse_regular_expression_named_capture_data_t; - for (size_t index = 0; index < named_captures.length; index++) { - pm_string_t *string = &named_captures.strings[index]; +/** + * This callback is called when the regular expression parser encounters a named + * capture group. 
+ */ +void +parse_regular_expression_named_capture(const pm_string_t *capture, void *data) { + parse_regular_expression_named_capture_data_t *callback_data = (parse_regular_expression_named_capture_data_t *) data; - const uint8_t *source = pm_string_source(string); - size_t length = pm_string_length(string); + pm_parser_t *parser = callback_data->parser; + const pm_string_t *content = callback_data->content; + pm_call_node_t *call = callback_data->call; + pm_constant_id_list_t *names = &callback_data->names; - pm_location_t location; - pm_constant_id_t name; + const uint8_t *source = pm_string_source(capture); + size_t length = pm_string_length(capture); - // If the name of the capture group isn't a valid identifier, we do - // not add it to the local table. - if (!pm_slice_is_valid_local(parser, source, source + length)) continue; + pm_location_t location; + pm_constant_id_t name; - if (content->type == PM_STRING_SHARED) { - // If the unescaped string is a slice of the source, then we can - // copy the names directly. The pointers will line up. - location = (pm_location_t) { .start = source, .end = source + length }; - name = pm_parser_constant_id_location(parser, location.start, location.end); - } else { - // Otherwise, the name is a slice of the malloc-ed owned string, - // in which case we need to copy it out into a new string. - location = call->receiver->location; + // If the name of the capture group isn't a valid identifier, we do + // not add it to the local table. + if (!pm_slice_is_valid_local(parser, source, source + length)) return; - void *memory = xmalloc(length); - if (memory == NULL) abort(); - - memcpy(memory, source, length); - name = pm_parser_constant_id_owned(parser, (uint8_t *) memory, length); - } + if (content->type == PM_STRING_SHARED) { + // If the unescaped string is a slice of the source, then we can + // copy the names directly. The pointers will line up. 
+ location = (pm_location_t) { .start = source, .end = source + length }; + name = pm_parser_constant_id_location(parser, location.start, location.end); + } else { + // Otherwise, the name is a slice of the malloc-ed owned string, + // in which case we need to copy it out into a new string. + location = call->receiver->location; - if (name != 0) { - // We dont want to create duplicate targets if the capture name - // is duplicated. - if (pm_constant_id_list_includes(&names, name)) continue; - pm_constant_id_list_append(&names, name); + void *memory = xmalloc(length); + if (memory == NULL) abort(); - int depth; - if ((depth = pm_parser_local_depth_constant_id(parser, name)) == -1) { - // If the identifier is not already a local, then we'll add - // it to the local table unless it's a keyword. - if (pm_local_is_keyword((const char *) source, length)) continue; + memcpy(memory, source, length); + name = pm_parser_constant_id_owned(parser, (uint8_t *) memory, length); + } - pm_parser_local_add(parser, name, location.start, location.end, 0); - } + // Add this name to the list of constants if it is valid, not duplicated, + // and not a keyword. + if (name != 0 && !pm_constant_id_list_includes(names, name)) { + pm_constant_id_list_append(names, name); - // Here we lazily create the MatchWriteNode since we know we're - // about to add a target. - if (match == NULL) match = pm_match_write_node_create(parser, call); + int depth; + if ((depth = pm_parser_local_depth_constant_id(parser, name)) == -1) { + // If the local is not already a local but it is a keyword, then we + // do not want to add a capture for this. + if (pm_local_is_keyword((const char *) source, length)) return; - // Next, create the local variable target and add it to the - // list of targets for the match. - pm_node_t *target = (pm_node_t *) pm_local_variable_target_node_create(parser, &location, name, depth == -1 ? 
0 : (uint32_t) depth); - pm_node_list_append(&match->targets, target); - } + // If the identifier is not already a local, then we will add it to + // the local table. + pm_parser_local_add(parser, name, location.start, location.end, 0); } - if (match != NULL) { - result = (pm_node_t *) match; - } else { - result = (pm_node_t *) call; + // Here we lazily create the MatchWriteNode since we know we're + // about to add a target. + if (callback_data->match == NULL) { + callback_data->match = pm_match_write_node_create(parser, call); } - pm_constant_id_list_free(&names); - } else { - result = (pm_node_t *) call; + // Next, create the local variable target and add it to the list of + // targets for the match. + pm_node_t *target = (pm_node_t *) pm_local_variable_target_node_create(parser, &location, name, depth == -1 ? 0 : (uint32_t) depth); + pm_node_list_append(&callback_data->match->targets, target); } +} - pm_string_list_free(&named_captures); - return result; +/** + * Potentially change a =~ with a regular expression with named captures into a + * match write node. + */ +static pm_node_t * +parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *content, pm_call_node_t *call) { + parse_regular_expression_named_capture_data_t callback_data = { + .parser = parser, + .content = content, + .call = call, + .names = { 0 } + }; + + pm_regexp_parse(pm_string_source(content), pm_string_length(content), parser->encoding_changed, parser->encoding, parse_regular_expression_named_capture, &callback_data); + pm_constant_id_list_free(&callback_data.names); + + if (callback_data.match != NULL) { + return (pm_node_t *) callback_data.match; + } else { + return (pm_node_t *) call; + } } static inline pm_node_t * diff --git a/prism/regexp.c b/prism/regexp.c index 6e0fdd295c..15bb7c2611 100644 --- a/prism/regexp.c +++ b/prism/regexp.c @@ -13,28 +13,32 @@ typedef struct { /** A pointer to the end of the source that we are parsing. 
*/ const uint8_t *end; - /** A list of named captures that we've found. */ - pm_string_list_t *named_captures; - /** Whether the encoding has changed from the default. */ bool encoding_changed; /** The encoding of the source. */ const pm_encoding_t *encoding; + + /** The callback to call when a named capture group is found. */ + pm_regexp_name_callback_t name_callback; + + /** The data to pass to the name callback. */ + void *name_data; } pm_regexp_parser_t; /** * This initializes a new parser with the given source. */ static void -pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_string_list_t *named_captures, bool encoding_changed, const pm_encoding_t *encoding) { +pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, bool encoding_changed, const pm_encoding_t *encoding, pm_regexp_name_callback_t name_callback, void *name_data) { *parser = (pm_regexp_parser_t) { .start = start, .cursor = start, .end = end, - .named_captures = named_captures, .encoding_changed = encoding_changed, - .encoding = encoding + .encoding = encoding, + .name_callback = name_callback, + .name_data = name_data }; } @@ -45,7 +49,7 @@ static void pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) { pm_string_t string; pm_string_shared_init(&string, start, end); - pm_string_list_append(parser->named_captures, &string); + parser->name_callback(&string, parser->name_data); pm_string_free(&string); } @@ -646,8 +650,8 @@ pm_regexp_parse_pattern(pm_regexp_parser_t *parser) { * groups. 
*/ PRISM_EXPORTED_FUNCTION bool -pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, const pm_encoding_t *encoding) { +pm_regexp_parse(const uint8_t *source, size_t size, bool encoding_changed, const pm_encoding_t *encoding, pm_regexp_name_callback_t name_callback, void *name_data) { pm_regexp_parser_t parser; - pm_regexp_parser_init(&parser, source, source + size, named_captures, encoding_changed, encoding); + pm_regexp_parser_init(&parser, source, source + size, encoding_changed, encoding, name_callback, name_data); return pm_regexp_parse_pattern(&parser); } diff --git a/prism/regexp.h b/prism/regexp.h index c5ceab11f9..ca5185d2fd 100644 --- a/prism/regexp.h +++ b/prism/regexp.h @@ -18,16 +18,21 @@ #include <string.h> /** - * Parse a regular expression and extract the names of all of the named capture - * groups. + * This callback is called when a named capture group is found. + */ +typedef void (*pm_regexp_name_callback_t)(const pm_string_t *name, void *data); + +/** + * Parse a regular expression. * * @param source The source code to parse. * @param size The size of the source code. - * @param named_captures The list to add the names of the named capture groups. * @param encoding_changed Whether or not the encoding changed from the default. * @param encoding The encoding of the source code. + * @param name_callback The callback to call when a named capture group is found. + * @param name_data The data to pass to the name callback. * @return Whether or not the parsing was successful. */ -PRISM_EXPORTED_FUNCTION bool pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, const pm_encoding_t *encoding); +PRISM_EXPORTED_FUNCTION bool pm_regexp_parse(const uint8_t *source, size_t size, bool encoding_changed, const pm_encoding_t *encoding, pm_regexp_name_callback_t name_callback, void *name_data); #endif |