summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Kevin Newton <[email protected]> 2024-06-04 12:05:48 -0400
committer Kevin Newton <[email protected]> 2024-06-05 14:40:03 -0400
commit ad438623e8f4e90b7d6567c6c552be89d080dcca (patch)
tree 49919529999290e3e56b47afb37c07b3d6bfeeae
parent aa61d4237dad54e01c00f8cef4ba7f7cdffb8a80 (diff)
[ruby/prism] Switch regexp parsing to use a callback from named capture groups
https://github.com/ruby/prism/commit/29d80e486e
-rw-r--r-- prism/prism.c 148
-rw-r--r-- prism/regexp.c 22
-rw-r--r-- prism/regexp.h 13
3 files changed, 105 insertions, 78 deletions
diff --git a/prism/prism.c b/prism/prism.c
index 12bb55c490..5e37a39952 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -20003,89 +20003,107 @@ parse_call_operator_write(pm_parser_t *parser, pm_call_node_t *call_node, const
}
/**
- * Potentially change a =~ with a regular expression with named captures into a
- * match write node.
+ * This struct is used to pass information between the regular expression parser
+ * and the named capture callback.
*/
-static pm_node_t *
-parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *content, pm_call_node_t *call) {
- pm_string_list_t named_captures = { 0 };
- pm_node_t *result;
-
- if (pm_regexp_named_capture_group_names(pm_string_source(content), pm_string_length(content), &named_captures, parser->encoding_changed, parser->encoding) && (named_captures.length > 0)) {
- // Since we should not create a MatchWriteNode when all capture names
- // are invalid, creating a MatchWriteNode is delaid here.
- pm_match_write_node_t *match = NULL;
- pm_constant_id_list_t names = { 0 };
+typedef struct {
+ pm_parser_t *parser;
+ const pm_string_t *content;
+ pm_call_node_t *call;
+ pm_match_write_node_t *match;
+ pm_constant_id_list_t names;
+} parse_regular_expression_named_capture_data_t;
- for (size_t index = 0; index < named_captures.length; index++) {
- pm_string_t *string = &named_captures.strings[index];
+/**
+ * This callback is called when the regular expression parser encounters a named
+ * capture group.
+ */
+void
+parse_regular_expression_named_capture(const pm_string_t *capture, void *data) {
+ parse_regular_expression_named_capture_data_t *callback_data = (parse_regular_expression_named_capture_data_t *) data;
- const uint8_t *source = pm_string_source(string);
- size_t length = pm_string_length(string);
+ pm_parser_t *parser = callback_data->parser;
+ const pm_string_t *content = callback_data->content;
+ pm_call_node_t *call = callback_data->call;
+ pm_constant_id_list_t *names = &callback_data->names;
- pm_location_t location;
- pm_constant_id_t name;
+ const uint8_t *source = pm_string_source(capture);
+ size_t length = pm_string_length(capture);
- // If the name of the capture group isn't a valid identifier, we do
- // not add it to the local table.
- if (!pm_slice_is_valid_local(parser, source, source + length)) continue;
+ pm_location_t location;
+ pm_constant_id_t name;
- if (content->type == PM_STRING_SHARED) {
- // If the unescaped string is a slice of the source, then we can
- // copy the names directly. The pointers will line up.
- location = (pm_location_t) { .start = source, .end = source + length };
- name = pm_parser_constant_id_location(parser, location.start, location.end);
- } else {
- // Otherwise, the name is a slice of the malloc-ed owned string,
- // in which case we need to copy it out into a new string.
- location = call->receiver->location;
+ // If the name of the capture group isn't a valid identifier, we do
+ // not add it to the local table.
+ if (!pm_slice_is_valid_local(parser, source, source + length)) return;
- void *memory = xmalloc(length);
- if (memory == NULL) abort();
-
- memcpy(memory, source, length);
- name = pm_parser_constant_id_owned(parser, (uint8_t *) memory, length);
- }
+ if (content->type == PM_STRING_SHARED) {
+ // If the unescaped string is a slice of the source, then we can
+ // copy the names directly. The pointers will line up.
+ location = (pm_location_t) { .start = source, .end = source + length };
+ name = pm_parser_constant_id_location(parser, location.start, location.end);
+ } else {
+ // Otherwise, the name is a slice of the malloc-ed owned string,
+ // in which case we need to copy it out into a new string.
+ location = call->receiver->location;
- if (name != 0) {
- // We dont want to create duplicate targets if the capture name
- // is duplicated.
- if (pm_constant_id_list_includes(&names, name)) continue;
- pm_constant_id_list_append(&names, name);
+ void *memory = xmalloc(length);
+ if (memory == NULL) abort();
- int depth;
- if ((depth = pm_parser_local_depth_constant_id(parser, name)) == -1) {
- // If the identifier is not already a local, then we'll add
- // it to the local table unless it's a keyword.
- if (pm_local_is_keyword((const char *) source, length)) continue;
+ memcpy(memory, source, length);
+ name = pm_parser_constant_id_owned(parser, (uint8_t *) memory, length);
+ }
- pm_parser_local_add(parser, name, location.start, location.end, 0);
- }
+ // Record this name if it is valid and not duplicated. A keyword that is
+ // not already a local is still recorded, but gets no capture target.
+ if (name != 0 && !pm_constant_id_list_includes(names, name)) {
+ pm_constant_id_list_append(names, name);
- // Here we lazily create the MatchWriteNode since we know we're
- // about to add a target.
- if (match == NULL) match = pm_match_write_node_create(parser, call);
+ int depth;
+ if ((depth = pm_parser_local_depth_constant_id(parser, name)) == -1) {
+ // If the name is not already a local but it is a keyword, then we
+ // do not want to add a capture for this.
+ if (pm_local_is_keyword((const char *) source, length)) return;
- // Next, create the local variable target and add it to the
- // list of targets for the match.
- pm_node_t *target = (pm_node_t *) pm_local_variable_target_node_create(parser, &location, name, depth == -1 ? 0 : (uint32_t) depth);
- pm_node_list_append(&match->targets, target);
- }
+ // If the identifier is not already a local, then we will add it to
+ // the local table.
+ pm_parser_local_add(parser, name, location.start, location.end, 0);
}
- if (match != NULL) {
- result = (pm_node_t *) match;
- } else {
- result = (pm_node_t *) call;
+ // Here we lazily create the MatchWriteNode since we know we're
+ // about to add a target.
+ if (callback_data->match == NULL) {
+ callback_data->match = pm_match_write_node_create(parser, call);
}
- pm_constant_id_list_free(&names);
- } else {
- result = (pm_node_t *) call;
+ // Next, create the local variable target and add it to the list of
+ // targets for the match.
+ pm_node_t *target = (pm_node_t *) pm_local_variable_target_node_create(parser, &location, name, depth == -1 ? 0 : (uint32_t) depth);
+ pm_node_list_append(&callback_data->match->targets, target);
}
+}
- pm_string_list_free(&named_captures);
- return result;
+/**
+ * Potentially change a =~ with a regular expression with named captures into a
+ * match write node.
+ */
+static pm_node_t *
+parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *content, pm_call_node_t *call) {
+ parse_regular_expression_named_capture_data_t callback_data = {
+ .parser = parser,
+ .content = content,
+ .call = call,
+ .names = { 0 }
+ };
+
+ pm_regexp_parse(pm_string_source(content), pm_string_length(content), parser->encoding_changed, parser->encoding, parse_regular_expression_named_capture, &callback_data);
+ pm_constant_id_list_free(&callback_data.names);
+
+ if (callback_data.match != NULL) {
+ return (pm_node_t *) callback_data.match;
+ } else {
+ return (pm_node_t *) call;
+ }
}
static inline pm_node_t *
diff --git a/prism/regexp.c b/prism/regexp.c
index 6e0fdd295c..15bb7c2611 100644
--- a/prism/regexp.c
+++ b/prism/regexp.c
@@ -13,28 +13,32 @@ typedef struct {
/** A pointer to the end of the source that we are parsing. */
const uint8_t *end;
- /** A list of named captures that we've found. */
- pm_string_list_t *named_captures;
-
/** Whether the encoding has changed from the default. */
bool encoding_changed;
/** The encoding of the source. */
const pm_encoding_t *encoding;
+
+ /** The callback to call when a named capture group is found. */
+ pm_regexp_name_callback_t name_callback;
+
+ /** The data to pass to the name callback. */
+ void *name_data;
} pm_regexp_parser_t;
/**
* This initializes a new parser with the given source.
*/
static void
-pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_string_list_t *named_captures, bool encoding_changed, const pm_encoding_t *encoding) {
+pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, bool encoding_changed, const pm_encoding_t *encoding, pm_regexp_name_callback_t name_callback, void *name_data) {
*parser = (pm_regexp_parser_t) {
.start = start,
.cursor = start,
.end = end,
- .named_captures = named_captures,
.encoding_changed = encoding_changed,
- .encoding = encoding
+ .encoding = encoding,
+ .name_callback = name_callback,
+ .name_data = name_data
};
}
@@ -45,7 +49,7 @@ static void
pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
pm_string_t string;
pm_string_shared_init(&string, start, end);
- pm_string_list_append(parser->named_captures, &string);
+ parser->name_callback(&string, parser->name_data);
pm_string_free(&string);
}
@@ -646,8 +650,8 @@ pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
* groups.
*/
PRISM_EXPORTED_FUNCTION bool
-pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, const pm_encoding_t *encoding) {
+pm_regexp_parse(const uint8_t *source, size_t size, bool encoding_changed, const pm_encoding_t *encoding, pm_regexp_name_callback_t name_callback, void *name_data) {
pm_regexp_parser_t parser;
- pm_regexp_parser_init(&parser, source, source + size, named_captures, encoding_changed, encoding);
+ pm_regexp_parser_init(&parser, source, source + size, encoding_changed, encoding, name_callback, name_data);
return pm_regexp_parse_pattern(&parser);
}
diff --git a/prism/regexp.h b/prism/regexp.h
index c5ceab11f9..ca5185d2fd 100644
--- a/prism/regexp.h
+++ b/prism/regexp.h
@@ -18,16 +18,21 @@
#include <string.h>
/**
- * Parse a regular expression and extract the names of all of the named capture
- * groups.
+ * This callback is called when a named capture group is found.
+ */
+typedef void (*pm_regexp_name_callback_t)(const pm_string_t *name, void *data);
+
+/**
+ * Parse a regular expression.
*
* @param source The source code to parse.
* @param size The size of the source code.
- * @param named_captures The list to add the names of the named capture groups.
* @param encoding_changed Whether or not the encoding changed from the default.
* @param encoding The encoding of the source code.
+ * @param name_callback The callback to call when a named capture group is found.
+ * @param name_data The data to pass to the name callback.
* @return Whether or not the parsing was successful.
*/
-PRISM_EXPORTED_FUNCTION bool pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, const pm_encoding_t *encoding);
+PRISM_EXPORTED_FUNCTION bool pm_regexp_parse(const uint8_t *source, size_t size, bool encoding_changed, const pm_encoding_t *encoding, pm_regexp_name_callback_t name_callback, void *name_data);
#endif