[ruby/net-http] Freeze some constants to improve Ractor compatibility
[ruby.git] / prism / util / pm_strpbrk.c
blob916a4cc3fd3c16803e20ed203c141f9d1848402b
1 #include "prism/util/pm_strpbrk.h"
3 /**
4 * Add an invalid multibyte character error to the parser.
5 */
6 static inline void
7 pm_strpbrk_invalid_multibyte_character(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
8 pm_diagnostic_list_append_format(&parser->error_list, start, end, PM_ERR_INVALID_MULTIBYTE_CHARACTER, *start);
11 /**
12 * Set the explicit encoding for the parser to the current encoding.
14 static inline void
15 pm_strpbrk_explicit_encoding_set(pm_parser_t *parser, const uint8_t *source, size_t width) {
16 if (parser->explicit_encoding != NULL) {
17 if (parser->explicit_encoding == parser->encoding) {
18 // Okay, we already locked to this encoding.
19 } else if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
20 // Not okay, we already found a Unicode escape sequence and this
21 // conflicts.
22 pm_diagnostic_list_append_format(&parser->error_list, source, source + width, PM_ERR_MIXED_ENCODING, parser->encoding->name);
23 } else {
24 // Should not be anything else.
25 assert(false && "unreachable");
29 parser->explicit_encoding = parser->encoding;
32 /**
33 * This is the default path.
35 static inline const uint8_t *
36 pm_strpbrk_utf8(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
37 size_t index = 0;
39 while (index < maximum) {
40 if (strchr((const char *) charset, source[index]) != NULL) {
41 return source + index;
44 if (source[index] < 0x80) {
45 index++;
46 } else {
47 size_t width = pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index));
49 if (width > 0) {
50 index += width;
51 } else if (!validate) {
52 index++;
53 } else {
54 // At this point we know we have an invalid multibyte character.
55 // We'll walk forward as far as we can until we find the next
56 // valid character so that we don't spam the user with a ton of
57 // the same kind of error.
58 const size_t start = index;
60 do {
61 index++;
62 } while (index < maximum && pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
64 pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
69 return NULL;
72 /**
73 * This is the path when the encoding is ASCII-8BIT.
75 static inline const uint8_t *
76 pm_strpbrk_ascii_8bit(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
77 size_t index = 0;
79 while (index < maximum) {
80 if (strchr((const char *) charset, source[index]) != NULL) {
81 return source + index;
84 if (validate && source[index] >= 0x80) pm_strpbrk_explicit_encoding_set(parser, source, 1);
85 index++;
88 return NULL;
91 /**
92 * This is the slow path that does care about the encoding.
94 static inline const uint8_t *
95 pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
96 size_t index = 0;
97 const pm_encoding_t *encoding = parser->encoding;
99 while (index < maximum) {
100 if (strchr((const char *) charset, source[index]) != NULL) {
101 return source + index;
104 if (source[index] < 0x80) {
105 index++;
106 } else {
107 size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
108 if (validate) pm_strpbrk_explicit_encoding_set(parser, source, width);
110 if (width > 0) {
111 index += width;
112 } else if (!validate) {
113 index++;
114 } else {
115 // At this point we know we have an invalid multibyte character.
116 // We'll walk forward as far as we can until we find the next
117 // valid character so that we don't spam the user with a ton of
118 // the same kind of error.
119 const size_t start = index;
121 do {
122 index++;
123 } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
125 pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
130 return NULL;
134 * This is the fast path that does not care about the encoding because we know
135 * the encoding only supports single-byte characters.
137 static inline const uint8_t *
138 pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
139 size_t index = 0;
140 const pm_encoding_t *encoding = parser->encoding;
142 while (index < maximum) {
143 if (strchr((const char *) charset, source[index]) != NULL) {
144 return source + index;
147 if (source[index] < 0x80 || !validate) {
148 index++;
149 } else {
150 size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
151 pm_strpbrk_explicit_encoding_set(parser, source, width);
153 if (width > 0) {
154 index += width;
155 } else {
156 // At this point we know we have an invalid multibyte character.
157 // We'll walk forward as far as we can until we find the next
158 // valid character so that we don't spam the user with a ton of
159 // the same kind of error.
160 const size_t start = index;
162 do {
163 index++;
164 } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
166 pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
171 return NULL;
175 * Here we have rolled our own version of strpbrk. The standard library strpbrk
176 * has undefined behavior when the source string is not null-terminated. We want
177 * to support strings that are not null-terminated because pm_parse does not
178 * have the contract that the string is null-terminated. (This is desirable
179 * because it means the extension can call pm_parse with the result of a call to
180 * mmap).
182 * The standard library strpbrk also does not support passing a maximum length
183 * to search. We want to support this for the reason mentioned above, but we
184 * also don't want it to stop on null bytes. Ruby actually allows null bytes
185 * within strings, comments, regular expressions, etc. So we need to be able to
186 * skip past them.
188 * Finally, we want to support encodings wherein the charset could contain
189 * characters that are trailing bytes of multi-byte characters. For example, in
190 * Shift_JIS, the backslash character can be a trailing byte. In that case we
191 * need to take a slower path and iterate one multi-byte character at a time.
193 const uint8_t *
194 pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate) {
195 if (length <= 0) {
196 return NULL;
197 } else if (!parser->encoding_changed) {
198 return pm_strpbrk_utf8(parser, source, charset, (size_t) length, validate);
199 } else if (parser->encoding == PM_ENCODING_ASCII_8BIT_ENTRY) {
200 return pm_strpbrk_ascii_8bit(parser, source, charset, (size_t) length, validate);
201 } else if (parser->encoding->multibyte) {
202 return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length, validate);
203 } else {
204 return pm_strpbrk_single_byte(parser, source, charset, (size_t) length, validate);