1 #include "prism/util/pm_strpbrk.h"
4 * Add an invalid multibyte character error to the parser.
7 pm_strpbrk_invalid_multibyte_character(pm_parser_t
*parser
, const uint8_t *start
, const uint8_t *end
) {
8 pm_diagnostic_list_append_format(&parser
->error_list
, start
, end
, PM_ERR_INVALID_MULTIBYTE_CHARACTER
, *start
);
12 * Set the explicit encoding for the parser to the current encoding.
15 pm_strpbrk_explicit_encoding_set(pm_parser_t
*parser
, const uint8_t *source
, size_t width
) {
16 if (parser
->explicit_encoding
!= NULL
) {
17 if (parser
->explicit_encoding
== parser
->encoding
) {
18 // Okay, we already locked to this encoding.
19 } else if (parser
->explicit_encoding
== PM_ENCODING_UTF_8_ENTRY
) {
20 // Not okay, we already found a Unicode escape sequence and this
22 pm_diagnostic_list_append_format(&parser
->error_list
, source
, source
+ width
, PM_ERR_MIXED_ENCODING
, parser
->encoding
->name
);
24 // Should not be anything else.
25 assert(false && "unreachable");
29 parser
->explicit_encoding
= parser
->encoding
;
33 * This is the default path.
35 static inline const uint8_t *
36 pm_strpbrk_utf8(pm_parser_t
*parser
, const uint8_t *source
, const uint8_t *charset
, size_t maximum
, bool validate
) {
39 while (index
< maximum
) {
40 if (strchr((const char *) charset
, source
[index
]) != NULL
) {
41 return source
+ index
;
44 if (source
[index
] < 0x80) {
47 size_t width
= pm_encoding_utf_8_char_width(source
+ index
, (ptrdiff_t) (maximum
- index
));
51 } else if (!validate
) {
54 // At this point we know we have an invalid multibyte character.
55 // We'll walk forward as far as we can until we find the next
56 // valid character so that we don't spam the user with a ton of
57 // the same kind of error.
58 const size_t start
= index
;
62 } while (index
< maximum
&& pm_encoding_utf_8_char_width(source
+ index
, (ptrdiff_t) (maximum
- index
)) == 0);
64 pm_strpbrk_invalid_multibyte_character(parser
, source
+ start
, source
+ index
);
73 * This is the path when the encoding is ASCII-8BIT.
75 static inline const uint8_t *
76 pm_strpbrk_ascii_8bit(pm_parser_t
*parser
, const uint8_t *source
, const uint8_t *charset
, size_t maximum
, bool validate
) {
79 while (index
< maximum
) {
80 if (strchr((const char *) charset
, source
[index
]) != NULL
) {
81 return source
+ index
;
84 if (validate
&& source
[index
] >= 0x80) pm_strpbrk_explicit_encoding_set(parser
, source
, 1);
92 * This is the slow path that does care about the encoding.
94 static inline const uint8_t *
95 pm_strpbrk_multi_byte(pm_parser_t
*parser
, const uint8_t *source
, const uint8_t *charset
, size_t maximum
, bool validate
) {
97 const pm_encoding_t
*encoding
= parser
->encoding
;
99 while (index
< maximum
) {
100 if (strchr((const char *) charset
, source
[index
]) != NULL
) {
101 return source
+ index
;
104 if (source
[index
] < 0x80) {
107 size_t width
= encoding
->char_width(source
+ index
, (ptrdiff_t) (maximum
- index
));
108 if (validate
) pm_strpbrk_explicit_encoding_set(parser
, source
, width
);
112 } else if (!validate
) {
115 // At this point we know we have an invalid multibyte character.
116 // We'll walk forward as far as we can until we find the next
117 // valid character so that we don't spam the user with a ton of
118 // the same kind of error.
119 const size_t start
= index
;
123 } while (index
< maximum
&& encoding
->char_width(source
+ index
, (ptrdiff_t) (maximum
- index
)) == 0);
125 pm_strpbrk_invalid_multibyte_character(parser
, source
+ start
, source
+ index
);
134 * This is the fast path that does not care about the encoding because we know
135 * the encoding only supports single-byte characters.
137 static inline const uint8_t *
138 pm_strpbrk_single_byte(pm_parser_t
*parser
, const uint8_t *source
, const uint8_t *charset
, size_t maximum
, bool validate
) {
140 const pm_encoding_t
*encoding
= parser
->encoding
;
142 while (index
< maximum
) {
143 if (strchr((const char *) charset
, source
[index
]) != NULL
) {
144 return source
+ index
;
147 if (source
[index
] < 0x80 || !validate
) {
150 size_t width
= encoding
->char_width(source
+ index
, (ptrdiff_t) (maximum
- index
));
151 pm_strpbrk_explicit_encoding_set(parser
, source
, width
);
156 // At this point we know we have an invalid multibyte character.
157 // We'll walk forward as far as we can until we find the next
158 // valid character so that we don't spam the user with a ton of
159 // the same kind of error.
160 const size_t start
= index
;
164 } while (index
< maximum
&& encoding
->char_width(source
+ index
, (ptrdiff_t) (maximum
- index
)) == 0);
166 pm_strpbrk_invalid_multibyte_character(parser
, source
+ start
, source
+ index
);
175 * Here we have rolled our own version of strpbrk. The standard library strpbrk
176 * has undefined behavior when the source string is not null-terminated. We want
177 * to support strings that are not null-terminated because pm_parse does not
178 * have the contract that the string is null-terminated. (This is desirable
179 * because it means the extension can call pm_parse with the result of a call to
182 * The standard library strpbrk also does not support passing a maximum length
183 * to search. We want to support this for the reason mentioned above, but we
184 * also don't want it to stop on null bytes. Ruby actually allows null bytes
185 * within strings, comments, regular expressions, etc. So we need to be able to
188 * Finally, we want to support encodings wherein the charset could contain
189 * characters that are trailing bytes of multi-byte characters. For example, in
190 * Shift_JIS, the backslash character can be a trailing byte. In that case we
191 * need to take a slower path and iterate one multi-byte character at a time.
194 pm_strpbrk(pm_parser_t
*parser
, const uint8_t *source
, const uint8_t *charset
, ptrdiff_t length
, bool validate
) {
197 } else if (!parser
->encoding_changed
) {
198 return pm_strpbrk_utf8(parser
, source
, charset
, (size_t) length
, validate
);
199 } else if (parser
->encoding
== PM_ENCODING_ASCII_8BIT_ENTRY
) {
200 return pm_strpbrk_ascii_8bit(parser
, source
, charset
, (size_t) length
, validate
);
201 } else if (parser
->encoding
->multibyte
) {
202 return pm_strpbrk_multi_byte(parser
, source
, charset
, (size_t) length
, validate
);
204 return pm_strpbrk_single_byte(parser
, source
, charset
, (size_t) length
, validate
);