prism/util/pm_strpbrk.c

   1 #include "prism/util/pm_strpbrk.h"
   2
   3 /**
   4  * Add an invalid multibyte character error to the parser.
   5  */
   6 static inline void
   7 pm_strpbrk_invalid_multibyte_character(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
   8     pm_diagnostic_list_append_format(&parser->error_list, start, end, PM_ERR_INVALID_MULTIBYTE_CHARACTER, *start);
   9 }
  10
  11 /**
  12  * Set the explicit encoding for the parser to the current encoding.
  13  */
  14 static inline void
  15 pm_strpbrk_explicit_encoding_set(pm_parser_t *parser, const uint8_t *source, size_t width) {
  16     if (parser->explicit_encoding != NULL) {
  17         if (parser->explicit_encoding == parser->encoding) {
  18             // Okay, we already locked to this encoding.
  19         } else if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
  20             // Not okay, we already found a Unicode escape sequence and this
  21             // conflicts.
  22             pm_diagnostic_list_append_format(&parser->error_list, source, source + width, PM_ERR_MIXED_ENCODING, parser->encoding->name);
  23         } else {
  24             // Should not be anything else.
  25             assert(false && "unreachable");
  26         }
  27     }
  28
  29     parser->explicit_encoding = parser->encoding;
  30 }
  31
  32 /**
  33  * This is the default path.
  34  */
  35 static inline const uint8_t *
  36 pm_strpbrk_utf8(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
  37     size_t index = 0;
  38
  39     while (index < maximum) {
  40         if (strchr((const char *) charset, source[index]) != NULL) {
  41             return source + index;
  42         }
  43
  44         if (source[index] < 0x80) {
  45             index++;
  46         } else {
  47             size_t width = pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index));
  48
  49             if (width > 0) {
  50                 index += width;
  51             } else if (!validate) {
  52                 index++;
  53             } else {
  54                 // At this point we know we have an invalid multibyte character.
  55                 // We'll walk forward as far as we can until we find the next
  56                 // valid character so that we don't spam the user with a ton of
  57                 // the same kind of error.
  58                 const size_t start = index;
  59
  60                 do {
  61                     index++;
  62                 } while (index < maximum && pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
  63
  64                 pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
  65             }
  66         }
  67     }
  68
  69     return NULL;
  70 }
  71
  72 /**
  73  * This is the path when the encoding is ASCII-8BIT.
  74  */
  75 static inline const uint8_t *
  76 pm_strpbrk_ascii_8bit(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
  77     size_t index = 0;
  78
  79     while (index < maximum) {
  80         if (strchr((const char *) charset, source[index]) != NULL) {
  81             return source + index;
  82         }
  83
  84         if (validate && source[index] >= 0x80) pm_strpbrk_explicit_encoding_set(parser, source, 1);
  85         index++;
  86     }
  87
  88     return NULL;
  89 }
  90
  91 /**
  92  * This is the slow path that does care about the encoding.
  93  */
  94 static inline const uint8_t *
  95 pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
  96     size_t index = 0;
  97     const pm_encoding_t *encoding = parser->encoding;
  98
  99     while (index < maximum) {
 100         if (strchr((const char *) charset, source[index]) != NULL) {
 101             return source + index;
 102         }
 103
 104         if (source[index] < 0x80) {
 105             index++;
 106         } else {
 107             size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
 108             if (validate) pm_strpbrk_explicit_encoding_set(parser, source, width);
 109
 110             if (width > 0) {
 111                 index += width;
 112             } else if (!validate) {
 113                 index++;
 114             } else {
 115                 // At this point we know we have an invalid multibyte character.
 116                 // We'll walk forward as far as we can until we find the next
 117                 // valid character so that we don't spam the user with a ton of
 118                 // the same kind of error.
 119                 const size_t start = index;
 120
 121                 do {
 122                     index++;
 123                 } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
 124
 125                 pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
 126             }
 127         }
 128     }
 129
 130     return NULL;
 131 }
 132
 133 /**
 134  * This is the fast path that does not care about the encoding because we know
 135  * the encoding only supports single-byte characters.
 136  */
 137 static inline const uint8_t *
 138 pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
 139     size_t index = 0;
 140     const pm_encoding_t *encoding = parser->encoding;
 141
 142     while (index < maximum) {
 143         if (strchr((const char *) charset, source[index]) != NULL) {
 144             return source + index;
 145         }
 146
 147         if (source[index] < 0x80 || !validate) {
 148             index++;
 149         } else {
 150             size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
 151             pm_strpbrk_explicit_encoding_set(parser, source, width);
 152
 153             if (width > 0) {
 154                 index += width;
 155             } else {
 156                 // At this point we know we have an invalid multibyte character.
 157                 // We'll walk forward as far as we can until we find the next
 158                 // valid character so that we don't spam the user with a ton of
 159                 // the same kind of error.
 160                 const size_t start = index;
 161
 162                 do {
 163                     index++;
 164                 } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
 165
 166                 pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
 167             }
 168         }
 169     }
 170
 171     return NULL;
 172 }
 173
 174 /**
 175  * Here we have rolled our own version of strpbrk. The standard library strpbrk
 176  * has undefined behavior when the source string is not null-terminated. We want
 177  * to support strings that are not null-terminated because pm_parse does not
 178  * have the contract that the string is null-terminated. (This is desirable
 179  * because it means the extension can call pm_parse with the result of a call to
 180  * mmap).
 181  *
 182  * The standard library strpbrk also does not support passing a maximum length
 183  * to search. We want to support this for the reason mentioned above, but we
 184  * also don't want it to stop on null bytes. Ruby actually allows null bytes
 185  * within strings, comments, regular expressions, etc. So we need to be able to
 186  * skip past them.
 187  *
 188  * Finally, we want to support encodings wherein the charset could contain
 189  * characters that are trailing bytes of multi-byte characters. For example, in
 190  * Shift_JIS, the backslash character can be a trailing byte. In that case we
 191  * need to take a slower path and iterate one multi-byte character at a time.
 192  */
 193 const uint8_t *
 194 pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate) {
 195     if (length <= 0) {
 196         return NULL;
 197     } else if (!parser->encoding_changed) {
 198         return pm_strpbrk_utf8(parser, source, charset, (size_t) length, validate);
 199     } else if (parser->encoding == PM_ENCODING_ASCII_8BIT_ENTRY) {
 200         return pm_strpbrk_ascii_8bit(parser, source, charset, (size_t) length, validate);
 201     } else if (parser->encoding->multibyte) {
 202         return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length, validate);
 203     } else {
 204         return pm_strpbrk_single_byte(parser, source, charset, (size_t) length, validate);
 205     }
 206 }