[ruby/json] Adjust to the CVTUTF code being gone

I, Luke T. Shumaker, am the sole author of the added code.

I did not reference CVTUTF when writing it. I did reference the Unicode standard (15.0.0), the Wikipedia article on UTF-8, and the Wikipedia article on UTF-16. When I saw some tests fail, I did reference the old deleted code (but a JSON-specific part, inherently not as based on CVTUTF) to determine that script_safe should also escape U+2028 and U+2029.

I targeted simplicity and clarity when writing the code--it can likely be optimized. In my mind, the obvious next optimization is to have it combine contiguous non-escaped characters into just one call to fbuffer_append(), instead of calling fbuffer_append() for each character.

Regarding the use of the "modern" types `uint32_t`, `uint16_t`, and `bool`:

 - ruby.h is guaranteed to give us uint32_t and uint16_t.
 - Since Ruby 3.0.0, ruby.h is guaranteed to give us bool... but we support down to Ruby 2.3. But, ruby.h is guaranteed to give us HAVE_STDBOOL_H for the C99 stdbool.h; so use that to include stdbool.h if we can, and if not then fall back to a copy of the same bool definition that Ruby 3.0.5 uses with C89.

https://github.com/ruby/json/commit/c96351f874
author: Luke T. Shumaker <[email protected]> 2024-02-22 20:51:28 -0700
committer: Hiroshi SHIBATA <[email protected]> 2024-10-08 14:10:05 +0900
commit: 74d459fd52ef85f92f7c20819afcc4ffcf11714d (patch)
tree: c967d95e7b5f20bc32956087368571e831f7ded6 /ext/json/parser/parser.rl
parent: 6e47968929f2ee77376d28a6561266d8f8e3a4f7 (diff)
1 files changed, 22 insertions, 10 deletions
diff --git a/ext/json/parser/parser.rl b/ext/json/parser/parser.rl
index 873c1b3007..959b6e7384 100644
--- a/ext/json/parser/parser.rl
+++ b/ext/json/parser/parser.rl
@@ -20,26 +20,28 @@ static const signed char digit_values[256] = {
     -1, -1, -1, -1, -1, -1, -1
 };
 
-static UTF32 unescape_unicode(const unsigned char *p)
+static uint32_t unescape_unicode(const unsigned char *p)
 {
+    const uint32_t replacement_char = 0xFFFD;
+
     signed char b;
-    UTF32 result = 0;
+    uint32_t result = 0;
     b = digit_values[p[0]];
-    if (b < 0) return UNI_REPLACEMENT_CHAR;
+    if (b < 0) return replacement_char;
     result = (result << 4) | (unsigned char)b;
     b = digit_values[p[1]];
-    if (b < 0) return UNI_REPLACEMENT_CHAR;
+    if (b < 0) return replacement_char;
     result = (result << 4) | (unsigned char)b;
     b = digit_values[p[2]];
-    if (b < 0) return UNI_REPLACEMENT_CHAR;
+    if (b < 0) return replacement_char;
     result = (result << 4) | (unsigned char)b;
     b = digit_values[p[3]];
-    if (b < 0) return UNI_REPLACEMENT_CHAR;
+    if (b < 0) return replacement_char;
     result = (result << 4) | (unsigned char)b;
     return result;
 }
 
-static int convert_UTF32_to_UTF8(char *buf, UTF32 ch)
+static int convert_UTF32_to_UTF8(char *buf, uint32_t ch)
 {
     int len = 1;
     if (ch <= 0x7F) {
@@ -493,9 +495,19 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int
                         "incomplete unicode character escape sequence at '%s'", p
                       );
                     } else {
-                        UTF32 ch = unescape_unicode((unsigned char *) ++pe);
+                        uint32_t ch = unescape_unicode((unsigned char *) ++pe);
                         pe += 3;
-                        if (UNI_SUR_HIGH_START == (ch & 0xFC00)) {
+                        /* To handle values above U+FFFF, we take a sequence of
+                         * \uXXXX escapes in the U+D800..U+DBFF then
+                         * U+DC00..U+DFFF ranges, take the low 10 bits from each
+                         * to make a 20-bit number, then add 0x10000 to get the
+                         * final codepoint.
+                         *
+                         * See Unicode 15: §3.8 "Surrogates", §5.3 "Handling
+                         * Surrogate Pairs in UTF-16", and §23.6 "Surrogates
+                         * Area".
+                         */
+                        if ((ch & 0xFC00) == 0xD800) {
                             pe++;
                             if (pe > stringEnd - 6) {
                               if (bufferSize > MAX_STACK_BUFFER_SIZE) {
@@ -507,7 +519,7 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int
                                 );
                             }
                             if (pe[0] == '\\' && pe[1] == 'u') {
-                                UTF32 sur = unescape_unicode((unsigned char *) pe + 2);
+                                uint32_t sur = unescape_unicode((unsigned char *) pe + 2);
                                 ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16)
                                         | (sur & 0x3FF));
                                 pe += 5;
author	Luke T. Shumaker <[email protected]>	2024-02-22 20:51:28 -0700
committer	Hiroshi SHIBATA <[email protected]>	2024-10-08 14:10:05 +0900
commit	74d459fd52ef85f92f7c20819afcc4ffcf11714d (patch)
tree	c967d95e7b5f20bc32956087368571e831f7ded6 /ext/json/parser/parser.rl
parent	6e47968929f2ee77376d28a6561266d8f8e3a4f7 (diff)