[Bug #20280] Check by `rb_parser_enc_str_coderange`
[ruby.git] / string.c
blobd8c2a9108502ff7843f30974f36fd76db513ab1c
1 /**********************************************************************
3 string.c -
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
12 **********************************************************************/
14 #include "ruby/internal/config.h"
16 #include <ctype.h>
17 #include <errno.h>
18 #include <math.h>
20 #ifdef HAVE_UNISTD_H
21 # include <unistd.h>
22 #endif
24 #include "debug_counter.h"
25 #include "encindex.h"
26 #include "id.h"
27 #include "internal.h"
28 #include "internal/array.h"
29 #include "internal/compar.h"
30 #include "internal/compilers.h"
31 #include "internal/encoding.h"
32 #include "internal/error.h"
33 #include "internal/gc.h"
34 #include "internal/numeric.h"
35 #include "internal/object.h"
36 #include "internal/proc.h"
37 #include "internal/re.h"
38 #include "internal/sanitizers.h"
39 #include "internal/string.h"
40 #include "internal/transcode.h"
41 #include "probes.h"
42 #include "ruby/encoding.h"
43 #include "ruby/re.h"
44 #include "ruby/util.h"
45 #include "ruby_assert.h"
46 #include "vm_sync.h"
48 #if defined HAVE_CRYPT_R
49 # if defined HAVE_CRYPT_H
50 # include <crypt.h>
51 # endif
52 #elif !defined HAVE_CRYPT
53 # include "missing/crypt.h"
54 # define HAVE_CRYPT_R 1
55 #endif
57 #define BEG(no) (regs->beg[(no)])
58 #define END(no) (regs->end[(no)])
60 #undef rb_str_new
61 #undef rb_usascii_str_new
62 #undef rb_utf8_str_new
63 #undef rb_enc_str_new
64 #undef rb_str_new_cstr
65 #undef rb_usascii_str_new_cstr
66 #undef rb_utf8_str_new_cstr
67 #undef rb_enc_str_new_cstr
68 #undef rb_external_str_new_cstr
69 #undef rb_locale_str_new_cstr
70 #undef rb_str_dup_frozen
71 #undef rb_str_buf_new_cstr
72 #undef rb_str_buf_cat
73 #undef rb_str_buf_cat2
74 #undef rb_str_cat2
75 #undef rb_str_cat_cstr
76 #undef rb_fstring_cstr
78 VALUE rb_cString;
79 VALUE rb_cSymbol;
/* Flags of RString
 *
 * 1:     RSTRING_NOEMBED
 *            The string is not embedded.  When a string is embedded, the
 *            contents follow the header.  When a string is not embedded, the
 *            contents are in a separately allocated buffer.
 * 2:     STR_SHARED (equal to ELTS_SHARED)
 *            The string is shared.  The buffer this string points to is owned
 *            by another string (the shared root).
 * 5:     STR_SHARED_ROOT
 *            Other strings may point to the contents of this string.  When
 *            this flag is set, STR_SHARED must not be set.
 * 6:     STR_BORROWED
 *            When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
 *            to be unshared by rb_str_tmp_frozen_release.
 * 7:     STR_TMPLOCK
 *            The pointer to the buffer is passed to a system call such as
 *            read(2).  Any modification and realloc is prohibited.
 * 8-9:   ENC_CODERANGE
 *            Stores the coderange of the string.
 * 10-16: ENCODING
 *            Stores the encoding of the string.
 * 17:    RSTRING_FSTR
 *            The string is a fstring.  The string is deduplicated in the
 *            fstring table.
 * 18:    STR_NOFREE
 *            Do not free this string's buffer when the string is reclaimed
 *            by the garbage collector.  Used when the string buffer is a C
 *            string literal.
 * 19:    STR_FAKESTR
 *            The string is not allocated or managed by the garbage collector.
 *            Typically, the string object header (struct RString) is
 *            temporarily allocated on the C stack.
 */

#define RUBY_MAX_CHAR_LEN 16
#define STR_SHARED_ROOT   FL_USER5
#define STR_BORROWED      FL_USER6
#define STR_TMPLOCK       FL_USER7
#define STR_NOFREE        FL_USER18
#define STR_FAKESTR       FL_USER19
/* Flag `str` as heap-backed and drop every sharing-related flag. */
#define STR_SET_NOEMBED(str) do {                                      \
    FL_SET((str), STR_NOEMBED);                                        \
    FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);      \
} while (0)

/* Flag `str` as embedded: clears the NOEMBED, SHARED and NOFREE flags. */
#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)

/* Store `n` as the length of `str`; no terminator is written here. */
#define STR_SET_LEN(str, n) do {                                       \
    RSTRING(str)->len = (n);                                           \
} while (0)
133 static inline bool
134 str_enc_fastpath(VALUE str)
136 // The overwhelming majority of strings are in one of these 3 encodings.
137 switch (ENCODING_GET_INLINED(str)) {
138 case ENCINDEX_ASCII_8BIT:
139 case ENCINDEX_UTF_8:
140 case ENCINDEX_US_ASCII:
141 return true;
142 default:
143 return false;
/* Terminator width of `str`: 1 on the fast path, otherwise the minimum
 * character length of the string's encoding. */
#define TERM_LEN(str) \
    (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))

/* Write `termlen` zero bytes at `ptr`.  The first byte is always stored
 * directly; memset is only reached for terminators wider than one byte. */
#define TERM_FILL(ptr, termlen) do {                    \
    char *const term_fill_ptr = (ptr);                  \
    const int term_fill_len = (termlen);                \
    *term_fill_ptr = '\0';                              \
    if (UNLIKELY(term_fill_len > 1))                    \
        memset(term_fill_ptr, 0, term_fill_len);        \
} while (0)
/* Resize the buffer of `str` to hold `capacity` bytes plus the terminator
 * implied by the string's encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)

/*
 * Resize the buffer of `str` to hold `capacity` bytes plus `termlen`
 * terminator bytes.
 *
 * - Embedded string: nothing happens while the slot is large enough;
 *   otherwise the contents are copied into a freshly allocated heap buffer
 *   and the string becomes NOEMBED.
 * - Heap string: the buffer is reallocated in place.  The string must not
 *   be shared (asserted).
 *
 * NOTE: `str`, `capacity` and `termlen` are evaluated more than once, so
 * callers must pass side-effect free expressions.  Every use of `capacity`
 * and `termlen` is parenthesized so that compound argument expressions keep
 * their intended precedence.
 */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
/*
 * Make `str` share the buffer owned by `shared_str`.  Does nothing when
 * `str` is a fake string.  Otherwise it asserts that the pointer of `str`
 * lies within the buffer of `shared_str`, records the root through the
 * write barrier, flags `str` as SHARED and `shared_str` as SHARED_ROOT,
 * and additionally flags a class-less root as BORROWED.
 */
#define STR_SET_SHARED(str, shared_str) do {                                                \
    if (!FL_TEST(str, STR_FAKESTR)) {                                                       \
        RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str));                           \
        RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str));               \
        FL_SET((str), STR_SHARED);                                                          \
        FL_SET((shared_str), STR_SHARED_ROOT);                                              \
        if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */                     \
            FL_SET_RAW((shared_str), STR_BORROWED);                                         \
    }                                                                                       \
} while (0)
/* Heap buffer pointer of a non-embedded string. */
#define STR_HEAP_PTR(str)  (RSTRING(str)->as.heap.ptr)
/* Heap buffer size in bytes: recorded capacity plus the terminator. */
#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* TODO: include the terminator size in capa. */

/* Encoding object of `str`. */
#define STR_ENC_GET(str) get_encoding(str)
/* SHARABLE_MIDDLE_SUBSTRING defaults to 0 unless the build overrides it. */
#ifndef SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif

/*
 * Whether the substring [beg, beg+len) of a buffer ending at `end` may share
 * that buffer.  With SHARABLE_MIDDLE_SUBSTRING disabled only substrings that
 * reach the end of the buffer qualify; with it enabled every substring does.
 */
#if SHARABLE_MIDDLE_SUBSTRING
#define SHARABLE_SUBSTRING_P(beg, len, end) 1
#else
#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#endif
208 static inline long
209 str_embed_capa(VALUE str)
211 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
214 bool
215 rb_str_reembeddable_p(VALUE str)
217 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
220 static inline size_t
221 rb_str_embed_size(long capa)
223 return offsetof(struct RString, as.embed.ary) + capa;
226 size_t
227 rb_str_size_as_embedded(VALUE str)
229 size_t real_size;
230 if (STR_EMBED_P(str)) {
231 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
233 /* if the string is not currently embedded, but it can be embedded, how
234 * much space would it require */
235 else if (rb_str_reembeddable_p(str)) {
236 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
238 else {
239 real_size = sizeof(struct RString);
241 return real_size;
/*
 * Whether an embedded string holding `len` content bytes plus `termlen`
 * terminator bytes has a size the GC reports as allocatable.
 */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len + termlen);

    return rb_gc_size_allocatable_p(needed);
}
250 static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
251 static VALUE str_new_frozen(VALUE klass, VALUE orig);
252 static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
253 static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
254 static VALUE str_new(VALUE klass, const char *ptr, long len);
255 static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
256 static inline void str_modifiable(VALUE str);
257 static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
259 static inline void
260 str_make_independent(VALUE str)
262 long len = RSTRING_LEN(str);
263 int termlen = TERM_LEN(str);
264 str_make_independent_expand((str), len, 0L, termlen);
267 static inline int str_dependent_p(VALUE str);
269 void
270 rb_str_make_independent(VALUE str)
272 if (str_dependent_p(str)) {
273 str_make_independent(str);
277 void
278 rb_str_make_embedded(VALUE str)
280 RUBY_ASSERT(rb_str_reembeddable_p(str));
281 RUBY_ASSERT(!STR_EMBED_P(str));
283 char *buf = RSTRING(str)->as.heap.ptr;
284 long len = RSTRING(str)->len;
286 STR_SET_EMBED(str);
287 STR_SET_LEN(str, len);
289 if (len > 0) {
290 memcpy(RSTRING_PTR(str), buf, len);
291 ruby_xfree(buf);
294 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
/*
 * Print a diagnostic to stderr saying that the function named `func` is
 * about to return NULL.  Only reports; it does not abort.
 */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
307 /* symbols for [up|down|swap]case/capitalize options */
308 static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
310 static rb_encoding *
311 get_encoding(VALUE str)
313 return rb_enc_from_index(ENCODING_GET(str));
316 static void
317 mustnot_broken(VALUE str)
319 if (is_broken_string(str)) {
320 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
324 static void
325 mustnot_wchar(VALUE str)
327 rb_encoding *enc = STR_ENC_GET(str);
328 if (rb_enc_mbminlen(enc) > 1) {
329 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
333 static int fstring_cmp(VALUE a, VALUE b);
335 static VALUE register_fstring(VALUE str, bool copy);
337 const struct st_hash_type rb_fstring_hash_type = {
338 fstring_cmp,
339 rb_str_hash,
/* True when `str` has no FL_EXIVAR flag and its class is exactly rb_cString. */
#define BARE_STRING_P(str) \
    (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
344 struct fstr_update_arg {
345 VALUE fstr;
346 bool copy;
349 static int
350 fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
353 struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
354 VALUE str = (VALUE)*key;
356 if (existing) {
357 /* because of lazy sweep, str may be unmarked already and swept
358 * at next time */
360 if (rb_objspace_garbage_object_p(str)) {
361 arg->fstr = Qundef;
362 return ST_DELETE;
365 arg->fstr = str;
366 return ST_STOP;
368 else {
369 if (FL_TEST_RAW(str, STR_FAKESTR)) {
370 if (arg->copy) {
371 VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
372 rb_enc_copy(new_str, str);
373 str = new_str;
375 else {
376 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
377 RSTRING(str)->len,
378 ENCODING_GET(str));
380 OBJ_FREEZE_RAW(str);
382 else {
383 if (!OBJ_FROZEN(str))
384 str = str_new_frozen(rb_cString, str);
385 if (STR_SHARED_P(str)) { /* str should not be shared */
386 /* shared substring */
387 str_make_independent(str);
388 RUBY_ASSERT(OBJ_FROZEN(str));
390 if (!BARE_STRING_P(str)) {
391 str = str_new_frozen(rb_cString, str);
394 RBASIC(str)->flags |= RSTRING_FSTR;
396 *key = *value = arg->fstr = str;
397 return ST_CONTINUE;
401 RUBY_FUNC_EXPORTED
402 VALUE
403 rb_fstring(VALUE str)
405 VALUE fstr;
406 int bare;
408 Check_Type(str, T_STRING);
410 if (FL_TEST(str, RSTRING_FSTR))
411 return str;
413 bare = BARE_STRING_P(str);
414 if (!bare) {
415 if (STR_EMBED_P(str)) {
416 OBJ_FREEZE_RAW(str);
417 return str;
420 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
421 RUBY_ASSERT(OBJ_FROZEN(str));
422 return str;
426 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE))
427 rb_str_resize(str, RSTRING_LEN(str));
429 fstr = register_fstring(str, FALSE);
431 if (!bare) {
432 str_replace_shared_without_enc(str, fstr);
433 OBJ_FREEZE_RAW(str);
434 return str;
436 return fstr;
439 static VALUE
440 register_fstring(VALUE str, bool copy)
442 struct fstr_update_arg args;
443 args.copy = copy;
445 RB_VM_LOCK_ENTER();
447 st_table *frozen_strings = rb_vm_fstring_table();
448 do {
449 args.fstr = str;
450 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
451 } while (UNDEF_P(args.fstr));
453 RB_VM_LOCK_LEAVE();
455 RUBY_ASSERT(OBJ_FROZEN(args.fstr));
456 RUBY_ASSERT(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
457 RUBY_ASSERT(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
458 RUBY_ASSERT(RBASIC_CLASS(args.fstr) == rb_cString);
460 return args.fstr;
463 static VALUE
464 setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
466 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
467 /* SHARED to be allocated by the callback */
469 if (!name) {
470 RUBY_ASSERT_ALWAYS(len == 0);
471 name = "";
474 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
476 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
477 fake_str->len = len;
478 fake_str->as.heap.ptr = (char *)name;
479 fake_str->as.heap.aux.capa = len;
480 return (VALUE)fake_str;
484 * set up a fake string which refers a static string literal.
486 VALUE
487 rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
489 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
493 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
494 * shared string which refers a static string literal. `ptr` must
495 * point a constant string.
497 VALUE
498 rb_fstring_new(const char *ptr, long len)
500 struct RString fake_str;
501 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), FALSE);
504 VALUE
505 rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
507 struct RString fake_str;
508 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), FALSE);
511 VALUE
512 rb_fstring_cstr(const char *ptr)
514 return rb_fstring_new(ptr, strlen(ptr));
517 static int
518 fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
520 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
521 return ST_CONTINUE;
524 static int
525 fstring_cmp(VALUE a, VALUE b)
527 long alen, blen;
528 const char *aptr, *bptr;
529 RSTRING_GETMEM(a, aptr, alen);
530 RSTRING_GETMEM(b, bptr, blen);
531 return (alen != blen ||
532 ENCODING_GET(a) != ENCODING_GET(b) ||
533 memcmp(aptr, bptr, alen) != 0);
536 static inline int
537 single_byte_optimizable(VALUE str)
539 rb_encoding *enc;
541 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
542 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
543 return 1;
545 enc = STR_ENC_GET(str);
546 if (rb_enc_mbmaxlen(enc) == 1)
547 return 1;
549 /* Conservative. Possibly single byte.
550 * "\xa1" in Shift_JIS for example. */
551 return 0;
554 VALUE rb_fs;
556 static inline const char *
557 search_nonascii(const char *p, const char *e)
559 const uintptr_t *s, *t;
561 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
562 # if SIZEOF_UINTPTR_T == 8
563 # define NONASCII_MASK UINT64_C(0x8080808080808080)
564 # elif SIZEOF_UINTPTR_T == 4
565 # define NONASCII_MASK UINT32_C(0x80808080)
566 # else
567 # error "don't know what to do."
568 # endif
569 #else
570 # if SIZEOF_UINTPTR_T == 8
571 # define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
572 # elif SIZEOF_UINTPTR_T == 4
573 # define NONASCII_MASK 0x80808080UL /* or...? */
574 # else
575 # error "don't know what to do."
576 # endif
577 #endif
579 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
580 #if !UNALIGNED_WORD_ACCESS
581 if ((uintptr_t)p % SIZEOF_VOIDP) {
582 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
583 p += l;
584 switch (l) {
585 default: UNREACHABLE;
586 #if SIZEOF_VOIDP > 4
587 case 7: if (p[-7]&0x80) return p-7;
588 case 6: if (p[-6]&0x80) return p-6;
589 case 5: if (p[-5]&0x80) return p-5;
590 case 4: if (p[-4]&0x80) return p-4;
591 #endif
592 case 3: if (p[-3]&0x80) return p-3;
593 case 2: if (p[-2]&0x80) return p-2;
594 case 1: if (p[-1]&0x80) return p-1;
595 case 0: break;
598 #endif
599 #if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
600 #define aligned_ptr(value) \
601 __builtin_assume_aligned((value), sizeof(uintptr_t))
602 #else
603 #define aligned_ptr(value) (uintptr_t *)(value)
604 #endif
605 s = aligned_ptr(p);
606 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
607 #undef aligned_ptr
608 for (;s < t; s++) {
609 if (*s & NONASCII_MASK) {
610 #ifdef WORDS_BIGENDIAN
611 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
612 #else
613 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
614 #endif
617 p = (const char *)s;
620 switch (e - p) {
621 default: UNREACHABLE;
622 #if SIZEOF_VOIDP > 4
623 case 7: if (e[-7]&0x80) return e-7;
624 case 6: if (e[-6]&0x80) return e-6;
625 case 5: if (e[-5]&0x80) return e-5;
626 case 4: if (e[-4]&0x80) return e-4;
627 #endif
628 case 3: if (e[-3]&0x80) return e-3;
629 case 2: if (e[-2]&0x80) return e-2;
630 case 1: if (e[-1]&0x80) return e-1;
631 case 0: return NULL;
635 static int
636 coderange_scan(const char *p, long len, rb_encoding *enc)
638 const char *e = p + len;
640 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
641 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
642 p = search_nonascii(p, e);
643 return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
646 if (rb_enc_asciicompat(enc)) {
647 p = search_nonascii(p, e);
648 if (!p) return ENC_CODERANGE_7BIT;
649 for (;;) {
650 int ret = rb_enc_precise_mbclen(p, e, enc);
651 if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
652 p += MBCLEN_CHARFOUND_LEN(ret);
653 if (p == e) break;
654 p = search_nonascii(p, e);
655 if (!p) break;
658 else {
659 while (p < e) {
660 int ret = rb_enc_precise_mbclen(p, e, enc);
661 if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
662 p += MBCLEN_CHARFOUND_LEN(ret);
665 return ENC_CODERANGE_VALID;
668 long
669 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
671 const char *p = s;
673 if (*cr == ENC_CODERANGE_BROKEN)
674 return e - s;
676 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
677 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
678 if (*cr == ENC_CODERANGE_VALID) return e - s;
679 p = search_nonascii(p, e);
680 *cr = p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
681 return e - s;
683 else if (rb_enc_asciicompat(enc)) {
684 p = search_nonascii(p, e);
685 if (!p) {
686 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
687 return e - s;
689 for (;;) {
690 int ret = rb_enc_precise_mbclen(p, e, enc);
691 if (!MBCLEN_CHARFOUND_P(ret)) {
692 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
693 return p - s;
695 p += MBCLEN_CHARFOUND_LEN(ret);
696 if (p == e) break;
697 p = search_nonascii(p, e);
698 if (!p) break;
701 else {
702 while (p < e) {
703 int ret = rb_enc_precise_mbclen(p, e, enc);
704 if (!MBCLEN_CHARFOUND_P(ret)) {
705 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
706 return p - s;
708 p += MBCLEN_CHARFOUND_LEN(ret);
711 *cr = ENC_CODERANGE_VALID;
712 return e - s;
715 static inline void
716 str_enc_copy(VALUE str1, VALUE str2)
718 rb_enc_set_index(str1, ENCODING_GET(str2));
721 /* Like str_enc_copy, but does not check frozen status of str1.
722 * You should use this only if you're certain that str1 is not frozen. */
723 static inline void
724 str_enc_copy_direct(VALUE str1, VALUE str2)
726 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
727 if (inlined_encoding == ENCODING_INLINE_MAX) {
728 rb_enc_set_index(str1, rb_enc_get_index(str2));
730 else {
731 ENCODING_SET_INLINED(str1, inlined_encoding);
735 static void
736 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
738 /* this function is designed for copying encoding and coderange
739 * from src to new string "dest" which is made from the part of src.
741 str_enc_copy(dest, src);
742 if (RSTRING_LEN(dest) == 0) {
743 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
744 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
745 else
746 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
747 return;
749 switch (ENC_CODERANGE(src)) {
750 case ENC_CODERANGE_7BIT:
751 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
752 break;
753 case ENC_CODERANGE_VALID:
754 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
755 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
756 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
757 else
758 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
759 break;
760 default:
761 break;
765 static void
766 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
768 str_enc_copy(dest, src);
769 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
772 static int
773 enc_coderange_scan(VALUE str, rb_encoding *enc)
775 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
779 rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
781 return enc_coderange_scan(str, enc);
785 rb_enc_str_coderange(VALUE str)
787 int cr = ENC_CODERANGE(str);
789 if (cr == ENC_CODERANGE_UNKNOWN) {
790 cr = enc_coderange_scan(str, get_encoding(str));
791 ENC_CODERANGE_SET(str, cr);
793 return cr;
797 rb_enc_str_asciionly_p(VALUE str)
799 rb_encoding *enc = STR_ENC_GET(str);
801 if (!rb_enc_asciicompat(enc))
802 return FALSE;
803 else if (is_ascii_string(str))
804 return TRUE;
805 return FALSE;
808 static inline void
809 str_mod_check(VALUE s, const char *p, long len)
811 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
812 rb_raise(rb_eRuntimeError, "string modified");
816 static size_t
817 str_capacity(VALUE str, const int termlen)
819 if (STR_EMBED_P(str)) {
820 return str_embed_capa(str) - termlen;
822 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
823 return RSTRING(str)->len;
825 else {
826 return RSTRING(str)->as.heap.aux.capa;
830 size_t
831 rb_str_capacity(VALUE str)
833 return str_capacity(str, TERM_LEN(str));
836 static inline void
837 must_not_null(const char *ptr)
839 if (!ptr) {
840 rb_raise(rb_eArgError, "NULL pointer given");
844 static inline VALUE
845 str_alloc_embed(VALUE klass, size_t capa)
847 size_t size = rb_str_embed_size(capa);
848 RUBY_ASSERT(size > 0);
849 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
851 NEWOBJ_OF(str, struct RString, klass,
852 T_STRING | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), size, 0);
854 return (VALUE)str;
857 static inline VALUE
858 str_alloc_heap(VALUE klass)
860 NEWOBJ_OF(str, struct RString, klass,
861 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
863 return (VALUE)str;
866 static inline VALUE
867 empty_str_alloc(VALUE klass)
869 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
870 VALUE str = str_alloc_embed(klass, 0);
871 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
872 return str;
875 static VALUE
876 str_new0(VALUE klass, const char *ptr, long len, int termlen)
878 VALUE str;
880 if (len < 0) {
881 rb_raise(rb_eArgError, "negative string size (or size too big)");
884 RUBY_DTRACE_CREATE_HOOK(STRING, len);
886 if (STR_EMBEDDABLE_P(len, termlen)) {
887 str = str_alloc_embed(klass, len + termlen);
888 if (len == 0) {
889 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
892 else {
893 str = str_alloc_heap(klass);
894 RSTRING(str)->as.heap.aux.capa = len;
895 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
896 * integer overflow. If we can STATIC_ASSERT that, the following
897 * mul_add_mul can be reverted to a simple ALLOC_N. */
898 RSTRING(str)->as.heap.ptr =
899 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
901 if (ptr) {
902 memcpy(RSTRING_PTR(str), ptr, len);
904 STR_SET_LEN(str, len);
905 TERM_FILL(RSTRING_PTR(str) + len, termlen);
906 return str;
909 static VALUE
910 str_new(VALUE klass, const char *ptr, long len)
912 return str_new0(klass, ptr, len, 1);
915 VALUE
916 rb_str_new(const char *ptr, long len)
918 return str_new(rb_cString, ptr, len);
921 VALUE
922 rb_usascii_str_new(const char *ptr, long len)
924 VALUE str = rb_str_new(ptr, len);
925 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
926 return str;
929 VALUE
930 rb_utf8_str_new(const char *ptr, long len)
932 VALUE str = str_new(rb_cString, ptr, len);
933 rb_enc_associate_index(str, rb_utf8_encindex());
934 return str;
937 VALUE
938 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
940 VALUE str;
942 if (!enc) return rb_str_new(ptr, len);
944 str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
945 rb_enc_associate(str, enc);
946 return str;
949 VALUE
950 rb_str_new_cstr(const char *ptr)
952 must_not_null(ptr);
953 /* rb_str_new_cstr() can take pointer from non-malloc-generated
954 * memory regions, and that cannot be detected by the MSAN. Just
955 * trust the programmer that the argument passed here is a sane C
956 * string. */
957 __msan_unpoison_string(ptr);
958 return rb_str_new(ptr, strlen(ptr));
961 VALUE
962 rb_usascii_str_new_cstr(const char *ptr)
964 VALUE str = rb_str_new_cstr(ptr);
965 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
966 return str;
969 VALUE
970 rb_utf8_str_new_cstr(const char *ptr)
972 VALUE str = rb_str_new_cstr(ptr);
973 rb_enc_associate_index(str, rb_utf8_encindex());
974 return str;
977 VALUE
978 rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
980 must_not_null(ptr);
981 if (rb_enc_mbminlen(enc) != 1) {
982 rb_raise(rb_eArgError, "wchar encoding given");
984 return rb_enc_str_new(ptr, strlen(ptr), enc);
987 static VALUE
988 str_new_static(VALUE klass, const char *ptr, long len, int encindex)
990 VALUE str;
992 if (len < 0) {
993 rb_raise(rb_eArgError, "negative string size (or size too big)");
996 if (!ptr) {
997 rb_encoding *enc = rb_enc_get_from_index(encindex);
998 str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
1000 else {
1001 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1002 str = str_alloc_heap(klass);
1003 RSTRING(str)->len = len;
1004 RSTRING(str)->as.heap.ptr = (char *)ptr;
1005 RSTRING(str)->as.heap.aux.capa = len;
1006 RBASIC(str)->flags |= STR_NOFREE;
1008 rb_enc_associate_index(str, encindex);
1009 return str;
1012 VALUE
1013 rb_str_new_static(const char *ptr, long len)
1015 return str_new_static(rb_cString, ptr, len, 0);
1018 VALUE
1019 rb_usascii_str_new_static(const char *ptr, long len)
1021 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1024 VALUE
1025 rb_utf8_str_new_static(const char *ptr, long len)
1027 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1030 VALUE
1031 rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
1033 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1036 static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1037 rb_encoding *from, rb_encoding *to,
1038 int ecflags, VALUE ecopts);
1040 static inline bool
1041 is_enc_ascii_string(VALUE str, rb_encoding *enc)
1043 int encidx = rb_enc_to_index(enc);
1044 if (rb_enc_get_index(str) == encidx)
1045 return is_ascii_string(str);
1046 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1049 VALUE
1050 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1052 long len;
1053 const char *ptr;
1054 VALUE newstr;
1056 if (!to) return str;
1057 if (!from) from = rb_enc_get(str);
1058 if (from == to) return str;
1059 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1060 rb_is_ascii8bit_enc(to)) {
1061 if (STR_ENC_GET(str) != to) {
1062 str = rb_str_dup(str);
1063 rb_enc_associate(str, to);
1065 return str;
1068 RSTRING_GETMEM(str, ptr, len);
1069 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1070 from, to, ecflags, ecopts);
1071 if (NIL_P(newstr)) {
1072 /* some error, return original */
1073 return str;
1075 return newstr;
1078 VALUE
1079 rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1080 rb_encoding *from, int ecflags, VALUE ecopts)
1082 long olen;
1084 olen = RSTRING_LEN(newstr);
1085 if (ofs < -olen || olen < ofs)
1086 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1087 if (ofs < 0) ofs += olen;
1088 if (!from) {
1089 STR_SET_LEN(newstr, ofs);
1090 return rb_str_cat(newstr, ptr, len);
1093 rb_str_modify(newstr);
1094 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1095 rb_enc_get(newstr),
1096 ecflags, ecopts);
1099 VALUE
1100 rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1102 STR_SET_LEN(str, 0);
1103 rb_enc_associate(str, enc);
1104 rb_str_cat(str, ptr, len);
1105 return str;
1108 static VALUE
1109 str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1110 rb_encoding *from, rb_encoding *to,
1111 int ecflags, VALUE ecopts)
1113 rb_econv_t *ec;
1114 rb_econv_result_t ret;
1115 long olen;
1116 VALUE econv_wrapper;
1117 const unsigned char *start, *sp;
1118 unsigned char *dest, *dp;
1119 size_t converted_output = (size_t)ofs;
1121 olen = rb_str_capacity(newstr);
1123 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1124 RBASIC_CLEAR_CLASS(econv_wrapper);
1125 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1126 if (!ec) return Qnil;
1127 DATA_PTR(econv_wrapper) = ec;
1129 sp = (unsigned char*)ptr;
1130 start = sp;
1131 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1132 (dp = dest + converted_output),
1133 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1134 ret == econv_destination_buffer_full) {
1135 /* destination buffer short */
1136 size_t converted_input = sp - start;
1137 size_t rest = len - converted_input;
1138 converted_output = dp - dest;
1139 rb_str_set_len(newstr, converted_output);
1140 if (converted_input && converted_output &&
1141 rest < (LONG_MAX / converted_output)) {
1142 rest = (rest * converted_output) / converted_input;
1144 else {
1145 rest = olen;
1147 olen += rest < 2 ? 2 : rest;
1148 rb_str_resize(newstr, olen);
1150 DATA_PTR(econv_wrapper) = 0;
1151 rb_econv_close(ec);
1152 switch (ret) {
1153 case econv_finished:
1154 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1155 rb_str_set_len(newstr, len);
1156 rb_enc_associate(newstr, to);
1157 return newstr;
1159 default:
1160 return Qnil;
1164 VALUE
1165 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)