1 /**********************************************************************
6 created at: Mon Aug 9 17:12:58 JST 1993
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
12 **********************************************************************/
14 #include "ruby/internal/config.h"
24 #include "debug_counter.h"
28 #include "internal/array.h"
29 #include "internal/compar.h"
30 #include "internal/compilers.h"
31 #include "internal/encoding.h"
32 #include "internal/error.h"
33 #include "internal/gc.h"
34 #include "internal/numeric.h"
35 #include "internal/object.h"
36 #include "internal/proc.h"
37 #include "internal/re.h"
38 #include "internal/sanitizers.h"
39 #include "internal/string.h"
40 #include "internal/transcode.h"
42 #include "ruby/encoding.h"
44 #include "ruby/util.h"
45 #include "ruby_assert.h"
48 #if defined HAVE_CRYPT_R
49 # if defined HAVE_CRYPT_H
52 #elif !defined HAVE_CRYPT
53 # include "missing/crypt.h"
54 # define HAVE_CRYPT_R 1
/* Shorthand for entry `no` of the `beg` / `end` arrays reached through a
 * `regs` pointer. The macros name `regs` directly, so a variable of that
 * name must be in scope wherever they are expanded. */
57 #define BEG(no) (regs->beg[(no)])
58 #define END(no) (regs->end[(no)])
61 #undef rb_usascii_str_new
62 #undef rb_utf8_str_new
64 #undef rb_str_new_cstr
65 #undef rb_usascii_str_new_cstr
66 #undef rb_utf8_str_new_cstr
67 #undef rb_enc_str_new_cstr
68 #undef rb_external_str_new_cstr
69 #undef rb_locale_str_new_cstr
70 #undef rb_str_dup_frozen
71 #undef rb_str_buf_new_cstr
73 #undef rb_str_buf_cat2
75 #undef rb_str_cat_cstr
76 #undef rb_fstring_cstr
84 * The string is not embedded. When a string is embedded, the contents
85 * follow the header. When a string is not embedded, the contents are
86 * in a separately allocated buffer.
87 * 2: STR_SHARED (equal to ELTS_SHARED)
88 * The string is shared. The buffer this string points to is owned by
89 * another string (the shared root).
91 * Other strings may point to the contents of this string. When this
92 * flag is set, STR_SHARED must not be set.
94 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
95 * to be unshared by rb_str_tmp_frozen_release.
97 * The pointer to the buffer is passed to a system call such as
98 * read(2). Any modification or reallocation is prohibited.
100 * Stores the coderange of the string.
102 * Stores the encoding of the string.
104 * The string is a fstring. The string is deduplicated in the fstring
107 * Do not free this string's buffer when the string is reclaimed
108 * by the garbage collector. Used for when the string buffer is a C
111 * The string is not allocated or managed by the garbage collector.
112 * Typically, the string object header (struct RString) is temporarily
113 * allocated on C stack.
/* NOTE(review): presumably the maximum number of bytes in one character;
 * its users are not visible in this part of the file - confirm. */
116 #define RUBY_MAX_CHAR_LEN 16
/* String-private flag bits, each an alias of one of the generic FL_USERn
 * object flag bits. */
117 #define STR_SHARED_ROOT FL_USER5
118 #define STR_BORROWED FL_USER6
119 #define STR_TMPLOCK FL_USER7
120 #define STR_NOFREE FL_USER18
121 #define STR_FAKESTR FL_USER19
123 #define STR_SET_NOEMBED(str) do {\
124 FL_SET((str), STR_NOEMBED);\
125 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
/* Clear STR_NOEMBED on `str`, together with STR_SHARED and STR_NOFREE.
 * Only the flags are touched; the string contents are not moved here. */
127 #define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
129 #define STR_SET_LEN(str, n) do { \
130 RSTRING(str)->len = (n); \
134 str_enc_fastpath(VALUE str
)
136 // The overwhelming majority of strings are in one of these 3 encodings.
137 switch (ENCODING_GET_INLINED(str
)) {
138 case ENCINDEX_ASCII_8BIT
:
140 case ENCINDEX_US_ASCII
:
/* Terminator length for `str`: 1 when str_enc_fastpath() accepts the string,
 * otherwise rb_enc_mbminlen() of the encoding looked up from ENCODING_GET().
 * `str` appears twice in the expansion, so pass an expression without side
 * effects. */
147 #define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
148 #define TERM_FILL(ptr, termlen) do {\
149 char *const term_fill_ptr = (ptr);\
150 const int term_fill_len = (termlen);\
151 *term_fill_ptr = '\0';\
152 if (UNLIKELY(term_fill_len > 1))\
153 memset(term_fill_ptr, 0, term_fill_len);\
156 #define RESIZE_CAPA(str,capacity) do {\
157 const int termlen = TERM_LEN(str);\
158 RESIZE_CAPA_TERM(str,capacity,termlen);\
160 #define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
161 if (STR_EMBED_P(str)) {\
162 if (str_embed_capa(str) < capacity + termlen) {\
163 char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
164 const long tlen = RSTRING_LEN(str);\
165 memcpy(tmp, RSTRING_PTR(str), tlen);\
166 RSTRING(str)->as.heap.ptr = tmp;\
167 RSTRING(str)->len = tlen;\
168 STR_SET_NOEMBED(str);\
169 RSTRING(str)->as.heap.aux.capa = (capacity);\
173 RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
174 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
175 (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
176 RSTRING(str)->as.heap.aux.capa = (capacity);\
180 #define STR_SET_SHARED(str, shared_str) do { \
181 if (!FL_TEST(str, STR_FAKESTR)) { \
182 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
183 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
184 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
185 FL_SET((str), STR_SHARED); \
186 FL_SET((shared_str), STR_SHARED_ROOT); \
187 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
188 FL_SET_RAW((shared_str), STR_BORROWED); \
/* The `as.heap.ptr` field of `str`, i.e. the out-of-line buffer pointer. */
192 #define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Byte size attributed to the heap buffer: the recorded capacity
 * (`as.heap.aux.capa`) plus the terminator length from TERM_LEN(). */
193 #define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
194 /* TODO: include the terminator size in capa. */
/* Encoding object of `str`; simply forwards to get_encoding(). */
196 #define STR_ENC_GET(str) get_encoding(str)
198 #if !defined SHARABLE_MIDDLE_SUBSTRING
199 # define SHARABLE_MIDDLE_SUBSTRING 0
201 #if !SHARABLE_MIDDLE_SUBSTRING
202 #define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
204 #define SHARABLE_SUBSTRING_P(beg, len, end) 1
/* Bytes available for embedded contents in `str`: the GC slot size of the
 * object minus the offset of `as.embed.ary` within struct RString. */
209 str_embed_capa(VALUE str
)
211 return rb_gc_obj_slot_size(str
) - offsetof(struct RString
, as
.embed
.ary
);
/* True when none of STR_NOFREE, STR_SHARED_ROOT and STR_SHARED is set on
 * `str`. rb_str_make_embedded() asserts this as its precondition. */
215 rb_str_reembeddable_p(VALUE str
)
217 return !FL_TEST(str
, STR_NOFREE
|STR_SHARED_ROOT
|STR_SHARED
);
/* Object size needed to embed `capa` bytes: the offset of `as.embed.ary`
 * within struct RString plus `capa`. Any terminator must already be
 * included in `capa` by the caller. */
221 rb_str_embed_size(long capa
)
223 return offsetof(struct RString
, as
.embed
.ary
) + capa
;
/* Computes `real_size`, the object size `str` would occupy as an embedded
 * string:
 *   - already embedded: embed size for the current length plus terminator;
 *   - passes rb_str_reembeddable_p(): embed size for the heap capacity plus
 *     terminator;
 *   - otherwise: sizeof(struct RString).
 * NOTE(review): the declaration of `real_size` and the return statement are
 * not visible in this excerpt; presumably `real_size` is returned. */
227 rb_str_size_as_embedded(VALUE str
)
230 if (STR_EMBED_P(str
)) {
231 real_size
= rb_str_embed_size(RSTRING(str
)->len
) + TERM_LEN(str
);
233 /* if the string is not currently embedded, but it can be embedded, how
234 * much space would it require */
235 else if (rb_str_reembeddable_p(str
)) {
236 real_size
= rb_str_embed_size(RSTRING(str
)->as
.heap
.aux
.capa
) + TERM_LEN(str
);
239 real_size
= sizeof(struct RString
);
/* Whether contents of `len` bytes plus a `termlen`-byte terminator give an
 * embedded object size that rb_gc_size_allocatable_p() accepts. */
245 STR_EMBEDDABLE_P(long len
, long termlen
)
247 return rb_gc_size_allocatable_p(rb_str_embed_size(len
+ termlen
));
250 static VALUE
str_replace_shared_without_enc(VALUE str2
, VALUE str
);
251 static VALUE
str_new_frozen(VALUE klass
, VALUE orig
);
252 static VALUE
str_new_frozen_buffer(VALUE klass
, VALUE orig
, int copy_encoding
);
253 static VALUE
str_new_static(VALUE klass
, const char *ptr
, long len
, int encindex
);
254 static VALUE
str_new(VALUE klass
, const char *ptr
, long len
);
255 static void str_make_independent_expand(VALUE str
, long len
, long expand
, const int termlen
);
256 static inline void str_modifiable(VALUE str
);
257 static VALUE
rb_str_downcase(int argc
, VALUE
*argv
, VALUE str
);
/* Convenience wrapper around str_make_independent_expand(): passes the
 * current length of `str`, an expansion of 0L, and the terminator length
 * obtained from TERM_LEN(). */
260 str_make_independent(VALUE str
)
262 long len
= RSTRING_LEN(str
);
263 int termlen
= TERM_LEN(str
);
264 str_make_independent_expand((str
), len
, 0L, termlen
);
267 static inline int str_dependent_p(VALUE str
);
/* Calls str_make_independent() on `str`, but only when str_dependent_p()
 * reports it as dependent; otherwise does nothing visible here. */
270 rb_str_make_independent(VALUE str
)
272 if (str_dependent_p(str
)) {
273 str_make_independent(str
);
/* Moves the contents of a heap string into the object itself.
 * Preconditions (asserted): `str` passes rb_str_reembeddable_p() and is not
 * currently embedded.
 * The heap pointer and length are saved first because the embedded array
 * shares storage with the heap fields being read.
 * NOTE(review): the lines that flip the embed flag and release the old
 * buffer are not visible in this excerpt - confirm against the full file. */
278 rb_str_make_embedded(VALUE str
)
280 RUBY_ASSERT(rb_str_reembeddable_p(str
));
281 RUBY_ASSERT(!STR_EMBED_P(str
));
283 char *buf
= RSTRING(str
)->as
.heap
.ptr
;
284 long len
= RSTRING(str
)->len
;
287 STR_SET_LEN(str
, len
);
290 memcpy(RSTRING_PTR(str
), buf
, len
);
/* Re-terminate the copied contents inside the embedded array. */
294 TERM_FILL(RSTRING(str
)->as
.embed
.ary
+ len
, TERM_LEN(str
));
298 rb_debug_rstring_null_ptr(const char *func
)
300 fprintf(stderr
, "%s is returning NULL!! "
301 "SIGSEGV is highly expected to follow immediately.\n"
302 "If you could reproduce, attach your debugger here, "
303 "and look at the passed string.\n",
307 /* symbols for [up|down|swap]case/capitalize options */
308 static VALUE sym_ascii
, sym_turkic
, sym_lithuanian
, sym_fold
;
/* Returns the encoding object for `str`, looked up from the encoding index
 * that ENCODING_GET() reads from the string. */
311 get_encoding(VALUE str
)
313 return rb_enc_from_index(ENCODING_GET(str
));
/* Raises ArgumentError, naming the encoding of `str`, when
 * is_broken_string() reports the string as broken. */
317 mustnot_broken(VALUE str
)
319 if (is_broken_string(str
)) {
320 rb_raise(rb_eArgError
, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str
)));
/* Raises ArgumentError, naming the encoding, when the encoding of `str` has
 * a minimum character length (rb_enc_mbminlen) greater than one byte. */
325 mustnot_wchar(VALUE str
)
327 rb_encoding
*enc
= STR_ENC_GET(str
);
328 if (rb_enc_mbminlen(enc
) > 1) {
329 rb_raise(rb_eArgError
, "wide char encoding: %s", rb_enc_name(enc
));
333 static int fstring_cmp(VALUE a
, VALUE b
);
335 static VALUE
register_fstring(VALUE str
, bool copy
);
337 const struct st_hash_type rb_fstring_hash_type
= {
/* True when `str` does not have FL_EXIVAR set and its class is exactly
 * rb_cString. `str` is expanded twice. */
342 #define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
344 struct fstr_update_arg
{
350 fstr_update_callback(st_data_t
*key
, st_data_t
*value
, st_data_t data
, int existing
)
353 struct fstr_update_arg
*arg
= (struct fstr_update_arg
*)data
;
354 VALUE str
= (VALUE
)*key
;
357 /* because of lazy sweep, str may be unmarked already and swept
360 if (rb_objspace_garbage_object_p(str
)) {
369 if (FL_TEST_RAW(str
, STR_FAKESTR
)) {
371 VALUE new_str
= str_new(rb_cString
, RSTRING(str
)->as
.heap
.ptr
, RSTRING(str
)->len
);
372 rb_enc_copy(new_str
, str
);
376 str
= str_new_static(rb_cString
, RSTRING(str
)->as
.heap
.ptr
,
383 if (!OBJ_FROZEN(str
))
384 str
= str_new_frozen(rb_cString
, str
);
385 if (STR_SHARED_P(str
)) { /* str should not be shared */
386 /* shared substring */
387 str_make_independent(str
);
388 RUBY_ASSERT(OBJ_FROZEN(str
));
390 if (!BARE_STRING_P(str
)) {
391 str
= str_new_frozen(rb_cString
, str
);
394 RBASIC(str
)->flags
|= RSTRING_FSTR
;
396 *key
= *value
= arg
->fstr
= str
;
403 rb_fstring(VALUE str
)
408 Check_Type(str
, T_STRING
);
410 if (FL_TEST(str
, RSTRING_FSTR
))
413 bare
= BARE_STRING_P(str
);
415 if (STR_EMBED_P(str
)) {
420 if (FL_TEST_RAW(str
, STR_SHARED_ROOT
| STR_SHARED
) == STR_SHARED_ROOT
) {
421 RUBY_ASSERT(OBJ_FROZEN(str
));
426 if (!FL_TEST_RAW(str
, FL_FREEZE
| STR_NOFREE
))
427 rb_str_resize(str
, RSTRING_LEN(str
));
429 fstr
= register_fstring(str
, FALSE
);
432 str_replace_shared_without_enc(str
, fstr
);
440 register_fstring(VALUE str
, bool copy
)
442 struct fstr_update_arg args
;
447 st_table
*frozen_strings
= rb_vm_fstring_table();
450 st_update(frozen_strings
, (st_data_t
)str
, fstr_update_callback
, (st_data_t
)&args
);
451 } while (UNDEF_P(args
.fstr
));
455 RUBY_ASSERT(OBJ_FROZEN(args
.fstr
));
456 RUBY_ASSERT(!FL_TEST_RAW(args
.fstr
, STR_FAKESTR
));
457 RUBY_ASSERT(!FL_TEST_RAW(args
.fstr
, FL_EXIVAR
));
458 RUBY_ASSERT(RBASIC_CLASS(args
.fstr
) == rb_cString
);
/* Initialises the caller-provided header `fake_str` as a string of class
 * rb_cString flagged T_STRING, RSTRING_NOEMBED, STR_NOFREE and STR_FAKESTR.
 * The buffer pointer is set to `name` itself (no copy is made), `len` is
 * recorded as the capacity and `encidx` as the inline encoding index.
 * Returns `fake_str` cast to VALUE.
 * NOTE(review): the condition guarding the `len == 0` assertion and the
 * assignment of the string length are not visible in this excerpt. */
464 setup_fake_str(struct RString
*fake_str
, const char *name
, long len
, int encidx
)
466 fake_str
->basic
.flags
= T_STRING
|RSTRING_NOEMBED
|STR_NOFREE
|STR_FAKESTR
;
467 /* SHARED to be allocated by the callback */
470 RUBY_ASSERT_ALWAYS(len
== 0);
474 ENCODING_SET_INLINED((VALUE
)fake_str
, encidx
);
476 RBASIC_SET_CLASS_RAW((VALUE
)fake_str
, rb_cString
);
/* The const qualifier of `name` is cast away; the buffer is only borrowed. */
478 fake_str
->as
.heap
.ptr
= (char *)name
;
479 fake_str
->as
.heap
.aux
.capa
= len
;
480 return (VALUE
)fake_str
;
484 * set up a fake string which refers to a static string literal.
/* Same as setup_fake_str(), but takes the encoding as an rb_encoding
 * pointer and converts it to an index with rb_enc_to_index(). */
487 rb_setup_fake_str(struct RString
*fake_str
, const char *name
, long len
, rb_encoding
*enc
)
489 return setup_fake_str(fake_str
, name
, len
, rb_enc_to_index(enc
));
493 * rb_fstring_new and rb_fstring_cstr family create or look up a frozen
494 * shared string which refers to a static string literal. `ptr` must
495 * point to a constant string.
/* Registers (or looks up) an fstring for the `len` bytes at `ptr`, tagged
 * with ENCINDEX_US_ASCII. A fake string header living in a local variable
 * is used as the lookup key and handed to register_fstring() with
 * copy = FALSE. */
498 rb_fstring_new(const char *ptr
, long len
)
500 struct RString fake_str
;
501 return register_fstring(setup_fake_str(&fake_str
, ptr
, len
, ENCINDEX_US_ASCII
), FALSE
);
/* Like rb_fstring_new(), but the fake lookup key is tagged with the
 * caller-supplied encoding `enc` instead of US-ASCII. */
505 rb_fstring_enc_new(const char *ptr
, long len
, rb_encoding
*enc
)
507 struct RString fake_str
;
508 return register_fstring(rb_setup_fake_str(&fake_str
, ptr
, len
, enc
), FALSE
);
/* rb_fstring_new() for a NUL-terminated C string; the length is taken with
 * strlen(), so `ptr` must not be NULL. */
512 rb_fstring_cstr(const char *ptr
)
514 return rb_fstring_new(ptr
, strlen(ptr
));
/* Table iteration callback: sets the class of the object stored in `key`
 * to the class passed through `arg`. `val` is not used in the visible code.
 * NOTE(review): the return value is not visible in this excerpt. */
518 fstring_set_class_i(st_data_t key
, st_data_t val
, st_data_t arg
)
520 RBASIC_SET_CLASS((VALUE
)key
, (VALUE
)arg
);
/* Equality-style comparison of two strings: returns zero only when `a` and
 * `b` have the same byte length, the same encoding index (ENCODING_GET) and
 * identical bytes; returns non-zero otherwise. It does not order the
 * strings. The length test comes first, so memcmp() only ever runs on
 * buffers of equal length. */
525 fstring_cmp(VALUE a
, VALUE b
)
528 const char *aptr
, *bptr
;
529 RSTRING_GETMEM(a
, aptr
, alen
);
530 RSTRING_GETMEM(b
, bptr
, blen
);
531 return (alen
!= blen
||
532 ENCODING_GET(a
) != ENCODING_GET(b
) ||
533 memcmp(aptr
, bptr
, alen
) != 0);
537 single_byte_optimizable(VALUE str
)
541 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
542 if (ENC_CODERANGE(str
) == ENC_CODERANGE_7BIT
)
545 enc
= STR_ENC_GET(str
);
546 if (rb_enc_mbmaxlen(enc
) == 1)
549 /* Conservative. Possibly single byte.
550 * "\xa1" in Shift_JIS for example. */
556 static inline const char *
557 search_nonascii(const char *p
, const char *e
)
559 const uintptr_t *s
, *t
;
561 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
562 # if SIZEOF_UINTPTR_T == 8
563 # define NONASCII_MASK UINT64_C(0x8080808080808080)
564 # elif SIZEOF_UINTPTR_T == 4
565 # define NONASCII_MASK UINT32_C(0x80808080)
567 # error "don't know what to do."
570 # if SIZEOF_UINTPTR_T == 8
571 # define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
572 # elif SIZEOF_UINTPTR_T == 4
573 # define NONASCII_MASK 0x80808080UL /* or...? */
575 # error "don't know what to do."
579 if (UNALIGNED_WORD_ACCESS
|| e
- p
>= SIZEOF_VOIDP
) {
580 #if !UNALIGNED_WORD_ACCESS
581 if ((uintptr_t)p
% SIZEOF_VOIDP
) {
582 int l
= SIZEOF_VOIDP
- (uintptr_t)p
% SIZEOF_VOIDP
;
585 default: UNREACHABLE
;
587 case 7: if (p
[-7]&0x80) return p
-7;
588 case 6: if (p
[-6]&0x80) return p
-6;
589 case 5: if (p
[-5]&0x80) return p
-5;
590 case 4: if (p
[-4]&0x80) return p
-4;
592 case 3: if (p
[-3]&0x80) return p
-3;
593 case 2: if (p
[-2]&0x80) return p
-2;
594 case 1: if (p
[-1]&0x80) return p
-1;
599 #if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
600 #define aligned_ptr(value) \
601 __builtin_assume_aligned((value), sizeof(uintptr_t))
603 #define aligned_ptr(value) (uintptr_t *)(value)
606 t
= (uintptr_t *)(e
- (SIZEOF_VOIDP
-1));
609 if (*s
& NONASCII_MASK
) {
610 #ifdef WORDS_BIGENDIAN
611 return (const char *)s
+ (nlz_intptr(*s
&NONASCII_MASK
)>>3);
613 return (const char *)s
+ (ntz_intptr(*s
&NONASCII_MASK
)>>3);
621 default: UNREACHABLE
;
623 case 7: if (e
[-7]&0x80) return e
-7;
624 case 6: if (e
[-6]&0x80) return e
-6;
625 case 5: if (e
[-5]&0x80) return e
-5;
626 case 4: if (e
[-4]&0x80) return e
-4;
628 case 3: if (e
[-3]&0x80) return e
-3;
629 case 2: if (e
[-2]&0x80) return e
-2;
630 case 1: if (e
[-1]&0x80) return e
-1;
/* Classifies the `len` bytes starting at `p` under encoding `enc` and
 * returns one of the ENC_CODERANGE_* values:
 *   - ASCII-8BIT: 7BIT when search_nonascii() finds nothing, else VALID;
 *   - ASCII-compatible encodings: 7BIT when no non-ASCII byte exists;
 *     otherwise each character found at a non-ASCII position is checked
 *     with rb_enc_precise_mbclen(), and BROKEN is returned on the first
 *     one that is not a complete valid character;
 *   - other encodings: characters are checked with rb_enc_precise_mbclen(),
 *     BROKEN on the first failure.
 * VALID is returned when the scan completes.
 * NOTE(review): the loop headers and closing braces are not visible in this
 * excerpt; the description of iteration is inferred from the repeated
 * advance-and-search statements. */
636 coderange_scan(const char *p
, long len
, rb_encoding
*enc
)
638 const char *e
= p
+ len
;
640 if (rb_enc_to_index(enc
) == rb_ascii8bit_encindex()) {
641 /* enc is ASCII-8BIT. An ASCII-8BIT string can never be broken. */
642 p
= search_nonascii(p
, e
);
643 return p
? ENC_CODERANGE_VALID
: ENC_CODERANGE_7BIT
;
646 if (rb_enc_asciicompat(enc
)) {
647 p
= search_nonascii(p
, e
);
648 if (!p
) return ENC_CODERANGE_7BIT
;
650 int ret
= rb_enc_precise_mbclen(p
, e
, enc
);
651 if (!MBCLEN_CHARFOUND_P(ret
)) return ENC_CODERANGE_BROKEN
;
652 p
+= MBCLEN_CHARFOUND_LEN(ret
);
654 p
= search_nonascii(p
, e
);
660 int ret
= rb_enc_precise_mbclen(p
, e
, enc
);
661 if (!MBCLEN_CHARFOUND_P(ret
)) return ENC_CODERANGE_BROKEN
;
662 p
+= MBCLEN_CHARFOUND_LEN(ret
);
665 return ENC_CODERANGE_VALID
;
669 rb_str_coderange_scan_restartable(const char *s
, const char *e
, rb_encoding
*enc
, int *cr
)
673 if (*cr
== ENC_CODERANGE_BROKEN
)
676 if (rb_enc_to_index(enc
) == rb_ascii8bit_encindex()) {
677 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
678 if (*cr
== ENC_CODERANGE_VALID
) return e
- s
;
679 p
= search_nonascii(p
, e
);
680 *cr
= p
? ENC_CODERANGE_VALID
: ENC_CODERANGE_7BIT
;
683 else if (rb_enc_asciicompat(enc
)) {
684 p
= search_nonascii(p
, e
);
686 if (*cr
!= ENC_CODERANGE_VALID
) *cr
= ENC_CODERANGE_7BIT
;
690 int ret
= rb_enc_precise_mbclen(p
, e
, enc
);
691 if (!MBCLEN_CHARFOUND_P(ret
)) {
692 *cr
= MBCLEN_INVALID_P(ret
) ? ENC_CODERANGE_BROKEN
: ENC_CODERANGE_UNKNOWN
;
695 p
+= MBCLEN_CHARFOUND_LEN(ret
);
697 p
= search_nonascii(p
, e
);
703 int ret
= rb_enc_precise_mbclen(p
, e
, enc
);
704 if (!MBCLEN_CHARFOUND_P(ret
)) {
705 *cr
= MBCLEN_INVALID_P(ret
) ? ENC_CODERANGE_BROKEN
: ENC_CODERANGE_UNKNOWN
;
708 p
+= MBCLEN_CHARFOUND_LEN(ret
);
711 *cr
= ENC_CODERANGE_VALID
;
/* Gives `str1` the encoding index of `str2` by calling rb_enc_set_index()
 * with the value read through ENCODING_GET(). */
716 str_enc_copy(VALUE str1
, VALUE str2
)
718 rb_enc_set_index(str1
, ENCODING_GET(str2
));
721 /* Like str_enc_copy, but does not check frozen status of str1.
722 * You should use this only if you're certain that str1 is not frozen. */
724 str_enc_copy_direct(VALUE str1
, VALUE str2
)
726 int inlined_encoding
= RB_ENCODING_GET_INLINED(str2
);
/* NOTE(review): ENCODING_INLINE_MAX is presumably the sentinel meaning the
 * real index does not fit in the inline flag bits, hence the fallback to
 * rb_enc_get_index() / rb_enc_set_index(); otherwise the inline value is
 * copied as is. */
727 if (inlined_encoding
== ENCODING_INLINE_MAX
) {
728 rb_enc_set_index(str1
, rb_enc_get_index(str2
));
731 ENCODING_SET_INLINED(str1
, inlined_encoding
);
736 rb_enc_cr_str_copy_for_substr(VALUE dest
, VALUE src
)
738 /* this function is designed for copying encoding and coderange
739 * from src to new string "dest" which is made from the part of src.
741 str_enc_copy(dest
, src
);
742 if (RSTRING_LEN(dest
) == 0) {
743 if (!rb_enc_asciicompat(STR_ENC_GET(src
)))
744 ENC_CODERANGE_SET(dest
, ENC_CODERANGE_VALID
);
746 ENC_CODERANGE_SET(dest
, ENC_CODERANGE_7BIT
);
749 switch (ENC_CODERANGE(src
)) {
750 case ENC_CODERANGE_7BIT
:
751 ENC_CODERANGE_SET(dest
, ENC_CODERANGE_7BIT
);
753 case ENC_CODERANGE_VALID
:
754 if (!rb_enc_asciicompat(STR_ENC_GET(src
)) ||
755 search_nonascii(RSTRING_PTR(dest
), RSTRING_END(dest
)))
756 ENC_CODERANGE_SET(dest
, ENC_CODERANGE_VALID
);
758 ENC_CODERANGE_SET(dest
, ENC_CODERANGE_7BIT
);
/* Copies both the encoding (via str_enc_copy) and the stored coderange of
 * `src` onto `dest`, without rescanning the contents of `dest`. */
766 rb_enc_cr_str_exact_copy(VALUE dest
, VALUE src
)
768 str_enc_copy(dest
, src
);
769 ENC_CODERANGE_SET(dest
, ENC_CODERANGE(src
));
/* Runs coderange_scan() over the full contents of `str`, interpreting the
 * bytes as `enc` (which need not be the encoding stored on the string). */
773 enc_coderange_scan(VALUE str
, rb_encoding
*enc
)
775 return coderange_scan(RSTRING_PTR(str
), RSTRING_LEN(str
), enc
);
/* Thin wrapper that forwards to enc_coderange_scan() with the same
 * arguments and returns its result. */
779 rb_enc_str_coderange_scan(VALUE str
, rb_encoding
*enc
)
781 return enc_coderange_scan(str
, enc
);
/* Reads the coderange stored on `str`. When it is ENC_CODERANGE_UNKNOWN the
 * string is scanned under its own encoding and the result is stored back on
 * the string, so later calls can reuse it.
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably `cr` is returned. */
785 rb_enc_str_coderange(VALUE str
)
787 int cr
= ENC_CODERANGE(str
);
789 if (cr
== ENC_CODERANGE_UNKNOWN
) {
790 cr
= enc_coderange_scan(str
, get_encoding(str
));
791 ENC_CODERANGE_SET(str
, cr
);
797 rb_enc_str_asciionly_p(VALUE str
)
799 rb_encoding
*enc
= STR_ENC_GET(str
);
801 if (!rb_enc_asciicompat(enc
))
803 else if (is_ascii_string(str
))
/* Raises RuntimeError when the buffer pointer or the length of `s` no
 * longer matches the previously captured `p` / `len`, i.e. the string was
 * changed since the caller took its snapshot. */
809 str_mod_check(VALUE s
, const char *p
, long len
)
811 if (RSTRING_PTR(s
) != p
|| RSTRING_LEN(s
) != len
){
812 rb_raise(rb_eRuntimeError
, "string modified");
/* Capacity of `str` in bytes, excluding the terminator:
 *   - embedded: the embedded capacity minus `termlen`;
 *   - STR_SHARED or STR_NOFREE set: the current length is reported, since
 *     the capacity field is not used for these strings here;
 *   - otherwise: the recorded heap capacity (`as.heap.aux.capa`). */
817 str_capacity(VALUE str
, const int termlen
)
819 if (STR_EMBED_P(str
)) {
820 return str_embed_capa(str
) - termlen
;
822 else if (FL_TEST(str
, STR_SHARED
|STR_NOFREE
)) {
823 return RSTRING(str
)->len
;
826 return RSTRING(str
)->as
.heap
.aux
.capa
;
/* str_capacity() with the terminator length derived from the string itself
 * through TERM_LEN(). */
831 rb_str_capacity(VALUE str
)
833 return str_capacity(str
, TERM_LEN(str
));
837 must_not_null(const char *ptr
)
840 rb_raise(rb_eArgError
, "NULL pointer given");
845 str_alloc_embed(VALUE klass
, size_t capa
)
847 size_t size
= rb_str_embed_size(capa
);
848 RUBY_ASSERT(size
> 0);
849 RUBY_ASSERT(rb_gc_size_allocatable_p(size
));
851 NEWOBJ_OF(str
, struct RString
, klass
,
852 T_STRING
| (RGENGC_WB_PROTECTED_STRING
? FL_WB_PROTECTED
: 0), size
, 0);
858 str_alloc_heap(VALUE klass
)
860 NEWOBJ_OF(str
, struct RString
, klass
,
861 T_STRING
| STR_NOEMBED
| (RGENGC_WB_PROTECTED_STRING
? FL_WB_PROTECTED
: 0), sizeof(struct RString
), 0);
867 empty_str_alloc(VALUE klass
)
869 RUBY_DTRACE_CREATE_HOOK(STRING
, 0);
870 VALUE str
= str_alloc_embed(klass
, 0);
871 memset(RSTRING(str
)->as
.embed
.ary
, 0, str_embed_capa(str
));
876 str_new0(VALUE klass
, const char *ptr
, long len
, int termlen
)
881 rb_raise(rb_eArgError
, "negative string size (or size too big)");
884 RUBY_DTRACE_CREATE_HOOK(STRING
, len
);
886 if (STR_EMBEDDABLE_P(len
, termlen
)) {
887 str
= str_alloc_embed(klass
, len
+ termlen
);
889 ENC_CODERANGE_SET(str
, ENC_CODERANGE_7BIT
);
893 str
= str_alloc_heap(klass
);
894 RSTRING(str
)->as
.heap
.aux
.capa
= len
;
895 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
896 * integer overflow. If we can STATIC_ASSERT that, the following
897 * mul_add_mul can be reverted to a simple ALLOC_N. */
898 RSTRING(str
)->as
.heap
.ptr
=
899 rb_xmalloc_mul_add_mul(sizeof(char), len
, sizeof(char), termlen
);
902 memcpy(RSTRING_PTR(str
), ptr
, len
);
904 STR_SET_LEN(str
, len
);
905 TERM_FILL(RSTRING_PTR(str
) + len
, termlen
);
/* Creates a string of class `klass` from `len` bytes at `ptr` by calling
 * str_new0() with a terminator length of 1. */
910 str_new(VALUE klass
, const char *ptr
, long len
)
912 return str_new0(klass
, ptr
, len
, 1);
/* Creates an rb_cString instance from `len` bytes at `ptr`; forwards to
 * str_new(). No encoding is set by this function itself. */
916 rb_str_new(const char *ptr
, long len
)
918 return str_new(rb_cString
, ptr
, len
);
/* rb_str_new() followed by setting the encoding index to US-ASCII and the
 * coderange to ENC_CODERANGE_7BIT in a single step. The contents are not
 * scanned, so the caller is trusted to pass 7-bit data.
 * NOTE(review): the return statement is not visible in this excerpt. */
922 rb_usascii_str_new(const char *ptr
, long len
)
924 VALUE str
= rb_str_new(ptr
, len
);
925 ENCODING_CODERANGE_SET(str
, rb_usascii_encindex(), ENC_CODERANGE_7BIT
);
/* Creates an rb_cString instance from `len` bytes at `ptr` and associates
 * the UTF-8 encoding index with it.
 * NOTE(review): the return statement is not visible in this excerpt. */
930 rb_utf8_str_new(const char *ptr
, long len
)
932 VALUE str
= str_new(rb_cString
, ptr
, len
);
933 rb_enc_associate_index(str
, rb_utf8_encindex());
/* Creates a string from `len` bytes at `ptr` in encoding `enc`.
 * A NULL `enc` falls back to plain rb_str_new(). Otherwise the string is
 * built with a terminator as long as the minimum character length of the
 * encoding, and `enc` is then associated with it.
 * NOTE(review): the declaration of `str` and the final return are not
 * visible in this excerpt. */
938 rb_enc_str_new(const char *ptr
, long len
, rb_encoding
*enc
)
942 if (!enc
) return rb_str_new(ptr
, len
);
944 str
= str_new0(rb_cString
, ptr
, len
, rb_enc_mbminlen(enc
));
945 rb_enc_associate(str
, enc
);
950 rb_str_new_cstr(const char *ptr
)
953 /* rb_str_new_cstr() can take pointer from non-malloc-generated
954 * memory regions, and that cannot be detected by the MSAN. Just
955 * trust the programmer that the argument passed here is a sane C
957 __msan_unpoison_string(ptr
);
958 return rb_str_new(ptr
, strlen(ptr
));
/* rb_str_new_cstr() followed by setting the encoding index to US-ASCII and
 * the coderange to ENC_CODERANGE_7BIT; the contents are not scanned.
 * NOTE(review): the return statement is not visible in this excerpt. */
962 rb_usascii_str_new_cstr(const char *ptr
)
964 VALUE str
= rb_str_new_cstr(ptr
);
965 ENCODING_CODERANGE_SET(str
, rb_usascii_encindex(), ENC_CODERANGE_7BIT
);
/* rb_str_new_cstr() followed by associating the UTF-8 encoding index.
 * NOTE(review): the return statement is not visible in this excerpt. */
970 rb_utf8_str_new_cstr(const char *ptr
)
972 VALUE str
= rb_str_new_cstr(ptr
);
973 rb_enc_associate_index(str
, rb_utf8_encindex());
/* Creates a string in encoding `enc` from a NUL-terminated C string.
 * Raises ArgumentError when the minimum character length of `enc` is not
 * one byte, because strlen() cannot measure such data. */
978 rb_enc_str_new_cstr(const char *ptr
, rb_encoding
*enc
)
981 if (rb_enc_mbminlen(enc
) != 1) {
982 rb_raise(rb_eArgError
, "wchar encoding given");
984 return rb_enc_str_new(ptr
, strlen(ptr
), enc
);
988 str_new_static(VALUE klass
, const char *ptr
, long len
, int encindex
)
993 rb_raise(rb_eArgError
, "negative string size (or size too big)");
997 rb_encoding
*enc
= rb_enc_get_from_index(encindex
);
998 str
= str_new0(klass
, ptr
, len
, rb_enc_mbminlen(enc
));
1001 RUBY_DTRACE_CREATE_HOOK(STRING
, len
);
1002 str
= str_alloc_heap(klass
);
1003 RSTRING(str
)->len
= len
;
1004 RSTRING(str
)->as
.heap
.ptr
= (char *)ptr
;
1005 RSTRING(str
)->as
.heap
.aux
.capa
= len
;
1006 RBASIC(str
)->flags
|= STR_NOFREE
;
1008 rb_enc_associate_index(str
, encindex
);
/* str_new_static() for class rb_cString with encoding index 0. */
1013 rb_str_new_static(const char *ptr
, long len
)
1015 return str_new_static(rb_cString
, ptr
, len
, 0);
/* str_new_static() for class rb_cString with the US-ASCII encoding index. */
1019 rb_usascii_str_new_static(const char *ptr
, long len
)
1021 return str_new_static(rb_cString
, ptr
, len
, ENCINDEX_US_ASCII
);
/* str_new_static() for class rb_cString with the UTF-8 encoding index. */
1025 rb_utf8_str_new_static(const char *ptr
, long len
)
1027 return str_new_static(rb_cString
, ptr
, len
, ENCINDEX_UTF_8
);
/* str_new_static() for class rb_cString with the index of the given
 * encoding `enc`, obtained through rb_enc_to_index(). */
1031 rb_enc_str_new_static(const char *ptr
, long len
, rb_encoding
*enc
)
1033 return str_new_static(rb_cString
, ptr
, len
, rb_enc_to_index(enc
));
1036 static VALUE
str_cat_conv_enc_opts(VALUE newstr
, long ofs
, const char *ptr
, long len
,
1037 rb_encoding
*from
, rb_encoding
*to
,
1038 int ecflags
, VALUE ecopts
);
/* Whether `str` is ASCII-only when its bytes are read as encoding `enc`.
 * If `enc` is the encoding already stored on the string, the answer comes
 * from is_ascii_string(); otherwise the bytes are rescanned under `enc`
 * and the result is true only for ENC_CODERANGE_7BIT. */
1041 is_enc_ascii_string(VALUE str
, rb_encoding
*enc
)
1043 int encidx
= rb_enc_to_index(enc
);
1044 if (rb_enc_get_index(str
) == encidx
)
1045 return is_ascii_string(str
);
1046 return enc_coderange_scan(str
, enc
) == ENC_CODERANGE_7BIT
;
1050 rb_str_conv_enc_opts(VALUE str
, rb_encoding
*from
, rb_encoding
*to
, int ecflags
, VALUE ecopts
)
1056 if (!to
) return str
;
1057 if (!from
) from
= rb_enc_get(str
);
1058 if (from
== to
) return str
;
1059 if ((rb_enc_asciicompat(to
) && is_enc_ascii_string(str
, from
)) ||
1060 rb_is_ascii8bit_enc(to
)) {
1061 if (STR_ENC_GET(str
) != to
) {
1062 str
= rb_str_dup(str
);
1063 rb_enc_associate(str
, to
);
1068 RSTRING_GETMEM(str
, ptr
, len
);
1069 newstr
= str_cat_conv_enc_opts(rb_str_buf_new(len
), 0, ptr
, len
,
1070 from
, to
, ecflags
, ecopts
);
1071 if (NIL_P(newstr
)) {
1072 /* some error, return original */
1079 rb_str_cat_conv_enc_opts(VALUE newstr
, long ofs
, const char *ptr
, long len
,
1080 rb_encoding
*from
, int ecflags
, VALUE ecopts
)
1084 olen
= RSTRING_LEN(newstr
);
1085 if (ofs
< -olen
|| olen
< ofs
)
1086 rb_raise(rb_eIndexError
, "index %ld out of string", ofs
);
1087 if (ofs
< 0) ofs
+= olen
;
1089 STR_SET_LEN(newstr
, ofs
);
1090 return rb_str_cat(newstr
, ptr
, len
);
1093 rb_str_modify(newstr
);
1094 return str_cat_conv_enc_opts(newstr
, ofs
, ptr
, len
, from
,
/* Re-initialises `str` in place: the length is reset to 0, `enc` is
 * associated with the string, and then `len` bytes from `ptr` are appended
 * with rb_str_cat().
 * NOTE(review): the return statement is not visible in this excerpt. */
1100 rb_str_initialize(VALUE str
, const char *ptr
, long len
, rb_encoding
*enc
)
1102 STR_SET_LEN(str
, 0);
1103 rb_enc_associate(str
, enc
);
1104 rb_str_cat(str
, ptr
, len
);
1109 str_cat_conv_enc_opts(VALUE newstr
, long ofs
, const char *ptr
, long len
,
1110 rb_encoding
*from
, rb_encoding
*to
,
1111 int ecflags
, VALUE ecopts
)
1114 rb_econv_result_t ret
;
1116 VALUE econv_wrapper
;
1117 const unsigned char *start
, *sp
;
1118 unsigned char *dest
, *dp
;
1119 size_t converted_output
= (size_t)ofs
;
1121 olen
= rb_str_capacity(newstr
);
1123 econv_wrapper
= rb_obj_alloc(rb_cEncodingConverter
);
1124 RBASIC_CLEAR_CLASS(econv_wrapper
);
1125 ec
= rb_econv_open_opts(from
->name
, to
->name
, ecflags
, ecopts
);
1126 if (!ec
) return Qnil
;
1127 DATA_PTR(econv_wrapper
) = ec
;
1129 sp
= (unsigned char*)ptr
;
1131 while ((dest
= (unsigned char*)RSTRING_PTR(newstr
)),
1132 (dp
= dest
+ converted_output
),
1133 (ret
= rb_econv_convert(ec
, &sp
, start
+ len
, &dp
, dest
+ olen
, 0)),
1134 ret
== econv_destination_buffer_full
) {
1135 /* destination buffer short */
1136 size_t converted_input
= sp
- start
;
1137 size_t rest
= len
- converted_input
;
1138 converted_output
= dp
- dest
;
1139 rb_str_set_len(newstr
, converted_output
);
1140 if (converted_input
&& converted_output
&&
1141 rest
< (LONG_MAX
/ converted_output
)) {
1142 rest
= (rest
* converted_output
) / converted_input
;
1147 olen
+= rest
< 2 ? 2 : rest
;
1148 rb_str_resize(newstr
, olen
);
1150 DATA_PTR(econv_wrapper
) = 0;
1153 case econv_finished
:
1154 len
= dp
- (unsigned char*)RSTRING_PTR(newstr
);
1155 rb_str_set_len(newstr
, len
);
1156 rb_enc_associate(newstr
, to
);
1165 rb_str_conv_enc(VALUE str
, rb_encoding
*from
, rb_encoding
*to
)