diff options
-rw-r--r-- | spec/ruby/core/string/append_as_bytes_spec.rb | 46 | ||||
-rw-r--r-- | string.c | 184 | ||||
-rw-r--r-- | test/ruby/test_string.rb | 49 |
3 files changed, 279 insertions, 0 deletions
diff --git a/spec/ruby/core/string/append_as_bytes_spec.rb b/spec/ruby/core/string/append_as_bytes_spec.rb
new file mode 100644
index 0000000000..0e1d09558b
--- /dev/null
+++ b/spec/ruby/core/string/append_as_bytes_spec.rb
@@ -0,0 +1,46 @@
+require_relative '../../spec_helper'
+
+describe "String#append_as_bytes" do
+  ruby_version_is "3.4" do
+    it "doesn't allow to mutate frozen strings" do
+      str = "hello".freeze
+      -> { str.append_as_bytes("\xE2\x82") }.should raise_error(FrozenError)
+    end
+
+    it "allows creating broken strings" do
+      str = +"hello"
+      str.append_as_bytes("\xE2\x82")
+      str.valid_encoding?.should == false
+
+      str.append_as_bytes("\xAC")
+      str.valid_encoding?.should == true
+
+      str = "abc".encode(Encoding::UTF_32LE)
+      str.append_as_bytes("def")
+      str.encoding.should == Encoding::UTF_32LE
+      str.valid_encoding?.should == false
+    end
+
+    it "never changes the receiver encoding" do
+      str = "".b
+      str.append_as_bytes("€")
+      str.encoding.should == Encoding::BINARY
+    end
+
+    it "accepts variadic String or Integer arguments" do
+      str = "hello".b
+      str.append_as_bytes("\xE2\x82", 12, 43, "\xAC")
+      str.encoding.should == Encoding::BINARY
+      str.should == "hello\xE2\x82\f+\xAC".b
+    end
+
+    it "only accepts strings or integers, and doesn't attempt to cast with #to_str or #to_int" do
+      to_str = mock("to_str")
+      to_str.should_not_receive(:to_str)
+      to_str.should_not_receive(:to_int)
+
+      str = +"hello"
+      -> { str.append_as_bytes(to_str) }.should raise_error(TypeError, "wrong argument type MockObject (expected String or Integer)")
+    end
+  end
+end
diff --git a/string.c b/string.c
--- a/string.c
+++ b/string.c
@@ -3308,6 +3308,42 @@ rb_str_resize(VALUE str, long len)
     return str;
 }
 
+/*
+ * Makes sure +str+ has room for +len+ more bytes after its current content.
+ *
+ * The string is first prepared for modification with str_modify_keep_cr();
+ * then, if needed, its capacity is grown by repeated doubling until the
+ * combined size fits. The caller is expected to write the bytes and update
+ * the length itself.
+ *
+ * Raises ArgumentError if the combined size would overflow a long.
+ */
+static void
+str_ensure_available_capa(VALUE str, long len)
+{
+    str_modify_keep_cr(str);
+
+    const int termlen = TERM_LEN(str);
+    long olen = RSTRING_LEN(str);
+
+    if (RB_UNLIKELY(olen > LONG_MAX - len)) {
+        rb_raise(rb_eArgError, "string sizes too big");
+    }
+
+    long total = olen + len;
+    long capa = str_capacity(str, termlen);
+
+    if (capa < total) {
+        /* Past LONG_MAX / 2 doubling could overflow: request the exact size. */
+        if (total >= LONG_MAX / 2) {
+            capa = total;
+        }
+        while (total > capa) {
+            capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
+        }
+        RESIZE_CAPA_TERM(str, capa, termlen);
+    }
+}
+
 static VALUE
 str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
 {
@@ -3664,6 +3700,153 @@ rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
 
 /*
  *  call-seq:
+ *    append_as_bytes(*objects) -> string
+ *
+ *  Concatenates each object in +objects+ into +self+ without any encoding
+ *  validation or conversion and returns +self+:
+ *
+ *    s = 'foo'
+ *    s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
+ *    s.valid_encoding? # => false
+ *    s.append_as_bytes("\xAC 12")
+ *    s.valid_encoding? # => true
+ *
+ *  For each given object +object+ that is an Integer,
+ *  the value is considered a Byte. If the Integer is bigger
+ *  than one byte, only the lower byte is considered, similar to String#setbyte:
+ *
+ *    s = ""
+ *    s.append_as_bytes(0, 257) # => "\u0000\u0001"
+ *
+ *  Related: String#<<, String#concat, which do an encoding aware concatenation.
+ */
+
+VALUE
+rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
+{
+    long needed_capacity = 0;
+    volatile VALUE t0;
+    enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
+
+    /* First pass: validate every argument and add up the bytes to append. */
+    for (int index = 0; index < argc; index++) {
+        VALUE obj = argv[index];
+        enum ruby_value_type type = types[index] = rb_type(obj);
+        switch (type) {
+          case T_FIXNUM:
+          case T_BIGNUM:
+            needed_capacity++;
+            break;
+          case T_STRING:
+            needed_capacity += RSTRING_LEN(obj);
+            break;
+          default:
+            rb_raise(
+                rb_eTypeError,
+                "wrong argument type %"PRIsVALUE" (expected String or Integer)",
+                rb_obj_class(obj)
+            );
+            break;
+        }
+    }
+
+    /* Reserve all the room up front: the copy loop writes through sptr. */
+    str_ensure_available_capa(str, needed_capacity);
+    char *sptr = RSTRING_END(str);
+
+    /* Second pass: copy the raw bytes right after the current content. */
+    for (int index = 0; index < argc; index++) {
+        VALUE obj = argv[index];
+        enum ruby_value_type type = types[index];
+        switch (type) {
+          case T_FIXNUM:
+          case T_BIGNUM: {
+            /* Keep only the low byte. The masked value is stored back in
+             * argv so the coderange check below sees the written byte. */
+            argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
+            char byte = (char)(NUM2INT(obj) & 0xFF);
+            *sptr = byte;
+            sptr++;
+            break;
+          }
+          case T_STRING: {
+            const char *ptr;
+            long len;
+            RSTRING_GETMEM(obj, ptr, len);
+            memcpy(sptr, ptr, len);
+            sptr += len;
+            break;
+          }
+          default:
+            UNREACHABLE;
+            RUBY_ASSERT("append_as_bytes arguments should have been validated");
+            break;
+        }
+    }
+
+    STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
+    TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
+
+    /* Decide whether the coderange cached on +str+ can be kept. */
+    int cr = ENC_CODERANGE(str);
+    switch (cr) {
+      case ENC_CODERANGE_7BIT: {
+        /* Stays 7bit only if everything appended is ASCII as well. */
+        for (int index = 0; index < argc; index++) {
+            VALUE obj = argv[index];
+            enum ruby_value_type type = types[index];
+            switch (type) {
+              case T_FIXNUM:
+              case T_BIGNUM: {
+                if (!ISASCII(NUM2INT(obj))) {
+                    goto clear_cr;
+                }
+                break;
+              }
+              case T_STRING: {
+                if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
+                    goto clear_cr;
+                }
+                break;
+              }
+              default:
+                UNREACHABLE;
+                RUBY_ASSERT("append_as_bytes arguments should have been validated");
+                break;
+            }
+        }
+        goto keep_cr;
+      }
+      case ENC_CODERANGE_VALID:
+        /* In binary every byte sequence is valid. */
+        if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
+            goto keep_cr;
+        }
+        else {
+            goto clear_cr;
+        }
+        break;
+      default:
+        goto clear_cr;
+        break;
+    }
+
+  clear_cr:
+    // If no fast path was hit, we clear the coderange.
+    // append_as_bytes is predominantly meant to be used in
+    // buffering situations, hence it's likely the coderange
+    // will never be scanned, so it's not worth spending time
+    // precomputing the coderange except for simple and common
+    // situations.
+    ENC_CODERANGE_CLEAR(str);
+  keep_cr:
+    /* Keep t0, which owns the +types+ buffer, alive up to this point. */
+    RB_GC_GUARD(t0);
+    return str;
+}
+
+/*
+ *  call-seq:
  *    string << object -> string
  *
  *  Concatenates +object+ to +self+ and returns +self+:
@@ -12433,6 +12616,7 @@ Init_String(void)
     rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
     rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
     rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
+    rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
     rb_define_method(rb_cString, "<<", rb_str_concat, 1);
     rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
     rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
diff --git a/test/ruby/test_string.rb b/test/ruby/test_string.rb
index 7b3d63dd1b..8658097ae4 100644
--- a/test/ruby/test_string.rb
+++ b/test/ruby/test_string.rb
@@ -3630,6 +3630,55 @@ CODE
     assert_bytesplice_raise(ArgumentError, S("hello"), 0..-1, "bye", 0, 3)
   end
 
+  def test_append_bytes_into_binary
+    buf = S("".b)
+    assert_equal Encoding::BINARY, buf.encoding
+
+    buf.append_as_bytes(S("hello"))
+    assert_equal "hello".b, buf
+    assert_equal Encoding::BINARY, buf.encoding
+
+    buf.append_as_bytes(S("こんにちは"))
+    assert_equal S("helloこんにちは".b), buf
+    assert_equal Encoding::BINARY, buf.encoding
+  end
+
+  def test_append_bytes_into_utf8
+    buf = S("")
+    assert_equal Encoding::UTF_8, buf.encoding
+
+    buf.append_as_bytes(S("hello"))
+    assert_equal S("hello"), buf
+    assert_equal Encoding::UTF_8, buf.encoding
+    assert_predicate buf, :ascii_only?
+    assert_predicate buf, :valid_encoding?
+
+    buf.append_as_bytes(S("こんにちは"))
+    assert_equal S("helloこんにちは"), buf
+    assert_equal Encoding::UTF_8, buf.encoding
+    refute_predicate buf, :ascii_only?
+    assert_predicate buf, :valid_encoding?
+
+    buf.append_as_bytes(S("\xE2\x82".b))
+    assert_equal S("helloこんにちは\xE2\x82"), buf
+    assert_equal Encoding::UTF_8, buf.encoding
+    refute_predicate buf, :valid_encoding?
+
+    buf.append_as_bytes(S("\xAC".b))
+    assert_equal S("helloこんにちは€"), buf
+    assert_equal Encoding::UTF_8, buf.encoding
+    assert_predicate buf, :valid_encoding?
+  end
+
+  def test_append_bytes_into_utf32
+    buf = S("abc".encode(Encoding::UTF_32LE))
+    assert_equal Encoding::UTF_32LE, buf.encoding
+
+    buf.append_as_bytes("def")
+    assert_equal Encoding::UTF_32LE, buf.encoding
+    refute_predicate buf, :valid_encoding?
+  end
+
   def test_chilled_string
     chilled_string = eval('"chilled"')