summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJean Boussier <[email protected]>2024-08-01 11:14:19 +0200
committerJean Boussier <[email protected]>2024-09-09 15:04:51 +0200
commit16f241f0aa047ed77ccea6b6c361b421a72d0454 (patch)
tree6eccf0e125885d7221343b7026cfb5703745f153
parent966901b39dd7b37eec68dd6e8a76a79827dc522f (diff)
Implement String#append_as_bytes(String | Integer, ...)
[Feature #20594] A handy method to construct a string out of multiple chunks. Contrary to `String#concat`, it doesn't do any encoding negotiation, and simply appends the content as bytes regardless of whether this results in a broken string or not. It's the caller's responsibility to check for `String#valid_encoding?` in cases where it's needed. When passed integers, only the lower byte is considered, like in `String#setbyte`.
Notes
Notes: Merged: https://github.com/ruby/ruby/pull/11552
-rw-r--r--spec/ruby/core/string/append_as_bytes_spec.rb46
-rw-r--r--string.c165
-rw-r--r--test/ruby/test_string.rb49
3 files changed, 260 insertions, 0 deletions
diff --git a/spec/ruby/core/string/append_as_bytes_spec.rb b/spec/ruby/core/string/append_as_bytes_spec.rb
new file mode 100644
index 0000000000..0e1d09558b
--- /dev/null
+++ b/spec/ruby/core/string/append_as_bytes_spec.rb
@@ -0,0 +1,46 @@
+require_relative '../../spec_helper'
+
+describe "String#append_bytes" do
+ ruby_version_is "3.4" do
+ it "doesn't allow to mutate frozen strings" do
+ str = "hello".freeze
+ -> { str.append_as_bytes("\xE2\x82") }.should raise_error(FrozenError)
+ end
+
+ it "allows creating broken strings" do
+ str = +"hello"
+ str.append_as_bytes("\xE2\x82")
+ str.valid_encoding?.should == false
+
+ str.append_as_bytes("\xAC")
+ str.valid_encoding?.should == true
+
+ str = "abc".encode(Encoding::UTF_32LE)
+ str.append_as_bytes("def")
+ str.encoding.should == Encoding::UTF_32LE
+ str.valid_encoding?.should == false
+ end
+
+ it "never changes the receiver encoding" do
+ str = "".b
+ str.append_as_bytes("€")
+ str.encoding.should == Encoding::BINARY
+ end
+
+ it "accepts variadic String or Integer arguments" do
+ str = "hello".b
+ str.append_as_bytes("\xE2\x82", 12, 43, "\xAC")
+ str.encoding.should == Encoding::BINARY
+ str.should == "hello\xE2\x82\f+\xAC".b
+ end
+
+ it "only accepts strings or integers, and doesn't attempt to cast with #to_str or #to_int" do
+ to_str = mock("to_str")
+ to_str.should_not_receive(:to_str)
+ to_str.should_not_receive(:to_int)
+
+ str = +"hello"
+ -> { str.append_as_bytes(to_str) }.should raise_error(TypeError, "wrong argument type MockObject (expected String or Integer)")
+ end
+ end
+end
diff --git a/string.c b/string.c
index d9569d1515..de39e84cb1 100644
--- a/string.c
+++ b/string.c
@@ -3308,6 +3308,32 @@ rb_str_resize(VALUE str, long len)
return str;
}
+/*
+ * Prepares +str+ for an in-place append of +len+ additional bytes.
+ *
+ * Calls str_modify_keep_cr() on +str+ first, then grows the buffer (if the
+ * current capacity is too small) so that the existing content plus +len+
+ * bytes fits. Growth doubles the capacity (terminator included) until it is
+ * large enough, except for very large totals (>= LONG_MAX / 2) where the
+ * exact size is requested instead so the doubling cannot overflow.
+ *
+ * Raises ArgumentError when the current length plus +len+ would exceed
+ * LONG_MAX. The string length itself is left untouched.
+ */
+static void
+str_ensure_available_capa(VALUE str, long len)
+{
+    str_modify_keep_cr(str);
+
+    const int termlen = TERM_LEN(str);
+    const long current_len = RSTRING_LEN(str);
+
+    if (RB_UNLIKELY(current_len > LONG_MAX - len)) {
+        rb_raise(rb_eArgError, "string sizes too big");
+    }
+
+    const long wanted = current_len + len;
+    long new_capa = str_capacity(str, termlen);
+
+    /* Already enough room: nothing to resize. */
+    if (new_capa >= wanted) {
+        return;
+    }
+
+    if (wanted >= LONG_MAX / 2) {
+        new_capa = wanted;
+    }
+    else {
+        /* new_capa < wanted here, so at least one doubling is required. */
+        do {
+            new_capa = 2 * new_capa + termlen; /* == 2*(capa+termlen)-termlen */
+        } while (new_capa < wanted);
+    }
+
+    RESIZE_CAPA_TERM(str, new_capa, termlen);
+}
+
+
static VALUE
str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
{
@@ -3664,6 +3690,144 @@ rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
/*
* call-seq:
+ *   append_as_bytes(*objects) -> string
+ *
+ * Concatenates each object in +objects+ into +self+ without any encoding
+ * validation or conversion and returns +self+:
+ *
+ *   s = 'foo'
+ *   s.append_as_bytes(" \xE2\x82")  # => "foo \xE2\x82"
+ *   s.valid_encoding?               # => false
+ *   s.append_as_bytes("\xAC 12")
+ *   s.valid_encoding?               # => true
+ *
+ * For each given object +object+ that is an Integer,
+ * the value is considered a byte. If the Integer is bigger
+ * than one byte, only the lower byte is considered, similar to String#setbyte:
+ *
+ *   s = ""
+ *   s.append_as_bytes(0, 257)       # => "\u0000\u0001"
+ *
+ * Raises TypeError if any object is neither a String nor an Integer;
+ * no implicit conversion is attempted.
+ *
+ * Related: String#<<, String#concat, which do an encoding aware concatenation.
+ */
+
+VALUE
+rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
+{
+    long needed_capacity = 0;
+    volatile VALUE t0;
+    enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
+
+    /*
+     * Pass 1: validate every argument and compute how many bytes will be
+     * appended. The type of each argument is remembered so that the later
+     * passes do not have to call rb_type() again.
+     */
+    for (int index = 0; index < argc; index++) {
+        VALUE obj = argv[index];
+        enum ruby_value_type type = types[index] = rb_type(obj);
+        long chunk_len;
+
+        switch (type) {
+          case T_FIXNUM:
+          case T_BIGNUM:
+            chunk_len = 1;
+            break;
+          case T_STRING:
+            chunk_len = RSTRING_LEN(obj);
+            break;
+          default:
+            rb_raise(
+                rb_eTypeError,
+                "wrong argument type %"PRIsVALUE" (expected String or Integer)",
+                rb_obj_class(obj)
+            );
+            break;
+        }
+
+        /* Guard the running sum itself: it can overflow when long is 32 bits. */
+        if (RB_UNLIKELY(chunk_len > LONG_MAX - needed_capacity)) {
+            rb_raise(rb_eArgError, "string sizes too big");
+        }
+        needed_capacity += chunk_len;
+    }
+
+    str_ensure_available_capa(str, needed_capacity);
+    char *sptr = RSTRING_END(str);
+
+    /* Pass 2: copy the raw bytes right after the current content. */
+    for (int index = 0; index < argc; index++) {
+        VALUE obj = argv[index];
+        switch (types[index]) {
+          case T_FIXNUM:
+          case T_BIGNUM: {
+            /*
+             * Keep only the low byte. The masked value is stored back into
+             * argv so the coderange pass below can read it with NUM2INT().
+             */
+            argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
+            *sptr++ = (char)(NUM2INT(obj) & 0xFF);
+            break;
+          }
+          case T_STRING: {
+            const char *ptr;
+            long len;
+            RSTRING_GETMEM(obj, ptr, len);
+            memcpy(sptr, ptr, len);
+            sptr += len;
+            break;
+          }
+          default:
+            RUBY_ASSERT(0 && "append_as_bytes arguments should have been validated");
+            UNREACHABLE;
+            break;
+        }
+    }
+
+    STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
+    TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
+
+    /*
+     * Pass 3: decide whether the cached coderange of the receiver is still
+     * accurate. Only two cheap cases keep it:
+     *   - the receiver was 7bit and everything appended is 7bit too;
+     *   - the receiver is a valid binary (ASCII-8BIT) string.
+     */
+    bool keep_cr = false;
+    switch (ENC_CODERANGE(str)) {
+      case ENC_CODERANGE_7BIT:
+        keep_cr = true;
+        for (int index = 0; keep_cr && index < argc; index++) {
+            VALUE obj = argv[index];
+            switch (types[index]) {
+              case T_FIXNUM:
+              case T_BIGNUM:
+                if (!ISASCII(NUM2INT(obj))) {
+                    keep_cr = false;
+                }
+                break;
+              case T_STRING:
+                if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
+                    keep_cr = false;
+                }
+                break;
+              default:
+                RUBY_ASSERT(0 && "append_as_bytes arguments should have been validated");
+                UNREACHABLE;
+                break;
+            }
+        }
+        break;
+      case ENC_CODERANGE_VALID:
+        keep_cr = (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT);
+        break;
+      default:
+        break;
+    }
+
+    if (!keep_cr) {
+        // If no fast path was hit, we clear the coderange.
+        // append_as_bytes is predominantly meant to be used in
+        // buffering situation, hence it's likely the coderange
+        // will never be scanned, so it's not worth spending time
+        // precomputing the coderange except for simple and common
+        // situations.
+        ENC_CODERANGE_CLEAR(str);
+    }
+
+    /* Keep the temporary buffer behind `types` alive until this point. */
+    RB_GC_GUARD(t0);
+
+    return str;
+}
+
+
+/*
+ * call-seq:
* string << object -> string
*
* Concatenates +object+ to +self+ and returns +self+:
@@ -12433,6 +12597,7 @@ Init_String(void)
rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
+ rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
rb_define_method(rb_cString, "<<", rb_str_concat, 1);
rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
diff --git a/test/ruby/test_string.rb b/test/ruby/test_string.rb
index 7b3d63dd1b..8658097ae4 100644
--- a/test/ruby/test_string.rb
+++ b/test/ruby/test_string.rb
@@ -3630,6 +3630,55 @@ CODE
assert_bytesplice_raise(ArgumentError, S("hello"), 0..-1, "bye", 0, 3)
end
+ # Appending to a binary buffer keeps Encoding::BINARY, both for an ASCII
+ # argument and for an argument holding multibyte text.
+ # NOTE(review): S() is a helper defined elsewhere in this test file;
+ # presumably it builds an instance of the string class under test.
+ def test_append_bytes_into_binary
+ buf = S("".b)
+ assert_equal Encoding::BINARY, buf.encoding
+
+ buf.append_as_bytes(S("hello"))
+ assert_equal "hello".b, buf
+ assert_equal Encoding::BINARY, buf.encoding
+
+ buf.append_as_bytes(S("こんにちは"))
+ assert_equal S("helloこんにちは".b), buf
+ assert_equal Encoding::BINARY, buf.encoding
+ end
+
+ # Appending to a UTF-8 buffer never changes its encoding. The steps below
+ # build on each other: ASCII only, then multibyte text, then a truncated
+ # byte sequence (buffer becomes invalid), then the missing final byte
+ # (buffer is valid again and ends with "€").
+ def test_append_bytes_into_utf8
+ buf = S("")
+ assert_equal Encoding::UTF_8, buf.encoding
+
+ buf.append_as_bytes(S("hello"))
+ assert_equal S("hello"), buf
+ assert_equal Encoding::UTF_8, buf.encoding
+ assert_predicate buf, :ascii_only?
+ assert_predicate buf, :valid_encoding?
+
+ buf.append_as_bytes(S("こんにちは"))
+ assert_equal S("helloこんにちは"), buf
+ assert_equal Encoding::UTF_8, buf.encoding
+ refute_predicate buf, :ascii_only?
+ assert_predicate buf, :valid_encoding?
+
+ # The appended chunks are binary strings; the receiver stays UTF-8.
+ buf.append_as_bytes(S("\xE2\x82".b))
+ assert_equal S("helloこんにちは\xE2\x82"), buf
+ assert_equal Encoding::UTF_8, buf.encoding
+ refute_predicate buf, :valid_encoding?
+
+ buf.append_as_bytes(S("\xAC".b))
+ assert_equal S("helloこんにちは€"), buf
+ assert_equal Encoding::UTF_8, buf.encoding
+ assert_predicate buf, :valid_encoding?
+ end
+
+ # Appending three raw bytes to a UTF-32LE buffer keeps the encoding but
+ # leaves the buffer with an invalid encoding.
+ def test_append_bytes_into_utf32
+ buf = S("abc".encode(Encoding::UTF_32LE))
+ assert_equal Encoding::UTF_32LE, buf.encoding
+
+ buf.append_as_bytes("def")
+ assert_equal Encoding::UTF_32LE, buf.encoding
+ refute_predicate buf, :valid_encoding?
+ end
+
def test_chilled_string
chilled_string = eval('"chilled"')