diff options
-rw-r--r-- | benchmark/string_gsub.yml | 43 | ||||
-rw-r--r-- | common.mk | 1 | ||||
-rw-r--r-- | hash.c | 2 | ||||
-rw-r--r-- | internal/hash.h | 1 | ||||
-rw-r--r-- | string.c | 19 |
5 files changed, 63 insertions, 3 deletions
diff --git a/benchmark/string_gsub.yml b/benchmark/string_gsub.yml
new file mode 100644
index 0000000000..0f964337dd
--- /dev/null
+++ b/benchmark/string_gsub.yml
@@ -0,0 +1,43 @@
+prelude: |
+  # frozen_string_literal: true
+  STR = ((("a" * 31) + "<") * 1000).freeze
+  STR_UNICODE = ((("a" * 30) + "\u2028") * 1000).freeze
+  ESCAPED_CHARS_BINARY = {
+    "\u2028".b => '\u2028'.b,
+    "\u2029".b => '\u2029'.b,
+    ">".b => '\u003e'.b.freeze,
+    "<".b => '\u003c'.b.freeze,
+    "&".b => '\u0026'.b.freeze,
+  }
+  BINARY_PATTERN = Regexp.union(ESCAPED_CHARS_BINARY.keys)
+
+  ESCAPED_CHARS = {
+    "\u2028" => '\u2028',
+    "\u2029" => '\u2029',
+    ">" => '\u003e',
+    "<" => '\u003c',
+    "&" => '\u0026',
+  }
+  ESCAPE_PATTERN = Regexp.union(ESCAPED_CHARS.keys)
+
+
+benchmark:
+  escape: |
+    str = STR.dup
+    str.gsub!(ESCAPE_PATTERN, ESCAPED_CHARS)
+    str
+
+  escape_bin: |
+    str = STR.b
+    str.gsub!(BINARY_PATTERN, ESCAPED_CHARS_BINARY)
+    str.force_encoding(Encoding::UTF_8)
+
+  escape_utf8: |
+    str = STR_UNICODE.dup
+    str.gsub!(ESCAPE_PATTERN, ESCAPED_CHARS)
+    str
+
+  escape_utf8_bin: |
+    str = STR_UNICODE.b
+    str.gsub!(BINARY_PATTERN, ESCAPED_CHARS_BINARY)
+    str.force_encoding(Encoding::UTF_8)
diff --git a/common.mk b/common.mk
--- a/common.mk
+++ b/common.mk
@@ -17878,6 +17878,7 @@ string.$(OBJEXT): $(top_srcdir)/internal/encoding.h
 string.$(OBJEXT): $(top_srcdir)/internal/error.h
 string.$(OBJEXT): $(top_srcdir)/internal/fixnum.h
 string.$(OBJEXT): $(top_srcdir)/internal/gc.h
+string.$(OBJEXT): $(top_srcdir)/internal/hash.h
 string.$(OBJEXT): $(top_srcdir)/internal/imemo.h
 string.$(OBJEXT): $(top_srcdir)/internal/numeric.h
 string.$(OBJEXT): $(top_srcdir)/internal/object.h
diff --git a/hash.c b/hash.c
--- a/hash.c
+++ b/hash.c
@@ -2037,7 +2037,7 @@ call_default_proc(VALUE proc, VALUE hash, VALUE key)
     return rb_proc_call_with_block(proc, 2, args, Qnil);
 }
 
-static bool
+bool
 rb_hash_default_unredefined(VALUE hash)
 {
     VALUE klass = RBASIC_CLASS(hash);
diff --git a/internal/hash.h b/internal/hash.h
index d66b5b2d04..676f140496 100644
--- a/internal/hash.h
+++ b/internal/hash.h
@@ -86,6 +86,7 @@ VALUE rb_hash_set_pair(VALUE hash, VALUE pair);
 int rb_hash_stlike_delete(VALUE hash, st_data_t *pkey, st_data_t *pval);
 int rb_hash_stlike_foreach_with_replace(VALUE hash, st_foreach_check_callback_func *func, st_update_callback_func *replace, st_data_t arg);
 int rb_hash_stlike_update(VALUE hash, st_data_t key, st_update_callback_func *func, st_data_t arg);
+bool rb_hash_default_unredefined(VALUE hash);
 VALUE rb_ident_hash_new_with_size(st_index_t size);
 void rb_hash_free(VALUE hash);
 RUBY_EXTERN VALUE rb_cHash_empty_frozen;
diff --git a/string.c b/string.c
--- a/string.c
+++ b/string.c
@@ -31,6 +31,7 @@
 #include "internal/encoding.h"
 #include "internal/error.h"
 #include "internal/gc.h"
+#include "internal/hash.h"
 #include "internal/numeric.h"
 #include "internal/object.h"
 #include "internal/proc.h"
@@ -6295,7 +6296,7 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
     VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil;
     long beg, beg0, end0;
     long offset, blen, slen, len, last;
-    enum {STR, ITER, MAP} mode = STR;
+    enum {STR, ITER, FAST_MAP, MAP} mode = STR;
     char *sp, *cp;
     int need_backref = -1;
     rb_encoding *str_enc;
@@ -6311,6 +6312,9 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
         if (NIL_P(hash)) {
             StringValue(repl);
         }
+        else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
+            mode = FAST_MAP;
+        }
         else {
             mode = MAP;
         }
@@ -6355,7 +6359,18 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
                 val = rb_obj_as_string(rb_yield(match0));
             }
             else {
-                val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
+                struct RString fake_str;
+                VALUE key;
+                if (mode == FAST_MAP) {
+                    // It is safe to use a fake_str here because we established that it won't escape,
+                    // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
+                    // default proc.
+                    key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
+                }
+                else {
+                    key = rb_str_subseq(str, beg0, end0 - beg0);
+                }
+                val = rb_hash_aref(hash, key);
                 val = rb_obj_as_string(val);
             }
             str_mod_check(str, sp, slen);