summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- benchmark/string_gsub.yml 43
-rw-r--r-- common.mk 1
-rw-r--r-- hash.c 2
-rw-r--r-- internal/hash.h 1
-rw-r--r-- string.c 19
5 files changed, 63 insertions, 3 deletions
diff --git a/benchmark/string_gsub.yml b/benchmark/string_gsub.yml
new file mode 100644
index 0000000000..0f964337dd
--- /dev/null
+++ b/benchmark/string_gsub.yml
@@ -0,0 +1,43 @@
+prelude: |
+ # frozen_string_literal: true
+ STR = ((("a" * 31) + "<") * 1000).freeze
+ STR_UNICODE = ((("a" * 30) + "\u2028") * 1000).freeze
+ ESCAPED_CHARS_BINARY = {
+ "\u2028".b => '\u2028'.b,
+ "\u2029".b => '\u2029'.b,
+ ">".b => '\u003e'.b.freeze,
+ "<".b => '\u003c'.b.freeze,
+ "&".b => '\u0026'.b.freeze,
+ }
+ BINARY_PATTERN = Regexp.union(ESCAPED_CHARS_BINARY.keys)
+
+ ESCAPED_CHARS = {
+ "\u2028" => '\u2028',
+ "\u2029" => '\u2029',
+ ">" => '\u003e',
+ "<" => '\u003c',
+ "&" => '\u0026',
+ }
+ ESCAPE_PATTERN = Regexp.union(ESCAPED_CHARS.keys)
+
+
+benchmark:
+ escape: |
+ str = STR.dup
+ str.gsub!(ESCAPE_PATTERN, ESCAPED_CHARS)
+ str
+
+ escape_bin: |
+ str = STR.b
+ str.gsub!(BINARY_PATTERN, ESCAPED_CHARS_BINARY)
+ str.force_encoding(Encoding::UTF_8)
+
+ escape_utf8: |
+ str = STR_UNICODE.dup
+ str.gsub!(ESCAPE_PATTERN, ESCAPED_CHARS)
+ str
+
+ escape_utf8_bin: |
+ str = STR_UNICODE.b
+ str.gsub!(BINARY_PATTERN, ESCAPED_CHARS_BINARY)
+ str.force_encoding(Encoding::UTF_8)
diff --git a/common.mk b/common.mk
index a82e9bc49f..f050ab5f3d 100644
--- a/common.mk
+++ b/common.mk
@@ -17878,6 +17878,7 @@ string.$(OBJEXT): $(top_srcdir)/internal/encoding.h
string.$(OBJEXT): $(top_srcdir)/internal/error.h
string.$(OBJEXT): $(top_srcdir)/internal/fixnum.h
string.$(OBJEXT): $(top_srcdir)/internal/gc.h
+string.$(OBJEXT): $(top_srcdir)/internal/hash.h
string.$(OBJEXT): $(top_srcdir)/internal/imemo.h
string.$(OBJEXT): $(top_srcdir)/internal/numeric.h
string.$(OBJEXT): $(top_srcdir)/internal/object.h
diff --git a/hash.c b/hash.c
index 9e1fe4c8fe..0e3d5606af 100644
--- a/hash.c
+++ b/hash.c
@@ -2037,7 +2037,7 @@ call_default_proc(VALUE proc, VALUE hash, VALUE key)
return rb_proc_call_with_block(proc, 2, args, Qnil);
}
-static bool
+bool
rb_hash_default_unredefined(VALUE hash)
{
VALUE klass = RBASIC_CLASS(hash);
diff --git a/internal/hash.h b/internal/hash.h
index d66b5b2d04..676f140496 100644
--- a/internal/hash.h
+++ b/internal/hash.h
@@ -86,6 +86,7 @@ VALUE rb_hash_set_pair(VALUE hash, VALUE pair);
int rb_hash_stlike_delete(VALUE hash, st_data_t *pkey, st_data_t *pval);
int rb_hash_stlike_foreach_with_replace(VALUE hash, st_foreach_check_callback_func *func, st_update_callback_func *replace, st_data_t arg);
int rb_hash_stlike_update(VALUE hash, st_data_t key, st_update_callback_func *func, st_data_t arg);
+bool rb_hash_default_unredefined(VALUE hash);
VALUE rb_ident_hash_new_with_size(st_index_t size);
void rb_hash_free(VALUE hash);
RUBY_EXTERN VALUE rb_cHash_empty_frozen;
diff --git a/string.c b/string.c
index 6faeb5d00e..2c055bfd25 100644
--- a/string.c
+++ b/string.c
@@ -31,6 +31,7 @@
#include "internal/encoding.h"
#include "internal/error.h"
#include "internal/gc.h"
+#include "internal/hash.h"
#include "internal/numeric.h"
#include "internal/object.h"
#include "internal/proc.h"
@@ -6295,7 +6296,7 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil;
long beg, beg0, end0;
long offset, blen, slen, len, last;
- enum {STR, ITER, MAP} mode = STR;
+ enum {STR, ITER, FAST_MAP, MAP} mode = STR;
char *sp, *cp;
int need_backref = -1;
rb_encoding *str_enc;
@@ -6311,6 +6312,9 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
if (NIL_P(hash)) {
StringValue(repl);
}
+ else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
+ mode = FAST_MAP;
+ }
else {
mode = MAP;
}
@@ -6355,7 +6359,18 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
val = rb_obj_as_string(rb_yield(match0));
}
else {
- val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
+ struct RString fake_str;
+ VALUE key;
+ if (mode == FAST_MAP) {
+ // It is safe to use a fake_str here because it cannot escape: it is only passed to
+ // `rb_hash_aref`, and FAST_MAP is only selected when the hash has no default proc
+ // (RHASH_PROC_DEFAULT unset) and rb_hash_default_unredefined(hash) is true.
+ key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
+ }
+ else {
+ key = rb_str_subseq(str, beg0, end0 - beg0);
+ }
+ val = rb_hash_aref(hash, key);
val = rb_obj_as_string(val);
}
str_mod_check(str, sp, slen);