diff options
author | NAITOH Jun <[email protected]> | 2024-09-14 09:32:21 +0900 |
---|---|---|
committer | Hiroshi SHIBATA <[email protected]> | 2024-09-17 15:12:25 +0900 |
commit | d81b0588bb3c97167d1f7e2d2a74185e0c19b68c (patch) | |
tree | 568ed2979cc5b4a6d5e2af6146fba19ac0baf425 | |
parent | 7d80c139f777a2018a93ab8df4e57dbf4fd27572 (diff) |
[ruby/strscan] Accept String as a pattern at non head
(https://github.com/ruby/strscan/pull/106)
This adds String pattern support for non-head match cases such as StringScanner#scan_until.
Using a String as a pattern can improve match performance.
Here are the results of the included benchmark.
## CRuby
It shows that a String pattern held in a variable (string_var) is 1.18x faster than a Regexp pattern (regexp).
```
$ benchmark-driver benchmark/check_until.yaml
Warming up --------------------------------------
regexp 9.403M i/s - 9.548M times in 1.015459s (106.35ns/i)
regexp_var 9.162M i/s - 9.248M times in 1.009479s (109.15ns/i)
string 8.966M i/s - 9.274M times in 1.034343s (111.54ns/i)
string_var 11.051M i/s - 11.190M times in 1.012538s (90.49ns/i)
Calculating -------------------------------------
regexp 10.319M i/s - 28.209M times in 2.733707s (96.91ns/i)
regexp_var 10.032M i/s - 27.485M times in 2.739807s (99.68ns/i)
string 9.681M i/s - 26.897M times in 2.778397s (103.30ns/i)
string_var 12.162M i/s - 33.154M times in 2.726046s (82.22ns/i)
Comparison:
string_var: 12161920.6 i/s
regexp: 10318949.7 i/s - 1.18x slower
regexp_var: 10031617.6 i/s - 1.21x slower
string: 9680843.7 i/s - 1.26x slower
```
## JRuby
It shows that a String pattern (string) is 2.11x faster than a Regexp pattern (regexp_var).
```
$ benchmark-driver benchmark/check_until.yaml
Warming up --------------------------------------
regexp 7.591M i/s - 7.544M times in 0.993780s (131.74ns/i)
regexp_var 6.143M i/s - 6.125M times in 0.997038s (162.77ns/i)
string 14.135M i/s - 14.079M times in 0.996067s (70.75ns/i)
string_var 14.079M i/s - 14.057M times in 0.998420s (71.03ns/i)
Calculating -------------------------------------
regexp 9.409M i/s - 22.773M times in 2.420268s (106.28ns/i)
regexp_var 10.116M i/s - 18.430M times in 1.821820s (98.85ns/i)
string 21.389M i/s - 42.404M times in 1.982519s (46.75ns/i)
string_var 20.897M i/s - 42.237M times in 2.021187s (47.85ns/i)
Comparison:
string: 21389191.1 i/s
string_var: 20897327.5 i/s - 1.02x slower
regexp_var: 10116464.7 i/s - 2.11x slower
regexp: 9409222.3 i/s - 2.27x slower
```
See:
https://github.com/jruby/jruby/blob/be7815ec02356a58891c8727bb448f0c6a826d96/core/src/main/java/org/jruby/util/StringSupport.java#L1706-L1736
---------
https://github.com/ruby/strscan/commit/f9d96c446a
Co-authored-by: Sutou Kouhei <[email protected]>
-rw-r--r-- | ext/strscan/strscan.c | 25 | ||||
-rw-r--r-- | test/strscan/test_stringscanner.rb | 80 |
2 files changed, 89 insertions, 16 deletions
diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index 606c44bc96..e272f92249 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -686,14 +686,6 @@ strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly { struct strscanner *p; - if (headonly) { - if (!RB_TYPE_P(pattern, T_REGEXP)) { - StringValue(pattern); - } - } - else { - Check_Type(pattern, T_REGEXP); - } GET_SCANNER(self, p); CLEAR_MATCH_STATUS(p); @@ -714,14 +706,25 @@ strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly } } else { + StringValue(pattern); rb_enc_check(p->str, pattern); if (S_RESTLEN(p) < RSTRING_LEN(pattern)) { return Qnil; } - if (memcmp(CURPTR(p), RSTRING_PTR(pattern), RSTRING_LEN(pattern)) != 0) { - return Qnil; + + if (headonly) { + if (memcmp(CURPTR(p), RSTRING_PTR(pattern), RSTRING_LEN(pattern)) != 0) { + return Qnil; + } + set_registers(p, RSTRING_LEN(pattern)); + } else { + long pos = rb_memsearch(RSTRING_PTR(pattern), RSTRING_LEN(pattern), + CURPTR(p), S_RESTLEN(p), rb_enc_get(pattern)); + if (pos == -1) { + return Qnil; + } + set_registers(p, RSTRING_LEN(pattern) + pos); } - set_registers(p, RSTRING_LEN(pattern)); } MATCHED(p); diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb index 143cf7197d..9b7b7910d0 100644 --- a/test/strscan/test_stringscanner.rb +++ b/test/strscan/test_stringscanner.rb @@ -262,7 +262,7 @@ module StringScannerTests end def test_scan - s = create_string_scanner('stra strb strc', true) + s = create_string_scanner("stra strb\0strc", true) tmp = s.scan(/\w+/) assert_equal 'stra', tmp @@ -270,7 +270,7 @@ module StringScannerTests assert_equal ' ', tmp assert_equal 'strb', s.scan(/\w+/) - assert_equal ' ', s.scan(/\s+/) + assert_equal "\u0000", s.scan(/\0/) tmp = s.scan(/\w+/) assert_equal 'strc', tmp @@ -312,11 +312,14 @@ module StringScannerTests end def test_scan_string - s = create_string_scanner('stra strb strc') + s = create_string_scanner("stra 
strb\0strc") assert_equal 'str', s.scan('str') assert_equal 'str', s[0] assert_equal 3, s.pos assert_equal 'a ', s.scan('a ') + assert_equal 'strb', s.scan('strb') + assert_equal "\u0000", s.scan("\0") + assert_equal 'strc', s.scan('strc') str = 'stra strb strc'.dup s = create_string_scanner(str, false) @@ -668,13 +671,47 @@ module StringScannerTests assert_equal(nil, s.exist?(/e/)) end - def test_exist_p_string + def test_exist_p_invalid_argument s = create_string_scanner("test string") assert_raise(TypeError) do - s.exist?(" ") + s.exist?(1) end end + def test_exist_p_string + omit("not implemented on TruffleRuby") if RUBY_ENGINE == "truffleruby" + s = create_string_scanner("test string") + assert_equal(3, s.exist?("s")) + assert_equal(0, s.pos) + s.scan("test") + assert_equal(2, s.exist?("s")) + assert_equal(4, s.pos) + assert_equal(nil, s.exist?("e")) + end + + def test_scan_until + s = create_string_scanner("Foo Bar\0Baz") + assert_equal("Foo", s.scan_until(/Foo/)) + assert_equal(3, s.pos) + assert_equal(" Bar", s.scan_until(/Bar/)) + assert_equal(7, s.pos) + assert_equal(nil, s.skip_until(/Qux/)) + assert_equal("\u0000Baz", s.scan_until(/Baz/)) + assert_equal(11, s.pos) + end + + def test_scan_until_string + omit("not implemented on TruffleRuby") if RUBY_ENGINE == "truffleruby" + s = create_string_scanner("Foo Bar\0Baz") + assert_equal("Foo", s.scan_until("Foo")) + assert_equal(3, s.pos) + assert_equal(" Bar", s.scan_until("Bar")) + assert_equal(7, s.pos) + assert_equal(nil, s.skip_until("Qux")) + assert_equal("\u0000Baz", s.scan_until("Baz")) + assert_equal(11, s.pos) + end + def test_skip_until s = create_string_scanner("Foo Bar Baz") assert_equal(3, s.skip_until(/Foo/)) @@ -684,6 +721,16 @@ module StringScannerTests assert_equal(nil, s.skip_until(/Qux/)) end + def test_skip_until_string + omit("not implemented on TruffleRuby") if RUBY_ENGINE == "truffleruby" + s = create_string_scanner("Foo Bar Baz") + assert_equal(3, s.skip_until("Foo")) + assert_equal(3, 
s.pos) + assert_equal(4, s.skip_until("Bar")) + assert_equal(7, s.pos) + assert_equal(nil, s.skip_until("Qux")) + end + def test_check_until s = create_string_scanner("Foo Bar Baz") assert_equal("Foo", s.check_until(/Foo/)) @@ -693,6 +740,16 @@ module StringScannerTests assert_equal(nil, s.check_until(/Qux/)) end + def test_check_until_string + omit("not implemented on TruffleRuby") if RUBY_ENGINE == "truffleruby" + s = create_string_scanner("Foo Bar Baz") + assert_equal("Foo", s.check_until("Foo")) + assert_equal(0, s.pos) + assert_equal("Foo Bar", s.check_until("Bar")) + assert_equal(0, s.pos) + assert_equal(nil, s.check_until("Qux")) + end + def test_search_full s = create_string_scanner("Foo Bar Baz") assert_equal(8, s.search_full(/Bar /, false, false)) @@ -705,6 +762,19 @@ module StringScannerTests assert_equal(11, s.pos) end + def test_search_full_string + omit("not implemented on TruffleRuby") if RUBY_ENGINE == "truffleruby" + s = create_string_scanner("Foo Bar Baz") + assert_equal(8, s.search_full("Bar ", false, false)) + assert_equal(0, s.pos) + assert_equal("Foo Bar ", s.search_full("Bar ", false, true)) + assert_equal(0, s.pos) + assert_equal(8, s.search_full("Bar ", true, false)) + assert_equal(8, s.pos) + assert_equal("Baz", s.search_full("az", true, true)) + assert_equal(11, s.pos) + end + def test_peek s = create_string_scanner("test string") assert_equal("test st", s.peek(7)) |