diff options
author | Kevin Newton <[email protected]> | 2024-05-29 10:12:51 -0400 |
---|---|---|
committer | Kevin Newton <[email protected]> | 2024-05-30 15:18:20 -0400 |
commit | 72452f43871b8034bfa718ed823bc62b5b81d6f9 (patch) | |
tree | 1bd87295dfcf10d20bce7f6d18184644bb079387 /test/prism/encoding_test.rb | |
parent | 1ab7c412d2e3880a7ad233c32e93961888f8145c (diff) |
[ruby/prism] Tests overhaul
https://github.com/ruby/prism/commit/6f886be0a4
Diffstat (limited to 'test/prism/encoding_test.rb')
-rw-r--r-- | test/prism/encoding_test.rb | 577 |
1 files changed, 0 insertions, 577 deletions
diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb deleted file mode 100644 index 2aee473ddf..0000000000 --- a/test/prism/encoding_test.rb +++ /dev/null @@ -1,577 +0,0 @@ -# frozen_string_literal: true - -return if RUBY_ENGINE != "ruby" - -require_relative "test_helper" - -module Prism - class EncodingTest < TestCase - codepoints_1byte = 0...0x100 - encodings = { - Encoding::ASCII_8BIT => codepoints_1byte, - Encoding::US_ASCII => codepoints_1byte - } - - if !ENV["PRISM_BUILD_MINIMAL"] - encodings[Encoding::Windows_1253] = codepoints_1byte - end - - # By default we don't test every codepoint in these encodings because it - # takes a very long time. - if ENV["PRISM_TEST_ALL_ENCODINGS"] - codepoints_2bytes = 0...0x10000 - codepoints_unicode = (0...0x110000) - - codepoints_eucjp = [ - *(0...0x10000), - *(0...0x10000).map { |bytes| bytes | 0x8F0000 } - ] - - codepoints_emacs_mule = [ - *(0...0x80), - *((0x81...0x90).flat_map { |byte1| (0x90...0x100).map { |byte2| byte1 << 8 | byte2 } }), - *((0x90...0x9C).flat_map { |byte1| (0xA0...0x100).flat_map { |byte2| (0xA0...0x100).flat_map { |byte3| byte1 << 16 | byte2 << 8 | byte3 } } }), - *((0xF0...0xF5).flat_map { |byte2| (0xA0...0x100).flat_map { |byte3| (0xA0...0x100).flat_map { |byte4| 0x9C << 24 | byte3 << 16 | byte3 << 8 | byte4 } } }), - ] - - codepoints_gb18030 = [ - *(0...0x80), - *((0x81..0xFE).flat_map { |byte1| (0x40...0x100).map { |byte2| byte1 << 8 | byte2 } }), - *((0x81..0xFE).flat_map { |byte1| (0x30...0x40).flat_map { |byte2| (0x81..0xFE).flat_map { |byte3| (0x2F...0x41).map { |byte4| byte1 << 24 | byte2 << 16 | byte3 << 8 | byte4 } } } }), - ] - - codepoints_euc_tw = [ - *(0..0x7F), - *(0xA1..0xFF).flat_map { |byte1| (0xA1..0xFF).map { |byte2| (byte1 << 8) | byte2 } }, - *(0xA1..0xB0).flat_map { |byte2| (0xA1..0xFF).flat_map { |byte3| (0xA1..0xFF).flat_map { |byte4| 0x8E << 24 | byte2 << 16 | byte3 << 8 | byte4 } } } - ] - - encodings.merge!( - Encoding::CP850 => codepoints_1byte, - Encoding::CP852 => codepoints_1byte, - Encoding::CP855 => codepoints_1byte, - Encoding::GB1988 => codepoints_1byte, - Encoding::IBM437 => codepoints_1byte, - Encoding::IBM720 => codepoints_1byte, - Encoding::IBM737 => codepoints_1byte, - Encoding::IBM775 => codepoints_1byte, - Encoding::IBM852 => codepoints_1byte, - Encoding::IBM855 => codepoints_1byte, - Encoding::IBM857 => codepoints_1byte, - Encoding::IBM860 => codepoints_1byte, - Encoding::IBM861 => codepoints_1byte, - Encoding::IBM862 => codepoints_1byte, - Encoding::IBM863 => codepoints_1byte, - Encoding::IBM864 => codepoints_1byte, - Encoding::IBM865 => codepoints_1byte, - Encoding::IBM866 => codepoints_1byte, - Encoding::IBM869 => codepoints_1byte, - Encoding::ISO_8859_1 => codepoints_1byte, - Encoding::ISO_8859_2 => codepoints_1byte, - Encoding::ISO_8859_3 => codepoints_1byte, - Encoding::ISO_8859_4 => codepoints_1byte, - Encoding::ISO_8859_5 => codepoints_1byte, - Encoding::ISO_8859_6 => codepoints_1byte, - Encoding::ISO_8859_7 => codepoints_1byte, - Encoding::ISO_8859_8 => codepoints_1byte, - Encoding::ISO_8859_9 => codepoints_1byte, - Encoding::ISO_8859_10 => codepoints_1byte, - Encoding::ISO_8859_11 => codepoints_1byte, - Encoding::ISO_8859_13 => codepoints_1byte, - Encoding::ISO_8859_14 => codepoints_1byte, - Encoding::ISO_8859_15 => codepoints_1byte, - Encoding::ISO_8859_16 => codepoints_1byte, - Encoding::KOI8_R => codepoints_1byte, - Encoding::KOI8_U => codepoints_1byte, - Encoding::MACCENTEURO => codepoints_1byte, - Encoding::MACCROATIAN => codepoints_1byte, - Encoding::MACCYRILLIC => codepoints_1byte, - Encoding::MACGREEK => codepoints_1byte, - Encoding::MACICELAND => codepoints_1byte, - Encoding::MACROMAN => codepoints_1byte, - Encoding::MACROMANIA => codepoints_1byte, - Encoding::MACTHAI => codepoints_1byte, - Encoding::MACTURKISH => codepoints_1byte, - Encoding::MACUKRAINE => codepoints_1byte, - Encoding::TIS_620 => codepoints_1byte, - Encoding::Windows_1250 => codepoints_1byte, - Encoding::Windows_1251 => codepoints_1byte, - Encoding::Windows_1252 => codepoints_1byte, - Encoding::Windows_1254 => codepoints_1byte, - Encoding::Windows_1255 => codepoints_1byte, - Encoding::Windows_1256 => codepoints_1byte, - Encoding::Windows_1257 => codepoints_1byte, - Encoding::Windows_1258 => codepoints_1byte, - Encoding::Windows_874 => codepoints_1byte, - Encoding::Big5 => codepoints_2bytes, - Encoding::Big5_HKSCS => codepoints_2bytes, - Encoding::Big5_UAO => codepoints_2bytes, - Encoding::CP949 => codepoints_2bytes, - Encoding::CP950 => codepoints_2bytes, - Encoding::CP951 => codepoints_2bytes, - Encoding::EUC_KR => codepoints_2bytes, - Encoding::GBK => codepoints_2bytes, - Encoding::GB12345 => codepoints_2bytes, - Encoding::GB2312 => codepoints_2bytes, - Encoding::MACJAPANESE => codepoints_2bytes, - Encoding::Shift_JIS => codepoints_2bytes, - Encoding::SJIS_DoCoMo => codepoints_2bytes, - Encoding::SJIS_KDDI => codepoints_2bytes, - Encoding::SJIS_SoftBank => codepoints_2bytes, - Encoding::Windows_31J => codepoints_2bytes, - Encoding::UTF_8 => codepoints_unicode, - Encoding::UTF8_MAC => codepoints_unicode, - Encoding::UTF8_DoCoMo => codepoints_unicode, - Encoding::UTF8_KDDI => codepoints_unicode, - Encoding::UTF8_SoftBank => codepoints_unicode, - Encoding::CESU_8 => codepoints_unicode, - Encoding::CP51932 => codepoints_eucjp, - Encoding::EUC_JP => codepoints_eucjp, - Encoding::EUCJP_MS => codepoints_eucjp, - Encoding::EUC_JIS_2004 => codepoints_eucjp, - Encoding::EMACS_MULE => codepoints_emacs_mule, - Encoding::STATELESS_ISO_2022_JP => codepoints_emacs_mule, - Encoding::STATELESS_ISO_2022_JP_KDDI => codepoints_emacs_mule, - Encoding::GB18030 => codepoints_gb18030, - Encoding::EUC_TW => codepoints_euc_tw - ) - end - - # These test that we're correctly parsing codepoints for each alias of each - # encoding that prism supports. - encodings.each do |encoding, range| - (encoding.names - %w[external internal filesystem locale]).each do |name| - define_method(:"test_encoding_#{name}") do - assert_encoding(encoding, name, range) - end - end - end - - # These test that we're correctly setting the flags on strings for each - # encoding that prism supports. - escapes = ["\\x00", "\\x7F", "\\x80", "\\xFF", "\\u{00}", "\\u{7F}", "\\u{80}", "\\M-\\C-?"] - escapes = escapes.concat(escapes.product(escapes).map(&:join)) - symbols = [:a, :ą, :+] - regexps = [/a/, /ą/, //] - - encodings.each_key do |encoding| - define_method(:"test_encoding_flags_#{encoding.name}") do - assert_encoding_flags(encoding, escapes) - end - - define_method(:"test_symbol_encoding_flags_#{encoding.name}") do - assert_symbol_encoding_flags(encoding, symbols) - end - - define_method(:"test_symbol_character_escape_encoding_flags_#{encoding.name}") do - assert_symbol_character_escape_encoding_flags(encoding, escapes) - end - - define_method(:"test_regular_expression_encoding_flags_#{encoding.name}") do - assert_regular_expression_encoding_flags(encoding, regexps.map(&:inspect)) - end - - define_method(:"test_regular_expression_escape_encoding_flags_#{encoding.name}") do - assert_regular_expression_encoding_flags(encoding, escapes.map { |e| "/#{e}/" }) - end - end - - encoding_modifiers = { ascii_8bit: "n", utf_8: "u", euc_jp: "e", windows_31j: "s" } - regexp_sources = ["abc", "garçon", "\\x80", "gar\\xC3\\xA7on", "gar\\u{E7}on", "abc\\u{FFFFFF}", "\\x80\\u{80}" ] - - encoding_modifiers.each_value do |modifier| - encodings.each_key do |encoding| - define_method(:"test_regular_expression_encoding_modifiers_/#{modifier}_#{encoding.name}") do - assert_regular_expression_encoding_flags( - encoding, - regexp_sources.product(encoding_modifiers.values).map { |r, modifier| "/#{r}/#{modifier}" } - ) - end - end - end - - def test_coding - result = Prism.parse("# coding: utf-8\n'string'") - actual = result.value.statements.body.first.unescaped.encoding - assert_equal Encoding.find("utf-8"), actual - end - - def test_coding_with_whitespace - result = Prism.parse("# coding \t \r \v : \t \v \r ascii-8bit \n'string'") - actual = result.value.statements.body.first.unescaped.encoding - assert_equal Encoding.find("ascii-8bit"), actual - end - - def test_emacs_style - result = Prism.parse("# -*- coding: utf-8 -*-\n'string'") - actual = result.value.statements.body.first.unescaped.encoding - assert_equal Encoding.find("utf-8"), actual - end - - def test_utf_8_variations - %w[ - utf-8-unix - utf-8-dos - utf-8-mac - utf-8-* - ].each do |encoding| - result = Prism.parse("# coding: #{encoding}\n'string'") - actual = result.value.statements.body.first.unescaped.encoding - assert_equal Encoding.find("utf-8"), actual - end - end - - def test_first_lexed_token - encoding = Prism.lex("# encoding: ascii-8bit").value[0][0].value.encoding - assert_equal Encoding.find("ascii-8bit"), encoding - end - - if !ENV["PRISM_BUILD_MINIMAL"] - # This test may be a little confusing. Basically when we use our strpbrk, - # it takes into account the encoding of the file. - def test_strpbrk_multibyte - result = Prism.parse(<<~RUBY) - # encoding: Shift_JIS - %w[\x81\x5c] - RUBY - - assert(result.errors.empty?) - assert_equal( - (+"\x81\x5c").force_encoding(Encoding::Shift_JIS), - result.value.statements.body.first.elements.first.unescaped - ) - end - - def test_slice_encoding - slice = Prism.parse("# encoding: Shift_JIS\nア").value.slice - assert_equal (+"ア").force_encoding(Encoding::SHIFT_JIS), slice - assert_equal Encoding::SHIFT_JIS, slice.encoding - end - - def test_multibyte_escapes - [ - ["'", "'"], - ["\"", "\""], - ["`", "`"], - ["/", "/"], - ["<<'HERE'\n", "\nHERE"], - ["<<-HERE\n", "\nHERE"] - ].each do |opening, closing| - assert Prism.parse_success?("# encoding: shift_jis\n'\\\x82\xA0'\n") - end - end - end - - private - - class ConstantContext < BasicObject - def self.const_missing(const) - const - end - end - - def constant_context - ConstantContext.new - end - - class IdentifierContext < BasicObject - def method_missing(name, *) - name - end - end - - def identifier_context - IdentifierContext.new - end - - def assert_encoding_constant(name, character) - source = "# encoding: #{name}\n#{character}" - expected = constant_context.instance_eval(source) - - result = Prism.parse(source) - assert result.success? - - actual = result.value.statements.body.last - assert_kind_of ConstantReadNode, actual - assert_equal expected, actual.name - end - - def assert_encoding_identifier(name, character) - source = "# encoding: #{name}\n#{character}" - expected = identifier_context.instance_eval(source) - - result = Prism.parse(source) - assert result.success? - - actual = result.value.statements.body.last - assert_kind_of CallNode, actual - assert_equal expected, actual.name - end - - # Check that we can properly parse every codepoint in the given encoding. - def assert_encoding(encoding, name, range) - # I'm not entirely sure, but I believe these codepoints are incorrect in - # their parsing in CRuby. They all report as matching `[[:lower:]]` but - # then they are parsed as constants. This is because CRuby determines if - # an identifier is a constant or not by case folding it down to lowercase - # and checking if there is a difference. And even though they report - # themselves as lowercase, their case fold is different. I have reported - # this bug upstream. - case encoding - when Encoding::UTF_8, Encoding::UTF_8_MAC, Encoding::UTF8_DoCoMo, Encoding::UTF8_KDDI, Encoding::UTF8_SoftBank, Encoding::CESU_8 - range = range.to_a - [ - 0x01c5, 0x01c8, 0x01cb, 0x01f2, 0x1f88, 0x1f89, 0x1f8a, 0x1f8b, - 0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f, 0x1f98, 0x1f99, 0x1f9a, 0x1f9b, - 0x1f9c, 0x1f9d, 0x1f9e, 0x1f9f, 0x1fa8, 0x1fa9, 0x1faa, 0x1fab, - 0x1fac, 0x1fad, 0x1fae, 0x1faf, 0x1fbc, 0x1fcc, 0x1ffc, - ] - when Encoding::Windows_1253 - range = range.to_a - [0xb5] - end - - range.each do |codepoint| - character = codepoint.chr(encoding) - - if character.match?(/[[:alpha:]]/) - if character.match?(/[[:upper:]]/) - assert_encoding_constant(name, character) - else - assert_encoding_identifier(name, character) - end - elsif character.match?(/[[:alnum:]]/) - assert_encoding_identifier(name, "_#{character}") - else - next if ["/", "{"].include?(character) - - source = "# encoding: #{name}\n/(?##{character})/\n" - assert Prism.parse(source).success?, "Expected #{source.inspect} to parse successfully." - end - rescue RangeError - source = "# encoding: #{name}\n\\x#{codepoint.to_s(16)}" - refute Prism.parse(source).success? - end - end - - def assert_encoding_flags(encoding, escapes) - escapes.each do |escaped| - source = "# encoding: #{encoding.name}\n\"#{escaped}\"" - - expected = - begin - eval(source).encoding - rescue SyntaxError => error - if error.message.include?("UTF-8 mixed within") - error.message[/: (.+?)\n/, 1] - else - raise - end - end - - actual = - Prism.parse(source).then do |result| - if result.success? - string = result.value.statements.body.first - - if string.forced_utf8_encoding? - Encoding::UTF_8 - elsif string.forced_binary_encoding? - Encoding::ASCII_8BIT - else - encoding - end - else - error = result.errors.first - - if error.message.include?("mixed") - error.message - else - raise error.message - end - end - end - - assert_equal expected, actual - end - end - - # Test Symbol literals without any interpolation or escape sequences. - def assert_symbol_encoding_flags(encoding, symbols) - symbols.each do |symbol| - source = "# encoding: #{encoding.name}\n#{symbol.inspect}" - - expected = - begin - eval(source).encoding - rescue SyntaxError => error - unless error.message.include?("invalid multibyte char") - raise - end - end - - actual = - Prism.parse(source).then do |result| - if result.success? - symbol = result.value.statements.body.first - - if symbol.forced_utf8_encoding? - Encoding::UTF_8 - elsif symbol.forced_binary_encoding? - Encoding::ASCII_8BIT - elsif symbol.forced_us_ascii_encoding? - Encoding::US_ASCII - else - encoding - end - else - error = result.errors.last - - unless error.message.include?("invalid symbol") - raise error.message - end - end - end - - assert_equal expected, actual - end - end - - def assert_symbol_character_escape_encoding_flags(encoding, escapes) - escapes.each do |escaped| - source = "# encoding: #{encoding.name}\n:\"#{escaped}\"" - - expected = - begin - eval(source).encoding - rescue SyntaxError => error - if error.message.include?("UTF-8 mixed within") - error.message[/: (.+?)\n/, 1] - else - raise - end - end - - actual = - Prism.parse(source).then do |result| - if result.success? - symbol = result.value.statements.body.first - - if symbol.forced_utf8_encoding? - Encoding::UTF_8 - elsif symbol.forced_binary_encoding? - Encoding::ASCII_8BIT - elsif symbol.forced_us_ascii_encoding? - Encoding::US_ASCII - else - encoding - end - else - error = result.errors.first - - if error.message.include?("mixed") - error.message - else - raise error.message - end - end - end - - assert_equal expected, actual - end - end - - def assert_regular_expression_encoding_flags(encoding, regexps) - regexps.each do |regexp| - regexp_modifier_used = regexp.end_with?("/u") || regexp.end_with?("/e") || regexp.end_with?("/s") || regexp.end_with?("/n") - source = "# encoding: #{encoding.name}\n#{regexp}" - - encoding_errors = ["invalid multibyte char", "escaped non ASCII character in UTF-8 regexp", "differs from source encoding"] - skipped_errors = ["invalid multibyte escape", "incompatible character encoding", "UTF-8 character in non UTF-8 regexp", "invalid Unicode range", "invalid Unicode list"] - - # TODO (nirvdrum 21-Feb-2024): Prism currently does not handle Regexp validation unless modifiers are used. So, skip processing those errors for now: https://github.com/ruby/prism/issues/2104 - unless regexp_modifier_used - skipped_errors += encoding_errors - encoding_errors.clear - end - - expected = - begin - eval(source).encoding - rescue SyntaxError => error - if encoding_errors.find { |e| error.message.include?(e) } - error.message.split("\n").map { |m| m[/: (.+?)$/, 1] } - elsif skipped_errors.find { |e| error.message.include?(e) } - next - else - raise - end - end - - actual = - Prism.parse(source).then do |result| - if result.success? - regexp = result.value.statements.body.first - - actual_encoding = if regexp.forced_utf8_encoding? - Encoding::UTF_8 - elsif regexp.forced_binary_encoding? - Encoding::ASCII_8BIT - elsif regexp.forced_us_ascii_encoding? - Encoding::US_ASCII - elsif regexp.ascii_8bit? - Encoding::ASCII_8BIT - elsif regexp.utf_8? - Encoding::UTF_8 - elsif regexp.euc_jp? - Encoding::EUC_JP - elsif regexp.windows_31j? - Encoding::Windows_31J - else - encoding - end - - if regexp.utf_8? && actual_encoding != Encoding::UTF_8 - raise "expected regexp encoding to be UTF-8 due to '/u' modifier, but got #{actual_encoding.name}" - elsif regexp.ascii_8bit? && (actual_encoding != Encoding::ASCII_8BIT && actual_encoding != Encoding::US_ASCII) - raise "expected regexp encoding to be ASCII-8BIT or US-ASCII due to '/n' modifier, but got #{actual_encoding.name}" - elsif regexp.euc_jp? && actual_encoding != Encoding::EUC_JP - raise "expected regexp encoding to be EUC-JP due to '/e' modifier, but got #{actual_encoding.name}" - elsif regexp.windows_31j? && actual_encoding != Encoding::Windows_31J - raise "expected regexp encoding to be Windows-31J due to '/s' modifier, but got #{actual_encoding.name}" - end - - if regexp.utf_8? && regexp.forced_utf8_encoding? - raise "the forced_utf8 flag should not be set when the UTF-8 modifier (/u) is used" - elsif regexp.ascii_8bit? && regexp.forced_binary_encoding? - raise "the forced_ascii_8bit flag should not be set when the UTF-8 modifier (/u) is used" - end - - actual_encoding - else - errors = result.errors.map(&:message) - - if errors.last&.include?("UTF-8 mixed within") - nil - else - errors - end - end - end - - # TODO (nirvdrum 22-Feb-2024): Remove this workaround once Prism better maps CRuby's error messages. - # This class of error message is tricky. The part not being compared is a representation of the regexp. - # Depending on the source encoding and any encoding modifiers being used, CRuby alters how the regexp is represented. - # Sometimes it's an MBC string. Other times it uses hexadecimal character escapes. And in other cases it uses - # the long-form Unicode escape sequences. This short-circuit checks that the error message is mostly correct. - if expected.is_a?(Array) && actual.is_a?(Array) - if expected.last.start_with?("/.../n has a non escaped non ASCII character in non ASCII-8BIT script:") && - actual.last.start_with?("/.../n has a non escaped non ASCII character in non ASCII-8BIT script:") - expected.last.clear - actual.last.clear - end - end - - assert_equal expected, actual - end - end - end -end |