未验证 提交 79e9e686 编写于 作者: R Rafael França 提交者: Rafael Mendonça França

Merge pull request #36702 from cpruitt/raise-on-transliterate-ascii-8bit

Handle invalid string encodings and characters in ActiveSupport::Inflector.transliterate
上级 3b04715c
......@@ -56,14 +56,39 @@ module Inflector
#
# transliterate('Jürgen', locale: :de)
# # => "Juergen"
#
# Transliteration is restricted to UTF-8, US-ASCII and GB18030 strings.
# Other encodings will raise an ArgumentError.
#
# The given +string+ is never modified: any re-encoding is done on a copy,
# so frozen strings are accepted. The result has the same encoding as the
# input.
#
# Raises ArgumentError when +string+ is not a String or when its encoding is
# not one of the supported encodings.
def transliterate(string, replacement = "?", locale: nil)
  raise ArgumentError, "Can only transliterate strings. Received #{string.class.name}" unless string.is_a?(String)

  input_encoding = string.encoding
  allowed_encodings = [Encoding::UTF_8, Encoding::US_ASCII, Encoding::GB18030]
  raise ArgumentError, "Can not transliterate strings with #{input_encoding} encoding" unless allowed_encodings.include?(input_encoding)

  utf8_string =
    if input_encoding == Encoding::US_ASCII
      # US-ASCII is a subset of UTF-8, so the bytes can simply be re-tagged as
      # UTF-8 and handled by tidy_bytes like any other UTF-8 string. The
      # re-tagging is done on a copy so the caller's string keeps its encoding
      # (and so frozen input does not raise FrozenError).
      string.dup.force_encoding(Encoding::UTF_8)
    elsif input_encoding == Encoding::GB18030
      # GB18030 is Unicode compatible but is not a direct mapping, so it needs
      # to be transcoded. invalid/undef :replace loses data for invalid
      # characters, but tidy_bytes would replace those with "?" anyway.
      # The non-bang encode returns a new string and leaves the input alone.
      string.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
    else
      string
    end

  transliterated = I18n.transliterate(
    ActiveSupport::Multibyte::Unicode.tidy_bytes(utf8_string).unicode_normalize(:nfc),
    replacement: replacement,
    locale: locale
  )

  # Restore the encoding of the input if it was not UTF-8, applying
  # invalid/undef :replace as tidy_bytes does.
  return transliterated if transliterated.encoding == input_encoding
  transliterated.encode(input_encoding, invalid: :replace, undef: :replace)
end
# Replaces special characters in a string so that it may be used as part of
......
......@@ -57,4 +57,53 @@ def test_transliterate_handles_unknown_object
end
assert_equal "Can only transliterate strings. Received Object", exception.message
end
# Transliterating a UTF-8 encoded "A" yields "A".
def test_transliterate_handles_strings_with_valid_utf8_encodings
  utf8_input = String.new("A", encoding: Encoding::UTF_8)
  result = ActiveSupport::Inflector.transliterate(utf8_input)

  assert_equal "A", result
end
# Transliterating a US-ASCII encoded "A" yields "A", and the result is
# still tagged as US-ASCII.
def test_transliterate_handles_strings_with_valid_us_ascii_encodings
  ascii_input = String.new("A", encoding: Encoding::US_ASCII)
  result = ActiveSupport::Inflector.transliterate(ascii_input)

  assert_equal "A", result
  assert_equal Encoding::US_ASCII, result.encoding
end
# Transliterating a GB18030 encoded "A" yields "A", and the result is
# still tagged as GB18030.
def test_transliterate_handles_strings_with_valid_gb18030_encodings
  gb_input = String.new("A", encoding: Encoding::GB18030)
  result = ActiveSupport::Inflector.transliterate(gb_input)

  assert_equal "A", result
  assert_equal Encoding::GB18030, result.encoding
end
# Every encoding in Encoding.list other than UTF-8, US-ASCII and GB18030
# must be rejected with an ArgumentError whose message names that encoding.
def test_transliterate_handles_strings_with_incompatible_encodings
  supported = [Encoding::UTF_8, Encoding::US_ASCII, Encoding::GB18030]

  (Encoding.list - supported).each do |unsupported|
    empty = String.new("", encoding: unsupported)
    error = assert_raises(ArgumentError) { ActiveSupport::Inflector.transliterate(empty) }

    assert_equal "Can not transliterate strings with #{unsupported} encoding", error.message
  end
end
# A lone "\255" byte tagged as UTF-8 is transliterated to "?".
def test_transliterate_handles_strings_with_invalid_utf8_bytes
  invalid_input = String.new("\255", encoding: Encoding::UTF_8)
  result = ActiveSupport::Inflector.transliterate(invalid_input)

  assert_equal "?", result
end
# A lone "\255" byte tagged as US-ASCII is transliterated to "?".
def test_transliterate_handles_strings_with_invalid_us_ascii_bytes
  invalid_input = String.new("\255", encoding: Encoding::US_ASCII)
  result = ActiveSupport::Inflector.transliterate(invalid_input)

  assert_equal "?", result
end
# A lone "\255" byte tagged as GB18030 is transliterated to "?".
def test_transliterate_handles_strings_with_invalid_gb18030_bytes
  invalid_input = String.new("\255", encoding: Encoding::GB18030)
  result = ActiveSupport::Inflector.transliterate(invalid_input)

  assert_equal "?", result
end
end
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册