| // Copyright 2010-2014, Google Inc. |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| #include "base/util.h" |
| |
| #include <climits> |
| #include <cstdlib> |
| #include <cstring> |
| #include <ctime> |
| #include <map> |
| #include <sstream> |
| #include <string> |
| |
| #include "base/clock_mock.h" |
| #include "base/compiler_specific.h" |
| #include "base/file_stream.h" |
| #include "base/file_util.h" |
| #include "base/logging.h" |
| #include "base/mutex.h" |
| #include "base/number_util.h" |
| #include "base/thread.h" |
| #include "testing/base/public/googletest.h" |
| #include "testing/base/public/gunit.h" |
| |
| DECLARE_string(test_srcdir); |
| |
| namespace mozc { |
| |
| namespace { |
| |
| void FillTestCharacterSetMap(map<char32, Util::CharacterSet> *test_map) { |
| CHECK(test_map); |
| |
| const char kCharacterSetTestFile[] = |
| "data/test/character_set/character_set.tsv"; |
| const string &path = FileUtil::JoinPath(FLAGS_test_srcdir, |
| kCharacterSetTestFile); |
| CHECK(FileUtil::FileExists(path)) << path << " does not exist."; |
| |
| map<string, Util::CharacterSet> character_set_type_map; |
| character_set_type_map["ASCII"] = Util::ASCII; |
| character_set_type_map["JISX0201"] = Util::JISX0201; |
| character_set_type_map["JISX0208"] = Util::JISX0208; |
| character_set_type_map["JISX0212"] = Util::JISX0212; |
| character_set_type_map["JISX0213"] = Util::JISX0213; |
| character_set_type_map["CP932"] = Util::CP932; |
| // UNICODE_ONLY should not appear in the tsv file though. |
| character_set_type_map["UNICODE_ONLY"] = Util::UNICODE_ONLY; |
| |
| InputFileStream finput(path.c_str()); |
| |
| // Read tsv file. |
| string line; |
| while (!getline(finput, line).fail()) { |
| if (Util::StartsWith(line, "#")) { |
| // Skip comment line. |
| continue; |
| } |
| |
| vector<string> col; |
| mozc::Util::SplitStringUsing(line, "\t", &col); |
| CHECK_GE(col.size(), 2) << "format error: " << line; |
| const char32 ucs4 = NumberUtil::SimpleAtoi(col[0]); |
| map<string, Util::CharacterSet>::const_iterator itr = |
| character_set_type_map.find(col[1]); |
| // We cannot use CHECK_NE here because of overload resolution. |
| CHECK(character_set_type_map.end() != itr) |
| << "Unknown character set type: " << col[1]; |
| test_map->insert(make_pair(ucs4, itr->second)); |
| } |
| } |
| |
| Util::CharacterSet GetExpectedCharacterSet( |
| const map<char32, Util::CharacterSet> &test_map, |
| char32 ucs4) { |
| map<char32, Util::CharacterSet>::const_iterator itr = |
| test_map.find(ucs4); |
| if (test_map.find(ucs4) == test_map.end()) { |
| // If the test data does not have an entry, it should be |
| // interpreted as |Util::UNICODE_ONLY|. |
| return Util::UNICODE_ONLY; |
| } |
| return itr->second; |
| } |
| |
| } // namespace |
| |
| class ThreadTest: public Thread { |
| public: |
| virtual void Run() { |
| for (int i = 0; i < 3; ++i) { |
| Util::Sleep(1000); |
| } |
| } |
| }; |
| |
| TEST(UtilTest, JoinStrings) { |
| vector<string> input; |
| input.push_back("ab"); |
| input.push_back("cdef"); |
| input.push_back("ghr"); |
| string output; |
| Util::JoinStrings(input, ":", &output); |
| EXPECT_EQ("ab:cdef:ghr", output); |
| } |
| |
| TEST(UtilTest, JoinStringPieces) { |
| { |
| vector<StringPiece> input; |
| input.push_back("ab"); |
| string output; |
| Util::JoinStringPieces(input, ":", &output); |
| EXPECT_EQ("ab", output); |
| } |
| { |
| vector<StringPiece> input; |
| input.push_back("ab"); |
| input.push_back("cdef"); |
| input.push_back("ghr"); |
| string output; |
| Util::JoinStringPieces(input, ":", &output); |
| EXPECT_EQ("ab:cdef:ghr", output); |
| } |
| { |
| vector<StringPiece> input; |
| input.push_back("ab"); |
| input.push_back("cdef"); |
| input.push_back("ghr"); |
| string output; |
| Util::JoinStringPieces(input, "::", &output); |
| EXPECT_EQ("ab::cdef::ghr", output); |
| } |
| } |
| |
| TEST(UtilTest, ConcatStrings) { |
| string s; |
| |
| Util::ConcatStrings("", "", &s); |
| EXPECT_TRUE(s.empty()); |
| |
| Util::ConcatStrings("ABC", "", &s); |
| EXPECT_EQ("ABC", s); |
| |
| Util::ConcatStrings("", "DEF", &s); |
| EXPECT_EQ("DEF", s); |
| |
| Util::ConcatStrings("ABC", "DEF", &s); |
| EXPECT_EQ("ABCDEF", s); |
| } |
| |
| TEST(UtilTest, AppendStringWithDelimiter) { |
| string result; |
| string input; |
| const char kDelemiter[] = ":"; |
| |
| { |
| result.clear(); |
| Util::AppendStringWithDelimiter(kDelemiter, "test", &result); |
| EXPECT_EQ("test", result); |
| } |
| |
| { |
| result = "foo"; |
| Util::AppendStringWithDelimiter(kDelemiter, "test", &result); |
| EXPECT_EQ("foo:test", result); |
| } |
| |
| { |
| result = "foo"; |
| Util::AppendStringWithDelimiter(kDelemiter, "", &result); |
| EXPECT_EQ("foo:", result); |
| } |
| } |
| |
| TEST(UtilTest, SplitIterator_SingleDelimiter_SkipEmpty) { |
| typedef SplitIterator<SingleDelimiter, SkipEmpty> SplitIterator; |
| { |
| SplitIterator iter("", " "); |
| EXPECT_TRUE(iter.Done()); |
| } |
| { |
| SplitIterator iter(StringPiece(), " "); |
| EXPECT_TRUE(iter.Done()); |
| } |
| { |
| const char *s = "a b cde"; |
| SplitIterator iter(s, " "); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("a", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("b", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("cde", iter.Get()); |
| iter.Next(); |
| EXPECT_TRUE(iter.Done()); |
| } |
| { |
| const char *s = " a b cde "; |
| SplitIterator iter(s, " "); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("a", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("b", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("cde", iter.Get()); |
| iter.Next(); |
| EXPECT_TRUE(iter.Done()); |
| } |
| { |
| StringPiece s("a b cde ", 5); // s = "a b "; |
| SplitIterator iter(s, " "); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("a", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("b", iter.Get()); |
| iter.Next(); |
| EXPECT_TRUE(iter.Done()); |
| } |
| } |
| |
| TEST(UtilTest, SplitIterator_MultiDelimiter_SkipEmpty) { |
| typedef SplitIterator<MultiDelimiter, SkipEmpty> SplitIterator; |
| { |
| SplitIterator iter("", " \t,"); |
| EXPECT_TRUE(iter.Done()); |
| } |
| { |
| SplitIterator iter(StringPiece(), ",."); |
| EXPECT_TRUE(iter.Done()); |
| } |
| { |
| const char *s = "a b\tcde:fg"; |
| SplitIterator iter(s, " \t:"); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("a", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("b", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("cde", iter.Get()); |
| EXPECT_FALSE(iter.Done()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("fg", iter.Get()); |
| iter.Next(); |
| EXPECT_TRUE(iter.Done()); |
| } |
| { |
| const char *s = " \t:a b\t\tcde:fg:"; |
| SplitIterator iter(s, " \t:"); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("a", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("b", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("cde", iter.Get()); |
| EXPECT_FALSE(iter.Done()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("fg", iter.Get()); |
| iter.Next(); |
| EXPECT_TRUE(iter.Done()); |
| } |
| } |
| |
| TEST(UtilTest, SplitIterator_SingleDelimiter_AllowEmpty) { |
| typedef SplitIterator<SingleDelimiter, AllowEmpty> SplitIterator; |
| { |
| SplitIterator iter("", " "); |
| EXPECT_TRUE(iter.Done()); |
| } |
| { |
| SplitIterator iter(StringPiece(), " "); |
| EXPECT_TRUE(iter.Done()); |
| } |
| { |
| const char *s = "a b cde"; |
| SplitIterator iter(s, " "); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("a", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("b", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("cde", iter.Get()); |
| iter.Next(); |
| EXPECT_TRUE(iter.Done()); |
| } |
| { |
| const char *s = " a b cde "; |
| SplitIterator iter(s, " "); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("a", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("b", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("cde", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("", iter.Get()); |
| iter.Next(); |
| EXPECT_TRUE(iter.Done()); |
| } |
| { |
| StringPiece s("a b cde ", 5); // s = "a b "; |
| SplitIterator iter(s, " "); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("a", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("b", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("", iter.Get()); |
| iter.Next(); |
| EXPECT_TRUE(iter.Done()); |
| } |
| } |
| |
| TEST(UtilTest, SplitIterator_MultiDelimiter_AllowEmpty) { |
| typedef SplitIterator<MultiDelimiter, AllowEmpty> SplitIterator; |
| { |
| SplitIterator iter("", " \t,"); |
| EXPECT_TRUE(iter.Done()); |
| } |
| { |
| SplitIterator iter(StringPiece(), ",."); |
| EXPECT_TRUE(iter.Done()); |
| } |
| { |
| const char *s = "a b\tcde:fg"; |
| SplitIterator iter(s, " \t:"); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("a", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("b", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("cde", iter.Get()); |
| EXPECT_FALSE(iter.Done()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("fg", iter.Get()); |
| iter.Next(); |
| EXPECT_TRUE(iter.Done()); |
| } |
| { |
| const char *s = "a b\t\tcde:fg:"; |
| SplitIterator iter(s, " \t:"); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("a", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("b", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("cde", iter.Get()); |
| EXPECT_FALSE(iter.Done()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("fg", iter.Get()); |
| iter.Next(); |
| EXPECT_FALSE(iter.Done()); |
| EXPECT_EQ("", iter.Get()); |
| iter.Next(); |
| EXPECT_TRUE(iter.Done()); |
| } |
| } |
| |
| TEST(UtilTest, SplitStringUsing) { |
| { |
| const string input = "a b c def"; |
| vector<string> output; |
| Util::SplitStringUsing(input, " ", &output); |
| EXPECT_EQ(output.size(), 4); |
| EXPECT_EQ("a", output[0]); |
| EXPECT_EQ("b", output[1]); |
| EXPECT_EQ("c", output[2]); |
| EXPECT_EQ("def", output[3]); |
| } |
| { |
| const string input = " a b c"; |
| vector<string> output; |
| Util::SplitStringUsing(input, " ", &output); |
| EXPECT_EQ(output.size(), 3); |
| EXPECT_EQ("a", output[0]); |
| EXPECT_EQ("b", output[1]); |
| EXPECT_EQ("c", output[2]); |
| } |
| { |
| const string input = "a b c "; |
| vector<string> output; |
| Util::SplitStringUsing(input, " ", &output); |
| EXPECT_EQ(output.size(), 3); |
| EXPECT_EQ("a", output[0]); |
| EXPECT_EQ("b", output[1]); |
| EXPECT_EQ("c", output[2]); |
| } |
| { |
| const string input = "a:b cd "; |
| vector<string> output; |
| Util::SplitStringUsing(input, ": ", &output); |
| EXPECT_EQ(output.size(), 3); |
| EXPECT_EQ("a", output[0]); |
| EXPECT_EQ("b", output[1]); |
| EXPECT_EQ("cd", output[2]); |
| } |
| { |
| const string input = "Empty delimiter"; |
| vector<string> output; |
| Util::SplitStringUsing(input, "", &output); |
| EXPECT_EQ(output.size(), 1); |
| EXPECT_EQ(input, output[0]); |
| } |
| } |
| |
| TEST(UtilTest, SplitStringAllowEmpty) { |
| { |
| const string input = "a b c def"; |
| vector<string> output; |
| Util::SplitStringAllowEmpty(input, " ", &output); |
| EXPECT_EQ(output.size(), 5); |
| EXPECT_EQ("a", output[0]); |
| EXPECT_EQ("b", output[1]); |
| EXPECT_EQ("", output[2]); |
| EXPECT_EQ("c", output[3]); |
| EXPECT_EQ("def", output[4]); |
| } |
| { |
| const string input = " a b c"; |
| vector<string> output; |
| Util::SplitStringAllowEmpty(input, " ", &output); |
| EXPECT_EQ(output.size(), 5); |
| EXPECT_EQ("", output[0]); |
| EXPECT_EQ("a", output[1]); |
| EXPECT_EQ("b", output[2]); |
| EXPECT_EQ("", output[3]); |
| EXPECT_EQ("c", output[4]); |
| } |
| { |
| const string input = "a b c "; |
| vector<string> output; |
| Util::SplitStringAllowEmpty(input, " ", &output); |
| EXPECT_EQ(output.size(), 5); |
| EXPECT_EQ("a", output[0]); |
| EXPECT_EQ("b", output[1]); |
| EXPECT_EQ("", output[2]); |
| EXPECT_EQ("c", output[3]); |
| EXPECT_EQ("", output[4]); |
| } |
| { |
| const string input = "a:b c "; |
| vector<string> output; |
| Util::SplitStringAllowEmpty(input, ": ", &output); |
| EXPECT_EQ(output.size(), 5); |
| EXPECT_EQ("a", output[0]); |
| EXPECT_EQ("b", output[1]); |
| EXPECT_EQ("", output[2]); |
| EXPECT_EQ("c", output[3]); |
| EXPECT_EQ("", output[4]); |
| } |
| { |
| const string input = "Empty delimiter"; |
| vector<string> output; |
| Util::SplitStringAllowEmpty(input, "", &output); |
| EXPECT_EQ(output.size(), 1); |
| EXPECT_EQ(input, output[0]); |
| } |
| } |
| |
| TEST(UtilTest, StripWhiteSpaces) { |
| // basic scenario. |
| { |
| const string input = " foo "; |
| string output; |
| Util::StripWhiteSpaces(input, &output); |
| EXPECT_EQ("foo", output); |
| } |
| |
| // no space means just copy. |
| { |
| const string input = "foo"; |
| string output; |
| Util::StripWhiteSpaces(input, &output); |
| EXPECT_EQ("foo", output); |
| } |
| |
| // tabs and linebreaks are also spaces. |
| { |
| const string input = " \tfoo\n"; |
| string output; |
| Util::StripWhiteSpaces(input, &output); |
| EXPECT_EQ("foo", output); |
| } |
| |
| // spaces in the middle remains. |
| { |
| const string input = " foo bar baz "; |
| string output; |
| Util::StripWhiteSpaces(input, &output); |
| EXPECT_EQ("foo bar baz", output); |
| } |
| |
| // all spaces means clear out output. |
| { |
| const string input = " \v \r "; |
| string output; |
| Util::StripWhiteSpaces(input, &output); |
| EXPECT_TRUE(output.empty()); |
| } |
| |
| // empty input. |
| { |
| const string input = ""; |
| string output; |
| Util::StripWhiteSpaces(input, &output); |
| EXPECT_TRUE(output.empty()); |
| } |
| |
| // one character. |
| { |
| const string input = "a"; |
| string output; |
| Util::StripWhiteSpaces(input, &output); |
| EXPECT_EQ("a", output); |
| } |
| } |
| |
| TEST(UtilTest, SplitStringToUtf8Chars) { |
| { |
| vector<string> output; |
| Util::SplitStringToUtf8Chars("", &output); |
| EXPECT_EQ(0, output.size()); |
| } |
| |
| { |
| // "a" "あ" "A" "亜" "\n" "a" |
| const string kInputs[] = { |
| "a", |
| "\xE3\x81\x82", |
| "\xEF\xBC\xA1", |
| "\xE4\xBA\x9C", |
| "\n", |
| "a", |
| }; |
| string joined_string; |
| for (int i = 0; i < arraysize(kInputs); ++i) { |
| joined_string += kInputs[i]; |
| } |
| |
| vector<string> output; |
| Util::SplitStringToUtf8Chars(joined_string, &output); |
| EXPECT_EQ(arraysize(kInputs), output.size()); |
| |
| for (size_t i = 0; i < output.size(); ++i) { |
| EXPECT_EQ(kInputs[i], output[i]); |
| } |
| } |
| } |
| |
| TEST(UtilTest, SplitCSV) { |
| vector<string> answer_vector; |
| |
| Util::SplitCSV( |
| "Google,x,\"Buchheit, Paul\",\"string with \"\" quote in it\"", |
| &answer_vector); |
| CHECK_EQ(answer_vector.size(), 4); |
| CHECK_EQ(answer_vector[0], "Google"); |
| CHECK_EQ(answer_vector[1], "x"); |
| CHECK_EQ(answer_vector[2], "Buchheit, Paul"); |
| CHECK_EQ(answer_vector[3], "string with \" quote in it"); |
| |
| Util::SplitCSV("Google,hello,", &answer_vector); |
| CHECK_EQ(answer_vector.size(), 3); |
| CHECK_EQ(answer_vector[0], "Google"); |
| CHECK_EQ(answer_vector[1], "hello"); |
| CHECK_EQ(answer_vector[2], ""); |
| |
| Util::SplitCSV("Google rocks,hello", &answer_vector); |
| CHECK_EQ(answer_vector.size(), 2); |
| CHECK_EQ(answer_vector[0], "Google rocks"); |
| CHECK_EQ(answer_vector[1], "hello"); |
| |
| Util::SplitCSV(",,\"\",,", &answer_vector); |
| CHECK_EQ(answer_vector.size(), 5); |
| CHECK_EQ(answer_vector[0], ""); |
| CHECK_EQ(answer_vector[1], ""); |
| CHECK_EQ(answer_vector[2], ""); |
| CHECK_EQ(answer_vector[3], ""); |
| CHECK_EQ(answer_vector[4], ""); |
| |
| // Test a string containing a comma. |
| Util::SplitCSV("\",\",hello", &answer_vector); |
| CHECK_EQ(answer_vector.size(), 2); |
| CHECK_EQ(answer_vector[0], ","); |
| CHECK_EQ(answer_vector[1], "hello"); |
| |
| // Invalid CSV |
| Util::SplitCSV("\"no,last,quote", &answer_vector); |
| CHECK_EQ(answer_vector.size(), 1); |
| CHECK_EQ(answer_vector[0], "no,last,quote"); |
| |
| Util::SplitCSV("backslash\\,is,no,an,\"escape\"", &answer_vector); |
| CHECK_EQ(answer_vector.size(), 5); |
| CHECK_EQ(answer_vector[0], "backslash\\"); |
| CHECK_EQ(answer_vector[1], "is"); |
| CHECK_EQ(answer_vector[2], "no"); |
| CHECK_EQ(answer_vector[3], "an"); |
| CHECK_EQ(answer_vector[4], "escape"); |
| |
| Util::SplitCSV("", &answer_vector); |
| CHECK_EQ(answer_vector.size(), 0); |
| } |
| |
| TEST(UtilTest, ReplaceString) { |
| const string input = "foobarfoobar"; |
| string output; |
| Util::StringReplace(input, "bar", "buz", true, &output); |
| EXPECT_EQ("foobuzfoobuz", output); |
| |
| output.clear(); |
| Util::StringReplace(input, "bar", "buz", false, &output); |
| EXPECT_EQ("foobuzfoobar", output); |
| } |
| |
| TEST(UtilTest, LowerString) { |
| string s = "TeSTtest"; |
| Util::LowerString(&s); |
| EXPECT_EQ("testtest", s); |
| |
| // "TeST@ABCXYZ[`abcxyz{" |
| string s2 = "\xef\xbc\xb4\xef\xbd\x85\xef\xbc\xb3\xef\xbc\xb4\xef\xbc\xa0\xef" |
| "\xbc\xa1\xef\xbc\xa2\xef\xbc\xa3\xef\xbc\xb8\xef\xbc\xb9\xef\xbc" |
| "\xba\xef\xbc\xbb\xef\xbd\x80\xef\xbd\x81\xef\xbd\x82\xef\xbd\x83" |
| "\xef\xbd\x98\xef\xbd\x99\xef\xbd\x9a\xef\xbd\x9b"; |
| Util::LowerString(&s2); |
| // "test@abcxyz[`abcxyz{" |
| EXPECT_EQ("\xef\xbd\x94\xef\xbd\x85\xef\xbd\x93\xef\xbd\x94\xef\xbc\xa0\xef" |
| "\xbd\x81\xef\xbd\x82\xef\xbd\x83\xef\xbd\x98\xef\xbd\x99\xef\xbd" |
| "\x9a\xef\xbc\xbb\xef\xbd\x80\xef\xbd\x81\xef\xbd\x82\xef\xbd\x83" |
| "\xef\xbd\x98\xef\xbd\x99\xef\xbd\x9a\xef\xbd\x9b", s2); |
| } |
| |
| TEST(UtilTest, UpperString) { |
| string s = "TeSTtest"; |
| Util::UpperString(&s); |
| EXPECT_EQ("TESTTEST", s); |
| |
| // "TeST@ABCXYZ[`abcxyz{" |
| string s2 = "\xef\xbc\xb4\xef\xbd\x85\xef\xbc\xb3\xef\xbc\xb4\xef\xbc\xa0\xef" |
| "\xbc\xa1\xef\xbc\xa2\xef\xbc\xa3\xef\xbc\xb8\xef\xbc\xb9\xef\xbc" |
| "\xba\xef\xbc\xbb\xef\xbd\x80\xef\xbd\x81\xef\xbd\x82\xef\xbd\x83" |
| "\xef\xbd\x98\xef\xbd\x99\xef\xbd\x9a\xef\xbd\x9b"; |
| Util::UpperString(&s2); |
| // "TEST@ABCXYZ[`ABCXYZ{" |
| EXPECT_EQ("\xef\xbc\xb4\xef\xbc\xa5\xef\xbc\xb3\xef\xbc\xb4\xef\xbc\xa0\xef" |
| "\xbc\xa1\xef\xbc\xa2\xef\xbc\xa3\xef\xbc\xb8\xef\xbc\xb9\xef\xbc" |
| "\xba\xef\xbc\xbb\xef\xbd\x80\xef\xbc\xa1\xef\xbc\xa2\xef\xbc\xa3" |
| "\xef\xbc\xb8\xef\xbc\xb9\xef\xbc\xba\xef\xbd\x9b", s2); |
| } |
| |
| TEST(UtilTest, CapitalizeString) { |
| string s = "TeSTtest"; |
| Util::CapitalizeString(&s); |
| EXPECT_EQ("Testtest", s); |
| |
| // "TeST@ABCXYZ[`abcxyz{" |
| string s2 = "\xef\xbc\xb4\xef\xbd\x85\xef\xbc\xb3\xef\xbc\xb4\xef\xbc\xa0\xef" |
| "\xbc\xa1\xef\xbc\xa2\xef\xbc\xa3\xef\xbc\xb8\xef\xbc\xb9\xef\xbc" |
| "\xba\xef\xbc\xbb\xef\xbd\x80\xef\xbd\x81\xef\xbd\x82\xef\xbd\x83" |
| "\xef\xbd\x98\xef\xbd\x99\xef\xbd\x9a\xef\xbd\x9b"; |
| Util::CapitalizeString(&s2); |
| // "Test@abcxyz[`abcxyz{" |
| EXPECT_EQ("\xef\xbc\xb4\xef\xbd\x85\xef\xbd\x93\xef\xbd\x94\xef\xbc\xa0\xef" |
| "\xbd\x81\xef\xbd\x82\xef\xbd\x83\xef\xbd\x98\xef\xbd\x99\xef\xbd" |
| "\x9a\xef\xbc\xbb\xef\xbd\x80\xef\xbd\x81\xef\xbd\x82\xef\xbd\x83" |
| "\xef\xbd\x98\xef\xbd\x99\xef\xbd\x9a\xef\xbd\x9b", s2); |
| } |
| |
| TEST(UtilTest, IsLowerAscii) { |
| EXPECT_TRUE(Util::IsLowerAscii("")); |
| EXPECT_TRUE(Util::IsLowerAscii("hello")); |
| EXPECT_FALSE(Util::IsLowerAscii("HELLO")); |
| EXPECT_FALSE(Util::IsLowerAscii("Hello")); |
| EXPECT_FALSE(Util::IsLowerAscii("HeLlO")); |
| EXPECT_FALSE(Util::IsLowerAscii("symbol!")); |
| EXPECT_FALSE(Util::IsLowerAscii( // "Hello" |
| "\xEF\xBC\xA8\xEF\xBD\x85\xEF\xBD\x8C\xEF\xBD\x8C\xEF\xBD\x8F")); |
| } |
| |
| TEST(UtilTest, IsUpperAscii) { |
| EXPECT_TRUE(Util::IsUpperAscii("")); |
| EXPECT_FALSE(Util::IsUpperAscii("hello")); |
| EXPECT_TRUE(Util::IsUpperAscii("HELLO")); |
| EXPECT_FALSE(Util::IsUpperAscii("Hello")); |
| EXPECT_FALSE(Util::IsUpperAscii("HeLlO")); |
| EXPECT_FALSE(Util::IsUpperAscii("symbol!")); |
| EXPECT_FALSE(Util::IsUpperAscii( // "Hello" |
| "\xEF\xBC\xA8\xEF\xBD\x85\xEF\xBD\x8C\xEF\xBD\x8C\xEF\xBD\x8F")); |
| } |
| |
| TEST(UtilTest, IsCapitalizedAscii) { |
| EXPECT_TRUE(Util::IsCapitalizedAscii("")); |
| EXPECT_FALSE(Util::IsCapitalizedAscii("hello")); |
| EXPECT_FALSE(Util::IsCapitalizedAscii("HELLO")); |
| EXPECT_TRUE(Util::IsCapitalizedAscii("Hello")); |
| EXPECT_FALSE(Util::IsCapitalizedAscii("HeLlO")); |
| EXPECT_FALSE(Util::IsCapitalizedAscii("symbol!")); |
| EXPECT_FALSE(Util::IsCapitalizedAscii( // "Hello" |
| "\xEF\xBC\xA8\xEF\xBD\x85\xEF\xBD\x8C\xEF\xBD\x8C\xEF\xBD\x8F")); |
| } |
| |
| TEST(UtilTest, IsLowerOrUpperAscii) { |
| EXPECT_TRUE(Util::IsLowerOrUpperAscii("")); |
| EXPECT_TRUE(Util::IsLowerOrUpperAscii("hello")); |
| EXPECT_TRUE(Util::IsLowerOrUpperAscii("HELLO")); |
| EXPECT_FALSE(Util::IsLowerOrUpperAscii("Hello")); |
| EXPECT_FALSE(Util::IsLowerOrUpperAscii("HeLlO")); |
| EXPECT_FALSE(Util::IsLowerOrUpperAscii("symbol!")); |
| EXPECT_FALSE(Util::IsLowerOrUpperAscii( // "Hello" |
| "\xEF\xBC\xA8\xEF\xBD\x85\xEF\xBD\x8C\xEF\xBD\x8C\xEF\xBD\x8F")); |
| } |
| |
| TEST(UtilTest, IsUpperOrCapitalizedAscii) { |
| EXPECT_TRUE(Util::IsUpperOrCapitalizedAscii("")); |
| EXPECT_FALSE(Util::IsUpperOrCapitalizedAscii("hello")); |
| EXPECT_TRUE(Util::IsUpperOrCapitalizedAscii("HELLO")); |
| EXPECT_TRUE(Util::IsUpperOrCapitalizedAscii("Hello")); |
| EXPECT_FALSE(Util::IsUpperOrCapitalizedAscii("HeLlO")); |
| EXPECT_FALSE(Util::IsUpperOrCapitalizedAscii("symbol!")); |
| EXPECT_FALSE(Util::IsUpperOrCapitalizedAscii( // "Hello" |
| "\xEF\xBC\xA8\xEF\xBD\x85\xEF\xBD\x8C\xEF\xBD\x8C\xEF\xBD\x8F")); |
| } |
| |
| void VerifyUTF8ToUCS4(const string &text, char32 expected_ucs4, |
| size_t expected_len) { |
| const char *begin = text.data(); |
| const char *end = begin + text.size(); |
| size_t mblen = 0; |
| char32 result = Util::UTF8ToUCS4(begin, end, &mblen); |
| EXPECT_EQ(expected_ucs4, result) << text << " " << expected_ucs4; |
| EXPECT_EQ(expected_len, mblen) << text << " " << expected_len; |
| } |
| |
| TEST(UtilTest, UTF8ToUCS4) { |
| VerifyUTF8ToUCS4("", 0, 0); |
| VerifyUTF8ToUCS4("\x01", 1, 1); |
| VerifyUTF8ToUCS4("\x7F", 0x7F, 1); |
| VerifyUTF8ToUCS4("\xC2\x80", 0x80, 2); |
| VerifyUTF8ToUCS4("\xDF\xBF", 0x7FF, 2); |
| VerifyUTF8ToUCS4("\xE0\xA0\x80", 0x800, 3); |
| VerifyUTF8ToUCS4("\xEF\xBF\xBF", 0xFFFF, 3); |
| VerifyUTF8ToUCS4("\xF0\x90\x80\x80", 0x10000, 4); |
| VerifyUTF8ToUCS4("\xF7\xBF\xBF\xBF", 0x1FFFFF, 4); |
| // do not test 5-6 bytes because it's out of spec of UTF8. |
| } |
| |
| TEST(UtilTest, UCS4ToUTF8) { |
| string output; |
| |
| // Do nothing if |c| is NUL. Previous implementation of UCS4ToUTF8 worked like |
| // this even though the reason is unclear. |
| Util::UCS4ToUTF8(0, &output); |
| EXPECT_TRUE(output.empty()); |
| |
| Util::UCS4ToUTF8(0x7F, &output); |
| EXPECT_EQ("\x7F", output); |
| Util::UCS4ToUTF8(0x80, &output); |
| EXPECT_EQ("\xC2\x80", output); |
| Util::UCS4ToUTF8(0x7FF, &output); |
| EXPECT_EQ("\xDF\xBF", output); |
| Util::UCS4ToUTF8(0x800, &output); |
| EXPECT_EQ("\xE0\xA0\x80", output); |
| Util::UCS4ToUTF8(0xFFFF, &output); |
| EXPECT_EQ("\xEF\xBF\xBF", output); |
| Util::UCS4ToUTF8(0x10000, &output); |
| EXPECT_EQ("\xF0\x90\x80\x80", output); |
| Util::UCS4ToUTF8(0x1FFFFF, &output); |
| EXPECT_EQ("\xF7\xBF\xBF\xBF", output); |
| } |
| |
| TEST(UtilTest, CharsLen) { |
| // "私の名前は中野です" |
| const string src = "\xe7\xa7\x81\xe3\x81\xae\xe5\x90\x8d\xe5\x89\x8d\xe3\x81" |
| "\xaf\xe4\xb8\xad\xe9\x87\x8e\xe3\x81\xa7\xe3\x81\x99"; |
| EXPECT_EQ(Util::CharsLen(src.c_str(), src.size()), 9); |
| } |
| |
| TEST(UtilTest, SubStringPiece) { |
| // "私の名前は中野です" |
| const string src = "\xe7\xa7\x81\xe3\x81\xae\xe5\x90\x8d\xe5\x89\x8d\xe3\x81" |
| "\xaf\xe4\xb8\xad\xe9\x87\x8e\xe3\x81\xa7\xe3\x81\x99"; |
| StringPiece result; |
| |
| result = Util::SubStringPiece(src, 0, 2); |
| // "私の" |
| EXPECT_EQ("\xe7\xa7\x81\xe3\x81\xae", result); |
| // |result|'s data should point to the same memory block as src. |
| EXPECT_LE(src.data(), result.data()); |
| |
| result = Util::SubStringPiece(src, 4, 1); |
| // "は" |
| EXPECT_EQ("\xe3\x81\xaf", result); |
| EXPECT_LE(src.data(), result.data()); |
| |
| result = Util::SubStringPiece(src, 5, 3); |
| // "中野で" |
| EXPECT_EQ("\xe4\xb8\xad\xe9\x87\x8e\xe3\x81\xa7", result); |
| EXPECT_LE(src.data(), result.data()); |
| |
| result = Util::SubStringPiece(src, 6, 10); |
| // "野です" |
| EXPECT_EQ("\xe9\x87\x8e\xe3\x81\xa7\xe3\x81\x99", result); |
| EXPECT_LE(src.data(), result.data()); |
| |
| result = Util::SubStringPiece(src, 4, 2); |
| // "は中" |
| EXPECT_EQ("\xe3\x81\xaf\xe4\xb8\xad", result); |
| EXPECT_LE(src.data(), result.data()); |
| |
| result = Util::SubStringPiece(src, 2, string::npos); |
| // "名前は中野です" |
| EXPECT_EQ("\xe5\x90\x8d\xe5\x89\x8d\xe3\x81\xaf\xe4\xb8\xad\xe9\x87" |
| "\x8e\xe3\x81\xa7\xe3\x81\x99", |
| result); |
| EXPECT_LE(src.data(), result.data()); |
| |
| result = Util::SubStringPiece(src, 5, string::npos); |
| // "中野です" |
| EXPECT_EQ("\xe4\xb8\xad\xe9\x87\x8e\xe3\x81\xa7\xe3\x81\x99", result); |
| EXPECT_LE(src.data(), result.data()); |
| } |
| |
| TEST(UtilTest, SubStringPiece2) { |
| // "私はGoogleです" |
| const string src = |
| "\xE7\xA7\x81\xE3\x81\xAF\x47\x6F\x6F\x67\x6C\x65" |
| "\xE3\x81\xA7\xE3\x81\x99"; |
| |
| StringPiece result; |
| |
| result = Util::SubStringPiece(src, 0); |
| EXPECT_EQ(src, result); |
| |
| result = Util::SubStringPiece(src, 5); |
| // "gleです" |
| EXPECT_EQ("\x67\x6C\x65\xE3\x81\xA7\xE3\x81\x99", result); |
| |
| result = Util::SubStringPiece(src, 10); |
| EXPECT_TRUE(result.empty()); |
| |
| result = Util::SubStringPiece(src, 13); |
| EXPECT_TRUE(result.empty()); |
| } |
| |
| TEST(UtilTest, SubString) { |
| // "私の名前は中野です" |
| const string src = "\xe7\xa7\x81\xe3\x81\xae\xe5\x90\x8d\xe5\x89\x8d\xe3\x81" |
| "\xaf\xe4\xb8\xad\xe9\x87\x8e\xe3\x81\xa7\xe3\x81\x99"; |
| string result; |
| |
| result.clear(); |
| Util::SubString(src, 0, 2, &result); |
| // "私の" |
| EXPECT_EQ(result, "\xe7\xa7\x81\xe3\x81\xae"); |
| |
| result.clear(); |
| Util::SubString(src, 4, 1, &result); |
| // "は" |
| EXPECT_EQ(result, "\xe3\x81\xaf"); |
| |
| result.clear(); |
| Util::SubString(src, 5, 3, &result); |
| // "中野で" |
| EXPECT_EQ(result, "\xe4\xb8\xad\xe9\x87\x8e\xe3\x81\xa7"); |
| |
| result.clear(); |
| Util::SubString(src, 6, 10, &result); |
| // "野です" |
| EXPECT_EQ(result, "\xe9\x87\x8e\xe3\x81\xa7\xe3\x81\x99"); |
| |
| result.clear(); |
| Util::SubString(src, 4, 2, &result); |
| // "は中" |
| EXPECT_EQ(result, "\xe3\x81\xaf\xe4\xb8\xad"); |
| |
| result.clear(); |
| Util::SubString(src, 2, string::npos, &result); |
| // "名前は中野です" |
| EXPECT_EQ(result, "\xe5\x90\x8d\xe5\x89\x8d\xe3\x81\xaf\xe4\xb8\xad\xe9\x87" |
| "\x8e\xe3\x81\xa7\xe3\x81\x99"); |
| |
| result.clear(); |
| Util::SubString(src, 5, string::npos, &result); |
| // "中野です" |
| EXPECT_EQ(result, "\xe4\xb8\xad\xe9\x87\x8e\xe3\x81\xa7\xe3\x81\x99"); |
| |
| // Doesn't clear result and call Util::SubString |
| Util::SubString(src, 5, string::npos, &result); |
| // "中野です" |
| EXPECT_EQ(result, "\xe4\xb8\xad\xe9\x87\x8e\xe3\x81\xa7\xe3\x81\x99"); |
| } |
| |
| TEST(UtilTest, StartsWith) { |
| const string str = "abcdefg"; |
| EXPECT_TRUE(Util::StartsWith(str, "")); |
| EXPECT_TRUE(Util::StartsWith(str, "a")); |
| EXPECT_TRUE(Util::StartsWith(str, "abc")); |
| EXPECT_TRUE(Util::StartsWith(str, "abcdefg")); |
| EXPECT_FALSE(Util::StartsWith(str, "abcdefghi")); |
| EXPECT_FALSE(Util::StartsWith(str, "foobar")); |
| } |
| |
| TEST(UtilTest, EndsWith) { |
| const string str = "abcdefg"; |
| EXPECT_TRUE(Util::EndsWith(str, "")); |
| EXPECT_TRUE(Util::EndsWith(str, "g")); |
| EXPECT_TRUE(Util::EndsWith(str, "fg")); |
| EXPECT_TRUE(Util::EndsWith(str, "abcdefg")); |
| EXPECT_FALSE(Util::EndsWith(str, "aaabcdefg")); |
| EXPECT_FALSE(Util::EndsWith(str, "foobar")); |
| EXPECT_FALSE(Util::EndsWith(str, "foobarbuzbuz")); |
| } |
| |
| TEST(UtilTest, StripUTF8BOM) { |
| string line; |
| |
| // Should be stripped. |
| line = "\xef\xbb\xbf" "abc"; |
| Util::StripUTF8BOM(&line); |
| EXPECT_EQ("abc", line); |
| |
| // Should be stripped. |
| line = "\xef\xbb\xbf"; |
| Util::StripUTF8BOM(&line); |
| EXPECT_EQ("", line); |
| |
| // BOM in the middle of text. Shouldn't be stripped. |
| line = "a" "\xef\xbb\xbf" "bc"; |
| Util::StripUTF8BOM(&line); |
| EXPECT_EQ("a" "\xef\xbb\xbf" "bc", line); |
| |
| // Incomplete BOM. Shouldn't be stripped. |
| line = "\xef\xbb" "abc"; |
| Util::StripUTF8BOM(&line); |
| EXPECT_EQ("\xef\xbb" "abc", line); |
| |
| // String shorter than the BOM. Do nothing. |
| line = "a"; |
| Util::StripUTF8BOM(&line); |
| EXPECT_EQ("a", line); |
| |
| // Empty string. Do nothing. |
| line = ""; |
| Util::StripUTF8BOM(&line); |
| EXPECT_EQ("", line); |
| } |
| |
| TEST(UtilTest, IsUTF16BOM) { |
| EXPECT_FALSE(Util::IsUTF16BOM("")); |
| EXPECT_FALSE(Util::IsUTF16BOM("abc")); |
| EXPECT_TRUE(Util::IsUTF16BOM("\xfe\xff")); |
| EXPECT_TRUE(Util::IsUTF16BOM("\xff\xfe")); |
| EXPECT_TRUE(Util::IsUTF16BOM("\xfe\xff ")); |
| EXPECT_TRUE(Util::IsUTF16BOM("\xff\xfe ")); |
| EXPECT_FALSE(Util::IsUTF16BOM(" \xfe\xff")); |
| EXPECT_FALSE(Util::IsUTF16BOM(" \xff\xfe")); |
| EXPECT_FALSE(Util::IsUTF16BOM("\xff\xff")); |
| } |
| |
| TEST(UtilTest, IsAndroidPuaEmoji) { |
| EXPECT_FALSE(Util::IsAndroidPuaEmoji("")); |
| EXPECT_FALSE(Util::IsAndroidPuaEmoji("A")); |
| EXPECT_FALSE(Util::IsAndroidPuaEmoji("a")); |
| |
| string str; |
| Util::UCS4ToUTF8(0xFDFFF, &str); |
| EXPECT_FALSE(Util::IsAndroidPuaEmoji(str)); |
| Util::UCS4ToUTF8(0xFE000, &str); |
| EXPECT_TRUE(Util::IsAndroidPuaEmoji(str)); |
| Util::UCS4ToUTF8(0xFE800, &str); |
| EXPECT_TRUE(Util::IsAndroidPuaEmoji(str)); |
| Util::UCS4ToUTF8(0xFEEA0, &str); |
| EXPECT_TRUE(Util::IsAndroidPuaEmoji(str)); |
| Util::UCS4ToUTF8(0xFEEA1, &str); |
| EXPECT_FALSE(Util::IsAndroidPuaEmoji(str)); |
| |
| // If it has two ucs4 chars (or more), just expect false. |
| Util::UCS4ToUTF8(0xFE000, &str); |
| Util::UCS4ToUTF8Append(0xFE000, &str); |
| EXPECT_FALSE(Util::IsAndroidPuaEmoji(str)); |
| } |
| |
| TEST(UtilTest, StringPrintf) { |
| // On GCC, |EXPECT_EQ("", Util::StringPrintf(""))| may cause |
| // "warning: zero-length printf format string" so we disable this check. |
| MOZC_GCC_DISABLE_WARNING_INLINE(format-zero-length); |
| |
| // strings |
| EXPECT_EQ("", Util::StringPrintf("")); |
| EXPECT_EQ("", Util::StringPrintf("%s", "")); |
| EXPECT_EQ("hello, world", Util::StringPrintf("hello, world")); |
| EXPECT_EQ("hello, world", Util::StringPrintf("%s", "hello, world")); |
| EXPECT_EQ("hello, world", Util::StringPrintf("%s, %s", "hello", "world")); |
| const char kHello[] = "\xE3\x81\xAF\xE3\x82\x8D\xE3\x83\xBC" // はろー |
| "\xE4\xB8\x96\xE7\x95\x8C"; // 世界 |
| EXPECT_EQ(kHello, Util::StringPrintf("%s", kHello)); |
| |
| // 32-bit integers |
| EXPECT_EQ("-2147483648", Util::StringPrintf("%d", kint32min)); |
| EXPECT_EQ("2147483647", Util::StringPrintf("%d", kint32max)); |
| EXPECT_EQ("4294967295", Util::StringPrintf("%u", kuint32max)); |
| EXPECT_EQ("80000000", Util::StringPrintf("%x", kint32min)); |
| EXPECT_EQ("7fffffff", Util::StringPrintf("%x", kint32max)); |
| EXPECT_EQ("FFFFFFFF", Util::StringPrintf("%X", kuint32max)); |
| |
| // 64-bit integers |
| EXPECT_EQ("-9223372036854775808", |
| Util::StringPrintf("%" GG_LL_FORMAT "d", kint64min)); |
| EXPECT_EQ("9223372036854775807", |
| Util::StringPrintf("%" GG_LL_FORMAT "d", kint64max)); |
| EXPECT_EQ("18446744073709551615", |
| Util::StringPrintf("%" GG_LL_FORMAT "u", kuint64max)); |
| EXPECT_EQ("8000000000000000", |
| Util::StringPrintf("%" GG_LL_FORMAT "x", kint64min)); |
| EXPECT_EQ("7fffffffffffffff", |
| Util::StringPrintf("%" GG_LL_FORMAT "x", kint64max)); |
| EXPECT_EQ("FFFFFFFFFFFFFFFF", |
| Util::StringPrintf("%" GG_LL_FORMAT "X", kuint64max)); |
| |
| // Simple test for floating point numbers |
| EXPECT_EQ("-1.75", Util::StringPrintf("%.2f", -1.75)); |
| |
| // 4096 is greater than a temporary buffer size (1024 bytes) |
| // which is used in StringPrintf(). |
| const string kLongStrA(4096, '.'); |
| const string kLongStrB(4096, '_'); |
| const string& result = Util::StringPrintf("%s\t%s\n", |
| kLongStrA.c_str(), |
| kLongStrB.c_str()); |
| EXPECT_EQ(kLongStrA + "\t" + kLongStrB + "\n", result); |
| } |
| |
| TEST(UtilTest, HiraganaToKatakana) { |
| { |
| // "あいうえおぁぃぅぇぉかきくけこがぎぐげごさしすせそざじずぜぞたちつてと" |
| // "だぢづでどっなにぬねのはひふへほばびぶべぼぱぴぷぺぽまみむめもやゆよゃ" |
| // "ゅょらりるれろわゎをんゔ" |
| const string input = |
| "\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86\xE3\x81\x88\xE3\x81\x8A" |
| "\xE3\x81\x81\xE3\x81\x83\xE3\x81\x85\xE3\x81\x87\xE3\x81\x89" |
| "\xE3\x81\x8B\xE3\x81\x8D\xE3\x81\x8F\xE3\x81\x91\xE3\x81\x93" |
| "\xE3\x81\x8C\xE3\x81\x8E\xE3\x81\x90\xE3\x81\x92\xE3\x81\x94" |
| "\xE3\x81\x95\xE3\x81\x97\xE3\x81\x99\xE3\x81\x9B\xE3\x81\x9D" |
| "\xE3\x81\x96\xE3\x81\x98\xE3\x81\x9A\xE3\x81\x9C\xE3\x81\x9E" |
| "\xE3\x81\x9F\xE3\x81\xA1\xE3\x81\xA4\xE3\x81\xA6\xE3\x81\xA8" |
| "\xE3\x81\xA0\xE3\x81\xA2\xE3\x81\xA5\xE3\x81\xA7\xE3\x81\xA9" |
| "\xE3\x81\xA3\xE3\x81\xAA\xE3\x81\xAB\xE3\x81\xAC\xE3\x81\xAD" |
| "\xE3\x81\xAE\xE3\x81\xAF\xE3\x81\xB2\xE3\x81\xB5\xE3\x81\xB8" |
| "\xE3\x81\xBB\xE3\x81\xB0\xE3\x81\xB3\xE3\x81\xB6\xE3\x81\xB9" |
| "\xE3\x81\xBC\xE3\x81\xB1\xE3\x81\xB4\xE3\x81\xB7\xE3\x81\xBA" |
| "\xE3\x81\xBD\xE3\x81\xBE\xE3\x81\xBF\xE3\x82\x80\xE3\x82\x81" |
| "\xE3\x82\x82\xE3\x82\x84\xE3\x82\x86\xE3\x82\x88\xE3\x82\x83" |
| "\xE3\x82\x85\xE3\x82\x87\xE3\x82\x89\xE3\x82\x8A\xE3\x82\x8B" |
| "\xE3\x82\x8C\xE3\x82\x8D\xE3\x82\x8F\xE3\x82\x8E\xE3\x82\x92" |
| "\xE3\x82\x93\xE3\x82\x94"; |
| string output; |
| Util::HiraganaToKatakana(input, &output); |
| // "アイウエオァィゥェォカキクケコガギグゲゴサシスセソザジズゼゾタチツテト" |
| // "ダヂヅデドッナニヌネノハヒフヘホバビブベボパピプペポマミムメモヤユヨャ" |
| // "ュョラリルレロワヮヲンヴ" |
| EXPECT_EQ("\xE3\x82\xA2\xE3\x82\xA4\xE3\x82\xA6\xE3\x82\xA8\xE3\x82\xAA" |
| "\xE3\x82\xA1\xE3\x82\xA3\xE3\x82\xA5\xE3\x82\xA7\xE3\x82\xA9" |
| "\xE3\x82\xAB\xE3\x82\xAD\xE3\x82\xAF\xE3\x82\xB1\xE3\x82\xB3" |
| "\xE3\x82\xAC\xE3\x82\xAE\xE3\x82\xB0\xE3\x82\xB2\xE3\x82\xB4" |
| "\xE3\x82\xB5\xE3\x82\xB7\xE3\x82\xB9\xE3\x82\xBB\xE3\x82\xBD" |
| "\xE3\x82\xB6\xE3\x82\xB8\xE3\x82\xBA\xE3\x82\xBC\xE3\x82\xBE" |
| "\xE3\x82\xBF\xE3\x83\x81\xE3\x83\x84\xE3\x83\x86\xE3\x83\x88" |
| "\xE3\x83\x80\xE3\x83\x82\xE3\x83\x85\xE3\x83\x87\xE3\x83\x89" |
| "\xE3\x83\x83\xE3\x83\x8A\xE3\x83\x8B\xE3\x83\x8C\xE3\x83\x8D" |
| "\xE3\x83\x8E\xE3\x83\x8F\xE3\x83\x92\xE3\x83\x95\xE3\x83\x98" |
| "\xE3\x83\x9B\xE3\x83\x90\xE3\x83\x93\xE3\x83\x96\xE3\x83\x99" |
| "\xE3\x83\x9C\xE3\x83\x91\xE3\x83\x94\xE3\x83\x97\xE3\x83\x9A" |
| "\xE3\x83\x9D\xE3\x83\x9E\xE3\x83\x9F\xE3\x83\xA0\xE3\x83\xA1" |
| "\xE3\x83\xA2\xE3\x83\xA4\xE3\x83\xA6\xE3\x83\xA8\xE3\x83\xA3" |
| "\xE3\x83\xA5\xE3\x83\xA7\xE3\x83\xA9\xE3\x83\xAA\xE3\x83\xAB" |
| "\xE3\x83\xAC\xE3\x83\xAD\xE3\x83\xAF\xE3\x83\xAE\xE3\x83\xB2" |
| "\xE3\x83\xB3\xE3\x83\xB4", |
| output); |
| } |
| { |
| // "わたしのなまえはなかのですうまーよろしゅう" |
| const string input = "\xe3\x82\x8f\xe3\x81\x9f\xe3\x81\x97\xe3\x81\xae\xe3" |
| "\x81\xaa\xe3\x81\xbe\xe3\x81\x88\xe3\x81\xaf\xe3\x81" |
| "\xaa\xe3\x81\x8b\xe3\x81\xae\xe3\x81\xa7\xe3\x81\x99" |
| "\xe3\x81\x86\xe3\x81\xbe\xe3\x83\xbc\xe3\x82\x88\xe3" |
| "\x82\x8d\xe3\x81\x97\xe3\x82\x85\xe3\x81\x86"; |
| string output; |
| Util::HiraganaToKatakana(input, &output); |
| // "ワタシノナマエハナカノデスウマーヨロシュウ" |
| EXPECT_EQ("\xe3\x83\xaf\xe3\x82\xbf\xe3\x82\xb7\xe3\x83\x8e\xe3\x83\x8a\xe3" |
| "\x83\x9e\xe3\x82\xa8\xe3\x83\x8f\xe3\x83\x8a\xe3\x82\xab\xe3\x83" |
| "\x8e\xe3\x83\x87\xe3\x82\xb9\xe3\x82\xa6\xe3\x83\x9e\xe3\x83\xbc" |
| "\xe3\x83\xa8\xe3\x83\xad\xe3\x82\xb7\xe3\x83\xa5\xe3\x82\xa6", |
| output); |
| } |
| { |
| // "グーグル工藤よろしくabc" |
| const string input = "\xe3\x82\xb0\xe3\x83\xbc\xe3\x82\xb0\xe3\x83\xab\xe5" |
| "\xb7\xa5\xe8\x97\xa4\xe3\x82\x88\xe3\x82\x8d\xe3\x81" |
| "\x97\xe3\x81\x8f\x61\x62\x63"; |
| string output; |
| Util::HiraganaToKatakana(input, &output); |
| // "グーグル工藤ヨロシクabc" |
| EXPECT_EQ("\xe3\x82\xb0\xe3\x83\xbc\xe3\x82\xb0\xe3\x83\xab\xe5\xb7\xa5\xe8" |
| "\x97\xa4\xe3\x83\xa8\xe3\x83\xad\xe3\x82\xb7\xe3\x82\xaf\x61\x62" |
| "\x63", output); |
| } |
| } |
| |
| TEST(UtilTest, KatakanaToHiragana) { |
| { |
| // "アイウエオァィゥェォカキクケコガギグゲゴサシスセソザジズゼゾタチツテト" |
| // "ダヂヅデドッナニヌネノハヒフヘホバビブベボパピプペポマミムメモヤユヨャ" |
| // "ュョラリルレロワヮヲンヰヱヴ" |
| const string input = |
| "\xE3\x82\xA2\xE3\x82\xA4\xE3\x82\xA6\xE3\x82\xA8\xE3\x82\xAA" |
| "\xE3\x82\xA1\xE3\x82\xA3\xE3\x82\xA5\xE3\x82\xA7\xE3\x82\xA9" |
| "\xE3\x82\xAB\xE3\x82\xAD\xE3\x82\xAF\xE3\x82\xB1\xE3\x82\xB3" |
| "\xE3\x82\xAC\xE3\x82\xAE\xE3\x82\xB0\xE3\x82\xB2\xE3\x82\xB4" |
| "\xE3\x82\xB5\xE3\x82\xB7\xE3\x82\xB9\xE3\x82\xBB\xE3\x82\xBD" |
| "\xE3\x82\xB6\xE3\x82\xB8\xE3\x82\xBA\xE3\x82\xBC\xE3\x82\xBE" |
| "\xE3\x82\xBF\xE3\x83\x81\xE3\x83\x84\xE3\x83\x86\xE3\x83\x88" |
| "\xE3\x83\x80\xE3\x83\x82\xE3\x83\x85\xE3\x83\x87\xE3\x83\x89" |
| "\xE3\x83\x83\xE3\x83\x8A\xE3\x83\x8B\xE3\x83\x8C\xE3\x83\x8D" |
| "\xE3\x83\x8E\xE3\x83\x8F\xE3\x83\x92\xE3\x83\x95\xE3\x83\x98" |
| "\xE3\x83\x9B\xE3\x83\x90\xE3\x83\x93\xE3\x83\x96\xE3\x83\x99" |
| "\xE3\x83\x9C\xE3\x83\x91\xE3\x83\x94\xE3\x83\x97\xE3\x83\x9A" |
| "\xE3\x83\x9D\xE3\x83\x9E\xE3\x83\x9F\xE3\x83\xA0\xE3\x83\xA1" |
| "\xE3\x83\xA2\xE3\x83\xA4\xE3\x83\xA6\xE3\x83\xA8\xE3\x83\xA3" |
| "\xE3\x83\xA5\xE3\x83\xA7\xE3\x83\xA9\xE3\x83\xAA\xE3\x83\xAB" |
| "\xE3\x83\xAC\xE3\x83\xAD\xE3\x83\xAF\xE3\x83\xAE\xE3\x83\xB2" |
| "\xE3\x83\xB3\xE3\x83\xB0\xE3\x83\xB1\xE3\x83\xB4"; |
| string output; |
| Util::KatakanaToHiragana(input, &output); |
| // "あいうえおぁぃぅぇぉかきくけこがぎぐげごさしすせそざじずぜぞたちつてと" |
| // "だぢづでどっなにぬねのはひふへほばびぶべぼぱぴぷぺぽまみむめもやゆよゃ" |
| // "ゅょらりるれろわゎをんゐゑゔ" |
| EXPECT_EQ("\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86\xE3\x81\x88\xE3\x81\x8A" |
| "\xE3\x81\x81\xE3\x81\x83\xE3\x81\x85\xE3\x81\x87\xE3\x81\x89" |
| "\xE3\x81\x8B\xE3\x81\x8D\xE3\x81\x8F\xE3\x81\x91\xE3\x81\x93" |
| "\xE3\x81\x8C\xE3\x81\x8E\xE3\x81\x90\xE3\x81\x92\xE3\x81\x94" |
| "\xE3\x81\x95\xE3\x81\x97\xE3\x81\x99\xE3\x81\x9B\xE3\x81\x9D" |
| "\xE3\x81\x96\xE3\x81\x98\xE3\x81\x9A\xE3\x81\x9C\xE3\x81\x9E" |
| "\xE3\x81\x9F\xE3\x81\xA1\xE3\x81\xA4\xE3\x81\xA6\xE3\x81\xA8" |
| "\xE3\x81\xA0\xE3\x81\xA2\xE3\x81\xA5\xE3\x81\xA7\xE3\x81\xA9" |
| "\xE3\x81\xA3\xE3\x81\xAA\xE3\x81\xAB\xE3\x81\xAC\xE3\x81\xAD" |
| "\xE3\x81\xAE\xE3\x81\xAF\xE3\x81\xB2\xE3\x81\xB5\xE3\x81\xB8" |
| "\xE3\x81\xBB\xE3\x81\xB0\xE3\x81\xB3\xE3\x81\xB6\xE3\x81\xB9" |
| "\xE3\x81\xBC\xE3\x81\xB1\xE3\x81\xB4\xE3\x81\xB7\xE3\x81\xBA" |
| "\xE3\x81\xBD\xE3\x81\xBE\xE3\x81\xBF\xE3\x82\x80\xE3\x82\x81" |
| "\xE3\x82\x82\xE3\x82\x84\xE3\x82\x86\xE3\x82\x88\xE3\x82\x83" |
| "\xE3\x82\x85\xE3\x82\x87\xE3\x82\x89\xE3\x82\x8A\xE3\x82\x8B" |
| "\xE3\x82\x8C\xE3\x82\x8D\xE3\x82\x8F\xE3\x82\x8E\xE3\x82\x92" |
| "\xE3\x82\x93\xE3\x82\x90\xE3\x82\x91\xE3\x82\x94", |
| output); |
| } |
| { |
| // "ワタシノナマエハナカノデスウマーヨロシュウ" |
| const string input = |
| "\xE3\x82\x8F\xE3\x81\x9F\xE3\x81\x97\xE3\x81\xAE\xE3\x81\xAA" |
| "\xE3\x81\xBE\xE3\x81\x88\xE3\x81\xAF\xE3\x81\xAA\xE3\x81\x8B" |
| "\xE3\x81\xAE\xE3\x81\xA7\xE3\x81\x99\xE3\x81\x86\xE3\x81\xBE" |
| "\xE3\x83\xBC\xE3\x82\x88\xE3\x82\x8D\xE3\x81\x97\xE3\x82\x85" |
| "\xE3\x81\x86"; |
| string output; |
| Util::KatakanaToHiragana(input, &output); |
| // "わたしのなまえはなかのですうまーよろしゅう" |
| EXPECT_EQ("\xE3\x82\x8F\xE3\x81\x9F\xE3\x81\x97\xE3\x81\xAE" |
| "\xE3\x81\xAA\xE3\x81\xBE\xE3\x81\x88\xE3\x81\xAF" |
| "\xE3\x81\xAA\xE3\x81\x8B\xE3\x81\xAE\xE3\x81\xA7" |
| "\xE3\x81\x99\xE3\x81\x86\xE3\x81\xBE\xE3\x83\xBC" |
| "\xE3\x82\x88\xE3\x82\x8D\xE3\x81\x97\xE3\x82\x85" |
| "\xE3\x81\x86", |
| output); |
| } |
| { |
| // "グーグル工藤ヨロシクabc" |
| const string input = |
| "\xE3\x82\xB0\xE3\x83\xBC\xE3\x82\xB0\xE3\x83\xAB\xE5\xB7\xA5" |
| "\xE8\x97\xA4\xE3\x83\xA8\xE3\x83\xAD\xE3\x82\xB7\xE3\x82\xAF" |
| "\x61\x62\x63"; |
| string output; |
| Util::KatakanaToHiragana(input, &output); |
| // "ぐーぐる工藤よろしくabc" |
| EXPECT_EQ("\xE3\x81\x90\xE3\x83\xBC\xE3\x81\x90\xE3\x82\x8B" |
| "\xE5\xB7\xA5\xE8\x97\xA4\xE3\x82\x88\xE3\x82\x8D" |
| "\xE3\x81\x97\xE3\x81\x8F\x61\x62\x63", |
| output); |
| } |
| } |
| |
| TEST(UtilTest, RomanjiToHiragana) { |
| struct { |
| const char *input; |
| const char *expected; |
| } kTestCases[] = { |
| {"watasinonamaehatakahashinoriyukidesu", |
| // "わたしのなまえはたかはしのりゆきです" |
| "\xE3\x82\x8F\xE3\x81\x9F\xE3\x81\x97\xE3\x81\xAE\xE3\x81\xAA" |
| "\xE3\x81\xBE\xE3\x81\x88\xE3\x81\xAF\xE3\x81\x9F\xE3\x81\x8B" |
| "\xE3\x81\xAF\xE3\x81\x97\xE3\x81\xAE\xE3\x82\x8A\xE3\x82\x86" |
| "\xE3\x81\x8D\xE3\x81\xA7\xE3\x81\x99"}, |
| {"majissukamajiyabexe", |
| // "まじっすかまじやべぇ" |
| "\xE3\x81\xBE\xE3\x81\x98\xE3\x81\xA3\xE3\x81\x99\xE3\x81\x8B" |
| "\xE3\x81\xBE\xE3\x81\x98\xE3\x82\x84\xE3\x81\xB9\xE3\x81\x87"}, |
| {"kk", |
| // "っk" |
| "\xE3\x81\xA3\x6B"}, |
| {"xyz", "xyz"}, |
| }; |
| for (size_t i = 0; i < arraysize(kTestCases); ++i) { |
| string actual; |
| Util::RomanjiToHiragana(kTestCases[i].input, &actual); |
| EXPECT_EQ(kTestCases[i].expected, actual); |
| } |
| } |
| |
| TEST(UtilTest, NormalizeVoicedSoundMark) { |
| // "僕のう゛ぁいおりん" |
| const string input = "\xe5\x83\x95\xe3\x81\xae\xe3\x81\x86\xe3\x82\x9b\xe3" |
| "\x81\x81\xe3\x81\x84\xe3\x81\x8a\xe3\x82\x8a\xe3\x82" |
| "\x93"; |
| string output; |
| Util::NormalizeVoicedSoundMark(input, &output); |
| // "僕のゔぁいおりん" |
| EXPECT_EQ("\xe5\x83\x95\xe3\x81\xae\xe3\x82\x94\xe3\x81\x81\xe3\x81\x84\xe3" |
| "\x81\x8a\xe3\x82\x8a\xe3\x82\x93", output); |
| } |
| |
| TEST(UtilTest, IsFullWidthSymbolInHalfWidthKatakana) { |
| // "グーグル" |
| EXPECT_FALSE(Util::IsFullWidthSymbolInHalfWidthKatakana("\xe3\x82\xb0\xe3\x83" |
| "\xbc\xe3\x82\xb0\xe3" |
| "\x83\xab")); |
| // "ー" |
| EXPECT_TRUE(Util::IsFullWidthSymbolInHalfWidthKatakana("\xe3\x83\xbc")); |
| // "。" |
| EXPECT_TRUE(Util::IsFullWidthSymbolInHalfWidthKatakana("\xe3\x80\x82")); |
| // "グーグル。" |
| EXPECT_FALSE(Util::IsFullWidthSymbolInHalfWidthKatakana("\xe3\x82\xb0\xe3\x83" |
| "\xbc\xe3\x82\xb0\xe3" |
| "\x83\xab\xe3\x80" |
| "\x82")); |
| // "ー。" |
| EXPECT_TRUE(Util::IsFullWidthSymbolInHalfWidthKatakana( |
| "\xe3\x83\xbc\xe3\x80\x82")); |
| // "ーグ。" |
| EXPECT_FALSE(Util::IsFullWidthSymbolInHalfWidthKatakana( |
| "\xe3\x83\xbc\xe3\x82\xb0\xe3\x80\x82")); |
| } |
| |
| TEST(UtilTest, IsHalfWidthKatakanaSymbol) { |
| // "グーグル" |
| EXPECT_FALSE(Util::IsHalfWidthKatakanaSymbol("\xef\xbd\xb8\xef\xbe\x9e\xef" |
| "\xbd\xb0\xef\xbd\xb8\xef\xbe" |
| "\x9e\xef\xbe\x99")); |
| // "ー" |
| EXPECT_TRUE(Util::IsHalfWidthKatakanaSymbol("\xef\xbd\xb0")); |
| // "。" |
| EXPECT_TRUE(Util::IsHalfWidthKatakanaSymbol("\xef\xbd\xa1")); |
| // "、" |
| EXPECT_TRUE(Util::IsHalfWidthKatakanaSymbol("\xef\xbd\xa4")); |
| // "グーグル。" |
| EXPECT_FALSE(Util::IsHalfWidthKatakanaSymbol("\xe3\x82\xb0\xe3\x83\xbc\xe3" |
| "\x82\xb0\xe3\x83\xab\xef\xbd" |
| "\xa1")); |
| // "、。" |
| // "not 。、" |
| EXPECT_TRUE(Util::IsHalfWidthKatakanaSymbol("\xef\xbd\xa4\xef\xbd\xa1")); |
| } |
| |
| TEST(UtilTest, FullWidthAndHalfWidth) { |
| string output; |
| |
| Util::FullWidthToHalfWidth("", &output); |
| CHECK_EQ("", output); |
| |
| Util::HalfWidthToFullWidth("", &output); |
| CHECK_EQ("", output); |
| |
| Util::HalfWidthToFullWidth("abc[]?.", &output); |
| // "abc[]?." |
| CHECK_EQ("\xef\xbd\x81\xef\xbd\x82\xef\xbd\x83\xef\xbc\xbb\xef\xbc\xbd\xef" |
| "\xbc\x9f\xef\xbc\x8e", output); |
| |
| // "インターネット「」" |
| Util::HalfWidthToFullWidth("\xef\xbd\xb2\xef\xbe\x9d\xef\xbe\x80\xef\xbd\xb0" |
| "\xef\xbe\x88\xef\xbd\xaf\xef\xbe\x84\xef\xbd\xa2" |
| "\xe3\x80\x8d", &output); |
| // "インターネット「」" |
| CHECK_EQ("\xe3\x82\xa4\xe3\x83\xb3\xe3\x82\xbf\xe3\x83\xbc\xe3\x83\x8d\xe3" |
| "\x83\x83\xe3\x83\x88\xe3\x80\x8c\xe3\x80\x8d", output); |
| |
| // "インターネットグーグル" |
| Util::HalfWidthToFullWidth("\xef\xbd\xb2\xef\xbe\x9d\xef\xbe\x80\xef\xbd\xb0" |
| "\xef\xbe\x88\xef\xbd\xaf\xef\xbe\x84\xe3\x82\xb0" |
| "\xe3\x83\xbc\xe3\x82\xb0\xe3\x83\xab", &output); |
| // "インターネットグーグル" |
| CHECK_EQ("\xe3\x82\xa4\xe3\x83\xb3\xe3\x82\xbf\xe3\x83\xbc\xe3\x83\x8d\xe3" |
| "\x83\x83\xe3\x83\x88\xe3\x82\xb0\xe3\x83\xbc\xe3\x82\xb0\xe3\x83" |
| "\xab", output); |
| |
| // "abc[]?." |
| Util::FullWidthToHalfWidth("\xef\xbd\x81\xef\xbd\x82\xef\xbd\x83\xef\xbc\xbb" |
| "\xef\xbc\xbd\xef\xbc\x9f\xef\xbc\x8e", &output); |
| CHECK_EQ("abc[]?.", output); |
| |
| // "インターネット" |
| Util::FullWidthToHalfWidth("\xe3\x82\xa4\xe3\x83\xb3\xe3\x82\xbf\xe3\x83\xbc" |
| "\xe3\x83\x8d\xe3\x83\x83\xe3\x83\x88", &output); |
| // "インターネット" |
| CHECK_EQ("\xef\xbd\xb2\xef\xbe\x9d\xef\xbe\x80\xef\xbd\xb0\xef\xbe\x88\xef" |
| "\xbd\xaf\xef\xbe\x84", output); |
| |
| // "インターネットグーグル" |
| Util::FullWidthToHalfWidth("\xef\xbd\xb2\xef\xbe\x9d\xef\xbe\x80\xef\xbd\xb0" |
| "\xef\xbe\x88\xef\xbd\xaf\xef\xbe\x84\xe3\x82\xb0" |
| "\xe3\x83\xbc\xe3\x82\xb0\xe3\x83\xab", &output); |
| // "インターネットグーグル" |
| CHECK_EQ("\xef\xbd\xb2\xef\xbe\x9d\xef\xbe\x80\xef\xbd\xb0\xef\xbe\x88\xef" |
| "\xbd\xaf\xef\xbe\x84\xef\xbd\xb8\xef\xbe\x9e\xef\xbd\xb0\xef\xbd" |
| "\xb8\xef\xbe\x9e\xef\xbe\x99", output); |
| |
| // spaces |
| // " " |
| Util::FullWidthToHalfWidth("\x20\xe3\x80\x80", &output); |
| CHECK_EQ(" ", output); |
| |
| // " " |
| Util::HalfWidthToFullWidth("\x20\xe3\x80\x80", &output); |
| // " " |
| CHECK_EQ("\xe3\x80\x80\xe3\x80\x80", output); |
| |
| // spaces are treated as Ascii here |
| // " " |
| Util::FullWidthAsciiToHalfWidthAscii("\x20\xe3\x80\x80", &output); |
| CHECK_EQ(" ", output); |
| |
| // " " |
| Util::HalfWidthAsciiToFullWidthAscii("\x20\xe3\x80\x80", &output); |
| // " " |
| CHECK_EQ("\xe3\x80\x80\xe3\x80\x80", output); |
| |
| // " " |
| Util::FullWidthKatakanaToHalfWidthKatakana("\x20\xe3\x80\x80", &output); |
| // " " |
| CHECK_EQ("\x20\xe3\x80\x80", output); |
| |
| // " " |
| Util::HalfWidthKatakanaToFullWidthKatakana("\x20\xe3\x80\x80", &output); |
| // " " |
| CHECK_EQ("\x20\xe3\x80\x80", output); |
| } |
| |
| TEST(UtilTest, BracketTest) { |
| static const struct BracketType { |
| const char *open_bracket; |
| const char *close_bracket; |
| } kBracketType[] = { |
| // { "(", ")" }, |
| // { "〔", "〕" }, |
| // { "[", "]" }, |
| // { "{", "}" }, |
| // { "〈", "〉" }, |
| // { "《", "》" }, |
| // { "「", "」" }, |
| // { "『", "』" }, |
| // { "【", "】" }, |
| // { "〘", "〙" }, |
| // { "〚", "〛" }, |
| { "\xEF\xBC\x88", "\xEF\xBC\x89" }, |
| { "\xE3\x80\x94", "\xE3\x80\x95" }, |
| { "\xEF\xBC\xBB", "\xEF\xBC\xBD" }, |
| { "\xEF\xBD\x9B", "\xEF\xBD\x9D" }, |
| { "\xE3\x80\x88", "\xE3\x80\x89" }, |
| { "\xE3\x80\x8A", "\xE3\x80\x8B" }, |
| { "\xE3\x80\x8C", "\xE3\x80\x8D" }, |
| { "\xE3\x80\x8E", "\xE3\x80\x8F" }, |
| { "\xE3\x80\x90", "\xE3\x80\x91" }, |
| { "\xe3\x80\x98", "\xe3\x80\x99" }, |
| { "\xe3\x80\x9a", "\xe3\x80\x9b" }, |
| { NULL, NULL }, // sentinel |
| }; |
| |
| string pair; |
| for (size_t i = 0; |
| (kBracketType[i].open_bracket != NULL || |
| kBracketType[i].close_bracket != NULL); |
| ++i) { |
| EXPECT_TRUE(Util::IsOpenBracket(kBracketType[i].open_bracket, &pair)); |
| EXPECT_EQ(kBracketType[i].close_bracket, pair); |
| EXPECT_TRUE(Util::IsCloseBracket(kBracketType[i].close_bracket, &pair)); |
| EXPECT_EQ(kBracketType[i].open_bracket, pair); |
| EXPECT_FALSE(Util::IsOpenBracket(kBracketType[i].close_bracket, &pair)); |
| EXPECT_FALSE(Util::IsCloseBracket(kBracketType[i].open_bracket, &pair)); |
| } |
| } |
| |
| TEST(UtilTest, IsEnglishTransliteration) { |
| EXPECT_TRUE(Util::IsEnglishTransliteration("ABC")); |
| EXPECT_TRUE(Util::IsEnglishTransliteration("Google")); |
| EXPECT_TRUE(Util::IsEnglishTransliteration("Google Map")); |
| EXPECT_TRUE(Util::IsEnglishTransliteration("ABC-DEF")); |
| EXPECT_TRUE(Util::IsEnglishTransliteration("Foo-bar")); |
| EXPECT_TRUE(Util::IsEnglishTransliteration("Foo!")); |
| EXPECT_TRUE(Util::IsEnglishTransliteration("Who's")); |
| EXPECT_TRUE(Util::IsEnglishTransliteration("!")); |
| EXPECT_TRUE(Util::IsEnglishTransliteration(" ")); |
| // EXPECT_FALSE(Util::IsEnglishTransliteration("てすと")); |
| // EXPECT_FALSE(Util::IsEnglishTransliteration("テスト")); |
| // EXPECT_FALSE(Util::IsEnglishTransliteration("東京")); |
| EXPECT_FALSE(Util::IsEnglishTransliteration( |
| "\xE3\x81\xA6\xE3\x81\x99\xE3\x81\xA8")); |
| EXPECT_FALSE(Util::IsEnglishTransliteration( |
| "\xE3\x83\x86\xE3\x82\xB9\xE3\x83\x88")); |
| EXPECT_FALSE(Util::IsEnglishTransliteration( |
| "\xE6\x9D\xB1\xE4\xBA\xAC")); |
| } |
| |
| TEST(MutexTest, MutexTest) { |
| mozc::Mutex mutex; |
| mozc::scoped_lock l(&mutex); |
| } |
| |
| TEST(ThreadTest, ThreadTest) { |
| ThreadTest test; |
| // test.SetJoinable(true); |
| // test.Join(); |
| } |
| |
| TEST(UtilTest, ChopReturns) { |
| string line = "line\n"; |
| EXPECT_TRUE(Util::ChopReturns(&line)); |
| EXPECT_EQ("line", line); |
| |
| line = "line\r"; |
| EXPECT_TRUE(Util::ChopReturns(&line)); |
| EXPECT_EQ("line", line); |
| |
| line = "line\r\n"; |
| EXPECT_TRUE(Util::ChopReturns(&line)); |
| EXPECT_EQ("line", line); |
| |
| line = "line"; |
| EXPECT_FALSE(Util::ChopReturns(&line)); |
| EXPECT_EQ("line", line); |
| |
| line = "line1\nline2\n"; |
| EXPECT_TRUE(Util::ChopReturns(&line)); |
| EXPECT_EQ("line1\nline2", line); |
| |
| line = "line\n\n\n"; |
| EXPECT_TRUE(Util::ChopReturns(&line)); |
| EXPECT_EQ("line", line); |
| } |
| |
| // 2020-12-23 13:24:35 (Wed) UTC |
| // 123456 [usec] |
| const uint64 kTestSeconds = 1608729875uLL; |
| const uint32 kTestMicroSeconds = 123456u; |
| |
| // time utility test with mock clock |
| TEST(UtilTest, TimeTestWithMock) { |
| scoped_ptr<ClockMock> mock_clock( |
| new ClockMock(kTestSeconds, kTestMicroSeconds)); |
| Util::SetClockHandler(mock_clock.get()); |
| |
| // GetTime, |
| { |
| EXPECT_EQ(kTestSeconds, Util::GetTime()); |
| } |
| |
| // GetTimeOfDay |
| { |
| uint64 current_sec; |
| uint32 current_usec; |
| Util::GetTimeOfDay(¤t_sec, ¤t_usec); |
| EXPECT_EQ(kTestSeconds, current_sec); |
| EXPECT_EQ(kTestMicroSeconds, current_usec); |
| } |
| |
| // GetCurrentTm |
| // 2020-12-23 13:24:35 (Wed) |
| { |
| tm current_tm; |
| Util::GetCurrentTm(¤t_tm); |
| EXPECT_EQ(120, current_tm.tm_year); |
| EXPECT_EQ(11, current_tm.tm_mon); |
| EXPECT_EQ(23, current_tm.tm_mday); |
| EXPECT_EQ(13, current_tm.tm_hour); |
| EXPECT_EQ(24, current_tm.tm_min); |
| EXPECT_EQ(35, current_tm.tm_sec); |
| EXPECT_EQ(3, current_tm.tm_wday); |
| } |
| |
| // GetTmWithoutOffsetSecond |
| // 2024/02/23 23:11:15 (Fri) |
| { |
| const int offset_seconds = 100000000; |
| tm offset_tm; |
| Util::GetTmWithOffsetSecond(&offset_tm, offset_seconds); |
| EXPECT_EQ(124, offset_tm.tm_year); |
| EXPECT_EQ(1, offset_tm.tm_mon); |
| EXPECT_EQ(23, offset_tm.tm_mday); |
| EXPECT_EQ(23, offset_tm.tm_hour); |
| EXPECT_EQ(11, offset_tm.tm_min); |
| EXPECT_EQ(15, offset_tm.tm_sec); |
| EXPECT_EQ(5, offset_tm.tm_wday); |
| } |
| |
| // GetFrequency / GetTicks |
| { |
| const uint64 kFrequency = 12345; |
| const uint64 kTicks = 54321; |
| mock_clock->SetFrequency(kFrequency); |
| EXPECT_EQ(kFrequency, Util::GetFrequency()); |
| mock_clock->SetTicks(kTicks); |
| EXPECT_EQ(kTicks, Util::GetTicks()); |
| } |
| |
| // unset clock handler |
| Util::SetClockHandler(NULL); |
| |
| // GetFrequency / GetTicks without ClockMock |
| { |
| EXPECT_NE(0, Util::GetFrequency()); |
| EXPECT_NE(0, Util::GetTicks()); |
| } |
| } |
| |
| // time utility test without mock clock |
| TEST(UtilTest, TimeTestWithoutMock) { |
| uint64 get_time_of_day_sec, get_time_sec; |
| uint32 get_time_of_day_usec; |
| |
| Util::GetTimeOfDay(&get_time_of_day_sec, &get_time_of_day_usec); |
| get_time_sec = Util::GetTime(); |
| |
| // hmm, unstable test. |
| const int margin = 1; |
| EXPECT_NEAR(get_time_of_day_sec, get_time_sec, margin) |
| << ": This test have possibilities to fail " |
| << "when system is busy and slow."; |
| } |
| |
| TEST(UtilTest, EncodeURI) { |
| string encoded; |
| // "もずく" |
| Util::EncodeURI("\xe3\x82\x82\xe3\x81\x9a\xe3\x81\x8f", &encoded); |
| EXPECT_EQ("%E3%82%82%E3%81%9A%E3%81%8F", encoded); |
| |
| encoded.clear(); |
| Util::EncodeURI("mozc", &encoded); |
| EXPECT_EQ("mozc", encoded); |
| |
| encoded.clear(); |
| Util::EncodeURI("http://mozc/?q=Hello World", &encoded); |
| EXPECT_EQ("http%3A%2F%2Fmozc%2F%3Fq%3DHello%20World", encoded); |
| } |
| |
| TEST(UtilTest, DecodeURI) { |
| string decoded; |
| Util::DecodeURI("%E3%82%82%E3%81%9A%E3%81%8F", &decoded); |
| // "もずく" |
| EXPECT_EQ("\xe3\x82\x82\xe3\x81\x9a\xe3\x81\x8f", decoded); |
| |
| decoded.clear(); |
| Util::DecodeURI("mozc", &decoded); |
| EXPECT_EQ("mozc", decoded); |
| |
| decoded.clear(); |
| Util::DecodeURI("http%3A%2F%2Fmozc%2F%3Fq%3DHello+World", &decoded); |
| EXPECT_EQ("http://mozc/?q=Hello World", decoded); |
| } |
| |
| TEST(UtilTest, AppendCGIParams) { |
| vector<pair<string, string> > params; |
| string url; |
| Util::AppendCGIParams(params, &url); |
| EXPECT_TRUE(url.empty()); |
| |
| params.push_back(make_pair("foo", "b a+r")); |
| url = "http://mozc.com?"; |
| Util::AppendCGIParams(params, &url); |
| EXPECT_EQ("http://mozc.com?foo=b%20a%2Br", url); |
| |
| params.push_back(make_pair("buzz", "mozc")); |
| url.clear(); |
| Util::AppendCGIParams(params, &url); |
| EXPECT_EQ("foo=b%20a%2Br&buzz=mozc", url); |
| } |
| |
| TEST(UtilTest, Escape) { |
| string escaped; |
| // "らむだ" |
| Util::Escape("\xe3\x82\x89\xe3\x82\x80\xe3\x81\xa0", &escaped); |
| EXPECT_EQ("\\xE3\\x82\\x89\\xE3\\x82\\x80\\xE3\\x81\\xA0", escaped); |
| } |
| |
| TEST(UtilTest, EscapeUrl) { |
| string escaped; |
| // "らむだ" |
| Util::EscapeUrl("\xe3\x82\x89\xe3\x82\x80\xe3\x81\xa0", &escaped); |
| EXPECT_EQ("%E3%82%89%E3%82%80%E3%81%A0", escaped); |
| EXPECT_EQ("%E3%82%89%E3%82%80%E3%81%A0", |
| Util::EscapeUrl("\xe3\x82\x89\xe3\x82\x80\xe3\x81\xa0")); |
| } |
| |
| TEST(UtilTest, EscapeHtml) { |
| string escaped; |
| Util::EscapeHtml("<>&'\"abc", &escaped); |
| EXPECT_EQ("<>&'"abc", escaped); |
| } |
| |
| TEST(UtilTest, EscapeCss) { |
| string escaped; |
| Util::EscapeCss("<>&'\"abc", &escaped); |
| EXPECT_EQ("<>&'\"abc", escaped); |
| } |
| |
| TEST(UtilTest, ScriptType) { |
| // "くどう" |
| EXPECT_TRUE(Util::IsScriptType("\xe3\x81\x8f\xe3\x81\xa9\xe3\x81\x86", |
| Util::HIRAGANA)); |
| // "京都" |
| EXPECT_TRUE(Util::IsScriptType("\xe4\xba\xac\xe9\x83\xbd", Util::KANJI)); |
| // "人々" (b/4201140) |
| EXPECT_TRUE(Util::IsScriptType("\xE4\xBA\xBA\xE3\x80\x85", Util::KANJI)); |
| // "モズク" |
| EXPECT_TRUE(Util::IsScriptType("\xe3\x83\xa2\xe3\x82\xba\xe3\x82\xaf", |
| Util::KATAKANA)); |
| // "モズクモズク" |
| EXPECT_TRUE(Util::IsScriptType("\xe3\x83\xa2\xe3\x82\xba\xe3\x82\xaf\xef\xbe" |
| "\x93\xef\xbd\xbd\xef\xbe\x9e\xef\xbd\xb8", |
| Util::KATAKANA)); |
| // "ぐーぐる" |
| EXPECT_TRUE(Util::IsScriptType("\xe3\x81\x90\xe3\x83\xbc\xe3\x81\x90\xe3\x82" |
| "\x8b", Util::HIRAGANA)); |
| // "グーグル" |
| EXPECT_TRUE(Util::IsScriptType("\xe3\x82\xb0\xe3\x83\xbc\xe3\x82\xb0\xe3\x83" |
| "\xab", Util::KATAKANA)); |
| // "ゟ" U+309F: HIRAGANA DIGRAPH YORI |
| EXPECT_TRUE(Util::IsScriptType("\xE3\x82\x9F", Util::HIRAGANA)); |
| // "ヿ" U+30FF: KATAKANA DIGRAPH KOTO |
| EXPECT_TRUE(Util::IsScriptType("\xE3\x83\xBF", Util::KATAKANA)); |
| // "ヷヸヹヺㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ" |
| EXPECT_TRUE(Util::IsScriptType( |
| "\xE3\x83\xB7\xE3\x83\xB8\xE3\x83\xB9\xE3\x83\xBA\xE3\x87\xB0" |
| "\xE3\x87\xB1\xE3\x87\xB2\xE3\x87\xB3\xE3\x87\xB4\xE3\x87\xB5" |
| "\xE3\x87\xB6\xE3\x87\xB7\xE3\x87\xB8\xE3\x87\xB9\xE3\x87\xBA" |
| "\xE3\x87\xBB\xE3\x87\xBC\xE3\x87\xBD\xE3\x87\xBE\xE3\x87\xBF", |
| Util::KATAKANA)); |
| // "𛀀" U+1B000: KATAKANA LETTER ARCHAIC E |
| EXPECT_TRUE(Util::IsScriptType("\xF0\x9B\x80\x80", Util::KATAKANA)); |
| // "𛀁" U+1B001: HIRAGANA LETTER ARCHAIC YE |
| EXPECT_TRUE(Util::IsScriptType("\xF0\x9B\x80\x81", Util::HIRAGANA)); |
| |
| EXPECT_TRUE(Util::IsScriptType("012", Util::NUMBER)); |
| // "012012" |
| EXPECT_TRUE(Util::IsScriptType("\xef\xbc\x90\xef\xbc\x91\xef\xbc\x92\x30\x31" |
| "\x32", Util::NUMBER)); |
| EXPECT_TRUE(Util::IsScriptType("abcABC", Util::ALPHABET)); |
| // "ABCD" |
| EXPECT_TRUE(Util::IsScriptType("\xef\xbc\xa1\xef\xbc\xa2\xef\xbc\xa3\xef\xbc" |
| "\xa4", Util::ALPHABET)); |
| EXPECT_TRUE(Util::IsScriptType("@!#", Util::UNKNOWN_SCRIPT)); |
| |
| // "くどカう" |
| EXPECT_FALSE(Util::IsScriptType("\xe3\x81\x8f\xe3\x81\xa9\xe3\x82\xab\xe3\x81" |
| "\x86", Util::HIRAGANA)); |
| // "京あ都" |
| EXPECT_FALSE(Util::IsScriptType("\xe4\xba\xac\xe3\x81\x82\xe9\x83\xbd", |
| Util::KANJI)); |
| // "モズあク" |
| EXPECT_FALSE(Util::IsScriptType("\xe3\x83\xa2\xe3\x82\xba\xe3\x81\x82\xe3\x82" |
| "\xaf", Util::KATAKANA)); |
| // "モあズクモズク" |
| EXPECT_FALSE(Util::IsScriptType("\xe3\x83\xa2\xe3\x81\x82\xe3\x82\xba\xe3\x82" |
| "\xaf\xef\xbe\x93\xef\xbd\xbd\xef\xbe\x9e\xef" |
| "\xbd\xb8", Util::KATAKANA)); |
| // "012あ" |
| EXPECT_FALSE(Util::IsScriptType("\x30\x31\x32\xe3\x81\x82", Util::NUMBER)); |
| // "012あ012" |
| EXPECT_FALSE(Util::IsScriptType("\xef\xbc\x90\xef\xbc\x91\xef\xbc\x92\xe3\x81" |
| "\x82\x30\x31\x32", Util::NUMBER)); |
| // "abcABあC" |
| EXPECT_FALSE(Util::IsScriptType("\x61\x62\x63\x41\x42\xe3\x81\x82\x43", |
| Util::ALPHABET)); |
| // "ABあCD" |
| EXPECT_FALSE(Util::IsScriptType("\xef\xbc\xa1\xef\xbc\xa2\xe3\x81\x82\xef\xbc" |
| "\xa3\xef\xbc\xa4", Util::ALPHABET)); |
| // "ぐーぐるグ" |
| EXPECT_FALSE(Util::IsScriptType("\xe3\x81\x90\xe3\x83\xbc\xe3\x81\x90\xe3\x82" |
| "\x8b\xe3\x82\xb0", Util::HIRAGANA)); |
| // "グーグルぐ" |
| EXPECT_FALSE(Util::IsScriptType("\xe3\x82\xb0\xe3\x83\xbc\xe3\x82\xb0\xe3\x83" |
| "\xab\xe3\x81\x90", Util::KATAKANA)); |
| |
| // "グーグルsuggest" |
| EXPECT_TRUE(Util::ContainsScriptType("\xe3\x82\xb0\xe3\x83\xbc\xe3\x82\xb0" |
| "\xe3\x83\xab\x73\x75\x67\x67\x65\x73" |
| "\x74", Util::ALPHABET)); |
| // "グーグルサジェスト" |
| EXPECT_FALSE(Util::ContainsScriptType("\xe3\x82\xb0\xe3\x83\xbc\xe3\x82\xb0" |
| "\xe3\x83\xab\xe3\x82\xb5\xe3\x82\xb8" |
| "\xe3\x82\xa7\xe3\x82\xb9\xe3\x83\x88", |
| Util::ALPHABET)); |
| |
| // "くどう" |
| EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("\xe3\x81\x8f\xe3\x81\xa9\xe3" |
| "\x81\x86")); |
| // "京都" |
| EXPECT_EQ(Util::KANJI, Util::GetScriptType("\xe4\xba\xac\xe9\x83\xbd")); |
| // "人々" (b/4201140) |
| EXPECT_EQ(Util::KANJI, Util::GetScriptType("\xE4\xBA\xBA\xE3\x80\x85")); |
| // "モズク" |
| EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("\xe3\x83\xa2\xe3\x82\xba\xe3" |
| "\x82\xaf")); |
| // "モズクモズク" |
| EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("\xe3\x83\xa2\xe3\x82\xba\xe3" |
| "\x82\xaf\xef\xbe\x93\xef\xbd" |
| "\xbd\xef\xbe\x9e\xef\xbd" |
| "\xb8")); |
| // "ぐーぐる" |
| EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("\xe3\x81\x90\xe3\x83\xbc\xe3" |
| "\x81\x90\xe3\x82\x8b")); |
| EXPECT_EQ(Util::HIRAGANA, |
| Util::GetFirstScriptType("\xe3\x81\x90\xe3\x83\xbc\xe3\x81\x90" |
| "\xe3\x82\x8b")); |
| |
| // "グーグル" |
| EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("\xe3\x82\xb0\xe3\x83\xbc\xe3" |
| "\x82\xb0\xe3\x83\xab")); |
| EXPECT_EQ(Util::KATAKANA, |
| Util::GetFirstScriptType("\xe3\x82\xb0\xe3\x83\xbc\xe3\x82\xb0" |
| "\xe3\x83\xab")); |
| // "ゟ" U+309F HIRAGANA DIGRAPH YORI |
| EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("\xE3\x82\x9F")); |
| EXPECT_EQ(Util::HIRAGANA, Util::GetFirstScriptType("\xE3\x82\x9F")); |
| |
| // "ヿ" U+30FF KATAKANA DIGRAPH KOTO |
| EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("\xE3\x83\xBF")); |
| // "ヷヸヹヺㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ" |
| EXPECT_EQ(Util::KATAKANA, Util::GetScriptType( |
| "\xE3\x83\xB7\xE3\x83\xB8\xE3\x83\xB9\xE3\x83\xBA\xE3\x87\xB0" |
| "\xE3\x87\xB1\xE3\x87\xB2\xE3\x87\xB3\xE3\x87\xB4\xE3\x87\xB5" |
| "\xE3\x87\xB6\xE3\x87\xB7\xE3\x87\xB8\xE3\x87\xB9\xE3\x87\xBA" |
| "\xE3\x87\xBB\xE3\x87\xBC\xE3\x87\xBD\xE3\x87\xBE\xE3\x87\xBF")); |
| // "𛀀" U+1B000 KATAKANA LETTER ARCHAIC E |
| EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("\xF0\x9B\x80\x80")); |
| // "𛀁" U+1B001 HIRAGANA LETTER ARCHAIC YE |
| EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("\xF0\x9B\x80\x81")); |
| |
| // "!グーグル" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("\x21\xe3\x82\xb0\xe3\x83" |
| "\xbc\xe3\x82\xb0\xe3\x83" |
| "\xab")); |
| // "ー" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("\xe3\x83\xbc")); |
| EXPECT_EQ(Util::KATAKANA, Util::GetFirstScriptType("\xe3\x83\xbc")); |
| // "ーー" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("\xe3\x83\xbc\xe3\x83" |
| "\xbc")); |
| EXPECT_EQ(Util::KATAKANA, Util::GetFirstScriptType("\xe3\x83\xbc\xe3" |
| "\x83\xbc")); |
| // "゛" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("\xe3\x82\x9b")); |
| // "゜" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("\xe3\x82\x9c")); |
| |
| EXPECT_EQ(Util::NUMBER, Util::GetScriptType("012")); |
| // "012012" |
| EXPECT_EQ(Util::NUMBER, Util::GetScriptType("\xef\xbc\x90\xef\xbc\x91\xef\xbc" |
| "\x92\x30\x31\x32")); |
| EXPECT_EQ(Util::ALPHABET, Util::GetScriptType("abcABC")); |
| // "ABCD" |
| EXPECT_EQ(Util::ALPHABET, Util::GetScriptType("\xef\xbc\xa1\xef\xbc\xa2\xef" |
| "\xbc\xa3\xef\xbc\xa4")); |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("@!#")); |
| // "@!#" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("\xef\xbc\xa0\xef\xbc\x81" |
| "\xef\xbc\x83")); |
| |
| // "ーひらがな" |
| EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("\xe3\x83\xbc\xe3\x81\xb2\xe3" |
| "\x82\x89\xe3\x81\x8c\xe3\x81" |
| "\xaa")); |
| EXPECT_EQ(Util::KATAKANA, Util::GetFirstScriptType("\xe3\x83\xbc\xe3\x81\xb2" |
| "\xe3\x82\x89\xe3\x81\x8c" |
| "\xe3\x81\xaa")); |
| // "ーカタカナ" |
| EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("\xe3\x83\xbc\xe3\x82\xab\xe3" |
| "\x82\xbf\xe3\x82\xab\xe3\x83" |
| "\x8a")); |
| // "ーカタカナ" |
| EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("\xef\xbd\xb0\xef\xbd\xb6\xef" |
| "\xbe\x80\xef\xbd\xb6\xef\xbe" |
| "\x85")); |
| // "ひらがなー" |
| EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("\xe3\x81\xb2\xe3\x82\x89\xe3" |
| "\x81\x8c\xe3\x81\xaa\xe3\x83" |
| "\xbc")); |
| // "カタカナー" |
| EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("\xe3\x82\xab\xe3\x82\xbf\xe3" |
| "\x82\xab\xe3\x83\x8a\xe3\x83" |
| "\xbc")); |
| // "カタカナー" |
| EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("\xef\xbd\xb6\xef\xbe\x80\xef" |
| "\xbd\xb6\xef\xbe\x85\xef\xbd" |
| "\xb0")); |
| |
| // "あ゛っ" |
| EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("\xe3\x81\x82\xe3\x82\x9b\xe3" |
| "\x81\xa3")); |
| // "あ゜っ" |
| EXPECT_EQ(Util::HIRAGANA, Util::GetScriptType("\xe3\x81\x82\xe3\x82\x9c\xe3" |
| "\x81\xa3")); |
| // "ア゛ッ" |
| EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("\xe3\x82\xa2\xe3\x82\x9b\xe3" |
| "\x83\x83")); |
| // "ア゜ッ" |
| EXPECT_EQ(Util::KATAKANA, Util::GetScriptType("\xe3\x82\xa2\xe3\x82\x9c\xe3" |
| "\x83\x83")); |
| |
| // "くどカう" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("\xe3\x81\x8f\xe3\x81\xa9" |
| "\xe3\x82\xab\xe3\x81" |
| "\x86")); |
| // "京あ都" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("\xe4\xba\xac\xe3\x81\x82" |
| "\xe9\x83\xbd")); |
| EXPECT_EQ(Util::KANJI, Util::GetFirstScriptType("\xe4\xba\xac\xe3\x81\x82" |
| "\xe9\x83\xbd")); |
| |
| // "モズあク" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("\xe3\x83\xa2\xe3\x82\xba" |
| "\xe3\x81\x82\xe3\x82" |
| "\xaf")); |
| EXPECT_EQ(Util::KATAKANA, Util::GetFirstScriptType("\xe3\x83\xa2\xe3\x82\xba" |
| "\xe3\x81\x82\xe3\x82" |
| "\xaf")); |
| |
| // "モあズクモズク" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("\xe3\x83\xa2\xe3\x81\x82" |
| "\xe3\x82\xba\xe3\x82\xaf" |
| "\xef\xbe\x93\xef\xbd\xbd" |
| "\xef\xbe\x9e\xef\xbd" |
| "\xb8")); |
| // "012あ" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("\x30\x31\x32\xe3\x81" |
| "\x82")); |
| EXPECT_EQ(Util::NUMBER, Util::GetFirstScriptType("\x30\x31\x32\xe3\x81" |
| "\x82")); |
| // "012あ012" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("\xef\xbc\x90\xef\xbc\x91" |
| "\xef\xbc\x92\xe3\x81\x82" |
| "\x30\x31\x32")); |
| EXPECT_EQ(Util::NUMBER, Util::GetFirstScriptType("\xef\xbc\x90\xef\xbc\x91" |
| "\xef\xbc\x92\xe3\x81\x82" |
| "\x30\x31\x32")); |
| // "abcABあC" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("\x61\x62\x63\x41\x42\xe3" |
| "\x81\x82\x43")); |
| // "ABあCD" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("\xef\xbc\xa1\xef\xbc\xa2" |
| "\xe3\x81\x82\xef\xbc\xa3" |
| "\xef\xbc\xa4")); |
| // "ぐーぐるグ" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("\xe3\x81\x90\xe3\x83\xbc" |
| "\xe3\x81\x90\xe3\x82\x8b" |
| "\xe3\x82\xb0")); |
| // "グーグルぐ" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptType("\xe3\x82\xb0\xe3\x83\xbc" |
| "\xe3\x82\xb0\xe3\x83\xab" |
| "\xe3\x81\x90")); |
| |
| // "龦" U+9FA6 |
| EXPECT_EQ(Util::KANJI, Util::GetScriptType("\xE9\xBE\xA6")); |
| // "龻" U+9FBB |
| EXPECT_EQ(Util::KANJI, Util::GetScriptType("\xE9\xBE\xBB")); |
| // U+9FFF is not assigned yet but reserved for CJK Unified Ideographs. |
| EXPECT_EQ(Util::KANJI, Util::GetScriptType("\xE9\xBF\xBF")); |
| // "𠮟咤" U+20B9F U+54A4 |
| EXPECT_EQ(Util::KANJI, Util::GetScriptType("\xF0\xA0\xAE\x9F\xE5\x92\xA4")); |
| // "𠮷野" U+20BB7 U+91CE |
| EXPECT_EQ(Util::KANJI, Util::GetScriptType("\xF0\xA0\xAE\xB7\xE9\x87\x8E")); |
| // "巽" U+2F884 |
| EXPECT_EQ(Util::KANJI, Util::GetScriptType("\xF0\xAF\xA2\x84")); |
| |
| // U+1F466, BOY/smile emoji |
| EXPECT_EQ(Util::EMOJI, Util::GetScriptType("\xF0\x9F\x91\xA6")); |
| } |
| |
| |
| TEST(UtilTest, ScriptTypeWithoutSymbols) { |
| // "くど う" |
| EXPECT_EQ(Util::HIRAGANA, Util::GetScriptTypeWithoutSymbols( |
| "\xe3\x81\x8f\xe3\x81\xa9 \xe3\x81\x86")); |
| // "京 都" |
| EXPECT_EQ(Util::KANJI, Util::GetScriptTypeWithoutSymbols( |
| "\xe4\xba\xac \xe9\x83\xbd")); |
| // "モズ ク" |
| EXPECT_EQ(Util::KATAKANA, Util::GetScriptTypeWithoutSymbols( |
| "\xe3\x83\xa2\xe3\x82\xba\xe3\x82\xaf")); |
| // "モズ クモズク" |
| EXPECT_EQ(Util::KATAKANA, Util::GetScriptTypeWithoutSymbols( |
| "\xe3\x83\xa2\xe3\x82\xba \xe3\x82\xaf\xef\xbe\x93\xef\xbd" |
| "\xbd\xef\xbe\x9e\xef\xbd\xb8")); |
| // "Google Earth" |
| EXPECT_EQ(Util::ALPHABET, Util::GetScriptTypeWithoutSymbols( |
| "Google Earth")); |
| // "Google " |
| EXPECT_EQ(Util::ALPHABET, Util::GetScriptTypeWithoutSymbols( |
| "Google ")); |
| // " Google" |
| EXPECT_EQ(Util::ALPHABET, Util::GetScriptTypeWithoutSymbols( |
| " Google")); |
| // " Google " |
| EXPECT_EQ(Util::ALPHABET, Util::GetScriptTypeWithoutSymbols( |
| " Google ")); |
| // " g" |
| EXPECT_EQ(Util::ALPHABET, Util::GetScriptTypeWithoutSymbols( |
| " g")); |
| // "" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptTypeWithoutSymbols( |
| "")); |
| // " " |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptTypeWithoutSymbols( |
| " ")); |
| // " " |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptTypeWithoutSymbols( |
| " ")); |
| EXPECT_EQ(Util::ALPHABET, Util::GetScriptTypeWithoutSymbols("Hello!")); |
| // "Hello!あ" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptTypeWithoutSymbols( |
| "\x48\x65\x6c\x6c\x6f\x21\xe3\x81\x82")); |
| EXPECT_EQ(Util::ALPHABET, Util::GetScriptTypeWithoutSymbols("CD-ROM")); |
| // "CD-ROMア" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptTypeWithoutSymbols( |
| "\x43\x44\x2d\x52\x4f\x4d\xe3\x82\xa2")); |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptTypeWithoutSymbols("-")); |
| EXPECT_EQ(Util::ALPHABET, Util::GetScriptTypeWithoutSymbols("-A")); |
| EXPECT_EQ(Util::ALPHABET, Util::GetScriptTypeWithoutSymbols("--A")); |
| EXPECT_EQ(Util::ALPHABET, Util::GetScriptTypeWithoutSymbols("--A---")); |
| // "--A-ア-" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptTypeWithoutSymbols( |
| "\x2d\x2d\x41\x2d\xef\xbd\xb1\x2d")); |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptTypeWithoutSymbols("!")); |
| // "・あ" |
| EXPECT_EQ(Util::HIRAGANA, Util::GetScriptTypeWithoutSymbols( |
| "\xe3\x83\xbb\xe3\x81\x82")); |
| // "・・あ" |
| EXPECT_EQ(Util::HIRAGANA, Util::GetScriptTypeWithoutSymbols( |
| "\xe3\x83\xbb\xe3\x83\xbb\xe3\x81\x82")); |
| // "コギト・エルゴ・スム" |
| EXPECT_EQ(Util::KATAKANA, Util::GetScriptTypeWithoutSymbols( |
| "\xe3\x82\xb3\xe3\x82\xae\xe3\x83\x88\xe3\x83\xbb\xe3\x82\xa8" |
| "\xe3\x83\xab\xe3\x82\xb4\xe3\x83\xbb\xe3\x82\xb9\xe3\x83\xa0")); |
| // "コギト・エルゴ・住む" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptTypeWithoutSymbols( |
| "\xe3\x82\xb3\xe3\x82\xae\xe3\x83\x88\xe3\x83\xbb\xe3\x82\xa8" |
| "\xe3\x83\xab\xe3\x82\xb4\xe3\x83\xbb\xe4\xbd\x8f\xe3\x82\x80")); |
| // "人☆名" |
| EXPECT_EQ(Util::KANJI, Util::GetScriptTypeWithoutSymbols( |
| "\xe4\xba\xba\xe2\x98\x86\xe5\x90\x8d")); |
| // "ひとの☆なまえ" |
| EXPECT_EQ(Util::HIRAGANA, Util::GetScriptTypeWithoutSymbols( |
| "\xe3\x81\xb2\xe3\x81\xa8\xe3\x81\xae\xe2\x98\x86\xe3\x81\xaa" |
| "\xe3\x81\xbe\xe3\x81\x88")); |
| // "超☆最高です" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptTypeWithoutSymbols( |
| "\xe8\xb6\x85\xe2\x98\x86\xe6\x9c\x80\xe9\xab\x98\xe3\x81\xa7" |
| "\xe3\x81\x99")); |
| // "・--☆" |
| EXPECT_EQ(Util::UNKNOWN_SCRIPT, Util::GetScriptTypeWithoutSymbols( |
| "\xe3\x83\xbb\x2d\x2d\xe2\x98\x86")); |
| } |
| |
| TEST(UtilTest, FormType) { |
| // "くどう" |
| EXPECT_EQ(Util::FULL_WIDTH, Util::GetFormType("\xe3\x81\x8f\xe3\x81\xa9\xe3" |
| "\x81\x86")); |
| // "京都" |
| EXPECT_EQ(Util::FULL_WIDTH, Util::GetFormType("\xe4\xba\xac\xe9\x83\xbd")); |
| // "モズク" |
| EXPECT_EQ(Util::FULL_WIDTH, Util::GetFormType("\xe3\x83\xa2\xe3\x82\xba\xe3" |
| "\x82\xaf")); |
| // "モズク" |
| EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("\xef\xbe\x93\xef\xbd\xbd\xef" |
| "\xbe\x9e\xef\xbd\xb8")); |
| // "ぐーぐる" |
| EXPECT_EQ(Util::FULL_WIDTH, Util::GetFormType("\xe3\x81\x90\xe3\x83\xbc\xe3" |
| "\x81\x90\xe3\x82\x8b")); |
| // "グーグル" |
| EXPECT_EQ(Util::FULL_WIDTH, Util::GetFormType("\xe3\x82\xb0\xe3\x83\xbc\xe3" |
| "\x82\xb0\xe3\x83\xab")); |
| // "グーグル" |
| EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("\xef\xbd\xb8\xef\xbe\x9e\xef" |
| "\xbd\xb0\xef\xbd\xb8\xef\xbe" |
| "\x9e\xef\xbe\x99")); |
| // "ー" |
| EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("\xef\xbd\xb0")); |
| // "ー" |
| EXPECT_EQ(Util::FULL_WIDTH, Util::GetFormType("\xe3\x83\xbc")); |
| |
| // "¢£¥¦¬¯" |
| EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("\xc2\xa2\xc2\xa3\xc2\xa5" |
| "\xc2\xa6\xc2\xac\xc2\xaf")); |
| |
| // "│←↑→↓■○" |
| EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType( |
| "\xef\xbf\xa8\xef\xbf\xa9\xef\xbf\xaa\xef\xbf\xab\xef\xbf\xac" |
| "\xef\xbf\xad\xef\xbf\xae")); |
| |
| // Half-width mathematical symbols |
| // [U+27E6, U+27ED], U+2985, and U+2986 |
| EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType( |
| "\xe2\x9f\xa6\xe2\x9f\xa7\xe2\x9f\xa8\xe2\x9f\xa9\xe2\x9f\xaa\xe2" |
| "\x9f\xab\xe2\x9f\xac\xe2\x9f\xad\xe2\xa6\x85\xe2\xa6\x86")); |
| |
| // Half-width hangul "ᅠᄀᄁ" |
| EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("\xef\xbe\xa0\xef\xbe\xa1" |
| "\xef\xbe\xa2")); |
| |
| // Half-width won "₩" |
| EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("\xe2\x82\xa9")); |
| |
| EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("012")); |
| // "012012" |
| EXPECT_EQ(Util::UNKNOWN_FORM, Util::GetFormType("\xef\xbc\x90\xef\xbc\x91\xef" |
| "\xbc\x92\x30\x31\x32")); |
| EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("abcABC")); |
| // "ABCD" |
| EXPECT_EQ(Util::FULL_WIDTH, Util::GetFormType("\xef\xbc\xa1\xef\xbc\xa2\xef" |
| "\xbc\xa3\xef\xbc\xa4")); |
| EXPECT_EQ(Util::HALF_WIDTH, Util::GetFormType("@!#")); |
| } |
| |
| // We have a snapshot of the result of |Util::GetCharacterSet(ucs4)| in |
| // data/test/character_set/character_set.tsv. |
| // Compare the result for each character just in case. |
| TEST(UtilTest, CharacterSetFullTest) { |
| map<char32, Util::CharacterSet> test_set; |
| FillTestCharacterSetMap(&test_set); |
| EXPECT_FALSE(test_set.empty()); |
| |
| // Unicode characters consist of [U+0000, U+10FFFF]. |
| for (char32 ucs4 = 0; ucs4 <= 0x10ffff; ++ucs4) { |
| EXPECT_EQ(GetExpectedCharacterSet(test_set, ucs4), |
| Util::GetCharacterSet(ucs4)) |
| << "Character set changed at " << ucs4; |
| } |
| } |
| |
| TEST(UtilTest, CharacterSet_gen_character_set) { |
| // [0x00, 0x7f] are ASCII |
| for (size_t i = 0; i <= 0x7f; ++i) { |
| EXPECT_EQ(Util::ASCII, Util::GetCharacterSet(i)); |
| } |
| // [0x80, 0xff] are not ASCII |
| for (size_t i = 0x80; i <= 0xff; ++i) { |
| EXPECT_NE(Util::ASCII, Util::GetCharacterSet(i)); |
| } |
| |
| // 0213 |
| // "Ⅰ" |
| EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet(0x2160)); |
| // "①" |
| EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet(0x2460)); |
| // "㊤" |
| EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet(0x32A4)); |
| // "ð ®" from UCS4 range (b/4176888) |
| EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet(0x20B9F)); |
| // "ðª²" from UCS4 range (b/4176888) |
| EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet(0x2A6B2)); |
| |
| // only in CP932 |
| // "凬" |
| EXPECT_EQ(Util::CP932, Util::GetCharacterSet(0x51EC)); |
| |
| // only in Unicode |
| // "₩" |
| EXPECT_EQ(Util::UNICODE_ONLY, Util::GetCharacterSet(0xFFE6)); |
| // "ð ®·" from UCS4 range (b/4176888) |
| EXPECT_EQ(Util::UNICODE_ONLY, Util::GetCharacterSet(0x20BB7)); |
| } |
| |
| TEST(UtilTest, CharacterSet) { |
| // "あいうえお" |
| EXPECT_EQ(Util::JISX0208, Util::GetCharacterSet("\xe3\x81\x82\xe3\x81\x84\xe3" |
| "\x81\x86\xe3\x81\x88\xe3\x81" |
| "\x8a")); |
| EXPECT_EQ(Util::ASCII, Util::GetCharacterSet("abc")); |
| // "abcあいう" |
| EXPECT_EQ(Util::JISX0208, Util::GetCharacterSet("\x61\x62\x63\xe3\x81\x82\xe3" |
| "\x81\x84\xe3\x81\x86")); |
| |
| // half width katakana |
| // "カタカナ" |
| EXPECT_EQ(Util::JISX0201, Util::GetCharacterSet("\xef\xbd\xb6\xef\xbe\x80\xef" |
| "\xbd\xb6\xef\xbe\x85")); |
| // "カタカナカタカナ" |
| EXPECT_EQ(Util::JISX0208, Util::GetCharacterSet("\xef\xbd\xb6\xef\xbe\x80\xef" |
| "\xbd\xb6\xef\xbe\x85\xe3\x82" |
| "\xab\xe3\x82\xbf\xe3\x82\xab" |
| "\xe3\x83\x8a")); |
| |
| // 0213 |
| // "Ⅰ" |
| EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet("\xe2\x85\xa0")); |
| // "①" |
| EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet("\xe2\x91\xa0")); |
| // "㊤" |
| EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet("\xe3\x8a\xa4")); |
| // "ð ®" from UCS4 range (b/4176888) |
| EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet("\xF0\xA0\xAE\x9F")); |
| // "ðª²" from UCS4 range (b/4176888) |
| EXPECT_EQ(Util::JISX0213, Util::GetCharacterSet("\xF0\xAA\x9A\xB2")); |
| |
| // only in CP932 |
| // "凬" |
| EXPECT_EQ(Util::CP932, Util::GetCharacterSet("\xe5\x87\xac")); |
| |
| // only in Unicode |
| // "₩" |
| EXPECT_EQ(Util::UNICODE_ONLY, Util::GetCharacterSet("\xef\xbf\xa6")); |
| // "ð ®·" from UCS4 range (b/4176888) |
| EXPECT_EQ(Util::UNICODE_ONLY, Util::GetCharacterSet("\xF0\xA0\xAE\xB7")); |
| } |
| |
| #ifdef OS_WIN |
| TEST(UtilTest, WideCharsLen) { |
| // "að ®b" |
| const string input_utf8 = "a\360\240\256\237b"; |
| EXPECT_EQ(4, Util::WideCharsLen(input_utf8)); |
| EXPECT_EQ(0, Util::WideCharsLen(Util::SubString(input_utf8, 0, 0))); |
| EXPECT_EQ(1, Util::WideCharsLen(Util::SubString(input_utf8, 0, 1))); |
| EXPECT_EQ(3, Util::WideCharsLen(Util::SubString(input_utf8, 0, 2))); |
| EXPECT_EQ(4, Util::WideCharsLen(Util::SubString(input_utf8, 0, 3))); |
| } |
| |
| TEST(UtilTest, UTF8ToWide) { |
| const string input_utf8 = "abc"; |
| wstring output_wide; |
| Util::UTF8ToWide(input_utf8, &output_wide); |
| |
| string output_utf8; |
| Util::WideToUTF8(output_wide, &output_utf8); |
| EXPECT_EQ("abc", output_utf8); |
| } |
| |
| TEST(UtilTest, WideToUTF8_SurrogatePairSupport) { |
| // Visual C++ 2008 does not support embedding surrogate pair in string |
| // literals like L"\uD842\uDF9F". This is why we use wchar_t array instead. |
| // "ð ®" |
| const wchar_t input_wide[] = {0xD842, 0xDF9F, 0}; |
| string output_utf8; |
| Util::WideToUTF8(input_wide, &output_utf8); |
| |
| wstring output_wide; |
| Util::UTF8ToWide(output_utf8, &output_wide); |
| |
| EXPECT_EQ("\360\240\256\237", output_utf8); |
| EXPECT_EQ(input_wide, output_wide); |
| } |
| #endif // OS_WIN |
| |
| TEST(UtilTest, IsKanaSymbolContained) { |
| const string kFullstop("\xe3\x80\x82"); // "。" |
| const string kSpace(" "); |
| EXPECT_TRUE(Util::IsKanaSymbolContained(kFullstop)); |
| EXPECT_TRUE(Util::IsKanaSymbolContained(kSpace + kFullstop)); |
| EXPECT_TRUE(Util::IsKanaSymbolContained(kFullstop + kSpace)); |
| EXPECT_FALSE(Util::IsKanaSymbolContained(kSpace)); |
| EXPECT_FALSE(Util::IsKanaSymbolContained("")); |
| } |
| |
| #ifdef OS_ANDROID |
| // At the moment, encoding is not the target of build for Android. |
| #else |
| TEST(UtilTest, Issue2190350) { |
| string result = ""; |
| // \xE3\x81\x82 == Hiragana a in UTF8 |
| Util::UTF8ToSJIS("\xE3\x81\x82", &result); |
| EXPECT_EQ(2, result.length()); |
| // \x82\xA0 == Hiragana a in Shift-JIS |
| EXPECT_EQ("\x82\xA0", result); |
| |
| result = ""; |
| Util::SJISToUTF8("\x82\xA0", &result); |
| EXPECT_EQ(3, result.length()); |
| EXPECT_EQ("\xE3\x81\x82", result); |
| } |
| |
| #ifdef OS_WIN |
| // The following "ToUTF" tests fail in the windows environment where the target |
| // code pages are not installed. |
| #else |
| TEST(UtilTest, ToUTF8) { |
| string result = ""; |
| Util::ToUTF8("ISO8859-1", "\x61", &result); |
| EXPECT_EQ("a", result); |
| |
| // http://en.wikipedia.org/wiki/ISO/IEC_8859 |
| result.clear(); |
| EXPECT_TRUE(Util::ToUTF8("ISO8859-1", "\xc0", &result)); |
| EXPECT_EQ("\xC3\x80", result) << "ISO8859-1"; |
| result.clear(); |
| EXPECT_TRUE(Util::ToUTF8("ISO8859-2", "\xc0", &result)); |
| EXPECT_EQ("\xC5\x94", result) << "ISO8859-2"; |
| result.clear(); |
| EXPECT_TRUE(Util::ToUTF8("ISO8859-3", "\xc5", &result)); |
| EXPECT_EQ("\xC4\x8A", result) << "ISO8859-3"; |
| result.clear(); |
| EXPECT_TRUE(Util::ToUTF8("ISO8859-4", "\xbb", &result)); |
| EXPECT_EQ("\xC4\xA3", result) << "ISO8859-4"; |
| result.clear(); |
| EXPECT_TRUE(Util::ToUTF8("ISO8859-5", "\xbb", &result)); |
| EXPECT_EQ("\xD0\x9B", result) << "ISO8859-5"; |
| result.clear(); |
| EXPECT_TRUE(Util::ToUTF8("ISO8859-6", "\xbf", &result)); |
| EXPECT_EQ("\xD8\x9F", result) << "ISO8859-6"; |
| result.clear(); |
| EXPECT_TRUE(Util::ToUTF8("ISO8859-7", "\xbf", &result)); |
| EXPECT_EQ("\xCE\x8F", result) << "ISO8859-7"; |
| result.clear(); |
| EXPECT_TRUE(Util::ToUTF8("ISO8859-8", "\xfa", &result)); |
| EXPECT_EQ("\xD7\xAA", result) << "ISO8859-8"; |
| result.clear(); |
| EXPECT_TRUE(Util::ToUTF8("ISO8859-9", "\xbf", &result)); |
| EXPECT_EQ("\xC2\xBF", result) << "ISO8859-9"; |
| result.clear(); |
| EXPECT_TRUE(Util::ToUTF8("ISO8859-13", "\xbf", &result)); |
| EXPECT_EQ("\xC3\xA6", result) << "ISO8859-13"; |
| result.clear(); |
| EXPECT_TRUE(Util::ToUTF8("ISO8859-15", "\xbf", &result)); |
| EXPECT_EQ("\xC2\xBF", result) << "ISO8859-15"; |
| |
| // http://en.wikipedia.org/wiki/KOI8-R |
| result.clear(); |
| EXPECT_TRUE(Util::ToUTF8("KOI8-R", "\xc6", &result)); |
| EXPECT_EQ("\xD1\x84", result) << "KOI8-R"; |
| |
| // http://en.wikipedia.org/wiki/Windows-1251 |
| result.clear(); |
| EXPECT_TRUE(Util::ToUTF8("windows-1251", "\xc6", &result)); |
| EXPECT_EQ("\xD0\x96", result) << "windows-1251"; |
| |
| result.clear(); |
| EXPECT_FALSE(Util::ToUTF8("DUMMY_CODE", "a", &result)); |
| } |
| #endif |
| #endif |
| |
| TEST(UtilTest, Fingerprint32WithSeed_uint32) { |
| const uint32 seed = 0xabcdef; |
| |
| const uint32 num = 0x12345678; // Assumed little endian |
| const uint32 num_hash = Util::Fingerprint32WithSeed(num, seed); |
| |
| const char* str = "\x78\x56\x34\x12"; |
| const uint32 str_hash = Util::Fingerprint32WithSeed(str, 4, seed); |
| |
| EXPECT_EQ(num_hash, str_hash) << num_hash << " != " << str_hash; |
| } |
| |
| TEST(UtilTest, RandomSeedTest) { |
| Util::SetRandomSeed(0); |
| const int first_try = Util::Random(INT_MAX); |
| const int second_try = Util::Random(INT_MAX); |
| EXPECT_NE(first_try, second_try); |
| |
| // Reset the seed. |
| Util::SetRandomSeed(0); |
| EXPECT_EQ(first_try, Util::Random(INT_MAX)); |
| } |
| |
| TEST(UtilTest, SplitFirstChar32) { |
| StringPiece rest; |
| char32 c = 0; |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_FALSE(Util::SplitFirstChar32("", &c, &rest)); |
| EXPECT_EQ(0, c); |
| EXPECT_TRUE(rest.empty()); |
| |
| // Allow NULL to ignore the matched value. |
| rest.clear(); |
| EXPECT_TRUE(Util::SplitFirstChar32("01", NULL, &rest)); |
| EXPECT_EQ("1", rest); |
| |
| // Allow NULL to ignore the matched value. |
| c = 0; |
| EXPECT_TRUE(Util::SplitFirstChar32("01", &c, NULL)); |
| EXPECT_EQ('0', c); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitFirstChar32("\x01 ", &c, &rest)); |
| EXPECT_EQ(1, c); |
| EXPECT_EQ(" ", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitFirstChar32("\x7F ", &c, &rest)); |
| EXPECT_EQ(0x7F, c); |
| EXPECT_EQ(" ", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitFirstChar32("\xC2\x80 ", &c, &rest)); |
| EXPECT_EQ(0x80, c); |
| EXPECT_EQ(" ", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitFirstChar32("\xDF\xBF ", &c, &rest)); |
| EXPECT_EQ(0x7FF, c); |
| EXPECT_EQ(" ", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitFirstChar32("\xE0\xA0\x80 ", &c, &rest)); |
| EXPECT_EQ(0x800, c); |
| EXPECT_EQ(" ", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitFirstChar32("\xEF\xBF\xBF ", &c, &rest)); |
| EXPECT_EQ(0xFFFF, c); |
| EXPECT_EQ(" ", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitFirstChar32("\xF0\x90\x80\x80 ", &c, &rest)); |
| EXPECT_EQ(0x10000, c); |
| EXPECT_EQ(" ", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitFirstChar32("\xF7\xBF\xBF\xBF ", &c, &rest)); |
| EXPECT_EQ(0x1FFFFF, c); |
| EXPECT_EQ(" ", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitFirstChar32("\xF8\x88\x80\x80\x80 ", &c, &rest)); |
| EXPECT_EQ(0x200000, c); |
| EXPECT_EQ(" ", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitFirstChar32("\xFB\xBF\xBF\xBF\xBF ", &c, &rest)); |
| EXPECT_EQ(0x3FFFFFF, c); |
| EXPECT_EQ(" ", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitFirstChar32("\xFC\x84\x80\x80\x80\x80 ", &c, &rest)); |
| EXPECT_EQ(0x4000000, c); |
| EXPECT_EQ(" ", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitFirstChar32("\xFD\xBF\xBF\xBF\xBF\xBF ", &c, &rest)); |
| EXPECT_EQ(0x7FFFFFFF, c); |
| EXPECT_EQ(" ", rest); |
| |
| // If there is any invalid sequence, the entire text should be treated as |
| // am empty string. |
| { |
| c = 0; |
| EXPECT_FALSE(Util::SplitFirstChar32("\xC2 ", &c, &rest)); |
| EXPECT_EQ(0, c); |
| |
| c = 0; |
| EXPECT_FALSE(Util::SplitFirstChar32("\xC2\xC2 ", &c, &rest)); |
| EXPECT_EQ(0, c); |
| |
| c = 0; |
| EXPECT_FALSE(Util::SplitFirstChar32("\xE0 ", &c, &rest)); |
| EXPECT_EQ(0, c); |
| |
| c = 0; |
| EXPECT_FALSE(Util::SplitFirstChar32("\xE0\xE0\xE0 ", &c, &rest)); |
| EXPECT_EQ(0, c); |
| |
| c = 0; |
| EXPECT_FALSE(Util::SplitFirstChar32("\xF0 ", &c, &rest)); |
| EXPECT_EQ(0, c); |
| |
| c = 0; |
| EXPECT_FALSE(Util::SplitFirstChar32("\xF0\xF0\xF0\xF0 ", &c, &rest)); |
| EXPECT_EQ(0, c); |
| } |
| |
| // BOM should be treated as invalid byte. |
| { |
| c = 0; |
| EXPECT_FALSE(Util::SplitFirstChar32("\xFF ", &c, &rest)); |
| EXPECT_EQ(0, c); |
| |
| c = 0; |
| EXPECT_FALSE(Util::SplitFirstChar32("\xFE ", &c, &rest)); |
| EXPECT_EQ(0, c); |
| } |
| |
| // Invalid sequence for U+002F (redundant encoding) |
| { |
| c = 0; |
| EXPECT_FALSE(Util::SplitFirstChar32("\xC0\xAF", &c, &rest)); |
| EXPECT_EQ(0, c); |
| |
| c = 0; |
| EXPECT_FALSE(Util::SplitFirstChar32("\xE0\x80\xAF", &c, &rest)); |
| EXPECT_EQ(0, c); |
| |
| c = 0; |
| EXPECT_FALSE(Util::SplitFirstChar32("\xF0\x80\x80\xAF", &c, &rest)); |
| EXPECT_EQ(0, c); |
| } |
| } |
| |
| TEST(UtilTest, SplitLastChar32) { |
| StringPiece rest; |
| char32 c = 0; |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_FALSE(Util::SplitLastChar32("", &rest, &c)); |
| EXPECT_EQ(0, c); |
| EXPECT_TRUE(rest.empty()); |
| |
| // Allow NULL to ignore the matched value. |
| c = 0; |
| EXPECT_TRUE(Util::SplitLastChar32("01", NULL, &c)); |
| EXPECT_EQ('1', c); |
| |
| // Allow NULL to ignore the matched value. |
| rest.clear(); |
| EXPECT_TRUE(Util::SplitLastChar32("01", &rest, NULL)); |
| EXPECT_EQ("0", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitLastChar32(" \x01", &rest, &c)); |
| EXPECT_EQ(1, c); |
| EXPECT_EQ(" ", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitLastChar32(" \x7F", &rest, &c)); |
| EXPECT_EQ(0x7F, c); |
| EXPECT_EQ(" ", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitLastChar32(" \xC2\x80", &rest, &c)); |
| EXPECT_EQ(0x80, c); |
| EXPECT_EQ(" ", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitLastChar32(" \xDF\xBF", &rest, &c)); |
| EXPECT_EQ(0x7FF, c); |
| EXPECT_EQ(" ", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitLastChar32(" \xE0\xA0\x80", &rest, &c)); |
| EXPECT_EQ(0x800, c); |
| EXPECT_EQ(" ", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitLastChar32(" \xEF\xBF\xBF", &rest, &c)); |
| EXPECT_EQ(0xFFFF, c); |
| EXPECT_EQ(" ", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitLastChar32(" \xF0\x90\x80\x80", &rest, &c)); |
| EXPECT_EQ(0x10000, c); |
| EXPECT_EQ(" ", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitLastChar32(" \xF7\xBF\xBF\xBF", &rest, &c)); |
| EXPECT_EQ(0x1FFFFF, c); |
| EXPECT_EQ(" ", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitLastChar32(" \xF8\x88\x80\x80\x80", &rest, &c)); |
| EXPECT_EQ(0x200000, c); |
| EXPECT_EQ(" ", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitLastChar32(" \xFB\xBF\xBF\xBF\xBF", &rest, &c)); |
| EXPECT_EQ(0x3FFFFFF, c); |
| EXPECT_EQ(" ", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitLastChar32(" \xFC\x84\x80\x80\x80\x80", &rest, &c)); |
| EXPECT_EQ(0x4000000, c); |
| EXPECT_EQ(" ", rest); |
| |
| rest.clear(); |
| c = 0; |
| EXPECT_TRUE(Util::SplitLastChar32(" \xFD\xBF\xBF\xBF\xBF\xBF", &rest, &c)); |
| EXPECT_EQ(0x7FFFFFFF, c); |
| EXPECT_EQ(" ", rest); |
| |
| // If there is any invalid sequence, the entire text should be treated as |
| // am empty string. |
| { |
| c = 0; |
| EXPECT_FALSE(Util::SplitLastChar32(" \xC2", &rest, &c)); |
| EXPECT_EQ(0, c); |
| |
| c = 0; |
| EXPECT_FALSE(Util::SplitLastChar32(" \xC2\xC2", &rest, &c)); |
| EXPECT_EQ(0, c); |
| |
| c = 0; |
| EXPECT_FALSE(Util::SplitLastChar32(" \xE0", &rest, &c)); |
| EXPECT_EQ(0, c); |
| |
| c = 0; |
| EXPECT_FALSE(Util::SplitLastChar32(" \xE0\xE0\xE0", &rest, &c)); |
| EXPECT_EQ(0, c); |
| |
| c = 0; |
| EXPECT_FALSE(Util::SplitLastChar32(" \xF0", &rest, &c)); |
| EXPECT_EQ(0, c); |
| |
| c = 0; |
| EXPECT_FALSE(Util::SplitLastChar32(" \xF0\xF0\xF0\xF0", &rest, &c)); |
| EXPECT_EQ(0, c); |
| } |
| |
| // BOM should be treated as invalid byte. |
| { |
| c = 0; |
| EXPECT_FALSE(Util::SplitLastChar32(" \xFF", &rest, &c)); |
| EXPECT_EQ(0, c); |
| |
| c = 0; |
| EXPECT_FALSE(Util::SplitLastChar32(" \xFE", &rest, &c)); |
| EXPECT_EQ(0, c); |
| } |
| |
| // Invalid sequence for U+002F (redundant encoding) |
| { |
| c = 0; |
| EXPECT_FALSE(Util::SplitLastChar32("\xC0\xAF", &rest, &c)); |
| EXPECT_EQ(0, c); |
| |
| c = 0; |
| EXPECT_FALSE(Util::SplitLastChar32("\xE0\x80\xAF", &rest, &c)); |
| EXPECT_EQ(0, c); |
| |
| c = 0; |
| EXPECT_FALSE(Util::SplitLastChar32("\xF0\x80\x80\xAF", &rest, &c)); |
| EXPECT_EQ(0, c); |
| } |
| } |
| |
| } // namespace mozc |