| // Copyright 2010-2015, Google Inc. |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| #include "base/util.h" |
| |
| #ifdef OS_WIN |
| #include <Windows.h> |
| #include <WinCrypt.h> |
| #include <time.h> |
| #include <stdio.h> // MSVC requires this for _vsnprintf |
| #else // OS_WIN |
| |
| #ifdef OS_MACOSX |
| #include <mach/mach.h> |
| #include <mach/mach_time.h> |
| |
| #elif defined(__native_client__) // OS_MACOSX |
| #include <irt.h> |
| #endif // OS_MACOSX or __native_client__ |
| #include <sys/mman.h> |
| #include <sys/time.h> |
| #include <unistd.h> |
| #endif // OS_WIN |
| |
| #include <algorithm> |
| #include <cctype> |
| #include <cstdarg> |
| #include <cstdlib> |
| #include <cstring> |
| #include <fstream> |
| #include <iterator> |
| #include <map> |
| #include <string> |
| #include <utility> |
| #include <vector> |
| |
| #include "base/compiler_specific.h" |
| #include "base/logging.h" |
| #include "base/port.h" |
| #include "base/scoped_ptr.h" |
| #include "base/singleton.h" |
| #include "base/string_piece.h" |
| #include "base/text_converter.h" |
| |
| |
| namespace { |
| |
| #if MOZC_MSVC_VERSION_LT(18, 0) |
| void va_copy(va_list &a, va_list &b) { |
| a = b; |
| } |
| #endif // Visual C++ 2012 and prior |
| |
// Lower-level routine that takes a va_list and appends the formatted text to
// |dst|.  All other routines of the sprintf family are just convenience
// wrappers around it.
//
// |ap| itself is never consumed: every formatting attempt runs on a fresh
// va_copy, because a va_list may be invalidated by use.
//
// When vsnprintf reports the required size, a buffer of exactly that size is
// allocated and formatting is retried.  When it only reports failure
// (negative result), the buffer is doubled, but only up to
// kMaxDoubledLength; past that the function gives up and leaves |dst|
// unchanged instead of overflowing the length or allocating without bound.
void StringAppendV(string *dst, const char *format, va_list ap) {
  // First try with a small fixed size buffer.
  char space[1024];
  // Upper bound for the "size unknown, keep doubling" fallback.
  const size_t kMaxDoubledLength = 64 * 1024 * 1024;

  va_list backup_ap;
  va_copy(backup_ap, ap);
  int result = vsnprintf(space, sizeof(space), format, backup_ap);
  va_end(backup_ap);

  if ((result >= 0) && (static_cast<size_t>(result) < sizeof(space))) {
    // It fit.
    dst->append(space, result);
    return;
  }

  // Repeatedly increase the buffer size until it fits.  All size arithmetic
  // is done in size_t so that "result + 1" and the doubling cannot overflow.
  size_t length = sizeof(space);
  while (true) {
    if (result < 0) {
      // Older behavior: the required size is unknown, so just try doubling.
      if (length > kMaxDoubledLength / 2) {
        // vsnprintf keeps failing; this is an error, not a size problem.
        return;
      }
      length *= 2;
    } else {
      // We need exactly "result + 1" characters.
      length = static_cast<size_t>(result) + 1;
    }
    char *buf = new char[length];

    // Restore the va_list before we use it again.
    va_copy(backup_ap, ap);
    result = vsnprintf(buf, length, format, backup_ap);
    va_end(backup_ap);

    const bool fit =
        (result >= 0) && (static_cast<size_t>(result) < length);
    if (fit) {
      dst->append(buf, result);
    }
    delete[] buf;
    if (fit) {
      return;
    }
  }
}
| } // namespace |
| |
| namespace mozc { |
| |
| ConstChar32Iterator::ConstChar32Iterator(StringPiece utf8_string) |
| : utf8_string_(utf8_string), |
| current_(0), |
| done_(false) { |
| Next(); |
| } |
| |
| char32 ConstChar32Iterator::Get() const { |
| DCHECK(!done_); |
| return current_; |
| } |
| |
| void ConstChar32Iterator::Next() { |
| if (!done_) { |
| done_ = !Util::SplitFirstChar32(utf8_string_, ¤t_, &utf8_string_); |
| } |
| } |
| |
| bool ConstChar32Iterator::Done() const { |
| return done_; |
| } |
| |
| ConstChar32ReverseIterator::ConstChar32ReverseIterator(StringPiece utf8_string) |
| : utf8_string_(utf8_string), |
| current_(0), |
| done_(false) { |
| Next(); |
| } |
| |
| char32 ConstChar32ReverseIterator::Get() const { |
| DCHECK(!done_); |
| return current_; |
| } |
| |
| void ConstChar32ReverseIterator::Next() { |
| if (!done_) { |
| done_ = !Util::SplitLastChar32(utf8_string_, &utf8_string_, ¤t_); |
| } |
| } |
| |
| bool ConstChar32ReverseIterator::Done() const { |
| return done_; |
| } |
| |
| MultiDelimiter::MultiDelimiter(const char* delim) { |
| fill(lookup_table_, lookup_table_ + kTableSize, 0); |
| for (const char* p = delim; *p != '\0'; ++p) { |
| const unsigned char c = static_cast<unsigned char>(*p); |
| lookup_table_[c >> 3] |= 1 << (c & 0x07); |
| } |
| } |
| |
| template <typename Delimiter> |
| SplitIterator<Delimiter, SkipEmpty>::SplitIterator(StringPiece s, |
| const char *delim) |
| : end_(s.data() + s.size()), |
| delim_(delim), |
| sp_begin_(s.data()), |
| sp_len_(0) { |
| while (sp_begin_ != end_ && delim_.Contains(*sp_begin_)) ++sp_begin_; |
| const char *p = sp_begin_; |
| for (; p != end_ && !delim_.Contains(*p); ++p) {} |
| sp_len_ = p - sp_begin_; |
| } |
| |
| template <typename Delimiter> |
| void SplitIterator<Delimiter, SkipEmpty>::Next() { |
| sp_begin_ += sp_len_; |
| while (sp_begin_ != end_ && delim_.Contains(*sp_begin_)) ++sp_begin_; |
| if (sp_begin_ == end_) { |
| sp_len_ = 0; |
| return; |
| } |
| const char *p = sp_begin_; |
| for (; p != end_ && !delim_.Contains(*p); ++p) {} |
| sp_len_ = p - sp_begin_; |
| } |
| |
| template <typename Delimiter> |
| SplitIterator<Delimiter, AllowEmpty>::SplitIterator(StringPiece s, |
| const char *delim) |
| : end_(s.data() + s.size()), |
| delim_(delim), |
| sp_begin_(s.data()), |
| sp_len_(0), |
| done_(sp_begin_ == end_) { |
| const char *p = sp_begin_; |
| for (; p != end_ && !delim_.Contains(*p); ++p) {} |
| sp_len_ = p - sp_begin_; |
| } |
| |
| template <typename Delimiter> |
| void SplitIterator<Delimiter, AllowEmpty>::Next() { |
| sp_begin_ += sp_len_; |
| if (sp_begin_ == end_) { |
| sp_len_ = 0; |
| done_ = true; |
| return; |
| } |
| const char *p = ++sp_begin_; |
| for (; p != end_ && !delim_.Contains(*p); ++p) {} |
| sp_len_ = p - sp_begin_; |
| } |
| |
| // Explicitly instantiate the implementations of 4 patterns. |
| template class SplitIterator<SingleDelimiter, SkipEmpty>; |
| template class SplitIterator<MultiDelimiter, SkipEmpty>; |
| template class SplitIterator<SingleDelimiter, AllowEmpty>; |
| template class SplitIterator<MultiDelimiter, AllowEmpty>; |
| |
| void Util::SplitStringUsing(StringPiece str, |
| const char *delim, |
| vector<string> *output) { |
| if (delim[0] != '\0' && delim[1] == '\0') { |
| for (SplitIterator<SingleDelimiter> iter(str, delim); |
| !iter.Done(); iter.Next()) { |
| PushBackStringPiece(iter.Get(), output); |
| } |
| } else { |
| for (SplitIterator<MultiDelimiter> iter(str, delim); |
| !iter.Done(); iter.Next()) { |
| PushBackStringPiece(iter.Get(), output); |
| } |
| } |
| } |
| |
| void Util::SplitStringUsing(StringPiece str, |
| const char *delim, |
| vector<StringPiece> *output) { |
| if (delim[0] != '\0' && delim[1] == '\0') { |
| for (SplitIterator<SingleDelimiter> iter(str, delim); |
| !iter.Done(); iter.Next()) { |
| output->push_back(iter.Get()); |
| } |
| } else { |
| for (SplitIterator<MultiDelimiter> iter(str, delim); |
| !iter.Done(); iter.Next()) { |
| output->push_back(iter.Get()); |
| } |
| } |
| } |
| |
| void Util::SplitStringAllowEmpty(StringPiece str, |
| const char *delim, |
| vector<string> *output) { |
| if (delim[0] != '\0' && delim[1] == '\0') { |
| for (SplitIterator<SingleDelimiter, AllowEmpty> iter(str, delim); |
| !iter.Done(); iter.Next()) { |
| PushBackStringPiece(iter.Get(), output); |
| } |
| } else { |
| for (SplitIterator<MultiDelimiter, AllowEmpty> iter(str, delim); |
| !iter.Done(); iter.Next()) { |
| PushBackStringPiece(iter.Get(), output); |
| } |
| } |
| } |
| |
| void Util::SplitStringToUtf8Chars(const string &str, vector<string> *output) { |
| size_t begin = 0; |
| const size_t end = str.size(); |
| |
| while (begin < end) { |
| const size_t mblen = OneCharLen(str.c_str() + begin); |
| output->push_back(str.substr(begin, mblen)); |
| begin += mblen; |
| } |
| DCHECK_EQ(begin, end); |
| } |
| |
| void Util::SplitCSV(const string &input, vector<string> *output) { |
| scoped_ptr<char[]> tmp(new char[input.size() + 1]); |
| char *str = tmp.get(); |
| memcpy(str, input.data(), input.size()); |
| str[input.size()] = '\0'; |
| |
| char *eos = str + input.size(); |
| char *start = NULL; |
| char *end = NULL; |
| output->clear(); |
| |
| while (str < eos) { |
| while (*str == ' ' || *str == '\t') { |
| ++str; |
| } |
| |
| if (*str == '"') { |
| start = ++str; |
| end = start; |
| for (; str < eos; ++str) { |
| if (*str == '"') { |
| str++; |
| if (*str != '"') |
| break; |
| } |
| *end++ = *str; |
| } |
| str = find(str, eos, ','); |
| } else { |
| start = str; |
| str = find(str, eos, ','); |
| end = str; |
| } |
| bool end_is_empty = false; |
| if (*end == ',' && end == eos - 1) { |
| end_is_empty = true; |
| } |
| *end = '\0'; |
| output->push_back(start); |
| if (end_is_empty) { |
| output->push_back(""); |
| } |
| |
| ++str; |
| } |
| } |
| |
| void Util::JoinStrings(const vector<string> &input, |
| const char *delim, |
| string *output) { |
| output->clear(); |
| for (size_t i = 0; i < input.size(); ++i) { |
| if (i > 0) { |
| *output += delim; |
| } |
| *output += input[i]; |
| } |
| } |
| |
| void Util::JoinStringPieces(const vector<StringPiece> &pieces, |
| const char *delim, |
| string *output) { |
| if (pieces.empty()) { |
| output->clear(); |
| return; |
| } |
| |
| const size_t delim_len = strlen(delim); |
| size_t len = delim_len * (pieces.size() - 1); |
| for (size_t i = 0; i < pieces.size(); ++i) { |
| len += pieces[i].size(); |
| } |
| output->reserve(len); |
| pieces[0].CopyToString(output); |
| for (size_t i = 1; i < pieces.size(); ++i) { |
| output->append(delim, delim_len); |
| output->append(pieces[i].data(), pieces[i].size()); |
| } |
| } |
| |
| void Util::ConcatStrings(StringPiece s1, StringPiece s2, string *output) { |
| s1.CopyToString(output); |
| s2.AppendToString(output); |
| } |
| |
| void Util::AppendStringWithDelimiter(StringPiece delimiter, |
| StringPiece append_string, |
| string *output) { |
| CHECK(output); |
| if (!output->empty()) { |
| delimiter.AppendToString(output); |
| } |
| append_string.AppendToString(output); |
| } |
| |
| |
| void Util::StringReplace(StringPiece s, StringPiece oldsub, |
| StringPiece newsub, bool replace_all, |
| string *res) { |
| if (oldsub.empty()) { |
| s.AppendToString(res); // if empty, append the given string. |
| return; |
| } |
| |
| string::size_type start_pos = 0; |
| string::size_type pos; |
| do { |
| pos = s.find(oldsub, start_pos); |
| if (pos == string::npos) { |
| break; |
| } |
| res->append(s.data() + start_pos, pos - start_pos); |
| newsub.AppendToString(res); |
| start_pos = pos + oldsub.size(); // start searching again after the "old" |
| } while (replace_all); |
| res->append(s.data() + start_pos, s.length() - start_pos); |
| } |
| |
// Distance between an upper case letter and its lower case counterpart.
// The same value applies to ASCII letters (U+0061 "a" - U+0041 "A") and to
// fullwidth letters (U+FF41 - U+FF21).
namespace {
const size_t kOffsetFromUpperToLower = 0xFF41 - 0xFF21;
}  // namespace
| |
| void Util::LowerString(string *str) { |
| const char *begin = str->data(); |
| size_t mblen = 0; |
| |
| string utf8; |
| size_t pos = 0; |
| while (pos < str->size()) { |
| char32 ucs4 = UTF8ToUCS4(begin + pos, begin + str->size(), &mblen); |
| if (mblen == 0) { |
| break; |
| } |
| // ('A' <= ucs4 && ucs4 <= 'Z') || ('A' <= ucs4 && ucs4 <= 'Z') |
| if ((0x0041 <= ucs4 && ucs4 <= 0x005A) || |
| (0xFF21 <= ucs4 && ucs4 <= 0xFF3A)) { |
| ucs4 += kOffsetFromUpperToLower; |
| UCS4ToUTF8(ucs4, &utf8); |
| // The size of upper case character must be equal to the source |
| // lower case character. The following check asserts it. |
| if (utf8.size() != mblen) { |
| LOG(ERROR) << "The generated size differs from the source."; |
| return; |
| } |
| str->replace(pos, mblen, utf8); |
| } |
| pos += mblen; |
| } |
| } |
| |
| void Util::UpperString(string *str) { |
| const char *begin = str->data(); |
| size_t mblen = 0; |
| |
| string utf8; |
| size_t pos = 0; |
| while (pos < str->size()) { |
| char32 ucs4 = UTF8ToUCS4(begin + pos, begin + str->size(), &mblen); |
| // ('a' <= ucs4 && ucs4 <= 'z') || ('a' <= ucs4 && ucs4 <= 'z') |
| if ((0x0061 <= ucs4 && ucs4 <= 0x007A) || |
| (0xFF41 <= ucs4 && ucs4 <= 0xFF5A)) { |
| ucs4 -= kOffsetFromUpperToLower; |
| UCS4ToUTF8(ucs4, &utf8); |
| // The size of upper case character must be equal to the source |
| // lower case character. The following check asserts it. |
| if (utf8.size() != mblen) { |
| LOG(ERROR) << "The generated size differs from the source."; |
| return; |
| } |
| str->replace(pos, mblen, utf8); |
| } |
| pos += mblen; |
| } |
| } |
| |
| void Util::CapitalizeString(string *str) { |
| string first_str; |
| SubString(*str, 0, 1, &first_str); |
| UpperString(&first_str); |
| |
| string tailing_str; |
| SubString(*str, 1, string::npos, &tailing_str); |
| LowerString(&tailing_str); |
| |
| str->assign(first_str + tailing_str); |
| } |
| |
| bool Util::IsLowerAscii(StringPiece s) { |
| for (StringPiece::const_iterator iter = s.begin(); iter != s.end(); ++iter) { |
| if (!islower(*iter)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| bool Util::IsUpperAscii(StringPiece s) { |
| for (StringPiece::const_iterator iter = s.begin(); iter != s.end(); ++iter) { |
| if (!isupper(*iter)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| bool Util::IsCapitalizedAscii(StringPiece s) { |
| if (s.empty()) { |
| return true; |
| } |
| if (isupper(*s.begin())) { |
| return IsLowerAscii(s.substr(1)); |
| } |
| return false; |
| } |
| |
| bool Util::IsLowerOrUpperAscii(StringPiece s) { |
| if (s.empty()) { |
| return true; |
| } |
| if (islower(*s.begin())) { |
| return IsLowerAscii(s.substr(1)); |
| } |
| if (isupper(*s.begin())) { |
| return IsUpperAscii(s.substr(1)); |
| } |
| return false; |
| } |
| |
| bool Util::IsUpperOrCapitalizedAscii(StringPiece s) { |
| if (s.empty()) { |
| return true; |
| } |
| if (isupper(*s.begin())) { |
| return IsLowerOrUpperAscii(s.substr(1)); |
| } |
| return false; |
| } |
| |
| void Util::StripWhiteSpaces(const string &input, string *output) { |
| DCHECK(output); |
| output->clear(); |
| |
| if (input.empty()) { |
| return; |
| } |
| |
| size_t start = 0; |
| size_t end = input.size() - 1; |
| for (; start < input.size() && isspace(input[start]); ++start) {} |
| for (; end > start && isspace(input[end]); --end) {} |
| |
| if (end >= start) { |
| output->assign(input.data() + start, end - start + 1); |
| } |
| } |
| |
| namespace { |
| |
// UTF-8 sequence length indexed by the value of the first byte.
// Bytes 0x00-0xBF map to 1 (this includes the continuation bytes 0x80-0xBF),
// 0xC0-0xDF to 2, 0xE0-0xEF to 3 and 0xF0-0xFF to 4.
const unsigned char kUTF8LenTbl[256] = {
  // 0x00 - 0x1F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0x20 - 0x3F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0x40 - 0x5F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0x60 - 0x7F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0x80 - 0x9F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0xA0 - 0xBF
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0xC0 - 0xDF
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  // 0xE0 - 0xEF, then 0xF0 - 0xFF
  3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
};
| |
| bool IsUTF8TrailingByte(uint8 c) { |
| return (c & 0xc0) == 0x80; |
| } |
| |
| } // namespace |
| |
| // Return length of a single UTF-8 source character |
| size_t Util::OneCharLen(const char *src) { |
| return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)]; |
| } |
| |
| size_t Util::CharsLen(const char *src, size_t length) { |
| const char *begin = src; |
| const char *end = src + length; |
| int result = 0; |
| while (begin < end) { |
| ++result; |
| begin += OneCharLen(begin); |
| } |
| return result; |
| } |
| |
| char32 Util::UTF8ToUCS4(const char *begin, |
| const char *end, |
| size_t *mblen) { |
| StringPiece s(begin, end - begin); |
| StringPiece rest; |
| char32 c = 0; |
| if (!Util::SplitFirstChar32(s, &c, &rest)) { |
| *mblen = 0; |
| return 0; |
| } |
| *mblen = rest.begin() - s.begin(); |
| return c; |
| } |
| |
| bool Util::SplitFirstChar32(StringPiece s, |
| char32 *first_char32, |
| StringPiece *rest) { |
| char32 dummy_char32 = 0; |
| if (first_char32 == NULL) { |
| first_char32 = &dummy_char32; |
| } |
| StringPiece dummy_rest; |
| if (rest == NULL) { |
| rest = &dummy_rest; |
| } |
| |
| *first_char32 = 0; |
| rest->clear(); |
| |
| while (true) { |
| if (s.empty()) { |
| return false; |
| } |
| |
| char32 result = 0; |
| size_t len = 0; |
| char32 min_value = 0; |
| char32 max_value = 0xffffffff; |
| { |
| const uint8 leading_byte = static_cast<uint8>(s[0]); |
| if (leading_byte < 0x80) { |
| *first_char32 = leading_byte; |
| *rest = s.substr(1); |
| return true; |
| } |
| |
| if (IsUTF8TrailingByte(leading_byte)) { |
| // UTF-8 sequence should not start trailing bytes. |
| return false; |
| } |
| |
| if ((leading_byte & 0xe0) == 0xc0) { |
| len = 2; |
| min_value = 0x0080; |
| max_value = 0x07ff; |
| result = (leading_byte & 0x1f); |
| } else if ((leading_byte & 0xf0) == 0xe0) { |
| len = 3; |
| min_value = 0x0800; |
| max_value = 0xffff; |
| result = (leading_byte & 0x0f); |
| } else if ((leading_byte & 0xf8) == 0xf0) { |
| len = 4; |
| min_value = 0x010000; |
| max_value = 0x1fffff; |
| result = (leading_byte & 0x07); |
| // Below is out of UCS4 but acceptable in 32-bit. |
| } else if ((leading_byte & 0xfc) == 0xf8) { |
| len = 5; |
| min_value = 0x0200000; |
| max_value = 0x3ffffff; |
| result = (leading_byte & 0x03); |
| } else if ((leading_byte & 0xfe) == 0xfc) { |
| len = 6; |
| min_value = 0x4000000; |
| max_value = 0x7fffffff; |
| result = (leading_byte & 0x01); |
| } else { |
| // Currently 0xFE and 0xFF are treated as invalid. |
| return false; |
| } |
| } |
| |
| if (s.size() < len) { |
| // Data length is too short. |
| return false; |
| } |
| |
| for (size_t i = 1; i < len; ++i) { |
| const uint8 c = static_cast<uint8>(s[i]); |
| if (!IsUTF8TrailingByte(c)) { |
| // Trailing bytes not found. |
| return false; |
| } |
| result <<= 6; |
| result += (c & 0x3f); |
| } |
| if ((result < min_value) || (max_value < result)) { |
| // redundant UTF-8 sequence found. |
| return false; |
| } |
| *first_char32 = result; |
| *rest = s.substr(len); |
| return true; |
| } |
| } |
| |
| bool Util::SplitLastChar32(StringPiece s, |
| StringPiece *rest, |
| char32 *last_char32) { |
| StringPiece dummy_rest; |
| if (rest == NULL) { |
| rest = &dummy_rest; |
| } |
| char32 dummy_char32 = 0; |
| if (last_char32 == NULL) { |
| last_char32 = &dummy_char32; |
| } |
| |
| *last_char32 = 0; |
| rest->clear(); |
| |
| if (s.empty()) { |
| return false; |
| } |
| StringPiece::const_reverse_iterator it = s.rbegin(); |
| for (; (it != s.rend()) && IsUTF8TrailingByte(*it); ++it) {} |
| if (it == s.rend()) { |
| return false; |
| } |
| const StringPiece::difference_type len = distance(s.rbegin(), it) + 1; |
| const StringPiece last_piece = s.substr(s.size() - len); |
| StringPiece result_piece; |
| if (!SplitFirstChar32(last_piece, last_char32, &result_piece)) { |
| return false; |
| } |
| if (!result_piece.empty()) { |
| return false; |
| } |
| *rest = s; |
| rest->remove_suffix(len); |
| return true; |
| } |
| |
| void Util::UCS4ToUTF8(char32 c, string *output) { |
| output->clear(); |
| UCS4ToUTF8Append(c, output); |
| } |
| |
| void Util::UCS4ToUTF8Append(char32 c, string *output) { |
| if (c == 0) { |
| // Do nothing if |c| is NUL. Previous implementation of UCS4ToUTF8Append |
| // worked like this. |
| return; |
| } |
| if (c < 0x00080) { |
| output->push_back(static_cast<char>(c & 0xFF)); |
| return; |
| } |
| if (c < 0x00800) { |
| const char buf[] = { |
| static_cast<char>(0xC0 + ((c >> 6) & 0x1F)), |
| static_cast<char>(0x80 + (c & 0x3F)), |
| }; |
| output->append(buf, arraysize(buf)); |
| return; |
| } |
| if (c < 0x10000) { |
| const char buf[] = { |
| static_cast<char>(0xE0 + ((c >> 12) & 0x0F)), |
| static_cast<char>(0x80 + ((c >> 6) & 0x3F)), |
| static_cast<char>(0x80 + (c & 0x3F)), |
| }; |
| output->append(buf, arraysize(buf)); |
| return; |
| } |
| if (c < 0x200000) { |
| const char buf[] = { |
| static_cast<char>(0xF0 + ((c >> 18) & 0x07)), |
| static_cast<char>(0x80 + ((c >> 12) & 0x3F)), |
| static_cast<char>(0x80 + ((c >> 6) & 0x3F)), |
| static_cast<char>(0x80 + (c & 0x3F)), |
| }; |
| output->append(buf, arraysize(buf)); |
| return; |
| } |
| // below is not in UCS4 but in 32bit int. |
| if (c < 0x8000000) { |
| const char buf[] = { |
| static_cast<char>(0xF8 + ((c >> 24) & 0x03)), |
| static_cast<char>(0x80 + ((c >> 18) & 0x3F)), |
| static_cast<char>(0x80 + ((c >> 12) & 0x3F)), |
| static_cast<char>(0x80 + ((c >> 6) & 0x3F)), |
| static_cast<char>(0x80 + (c & 0x3F)), |
| }; |
| output->append(buf, arraysize(buf)); |
| return; |
| } |
| const char buf[] = { |
| static_cast<char>(0xFC + ((c >> 30) & 0x01)), |
| static_cast<char>(0x80 + ((c >> 24) & 0x3F)), |
| static_cast<char>(0x80 + ((c >> 18) & 0x3F)), |
| static_cast<char>(0x80 + ((c >> 12) & 0x3F)), |
| static_cast<char>(0x80 + ((c >> 6) & 0x3F)), |
| static_cast<char>(0x80 + (c & 0x3F)), |
| }; |
| output->append(buf, arraysize(buf)); |
| } |
| |
| #ifdef OS_WIN |
| size_t Util::WideCharsLen(StringPiece src) { |
| const int num_chars = |
| ::MultiByteToWideChar(CP_UTF8, 0, src.begin(), src.size(), NULL, 0); |
| if (num_chars <= 0) { |
| return 0; |
| } |
| return num_chars; |
| } |
| |
| int Util::UTF8ToWide(StringPiece input, wstring *output) { |
| const size_t output_length = WideCharsLen(input); |
| if (output_length == 0) { |
| return 0; |
| } |
| |
| const size_t buffer_len = output_length + 1; |
| scoped_ptr<wchar_t[]> input_wide(new wchar_t[buffer_len]); |
| const int copied_num_chars = ::MultiByteToWideChar( |
| CP_UTF8, 0, input.begin(), input.size(), input_wide.get(), |
| buffer_len); |
| if (0 <= copied_num_chars && copied_num_chars < buffer_len) { |
| output->assign(input_wide.get(), copied_num_chars); |
| } |
| return copied_num_chars; |
| } |
| |
| int Util::WideToUTF8(const wchar_t *input, string *output) { |
| const int output_length = WideCharToMultiByte(CP_UTF8, 0, input, -1, NULL, 0, |
| NULL, NULL); |
| if (output_length == 0) { |
| return 0; |
| } |
| |
| scoped_ptr<char[]> input_encoded(new char[output_length + 1]); |
| const int result = WideCharToMultiByte(CP_UTF8, 0, input, -1, |
| input_encoded.get(), |
| output_length + 1, NULL, NULL); |
| if (result > 0) { |
| output->assign(input_encoded.get()); |
| } |
| return result; |
| } |
| |
| int Util::WideToUTF8(const wstring &input, string *output) { |
| return WideToUTF8(input.c_str(), output); |
| } |
| #endif // OS_WIN |
| |
| StringPiece Util::SubStringPiece(StringPiece src, size_t start) { |
| const char *begin = src.data(); |
| const char *end = begin + src.size(); |
| for (size_t i = 0; i < start && begin < end; ++i) { |
| begin += OneCharLen(begin); |
| } |
| const size_t prefix_len = begin - src.data(); |
| return StringPiece(begin, src.size() - prefix_len); |
| } |
| |
| StringPiece Util::SubStringPiece( |
| StringPiece src, size_t start, size_t length) { |
| src = SubStringPiece(src, start); |
| size_t l = length; |
| const char *substr_end = src.data(); |
| const char *const end = src.data() + src.size(); |
| while (l > 0 && substr_end < end) { |
| substr_end += OneCharLen(substr_end); |
| --l; |
| } |
| return StringPiece(src.data(), substr_end - src.data()); |
| } |
| |
| void Util::SubString(StringPiece src, size_t start, size_t length, |
| string *result) { |
| DCHECK(result); |
| const StringPiece substr = SubStringPiece(src, start, length); |
| substr.CopyToString(result); |
| } |
| |
| bool Util::StartsWith(StringPiece str, StringPiece prefix) { |
| if (str.size() < prefix.size()) { |
| return false; |
| } |
| return (0 == memcmp(str.data(), prefix.data(), prefix.size())); |
| } |
| |
| bool Util::EndsWith(StringPiece str, StringPiece suffix) { |
| if (str.size() < suffix.size()) { |
| return false; |
| } |
| return (0 == memcmp(str.data() + str.size() - suffix.size(), |
| suffix.data(), suffix.size())); |
| } |
| |
| void Util::StripUTF8BOM(string *line) { |
| static const char kUTF8BOM[] = "\xef\xbb\xbf"; |
| if (line->substr(0, 3) == kUTF8BOM) { |
| line->erase(0, 3); |
| } |
| } |
| |
| bool Util::IsUTF16BOM(const string &line) { |
| static const char kUTF16LEBOM[] = "\xff\xfe"; |
| static const char kUTF16BEBOM[] = "\xfe\xff"; |
| if (line.size() >= 2 && |
| (line.substr(0, 2) == kUTF16LEBOM || |
| line.substr(0, 2) == kUTF16BEBOM)) { |
| return true; |
| } |
| return false; |
| } |
| |
| bool Util::IsAndroidPuaEmoji(StringPiece s) { |
| static const char kUtf8MinAndroidPuaEmoji[] = "\xf3\xbe\x80\x80"; |
| static const char kUtf8MaxAndroidPuaEmoji[] = "\xf3\xbe\xba\xa0"; |
| return (s.size() == 4 && |
| kUtf8MinAndroidPuaEmoji <= s && s <= kUtf8MaxAndroidPuaEmoji); |
| } |
| |
| string Util::StringPrintf(const char *format, ...) { |
| va_list ap; |
| va_start(ap, format); |
| string result; |
| StringAppendV(&result, format, ap); |
| va_end(ap); |
| return result; |
| } |
| |
| bool Util::ChopReturns(string *line) { |
| const string::size_type line_end = line->find_last_not_of("\r\n"); |
| if (line_end + 1 != line->size()) { |
| line->erase(line_end + 1); |
| return true; |
| } |
| return false; |
| } |
| |
| namespace { |
| bool GetSecureRandomSequence(char *buf, size_t buf_size) { |
| memset(buf, '\0', buf_size); |
| #ifdef OS_WIN |
| HCRYPTPROV hprov; |
| if (!::CryptAcquireContext(&hprov, |
| NULL, |
| NULL, |
| PROV_RSA_FULL, |
| CRYPT_VERIFYCONTEXT)) { |
| return false; |
| } |
| if (!::CryptGenRandom(hprov, |
| static_cast<DWORD>(buf_size), |
| reinterpret_cast<BYTE *>(buf))) { |
| ::CryptReleaseContext(hprov, 0); |
| return false; |
| } |
| ::CryptReleaseContext(hprov, 0); |
| return true; |
| #elif defined(__native_client__) |
| struct nacl_irt_random interface; |
| |
| if (nacl_interface_query(NACL_IRT_RANDOM_v0_1, &interface, |
| sizeof(interface)) != sizeof(interface)) { |
| DLOG(ERROR) << "Cannot get NACL_IRT_RANDOM_v0_1 interface"; |
| return false; |
| } |
| |
| size_t nread; |
| const int error = interface.get_random_bytes(buf, buf_size, &nread); |
| if (error != 0) { |
| LOG(ERROR) << "interface.get_random_bytes error: " << error; |
| return false; |
| } else if (nread != buf_size) { |
| LOG(ERROR) << "interface.get_random_bytes error. nread: " << nread |
| << " buf_size: " << buf_size; |
| return false; |
| } |
| return true; |
| #else // !OS_WIN && !__native_client__ |
| // Use non blocking interface on Linux. |
| // Mac also have /dev/urandom (although it's identical with /dev/random) |
| ifstream ifs("/dev/urandom", ios::binary); |
| if (!ifs) { |
| return false; |
| } |
| ifs.read(buf, buf_size); |
| return true; |
| #endif // OS_WIN or __native_client__ |
| } |
| } // namespace |
| |
| void Util::GetRandomSequence(char *buf, size_t buf_size) { |
| if (GetSecureRandomSequence(buf, buf_size)) { |
| return; |
| } |
| LOG(ERROR) << "Failed to generate secure random sequence. " |
| << "Make it with Util::Random()"; |
| for (size_t i = 0; i < buf_size; ++i) { |
| buf[i] = static_cast<char>(Util::Random(256)); |
| } |
| } |
| |
| void Util::GetRandomAsciiSequence(char *buf, size_t buf_size) { |
| // We use this map to convert a random byte value to an ascii character. |
| // Its size happens to be 64, which is just one fourth of the number of |
| // values that can be represented by a single byte value. This accidental |
| // coincidence makes implementation of the method quite simple. |
| const char kCharMap[] = |
| "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_"; |
| GetRandomSequence(buf, buf_size); |
| for (size_t i = 0; i < buf_size; ++i) { |
| // The size of kCharMap is just one fourth of 256. So we don't need to |
| // care if probability distribution over the characters is biased. |
| buf[i] = kCharMap[static_cast<unsigned char>(buf[i]) % 64]; |
| } |
| } |
| |
| int Util::Random(int size) { |
| DLOG_IF(FATAL, size < 0) << "|size| should be positive or 0. size: " << size; |
| // Caveat: RAND_MAX is likely to be too small to achieve fine-grained |
| // uniform distribution. |
| // TODO(yukawa): Improve the resolution. |
| return static_cast<int> (1.0 * size * rand() / (RAND_MAX + 1.0)); |
| } |
| |
| void Util::SetRandomSeed(uint32 seed) { |
| ::srand(seed); |
| } |
| |
| namespace { |
| class ClockImpl : public Util::ClockInterface { |
| public: |
| #ifndef __native_client__ |
| ClockImpl() {} |
| #else // __native_client__ |
| ClockImpl() : timezone_offset_sec_(0) {} |
| #endif // __native_client__ |
| virtual ~ClockImpl() {} |
| |
| virtual void GetTimeOfDay(uint64 *sec, uint32 *usec) { |
| #ifdef OS_WIN |
| FILETIME file_time; |
| GetSystemTimeAsFileTime(&file_time); |
| ULARGE_INTEGER time_value; |
| time_value.HighPart = file_time.dwHighDateTime; |
| time_value.LowPart = file_time.dwLowDateTime; |
| // Convert into microseconds |
| time_value.QuadPart /= 10; |
| // kDeltaEpochInMicroSecs is difference between January 1, 1970 and |
| // January 1, 1601 in microsecond. |
| // This number is calculated as follows. |
| // ((1970 - 1601) * 365 + 89) * 24 * 60 * 60 * 1000000 |
| // 89 is the number of leap years between 1970 and 1601. |
| const uint64 kDeltaEpochInMicroSecs = 11644473600000000ULL; |
| // Convert file time to unix epoch |
| time_value.QuadPart -= kDeltaEpochInMicroSecs; |
| *sec = static_cast<uint64>(time_value.QuadPart / 1000000UL); |
| *usec = static_cast<uint32>(time_value.QuadPart % 1000000UL); |
| #else // OS_WIN |
| struct timeval tv; |
| gettimeofday(&tv, NULL); |
| *sec = tv.tv_sec; |
| *usec = tv.tv_usec; |
| #endif // OS_WIN |
| } |
| |
| virtual uint64 GetTime() { |
| #ifdef OS_WIN |
| return static_cast<uint64>(_time64(NULL)); |
| #else |
| return static_cast<uint64>(time(NULL)); |
| #endif // OS_WIN |
| } |
| |
| virtual bool GetTmWithOffsetSecond(time_t offset_sec, tm *output) { |
| const time_t current_sec = static_cast<time_t>(this->GetTime()); |
| const time_t modified_sec = current_sec + offset_sec; |
| |
| #ifdef OS_WIN |
| if (_localtime64_s(output, &modified_sec) != 0) { |
| return false; |
| } |
| #elif defined(__native_client__) |
| const time_t localtime_sec = modified_sec + timezone_offset_sec_; |
| if (gmtime_r(&localtime_sec, output) == NULL) { |
| return false; |
| } |
| #else // !OS_WIN && !__native_client__ |
| if (localtime_r(&modified_sec, output) == NULL) { |
| return false; |
| } |
| #endif // OS_WIN |
| return true; |
| } |
| |
| // Returns how many GetTicks() ticks elapse per second on this platform. |
| virtual uint64 GetFrequency() { |
| #if defined(OS_WIN) |
|   LARGE_INTEGER timestamp; |
|   // TODO(yukawa): Consider the case where QueryPerformanceCounter is not |
|   // available. |
|   // The API writes through a LARGE_INTEGER pointer, so pass the address. |
|   const BOOL result = ::QueryPerformanceFrequency(&timestamp); |
|   return static_cast<uint64>(timestamp.QuadPart); |
| #elif defined(OS_MACOSX) |
|   static mach_timebase_info_data_t timebase_info; |
|   mach_timebase_info(&timebase_info); |
|   return static_cast<uint64>( |
|       1.0e9 * timebase_info.denom / timebase_info.numer); |
| #elif defined(OS_LINUX) |
| #if defined(HAVE_LIBRT) |
|   // GetTicks() reports nanoseconds when librt is available. |
|   return 1000000000uLL; |
| #else  // HAVE_LIBRT |
|   // GetTicks() reports microseconds when librt is not available. |
|   return 1000000uLL; |
| #endif  // HAVE_LIBRT |
| #else  // platforms (OS_WIN, OS_MACOSX, OS_LINUX, ...) |
| #error "Not supported platform" |
| #endif  // platforms (OS_WIN, OS_MACOSX, OS_LINUX, ...) |
| } |
| |
| // Returns the current value of the platform tick counter. The unit is |
| // platform specific; GetFrequency() gives the number of ticks per second. |
| virtual uint64 GetTicks() { |
| #if defined(OS_WIN) |
|   LARGE_INTEGER timestamp; |
|   // TODO(yukawa): Consider the case where QueryPerformanceCounter is not |
|   // available. |
|   // The API writes through a LARGE_INTEGER pointer, so pass the address. |
|   const BOOL result = ::QueryPerformanceCounter(&timestamp); |
|   return static_cast<uint64>(timestamp.QuadPart); |
| #elif defined(OS_MACOSX) |
|   return static_cast<uint64>(mach_absolute_time()); |
| #elif defined(OS_LINUX) |
| #if defined(HAVE_LIBRT) |
|   struct timespec timestamp; |
|   if (-1 == clock_gettime(CLOCK_REALTIME, &timestamp)) { |
|     // The clock could not be read; report zero ticks. |
|     return 0; |
|   } |
|   // Fold seconds and nanoseconds into a single nanosecond count. |
|   return timestamp.tv_sec * 1000000000uLL + timestamp.tv_nsec; |
| #else  // HAVE_LIBRT |
|   // librt is not linked on Android, so we use GetTimeOfDay instead. |
|   // GetFrequency() always returns 1MHz when librt is not available, |
|   // so we use microseconds as ticks. |
|   uint64 sec; |
|   uint32 usec; |
|   GetTimeOfDay(&sec, &usec); |
|   return sec * 1000000 + usec; |
| #endif  // HAVE_LIBRT |
| #else  // platforms (OS_WIN, OS_MACOSX, OS_LINUX, ...) |
| #error "Not supported platform" |
| #endif  // platforms (OS_WIN, OS_MACOSX, OS_LINUX, ...) |
| } |
| |
| #ifdef __native_client__ |
| // Stores the offset, in seconds, that GetTmWithOffsetSecond() adds to the |
| // time before calling gmtime_r on Native Client. |
| virtual void SetTimezoneOffset(int32 timezone_offset_sec) { |
|   this->timezone_offset_sec_ = timezone_offset_sec; |
| } |
| |
| private: |
| int32 timezone_offset_sec_; |
| #endif // __native_client__ |
| }; |
| |
| // Clock installed through Util::SetClockHandler(). While it is NULL the |
| // Singleton<ClockImpl> instance is used instead. |
| Util::ClockInterface *g_clock_handler = NULL; |
| |
| // Returns the installed clock handler, or the ClockImpl singleton when no |
| // handler has been installed. |
| Util::ClockInterface *GetClockHandler() { |
|   if (g_clock_handler == NULL) { |
|     return Singleton<ClockImpl>::get(); |
|   } |
|   return g_clock_handler; |
| } |
| |
| } // namespace |
| |
| // Installs handler as the clock used by the Util time functions. Passing |
| // NULL makes GetClockHandler() fall back to the ClockImpl singleton. |
| void Util::SetClockHandler(Util::ClockInterface *handler) { |
|   g_clock_handler = handler; |
| } |
| |
| // Asks the active clock handler to write the current time into sec and usec. |
| void Util::GetTimeOfDay(uint64 *sec, uint32 *usec) { |
|   Util::ClockInterface *const clock = GetClockHandler(); |
|   clock->GetTimeOfDay(sec, usec); |
| } |
| |
| // Returns GetTime() of the active clock handler. |
| uint64 Util::GetTime() { |
|   Util::ClockInterface *const clock = GetClockHandler(); |
|   return clock->GetTime(); |
| } |
| |
| // Fills current_time with the current broken-down time, i.e. the result of |
| // GetTmWithOffsetSecond() with an offset of zero seconds. |
| bool Util::GetCurrentTm(tm *current_time) { |
|   const int kNoOffsetSec = 0; |
|   return GetTmWithOffsetSecond(current_time, kNoOffsetSec); |
| } |
| |
| // Forwards to the active clock handler. Note that the handler takes the |
| // offset first and the output structure second. |
| bool Util::GetTmWithOffsetSecond(tm *time_with_offset, int offset_sec) { |
|   Util::ClockInterface *const clock = GetClockHandler(); |
|   return clock->GetTmWithOffsetSecond(offset_sec, time_with_offset); |
| } |
| |
| // Returns GetFrequency() of the active clock handler. |
| uint64 Util::GetFrequency() { |
|   Util::ClockInterface *const clock = GetClockHandler(); |
|   return clock->GetFrequency(); |
| } |
| |
| // Returns GetTicks() of the active clock handler. |
| uint64 Util::GetTicks() { |
|   Util::ClockInterface *const clock = GetClockHandler(); |
|   return clock->GetTicks(); |
| } |
| |
| // Suspends the caller for about msec milliseconds. |
| void Util::Sleep(uint32 msec) { |
| #ifdef OS_WIN |
|   ::Sleep(msec); |
| #else  // OS_WIN |
|   // usleep() takes microseconds. POSIX allows it to fail with EINVAL for |
|   // arguments of 1,000,000 or more, and "msec * 1000" wraps around in 32 |
|   // bits once msec exceeds 4294967. Long waits are therefore issued as a |
|   // series of sub-second slices; waits below one second are a single call. |
|   const uint32 kSliceMsec = 999; |
|   while (msec > kSliceMsec) { |
|     usleep(kSliceMsec * 1000); |
|     msec -= kSliceMsec; |
|   } |
|   usleep(msec * 1000); |
| #endif  // OS_WIN |
| } |
| |
| #ifdef __native_client__ |
| // Forwards the timezone offset (in seconds) to the active clock handler. |
| void Util::SetTimezoneOffset(int32 timezone_offset_sec) { |
|   GetClockHandler()->SetTimezoneOffset(timezone_offset_sec); |
| } |
| #endif // __native_client__ |
| |
| namespace { |
| |
| // Appends prefix to output, followed by the two upper-case hexadecimal |
| // digits of the byte held in input (high nibble first). |
| void EscapeInternal(char input, const string &prefix, string *output) { |
|   static const char kHexDigits[] = "0123456789ABCDEF"; |
|   const int byte_value = static_cast<int>(input); |
|   const int high_nibble = (byte_value & 0xF0) >> 4; |
|   const int low_nibble = byte_value & 0x0F; |
|   output->append(prefix); |
|   output->push_back(kHexDigits[high_nibble]); |
|   output->push_back(kHexDigits[low_nibble]); |
| } |
| |
| } // namespace |
| |
| // Load Rules |
| #include "base/japanese_util_rule.h" |
| |
| // Runs TextConverter::Convert over input with the hiragana_to_katakana rule |
| // tables, passing output as the destination string. |
| void Util::HiraganaToKatakana(StringPiece input, string *output) { |
|   TextConverter::Convert( |
|       hiragana_to_katakana_da, hiragana_to_katakana_table, input, output); |
| } |
| |
| // Chains two conversions: the hiragana_to_katakana rule first, then the |
| // fullwidthkatakana_to_halfwidthkatakana rule on the intermediate result. |
| void Util::HiraganaToHalfwidthKatakana(StringPiece input, |
|                                        string *output) { |
|   string katakana; |
|   TextConverter::Convert( |
|       hiragana_to_katakana_da, hiragana_to_katakana_table, input, &katakana); |
|   TextConverter::Convert( |
|       fullwidthkatakana_to_halfwidthkatakana_da, |
|       fullwidthkatakana_to_halfwidthkatakana_table, |
|       katakana, |
|       output); |
| } |
| |
| // Runs TextConverter::Convert over input with the hiragana_to_romanji rule |
| // tables, passing output as the destination string. |
| void Util::HiraganaToRomanji(StringPiece input, string *output) { |
|   TextConverter::Convert( |
|       hiragana_to_romanji_da, hiragana_to_romanji_table, input, output); |
| } |
| |
| // Runs TextConverter::Convert over input with the |
| // halfwidthascii_to_fullwidthascii rule tables, passing output as the |
| // destination string. |
| void Util::HalfWidthAsciiToFullWidthAscii(StringPiece input, |
|                                           string *output) { |
|   TextConverter::Convert( |
|       halfwidthascii_to_fullwidthascii_da, |
|       halfwidthascii_to_fullwidthascii_table, input, output); |
| } |
| |
| // Runs TextConverter::Convert over input with the |
| // fullwidthascii_to_halfwidthascii rule tables, passing output as the |
| // destination string. |
| void Util::FullWidthAsciiToHalfWidthAscii(StringPiece input, |
|                                           string *output) { |
|   TextConverter::Convert( |
|       fullwidthascii_to_halfwidthascii_da, |
|       fullwidthascii_to_halfwidthascii_table, input, output); |
| } |
| |
| // Chains two conversions: the hiragana_to_romanji rule first, then the |
| // halfwidthascii_to_fullwidthascii rule on the intermediate result. |
| void Util::HiraganaToFullwidthRomanji(StringPiece input, string *output) { |
|   string romanji; |
|   TextConverter::Convert( |
|       hiragana_to_romanji_da, hiragana_to_romanji_table, input, &romanji); |
|   TextConverter::Convert( |
|       halfwidthascii_to_fullwidthascii_da, |
|       halfwidthascii_to_fullwidthascii_table, romanji, output); |
| } |
| |
| // Runs TextConverter::Convert over input with the romanji_to_hiragana rule |
| // tables, passing output as the destination string. |
| void Util::RomanjiToHiragana(StringPiece input, string *output) { |
|   TextConverter::Convert( |
|       romanji_to_hiragana_da, romanji_to_hiragana_table, input, output); |
| } |
| |
| // Runs TextConverter::Convert over input with the katakana_to_hiragana rule |
| // tables, passing output as the destination string. |
| void Util::KatakanaToHiragana(StringPiece input, string *output) { |
|   TextConverter::Convert( |
|       katakana_to_hiragana_da, katakana_to_hiragana_table, input, output); |
| } |
| |
| // Runs TextConverter::Convert over input with the |
| // halfwidthkatakana_to_fullwidthkatakana rule tables, passing output as the |
| // destination string. |
| void Util::HalfWidthKatakanaToFullWidthKatakana(StringPiece input, |
|                                                 string *output) { |
|   TextConverter::Convert( |
|       halfwidthkatakana_to_fullwidthkatakana_da, |
|       halfwidthkatakana_to_fullwidthkatakana_table, input, output); |
| } |
| |
| // Runs TextConverter::Convert over input with the |
| // fullwidthkatakana_to_halfwidthkatakana rule tables, passing output as the |
| // destination string. |
| void Util::FullWidthKatakanaToHalfWidthKatakana(StringPiece input, |
|                                                 string *output) { |
|   TextConverter::Convert( |
|       fullwidthkatakana_to_halfwidthkatakana_da, |
|       fullwidthkatakana_to_halfwidthkatakana_table, input, output); |
| } |
| |
| // Converts full-width ASCII and then full-width katakana in input to their |
| // half-width forms. output is cleared before the final result is written. |
| void Util::FullWidthToHalfWidth(StringPiece input, string *output) { |
|   string ascii_converted; |
|   FullWidthAsciiToHalfWidthAscii(input, &ascii_converted); |
|   output->clear(); |
|   FullWidthKatakanaToHalfWidthKatakana(ascii_converted, output); |
| } |
| |
| // Converts half-width ASCII and then half-width katakana in input to their |
| // full-width forms. output is cleared before the final result is written. |
| void Util::HalfWidthToFullWidth(StringPiece input, string *output) { |
|   string ascii_converted; |
|   HalfWidthAsciiToFullWidthAscii(input, &ascii_converted); |
|   output->clear(); |
|   HalfWidthKatakanaToFullWidthKatakana(ascii_converted, output); |
| } |
| |
| // Runs TextConverter::Convert over input with the normalize_voiced_sound |
| // rule tables, passing output as the destination string. |
| // |
| // TODO(tabata): Add another function to split voice mark |
| // of some UNICODE only characters (required to display |
| // and commit for old clients) |
| void Util::NormalizeVoicedSoundMark(StringPiece input, string *output) { |
|   TextConverter::Convert( |
|       normalize_voiced_sound_da, normalize_voiced_sound_table, input, output); |
| } |
| |
| namespace { |
| // Bidirectional lookup between open and close brackets. Every pair listed |
| // in the constructor is registered twice: once in the form produced by |
| // Util::HalfWidthToFullWidth and once in the form produced by |
| // Util::FullWidthToHalfWidth. |
| class BracketHandler { |
|  public: |
|   BracketHandler() { |
|     VLOG(1) << "Init bracket mapping"; |
| |
|     struct BracketPair { |
|       const char *open; |
|       const char *close; |
|     }; |
|     const BracketPair kBracketPairs[] = { |
|       // { "(", ")" }, |
|       // { "〔", "〕" }, |
|       // { "[", "]" }, |
|       // { "{", "}" }, |
|       // { "〈", "〉" }, |
|       // { "《", "》" }, |
|       // { "「", "」" }, |
|       // { "『", "』" }, |
|       // { "【", "】" }, |
|       // { "〘", "〙" }, |
|       // { "〚", "〛" }, |
|       { "\xEF\xBC\x88", "\xEF\xBC\x89" }, |
|       { "\xE3\x80\x94", "\xE3\x80\x95" }, |
|       { "\xEF\xBC\xBB", "\xEF\xBC\xBD" }, |
|       { "\xEF\xBD\x9B", "\xEF\xBD\x9D" }, |
|       { "\xE3\x80\x88", "\xE3\x80\x89" }, |
|       { "\xE3\x80\x8A", "\xE3\x80\x8B" }, |
|       { "\xE3\x80\x8C", "\xE3\x80\x8D" }, |
|       { "\xE3\x80\x8E", "\xE3\x80\x8F" }, |
|       { "\xE3\x80\x90", "\xE3\x80\x91" }, |
|       { "\xe3\x80\x98", "\xe3\x80\x99" }, |
|       { "\xe3\x80\x9a", "\xe3\x80\x9b" }, |
|       { NULL, NULL },  // sentinel |
|     }; |
| |
|     // Walk the table until the all-NULL sentinel entry. |
|     for (const BracketPair *pair = kBracketPairs; |
|          pair->open != NULL || pair->close != NULL; |
|          ++pair) { |
|       string half_open, full_open; |
|       string half_close, full_close; |
|       Util::FullWidthToHalfWidth(pair->open, &half_open); |
|       Util::HalfWidthToFullWidth(pair->open, &full_open); |
|       Util::FullWidthToHalfWidth(pair->close, &half_close); |
|       Util::HalfWidthToFullWidth(pair->close, &full_close); |
| |
|       // Full-width entries are stored first and half-width entries second, |
|       // so a half-width entry wins if both forms are the same string. |
|       open_bracket_[full_open] = full_close; |
|       open_bracket_[half_open] = half_close; |
|       close_bracket_[full_close] = full_open; |
|       close_bracket_[half_close] = half_open; |
|     } |
|   } |
|   ~BracketHandler() {} |
| |
|   // Returns true and stores the matching close bracket in close_bracket |
|   // when key is a registered open bracket; otherwise returns false and |
|   // leaves close_bracket untouched. |
|   bool IsOpenBracket(const string &key, string *close_bracket) const { |
|     const map<string, string>::const_iterator found = |
|         open_bracket_.find(key); |
|     if (found != open_bracket_.end()) { |
|       *close_bracket = found->second; |
|       return true; |
|     } |
|     return false; |
|   } |
| |
|   // Returns true and stores the matching open bracket in open_bracket |
|   // when key is a registered close bracket; otherwise returns false and |
|   // leaves open_bracket untouched. |
|   bool IsCloseBracket(const string &key, string *open_bracket) const { |
|     const map<string, string>::const_iterator found = |
|         close_bracket_.find(key); |
|     if (found != close_bracket_.end()) { |
|       *open_bracket = found->second; |
|       return true; |
|     } |
|     return false; |
|   } |
| |
|  private: |
|   map<string, string> open_bracket_;   // open bracket -> close bracket |
|   map<string, string> close_bracket_;  // close bracket -> open bracket |
| }; |
| } // namespace |
| |
| // Looks key up in the BracketHandler singleton. On success close_bracket |
| // receives the matching close bracket and true is returned. |
| bool Util::IsOpenBracket(const string &key, string *close_bracket) { |
|   const BracketHandler *handler = Singleton<BracketHandler>::get(); |
|   return handler->IsOpenBracket(key, close_bracket); |
| } |
| |
| // Looks key up in the BracketHandler singleton. On success open_bracket |
| // receives the matching open bracket and true is returned. |
| bool Util::IsCloseBracket(const string &key, string *open_bracket) { |
|   const BracketHandler *handler = Singleton<BracketHandler>::get(); |
|   return handler->IsCloseBracket(key, open_bracket); |
| } |
| |
| // Returns true when every character of input is one of the full-width |
| // symbols listed below. An empty input yields true. |
| bool Util::IsFullWidthSymbolInHalfWidthKatakana(const string &input) { |
|   for (ConstChar32Iterator iter(input); !iter.Done(); iter.Next()) { |
|     switch (iter.Get()) { |
|       case 0x3002:  // FULLSTOP "。" |
|       case 0x300C:  // LEFT CORNER BRACKET "「" |
|       case 0x300D:  // RIGHT CORNER BRACKET "」" |
|       case 0x3001:  // COMMA "、" |
|       case 0x30FB:  // MIDDLE DOT "・" |
|       case 0x30FC:  // SOUND_MARK "ー" |
|       case 0x3099:  // VOICE SOUND MARK "゙" |
|       case 0x309A:  // SEMI VOICE SOUND MARK "゚" |
|         continue;  // Accepted; move on to the next character. |
|       default: |
|         return false; |
|     } |
|   } |
|   return true; |
| } |
| |
| // Returns true when every character of input is one of the half-width |
| // katakana symbols listed below. An empty input yields true. |
| bool Util::IsHalfWidthKatakanaSymbol(const string &input) { |
|   for (ConstChar32Iterator iter(input); !iter.Done(); iter.Next()) { |
|     switch (iter.Get()) { |
|       case 0xFF61:  // FULLSTOP "。" |
|       case 0xFF62:  // LEFT CORNER BRACKET "「" |
|       case 0xFF63:  // RIGHT CORNER BRACKET "」" |
|       case 0xFF64:  // COMMA "、" |
|       case 0xFF65:  // MIDDLE DOT "・" |
|       case 0xFF70:  // SOUND_MARK "ー" |
|       case 0xFF9E:  // VOICE SOUND MARK "゙" |
|       case 0xFF9F:  // SEMI VOICE SOUND MARK "゚" |
|         continue;  // Accepted; move on to the next character. |
|       default: |
|         return false; |
|     } |
|   } |
|   return true; |
| } |
| |
| // Returns true as soon as input contains any of the full-width or |
| // half-width kana symbols listed below; false if none is found. |
| bool Util::IsKanaSymbolContained(const string &input) { |
|   for (ConstChar32Iterator iter(input); !iter.Done(); iter.Next()) { |
|     switch (iter.Get()) { |
|       // Full-width forms. |
|       case 0x3002:  // FULLSTOP "。" |
|       case 0x300C:  // LEFT CORNER BRACKET "「" |
|       case 0x300D:  // RIGHT CORNER BRACKET "」" |
|       case 0x3001:  // COMMA "、" |
|       case 0x30FB:  // MIDDLE DOT "・" |
|       case 0x30FC:  // SOUND_MARK "ー" |
|       case 0x3099:  // VOICE SOUND MARK "゙" |
|       case 0x309A:  // SEMI VOICE SOUND MARK "゚" |
|       // Half-width forms. |
|       case 0xFF61:  // FULLSTOP "。" |
|       case 0xFF62:  // LEFT CORNER BRACKET "「" |
|       case 0xFF63:  // RIGHT CORNER BRACKET "」" |
|       case 0xFF64:  // COMMA "、" |
|       case 0xFF65:  // MIDDLE DOT "・" |
|       case 0xFF70:  // SOUND_MARK "ー" |
|       case 0xFF9E:  // VOICE SOUND MARK "゙" |
|       case 0xFF9F:  // SEMI VOICE SOUND MARK "゚" |
|         return true; |
|       default: |
|         break;  // Not a kana symbol; keep scanning. |
|     } |
|   } |
|   return false; |
| } |
| |
| // Returns true when value consists only of ASCII letters and the symbols |
| // " ", "!", "'" and "-". An empty value yields true. |
| bool Util::IsEnglishTransliteration(const string &value) { |
|   for (size_t i = 0; i < value.size(); ++i) { |
|     const char c = value[i]; |
|     // " ", "!", "'", "-" |
|     const bool is_allowed_symbol = |
|         (c == 0x20 || c == 0x21 || c == 0x27 || c == 0x2D); |
|     const bool is_upper = (c >= 0x41 && c <= 0x5A);  // A..Z |
|     const bool is_lower = (c >= 0x61 && c <= 0x7A);  // a..z |
|     if (!is_allowed_symbol && !is_upper && !is_lower) { |
|       return false; |
|     } |
|   } |
|   return true; |
| } |
| |
| // URL |
| // Replaces the content of output with input where ASCII letters and digits |
| // are copied as they are and every other byte becomes "%XX" (upper-case |
| // hexadecimal). |
| void Util::EncodeURI(const string &input, string *output) { |
|   const char kDigits[] = "0123456789ABCDEF"; |
|   const char *cursor = input.data(); |
|   const char *const limit = input.data() + input.size(); |
|   output->clear(); |
|   for (; cursor < limit; ++cursor) { |
|     const bool is_alnum = |
|         isascii(*cursor) && (isdigit(*cursor) || isalpha(*cursor)); |
|     if (is_alnum) { |
|       output->push_back(*cursor); |
|       continue; |
|     } |
|     output->push_back('%'); |
|     output->push_back(kDigits[(*cursor >> 4) & 0x0f]); |
|     output->push_back(kDigits[*cursor & 0x0f]); |
|   } |
| } |
| |
| // Replaces the content of output with the decoded form of src: "%XY" |
| // becomes the byte with hexadecimal value XY, "+" becomes a space and any |
| // other byte is copied unchanged. A "%" with fewer than two bytes after it |
| // is copied as it is. |
| // NOTE(review): X and Y are not checked to be hexadecimal digits, so a |
| // malformed escape still produces a byte; confirm callers only pass |
| // well-formed input. |
| void Util::DecodeURI(const string &src, string *output) { |
|   output->clear(); |
|   const char *p = src.data(); |
|   const char *end = src.data() + src.size(); |
|   while (p < end) { |
|     if (*p == '%' && p + 2 < end) { |
|       // toupper() and isalpha() are only defined for values representable |
|       // as unsigned char (or EOF), so bytes >= 0x80 must not be passed as |
|       // a possibly negative plain char. |
|       const unsigned char raw_h = static_cast<unsigned char>(p[1]); |
|       const unsigned char raw_l = static_cast<unsigned char>(p[2]); |
|       const char h = static_cast<char>(toupper(raw_h)); |
|       const char l = static_cast<char>(toupper(raw_l)); |
|       const int vh = isalpha(static_cast<unsigned char>(h)) |
|           ? (10 + (h - 'A')) : (h - '0'); |
|       const int vl = isalpha(static_cast<unsigned char>(l)) |
|           ? (10 + (l - 'A')) : (l - '0'); |
|       *output += static_cast<char>((vh << 4) + vl); |
|       p += 3; |
|     } else if (*p == '+') { |
|       *output += ' '; |
|       p++; |
|     } else { |
|       *output += *p++; |
|     } |
|   } |
| } |
| |
| // Appends every entry of params to base as "<first>=<encoded second>", with |
| // "&" between consecutive entries. The second element is encoded with |
| // EncodeURI(); the first element is appended verbatim. Does nothing when |
| // params is empty or base is NULL. |
| void Util::AppendCGIParams(const vector<pair<string, string> > &params, |
|                            string *base) { |
|   if (params.empty() || base == NULL) { |
|     return; |
|   } |
| |
|   string encoded; |
|   for (vector<pair<string, string> >::const_iterator it = params.begin(); |
|        it != params.end(); |
|        ++it) { |
|     // Append "<first>=<encoded second>&" |
|     base->append(it->first); |
|     base->append("="); |
|     EncodeURI(it->second, &encoded); |
|     base->append(encoded); |
|     base->append("&"); |
|   } |
| |
|   // Delete the last "&". |
|   if (!base->empty()) { |
|     base->erase(base->size() - 1); |
|   } |
| } |
| |
| // Replaces the content of output with input where every byte is written as |
| // "\xHH" (upper-case hexadecimal). |
| void Util::Escape(const string &input, string *output) { |
|   output->clear(); |
|   for (string::const_iterator it = input.begin(); it != input.end(); ++it) { |
|     EscapeInternal(*it, "\\x", output); |
|   } |
| } |
| |
| // Replaces the content of output with input where every byte, including |
| // ASCII letters and digits, is written as "%HH" (upper-case hexadecimal). |
| void Util::EscapeUrl(const string &input, string *output) { |
|   output->clear(); |
|   for (string::const_iterator it = input.begin(); it != input.end(); ++it) { |
|     EscapeInternal(*it, "%", output); |
|   } |
| } |
| |
| // Value-returning overload of EscapeUrl(input, output). |
| string Util::EscapeUrl(const string &input) { |
|   string result; |
|   EscapeUrl(input, &result); |
|   return result; |
| } |
| |
| // Writes plain to escaped with the five HTML-significant characters |
| // replaced by character references: & < > " and '. |
| void Util::EscapeHtml(const string &plain, string *escaped) { |
|   string tmp1, tmp2, tmp3, tmp4; |
|   // "&" must be handled first; otherwise the ampersands introduced by the |
|   // later replacements would be escaped a second time. |
|   StringReplace(plain, "&", "&amp;", true, &tmp1); |
|   StringReplace(tmp1, "<", "&lt;", true, &tmp2); |
|   StringReplace(tmp2, ">", "&gt;", true, &tmp3); |
|   StringReplace(tmp3, "\"", "&quot;", true, &tmp4); |
|   StringReplace(tmp4, "'", "&#39;", true, escaped); |
| } |
| |
| // Inverse of EscapeHtml(): writes escaped to plain with the character |
| // references for < > " ' and & replaced by the characters themselves. |
| void Util::UnescapeHtml(const string &escaped, string *plain) { |
|   string tmp1, tmp2, tmp3, tmp4; |
|   StringReplace(escaped, "&lt;", "<", true, &tmp1); |
|   StringReplace(tmp1, "&gt;", ">", true, &tmp2); |
|   StringReplace(tmp2, "&quot;", "\"", true, &tmp3); |
|   StringReplace(tmp3, "&#39;", "'", true, &tmp4); |
|   // "&amp;" is decoded last so that an escaped ampersand followed by "lt;" |
|   // (the escaped form of the literal text "&lt;") is not decoded twice. |
|   StringReplace(tmp4, "&amp;", "&", true, plain); |
| } |
| |
| // Writes plain to escaped with every "<" replaced by its character |
| // reference. |
| void Util::EscapeCss(const string &plain, string *escaped) { |
|   // ">" and "&" are not escaped because they are used for operands of |
|   // CSS. |
|   StringReplace(plain, "<", "&lt;", true, escaped); |
| } |
| |
| // True when w lies in the closed interval [a, b]. |
| #define INRANGE(w, a, b) ((w) >= (a) && (w) <= (b)) |
| |
| // script type |
| // Classifies a single code point. The groups are tested in the order |
| // NUMBER, ALPHABET, KANJI, HIRAGANA, KATAKANA, EMOJI; a code point that |
| // matches none of them is UNKNOWN_SCRIPT. |
| // TODO(yukawa, team): Make a mechanism to keep this classifier up-to-date |
| // based on the original data from Unicode.org. |
| Util::ScriptType Util::GetScriptType(char32 w) { |
|   if (INRANGE(w, 0x0030, 0x0039) ||  // ascii number |
|       INRANGE(w, 0xFF10, 0xFF19)) {  // full width number |
|     return NUMBER; |
|   } |
| |
|   if (INRANGE(w, 0x0041, 0x005A) ||  // ascii upper |
|       INRANGE(w, 0x0061, 0x007A) ||  // ascii lower |
|       INRANGE(w, 0xFF21, 0xFF3A) ||  // fullwidth ascii upper |
|       INRANGE(w, 0xFF41, 0xFF5A)) {  // fullwidth ascii lower |
|     return ALPHABET; |
|   } |
| |
|   // As of Unicode 6.0.2, each block has the following characters assigned. |
|   // [U+3400, U+4DB5]: CJK Unified Ideographs Extension A |
|   // [U+4E00, U+9FCB]: CJK Unified Ideographs |
|   // [U+F900, U+FAD9]: CJK Compatibility Ideographs |
|   // [U+20000, U+2A6D6]: CJK Unified Ideographs Extension B |
|   // [U+2A700, U+2B734]: CJK Unified Ideographs Extension C |
|   // [U+2B740, U+2B81D]: CJK Unified Ideographs Extension D |
|   // [U+2F800, U+2FA1D]: CJK Compatibility Ideographs |
|   if (w == 0x3005 ||  // IDEOGRAPHIC ITERATION MARK "々" |
|       INRANGE(w, 0x3400, 0x4DBF) ||  // CJK Unified Ideographs Extension A |
|       INRANGE(w, 0x4E00, 0x9FFF) ||  // CJK Unified Ideographs |
|       INRANGE(w, 0xF900, 0xFAFF) ||  // CJK Compatibility Ideographs |
|       INRANGE(w, 0x20000, 0x2A6DF) ||  // CJK Unified Ideographs Extension B |
|       INRANGE(w, 0x2A700, 0x2B73F) ||  // CJK Unified Ideographs Extension C |
|       INRANGE(w, 0x2B740, 0x2B81F) ||  // CJK Unified Ideographs Extension D |
|       INRANGE(w, 0x2F800, 0x2FA1F)) {  // CJK Compatibility Ideographs |
|     return KANJI; |
|   } |
| |
|   if (INRANGE(w, 0x3041, 0x309F) ||  // hiragana |
|       w == 0x1B001) {  // HIRAGANA LETTER ARCHAIC YE |
|     return HIRAGANA; |
|   } |
| |
|   if (INRANGE(w, 0x30A1, 0x30FF) ||  // full width katakana |
|       INRANGE(w, 0x31F0, 0x31FF) ||  // Katakana Phonetic Extensions for Ainu |
|       INRANGE(w, 0xFF65, 0xFF9F) ||  // half width katakana |
|       w == 0x1B000) {  // KATAKANA LETTER ARCHAIC E |
|     return KATAKANA; |
|   } |
| |
|   if (INRANGE(w, 0x02300, 0x023F3) ||  // Miscellaneous Technical |
|       INRANGE(w, 0x02700, 0x027BF) ||  // Dingbats |
|       INRANGE(w, 0x1F000, 0x1F02F) ||  // Mahjong tiles |
|       INRANGE(w, 0x1F030, 0x1F09F) ||  // Domino tiles |
|       INRANGE(w, 0x1F0A0, 0x1F0FF) ||  // Playing cards |
|       INRANGE(w, 0x1F100, 0x1F2FF) ||  // Enclosed Alphanumeric Supplement |
|       INRANGE(w, 0x1F200, 0x1F2FF) ||  // Enclosed Ideographic Supplement |
|       INRANGE(w, 0x1F300, 0x1F5FF) ||  // Miscellaneous Symbols And Pictographs |
|       INRANGE(w, 0x1F600, 0x1F64F) ||  // Emoticons |
|       INRANGE(w, 0x1F680, 0x1F6FF) ||  // Transport And Map Symbols |
|       INRANGE(w, 0x1F700, 0x1F77F) ||  // Alchemical Symbols |
|       w == 0x26CE) {  // Ophiuchus |
|     return EMOJI; |
|   } |
| |
|   return UNKNOWN_SCRIPT; |
| } |
| |
| // Classifies a single code point as HALF_WIDTH or FULL_WIDTH following |
| // 'Unicode Standard Annex #11: EAST ASIAN WIDTH' |
| // http://www.unicode.org/reports/tr11/ |
| // Everything that is not explicitly listed below is FULL_WIDTH. |
| Util::FormType Util::GetFormType(char32 w) { |
|   // Characters marked as 'Na' in |
|   // http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt |
|   const bool is_narrow_range = |
|       INRANGE(w, 0x0020, 0x007F) ||  // ascii |
|       INRANGE(w, 0x27E6, 0x27ED) ||  // narrow mathematical symbols |
|       INRANGE(w, 0x2985, 0x2986);    // narrow white parentheses |
|   if (is_narrow_range) { |
|     return HALF_WIDTH; |
|   } |
| |
|   // Other characters marked as 'Na' in |
|   // http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt |
|   // All of them lie inside [U+00A2, U+00AF]. |
|   switch (w) { |
|     case 0x00A2:  // CENT SIGN |
|     case 0x00A3:  // POUND SIGN |
|     case 0x00A5:  // YEN SIGN |
|     case 0x00A6:  // BROKEN BAR |
|     case 0x00AC:  // NOT SIGN |
|     case 0x00AF:  // MACRON |
|       return HALF_WIDTH; |
|     default: |
|       break; |
|   } |
| |
|   // Characters marked as 'H' in |
|   // http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt |
|   const bool is_halfwidth_form = |
|       w == 0x20A9 ||                 // WON SIGN |
|       INRANGE(w, 0xFF61, 0xFF9F) ||  // half-width katakana |
|       INRANGE(w, 0xFFA0, 0xFFBE) ||  // half-width hangul |
|       INRANGE(w, 0xFFC2, 0xFFCF) ||  // half-width hangul |
|       INRANGE(w, 0xFFD2, 0xFFD7) ||  // half-width hangul |
|       INRANGE(w, 0xFFDA, 0xFFDC) ||  // half-width hangul |
|       INRANGE(w, 0xFFE8, 0xFFEE);    // half-width symbols |
|   if (is_halfwidth_form) { |
|     return HALF_WIDTH; |
|   } |
| |
|   return FULL_WIDTH; |
| } |
| |
| #undef INRANGE |
| |
| // Returns the script type of the first character in [begin, end). The |
| // character is decoded with UTF8ToUCS4(), which also receives mblen. |
| Util::ScriptType Util::GetScriptType(const char *begin, |
|                                      const char *end, size_t *mblen) { |
|   return GetScriptType(UTF8ToUCS4(begin, end, mblen)); |
| } |
| |
| namespace { |
| |
| // Returns the script type shared by all characters of str, or |
| // UNKNOWN_SCRIPT when the characters disagree or str yields no type at all. |
| // With ignore_symbols set, UNKNOWN_SCRIPT characters are skipped as long as |
| // the type seen so far is not itself UNKNOWN_SCRIPT. |
| Util::ScriptType GetScriptTypeInternal(const string &str, bool ignore_symbols) { |
|   // SCRIPT_TYPE_SIZE doubles as "no character has set the type yet". |
|   Util::ScriptType result = Util::SCRIPT_TYPE_SIZE; |
| |
|   for (ConstChar32Iterator iter(str); !iter.Done(); iter.Next()) { |
|     const char32 w = iter.Get(); |
|     const Util::ScriptType raw_type = Util::GetScriptType(w); |
| |
|     // PROLONGED SOUND MARK, MIDDLE DOT and the VOICED SOUND MARKS are |
|     // HIRAGANA as well as KATAKANA, so they inherit the type seen so far. |
|     const bool is_kana_mark = |
|         (w == 0x30FC || w == 0x30FB || (w >= 0x3099 && w <= 0x309C)); |
|     const bool inherits_previous = |
|         (result == Util::SCRIPT_TYPE_SIZE || |
|          result == Util::HIRAGANA || result == Util::KATAKANA); |
|     const Util::ScriptType type = |
|         (is_kana_mark && inherits_previous) ? result : raw_type; |
| |
|     // Ignore symbols |
|     // Regard UNKNOWN_SCRIPT as symbols here |
|     const bool is_skippable_symbol = |
|         (result != Util::UNKNOWN_SCRIPT && type == Util::UNKNOWN_SCRIPT); |
|     if (ignore_symbols && is_skippable_symbol) { |
|       continue; |
|     } |
| |
|     // Periods are NUMBER as well, if it is not the first character. |
|     // 0xFF0E == '.', 0x002E == '.' in UCS4 encoding. |
|     const bool is_period = (w == 0xFF0E || w == 0x002E); |
|     if (result == Util::NUMBER && is_period) { |
|       continue; |
|     } |
| |
|     // Note: GetScriptType doesn't return SCRIPT_TYPE_SIZE, thus if result |
|     // is not SCRIPT_TYPE_SIZE, it is not the first character. |
|     const bool has_previous_type = (result != Util::SCRIPT_TYPE_SIZE); |
|     if (has_previous_type && type != result) { |
|       return Util::UNKNOWN_SCRIPT; |
|     } |
|     result = type; |
|   } |
| |
|   // Still unset: everything is "ー" (or the string was empty). |
|   return (result == Util::SCRIPT_TYPE_SIZE) ? Util::UNKNOWN_SCRIPT : result; |
| } |
| |
| } // namespace |
| |
| // Returns the script type shared by all characters of str; symbols are |
| // taken into account (GetScriptTypeInternal with ignore_symbols = false). |
| Util::ScriptType Util::GetScriptType(const string &str) { |
|   const bool kIgnoreSymbols = false; |
|   return GetScriptTypeInternal(str, kIgnoreSymbols); |
| } |
| |
| // Returns the script type of the first character of str. The byte length |
| // reported for that character is discarded. |
| Util::ScriptType Util::GetFirstScriptType(const string &str) { |
|   const char *const begin = str.c_str(); |
|   size_t unused_mblen = 0; |
|   return GetScriptType(begin, begin + str.size(), &unused_mblen); |
| } |
| |
| // Returns the script type shared by the characters of str while skipping |
| // symbols (GetScriptTypeInternal with ignore_symbols = true). |
| Util::ScriptType Util::GetScriptTypeWithoutSymbols(const string &str) { |
|   const bool kIgnoreSymbols = true; |
|   return GetScriptTypeInternal(str, kIgnoreSymbols); |
| } |
| |
| // Returns true if every character of str has script type "type". An empty |
| // str yields true. |
| bool Util::IsScriptType(StringPiece str, Util::ScriptType type) { |
|   for (ConstChar32Iterator iter(str); !iter.Done(); iter.Next()) { |
|     const char32 w = iter.Get(); |
|     if (GetScriptType(w) == type) { |
|       continue; |
|     } |
|     // Exception: 30FC (PROLONGEDSOUND MARK is categorized as HIRAGANA as well) |
|     if (w == 0x30FC && type == HIRAGANA) { |
|       continue; |
|     } |
|     return false; |
|   } |
|   return true; |
| } |
| |
| // Returns true if at least one character of str has script type "type". |
| bool Util::ContainsScriptType(StringPiece str, ScriptType type) { |
|   for (ConstChar32Iterator iter(str); !iter.Done(); iter.Next()) { |
|     const ScriptType current = GetScriptType(iter.Get()); |
|     if (current == type) { |
|       return true; |
|     } |
|   } |
|   return false; |
| } |
| |
| // Returns the form type shared by all characters of str, or UNKNOWN_FORM |
| // when the characters disagree. An empty str yields FORM_TYPE_SIZE. |
| Util::FormType Util::GetFormType(const string &str) { |
|   // TODO(hidehiko): get rid of using FORM_TYPE_SIZE. |
|   // FORM_TYPE_SIZE doubles as "no character has been seen yet". |
|   FormType result = FORM_TYPE_SIZE; |
| |
|   for (ConstChar32Iterator iter(str); !iter.Done(); iter.Next()) { |
|     const FormType type = GetFormType(iter.Get()); |
|     const bool has_previous = (result != FORM_TYPE_SIZE); |
|     const bool differs_from_previous = (has_previous && type != result); |
|     if (type == UNKNOWN_FORM || differs_from_previous) { |
|       return UNKNOWN_FORM; |
|     } |
|     result = type; |
|   } |
| |
|   return result; |
| } |
| |
| // Util::CharacterSet Util::GetCharacterSet(char32 ucs4); |
| #include "base/character_set.h" |
| |
| // Returns the largest CharacterSet value found among the characters of |
| // str, starting from ASCII. An empty str yields ASCII. |
| Util::CharacterSet Util::GetCharacterSet(const string &str) { |
|   CharacterSet largest = ASCII; |
|   for (ConstChar32Iterator iter(str); !iter.Done(); iter.Next()) { |
|     const CharacterSet current = GetCharacterSet(iter.Get()); |
|     if (largest < current) { |
|       largest = current; |
|     } |
|   } |
|   return largest; |
| } |
| |
| } // namespace mozc |