// src/base/number_util.cc - mozc - Git at Google

 // Copyright 2010-2015, Google Inc.
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
 //
 //     * Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
 //     * Redistributions in binary form must reproduce the above
 // copyright notice, this list of conditions and the following disclaimer
 // in the documentation and/or other materials provided with the
 // distribution.
 //     * Neither the name of Google Inc. nor the names of its
 // contributors may be used to endorse or promote products derived from
 // this software without specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #include "base/number_util.h"

 #include <algorithm>
 #include <cctype>
 #include <cerrno>
 #include <cstdio>
 #include <cstring>
 #include <limits>
 #include <sstream>
 #include <string>
 #include <vector>

 #include "base/logging.h"
 #include "base/text_converter.h"
 #include "base/util.h"

 namespace mozc {

 namespace {

 // Tables of single-digit number characters. Every entry is a UTF-8 encoded
 // string and the array index is the digit value (0-9). The glyphs encoded by
 // the escape sequences are listed in the comment at the top of each table.

 // Table of number character of Kansuji (Kanji numerals). The ten digits are
 // followed by a trailing nullptr entry.
 const char *const kNumKanjiDigits[] = {
   // "〇", "一", "二", "三", "四", "五", "六", "七", "八", "九", nullptr
   "\xe3\x80\x87", "\xe4\xb8\x80", "\xe4\xba\x8c", "\xe4\xb8\x89",
   "\xe5\x9b\x9b", "\xe4\xba\x94", "\xe5\x85\xad", "\xe4\xb8\x83",
   "\xe5\x85\xab", "\xe4\xb9\x9d", nullptr
 };
 // Old-style Kanji digits. Index 0 is nullptr (there is no entry for zero) and
 // the table has exactly ten entries, i.e. no trailing nullptr.
 const char *const kNumKanjiOldDigits[] = {
   // nullptr, "壱", "弐", "参", "四", "五", "六", "七", "八", "九"
   nullptr, "\xe5\xa3\xb1", "\xe5\xbc\x90", "\xe5\x8f\x82", "\xe5\x9b\x9b",
   "\xe4\xba\x94", "\xe5\x85\xad", "\xe4\xb8\x83", "\xe5\x85\xab",
   "\xe4\xb9\x9d"
 };
 // Full-width Arabic digits, followed by a trailing nullptr entry.
 const char *const kNumFullWidthDigits[] = {
   // "０", "１", "２", "３", "４", "５", "６", "７", "８", "９", nullptr
   "\xef\xbc\x90", "\xef\xbc\x91", "\xef\xbc\x92", "\xef\xbc\x93",
   "\xef\xbc\x94", "\xef\xbc\x95", "\xef\xbc\x96", "\xef\xbc\x97",
   "\xef\xbc\x98", "\xef\xbc\x99", nullptr
 };
 // Half-width (ASCII) Arabic digits, followed by a trailing nullptr entry.
 const char *const kNumHalfWidthDigits[] = {
   "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", nullptr
 };

 // Table of Kanji number ranks
 //
 // The "Ranks" tables hold the unit appended after a digit inside a four-digit
 // group: index 1 is the ones place (empty string), 2 is tens, 3 is hundreds
 // and 4 is thousands. Index 0 is a nullptr placeholder.
 // The "BiggerRanks" tables hold the unit appended after a whole four-digit
 // group: index 0 is the lowest group (empty string) and each following index
 // is the next higher group of four digits.
 const char *const kNumKanjiRanks[] = {
   // nullptr, "", "十", "百", "千"
   nullptr, "", "\xe5\x8d\x81", "\xe7\x99\xbe", "\xe5\x8d\x83"
 };
 const char *const kNumKanjiBiggerRanks[] = {
   // "", "万", "億", "兆", "京"
   "", "\xe4\xb8\x87", "\xe5\x84\x84", "\xe5\x85\x86", "\xe4\xba\xac"
 };
 // Old-style counterparts of the two tables above, with the same indexing.
 const char *const kNumKanjiOldRanks[] = {
   // nullptr, "", "拾", "百", "阡"
   nullptr, "", "\xe6\x8b\xbe", "\xe7\x99\xbe", "\xe9\x98\xa1"
 };
 const char *const kNumKanjiBiggerOldRanks[] = {
   // "", "萬", "億", "兆", "京"
   "", "\xe8\x90\xac", "\xe5\x84\x84", "\xe5\x85\x86", "\xe4\xba\xac"
 };

 // Roman numeral characters for 1-12, indexed by the number itself. Index 0 is
 // nullptr because there is no character for zero; the table also ends with a
 // nullptr entry.
 const char *const kRomanNumbersCapital[] = {
   // nullptr, "Ⅰ", "Ⅱ", "Ⅲ", "Ⅳ", "Ⅴ", "Ⅵ", "Ⅶ", "Ⅷ", "Ⅸ",
   // "Ⅹ", "Ⅺ", "Ⅻ", nullptr
   nullptr, "\xe2\x85\xa0", "\xe2\x85\xa1", "\xe2\x85\xa2", "\xe2\x85\xa3",
   "\xe2\x85\xa4", "\xe2\x85\xa5", "\xe2\x85\xa6", "\xe2\x85\xa7",
   "\xe2\x85\xa8", "\xe2\x85\xa9", "\xe2\x85\xaa", "\xe2\x85\xab", nullptr
 };

 // Small-letter variant of kRomanNumbersCapital with the same layout.
 const char *const kRomanNumbersSmall[] = {
   // nullptr, "ⅰ", "ⅱ", "ⅲ", "ⅳ", "ⅴ", "ⅵ", "ⅶ", "ⅷ", "ⅸ",
   // "ⅹ", "ⅺ", "ⅻ", nullptr
   nullptr, "\xe2\x85\xb0", "\xe2\x85\xb1", "\xe2\x85\xb2", "\xe2\x85\xb3",
   "\xe2\x85\xb4", "\xe2\x85\xb5", "\xe2\x85\xb6", "\xe2\x85\xb7",
   "\xe2\x85\xb8", "\xe2\x85\xb9", "\xe2\x85\xba", "\xe2\x85\xbb", nullptr
 };

 // Circled number characters for 1-50, indexed by the number itself. Index 0
 // is nullptr (no entry for zero) and the table ends with a nullptr entry.
 const char *const kCircledNumbers[] = {
   nullptr,
   // "①", "②", "③", "④", "⑤", "⑥", "⑦", "⑧", "⑨", "⑩"
   "\xe2\x91\xa0", "\xe2\x91\xa1", "\xe2\x91\xa2", "\xe2\x91\xa3",
   "\xe2\x91\xa4", "\xe2\x91\xa5", "\xe2\x91\xa6", "\xe2\x91\xa7",
   "\xe2\x91\xa8", "\xe2\x91\xa9",
   // "⑪", "⑫", "⑬", "⑭", "⑮", "⑯", "⑰", "⑱", "⑲", "⑳"
   "\xe2\x91\xaa", "\xe2\x91\xab", "\xe2\x91\xac", "\xe2\x91\xad",
   "\xe2\x91\xae", "\xe2\x91\xaf", "\xe2\x91\xb0", "\xe2\x91\xb1",
   "\xe2\x91\xb2", "\xe2\x91\xb3",
   // Circled 21-35
   "\xE3\x89\x91", "\xE3\x89\x92", "\xE3\x89\x93", "\xE3\x89\x94",
   "\xE3\x89\x95", "\xE3\x89\x96", "\xE3\x89\x97", "\xE3\x89\x98",
   "\xE3\x89\x99", "\xE3\x89\x9A", "\xE3\x89\x9B", "\xE3\x89\x9C",
   "\xE3\x89\x9D", "\xE3\x89\x9E", "\xE3\x89\x9F",
   // Circled 36-50
   "\xE3\x8A\xB1", "\xE3\x8A\xB2", "\xE3\x8A\xB3", "\xE3\x8A\xB4",
   "\xE3\x8A\xB5", "\xE3\x8A\xB6", "\xE3\x8A\xB7", "\xE3\x8A\xB8",
   "\xE3\x8A\xB9", "\xE3\x8A\xBA", "\xE3\x8A\xBB", "\xE3\x8A\xBC",
   "\xE3\x8A\xBD", "\xE3\x8A\xBE", "\xE3\x8A\xBF",
   nullptr
 };

 // Structure to store character set variations.
 // Each instance describes one way of rendering a number: which character
 // table to use and which metadata to attach to the produced NumberString.
 struct NumberStringVariation {
   // Table of UTF-8 strings indexed by a number (a digit or a small integer).
   const char *const *const digits;
   // Size recorded for |digits|; used as the exclusive upper bound when the
   // table is indexed by an arbitrary number (see ArabicToOtherForms).
   const int numbers_size;
   // Description passed to the NumberString constructor.
   const char *description;
   // String inserted between three-digit groups, or nullptr when unused.
   const char *separator;
   // String used as the decimal point, or nullptr when unused.
   const char *point;
   // Style passed to the NumberString constructor.
   const NumberUtil::NumberString::Style style;
 };

 // Judges whether the given string is a decimal number (including integers).
 // Every character must be an ASCII digit or '.', and at most one '.' is
 // allowed. The point may appear anywhere, so strings like "123456." and
 // ".5" are accepted. Note that an empty string (and a lone ".") also
 // satisfies these rules and yields true.
 bool IsDecimalNumber(StringPiece str) {
   bool seen_point = false;
   for (size_t i = 0; i < str.size(); ++i) {
     const char c = str[i];
     if (c == '.') {
       // A valid decimal number has at most one decimal point.
       if (seen_point) {
         return false;
       }
       seen_point = true;
       continue;
     }
     // isdigit() has undefined behavior for negative arguments other than EOF,
     // which plain |char| can produce for non-ASCII (e.g. UTF-8) bytes.
     if (!isdigit(static_cast<unsigned char>(c))) {
       return false;
     }
   }

   return true;
 }

 // ASCII digit characters used when classifying characters and when turning a
 // digit character into its numeric value (c - kAsciiZero).
 const char kAsciiZero = '0';
 const char kAsciiOne = '1';
 const char kAsciiNine = '9';

 // Formats |number| through a string stream and returns the resulting text.
 // Shared implementation behind the NumberUtil::SimpleItoa() overloads.
 template <typename T>
 string SimpleItoaImpl(T number) {
   stringstream formatter;
   formatter << number;
   return formatter.str();
 }

 }  // namespace

 // Returns the textual representation of |number| as produced by streaming it
 // into a string stream. One overload exists per supported integer type; all
 // of them delegate to SimpleItoaImpl().
 string NumberUtil::SimpleItoa(int32 number) {
   return SimpleItoaImpl(number);
 }

 // Overload for unsigned 32-bit integers.
 string NumberUtil::SimpleItoa(uint32 number) {
   return SimpleItoaImpl(number);
 }

 // Overload for signed 64-bit integers.
 string NumberUtil::SimpleItoa(int64 number) {
   return SimpleItoaImpl(number);
 }

 // Overload for unsigned 64-bit integers.
 string NumberUtil::SimpleItoa(uint64 number) {
   return SimpleItoaImpl(number);
 }

 // Parses |str| as an int by writing it into a string stream and extracting
 // an int back out of it. The result starts out as 0, so that is what comes
 // back when the extraction does not store anything.
 int NumberUtil::SimpleAtoi(StringPiece str) {
   int parsed = 0;
   stringstream converter;
   converter << str;
   converter >> parsed;
   return parsed;
 }

 namespace {

 // Returns true if |ucs4| is an Arabic decimal digit, either half-width
 // ('0'-'9') or full-width (U+FF10-U+FF19).
 // TODO(hidehiko): Refactoring with GetScriptType in Util class.
 inline bool IsArabicDecimalChar32(char32 ucs4) {
   const bool is_halfwidth_digit =
       (kAsciiZero <= ucs4 && ucs4 <= kAsciiNine);
   const bool is_fullwidth_digit = (0xFF10 <= ucs4 && ucs4 <= 0xFF19);
   return is_halfwidth_digit || is_fullwidth_digit;
 }

 }  // namespace

 // Returns true if |input_string| is non-empty and every character in it is a
 // half-width or full-width Arabic decimal digit.
 bool NumberUtil::IsArabicNumber(StringPiece input_string) {
   if (input_string.empty()) {
     return false;
   }

   ConstChar32Iterator iter(input_string);
   while (!iter.Done()) {
     // Bail out on the first non-Arabic decimal character.
     if (!IsArabicDecimalChar32(iter.Get())) {
       return false;
     }
     iter.Next();
   }

   // Reached the end without finding anything but digits.
   return true;
 }

 // Returns true if |str| is non-empty and consists solely of ASCII decimal
 // digits. Signs, decimal points and white spaces are all rejected.
 bool NumberUtil::IsDecimalInteger(StringPiece str) {
   if (str.empty()) {
     return false;
   }
   for (size_t i = 0; i < str.size(); ++i) {
     // isdigit() has undefined behavior for negative arguments other than EOF,
     // which plain |char| can produce for non-ASCII (e.g. UTF-8) bytes.
     if (!isdigit(static_cast<unsigned char>(str[i]))) {
       return false;
     }
   }
   return true;
 }

 namespace {

 // Rendering variations produced by NumberUtil::ArabicToKanji(), in output
 // order. The first two mix Arabic digits with Kanji units; the last two use
 // Kanji digits (modern and old style respectively).
 // To know what "大字" means, please refer
 // http://ja.wikipedia.org/wiki/%E5%A4%A7%E5%AD%97_(%E6%95%B0%E5%AD%97)
 const NumberStringVariation kKanjiVariations[] = {
   // Description: "数字"
   {kNumHalfWidthDigits, 10, "\xE6\x95\xB0\xE5\xAD\x97", nullptr, nullptr,
    NumberUtil::NumberString::NUMBER_ARABIC_AND_KANJI_HALFWIDTH},
   // Description: "数字"
   {kNumFullWidthDigits, 10, "\xE6\x95\xB0\xE5\xAD\x97", nullptr, nullptr,
    NumberUtil::NumberString::NUMBER_ARABIC_AND_KANJI_FULLWIDTH},
   // Description: "漢数字"
   {kNumKanjiDigits, 10, "\xE6\xBC\xA2\xE6\x95\xB0\xE5\xAD\x97",
    nullptr, nullptr, NumberUtil::NumberString::NUMBER_KANJI},
   // Description: "大字"
   {kNumKanjiOldDigits, 10, "\xE5\xA4\xA7\xE5\xAD\x97", nullptr, nullptr,
    NumberUtil::NumberString::NUMBER_OLD_KANJI},
 };

 // Old-style Kanji for "two" followed by "ten": "弐拾"
 const char kOldTwoTen[] = "\xE5\xBC\x90\xE6\x8B\xBE";
 // Byte length of kOldTwoTen without the terminating '\0'.
 const size_t kOldTwoTenLength = arraysize(kOldTwoTen) - 1;
 // Single character that replaces kOldTwoTen in the alternative old-style
 // candidate: "廿"
 const char kOldTwenty[] = "\xE5\xBB\xBF";

 }  // namespace

 // Converts the decimal integer string |input_num| into Kanji-based
 // representations and appends them to |output| (existing elements are kept).
 //
 // Returns false, appending nothing, when |input_num| is not a non-empty
 // string of ASCII digits or when it has more digits than
 // arraysize(kNumKanjiBiggerRanks) * 4. Returns true otherwise.
 //
 // An input made only of '0' characters yields the single candidate "零".
 // Any other input yields one candidate per entry of kKanjiVariations (the
 // two Arabic-and-Kanji mixed styles are skipped when the number fits in one
 // four-digit group), plus extra NUMBER_OLD_KANJI candidates: one with every
 // "弐拾" replaced by "廿", and the single characters "拾" / "阡" when the
 // zero-padded input is exactly "0010" / "1000".
 bool NumberUtil::ArabicToKanji(StringPiece input_num,
                                vector<NumberString> *output) {
   DCHECK(output);
   // "零"
   const char *const kNumZero = "\xe9\x9b\xb6";
   // Number of decimal digits covered by one "bigger rank" unit.
   const int kDigitsInBigRank = 4;

   if (!IsDecimalInteger(input_num)) {
     return false;
   }

   {
     // An input consisting only of '0' characters is converted to "零" and
     // nothing else. Inputs that have leading zeros followed by other digits
     // are not handled here; they fall through to the general conversion.
     StringPiece::size_type i;
     for (i = 0; i < input_num.size() && input_num[i] == kAsciiZero; ++i) {}
     if (i == input_num.size()) {
       output->push_back(
           // "大字"
           NumberString(kNumZero, "\xE5\xA4\xA7\xE5\xAD\x97",
                        NumberString::NUMBER_OLD_KANJI));
       return true;
     }
   }

   // If given number needs higher ranks than our expectations,
   // we don't convert it.
   if (arraysize(kNumKanjiBiggerRanks) * kDigitsInBigRank < input_num.size()) {
     return false;
   }

   // Fill '0' in the beginning of input_num to make its length
   // (N * kDigitsInBigRank).
   const int filled_zero_num = (kDigitsInBigRank -
       (input_num.size() % kDigitsInBigRank)) % kDigitsInBigRank;
   string input(filled_zero_num, kAsciiZero);
   input_num.AppendToString(&input);

   // Segment into kDigitsInBigRank-digits pieces
   // The pieces are stored from the least significant group to the most
   // significant one, so ranked_numbers[k] pairs with bigger_ranks[k].
   vector<string> ranked_numbers;
   for (int i = static_cast<int>(input.size()) - kDigitsInBigRank; i >= 0;
        i -= kDigitsInBigRank) {
     ranked_numbers.push_back(input.substr(i, kDigitsInBigRank));
   }
   const size_t rank_size = ranked_numbers.size();

   for (size_t variation_index = 0;
        variation_index < arraysize(kKanjiVariations); ++variation_index) {
     const NumberStringVariation &variation = kKanjiVariations[variation_index];
     const char *const *const digits = variation.digits;
     const NumberString::Style style = variation.style;

     // With a single four-digit group no "bigger rank" unit would be emitted,
     // so the Arabic-and-Kanji mixed styles are skipped.
     if (rank_size == 1 &&
         (style == NumberString::NUMBER_ARABIC_AND_KANJI_HALFWIDTH ||
          style == NumberString::NUMBER_ARABIC_AND_KANJI_FULLWIDTH)) {
       continue;
     }

     // Pick the unit tables matching the style.
     const char *const *ranks;
     const char *const *bigger_ranks;
     if (style == NumberString::NUMBER_OLD_KANJI) {
       ranks = kNumKanjiOldRanks;
       bigger_ranks = kNumKanjiBiggerOldRanks;
     } else {
       ranks = kNumKanjiRanks;
       bigger_ranks = kNumKanjiBiggerRanks;
     }

     // TODO(peria): Bring |result| out if it improves the performance.
     string result;

     // Converts each segment, and merges them with rank Kanjis.
     // Iterates from the most significant group down to the least one.
     for (int rank = rank_size - 1; rank >= 0; --rank) {
       const string &segment = ranked_numbers[rank];
       string segment_result;
       // True while only leading zeros of this segment have been seen.
       bool leading = true;
       for (size_t i = 0; i < segment.size(); ++i) {
         if (leading && segment[i] == kAsciiZero) {
           continue;
         }

         leading = false;
         if (style == NumberString::NUMBER_ARABIC_AND_KANJI_HALFWIDTH ||
             style == NumberString::NUMBER_ARABIC_AND_KANJI_FULLWIDTH) {
           // Mixed styles keep the digits as they are (including inner zeros).
           segment_result += digits[segment[i] - kAsciiZero];
         } else {
           // Kanji styles never spell out a zero digit.
           if (segment[i] == kAsciiZero) {
             continue;
           }
           // In "大字" style, "壱" is also required on every rank.
           // In the other Kanji style a digit 1 is written only in the ones
           // place; elsewhere the unit alone is emitted.
           if (style == NumberString::NUMBER_OLD_KANJI ||
               i == kDigitsInBigRank - 1 || segment[i] != kAsciiOne) {
             segment_result += digits[segment[i] - kAsciiZero];
           }
           // Position i (0 = thousands ... 3 = ones) maps to ranks[4 - i].
           segment_result += ranks[kDigitsInBigRank - i];
         }
       }
       // A group that is entirely zero contributes nothing, not even its unit.
       if (!segment_result.empty()) {
         result += segment_result + bigger_ranks[rank];
       }
     }

     const char *description = variation.description;
     // Add simply converted numbers.
     output->push_back(NumberString(result, description, style));

     // Add specialized style numbers.
     if (style == NumberString::NUMBER_OLD_KANJI) {
       // Alternative candidate in which every "弐拾" is replaced by "廿".
       size_t index = result.find(kOldTwoTen);
       if (index != string::npos) {
         string result2(result);
         do {
           result2.replace(index, kOldTwoTenLength, kOldTwenty);
           index = result2.find(kOldTwoTen, index);
         } while (index != string::npos);
         output->push_back(NumberString(result2, description, style));
       }

       // for single kanji
       // |input| is the zero-padded string, so these match 10 and 1000 written
       // with at most four digits.
       if (input == "0010") {
         // "拾"
         output->push_back(NumberString("\xE6\x8B\xBE", description, style));
       }
       if (input == "1000") {
         // "阡"
         output->push_back(NumberString("\xE9\x98\xA1", description, style));
       }
     }
   }

   return true;
 }

 namespace {

 // Rendering variations produced by NumberUtil::ArabicToSeparatedArabic():
 // half-width and full-width digits, each with its own group separator and
 // decimal point strings.
 const NumberStringVariation kNumDigitsVariations[] = {
   // Description: "数字"
   {kNumHalfWidthDigits, 10, "\xE6\x95\xB0\xE5\xAD\x97", ",", ".",
    NumberUtil::NumberString::NUMBER_SEPARATED_ARABIC_HALFWIDTH},
   // Description: "数字", separator: "，", point: "．"
   {kNumFullWidthDigits, 10, "\xE6\x95\xB0\xE5\xAD\x97", "\xef\xbc\x8c",
    "\xEF\xBC\x8E", NumberUtil::NumberString::NUMBER_SEPARATED_ARABIC_FULLWIDTH},
 };

 }  // namespace

 // Converts the decimal number string |input_num| into digit-grouped forms
 // (e.g. "1234567.89" => "1,234,567.89") and appends one NumberString per
 // entry of kNumDigitsVariations to |output|.
 //
 // Returns false, appending nothing, when |input_num| is empty, is not a
 // decimal number (see IsDecimalNumber()), or has an integral part starting
 // with '0'. An input without an integral part such as ".5" is converted
 // without any separator.
 bool NumberUtil::ArabicToSeparatedArabic(
     StringPiece input_num, vector<NumberString> *output) {
   DCHECK(output);

   // IsDecimalNumber() accepts an empty string, so reject it explicitly.
   if (input_num.empty() || !IsDecimalNumber(input_num)) {
     return false;
   }

   // Separate a number into an integral part and a fractional part.
   StringPiece::size_type point_pos = input_num.find('.');
   if (point_pos == StringPiece::npos) {
     point_pos = input_num.size();
   }
   const StringPiece integer(input_num, 0, point_pos);
   // |fraction| has the decimal point with digits in fractional part.
   const StringPiece fraction(input_num, point_pos,
                              input_num.size() - point_pos);

   // We don't add separator to number whose integral part starts with '0'.
   // |integer| is empty for inputs like ".5", so check before indexing.
   if (!integer.empty() && integer[0] == kAsciiZero) {
     return false;
   }

   for (size_t i = 0; i < arraysize(kNumDigitsVariations); ++i) {
     const NumberStringVariation &variation = kNumDigitsVariations[i];
     const char *const *const digits = variation.digits;
     // TODO(peria): Bring |result| out if it improves the performance.
     string result;

     // integral part
     for (StringPiece::size_type j = 0; j < integer.size(); ++j) {
       // Insert a separator in front of every group of three digits counted
       // from the right, but never before the first digit.
       if (j != 0 && (integer.size() - j) % 3 == 0) {
         result.append(variation.separator);
       }
       const uint32 d = static_cast<uint32>(integer[j] - kAsciiZero);
       if (d <= 9 && digits[d]) {
         result.append(digits[d]);
       }
     }

     // fractional part
     if (!fraction.empty()) {
       DCHECK_EQ(fraction[0], '.');
       result.append(variation.point);
       // Skip the point itself; the digits are copied without separators.
       for (StringPiece::size_type j = 1; j < fraction.size(); ++j) {
         const uint32 d = static_cast<uint32>(fraction[j] - kAsciiZero);
         if (d <= 9 && digits[d]) {
           result.append(digits[d]);
         }
       }
     }

     output->push_back(
         NumberString(result, variation.description, variation.style));
   }
   return true;
 }

 namespace {

 // Rendering variations produced by NumberUtil::ArabicToWideArabic().
 // DEFAULT_STYLE is used for wide (full-width) Arabic digits because the
 // half/full width preference for ordinary numbers is learned by the
 // character form manager.
 const NumberStringVariation kSingleDigitsVariations[] = {
   // Description: "漢数字"
   {kNumKanjiDigits, 10, "\xE6\xBC\xA2\xE6\x95\xB0\xE5\xAD\x97",
    nullptr, nullptr, NumberUtil::NumberString::NUMBER_KANJI_ARABIC},
   // Description: "数字"
   {kNumFullWidthDigits, 10, "\xE6\x95\xB0\xE5\xAD\x97",
    nullptr, nullptr, NumberUtil::NumberString::DEFAULT_STYLE},
 };

 }  // namespace

 // Replaces every digit of the decimal integer string |input_num| with the
 // corresponding character of each table in kSingleDigitsVariations and
 // appends the results to |output|. Returns false when |input_num| is not a
 // decimal integer, true otherwise.
 bool NumberUtil::ArabicToWideArabic(
     StringPiece input_num, vector<NumberString> *output) {
   DCHECK(output);

   if (!IsDecimalInteger(input_num)) {
     return false;
   }

   for (size_t v = 0; v < arraysize(kSingleDigitsVariations); ++v) {
     const NumberStringVariation &variation = kSingleDigitsVariations[v];
     // TODO(peria): Bring |converted| out if it improves the performance.
     string converted;
     for (StringPiece::size_type pos = 0; pos < input_num.size(); ++pos) {
       const int digit = static_cast<int>(input_num[pos] - kAsciiZero);
       converted.append(variation.digits[digit]);
     }
     if (converted.empty()) {
       continue;
     }
     output->push_back(
         NumberString(converted, variation.description, variation.style));
   }
   return true;
 }

 namespace {

 // Special single-character forms looked up by NumberUtil::ArabicToOtherForms.
 // Each table is indexed by the number itself, and |numbers_size| is the
 // element count of the table (including its nullptr entries).
 const NumberStringVariation kSpecialNumericVariations[] = {
   {kRomanNumbersCapital, arraysize(kRomanNumbersCapital),
    // Description: "ローマ数字(大文字)" (Roman numerals, capital letters)
    "\xE3\x83\xAD\xE3\x83\xBC\xE3\x83\x9E\xE6\x95\xB0"
    "\xE5\xAD\x97(\xE5\xA4\xA7\xE6\x96\x87\xE5\xAD\x97)",
    nullptr, nullptr, NumberUtil::NumberString::NUMBER_ROMAN_CAPITAL},
   {kRomanNumbersSmall, arraysize(kRomanNumbersSmall),
    // Description: "ローマ数字(小文字)" (Roman numerals, small letters)
    "\xE3\x83\xAD\xE3\x83\xBC\xE3\x83\x9E\xE6\x95\xB0"
    "\xE5\xAD\x97(\xE5\xB0\x8F\xE6\x96\x87\xE5\xAD\x97)",
    nullptr, nullptr, NumberUtil::NumberString::NUMBER_ROMAN_SMALL},
   {kCircledNumbers, arraysize(kCircledNumbers),
    // Description: "丸数字" (circled numbers)
    "\xE4\xB8\xB8\xE6\x95\xB0\xE5\xAD\x97",
    nullptr, nullptr, NumberUtil::NumberString::NUMBER_CIRCLED},
 };

 }  // namespace

 // Appends special representations of the decimal integer |input_num| to
 // |output|: the word "Googol" for 10^100, and the Roman numeral / circled
 // number characters listed in kSpecialNumericVariations.
 // Returns true if at least one representation was appended.
 bool NumberUtil::ArabicToOtherForms(
     StringPiece input_num, vector<NumberString> *output) {
   DCHECK(output);

   if (!IsDecimalInteger(input_num)) {
     return false;
   }

   // 10^100, which is handled as a string because it does not fit in uint64.
   const char *const kNumGoogol =
       "100000000000000000000000000000000000000000000000000"
       "00000000000000000000000000000000000000000000000000";

   bool converted = false;
   if (input_num == kNumGoogol) {
     output->push_back(
         NumberString("Googol", "", NumberString::DEFAULT_STYLE));
     converted = true;
   }

   // Following conversions require uint64 number.
   uint64 n;
   if (!SafeStrToUInt64(input_num, &n)) {
     return converted;
   }

   // Special forms: look |n| up in each table, skipping tables that are too
   // short or that have no character for |n|.
   for (size_t i = 0; i < arraysize(kSpecialNumericVariations); ++i) {
     const NumberStringVariation &variation = kSpecialNumericVariations[i];
     if (!(n < variation.numbers_size)) {
       continue;
     }
     const char *const form = variation.digits[n];
     if (!form) {
       continue;
     }
     output->push_back(
         NumberString(form, variation.description, variation.style));
     converted = true;
   }

   return converted;
 }

 namespace {

 // Buffer size used when formatting a 64-bit value with snprintf().
 // Enough size to store MAX_INT64 in octal digits with prefix.
 // Must be larger than or equal to Ceil(64 / 3) + 1 ("0") + 1 ('\0') = 24
 const int kMaxInt64Size = 24;

 }  // namespace

 // Converts the decimal integer string |input_num| into hexadecimal ("0x"
 // prefix), octal ("0" prefix) and binary ("0b" prefix) notations and appends
 // them to |output|. A notation is emitted only when it differs from the
 // decimal digit itself: hexadecimal for values above 9, octal above 7 and
 // binary above 1.
 // Returns false when |input_num| is not a decimal integer, does not fit in
 // uint64, or is 0 or 1 (in which case nothing is appended).
 bool NumberUtil::ArabicToOtherRadixes(
     StringPiece input_num, vector<NumberString> *output) {
   DCHECK(output);

   if (!IsDecimalInteger(input_num)) {
     return false;
   }

   uint64 n;
   if (!SafeStrToUInt64(input_num, &n)) {
     return false;
   }

   // |uint64| is not guaranteed to be the same type as unsigned long long on
   // every platform, so convert explicitly before passing it to "%ll" formats.
   const unsigned long long printable = static_cast<unsigned long long>(n);

   // Hexadecimal
   if (n > 9) {
     char hex[kMaxInt64Size];
     snprintf(hex, sizeof(hex), "0x%llx", printable);
     // "16進数"
     output->push_back(NumberString(hex, "16\xE9\x80\xB2\xE6\x95\xB0",
                                    NumberString::NUMBER_HEX));
   }

   // Octal
   if (n > 7) {
     char oct[kMaxInt64Size];
     snprintf(oct, sizeof(oct), "0%llo", printable);
     // "8進数"
     output->push_back(NumberString(oct, "8\xE9\x80\xB2\xE6\x95\xB0",
                                    NumberString::NUMBER_OCT));
   }

   // Binary
   if (n > 1) {
     // Digits are produced from the least significant bit, so the string is
     // built backwards and reversed at the end.
     string binary;
     for (uint64 num = n; num; num >>= 1) {
       binary.push_back(kAsciiZero + static_cast<char>(num & 0x1));
     }
     // "b0" will be "0b" in head of |binary|
     binary.append("b0");
     reverse(binary.begin(), binary.end());
     // "2進数"
     output->push_back(NumberString(binary, "2\xE9\x80\xB2\xE6\x95\xB0",
                                    NumberString::NUMBER_BIN));
   }

   return (n > 1);
 }

 namespace {

 // Returns the part of |str| that remains after dropping its leading white
 // space characters (as classified by isspace()). The result is empty when
 // |str| is empty or consists only of white spaces.
 const StringPiece SkipWhiteSpace(StringPiece str) {
   StringPiece::size_type i = 0;
   // isspace() has undefined behavior for negative arguments other than EOF,
   // which plain |char| can produce for non-ASCII (e.g. UTF-8) bytes.
   while (i < str.size() && isspace(static_cast<unsigned char>(str[i]))) {
     ++i;
   }
   DCHECK(i == str.size() || !isspace(static_cast<unsigned char>(str[i])));
   return StringPiece(str, i);
 }

 // There is an informative discussion about the overflow detection in
 // "Hacker's Delight" (http://www.hackersdelight.org/basics.pdf)
 //   2-12 'Overflow Detection'

 // Stores arg1 + arg2 (with wrap-around) into |*output|.
 // Returns true when the sum fits in uint64 and false when it overflows;
 // |*output| is written in both cases.
 bool AddAndCheckOverflow(uint64 arg1, uint64 arg2, uint64 *output) {
   const uint64 headroom = kuint64max - arg1;
   *output = arg1 + arg2;
   // The addition overflows exactly when |arg2| exceeds the remaining room.
   return !(arg2 > headroom);
 }

 // Stores arg1 * arg2 (with wrap-around) into |*output|.
 // Returns true when the product fits in uint64 and false when it overflows;
 // |*output| is written in both cases.
 bool MultiplyAndCheckOverflow(uint64 arg1, uint64 arg2, uint64 *output) {
   *output = arg1 * arg2;
   // A zero factor can never overflow; it is tested first so that the
   // division below is never evaluated with a zero divisor.
   if (arg1 == 0) {
     return true;
   }
   return arg2 <= (kuint64max / arg1);
 }

 // A simple wrapper of strtoull function. |c_str| must be terminated by '\0'.
 // When OS_WIN is defined, _strtoui64 is called instead of strtoull.
 // |end_ptr| and |base| are forwarded unchanged to the underlying function.
 inline uint64 StrToUint64(const char* c_str, char** end_ptr, int base) {
 #ifdef OS_WIN
   return _strtoui64(c_str, end_ptr, base);
 #else  // OS_WIN
   return strtoull(c_str, end_ptr, base);
 #endif  // OS_WIN
 }

 // Converts a string which describes a number into an uint64 value in |base|
 // radix. Leading and trailing white spaces are allowed. The radix is taken
 // solely from |base|; a "0" or "0x" prefix is never used to select it.
 // NOTE(review): strtoull itself accepts an optional "0x" prefix when |base|
 // is 16 -- confirm whether callers rely on that.
 //
 // Returns false when the string is empty after stripping leading white
 // spaces, is 65 characters or longer after that stripping, starts with '-',
 // cannot be parsed, overflows uint64, or has trailing non-white-space
 // characters. |*value| may have been overwritten even when false is
 // returned.
 bool SafeStrToUInt64WithBase(StringPiece str, int base, uint64 *value) {
   DCHECK(value);

   // Maximum possible length of number string, including terminating '\0'. Note
   // that the maximum possible length is achieved when str="111...11" (64
   // unities) and base=2.
   const size_t kMaxPossibleLength = 65;

   // Leading white spaces are allowed.
   const StringPiece stripped_str = SkipWhiteSpace(str);
   if (stripped_str.empty() || stripped_str.size() >= kMaxPossibleLength) {
     return false;
   }
   // StrToUint64() does not check if the input is negative.  However, a leading
   // '+' is OK.
   if (stripped_str[0] == '-') {
     return false;
   }

   // Since StringPiece doesn't end with '\0', we make a c-string on stack here.
   // Only |stripped_str| is copied: its length was validated against the
   // buffer size above, whereas |str| may be arbitrarily longer because of
   // leading white spaces.
   char buf[kMaxPossibleLength];
   memcpy(buf, stripped_str.data(), stripped_str.size());
   buf[stripped_str.size()] = '\0';

   char *end_ptr = nullptr;
   errno = 0;
   *value = StrToUint64(buf, &end_ptr, base);
   if (errno != 0 || end_ptr == buf) {  // Failed to parse uint64.
     return false;
   }
   // Trailing white spaces are allowed.
   const StringPiece trailing_str(end_ptr,
                                  buf + stripped_str.size() - end_ptr);
   return SkipWhiteSpace(trailing_str).empty();
 }

 // Helper whose |value| is always false but depends on two template
 // parameters. It is used as the condition of static_assert in primary
 // templates that must not be instantiated.
 template <typename T1, typename T2>
 struct GenericFalseTypeArity2 {
   // TODO(yukawa): Use std::false_type once C++11 is enabled everywhere.
   static const bool value = false;
 };

 // Range-checked integer conversion. This primary template exists only to
 // reject unsupported type combinations: its static_assert condition is
 // always false, so instantiating it is a compile error. Supported
 // combinations are provided as explicit specializations.
 template <typename SrcType, typename DestType>
 bool SafeCast(SrcType src, DestType *dest) {
   static_assert(GenericFalseTypeArity2<SrcType, DestType>::value,
                 "Shouldn't be used with implicit type conversion.");
   return false;
 }

 // int64 -> int32. Stores |src| into |*dest| and returns true when |src| lies
 // within [kint32min, kint32max]; otherwise returns false and leaves |*dest|
 // untouched.
 template <>
 bool SafeCast(int64 src, int32 *dest) {
   const bool too_small = src < static_cast<int64>(kint32min);
   const bool too_large = static_cast<int64>(kint32max) < src;
   if (too_small || too_large) {
     return false;
   }
   *dest = static_cast<int32>(src);
   return true;
 }

 // uint64 -> int64. Stores |src| into |*dest| and returns true when |src| does
 // not exceed kint64max; otherwise returns false and leaves |*dest| untouched.
 template <>
 bool SafeCast(uint64 src, int64 *dest) {
   const uint64 upper_limit = static_cast<uint64>(kint64max);
   if (src <= upper_limit) {
     *dest = static_cast<int64>(src);
     return true;
   }
   return false;
 }

 // uint64 -> uint32. Stores |src| into |*dest| and returns true when |src|
 // does not exceed kuint32max; otherwise returns false and leaves |*dest|
 // untouched.
 template <>
 bool SafeCast(uint64 src, uint32 *dest) {
   const uint64 upper_limit = static_cast<uint64>(kuint32max);
   if (src <= upper_limit) {
     *dest = static_cast<uint32>(src);
     return true;
   }
   return false;
 }

 // Range-checked negation. This primary template exists only to reject
 // unsupported type combinations: its static_assert condition is always
 // false, so instantiating it is a compile error. The supported combination
 // is provided as an explicit specialization.
 template <typename SrcType, typename DestType>
 bool SafeUnaryNegation(SrcType src, DestType *dest) {
   static_assert(GenericFalseTypeArity2<SrcType, DestType>::value,
                 "Shouldn't be used with implicit type conversion.");
   return false;
 }

 // Stores -|src| into |*dest| when that value is representable as int64.
 // Returns false, leaving |*dest| untouched, otherwise.
 template <>
 bool SafeUnaryNegation(uint64 src, int64 *dest) {
   int64 positive = 0;
   if (SafeCast(src, &positive)) {
     // |src| fits in int64, so its negation does as well.
     *dest = -positive;
     return true;
   }
   // The only magnitude beyond kint64max whose negation is still
   // representable is 2^63, which maps to kint64min.
   if (src != 0x8000000000000000ul) {
     return false;
   }
   *dest = kint64min;
   return true;
 }

 }  // namespace

 // Parses |str| as a decimal int32. The string is first parsed as int64 and
 // then range-checked. Returns false on parse failure or when the value does
 // not fit in int32.
 bool NumberUtil::SafeStrToInt32(StringPiece str, int32 *value) {
   int64 wide_value;
   return SafeStrToInt64(str, &wide_value) && SafeCast(wide_value, value);
 }

 // Parses |str| as a decimal int64. Leading white spaces are skipped before
 // looking for a '-' sign. The magnitude is parsed as uint64 and then
 // converted with a range check. Returns false on parse failure or when the
 // value does not fit in int64.
 bool NumberUtil::SafeStrToInt64(StringPiece str, int64 *value) {
   const StringPiece stripped_str = SkipWhiteSpace(str);
   if (stripped_str.empty()) {
     return false;
   }

   uint64 magnitude;
   if (stripped_str[0] != '-') {
     // Non-negative input: parse the original string as it is.
     return SafeStrToUInt64WithBase(str, 10, &magnitude) &&
            SafeCast(magnitude, value);
   }

   // Negative input: parse what follows the sign, then negate it.
   const StringPiece unsigned_part(stripped_str, 1, stripped_str.size() - 1);
   return SafeStrToUInt64WithBase(unsigned_part, 10, &magnitude) &&
          SafeUnaryNegation(magnitude, value);
 }

 // Parses |str| as a decimal uint32. Returns false on parse failure or when
 // the value does not fit in uint32.
 bool NumberUtil::SafeStrToUInt32(StringPiece str, uint32 *value) {
   uint64 wide_value;
   return SafeStrToUInt64WithBase(str, 10, &wide_value) &&
          SafeCast(wide_value, value);
 }

 // Parses |str| as a hexadecimal (base 16) uint32. Returns false on parse
 // failure or when the value does not fit in uint32.
 bool NumberUtil::SafeHexStrToUInt32(StringPiece str, uint32 *value) {
   uint64 wide_value;
   return SafeStrToUInt64WithBase(str, 16, &wide_value) &&
          SafeCast(wide_value, value);
 }

 // Parses |str| as an octal (base 8) uint32. Returns false on parse failure
 // or when the value does not fit in uint32.
 bool NumberUtil::SafeOctStrToUInt32(StringPiece str, uint32 *value) {
   uint64 wide_value;
   return SafeStrToUInt64WithBase(str, 8, &wide_value) &&
          SafeCast(wide_value, value);
 }

 // Parses |str| as a decimal uint64 and stores it into |*value|.
 // Delegates to SafeStrToUInt64WithBase() with base 10 and returns its result.
 bool NumberUtil::SafeStrToUInt64(StringPiece str, uint64 *value) {
   return SafeStrToUInt64WithBase(str, 10, value);
 }

 // Parses |str| as a double with strtod(). Returns false when nothing could
 // be parsed, when strtod() reports an error through errno, when the result
 // is positive or negative infinity, or when non-white-space characters
 // follow the number. |*value| is overwritten even when false is returned.
 bool NumberUtil::SafeStrToDouble(StringPiece str, double *value) {
   DCHECK(value);
   // strtod() needs a '\0'-terminated string, which StringPiece does not
   // guarantee, so a copy is made here. If we have a good estimate of the
   // maximum possible length of the input string, we may be able to use a
   // char buffer instead.
   const string terminated = str.as_string();
   const char *const head = terminated.c_str();

   char *tail;
   errno = 0;  // errno only gets set on errors
   // strtod of GCC accepts hexadecimal number like "0x1234", but that of
   // VisualC++ does not.
   // Note that strtod accepts white spaces at the beginning of the parameter.
   *value = strtod(head, &tail);
   if (errno != 0 || head == tail) {
     return false;
   }
   const double kInfinity = numeric_limits<double>::infinity();
   if (*value == kInfinity || *value == -kInfinity) {
     return false;
   }
   // Trailing white spaces are allowed.
   const StringPiece trailing_str(tail, head + terminated.size() - tail);
   return SkipWhiteSpace(trailing_str).empty();
 }

 // Parses |str| as a float by parsing it as a double and narrowing the
 // result. Returns false when the double parse fails or when the narrowed
 // value is positive or negative infinity. |*value| is written whenever the
 // double parse succeeds.
 bool NumberUtil::SafeStrToFloat(StringPiece str, float *value) {
   double parsed;
   if (!SafeStrToDouble(str, &parsed)) {
     return false;
   }
   *value = static_cast<float>(parsed);

   const float kInfinity = numeric_limits<float>::infinity();
   return !((*value == kInfinity) || (*value == -kInfinity));
 }

 namespace {

 // Consumes the leading run of numbers smaller than 10 and accumulates them
 // as decimal digits, e.g.,
 //   [1, 2, 3, 10, 100] => begin points to [10, 100], output = 123
 // |*begin| is left on the first number that is 10 or more, or on |end|.
 // Returns false when overflow happened; in that case |*begin| stays on the
 // number that caused it.
 bool ReduceLeadingNumbersAsBase10System(
     vector<uint64>::const_iterator *begin,
     const vector<uint64>::const_iterator &end,
     uint64 *output) {
   *output = 0;
   while (*begin < end) {
     if (**begin >= 10) {
       break;
     }
     // *output = *output * 10 + **begin
     if (!MultiplyAndCheckOverflow(*output, 10, output)) {
       return false;
     }
     if (!AddAndCheckOverflow(*output, **begin, output)) {
       return false;
     }
     ++*begin;
   }
   return true;
 }

 // Interprets digits as base10 system, e.g.,
 //   [1, 2, 3] => 123
 //   [1, 2, 3, 10] => false
 // Returns false if a number greater than 10 was found or overflow happened.
 bool InterpretNumbersAsBase10System(const vector<uint64> &numbers,
                                     uint64 *output) {
   auto begin = numbers.begin();
   const bool success =
       ReduceLeadingNumbersAsBase10System(&begin, numbers.end(), output);
   // Check if the whole numbers were reduced.
   return (success && begin == numbers.end());
 }

 // Reads the leading number of the range into |*num| and advances |*begin|
 // past it. Returns false, changing nothing, if the range is empty or the
 // leading number is not less than 10.
 bool ReduceOnesDigit(vector<uint64>::const_iterator *begin,
                      const vector<uint64>::const_iterator &end,
                      uint64 *num) {
   if (*begin == end) {
     return false;
   }
   if (**begin >= 10) {
     return false;
   }
   *num = **begin;
   ++*begin;
   return true;
 }

 // Given expected_base, 10, 100, or 1000, reads leading one or two numbers and
 // calculates the number in the following way:
 //   Case: expected_base == 10
 //     [10, ...] => 10
 //     [2, 10, ...] => 20
 //     [1, 10, ...] => error because we don't write "一十" in Japanese.
 //     [20, ...] => 20 because "廿" is interpreted as 20.
 //     [2, 0, ...] => 20
 //   Case: expected_base == 100
 //     [100, ...] => 100
 //     [2, 100, ...] => 200
 //     [1, 100, ...] => error because we don't write "一百" in Japanese.
 //     [1, 2, 3, ...] => 123
 //   Case: expected_base == 1000
 //     [1000, ...] => 1000
 //     [2, 1000, ...] => 2000
 //     [1, 1000, ...] => 1000
 //     [1, 2, 3, 4, ...] => 1234
 // On success the result is stored into |*num|, |*begin| is advanced past the
 // consumed numbers and true is returned. On failure false is returned;
 // leading zeros may already have been skipped, and in the plain-digit case
 // |*begin| is moved to |end|.
 bool ReduceDigitsHelper(vector<uint64>::const_iterator *begin,
                         const vector<uint64>::const_iterator &end,
                         uint64 *num,
                         const uint64 expected_base) {
   // Skip leading zero(s).
   while (*begin != end && **begin == 0) {
     ++*begin;
   }
   if (*begin == end) {
     return false;
   }
   const uint64 leading_number = **begin;

   // If the leading number is less than 10, e.g., patterns like [2, 10], we need
   // to check the next number.
   if (leading_number < 10) {
     // A lone digit is not handled here; it needs a following number.
     if (end - *begin < 2) {
       return false;
     }
     const uint64 next_number = *(*begin + 1);

     // If the next number is also less than 10, this pattern is like
     // [1, 2, ...] => 12. In this case, the result must be less than
     // 10 * expected_base.
     if (next_number < 10) {
       // Besides overflow and the magnitude limit, whatever follows the run of
       // digits must be either the end or a number of at least 10000.
       if (!ReduceLeadingNumbersAsBase10System(begin, end, num) ||
           *num >= expected_base * 10 ||
           (*begin != end && **begin < 10000)) {
         *begin = end;  // Force to ignore the rest of the sequence.
         return false;
       }
       return true;
     }

     // Patterns like [2, 10, ...] and [1, 1000, ...].
     // A leading 1 is accepted only in front of 1000.
     if (next_number != expected_base ||
         (leading_number == 1 && expected_base != 1000)) {
       return false;
     }
     *num = leading_number * expected_base;
     *begin += 2;
     return true;
   }

   // Patterns like [10, ...], [100, ...], [1000, ...], [20, ...]. The leading 20
   // is a special case for Kanji "廿".
   if (leading_number == expected_base ||
       (expected_base == 10 && leading_number == 20)) {
     *num = leading_number;
     ++*begin;
     return true;
   }
   return false;
 }

 // Reduces the tens part of a number: ReduceDigitsHelper() with an expected
 // base of 10. Arguments and return value are forwarded unchanged.
 inline bool ReduceTensDigit(vector<uint64>::const_iterator *begin,
                             const vector<uint64>::const_iterator &end,
                             uint64 *num) {
   return ReduceDigitsHelper(begin, end, num, 10);
 }

 // Reduces the hundreds part of a number: ReduceDigitsHelper() with an
 // expected base of 100. Arguments and return value are forwarded unchanged.
 inline bool ReduceHundredsDigit(vector<uint64>::const_iterator *begin,
                                 const vector<uint64>::const_iterator &end,
                                 uint64 *num) {
   return ReduceDigitsHelper(begin, end, num, 100);
 }

 // Reduces the thousands part at the head of the sequence by delegating to
 // ReduceDigitsHelper with an expected base of 1000.
 inline bool ReduceThousandsDigit(vector<uint64>::const_iterator *begin,
                                  const vector<uint64>::const_iterator &end,
                                  uint64 *num) {
   const uint64 kThousandsBase = 1000;
   return ReduceDigitsHelper(begin, end, num, kThousandsBase);
 }

 // Consumes the leading part of the sequence that forms a number below 10000
 // and moves the iterator past it. For example:
 //   [1, 1000, 2, 100, 3, 10, 4, 10000, ...]
 //        => begin points to [10000, ...], num = 1234
 //   [3, 100, 4, 100]
 //        => error because same base number appears twice
 bool ReduceNumberLessThan10000(vector<uint64>::const_iterator *begin,
                                const vector<uint64>::const_iterator &end,
                                uint64 *num) {
   *num = 0;
   uint64 digit_value = 0;
   bool reduced_any = false;

   // Adds the value produced by the most recent reducer to the running total
   // when that reducer reported success. Note: these additions never overflow.
   const auto accumulate = [&](const bool reduced) {
     if (reduced) {
       *num += digit_value;
       reduced_any = true;
     }
   };

   // The reducers are tried from the largest base down to the ones digit; each
   // one advances |*begin| past whatever it consumes.
   accumulate(ReduceThousandsDigit(begin, end, &digit_value));
   accumulate(ReduceHundredsDigit(begin, end, &digit_value));
   accumulate(ReduceTensDigit(begin, end, &digit_value));
   accumulate(ReduceOnesDigit(begin, end, &digit_value));

   if (!reduced_any) {
     return false;
   }
   // After a successful reduction either the sequence is exhausted or the next
   // element must be a base number above 1000 (e.g., 10000, 100000, etc.).
   // Strictly speaking, better to check **begin % 10 == 0.
   return *begin == end || **begin >= 10000;
 }

 // Interprets a sequence of numbers in a Japanese reading way. For example:
 //   "一万二千三百四十五" = [1, 10000, 2, 1000, 3, 100, 4, 10, 5] => 12345
 // Base-10 numbers must be decreasing, i.e.,
 //   "一十二百" = [1, 10, 2, 100] => error
 bool InterpretNumbersInJapaneseWay(const vector<uint64> &numbers,
                                    uint64 *output) {
   uint64 last_base = kuint64max;
   auto begin = numbers.begin();
   *output = 0;
   do {
     uint64 coef = 0;
     if (!ReduceNumberLessThan10000(&begin, numbers.end(), &coef)) {
       return false;
     }
     if (begin == numbers.end()) {
       return AddAndCheckOverflow(*output, coef, output);
     }
     if (*begin >= last_base) {
       return false;  // Increasing order of base-10 numbers.
     }
     // Safely performs *output += coef * *begin.
     uint64 delta = 0;
     if (!MultiplyAndCheckOverflow(coef, *begin, &delta) ||
         !AddAndCheckOverflow(*output, delta, output)) {
       return false;
     }
     last_base = *begin++;
   } while (begin != numbers.end());

   return true;
 }

 // Interprets a sequence of numbers directly or in a Japanese reading way
 // depending on the maximum number in the sequence.
 bool NormalizeNumbersHelper(const vector<uint64> &numbers,
                             uint64 *number_output) {
   const auto itr_max = max_element(numbers.begin(), numbers.end());
   if (itr_max == numbers.end()) {
     return false;  // numbers is empty
   }

   // When no scaling number is found, convert number directly.
   // For example, [5,4,3] => 543
   if (*itr_max < 10) {
     return InterpretNumbersAsBase10System(numbers, number_output);
   }
   return InterpretNumbersInJapaneseWay(numbers, number_output);
 }

 // TODO(peria): Do refactoring this method.
 //
 // Reads number characters from the head of |input|, rebuilds them as a Kanji
 // number string in |kanji_output| and writes their value as an Arabic number
 // string to |arabic_output|.
 //
 // |trim_leading_zeros|: when false, leading zeros of the input are reproduced
 //     in |arabic_output|.
 // |allow_suffix|: when true, text following the number characters is stored
 //     in |suffix| (which must then be non-null); when false such text makes
 //     the function fail.
 //
 // All non-null output strings are cleared on entry, so |suffix| is empty when
 // the input has no trailing text.
 //
 // Returns false when trailing text is present but not allowed, when no number
 // character was read, or when the number sequence cannot be interpreted.
 bool NormalizeNumbersInternal(StringPiece input,
                               bool trim_leading_zeros,
                               bool allow_suffix,
                               string *kanji_output,
                               string *arabic_output,
                               string *suffix) {
   DCHECK(kanji_output);
   DCHECK(arabic_output);
   const char *begin = input.data();
   const char *end = input.data() + input.size();
   vector<uint64> numbers;
   numbers.reserve(input.size());

   // Map Kanji number string to digits, e.g., "二百十一" -> [2, 100, 10, 1].
   // Simultaneously, constructs a Kanji number string.
   kanji_output->clear();
   arabic_output->clear();
   // Reset |suffix| like the other outputs so that a value left over from an
   // earlier use never survives a call whose input has no suffix.
   if (suffix != nullptr) {
     suffix->clear();
   }
   string kanji_char;

   while (begin < end) {
     size_t mblen = 0;
     const char32 wchar = Util::UTF8ToUCS4(begin, end, &mblen);
     kanji_char.assign(begin, mblen);

     string tmp;
     NumberUtil::KanjiNumberToArabicNumber(kanji_char, &tmp);

     // Stop at the first character whose conversion is not an unsigned
     // integer; everything from here on is treated as the suffix.
     uint64 n = 0;
     if (!NumberUtil::SafeStrToUInt64(tmp, &n)) {
       break;
     }

     // ASCII and full-width Arabic digits are replaced by the matching entry
     // of kNumKanjiDigits (3 bytes are taken from the entry) in the Kanji
     // output; any other character is kept as it is.
     if (wchar >= 0x0030 && wchar <= 0x0039) {  // '0' <= wchar <= '9'
       kanji_char.assign(kNumKanjiDigits[wchar - 0x0030], 3);
     } else if (wchar >= 0xFF10 && wchar <= 0xFF19) {  // '０' <= wchar <= '９'
       kanji_char.assign(kNumKanjiDigits[wchar - 0xFF10], 3);
     }
     kanji_output->append(kanji_char);
     numbers.push_back(n);
     begin += mblen;
   }
   if (begin < end) {
     if (!allow_suffix) {
       return false;
     }
     DCHECK(suffix);
     suffix->assign(begin, end);
   }

   if (numbers.empty()) {
     return false;
   }

   // Try interpreting the sequence of digits.
   uint64 n = 0;
   if (!NormalizeNumbersHelper(numbers, &n)) {
     return false;
   }

   if (!trim_leading_zeros) {
     // If |numbers| contains only k zeros, add (k - 1) zeros to the output.
     // Otherwise, add the same number of leading zeros.
     size_t num_zeros;
     for (num_zeros = 0; num_zeros < numbers.size(); ++num_zeros) {
       if (numbers[num_zeros] != 0) {
         break;
       }
     }
     if (num_zeros == numbers.size()) {
       --num_zeros;
     }
     arabic_output->append(num_zeros, kAsciiZero);
   }

   // "%llu" requires an unsigned long long argument; cast explicitly so the
   // call stays well-formed whatever the underlying type of uint64 is.
   char buf[kMaxInt64Size];
   snprintf(buf, sizeof(buf), "%llu", static_cast<unsigned long long>(n));
   *arabic_output += buf;
   return true;
 }

 }  // end of anonymous namespace

 // Converts Kanji numbers into Arabic numbers, e.g. "百二十万" -> 1200000.
 // Delegates to NormalizeNumbersInternal with suffixes disallowed, so input
 // containing anything after the number characters is rejected.
 bool NumberUtil::NormalizeNumbers(StringPiece input,
                                   bool trim_leading_zeros,
                                   string *kanji_output,
                                   string *arabic_output) {
   const bool kAllowSuffix = false;
   string *const kNoSuffix = nullptr;
   return NormalizeNumbersInternal(input, trim_leading_zeros, kAllowSuffix,
                                   kanji_output, arabic_output, kNoSuffix);
 }

 // Same as NormalizeNumbers, except that text following the number characters
 // is allowed and is stored in |suffix|. Delegates to NormalizeNumbersInternal
 // with suffixes allowed.
 bool NumberUtil::NormalizeNumbersWithSuffix(StringPiece input,
                                             bool trim_leading_zeros,
                                             string *kanji_output,
                                             string *arabic_output,
                                             string *suffix) {
   const bool kAllowSuffix = true;
   return NormalizeNumbersInternal(input, trim_leading_zeros, kAllowSuffix,
                                   kanji_output, arabic_output, suffix);
 }

 namespace {

 // Load rules.
 // TODO(peria): Split the following header file.  No need to include Japanese
 //     character constants.
 #include "base/japanese_util_rule.h"

 }  // namespace

 // Runs TextConverter::Convert over |input| with the
 // kanjinumber_to_arabicnumber_da / kanjinumber_to_arabicnumber_table rule
 // data and stores the converted text in |output|.
 void NumberUtil::KanjiNumberToArabicNumber(StringPiece input,
                                            string *output) {
   TextConverter::Convert(kanjinumber_to_arabicnumber_da,
                          kanjinumber_to_arabicnumber_table, input, output);
 }

 }  // namespace mozc