blob: 746f38bdaec13ed868f148ab822a88665ca64a01 [file] [log] [blame]
// Copyright 2010-2015, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "base/number_util.h"
#include <algorithm>
#include <cctype>
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <limits>
#include <sstream>
#include <string>
#include <vector>
#include "base/logging.h"
#include "base/text_converter.h"
#include "base/util.h"
namespace mozc {
namespace {
// Table of number character of Kansuji
const char *const kNumKanjiDigits[] = {
// "〇", "一", "二", "三", "四", "五", "六", "七", "八", "九", nullptr
"\xe3\x80\x87", "\xe4\xb8\x80", "\xe4\xba\x8c", "\xe4\xb8\x89",
"\xe5\x9b\x9b", "\xe4\xba\x94", "\xe5\x85\xad", "\xe4\xb8\x83",
"\xe5\x85\xab", "\xe4\xb9\x9d", nullptr
};
const char *const kNumKanjiOldDigits[] = {
// nullptr, "壱", "弐", "参", "四", "五", "六", "七", "八", "九"
nullptr, "\xe5\xa3\xb1", "\xe5\xbc\x90", "\xe5\x8f\x82", "\xe5\x9b\x9b",
"\xe4\xba\x94", "\xe5\x85\xad", "\xe4\xb8\x83", "\xe5\x85\xab",
"\xe4\xb9\x9d"
};
const char *const kNumFullWidthDigits[] = {
// "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", nullptr
"\xef\xbc\x90", "\xef\xbc\x91", "\xef\xbc\x92", "\xef\xbc\x93",
"\xef\xbc\x94", "\xef\xbc\x95", "\xef\xbc\x96", "\xef\xbc\x97",
"\xef\xbc\x98", "\xef\xbc\x99", nullptr
};
const char *const kNumHalfWidthDigits[] = {
"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", nullptr
};
// Table of Kanji number ranks
const char *const kNumKanjiRanks[] = {
// nullptr, "", "十", "百", "千"
nullptr, "", "\xe5\x8d\x81", "\xe7\x99\xbe", "\xe5\x8d\x83"
};
const char *const kNumKanjiBiggerRanks[] = {
// "", "万", "億", "兆", "京"
"", "\xe4\xb8\x87", "\xe5\x84\x84", "\xe5\x85\x86", "\xe4\xba\xac"
};
const char *const kNumKanjiOldRanks[] = {
// nullptr, "", "拾", "百", "阡"
nullptr, "", "\xe6\x8b\xbe", "\xe7\x99\xbe", "\xe9\x98\xa1"
};
const char *const kNumKanjiBiggerOldRanks[] = {
// "", "萬", "億", "兆", "京"
"", "\xe8\x90\xac", "\xe5\x84\x84", "\xe5\x85\x86", "\xe4\xba\xac"
};
const char *const kRomanNumbersCapital[] = {
// nullptr, "Ⅰ", "Ⅱ", "Ⅲ", "Ⅳ", "Ⅴ", "Ⅵ", "Ⅶ", "Ⅷ", "Ⅸ",
// "Ⅹ", "Ⅺ", "Ⅻ", nullptr
nullptr, "\xe2\x85\xa0", "\xe2\x85\xa1", "\xe2\x85\xa2", "\xe2\x85\xa3",
"\xe2\x85\xa4", "\xe2\x85\xa5", "\xe2\x85\xa6", "\xe2\x85\xa7",
"\xe2\x85\xa8", "\xe2\x85\xa9", "\xe2\x85\xaa", "\xe2\x85\xab", nullptr
};
const char *const kRomanNumbersSmall[] = {
// nullptr, "ⅰ", "ⅱ", "ⅲ", "ⅳ", "ⅴ", "ⅵ", "ⅶ", "ⅷ", "ⅸ",
// "ⅹ", "ⅺ", "ⅻ", nullptr
nullptr, "\xe2\x85\xb0", "\xe2\x85\xb1", "\xe2\x85\xb2", "\xe2\x85\xb3",
"\xe2\x85\xb4", "\xe2\x85\xb5", "\xe2\x85\xb6", "\xe2\x85\xb7",
"\xe2\x85\xb8", "\xe2\x85\xb9", "\xe2\x85\xba", "\xe2\x85\xbb", nullptr
};
const char *const kCircledNumbers[] = {
nullptr,
// "①", "②", "③", "④", "⑤", "⑥", "⑦", "⑧", "⑨", "⑩"
"\xe2\x91\xa0", "\xe2\x91\xa1", "\xe2\x91\xa2", "\xe2\x91\xa3",
"\xe2\x91\xa4", "\xe2\x91\xa5", "\xe2\x91\xa6", "\xe2\x91\xa7",
"\xe2\x91\xa8", "\xe2\x91\xa9",
// "⑪", "⑫", "⑬", "⑭", "⑮", "⑯", "⑰", "⑱", "⑲", "⑳"
"\xe2\x91\xaa", "\xe2\x91\xab", "\xe2\x91\xac", "\xe2\x91\xad",
"\xe2\x91\xae", "\xe2\x91\xaf", "\xe2\x91\xb0", "\xe2\x91\xb1",
"\xe2\x91\xb2", "\xe2\x91\xb3",
// Circled 21-35
"\xE3\x89\x91", "\xE3\x89\x92", "\xE3\x89\x93", "\xE3\x89\x94",
"\xE3\x89\x95", "\xE3\x89\x96", "\xE3\x89\x97", "\xE3\x89\x98",
"\xE3\x89\x99", "\xE3\x89\x9A", "\xE3\x89\x9B", "\xE3\x89\x9C",
"\xE3\x89\x9D", "\xE3\x89\x9E", "\xE3\x89\x9F",
// Circled 36-50
"\xE3\x8A\xB1", "\xE3\x8A\xB2", "\xE3\x8A\xB3", "\xE3\x8A\xB4",
"\xE3\x8A\xB5", "\xE3\x8A\xB6", "\xE3\x8A\xB7", "\xE3\x8A\xB8",
"\xE3\x8A\xB9", "\xE3\x8A\xBA", "\xE3\x8A\xBB", "\xE3\x8A\xBC",
"\xE3\x8A\xBD", "\xE3\x8A\xBE", "\xE3\x8A\xBF",
nullptr
};
// Structure to store character set variations.
struct NumberStringVariation {
const char *const *const digits;
const int numbers_size;
const char *description;
const char *separator;
const char *point;
const NumberUtil::NumberString::Style style;
};
// Judges given string is a decimal number (including integer) or not.
// It accepts strings whose last point is a decimal point like "123456."
bool IsDecimalNumber(StringPiece str) {
int num_point = 0;
for (size_t i = 0; i < str.size(); ++i) {
if (str[i] == '.') {
++num_point;
// A valid decimal number has at most one decimal point.
if (num_point >= 2) {
return false;
}
} else if (!isdigit(str[i])) {
return false;
}
}
return true;
}
const char kAsciiZero = '0';
const char kAsciiOne = '1';
const char kAsciiNine = '9';
template <typename T>
string SimpleItoaImpl(T number) {
stringstream ss;
ss << number;
return ss.str();
}
} // namespace
string NumberUtil::SimpleItoa(int32 number) {
return SimpleItoaImpl(number);
}
string NumberUtil::SimpleItoa(uint32 number) {
return SimpleItoaImpl(number);
}
string NumberUtil::SimpleItoa(int64 number) {
return SimpleItoaImpl(number);
}
string NumberUtil::SimpleItoa(uint64 number) {
return SimpleItoaImpl(number);
}
int NumberUtil::SimpleAtoi(StringPiece str) {
stringstream ss;
ss << str;
int i = 0;
ss >> i;
return i;
}
namespace {
// TODO(hidehiko): Refactoring with GetScriptType in Util class.
inline bool IsArabicDecimalChar32(char32 ucs4) {
// Halfwidth digit.
if (kAsciiZero <= ucs4 && ucs4 <= kAsciiNine) {
return true;
}
// Fullwidth digit.
if (0xFF10 <= ucs4 && ucs4 <= 0xFF19) {
return true;
}
return false;
}
} // namespace
bool NumberUtil::IsArabicNumber(StringPiece input_string) {
if (input_string.empty()) {
return false;
}
for (ConstChar32Iterator iter(input_string); !iter.Done(); iter.Next()) {
if (!IsArabicDecimalChar32(iter.Get())) {
// Found non-Arabic decimal character.
return false;
}
}
// All characters are numbers.
return true;
}
bool NumberUtil::IsDecimalInteger(StringPiece str) {
if (str.empty()) {
return false;
}
for (size_t i = 0; i < str.size(); ++i) {
if (!isdigit(str[i])) {
return false;
}
}
return true;
}
namespace {
// To know what "大字" means, please refer
// http://ja.wikipedia.org/wiki/%E5%A4%A7%E5%AD%97_(%E6%95%B0%E5%AD%97)
const NumberStringVariation kKanjiVariations[] = {
// "数字"
{kNumHalfWidthDigits, 10, "\xE6\x95\xB0\xE5\xAD\x97", nullptr, nullptr,
NumberUtil::NumberString::NUMBER_ARABIC_AND_KANJI_HALFWIDTH},
// "数字"
{kNumFullWidthDigits, 10, "\xE6\x95\xB0\xE5\xAD\x97", nullptr, nullptr,
NumberUtil::NumberString::NUMBER_ARABIC_AND_KANJI_FULLWIDTH},
// "漢数字"
{kNumKanjiDigits, 10, "\xE6\xBC\xA2\xE6\x95\xB0\xE5\xAD\x97",
nullptr, nullptr, NumberUtil::NumberString::NUMBER_KANJI},
// "大字"
{kNumKanjiOldDigits, 10, "\xE5\xA4\xA7\xE5\xAD\x97", nullptr, nullptr,
NumberUtil::NumberString::NUMBER_OLD_KANJI},
};
// "弐拾"
const char kOldTwoTen[] = "\xE5\xBC\x90\xE6\x8B\xBE";
const size_t kOldTwoTenLength = arraysize(kOldTwoTen) - 1;
// "廿"
const char kOldTwenty[] = "\xE5\xBB\xBF";
} // namespace
bool NumberUtil::ArabicToKanji(StringPiece input_num,
vector<NumberString> *output) {
DCHECK(output);
// "零"
const char *const kNumZero = "\xe9\x9b\xb6";
const int kDigitsInBigRank = 4;
if (!IsDecimalInteger(input_num)) {
return false;
}
{
// We don't convert a number starting with '0', other than 0 itself.
StringPiece::size_type i;
for (i = 0; i < input_num.size() && input_num[i] == kAsciiZero; ++i) {}
if (i == input_num.size()) {
output->push_back(
// "大字"
NumberString(kNumZero, "\xE5\xA4\xA7\xE5\xAD\x97",
NumberString::NUMBER_OLD_KANJI));
return true;
}
}
// If given number needs higher ranks than our expectations,
// we don't convert it.
if (arraysize(kNumKanjiBiggerRanks) * kDigitsInBigRank < input_num.size()) {
return false;
}
// Fill '0' in the beginning of input_num to make its length
// (N * kDigitsInBigRank).
const int filled_zero_num = (kDigitsInBigRank -
(input_num.size() % kDigitsInBigRank)) % kDigitsInBigRank;
string input(filled_zero_num, kAsciiZero);
input_num.AppendToString(&input);
// Segment into kDigitsInBigRank-digits pieces
vector<string> ranked_numbers;
for (int i = static_cast<int>(input.size()) - kDigitsInBigRank; i >= 0;
i -= kDigitsInBigRank) {
ranked_numbers.push_back(input.substr(i, kDigitsInBigRank));
}
const size_t rank_size = ranked_numbers.size();
for (size_t variation_index = 0;
variation_index < arraysize(kKanjiVariations); ++variation_index) {
const NumberStringVariation &variation = kKanjiVariations[variation_index];
const char *const *const digits = variation.digits;
const NumberString::Style style = variation.style;
if (rank_size == 1 &&
(style == NumberString::NUMBER_ARABIC_AND_KANJI_HALFWIDTH ||
style == NumberString::NUMBER_ARABIC_AND_KANJI_FULLWIDTH)) {
continue;
}
const char *const *ranks;
const char *const *bigger_ranks;
if (style == NumberString::NUMBER_OLD_KANJI) {
ranks = kNumKanjiOldRanks;
bigger_ranks = kNumKanjiBiggerOldRanks;
} else {
ranks = kNumKanjiRanks;
bigger_ranks = kNumKanjiBiggerRanks;
}
// TODO(peria): Bring |result| out if it improves the performance.
string result;
// Converts each segment, and merges them with rank Kanjis.
for (int rank = rank_size - 1; rank >= 0; --rank) {
const string &segment = ranked_numbers[rank];
string segment_result;
bool leading = true;
for (size_t i = 0; i < segment.size(); ++i) {
if (leading && segment[i] == kAsciiZero) {
continue;
}
leading = false;
if (style == NumberString::NUMBER_ARABIC_AND_KANJI_HALFWIDTH ||
style == NumberString::NUMBER_ARABIC_AND_KANJI_FULLWIDTH) {
segment_result += digits[segment[i] - kAsciiZero];
} else {
if (segment[i] == kAsciiZero) {
continue;
}
// In "大字" style, "壱" is also required on every rank.
if (style == NumberString::NUMBER_OLD_KANJI ||
i == kDigitsInBigRank - 1 || segment[i] != kAsciiOne) {
segment_result += digits[segment[i] - kAsciiZero];
}
segment_result += ranks[kDigitsInBigRank - i];
}
}
if (!segment_result.empty()) {
result += segment_result + bigger_ranks[rank];
}
}
const char *description = variation.description;
// Add simply converted numbers.
output->push_back(NumberString(result, description, style));
// Add specialized style numbers.
if (style == NumberString::NUMBER_OLD_KANJI) {
size_t index = result.find(kOldTwoTen);
if (index != string::npos) {
string result2(result);
do {
result2.replace(index, kOldTwoTenLength, kOldTwenty);
index = result2.find(kOldTwoTen, index);
} while (index != string::npos);
output->push_back(NumberString(result2, description, style));
}
// for single kanji
if (input == "0010") {
// "拾"
output->push_back(NumberString("\xE6\x8B\xBE", description, style));
}
if (input == "1000") {
// "阡"
output->push_back(NumberString("\xE9\x98\xA1", description, style));
}
}
}
return true;
}
namespace {
const NumberStringVariation kNumDigitsVariations[] = {
// "数字"
{kNumHalfWidthDigits, 10, "\xE6\x95\xB0\xE5\xAD\x97", ",", ".",
NumberUtil::NumberString::NUMBER_SEPARATED_ARABIC_HALFWIDTH},
// "数字", ",", "."
{kNumFullWidthDigits, 10, "\xE6\x95\xB0\xE5\xAD\x97", "\xef\xbc\x8c",
"\xEF\xBC\x8E", NumberUtil::NumberString::NUMBER_SEPARATED_ARABIC_FULLWIDTH},
};
} // namespace
bool NumberUtil::ArabicToSeparatedArabic(
StringPiece input_num, vector<NumberString> *output) {
DCHECK(output);
if (!IsDecimalNumber(input_num)) {
return false;
}
// Separate a number into an integral part and a fractional part.
StringPiece::size_type point_pos = input_num.find('.');
if (point_pos == StringPiece::npos) {
point_pos = input_num.size();
}
const StringPiece integer(input_num, 0, point_pos);
// |fraction| has the decimal point with digits in fractional part.
const StringPiece fraction(input_num, point_pos,
input_num.size() - point_pos);
// We don't add separator to number whose integral part starts with '0'
if (integer[0] == kAsciiZero) {
return false;
}
for (size_t i = 0; i < arraysize(kNumDigitsVariations); ++i) {
const NumberStringVariation &variation = kNumDigitsVariations[i];
const char *const *const digits = variation.digits;
// TODO(peria): Bring |result| out if it improves the performance.
string result;
// integral part
for (StringPiece::size_type j = 0; j < integer.size(); ++j) {
// We don't add separater first
if (j != 0 && (integer.size() - j) % 3 == 0) {
result.append(variation.separator);
}
const uint32 d = static_cast<uint32>(integer[j] - kAsciiZero);
if (d <= 9 && digits[d]) {
result.append(digits[d]);
}
}
// fractional part
if (!fraction.empty()) {
DCHECK_EQ(fraction[0], '.');
result.append(variation.point);
for (StringPiece::size_type j = 1; j < fraction.size(); ++j) {
result.append(digits[static_cast<int>(fraction[j] - kAsciiZero)]);
}
}
output->push_back(
NumberString(result, variation.description, variation.style));
}
return true;
}
namespace {
// use default for wide Arabic, because half/full width for
// normal number is learned by charactor form manager.
const NumberStringVariation kSingleDigitsVariations[] = {
// "漢数字"
{kNumKanjiDigits, 10, "\xE6\xBC\xA2\xE6\x95\xB0\xE5\xAD\x97",
nullptr, nullptr, NumberUtil::NumberString::NUMBER_KANJI_ARABIC},
// "数字"
{kNumFullWidthDigits, 10, "\xE6\x95\xB0\xE5\xAD\x97",
nullptr, nullptr, NumberUtil::NumberString::DEFAULT_STYLE},
};
} // namespace
bool NumberUtil::ArabicToWideArabic(
StringPiece input_num, vector<NumberString> *output) {
DCHECK(output);
if (!IsDecimalInteger(input_num)) {
return false;
}
for (size_t i = 0; i < arraysize(kSingleDigitsVariations); ++i) {
const NumberStringVariation &variation = kSingleDigitsVariations[i];
// TODO(peria): Bring |result| out if it improves the performance.
string result;
for (StringPiece::size_type j = 0; j < input_num.size(); ++j) {
result.append(
variation.digits[static_cast<int>(input_num[j] - kAsciiZero)]);
}
if (!result.empty()) {
output->push_back(
NumberString(result, variation.description, variation.style));
}
}
return true;
}
namespace {
const NumberStringVariation kSpecialNumericVariations[] = {
{kRomanNumbersCapital, arraysize(kRomanNumbersCapital),
// "ローマ数字(大文字)",
"\xE3\x83\xAD\xE3\x83\xBC\xE3\x83\x9E\xE6\x95\xB0"
"\xE5\xAD\x97(\xE5\xA4\xA7\xE6\x96\x87\xE5\xAD\x97)",
nullptr, nullptr, NumberUtil::NumberString::NUMBER_ROMAN_CAPITAL},
{kRomanNumbersSmall, arraysize(kRomanNumbersSmall),
// "ローマ数字(小文字)",
"\xE3\x83\xAD\xE3\x83\xBC\xE3\x83\x9E\xE6\x95\xB0"
"\xE5\xAD\x97(\xE5\xB0\x8F\xE6\x96\x87\xE5\xAD\x97)",
nullptr, nullptr, NumberUtil::NumberString::NUMBER_ROMAN_SMALL},
{kCircledNumbers, arraysize(kCircledNumbers),
// "丸数字"
"\xE4\xB8\xB8\xE6\x95\xB0\xE5\xAD\x97",
nullptr, nullptr, NumberUtil::NumberString::NUMBER_CIRCLED},
};
} // namespace
bool NumberUtil::ArabicToOtherForms(
StringPiece input_num, vector<NumberString> *output) {
DCHECK(output);
if (!IsDecimalInteger(input_num)) {
return false;
}
bool converted = false;
// Googol
{
// 10^100
const char *const kNumGoogol =
"100000000000000000000000000000000000000000000000000"
"00000000000000000000000000000000000000000000000000";
if (input_num == kNumGoogol) {
output->push_back(
NumberString("Googol", "", NumberString::DEFAULT_STYLE));
converted = true;
}
}
// Following conversions require uint64 number.
uint64 n;
if (!SafeStrToUInt64(input_num, &n)) {
return converted;
}
// Special forms
for (size_t i = 0; i < arraysize(kSpecialNumericVariations); ++i) {
const NumberStringVariation &variation = kSpecialNumericVariations[i];
if (n < variation.numbers_size && variation.digits[n]) {
output->push_back(
NumberString(variation.digits[n], variation.description,
variation.style));
converted = true;
}
}
return converted;
}
namespace {
// Enough size to store MAX_INT64 in octal digits with prefix.
// Must be larger than or equal to Ceil(64 / 3) + 1 ("0") + 1 ('\0') = 24
const int kMaxInt64Size = 24;
} // namespace
bool NumberUtil::ArabicToOtherRadixes(
StringPiece input_num, vector<NumberString> *output) {
DCHECK(output);
if (!IsDecimalInteger(input_num)) {
return false;
}
uint64 n;
if (!SafeStrToUInt64(input_num, &n)) {
return false;
}
// Hexadecimal
if (n > 9) {
// Keep
char hex[kMaxInt64Size];
snprintf(hex, kMaxInt64Size, "0x%llx", n);
// "16進数"
output->push_back(NumberString(hex, "16\xE9\x80\xB2\xE6\x95\xB0",
NumberString::NUMBER_HEX));
}
// Octal
if (n > 7) {
char oct[kMaxInt64Size];
snprintf(oct, kMaxInt64Size, "0%llo", n);
// "8進数"
output->push_back(NumberString(oct, "8\xE9\x80\xB2\xE6\x95\xB0",
NumberString::NUMBER_OCT));
}
// Binary
if (n > 1) {
string binary;
for (uint64 num = n; num; num >>= 1) {
binary.push_back(kAsciiZero + static_cast<char>(num & 0x1));
}
// "b0" will be "0b" in head of |binary|
binary.append("b0");
reverse(binary.begin(), binary.end());
// "2進数"
output->push_back(NumberString(binary, "2\xE9\x80\xB2\xE6\x95\xB0",
NumberString::NUMBER_BIN));
}
return (n > 1);
}
namespace {
const StringPiece SkipWhiteSpace(StringPiece str) {
StringPiece::size_type i;
for (i = 0; i < str.size() && isspace(str[i]); ++i) {}
DCHECK(i == str.size() || !isspace(str[i]));
return StringPiece(str, i);
}
// There is an informative discussion about the overflow detection in
// "Hacker's Delight" (http://www.hackersdelight.org/basics.pdf)
// 2-12 'Overflow Detection'
// *output = arg1 + arg2
// return false when an integer overflow happens.
bool AddAndCheckOverflow(uint64 arg1, uint64 arg2, uint64 *output) {
*output = arg1 + arg2;
if (arg2 > (kuint64max - arg1)) {
// overflow happens
return false;
}
return true;
}
// *output = arg1 * arg2
// return false when an integer overflow happens.
bool MultiplyAndCheckOverflow(uint64 arg1, uint64 arg2, uint64 *output) {
*output = arg1 * arg2;
if (arg1 != 0 && arg2 > (kuint64max / arg1)) {
// overflow happens
return false;
}
return true;
}
// A simple wrapper of strtoull function. |c_str| must be terminated by '\0'.
inline uint64 StrToUint64(const char* c_str, char** end_ptr, int base) {
#ifdef OS_WIN
return _strtoui64(c_str, end_ptr, base);
#else // OS_WIN
return strtoull(c_str, end_ptr, base);
#endif // OS_WIN
}
// Converts a string which describes a number into an uint64 value in |base|
// radix. Does not convert octal or hexadecimal strings with "0" or "0x"
// suffixes.
bool SafeStrToUInt64WithBase(StringPiece str, int base, uint64 *value) {
DCHECK(value);
// Maximum possible length of number string, including terminating '\0'. Note
// that the maximum possible length is achieved when str="111...11" (64
// unities) and base=2.
const size_t kMaxPossibleLength = 65;
// Leading white spaces are allowed.
const StringPiece stripped_str = SkipWhiteSpace(str);
if (stripped_str.empty() || stripped_str.size() >= kMaxPossibleLength) {
return false;
}
// StrToUint64() does not check if the input is negative. However, a leading
// '+' is OK.
if (stripped_str[0] == '-') {
return false;
}
// Since StringPiece doesn't end with '\0', we make a c-string on stack here.
char buf[kMaxPossibleLength];
memcpy(buf, str.data(), str.size());
buf[str.size()] = '\0';
char *end_ptr = nullptr;
errno = 0;
*value = StrToUint64(buf, &end_ptr, base);
if (errno != 0 || end_ptr == buf) { // Failed to parse uint64.
return false;
}
// Trailing white spaces are allowed.
const StringPiece trailing_str(end_ptr, buf + str.size() - end_ptr);
return SkipWhiteSpace(trailing_str).empty();
}
template <typename T1, typename T2>
struct GenericFalseTypeArity2 {
// TODO(yukawa): Use std::false_type once C++11 is enabled everywhere.
static const bool value = false;
};
template <typename SrcType, typename DestType>
bool SafeCast(SrcType src, DestType *dest) {
static_assert(GenericFalseTypeArity2<SrcType, DestType>::value,
"Shouldn't be used with implicit type conversion.");
return false;
}
template <>
bool SafeCast(int64 src, int32 *dest) {
if (src < static_cast<int64>(kint32min) ||
static_cast<int64>(kint32max) < src) {
return false;
}
*dest = static_cast<int32>(src);
return true;
}
template <>
bool SafeCast(uint64 src, int64 *dest) {
if (src > static_cast<uint64>(kint64max)) {
return false;
}
*dest = static_cast<int64>(src);
return true;
}
template <>
bool SafeCast(uint64 src, uint32 *dest) {
if (src > static_cast<uint64>(kuint32max)) {
return false;
}
*dest = static_cast<uint32>(src);
return true;
}
template <typename SrcType, typename DestType>
bool SafeUnaryNegation(SrcType src, DestType *dest) {
static_assert(GenericFalseTypeArity2<SrcType, DestType>::value,
"Shouldn't be used with implicit type conversion.");
return false;
}
template <>
bool SafeUnaryNegation(uint64 src, int64 *dest) {
int64 tmp = 0;
if (!SafeCast(src, &tmp)) {
if (src == 0x8000000000000000ul) {
// This is an exceptional case. |src| isn't in the range of int64,
// but |-src| is in the range.
*dest = kint64min;
return true;
}
return false;
}
*dest = -tmp;
return true;
}
} // namespace
bool NumberUtil::SafeStrToInt32(StringPiece str, int32 *value) {
int64 tmp;
if (!SafeStrToInt64(str, &tmp)) {
return false;
}
return SafeCast(tmp, value);
}
bool NumberUtil::SafeStrToInt64(StringPiece str, int64 *value) {
const StringPiece stripped_str = SkipWhiteSpace(str);
if (stripped_str.empty()) {
return false;
}
uint64 tmp;
if (stripped_str[0] == '-') {
StringPiece opposite_str = StringPiece(stripped_str,
1,
stripped_str.size() - 1);
if (!SafeStrToUInt64WithBase(opposite_str, 10, &tmp)) {
return false;
}
return SafeUnaryNegation(tmp, value);
}
if (!SafeStrToUInt64WithBase(str, 10, &tmp)) {
return false;
}
return SafeCast(tmp, value);
}
bool NumberUtil::SafeStrToUInt32(StringPiece str, uint32 *value) {
uint64 tmp;
if (!SafeStrToUInt64WithBase(str, 10, &tmp)) {
return false;
}
return SafeCast(tmp, value);
}
bool NumberUtil::SafeHexStrToUInt32(StringPiece str, uint32 *value) {
uint64 tmp;
if (!SafeStrToUInt64WithBase(str, 16, &tmp)) {
return false;
}
return SafeCast(tmp, value);
}
bool NumberUtil::SafeOctStrToUInt32(StringPiece str, uint32 *value) {
uint64 tmp;
if (!SafeStrToUInt64WithBase(str, 8, &tmp)) {
return false;
}
return SafeCast(tmp, value);
}
bool NumberUtil::SafeStrToUInt64(StringPiece str, uint64 *value) {
return SafeStrToUInt64WithBase(str, 10, value);
}
bool NumberUtil::SafeStrToDouble(StringPiece str, double *value) {
DCHECK(value);
// Note that StringPiece isn't terminated by '\0'. However, since strtod
// requires null-terminated string, we make a string here. If we have a good
// estimate of the maximum possible length of the input string, we may be able
// to use char buffer instead. Note: const reference ensures the life of this
// temporary string until the end!
const string &s = str.as_string();
const char* ptr = s.c_str();
char *end_ptr;
errno = 0; // errno only gets set on errors
// strtod of GCC accepts hexadecimal number like "0x1234", but that of
// VisualC++ does not.
// Note that strtod accepts white spaces at the beginning of the parameter.
*value = strtod(ptr, &end_ptr);
if (errno != 0 ||
ptr == end_ptr ||
*value == numeric_limits<double>::infinity() ||
*value == -numeric_limits<double>::infinity()) {
return false;
}
// Trailing white spaces are allowed.
const StringPiece trailing_str(end_ptr, ptr + s.size() - end_ptr);
return SkipWhiteSpace(trailing_str).empty();
}
bool NumberUtil::SafeStrToFloat(StringPiece str, float *value) {
double double_value;
if (!SafeStrToDouble(str, &double_value)) {
return false;
}
*value = static_cast<float>(double_value);
if ((*value == numeric_limits<float>::infinity()) ||
(*value == -numeric_limits<float>::infinity())) {
return false;
}
return true;
}
namespace {
// Reduces leading digits less than 10 as their base10 interpretation, e.g.,
// [1, 2, 3, 10, 100] => begin points to [10, 100], output = 123
// Returns false when overflow happened.
bool ReduceLeadingNumbersAsBase10System(
vector<uint64>::const_iterator *begin,
const vector<uint64>::const_iterator &end,
uint64 *output) {
*output = 0;
for (; *begin < end; ++*begin) {
if (**begin >= 10) {
return true;
}
// *output = *output * 10 + *it
if (!MultiplyAndCheckOverflow(*output, 10, output) ||
!AddAndCheckOverflow(*output, **begin, output)) {
return false;
}
}
return true;
}
// Interprets digits as base10 system, e.g.,
// [1, 2, 3] => 123
// [1, 2, 3, 10] => false
// Returns false if a number greater than 10 was found or overflow happened.
bool InterpretNumbersAsBase10System(const vector<uint64> &numbers,
uint64 *output) {
auto begin = numbers.begin();
const bool success =
ReduceLeadingNumbersAsBase10System(&begin, numbers.end(), output);
// Check if the whole numbers were reduced.
return (success && begin == numbers.end());
}
// Reads a leading number in a sequence and advances the iterator. Returns false
// if the range is empty or the leading number is not less than 10.
bool ReduceOnesDigit(vector<uint64>::const_iterator *begin,
const vector<uint64>::const_iterator &end,
uint64 *num) {
if (*begin == end || **begin >= 10) {
return false;
}
*num = **begin;
++*begin;
return true;
}
// Given expected_base, 10, 100, or 1000, reads leading one or two numbers and
// calculates the number in the follwoing way:
// Case: expected_base == 10
// [10, ...] => 10
// [2, 10, ...] => 20
// [1, 10, ...] => error because we don't write "一十" in Japanese.
// [20, ...] => 20 because "廿" is interpreted as 20.
// [2, 0, ...] => 20
// Case: expected_base == 100
// [100, ...] => 100
// [2, 100, ...] => 200
// [1, 100, ...] => error because we don't write "一百" in Japanese.
// [1, 2, 3, ...] => 123
// Case: expected_base == 1000
// [1000, ...] => 1000
// [2, 1000, ...] => 2000
// [1, 1000, ...] => 1000
// [1, 2, 3, 4, ...] => 1234
bool ReduceDigitsHelper(vector<uint64>::const_iterator *begin,
const vector<uint64>::const_iterator &end,
uint64 *num,
const uint64 expected_base) {
// Skip leading zero(s).
while (*begin != end && **begin == 0) {
++*begin;
}
if (*begin == end) {
return false;
}
const uint64 leading_number = **begin;
// If the leading number is less than 10, e.g., patterns like [2, 10], we need
// to check the next number.
if (leading_number < 10) {
if (end - *begin < 2) {
return false;
}
const uint64 next_number = *(*begin + 1);
// If the next number is also less than 10, this pattern is like
// [1, 2, ...] => 12. In this case, the result must be less than
// 10 * expected_base.
if (next_number < 10) {
if (!ReduceLeadingNumbersAsBase10System(begin, end, num) ||
*num >= expected_base * 10 ||
(*begin != end && **begin < 10000)) {
*begin = end; // Force to ignore the rest of the sequence.
return false;
}
return true;
}
// Patterns like [2, 10, ...] and [1, 1000, ...].
if (next_number != expected_base ||
(leading_number == 1 && expected_base != 1000)) {
return false;
}
*num = leading_number * expected_base;
*begin += 2;
return true;
}
// Patterns like [10, ...], [100, ...], [1000, ...], [20, ...]. The leading 20
// is a special case for Kanji "廿".
if (leading_number == expected_base ||
(expected_base == 10 && leading_number == 20)) {
*num = leading_number;
++*begin;
return true;
}
return false;
}
inline bool ReduceTensDigit(vector<uint64>::const_iterator *begin,
const vector<uint64>::const_iterator &end,
uint64 *num) {
return ReduceDigitsHelper(begin, end, num, 10);
}
inline bool ReduceHundredsDigit(vector<uint64>::const_iterator *begin,
const vector<uint64>::const_iterator &end,
uint64 *num) {
return ReduceDigitsHelper(begin, end, num, 100);
}
inline bool ReduceThousandsDigit(vector<uint64>::const_iterator *begin,
const vector<uint64>::const_iterator &end,
uint64 *num) {
return ReduceDigitsHelper(begin, end, num, 1000);
}
// Reduces leading digits as a number less than 10000 and advances the
// iterator. For example:
// [1, 1000, 2, 100, 3, 10, 4, 10000, ...]
// => begin points to [10000, ...], num = 1234
// [3, 100, 4, 100]
// => error because same base number appears twice
bool ReduceNumberLessThan10000(vector<uint64>::const_iterator *begin,
const vector<uint64>::const_iterator &end,
uint64 *num) {
*num = 0;
bool success = false;
uint64 n = 0;
// Note: the following additions never overflow.
if (ReduceThousandsDigit(begin, end, &n)) {
*num += n;
success = true;
}
if (ReduceHundredsDigit(begin, end, &n)) {
*num += n;
success = true;
}
if (ReduceTensDigit(begin, end, &n)) {
*num += n;
success = true;
}
if (ReduceOnesDigit(begin, end, &n)) {
*num += n;
success = true;
}
// If at least one reduce was successful, no number remains in the sequence or
// the next number should be a base number greater than 1000 (e.g., 10000,
// 100000, etc.). Strictly speaking, better to check **begin % 10 == 0.
return success && (*begin == end || **begin >= 10000);
}
// Interprets a sequence of numbers in a Japanese reading way. For example:
// "一万二千三百四十五" = [1, 10000, 2, 1000, 3, 100, 4, 10, 5] => 12345
// Base-10 numbers must be decreasing, i.e.,
// "一十二百" = [1, 10, 2, 100] => error
bool InterpretNumbersInJapaneseWay(const vector<uint64> &numbers,
uint64 *output) {
uint64 last_base = kuint64max;
auto begin = numbers.begin();
*output = 0;
do {
uint64 coef = 0;
if (!ReduceNumberLessThan10000(&begin, numbers.end(), &coef)) {
return false;
}
if (begin == numbers.end()) {
return AddAndCheckOverflow(*output, coef, output);
}
if (*begin >= last_base) {
return false; // Increasing order of base-10 numbers.
}
// Safely performs *output += coef * *begin.
uint64 delta = 0;
if (!MultiplyAndCheckOverflow(coef, *begin, &delta) ||
!AddAndCheckOverflow(*output, delta, output)) {
return false;
}
last_base = *begin++;
} while (begin != numbers.end());
return true;
}
// Interprets a sequence of numbers directly or in a Japanese reading way
// depending on the maximum number in the sequence.
bool NormalizeNumbersHelper(const vector<uint64> &numbers,
uint64 *number_output) {
const auto itr_max = max_element(numbers.begin(), numbers.end());
if (itr_max == numbers.end()) {
return false; // numbers is empty
}
// When no scaling number is found, convert number directly.
// For example, [5,4,3] => 543
if (*itr_max < 10) {
return InterpretNumbersAsBase10System(numbers, number_output);
}
return InterpretNumbersInJapaneseWay(numbers, number_output);
}
// TODO(peria): Do refactoring this method.
bool NormalizeNumbersInternal(StringPiece input,
bool trim_leading_zeros,
bool allow_suffix,
string *kanji_output,
string *arabic_output,
string *suffix) {
DCHECK(kanji_output);
DCHECK(arabic_output);
const char *begin = input.data();
const char *end = input.data() + input.size();
vector<uint64> numbers;
numbers.reserve(input.size());
// Map Kanji number string to digits, e.g., "二百十一" -> [2, 100, 10, 1].
// Simultaneously, constructs a Kanji number string.
kanji_output->clear();
arabic_output->clear();
string kanji_char;
while (begin < end) {
size_t mblen = 0;
const char32 wchar = Util::UTF8ToUCS4(begin, end, &mblen);
kanji_char.assign(begin, mblen);
string tmp;
NumberUtil::KanjiNumberToArabicNumber(kanji_char, &tmp);
uint64 n = 0;
if (!NumberUtil::SafeStrToUInt64(tmp, &n)) {
break;
}
if (wchar >= 0x0030 && wchar <= 0x0039) { // '0' <= wchar <= '9'
kanji_char.assign(kNumKanjiDigits[wchar - 0x0030], 3);
} else if (wchar >= 0xFF10 && wchar <= 0xFF19) { // '0' <= wchar <= '9'
kanji_char.assign(kNumKanjiDigits[wchar - 0xFF10], 3);
}
kanji_output->append(kanji_char);
numbers.push_back(n);
begin += mblen;
}
if (begin < end) {
if (!allow_suffix) {
return false;
}
DCHECK(suffix);
suffix->assign(begin, end);
}
if (numbers.empty()) {
return false;
}
// Try interpreting the sequence of digits.
uint64 n = 0;
if (!NormalizeNumbersHelper(numbers, &n)) {
return false;
}
if (!trim_leading_zeros) {
// If |numbers| contains only k zeros, add (k - 1) zeros to the output.
// Otherwise, add the same number of leading zeros.
size_t num_zeros;
for (num_zeros = 0; num_zeros < numbers.size(); ++num_zeros) {
if (numbers[num_zeros] != 0) {
break;
}
}
if (num_zeros == numbers.size()) {
--num_zeros;
}
arabic_output->append(num_zeros, kAsciiZero);
}
char buf[kMaxInt64Size];
snprintf(buf, sizeof(buf), "%llu", n);
*arabic_output += buf;
return true;
}
} // end of anonymous namespace
// Convert Kanji numbers into Arabic numbers:
// e.g. "百二十万" -> 1200000
bool NumberUtil::NormalizeNumbers(StringPiece input,
bool trim_leading_zeros,
string *kanji_output,
string *arabic_output) {
return NormalizeNumbersInternal(input,
trim_leading_zeros,
false, // allow_suffix
kanji_output,
arabic_output,
nullptr);
}
bool NumberUtil::NormalizeNumbersWithSuffix(StringPiece input,
bool trim_leading_zeros,
string *kanji_output,
string *arabic_output,
string *suffix) {
return NormalizeNumbersInternal(input,
trim_leading_zeros,
true, // allow_suffix
kanji_output,
arabic_output,
suffix);
}
namespace {
// Load Rules
// TODO(peria): Split following header file. No need to include Janapese
// character constants.
#include "base/japanese_util_rule.h"
} // namespace
void NumberUtil::KanjiNumberToArabicNumber(StringPiece input,
string *output) {
TextConverter::Convert(kanjinumber_to_arabicnumber_da,
kanjinumber_to_arabicnumber_table,
input,
output);
}
} // namespace mozc