// src/base/util.h - mozc - Git at Google

 // Copyright 2010-2015, Google Inc.
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
 //
 //     * Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
 //     * Redistributions in binary form must reproduce the above
 // copyright notice, this list of conditions and the following disclaimer
 // in the documentation and/or other materials provided with the
 // distribution.
 //     * Neither the name of Google Inc. nor the names of its
 // contributors may be used to endorse or promote products derived from
 // this software without specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #ifndef MOZC_BASE_UTIL_H_
 #define MOZC_BASE_UTIL_H_

 #include <climits>
 #include <ctime>
 #include <string>
 #include <utility>
 #include <vector>

 #include "base/logging.h"
 #include "base/port.h"
 #include "base/string_piece.h"

 // Forward declaration of the C time structure that the clock helpers in this
 // header (e.g. Util::GetCurrentTm) take by pointer.
 struct tm;

 namespace mozc {

 // SplitIterator - Iteratively splits a StringPiece to sub-StringPieces.
 //
 // This template class takes two template parameters, Delimiter and Option.
 //
 // Delimiter:
 //   - SingleDelimiter: Input is split by only one character.  If your
 //         delimiter is a single character, use this parameter because the
 //         algorithm is optimized for this common case.
 //   - MultiDelimiter: Input is split by any of the specified characters.
 //
 // Option:
 //   - SkipEmpty (default): empty pieces are ignored:
 //         ",a,b,,c," -> {"a", "b", "c"}  (delimiter = ',')
 //   - AllowEmpty: empty pieces are not ignored:
 //         ",a,b,,c," -> {"", "a", "b", "", "c", ""}  (delimiter = ',')
 //
 // Usage Example:
 //
 // // 1. SingleDelimiter and SkipEmpty
 // for (SplitIterator<SingleDelimiter> iter("this,is,,mozc", ",");
 //      !iter.Done(); iter.Next()) {
 //   StringPiece sp = iter.Get();  // "this", "is", and finally "mozc"
 //   ...
 // }
 //
 // // 2. SingleDelimiter and AllowEmpty
 // for (SplitIterator<SingleDelimiter, AllowEmpty> iter("this,is,,mozc", ",");
 //      !iter.Done(); iter.Next()) {
 //   StringPiece sp = iter.Get();  // "this", "is", "", and finally "mozc"
 //   ...
 // }
 //
 // // 3. MultiDelimiter and SkipEmpty
 // for (SplitIterator<MultiDelimiter> iter("this,is:\tmozc", ",:\t");
 //      !iter.Done(); iter.Next()) {
 //   StringPiece sp = iter.Get();  // "this", "is", and finally "mozc"
 //   ...
 // }
 //
 // // 4. MultiDelimiter and AllowEmpty
 // for (SplitIterator<MultiDelimiter, AllowEmpty>
 //          iter("this,is::\tmozc", ",:\t"); !iter.Done(); iter.Next()) {
 //   StringPiece sp = iter.Get();  // "this", "is", "", "", and finally "mozc"
 //   ...
 // }
 // Delimiter policy classes usable as the first template argument of
 // SplitIterator.  They are only forward-declared here; the definitions
 // appear later in this file.
 class SingleDelimiter;
 class MultiDelimiter;
 // Empty tag types usable as the second template argument of SplitIterator to
 // select whether empty pieces are skipped or reported.
 struct SkipEmpty {};
 struct AllowEmpty {};

 // Primary template of SplitIterator.  It only declares the iterator interface;
 // this file also declares partial specializations for the SkipEmpty and
 // AllowEmpty options.  |Option| defaults to SkipEmpty.
 template <typename Delimiter, typename Option = SkipEmpty>
 class SplitIterator {
  public:
   // |s| is the text to split and |delim| holds the delimiter character(s).
   SplitIterator(StringPiece s, const char *delim);
   // Returns the current piece.
   StringPiece Get() const;
   // Advances to the next piece.
   void Next();
   // Returns true once iteration is over (used as the loop condition in the
   // usage examples).
   bool Done() const;
 };

 // Collection of static helper functions: string manipulation, UTF-8 handling,
 // fingerprinting, random numbers, clock access, Japanese character form
 // transliteration, escaping, and character classification.  Every member
 // function declared here is static.  Apart from a few small inline wrappers,
 // only declarations are given in this header.
 class Util {
  public:
   // String utils

   // Appends a new empty string to |container| and then fills that new element
   // from |s| via StringPiece::CopyToString().
   template <typename StringContainer>
   static void PushBackStringPiece(StringPiece s, StringContainer *container) {
     container->push_back(string());
     s.CopyToString(&container->back());
   }

   static void SplitStringUsing(StringPiece str,
                                const char *delm,
                                vector<string> *output);
   static void SplitStringUsing(StringPiece str,
                                const char *delm,
                                vector<StringPiece> *output);

   static void SplitStringAllowEmpty(StringPiece str,
                                     const char *delm,
                                     vector<string> *output);

   static void SplitStringToUtf8Chars(const string &str,
                                      vector<string> *output);

   static void SplitCSV(const string &str, vector<string> *output);

   static void JoinStrings(const vector<string> &str,
                           const char *delm,
                           string *output);
   static void JoinStringPieces(const vector<StringPiece> &str,
                                const char *delm,
                                string *output);
   static void ConcatStrings(StringPiece s1, StringPiece s2, string *output);

   static void AppendStringWithDelimiter(StringPiece delimiter,
                                         StringPiece append_string,
                                         string *output);

   static void StringReplace(StringPiece s, StringPiece oldsub,
                             StringPiece newsub, bool replace_all,
                             string *res);

   static void LowerString(string *output);
   static void UpperString(string *output);

   // Transforms the first character to upper case and the trailing characters
   // to lower case.  ex. "abCd" => "Abcd".
   static void CapitalizeString(string *output);

   // Returns true if the characters in |s| are all in lower case ASCII.
   static bool IsLowerAscii(StringPiece s);

   // Returns true if the characters in |s| are all in upper case ASCII.
   static bool IsUpperAscii(StringPiece s);

   // Returns true if the text in |s| is capitalized ASCII.
   static bool IsCapitalizedAscii(StringPiece s);

   // Returns true if the characters in |s| are all in lower case ASCII or all
   // in upper case ASCII. Namely, equivalent to
   //     IsLowerAscii(s) || IsUpperAscii(s)
   static bool IsLowerOrUpperAscii(StringPiece s);

   // Returns true if the text in |s| is 1) all in upper case ASCII, or
   // 2) capitalized.
   static bool IsUpperOrCapitalizedAscii(StringPiece s);

   // Strips the leading/trailing white spaces from the input and stores it to
   // the output.  If the input does not have such white spaces, this method just
   // copies the input into the output.  It clears the output always.
   static void StripWhiteSpaces(const string &str, string *output);

   static size_t OneCharLen(const char *src);

   static size_t CharsLen(const char *src, size_t size);

   // Convenience overload: forwards the data pointer and size of |str| to
   // CharsLen(const char *, size_t).
   static size_t CharsLen(StringPiece str) {
     return CharsLen(str.data(), str.size());
   }

   static char32 UTF8ToUCS4(const char *begin,
                            const char *end,
                            size_t *mblen);

   // Returns true if |s| is split into |first_char32| + |rest|.
   // You can pass NULL to |first_char32| and/or |rest| to ignore the matched
   // value.
   // Returns false if an invalid UTF-8 sequence is prefixed. That is, |rest| may
   // contain any invalid sequence even when this method returns true.
   static bool SplitFirstChar32(StringPiece s,
                                char32 *first_char32,
                                StringPiece *rest);

   // Returns true if |s| is split into |rest| + |last_char32|.
   // You can pass NULL to |rest| and/or |last_char32| to ignore the matched
   // value.
   // Returns false if an invalid UTF-8 sequence is suffixed. That is, |rest| may
   // contain any invalid sequence even when this method returns true.
   static bool SplitLastChar32(StringPiece s,
                               StringPiece *rest,
                               char32 *last_char32);

   static void UCS4ToUTF8(char32 c, string *output);
   static void UCS4ToUTF8Append(char32 c, string *output);

 #ifdef OS_WIN
   // Returns how many wide characters are necessary in UTF-16 to represent
   // given UTF-8 string. Note that the result of this method becomes greater
   // than that of Util::CharsLen if |src| contains any character which is
   // encoded by the surrogate-pair in UTF-16.
   static size_t WideCharsLen(StringPiece src);
   // Converts the encoding of the specified string from UTF-8 to UTF-16, and
   // vice versa.
   static int UTF8ToWide(StringPiece input, wstring *output);
   static int WideToUTF8(const wchar_t *input, string *output);
   static int WideToUTF8(const wstring &input, string *output);
 #endif  // OS_WIN

   // Extracts a substring range, where both start and length are in terms of
   // UTF8 size. Note that the returned string piece refers to the same memory
   // block as the input.
   static StringPiece SubStringPiece(StringPiece src,
                                     size_t start, size_t length);
   // This version extracts the substring to the end.
   static StringPiece SubStringPiece(StringPiece src, size_t start);

   // Extracts a substring of length |length| starting at |start|.
   // Note: |start| is the start position in UTF8, not byte position.
   static void SubString(StringPiece src, size_t start, size_t length,
                         string *result);

   // Convenience overload that returns the substring by value; it delegates to
   // the output-parameter version above.
   static string SubString(StringPiece src, size_t start, size_t length) {
     string result;
     SubString(src, start, length, &result);
     return result;
   }

   // Determines whether the beginning of |str| matches |prefix|.
   static bool StartsWith(StringPiece str, StringPiece prefix);

   // Determines whether the end of |str| matches |suffix|.
   static bool EndsWith(StringPiece str, StringPiece suffix);

   // Strips a leading UTF-8 BOM (byte order mark) sequence (= \xef\xbb\xbf).
   static void StripUTF8BOM(string *line);

   // Returns true if the line starts with a UTF16-LE/UTF16-BE BOM.
   static bool IsUTF16BOM(const string &line);

   // Returns true if the given |s| has only one ucs4 character, and it is
   // in the range of Android Emoji PUA.
   static bool IsAndroidPuaEmoji(StringPiece s);

   // C++ string version of sprintf.
   static string StringPrintf(const char *format, ...)
       // Tell the compiler to do printf format string checking.
       PRINTF_ATTRIBUTE(1, 2);

   // Chop the return characters (i.e. '\n' and '\r') at the end of the
   // given line.
   static bool ChopReturns(string *line);

   // 32bit Fingerprint
   static uint32 Fingerprint32(const string &key);
   static uint32 Fingerprint32(const char *str, size_t length);
   static uint32 Fingerprint32(const char *str);

   static uint32 Fingerprint32WithSeed(const string &key,
                                       uint32 seed);
   static uint32 Fingerprint32WithSeed(const char *str,
                                       size_t length, uint32 seed);
   static uint32 Fingerprint32WithSeed(const char *str,
                                       uint32 seed);
   static uint32 Fingerprint32WithSeed(uint32 num, uint32 seed);

   // 64bit Fingerprint
   static uint64 Fingerprint(const string &key);
   static uint64 Fingerprint(const char *str, size_t length);

   static uint64 FingerprintWithSeed(const string &key, uint32 seed);

   static uint64 FingerprintWithSeed(const char *str,
                                     size_t length, uint32 seed);

   // Generate a random sequence. It uses secure method if possible, or Random()
   // as a fallback method.
   static void GetRandomSequence(char *buf, size_t buf_size);
   static void GetRandomAsciiSequence(char *buf, size_t buf_size);

   // Return random variable whose range is [0..size-1].
   // This function uses rand() internally, so don't use it for
   // security-sensitive purpose.
   // Caveats: The returned value does not have good granularity especially
   // when |size| is larger than |RAND_MAX|.
   // TODO(yukawa): Improve the granularity.
   // TODO(yukawa): Clarify the semantics when |size| is 0 or smaller.
   static int Random(int size);

   // Set the seed of Util::Random().
   static void SetRandomSeed(uint32 seed);

   // Get the current time info using gettimeofday-like functions.
   // sec: number of seconds from epoch
   // usec: micro-second passed: [0,1000000)
   static void GetTimeOfDay(uint64 *sec, uint32 *usec);

   // Get the current time info using time-like function
   // For Windows, _time64() is used.
   // For Linux/Mac, time() is used.
   static uint64 GetTime();

   // Get the current local time to current_time.  Returns true if succeeded.
   static bool GetCurrentTm(tm *current_time);
   // Get local time, which is offset_sec seconds after now. Returns true if
   // succeeded.
   static bool GetTmWithOffsetSecond(tm *time_with_offset, int offset_sec);

   // Get the system frequency to calculate the time from ticks.
   static uint64 GetFrequency();

   // Get the current ticks. It may return incorrect value on Virtual Machines.
   // If you'd like to get a value in secs, it is necessary to divide a result by
   // GetFrequency().
   static uint64 GetTicks();

 #ifdef __native_client__
   // Sets the time difference between local time and UTC time in seconds.
   // We use this function in NaCl Mozc because we can't know the local timezone
   // in NaCl environment.
   static void SetTimezoneOffset(int32 timezone_offset_sec);
 #endif  // __native_client__

   // Interface of the helper class.
   // Default implementation is defined in the .cc file.
   // NOTE: GetTmWithOffsetSecond() here takes (offset, output), i.e. the
   // opposite parameter order of Util::GetTmWithOffsetSecond().
   class ClockInterface {
    public:
     virtual ~ClockInterface() {}
     virtual void GetTimeOfDay(uint64 *sec, uint32 *usec) = 0;
     virtual uint64 GetTime() = 0;
     virtual bool GetTmWithOffsetSecond(time_t offset_sec, tm *output) = 0;

     // High accuracy clock.
     virtual uint64 GetFrequency() = 0;
     virtual uint64 GetTicks() = 0;
 #ifdef __native_client__
     virtual void SetTimezoneOffset(int32 timezone_offset_sec) = 0;
 #endif  // __native_client__
   };

   // This function is provided for test.
   // The behavior of system clock can be customized by replacing this handler.
   static void SetClockHandler(Util::ClockInterface *handler);

   // Suspends the execution of the current thread until
   // the time-out interval elapses.
   static void Sleep(uint32 msec);

   // Japanese utilities for character form transliteration.
   static void HiraganaToKatakana(StringPiece input, string *output);
   static void HiraganaToHalfwidthKatakana(StringPiece input, string *output);
   static void HiraganaToRomanji(StringPiece input, string *output);
   static void HalfWidthAsciiToFullWidthAscii(StringPiece input, string *output);
   static void FullWidthAsciiToHalfWidthAscii(StringPiece input, string *output);
   static void HiraganaToFullwidthRomanji(StringPiece input, string *output);
   static void RomanjiToHiragana(StringPiece input, string *output);
   static void KatakanaToHiragana(StringPiece input, string *output);
   static void HalfWidthKatakanaToFullWidthKatakana(StringPiece input,
                                                    string *output);
   static void FullWidthKatakanaToHalfWidthKatakana(StringPiece input,
                                                    string *output);
   static void FullWidthToHalfWidth(StringPiece input, string *output);
   static void HalfWidthToFullWidth(StringPiece input, string *output);

   // Returns true if all chars in input are both defined
   // in full width and half-width-katakana area
   static bool IsFullWidthSymbolInHalfWidthKatakana(const string &input);

   // Returns true if all chars are defined in half-width-katakana area.
   static bool IsHalfWidthKatakanaSymbol(const string &input);

   // Returns true if one or more Kana-symbol characters are in the input.
   static bool IsKanaSymbolContained(const string &input);

   // Returns true if |input| looks like a pure English word.
   static bool IsEnglishTransliteration(const string &input);

   static void NormalizeVoicedSoundMark(StringPiece input, string *output);

   // Returns true if key is an open bracket.  If key is an open bracket,
   // corresponding close bracket is assigned.
   static bool IsOpenBracket(const string &key, string *close_bracket);

   // Returns true if key is a close bracket.  If key is a close bracket,
   // corresponding open bracket is assigned.
   static bool IsCloseBracket(const string &key, string *open_bracket);

   // Code converter
 #ifndef OS_WIN
   static void UTF8ToEUC(const string &input, string *output);
   static void EUCToUTF8(const string &input, string *output);
 #endif  // !OS_WIN

   static void UTF8ToSJIS(const string &input, string *output);
   static void SJISToUTF8(const string &input, string *output);

   static void EncodeURI(const string &input, string *output);
   static void DecodeURI(const string &input, string *output);

   // Make a string for CGI parameters from params and append it to
   // base.  The result looks like:
   //   <base><key1>=<encoded val1>&<key2>=<encoded val2>
   // The base is supposed to end with "?" or "&".
   static void AppendCGIParams(const vector<pair<string, string> > &params,
                               string *base);

   // Escape any characters into \x prefixed hex digits.
   // ex.  "ABC" => "\x41\x42\x43".
   static void Escape(const string &input, string *output);

   // Escape any characters into % prefixed hex digits.
   // ex. "ABC" => "%41%42%43"
   static void EscapeUrl(const string &input, string *output);
   static string EscapeUrl(const string &input);

   // Escape/Unescape unsafe html characters such as <, > and &.
   static void EscapeHtml(const string &text, string *res);
   static void UnescapeHtml(const string &text, string *res);

   // Escape unsafe CSS characters like <.  Note > and & are not
   // escaped because they are operands of CSS.
   static void EscapeCss(const string &text, string *result);

   enum ScriptType {
     UNKNOWN_SCRIPT,
     KATAKANA,
     HIRAGANA,
     KANJI,
     NUMBER,
     ALPHABET,
     EMOJI,
     SCRIPT_TYPE_SIZE,
   };

   // return script type of w
   static ScriptType GetScriptType(char32 w);

   // return script type of first character in [begin, end)
   // This function finds the first UTF-8 chars and returns its script type.
   // The length of the character will be returned in *mblen.
   // This function calls GetScriptType(char32) internally.
   static ScriptType GetScriptType(const char *begin, const char *end,
                                   size_t *mblen);

   // return script type of first character in str
   static ScriptType GetFirstScriptType(const string &str);

   // return script type of string. all chars in str must be
   // KATAKANA/HIRAGANA/KANJI/NUMBER or ALPHABET.
   // If str has mixed scripts, this function returns UNKNOWN_SCRIPT
   static ScriptType GetScriptType(const string &str);

   // The same as GetScriptType(), but it ignores symbols
   // in the |str|.
   static ScriptType GetScriptTypeWithoutSymbols(const string &str);

   // return true if all script_type in str is "type"
   static bool IsScriptType(StringPiece str, ScriptType type);

   // return true if the string contains script_type char
   static bool ContainsScriptType(StringPiece str, ScriptType type);

   // See 'Unicode Standard Annex #11: EAST ASIAN WIDTH'
   // http://www.unicode.org/reports/tr11/
   // http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
   enum FormType {
     UNKNOWN_FORM,
     HALF_WIDTH,  // [Na] and [H] in 'Unicode Standard Annex #11'
     FULL_WIDTH,  // Any other characters
     FORM_TYPE_SIZE,
   };

   // return Form type of single character.
   // This function never returns UNKNOWN_FORM.
   static FormType GetFormType(char32 w);

   // return FormType of string.
   // return UNKNOWN_FORM if |str| contains both HALF_WIDTH and FULL_WIDTH.
   static FormType GetFormType(const string &str);

   // Basically, if charset >= JISX0212, the char is platform dependent char.
   enum CharacterSet {
     ASCII,         // ASCII (simply ucs4 <= 0x007F)
     JISX0201,      // defined at least in 0201 (can be in 0208/0212/0213/CP932)
     JISX0208,      // defined at least in 0208 (can be in 0212/0213/CP932)
     JISX0212,      // defined at least in 0212 (can be in 0213/CP932)
     JISX0213,      // defined at least in 0213 (can be in CP932)
     CP932,         // defined only in CP932, not in JISX02*
     UNICODE_ONLY,  // defined only in UNICODE, not in JISX* nor CP932
     CHARACTER_SET_SIZE,
   };

   // return CharacterSet
   static CharacterSet GetCharacterSet(char32 ucs4);

   // return CharacterSet of string.
   // if the given string contains multiple character sets, return
   // the maximum character set.
   static CharacterSet GetCharacterSet(const string &str);

  private:
   // Util only groups static functions; presumably this macro prevents the
   // class from being instantiated (macro is defined outside this file).
   DISALLOW_IMPLICIT_CONSTRUCTORS(Util);
 };

 // Const iterator implementation to traverse a (UTF-8) string as a char32
 // string.  Only the interface is declared here; the member functions are not
 // defined in this header.
 //
 // Example usage:
 //   string utf8;
 //   for (ConstChar32Iterator iter(utf8); !iter.Done(); iter.Next()) {
 //     char32 c = iter.Get();
 //     ...
 //   }
 class ConstChar32Iterator {
  public:
   // |utf8_string| is a StringPiece, i.e. a view of the caller's data.
   // NOTE(review): presumably the underlying buffer must outlive the
   // iterator -- confirm against the implementation.
   explicit ConstChar32Iterator(StringPiece utf8_string);
   // Returns the character at the current position.
   char32 Get() const;
   // Advances to the next character.
   void Next();
   // Returns true when the traversal is over (loop condition in the example).
   bool Done() const;

  private:
   // NOTE(review): the roles below are inferred from the member names; verify
   // against the definitions in the .cc file.
   StringPiece utf8_string_;  // presumably the part of the input still to read
   char32 current_;           // presumably the value returned by Get()
   bool done_;                // presumably the value returned by Done()

   DISALLOW_COPY_AND_ASSIGN(ConstChar32Iterator);
 };

 // Const reverse iterator implementation to traverse a (UTF-8) string as a
 // char32 string.  Only the interface is declared here; the member functions
 // are not defined in this header.
 //
 // Example usage:
 //   string utf8;
 //   for (ConstChar32ReverseIterator iter(utf8); !iter.Done(); iter.Next()) {
 //     char32 c = iter.Get();
 //     ...
 //   }
 class ConstChar32ReverseIterator {
  public:
   // |utf8_string| is a StringPiece, i.e. a view of the caller's data.
   // NOTE(review): presumably the underlying buffer must outlive the
   // iterator -- confirm against the implementation.
   explicit ConstChar32ReverseIterator(StringPiece utf8_string);
   // Returns the character at the current position.
   char32 Get() const;
   // Moves to the next character in iteration order (reverse traversal, per
   // the class name).
   void Next();
   // Returns true when the traversal is over (loop condition in the example).
   bool Done() const;

  private:
   // NOTE(review): the roles below are inferred from the member names; verify
   // against the definitions in the .cc file.
   StringPiece utf8_string_;  // presumably the part of the input still to read
   char32 current_;           // presumably the value returned by Get()
   bool done_;                // presumably the value returned by Done()

   DISALLOW_COPY_AND_ASSIGN(ConstChar32ReverseIterator);
 };

 // Actual definitions of delimiter classes.
 // Delimiter policy for SplitIterator that matches exactly one character.
 class SingleDelimiter {
  public:
   // Uses the first character of |delim| as the delimiter; any characters
   // after it are not read.  |delim| is dereferenced unconditionally, so it
   // must not be NULL.
   explicit SingleDelimiter(const char *delim)
       : delim_(delim[0]) {
   }

   // Returns true if |c| equals the stored delimiter character.
   bool Contains(char c) const {
     return delim_ == c;
   }

  private:
   // The single delimiter character captured at construction time.
   const char delim_;
   DISALLOW_COPY_AND_ASSIGN(SingleDelimiter);
 };

 // Delimiter policy for SplitIterator that matches any character out of a set.
 // Membership is looked up in a bit field indexed by the unsigned char value.
 class MultiDelimiter {
  public:
   // Number of bytes in the lookup bit field: one bit for each of the
   // UCHAR_MAX + 1 possible unsigned char values.  Note that UCHAR_MAX / 8
   // would round down (255 / 8 == 31) and leave the bits for 0xF8-0xFF outside
   // the array, making Contains() read out of bounds for those bytes.
   static const size_t kTableSize = (UCHAR_MAX + 1) / 8;

   // |delim| lists the delimiter characters.  Defined outside this header.
   explicit MultiDelimiter(const char* delim);

   // Returns true if the bit for |c| is set in the lookup table.
   bool Contains(char c) const {
     // Convert to unsigned first so that bytes >= 0x80 do not produce a
     // negative index when char is signed.
     const unsigned char uc = static_cast<unsigned char>(c);
     // Upper five bits select the byte, lower three bits select the bit.
     return (lookup_table_[uc >> 3] & (1 << (uc & 0x07))) != 0;
   }

  private:
   // Bit field for looking up delimiters. Think of this as a 256-bit array where
   // n-th bit is set to 1 if the delimiters contain a character whose unsigned
   // char code is n.
   unsigned char lookup_table_[kTableSize];
   DISALLOW_COPY_AND_ASSIGN(MultiDelimiter);
 };

 // Declarations of the partial specializations of SplitIterator for two options.
 // Implementations are explicitly instantiated in util.cc.
 // Partial specialization of SplitIterator for the SkipEmpty option.
 // Get() and Done() are defined inline; the constructor and Next() are only
 // declared here.
 template <typename Delimiter>
 class SplitIterator<Delimiter, SkipEmpty> {
  public:
   // |s| is the text to split and |delim| holds the delimiter character(s).
   SplitIterator(StringPiece s, const char *delim);
   // Returns the current piece as a view of |sp_len_| bytes starting at
   // |sp_begin_|; no data is copied.
   StringPiece Get() const { return StringPiece(sp_begin_, sp_len_); }
   // Iteration is finished once the current piece starts at |end_|.
   bool Done() const { return sp_begin_ == end_; }
   // Advances to the next piece.
   void Next();

  private:
   // Compared against |sp_begin_| to detect completion; presumably the end of
   // the input -- confirm in the .cc file.
   const char *const end_;
   // Delimiter policy object of the template's Delimiter type.
   const Delimiter delim_;
   // Start of the current piece.
   const char *sp_begin_;
   // Length in bytes of the current piece.
   StringPiece::size_type sp_len_;

   DISALLOW_COPY_AND_ASSIGN(SplitIterator);
 };

 // Partial specialization of SplitIterator for the AllowEmpty option.
 // Get() and Done() are defined inline; the constructor and Next() are only
 // declared here.
 template <typename Delimiter>
 class SplitIterator<Delimiter, AllowEmpty> {
  public:
   // |s| is the text to split and |delim| holds the delimiter character(s).
   SplitIterator(StringPiece s, const char *delim);
   // Returns the current piece as a view of |sp_len_| bytes starting at
   // |sp_begin_|; no data is copied.
   StringPiece Get() const { return StringPiece(sp_begin_, sp_len_); }
   // Completion is reported through an explicit flag rather than a pointer
   // comparison.  NOTE(review): presumably because an empty last piece would
   // also start at the end of the input -- confirm in the .cc file.
   bool Done() const { return done_; }
   // Advances to the next piece.
   void Next();

  private:
   // Presumably the end of the input -- confirm in the .cc file.
   const char *const end_;
   // Delimiter policy object of the template's Delimiter type.
   const Delimiter delim_;
   // Start of the current piece.
   const char *sp_begin_;
   // Length in bytes of the current piece.
   StringPiece::size_type sp_len_;
   // Value returned by Done().
   bool done_;

   DISALLOW_COPY_AND_ASSIGN(SplitIterator);
 };

 }  // namespace mozc

 #endif  // MOZC_BASE_UTIL_H_