| // Copyright 2010-2015, Google Inc. |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| #include "dictionary/system/codec.h" |
| |
| #include <sstream> |
| |
| #include "base/logging.h" |
| #include "base/port.h" |
| #include "base/singleton.h" |
| #include "base/string_piece.h" |
| #include "base/util.h" |
| #include "dictionary/dictionary_token.h" |
| #include "dictionary/system/words_info.h" |
| |
| namespace mozc { |
| namespace dictionary { |
| namespace { |
| void EncodeDecodeKeyImpl(const StringPiece src, string *dst); |
| size_t GetEncodedDecodedKeyLengthImpl(const StringPiece src); |
| |
| uint8 GetFlagsForToken(const vector<TokenInfo> &tokens, int index); |
| |
| uint8 GetFlagForPos(const TokenInfo &token_info, const Token *token); |
| |
| uint8 GetFlagForValue(const TokenInfo &token_info, const Token *token); |
| |
| void EncodeCost(const TokenInfo &token_info, uint8 *dst, int *offset); |
| |
| void EncodePos( |
| const TokenInfo &token_info, uint8 flags, uint8 *dst, int *offset); |
| |
| void EncodeValueInfo( |
| const TokenInfo &token_info, uint8 flags, uint8 *dst, int *offset); |
| |
| uint8 ReadFlags(uint8 val); |
| |
| void DecodeCost(const uint8 *ptr, TokenInfo *token, int *offset); |
| |
| void DecodePos(const uint8 *ptr, uint8 flags, TokenInfo *token, int *offset); |
| |
| void DecodeValueInfo( |
| const uint8 *ptr, uint8 flags, TokenInfo *token_info, int *offset); |
| |
| void ReadValueInfo( |
| const uint8 *ptr, uint8 flags, int *value_id, int *offset); |
| |
| //// Constants for section name //// |
| const char kKeySectionName[] = "k"; |
| const char kValueSectionName[] = "v"; |
| const char kTokensSectionName[] = "t"; |
| const char kPosSectionName[] = "p"; |
| |
| //// Constants for validation //// |
| // 12 bits |
| const int kPosMax = 0x0fff; |
| // 15 bits |
| const int kCostMax = 0x7fff; |
| // 22 bits |
| const int kValueTrieIdMax = 0x3fffff; |
| |
| //// Constants for value //// |
| // Unused for now. |
| // We are using from 0x00~0xfa for the Kanji, Hiragana and Katakana. |
| // Please see the comments for EncodeValue for details. |
| // const uint8 kValueCharMarkReserved = 0xfb; |
| // ASCII character. |
| const uint8 kValueCharMarkAscii = 0xfc; |
| // UCS4 character 0x??00. |
| const uint8 kValueCharMarkXX00 = 0xfd; |
| // This UCS4 character is neither Hiragana nor above 2 patterns 0x???? |
| const uint8 kValueCharMarkOtherUCS2 = 0xfe; |
| |
| // UCS4 character 0x00?????? (beyond UCS2 range) |
| // UCS4 characters never exceed 10FFFF. (three 8bits, A-B-C). |
| // For left most 8bits A, we will use upper 2bits for the flag |
| // that indicating whether B and C is 0 or not. |
| const uint8 kValueCharMarkUCS4 = 0xff; |
| const uint8 kValueCharMarkUCS4Middle0 = 0x80; |
| const uint8 kValueCharMarkUCS4Right0 = 0x40; |
| const uint8 kValueCharMarkUCS4LeftMask = 0x1f; |
| |
| // character code related constants |
| const int kValueKanjiOffset = 0x01; |
| const int kValueHiraganaOffset = 0x4b; |
| const int kValueKatakanaOffset = 0x9f; |
| |
| //// Cost encoding flag //// |
| const uint8 kSmallCostFlag = 0x80; |
| const uint8 kSmallCostMask = 0x7f; |
| |
| //// Flags for token //// |
| const uint8 kTokenTerminationFlag = 0xff; |
| // Note that the flag for the first token for a certain key cannot be 0xff. |
| // First token cannot be kSameAsPrevValueFlag(0x33) nor kSameAsPrevPosFlag(0x0c) |
| |
| // 7 kLastTokenFlag |
| // 6 <id encoding> |
| // below bits will be used for upper 6 bits of token value |
| // when CRAM_VALUE_FLAG is set. |
| // 5 <reserved(unused)> |
| // 4 kSpellingCorrectionFlag |
| // 3 <pos encoding(high)> |
| // 2 <pos encoding(low)> |
| // 1 <value encoding(high)> |
| // 0 <value encoding(low)> |
| |
| //// Value encoding flag //// |
| // There are 4 mutually exclusive cases |
| // 1) Same as index hiragana key |
| // 2) Value is katakana |
| // 3) Same as previous token |
| // 4) Others. We have to store the value |
| const uint8 kValueTypeFlagMask = 0x03; |
| // Same as index hiragana word |
| const uint8 kAsIsHiraganaValueFlag = 0x01; |
| // Same as index katakana word |
| const uint8 kAsIsKatakanaValueFlag = 0x2; |
| // has same word |
| const uint8 kSameAsPrevValueFlag = 0x03; |
| // other cases |
| const uint8 kNormalValueFlag = 0x00; |
| |
| //// Pos encoding flag //// |
| // There are 4 mutually exclusive cases |
| // 1) Same pos with previous token |
| // 2) Not same, frequent 1 byte pos |
| // 3) Not same, full_pos but lid==rid, 2 byte |
| // 4) Not same, full_pos 4 byte (no flag for this) |
| const uint8 kPosTypeFlagMask = 0x0c; |
| // Pos(left/right ID) is coded into 3 bytes |
| // Note that lid/rid is less than 12 bits |
| // We need 24 bits (= 3 bytes) to store full pos. |
| const uint8 kFullPosFlag = 0x04; |
| // lid == rid 8 bits |
| const uint8 kMonoPosFlag = 0x08; |
| // has same left/right id as previous token |
| const uint8 kSameAsPrevPosFlag = 0x0c; |
| // frequent |
| const uint8 kFrequentPosFlag = 0x00; |
| |
| //// Spelling Correction flag //// |
| const uint8 kSpellingCorrectionFlag = 0x10; |
| |
| //// Reverved //// |
| // You can use one more flag! |
| // const int kReservedFlag = 0x20; |
| |
| //// Id encoding flag //// |
| // According to lower 6 bits of flags there are 2 patterns. |
| // 1) lower 6 bits are used. |
| // - Store an id in a trie use 3 bytes |
| // 2) lower 6 bits are not used. |
| // - Set CRAM_VALUE_FLAGS and use lower 6 bits. |
| // We need another 2 bytes to store the id in the trie. |
| // Note that we are assuming each id in the trie is less than 22 bits. |
| // Lower 6 bits of flags field are used to store upper part of id |
| // in value trie. |
| const uint8 kCrammedIDFlag = 0x40; |
| // Mask to cover upper valid 2bits when kCrammedIDFlag is used |
| const uint8 kUpperFlagsMask = 0xc0; |
| // Mask to get upper 6bits from flags value |
| const uint8 kUpperCrammedIDMask = 0x3f; |
| |
| //// Last token flag //// |
| // This token is last token for a index word |
| const uint8 kLastTokenFlag = 0x80; |
| } // namespace |
| |
| SystemDictionaryCodec::SystemDictionaryCodec() {} |
| |
| SystemDictionaryCodec::~SystemDictionaryCodec() {} |
| |
| const string SystemDictionaryCodec::GetSectionNameForKey() const { |
| return kKeySectionName; |
| } |
| |
| const string SystemDictionaryCodec::GetSectionNameForValue() const { |
| return kValueSectionName; |
| } |
| |
| const string SystemDictionaryCodec::GetSectionNameForTokens() const { |
| return kTokensSectionName; |
| } |
| |
| const string SystemDictionaryCodec::GetSectionNameForPos() const { |
| return kPosSectionName; |
| } |
| |
| void SystemDictionaryCodec::EncodeKey( |
| const StringPiece src, string *dst) const { |
| EncodeDecodeKeyImpl(src, dst); |
| } |
| |
| void SystemDictionaryCodec::DecodeKey( |
| const StringPiece src, string *dst) const { |
| EncodeDecodeKeyImpl(src, dst); |
| } |
| |
| size_t SystemDictionaryCodec::GetEncodedKeyLength( |
| const StringPiece src) const { |
| return GetEncodedDecodedKeyLengthImpl(src); |
| } |
| |
| size_t SystemDictionaryCodec::GetDecodedKeyLength( |
| const StringPiece src) const { |
| return GetEncodedDecodedKeyLengthImpl(src); |
| } |
| |
| // This encodes each UCS4 character into following areas |
| // The trickier part in this encoding is handling of \0 byte in UCS4 |
| // character. To avoid \0 in converted string, this function uses |
| // VALUE_CHAR_MARK_* markers. |
| // Kanji in 0x4e00~0x97ff -> 0x01 0x00 ~ 0x4a 0xff (74*256 characters) |
| // Hiragana 0x3041~0x3095 -> 0x4b~0x9f (84 characters) |
| // Katakana 0x30a1~0x30fc -> 0x9f~0xfa (91 characters) |
| // 0x?? (ASCII) -> VALUE_CHAR_MARK_ASCII ?? |
| // 0x??00 -> VALUE_CHAR_MARK_XX00 ?? |
| // Other 0x?? ?? -> VALUE_CHAR_MARK_OTHER ?? ?? |
| // 0x?????? -> VALUE_CHAR_MARK_BIG ?? ?? ?? |
| |
| void SystemDictionaryCodec::EncodeValue( |
| const StringPiece src, string *dst) const { |
| DCHECK(dst); |
| for (ConstChar32Iterator iter(src); !iter.Done(); iter.Next()) { |
| static_assert(sizeof(uint32) == sizeof(char32), |
| "char32 must be 32-bit integer size."); |
| const uint32 c = iter.Get(); |
| if (c >= 0x3041 && c < 0x3095) { |
| // Hiragana(85 characters) are encoded into 1 byte. |
| dst->push_back(c - 0x3041 + kValueHiraganaOffset); |
| } else if (c >= 0x30a1 && c < 0x30fd) { |
| // Katakana (92 characters) are encoded into 1 byte. |
| dst->push_back(c - 0x30a1 + kValueKatakanaOffset); |
| } else if (c < 0x10000 && ((c >> 8) & 255) == 0) { |
| // 0x00?? (ASCII) are encoded into 2 bytes. |
| dst->push_back(kValueCharMarkAscii); |
| dst->push_back(c & 255); |
| } else if (c < 0x10000 && (c & 255) == 0) { |
| // 0x??00 are encoded into 2 bytes. |
| dst->push_back(kValueCharMarkXX00); |
| dst->push_back((c >> 8) & 255); |
| } else if (c >= 0x4e00 && c < 0x9800) { |
| // Frequent Kanji and others (74*256 characters) are encoded |
| // into 2 bytes. |
| // (Kanji in 0x9800 to 0x9fff are encoded in 3 bytes) |
| const int h = ((c - 0x4e00) >> 8) + kValueKanjiOffset; |
| dst->push_back(h); |
| dst->push_back(c & 255); |
| } else if (0x10000 <= c && c <= 0x10ffff) { |
| // charaters encoded into 2-4bytes. |
| int left = ((c >> 16) & 255); |
| const int middle = ((c >> 8) & 255); |
| const int right = (c & 255); |
| if (middle == 0) { |
| left |= kValueCharMarkUCS4Middle0; |
| } |
| if (right == 0) { |
| left |= kValueCharMarkUCS4Right0; |
| } |
| dst->push_back(kValueCharMarkUCS4); |
| dst->push_back(left); |
| if (middle != 0) { |
| dst->push_back(middle); |
| } |
| if (right != 0) { |
| dst->push_back(right); |
| } |
| } else { |
| DCHECK_LE(c, 0x10ffff); |
| // Other charaters encoded into 3bytes. |
| dst->push_back(kValueCharMarkOtherUCS2); |
| dst->push_back((c >> 8) & 255); |
| dst->push_back(c & 255); |
| } |
| } |
| } |
| |
| void SystemDictionaryCodec::DecodeValue( |
| const StringPiece src, string *dst) const { |
| DCHECK(dst); |
| const uint8 *p = reinterpret_cast<const uint8 *>(src.data()); |
| const uint8 *const end = p + src.size(); |
| while (p < end) { |
| int cc = p[0]; |
| int c = 0; |
| if (kValueHiraganaOffset <= cc && cc < kValueKatakanaOffset) { |
| // Hiragana |
| c = 0x3041 + p[0] - kValueHiraganaOffset; |
| p += 1; |
| } else if (kValueKatakanaOffset <= cc && cc < kValueCharMarkAscii) { |
| // Katakana |
| c = 0x30a1 + p[0] - kValueKatakanaOffset; |
| p += 1; |
| } else if (cc == kValueCharMarkAscii) { |
| // Ascii |
| c = p[1]; |
| p += 2; |
| } else if (cc == kValueCharMarkXX00) { |
| // xx00 |
| c = (p[1] << 8); |
| p += 2; |
| } else if (cc == kValueCharMarkUCS4) { |
| // UCS4 |
| c = ((p[1] & kValueCharMarkUCS4LeftMask) << 16); |
| int pos = 2; |
| if (!(p[1] & kValueCharMarkUCS4Middle0)) { |
| c += (p[pos++] << 8); |
| } |
| if (!(p[1] & kValueCharMarkUCS4Right0)) { |
| c += p[pos++]; |
| } |
| p += pos; |
| } else if (cc == kValueCharMarkOtherUCS2) { |
| // other |
| c = (p[1] << 8); |
| c += p[2]; |
| p += 3; |
| } else if (cc < kValueHiraganaOffset) { |
| // Frequent kanji |
| c = (((p[0] - kValueKanjiOffset) << 8) + 0x4e00); |
| c += p[1]; |
| p += 2; |
| } else { |
| VLOG(1) << "should never come here"; |
| } |
| Util::UCS4ToUTF8Append(c, dst); |
| } |
| } |
| |
| uint8 SystemDictionaryCodec::GetTokensTerminationFlag() const { |
| return kTokenTerminationFlag; |
| } |
| |
| void SystemDictionaryCodec::EncodeTokens( |
| const vector<TokenInfo> &tokens, string *output) const { |
| DCHECK(output); |
| output->clear(); |
| |
| for (size_t i = 0; i < tokens.size(); ++i) { |
| EncodeToken(tokens, i, output); |
| } |
| CHECK((*output)[0] != GetTokensTerminationFlag()); |
| } |
| |
| // Each token is encoded as following. |
| // |
| // Flags: 1 byte |
| // Cost: |
| // For words without homonyms, 1 byte |
| // Other words, 2 bytes |
| // Pos: |
| // For pos same as the previous token, 0 byte |
| // For frequent pos, 1 byte |
| // For pos of left id == right id, 2 bytes |
| // For other pos-es left id + right id 3 bytes |
| // Index: (less than 2^22) |
| // When kCrammedIDFlag is set, 2 bytes |
| // Othewise, 3 bytes |
| void SystemDictionaryCodec::EncodeToken( |
| const vector<TokenInfo> &tokens, int index, string *output) const { |
| CHECK_LT(index, tokens.size()); |
| |
| // Determines the flags for this token. |
| const uint8 flags = GetFlagsForToken(tokens, index); |
| |
| // Encodes token into bytes. |
| uint8 buff[9]; |
| buff[0] = flags; |
| int offset = 1; |
| |
| const TokenInfo &token_info = tokens[index]; |
| EncodePos(token_info, flags, buff, &offset); // <= 3 bytes |
| EncodeCost(token_info, buff, &offset); // <= 2 bytes |
| EncodeValueInfo(token_info, flags, buff, &offset); // <= 3 bytes |
| |
| CHECK_LE(offset, 9); |
| output->append(reinterpret_cast<char *>(buff), offset); |
| } |
| |
| void SystemDictionaryCodec::DecodeTokens( |
| const uint8 *ptr, vector<TokenInfo> *tokens) const { |
| DCHECK(tokens); |
| int offset = 0; |
| while (true) { |
| int read_bytes = 0; |
| Token *token = new Token(); |
| tokens->push_back(TokenInfo(token)); |
| if (!DecodeToken(ptr + offset, &(tokens->back()), &read_bytes)) { |
| break; |
| } |
| DCHECK_GT(read_bytes, 0); |
| offset += read_bytes; |
| } |
| } |
| |
| bool SystemDictionaryCodec::DecodeToken( |
| const uint8 *ptr, TokenInfo *token_info, int *read_bytes) const { |
| DCHECK(ptr); |
| DCHECK(token_info); |
| DCHECK(read_bytes); |
| |
| const uint8 flags = ReadFlags(ptr[0]); |
| if (flags & kSpellingCorrectionFlag) { |
| token_info->token->attributes = Token::SPELLING_CORRECTION; |
| } |
| |
| int offset = 1; |
| DecodePos(ptr, flags, token_info, &offset); // <= 3bytes |
| DecodeCost(ptr, token_info, &offset); // <= 2bytes |
| DecodeValueInfo(ptr, flags, token_info, &offset); // <= 3bytes |
| DCHECK_LE(offset, 9); |
| *read_bytes = offset; |
| if (flags & kLastTokenFlag) { |
| return false; |
| } else { |
| return true; |
| } |
| } |
| |
| bool SystemDictionaryCodec::ReadTokenForReverseLookup( |
| const uint8 *ptr, int *value_id, int *read_bytes) const { |
| DCHECK(ptr); |
| DCHECK(value_id); |
| DCHECK(read_bytes); |
| |
| const uint8 flags = ReadFlags(ptr[0]); |
| int offset = 1; |
| // Read pos |
| const uint8 pos_flag = (flags & kPosTypeFlagMask); |
| if (pos_flag == kFrequentPosFlag) { |
| offset += 1; |
| } else if (pos_flag == kMonoPosFlag) { |
| offset += 2; |
| } else if (pos_flag == kFullPosFlag) { |
| offset += 3; |
| } |
| // Read cost |
| if ((ptr[offset] & kSmallCostFlag)) { |
| offset += 1; |
| } else { |
| offset += 2; |
| } |
| ReadValueInfo(ptr, flags, value_id, &offset); |
| *read_bytes = offset; |
| return !(flags & kLastTokenFlag); |
| } |
| |
| |
| namespace { |
| |
| // Swap the area for Hiragana, prolonged sound mark and middle dot with |
| // the one for control codes and alphabets. |
| // |
| // U+3041 - U+305F ("ぁ" - "た") <=> U+0001 - U+001F |
| // U+3060 - U+3095 ("だ" - "ゕ") <=> U+0040 - U+0075 |
| // U+30FB - U+30FC ("・" - "ー") <=> U+0076 - U+0077 |
| // |
| // U+0020 - U+003F are left intact to represent numbers and hyphen in 1 byte. |
| void EncodeDecodeKeyImpl(const StringPiece src, string *dst) { |
| for (ConstChar32Iterator iter(src); !iter.Done(); iter.Next()) { |
| static_assert(sizeof(uint32) == sizeof(char32), |
| "char32 must be 32-bit integer size."); |
| uint32 code = iter.Get(); |
| int32 offset = 0; |
| if ((code >= 0x0001 && code <= 0x001f) || |
| (code >= 0x3041 && code <= 0x305f)) { |
| offset = 0x3041 - 0x0001; |
| } else if ((code >= 0x0040 && code <= 0x0075) || |
| (code >= 0x3060 && code <= 0x3095)) { |
| offset = 0x3060 - 0x0040; |
| } else if ((code >= 0x0076 && code <= 0x0077) || |
| (code >= 0x30FB && code <= 0x30FC)) { |
| offset = 0x30FB - 0x0076; |
| } |
| if (code < 0x80) { |
| code += offset; |
| } else { |
| code -= offset; |
| } |
| DCHECK_GT(code, 0); |
| Util::UCS4ToUTF8Append(code, dst); |
| } |
| } |
| |
| size_t GetEncodedDecodedKeyLengthImpl(const StringPiece src) { |
| size_t size = src.size(); |
| for (ConstChar32Iterator iter(src); !iter.Done(); iter.Next()) { |
| static_assert(sizeof(uint32) == sizeof(char32), |
| "char32 must be 32-bit integer size."); |
| uint32 code = iter.Get(); |
| if ((code >= 0x3041 && code <= 0x3095) || |
| (code >= 0x30FB && code <= 0x30FC)) { |
| // This code point takes three bytes in UTF-8 encoding, |
| // and will be swapped with a code point which takes one byte in UTF-8 |
| // encoding. |
| size -= 2; |
| continue; |
| } |
| if ((code >= 0x0001 && code <= 0x001F) || |
| (code >= 0x0040 && code <= 0x0077)) { |
| // Vice versa on above. |
| size += 2; |
| continue; |
| } |
| } |
| return size; |
| } |
| |
| // Return flags for token |
| uint8 GetFlagsForToken(const vector<TokenInfo> &tokens, |
| int index) { |
| // Determines the flags for this token. |
| uint8 flags = 0; |
| if (index == tokens.size() - 1) { |
| flags |= kLastTokenFlag; |
| } |
| |
| const TokenInfo &token_info = tokens[index]; |
| const Token *token = token_info.token; |
| |
| // Special treatment for spelling correction. |
| if (token->attributes & Token::SPELLING_CORRECTION) { |
| flags |= kSpellingCorrectionFlag; |
| } |
| |
| // Pos flag |
| flags |= GetFlagForPos(token_info, token); |
| |
| if (index == 0) { |
| CHECK_NE(flags & kPosTypeFlagMask, kSameAsPrevPosFlag) |
| << "First token cannot become the SameAsPrevPos."; |
| } |
| |
| // Value flag |
| flags |= GetFlagForValue(token_info, token); |
| if (index == 0) { |
| CHECK_NE(flags & kValueTypeFlagMask, kSameAsPrevValueFlag) |
| << "First token cannot become the SameAsPrevValue."; |
| } |
| |
| if ((flags & kUpperCrammedIDMask) == 0) { |
| // Lower 6bits are available. Use it for value trie id. |
| flags |= kCrammedIDFlag; |
| } |
| return flags; |
| } |
| |
| uint8 GetFlagForPos( |
| const TokenInfo &token_info, |
| const Token *token) { |
| CHECK(token); |
| const uint16 lid = token->lid; |
| const uint16 rid = token->rid; |
| if (lid > kPosMax || rid > kPosMax) { |
| // We can use LOG(FATAL) here, as this code runs in dictionary_builder. |
| LOG(FATAL) << "Too large pos id: lid " << lid << ", rid " << rid; |
| } |
| |
| if (token_info.pos_type == TokenInfo::FREQUENT_POS) { |
| return kFrequentPosFlag; |
| } else if (token_info.pos_type == TokenInfo::SAME_AS_PREV_POS) { |
| return kSameAsPrevPosFlag; |
| } else if (lid == rid) { |
| return kMonoPosFlag; |
| } else { |
| return kFullPosFlag; |
| } |
| } |
| |
| uint8 GetFlagForValue( |
| const TokenInfo &token_info, |
| const Token *token) { |
| CHECK(token); |
| if (token_info.value_type == TokenInfo::SAME_AS_PREV_VALUE) { |
| return kSameAsPrevValueFlag; |
| } else if (token_info.value_type == TokenInfo::AS_IS_HIRAGANA) { |
| return kAsIsHiraganaValueFlag; |
| } else if (token_info.value_type == TokenInfo::AS_IS_KATAKANA) { |
| return kAsIsKatakanaValueFlag; |
| } else { |
| return kNormalValueFlag; |
| } |
| } |
| |
| void EncodeCost( |
| const TokenInfo &token_info, uint8 *dst, int *offset) { |
| const Token *token = token_info.token; |
| CHECK_LE(token->cost, kCostMax) << "Assuming cost is within 15bits."; |
| if (token_info.cost_type == TokenInfo::CAN_USE_SMALL_ENCODING) { |
| dst[*offset] = (token->cost >> 8) | kSmallCostFlag; |
| *offset += 1; |
| } else { |
| dst[*offset] = token->cost >> 8; |
| dst[*offset + 1] = token->cost & 0xff; |
| *offset += 2; |
| } |
| } |
| |
| void EncodePos( |
| const TokenInfo &token_info, uint8 flags, uint8 *dst, int *offset) { |
| const uint8 pos_flag = flags & kPosTypeFlagMask; |
| const Token *token = token_info.token; |
| const uint16 lid = token->lid; |
| const uint16 rid = token->rid; |
| switch (pos_flag) { |
| case kFullPosFlag: { |
| // 3 bytes |
| dst[*offset] = (lid & 255); |
| dst[*offset + 1] = ((rid << 4) & 255) | (lid >> 8); |
| dst[*offset + 2] = (rid >> 4) & 255; |
| *offset += 3; |
| break; |
| } |
| case kMonoPosFlag: { |
| // 2 bytes |
| dst[*offset] = (lid & 255); |
| dst[*offset + 1] = (lid >> 8); |
| *offset += 2; |
| break; |
| } |
| case kFrequentPosFlag: { |
| // Frequent 1 byte pos. |
| const int id = token_info.id_in_frequent_pos_map; |
| CHECK_GE(id, 0); |
| dst[*offset] = id; |
| *offset += 1; |
| break; |
| } |
| case kSameAsPrevPosFlag: { |
| break; |
| } |
| default: { |
| // We can use LOG(FATAL) here. This code runs in dictionary_builder. |
| LOG(FATAL) << "Should not come here"; |
| break; |
| } |
| } |
| } |
| |
| void EncodeValueInfo( |
| const TokenInfo &token_info, uint8 flags, uint8 *dst, int *offset) { |
| const uint8 value_type_flag = flags & kValueTypeFlagMask; |
| if (value_type_flag != kNormalValueFlag) { |
| // No need to store id for word trie |
| return; |
| } |
| const uint32 id = token_info.id_in_value_trie; |
| if (id > kValueTrieIdMax) { // 22 bits |
| // We can use LOG(FATAL) here. |
| LOG(FATAL) << "Too large word trie (should be less than 2^22)\t" << id; |
| } |
| |
| if (flags & kCrammedIDFlag) { |
| dst[*offset] = id & 255; |
| dst[*offset + 1] = (id >> 8) & 255; |
| // Uses lower 6 bits of flags. |
| dst[0] |= (id >> 16) & kUpperCrammedIDMask; |
| *offset += 2; |
| } else { |
| dst[*offset] = id & 255; |
| dst[*offset + 1] = (id >> 8) & 255; |
| dst[*offset + 2] = (id >> 16) & 255; |
| *offset += 3; |
| } |
| } |
| |
| uint8 ReadFlags(uint8 val) { |
| uint8 ret = val; |
| if (ret & kCrammedIDFlag) { |
| ret &= kUpperFlagsMask; |
| } |
| return ret; |
| } |
| |
| void DecodeCost(const uint8 *ptr, TokenInfo *token_info, int *offset) { |
| DCHECK(ptr); |
| DCHECK(token_info); |
| DCHECK(offset); |
| if (ptr[*offset] & kSmallCostFlag) { |
| token_info->token->cost = ((ptr[*offset] & kSmallCostMask) << 8); |
| *offset += 1; |
| } else { |
| token_info->token->cost = (ptr[*offset] << 8); |
| token_info->token->cost += ptr[*offset + 1]; |
| *offset += 2; |
| } |
| } |
| |
| void DecodePos( |
| const uint8 *ptr, uint8 flags, TokenInfo *token_info, int *offset) { |
| DCHECK(ptr); |
| DCHECK(token_info); |
| DCHECK(offset); |
| const uint8 pos_flag = (flags & kPosTypeFlagMask); |
| Token *token = token_info->token; |
| switch (pos_flag) { |
| case kFrequentPosFlag: { |
| const int pos_id = ptr[*offset]; |
| token_info->pos_type = TokenInfo::FREQUENT_POS; |
| token_info->id_in_frequent_pos_map = pos_id; |
| *offset += 1; |
| break; |
| } |
| case kSameAsPrevPosFlag: { |
| token_info->pos_type = TokenInfo::SAME_AS_PREV_POS; |
| break; |
| } |
| case kMonoPosFlag: { |
| const uint16 id = ((ptr[*offset + 1] << 8) | ptr[*offset]); |
| token->lid = id; |
| token->rid = id; |
| *offset += 2; |
| break; |
| } |
| case kFullPosFlag: { |
| token->lid = ptr[*offset]; |
| token->lid += ((ptr[*offset + 1] & 0x0f) << 8); |
| token->rid = (ptr[*offset + 1] >> 4); |
| token->rid += (ptr[*offset + 2] << 4); |
| *offset += 3; |
| break; |
| } |
| default: { |
| DLOG(FATAL) << "should never come here"; |
| break; |
| } |
| } |
| } |
| |
| void DecodeValueInfo(const uint8 *ptr, |
| uint8 flags, |
| TokenInfo *token_info, |
| int *offset) { |
| DCHECK(ptr); |
| DCHECK(token_info); |
| DCHECK(offset); |
| const uint8 value_flag = (flags & kValueTypeFlagMask); |
| switch (value_flag) { |
| case kAsIsHiraganaValueFlag: { |
| token_info->value_type = TokenInfo::AS_IS_HIRAGANA; |
| break; |
| } |
| case kAsIsKatakanaValueFlag: { |
| token_info->value_type = TokenInfo::AS_IS_KATAKANA; |
| break; |
| } |
| case kSameAsPrevValueFlag: { |
| token_info->value_type = TokenInfo::SAME_AS_PREV_VALUE; |
| break; |
| } |
| case kNormalValueFlag: { |
| token_info->value_type = TokenInfo::DEFAULT_VALUE; |
| uint32 id = ((ptr[*offset + 1] << 8) | ptr[*offset]); |
| if (flags & kCrammedIDFlag) { |
| id |= ((ptr[0] & kUpperCrammedIDMask) << 16); |
| *offset += 2; |
| } else { |
| id |= (ptr[*offset + 2] << 16); |
| *offset += 3; |
| } |
| token_info->id_in_value_trie = id; |
| break; |
| } |
| default: { |
| DLOG(FATAL) << "should never come here"; |
| break; |
| } |
| } |
| } |
| |
| // Get value id only for reverse conversion |
| void ReadValueInfo(const uint8 *ptr, uint8 flags, int *value_id, int *offset) { |
| DCHECK(ptr); |
| DCHECK(value_id); |
| DCHECK(offset); |
| *value_id = -1; |
| const uint8 value_flag = (flags & kValueTypeFlagMask); |
| if (value_flag == kNormalValueFlag) { |
| uint32 id = ((ptr[*offset + 1] << 8) | ptr[*offset]); |
| if (flags & kCrammedIDFlag) { |
| id |= ((ptr[0] & kUpperCrammedIDMask) << 16); |
| *offset += 2; |
| } else { |
| id |= (ptr[*offset + 2] << 16); |
| *offset += 3; |
| } |
| *value_id = id; |
| } |
| } |
| } // namespace |
| |
| namespace { |
| SystemDictionaryCodecInterface *g_system_dictionary_codec = NULL; |
| typedef SystemDictionaryCodec DefaultCodec; |
| } // namespace |
| |
| SystemDictionaryCodecInterface *SystemDictionaryCodecFactory::GetCodec() { |
| if (g_system_dictionary_codec == NULL) { |
| return Singleton<DefaultCodec>::get(); |
| } else { |
| return g_system_dictionary_codec; |
| } |
| } |
| |
| void SystemDictionaryCodecFactory::SetCodec( |
| SystemDictionaryCodecInterface *codec) { |
| g_system_dictionary_codec = codec; |
| } |
| } // namespace dictionary |
| } // namespace mozc |