blob: f33abf6800beaea622ea736e8b99803f54293134 [file] [log] [blame]
// Copyright 2010-2015, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dictionary/system/system_dictionary_builder.h"
#include <algorithm>
#include <climits>
#include <cstring>
#include <sstream>
#include <unordered_set>
#include "base/file_stream.h"
#include "base/flags.h"
#include "base/logging.h"
#include "base/util.h"
#include "dictionary/dictionary_token.h"
#include "dictionary/file/codec_interface.h"
#include "dictionary/pos_matcher.h"
#include "dictionary/system/codec.h"
#include "dictionary/system/codec_interface.h"
#include "dictionary/system/words_info.h"
#include "dictionary/text_dictionary_loader.h"
#include "storage/louds/bit_vector_based_array_builder.h"
#include "storage/louds/louds_trie_builder.h"
DEFINE_bool(preserve_intermediate_dictionary, false,
"preserve inetemediate dictionary file.");
DEFINE_int32(min_key_length_to_use_small_cost_encoding, 6,
"minimum key length to use 1 byte cost encoding.");
namespace mozc {
namespace dictionary {
using mozc::storage::louds::LoudsTrieBuilder;
using mozc::storage::louds::BitVectorBasedArrayBuilder;
namespace {
struct TokenGreaterThan {
inline bool operator()(const TokenInfo& lhs,
const TokenInfo& rhs) const {
if (lhs.token->lid != rhs.token->lid) {
return lhs.token->lid > rhs.token->lid;
}
if (lhs.token->rid != rhs.token->rid) {
return lhs.token->rid > rhs.token->rid;
}
if (lhs.id_in_value_trie != rhs.id_in_value_trie) {
return lhs.id_in_value_trie < rhs.id_in_value_trie;
}
return lhs.token->attributes < rhs.token->attributes;
}
};
void WriteSectionToFile(const DictionaryFileSection &section,
const string &filename) {
OutputFileStream ofs(filename.c_str(), ios::binary | ios::out);
ofs.write(section.ptr, section.len);
}
} // namespace
SystemDictionaryBuilder::SystemDictionaryBuilder()
: value_trie_builder_(new LoudsTrieBuilder),
key_trie_builder_(new LoudsTrieBuilder),
token_array_builder_(new BitVectorBasedArrayBuilder),
codec_(SystemDictionaryCodecFactory::GetCodec()) {}
// This class does not have the ownership of |codec|.
SystemDictionaryBuilder::SystemDictionaryBuilder(
const SystemDictionaryCodecInterface *codec)
: value_trie_builder_(new LoudsTrieBuilder),
key_trie_builder_(new LoudsTrieBuilder),
token_array_builder_(new BitVectorBasedArrayBuilder),
codec_(codec) {}
SystemDictionaryBuilder::~SystemDictionaryBuilder() {}
void SystemDictionaryBuilder::BuildFromTokens(const vector<Token *> &tokens) {
KeyInfoList key_info_list;
ReadTokens(tokens, &key_info_list);
BuildFrequentPos(key_info_list);
BuildValueTrie(key_info_list);
BuildKeyTrie(key_info_list);
SetIdForValue(&key_info_list);
SetIdForKey(&key_info_list);
SortTokenInfo(&key_info_list);
SetCostType(&key_info_list);
SetPosType(&key_info_list);
SetValueType(&key_info_list);
BuildTokenArray(key_info_list);
}
void SystemDictionaryBuilder::WriteToFile(const string &output_file) const {
OutputFileStream ofs(output_file.c_str(), ios::binary | ios::out);
WriteToStream(output_file, &ofs);
}
void SystemDictionaryBuilder::WriteToStream(
const string &intermediate_output_file_base_path,
ostream *output_stream) const {
// Memory images of each section
vector<DictionaryFileSection> sections;
DictionaryFileCodecInterface *file_codec =
DictionaryFileCodecFactory::GetCodec();
DictionaryFileSection value_trie_section(
value_trie_builder_->image().data(),
value_trie_builder_->image().size(),
file_codec->GetSectionName(codec_->GetSectionNameForValue()));
sections.push_back(value_trie_section);
DictionaryFileSection key_trie_section(
key_trie_builder_->image().data(),
key_trie_builder_->image().size(),
file_codec->GetSectionName(codec_->GetSectionNameForKey()));
sections.push_back(key_trie_section);
DictionaryFileSection token_array_section(
token_array_builder_->image().data(),
token_array_builder_->image().size(),
file_codec->GetSectionName(codec_->GetSectionNameForTokens()));
sections.push_back(token_array_section);
uint32 frequent_pos_array[256] = {0};
for (map<uint32, int>::const_iterator i = frequent_pos_.begin();
i != frequent_pos_.end(); ++i) {
frequent_pos_array[i->second] = i->first;
}
DictionaryFileSection frequent_pos_section(
reinterpret_cast<const char *>(frequent_pos_array),
sizeof frequent_pos_array,
file_codec->GetSectionName(codec_->GetSectionNameForPos()));
sections.push_back(frequent_pos_section);
if (FLAGS_preserve_intermediate_dictionary &&
!intermediate_output_file_base_path.empty()) {
// Write out intermediate results to files.
const string &basepath = intermediate_output_file_base_path;
LOG(INFO) << "Writing intermediate files.";
WriteSectionToFile(value_trie_section, basepath + ".value");
WriteSectionToFile(key_trie_section, basepath + ".key");
WriteSectionToFile(token_array_section, basepath + ".tokens");
WriteSectionToFile(frequent_pos_section, basepath + ".freq_pos");
}
LOG(INFO) << "Start writing dictionary file.";
DictionaryFileCodecFactory::GetCodec()->WriteSections(sections,
output_stream);
LOG(INFO) << "Start writing dictionary file... done.";
}
namespace {
uint32 GetCombinedPos(uint16 lid, uint16 rid) {
return (lid << 16) | rid;
}
TokenInfo::ValueType GetValueType(const Token *token) {
if (token->value == token->key) {
return TokenInfo::AS_IS_HIRAGANA;
}
string katakana;
Util::HiraganaToKatakana(token->key, &katakana);
if (token->value == katakana) {
return TokenInfo::AS_IS_KATAKANA;
}
return TokenInfo::DEFAULT_VALUE;
}
bool HasHomonymsInSamePos(
const SystemDictionaryBuilder::KeyInfo &key_info) {
// Early exit path mainly for performance.
if (key_info.tokens.size() == 1) {
return false;
}
std::unordered_set<uint32> seen;
for (size_t i = 0; i < key_info.tokens.size(); ++i) {
const Token *token = key_info.tokens[i].token;
const uint32 pos = GetCombinedPos(token->lid, token->rid);
if (!seen.insert(pos).second) {
// Insertion failed, which means we already have |pos|.
return true;
}
}
return false;
}
struct TokenPtrLessThan {
inline bool operator()(const Token* lhs, const Token* rhs) const {
return lhs->key < rhs->key;
}
};
} // namespace
void SystemDictionaryBuilder::ReadTokens(const vector<Token *> &tokens,
KeyInfoList *key_info_list) const {
// Create KeyInfoList in two steps.
// 1. Create an array of Token with (stably) sorting Token::key.
// [Token 1(key:aaa)][Token 2(key:aaa)][Token 3(key:abc)][...]
// 2. Group Token(s) by Token::Key and convert them into KeyInfo.
// [KeyInfo(key:aaa)[Token 1][Token 2]][KeyInfo(key:abc)[Token 3]][...]
// Step 1.
typedef vector<Token *> ReduceBuffer;
ReduceBuffer reduce_buffer;
reduce_buffer.reserve(tokens.size());
for (vector<Token *>::const_iterator iter = tokens.begin();
iter != tokens.end(); ++iter) {
Token *token = *iter;
CHECK(!token->key.empty()) << "empty key string in input";
CHECK(!token->value.empty()) << "empty value string in input";
reduce_buffer.push_back(token);
}
stable_sort(reduce_buffer.begin(), reduce_buffer.end(), TokenPtrLessThan());
// Step 2.
key_info_list->clear();
if (reduce_buffer.size() == 0) {
return;
}
KeyInfo last_key_info;
last_key_info.key = reduce_buffer.front()->key;
for (ReduceBuffer::const_iterator iter = reduce_buffer.begin();
iter != reduce_buffer.end(); ++iter) {
Token *token = *iter;
if (last_key_info.key != token->key) {
key_info_list->push_back(last_key_info);
last_key_info = KeyInfo();
last_key_info.key = token->key;
}
last_key_info.tokens.push_back(TokenInfo(token));
last_key_info.tokens.back().value_type = GetValueType(token);
}
key_info_list->push_back(last_key_info);
}
void SystemDictionaryBuilder::BuildFrequentPos(
const KeyInfoList &key_info_list) {
// Calculate frequency of each pos
// TODO(toshiyuki): It might be better to count frequency
// with considering same_as_prev_pos.
map<uint32, int> pos_map;
for (KeyInfoList::const_iterator itr = key_info_list.begin();
itr != key_info_list.end(); ++itr) {
const KeyInfo &key_info = *itr;
for (size_t i = 0; i < key_info.tokens.size(); ++i) {
const Token *token = key_info.tokens[i].token;
pos_map[GetCombinedPos(token->lid, token->rid)]++;
}
}
// Get histgram of frequency
map<int, int> freq_map;
for (map<uint32, int>::const_iterator jt = pos_map.begin();
jt != pos_map.end(); ++jt) {
freq_map[jt->second]++;
}
// Compute the lower threshold of frequence
int num_freq_pos = 0;
int freq_threshold = INT_MAX;
for (map<int, int>::reverse_iterator kt = freq_map.rbegin();
kt != freq_map.rend(); ++kt) {
if (num_freq_pos + kt->second > 255) {
break;
}
freq_threshold = kt->first;
num_freq_pos += kt->second;
}
// Collect high frequent pos
VLOG(1) << "num_freq_pos" << num_freq_pos;
VLOG(1) << "Pos threshold=" << freq_threshold;
int freq_pos_idx = 0;
int num_tokens = 0;
map<uint32, int>::iterator lt;
for (lt = pos_map.begin(); lt != pos_map.end(); ++lt) {
if (lt->second >= freq_threshold) {
frequent_pos_[lt->first] = freq_pos_idx;
freq_pos_idx++;
num_tokens += lt->second;
}
}
CHECK(freq_pos_idx == num_freq_pos)
<< "inconsistent result to find frequent pos";
VLOG(1) << freq_pos_idx << " high frequent Pos has "
<< num_tokens << " tokens";
}
void SystemDictionaryBuilder::BuildValueTrie(const KeyInfoList &key_info_list) {
for (KeyInfoList::const_iterator itr = key_info_list.begin();
itr != key_info_list.end(); ++itr) {
const KeyInfo &key_info = *itr;
for (size_t i = 0; i < key_info.tokens.size(); ++i) {
const TokenInfo &token_info = key_info.tokens[i];
if (token_info.value_type == TokenInfo::AS_IS_HIRAGANA ||
token_info.value_type == TokenInfo::AS_IS_KATAKANA) {
// These values will be stored in token array as flags
continue;
}
string value_str;
codec_->EncodeValue(token_info.token->value, &value_str);
value_trie_builder_->Add(value_str);
}
}
value_trie_builder_->Build();
}
void SystemDictionaryBuilder::SetIdForValue(KeyInfoList *key_info_list) const {
for (KeyInfoList::iterator itr = key_info_list->begin();
itr != key_info_list->end(); ++itr) {
for (size_t i = 0; i < itr->tokens.size(); ++i) {
TokenInfo *token_info = &(itr->tokens[i]);
string value_str;
codec_->EncodeValue(token_info->token->value, &value_str);
token_info->id_in_value_trie =
value_trie_builder_->GetId(value_str);
}
}
}
void SystemDictionaryBuilder::SortTokenInfo(KeyInfoList *key_info_list) const {
for (KeyInfoList::iterator itr = key_info_list->begin();
itr != key_info_list->end(); ++itr) {
KeyInfo *key_info = &(*itr);
sort(key_info->tokens.begin(), key_info->tokens.end(), TokenGreaterThan());
}
}
void SystemDictionaryBuilder::SetCostType(KeyInfoList *key_info_list) const {
for (KeyInfoList::iterator itr = key_info_list->begin();
itr != key_info_list->end(); ++itr) {
KeyInfo *key_info = &(*itr);
if (HasHomonymsInSamePos(*key_info)) {
continue;
}
for (size_t i = 0; i < key_info->tokens.size(); ++i) {
TokenInfo *token_info = &key_info->tokens[i];
const int key_len = Util::CharsLen(token_info->token->key);
if (key_len >= FLAGS_min_key_length_to_use_small_cost_encoding) {
token_info->cost_type = TokenInfo::CAN_USE_SMALL_ENCODING;
}
}
}
}
void SystemDictionaryBuilder::SetPosType(KeyInfoList *key_info_list) const {
for (KeyInfoList::iterator itr = key_info_list->begin();
itr != key_info_list->end(); ++itr) {
KeyInfo *key_info = &(*itr);
for (size_t i = 0; i < key_info->tokens.size(); ++i) {
TokenInfo *token_info = &(key_info->tokens[i]);
const uint32 pos = GetCombinedPos(token_info->token->lid,
token_info->token->rid);
map<uint32, int>::const_iterator itr = frequent_pos_.find(pos);
if (itr != frequent_pos_.end()) {
token_info->pos_type = TokenInfo::FREQUENT_POS;
token_info->id_in_frequent_pos_map = itr->second;
}
if (i >= 1) {
const TokenInfo &prev_token_info = key_info->tokens[i - 1];
const uint32 prev_pos = GetCombinedPos(prev_token_info.token->lid,
prev_token_info.token->rid);
if (prev_pos == pos) {
// we can overwrite FREQUENT_POS
token_info->pos_type = TokenInfo::SAME_AS_PREV_POS;
}
}
}
}
}
void SystemDictionaryBuilder::SetValueType(KeyInfoList *key_info_list) const {
for (KeyInfoList::iterator itr = key_info_list->begin();
itr != key_info_list->end(); ++itr) {
KeyInfo *key_info = &(*itr);
for (size_t i = 1; i < key_info->tokens.size(); ++i) {
const TokenInfo *prev_token_info = &(key_info->tokens[i - 1]);
TokenInfo *token_info = &(key_info->tokens[i]);
if (token_info->value_type != TokenInfo::AS_IS_HIRAGANA &&
token_info->value_type != TokenInfo::AS_IS_KATAKANA &&
(token_info->token->value == prev_token_info->token->value)) {
token_info->value_type = TokenInfo::SAME_AS_PREV_VALUE;
}
}
}
}
void SystemDictionaryBuilder::BuildKeyTrie(const KeyInfoList &key_info_list) {
for (KeyInfoList::const_iterator itr = key_info_list.begin();
itr != key_info_list.end(); ++itr) {
string key_str;
codec_->EncodeKey(itr->key, &key_str);
key_trie_builder_->Add(key_str);
}
key_trie_builder_->Build();
}
void SystemDictionaryBuilder::SetIdForKey(KeyInfoList *key_info_list) const {
for (KeyInfoList::iterator itr = key_info_list->begin();
itr != key_info_list->end(); ++itr) {
KeyInfo *key_info = &(*itr);
string key_str;
codec_->EncodeKey(key_info->key, &key_str);
key_info->id_in_key_trie =
key_trie_builder_->GetId(key_str);
}
}
void SystemDictionaryBuilder::BuildTokenArray(
const KeyInfoList &key_info_list) {
// Here we make a reverse lookup table as follows:
// |key_info_list[X].id_in_key_trie| -> |key_info_list[X]|
// assuming |key_info_list[X].id_in_key_trie| is unique and successive.
{
vector<const KeyInfo *> id_to_keyinfo_table;
id_to_keyinfo_table.resize(key_info_list.size());
for (KeyInfoList::const_iterator itr = key_info_list.begin();
itr != key_info_list.end(); ++itr) {
const KeyInfo &key_info = *itr;
const int id = key_info.id_in_key_trie;
id_to_keyinfo_table[id] = &key_info;
}
for (size_t i = 0; i < id_to_keyinfo_table.size(); ++i) {
const KeyInfo &key_info = *id_to_keyinfo_table[i];
string tokens_str;
codec_->EncodeTokens(key_info.tokens, &tokens_str);
token_array_builder_->Add(tokens_str);
}
}
token_array_builder_->Add(string(1, codec_->GetTokensTerminationFlag()));
token_array_builder_->Build();
}
} // namespace dictionary
} // namespace mozc