blob: fc9c9dd5db844879357dd06534260c09d90f6d32 [file] [log] [blame]
// Copyright 2010-2015, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dictionary/user_dictionary.h"
#include <algorithm>
#include <limits>
#include <set>
#include <string>
#include "base/compiler_specific.h"
#include "base/logging.h"
#include "base/mutex.h"
#include "base/singleton.h"
#include "base/stl_util.h"
#include "base/thread.h"
#include "base/util.h"
#include "config/config.pb.h"
#include "config/config_handler.h"
#include "dictionary/dictionary_token.h"
#include "dictionary/pos_matcher.h"
#include "dictionary/suppression_dictionary.h"
#include "dictionary/user_dictionary_storage.h"
#include "dictionary/user_dictionary_util.h"
#include "dictionary/user_pos.h"
#include "usage_stats/usage_stats.h"
namespace mozc {
namespace {
// Strict weak ordering over user POS tokens that compares the reading key
// only.  Tokens sharing a key are treated as equivalent, which is what the
// binary searches over the key-sorted token index rely on.
struct OrderByKey {
  bool operator()(const UserPOS::Token *lhs,
                  const UserPOS::Token *rhs) const {
    return lhs->key.compare(rhs->key) < 0;
  }
};
// Strict weak ordering over user POS tokens: primarily by key, and by POS id
// among tokens whose keys are equal.
struct OrderByKeyThenById {
  bool operator()(const UserPOS::Token *lhs,
                  const UserPOS::Token *rhs) const {
    const int key_order = lhs->key.compare(rhs->key);
    if (key_order != 0) {
      return key_order < 0;
    }
    // Same key: fall back to the POS id.
    return lhs->id < rhs->id;
  }
};
// Holds an optional override for the user dictionary file name.  Both
// accessors take the internal mutex, so the name can be read and replaced
// from different threads.
class UserDictionaryFileManager {
 public:
  UserDictionaryFileManager() {}

  // Returns the overriding file name if one has been set; otherwise returns
  // the name reported by UserDictionaryUtil::GetUserDictionaryFileName().
  const string GetFileName() {
    scoped_lock l(&mutex_);
    if (!filename_.empty()) {
      return filename_;
    }
    return UserDictionaryUtil::GetUserDictionaryFileName();
  }

  // Replaces the overriding file name.  Passing an empty string makes
  // GetFileName() fall back to the default name again.
  void SetFileName(const string &filename) {
    scoped_lock l(&mutex_);
    filename_ = filename;
  }

 private:
  string filename_;
  Mutex mutex_;

  DISALLOW_COPY_AND_ASSIGN(UserDictionaryFileManager);
};
// Copies the contents of |user_pos_token| into the dictionary |token|.  The
// single POS id of the user token is used for both the left and the right id,
// and the token is flagged as coming from the user dictionary.
void FillTokenFromUserPOSToken(const UserPOS::Token &user_pos_token,
                               Token *token) {
  token->attributes = Token::USER_DICTIONARY;
  token->lid = token->rid = user_pos_token.id;
  token->cost = user_pos_token.cost;
  token->value = user_pos_token.value;
  token->key = user_pos_token.key;
}
} // namespace
// Key-sorted index of heap-allocated user POS tokens.  The index owns the
// tokens it points to and deletes them in Clear() and in the destructor.
class TokensIndex : public vector<UserPOS::Token *> {
 public:
  explicit TokensIndex(const UserPOSInterface *user_pos,
                       SuppressionDictionary *suppression_dictionary)
      : user_pos_(user_pos),
        suppression_dictionary_(suppression_dictionary) {}

  virtual ~TokensIndex() {
    Clear();
  }

  // Deletes every owned token and empties the index.
  void Clear() {
    STLDeleteElements(this);
    clear();
  }

  // Rebuilds the index and the suppression dictionary from |storage|.
  // The suppression dictionary is expected to be locked by the caller; it is
  // cleared, refilled with the suppression entries and unlocked here.
  void Load(const user_dictionary::UserDictionaryStorage &storage) {
    Clear();

    if (!suppression_dictionary_->IsLocked()) {
      LOG(ERROR) << "SuppressionDictionary must be locked first";
    }
    suppression_dictionary_->Clear();

    // Fingerprints of (reading, value, pos) triples already processed.
    set<uint64> fingerprints;
    // Scratch buffer reused for the tokens expanded from one entry.
    vector<UserPOS::Token> expanded;

    for (size_t dic_index = 0; dic_index < storage.dictionaries_size();
         ++dic_index) {
      const UserDictionaryStorage::UserDictionary &dic =
          storage.dictionaries(dic_index);
      if (!dic.enabled() || dic.entries_size() == 0) {
        continue;
      }

      for (size_t entry_index = 0; entry_index < dic.entries_size();
           ++entry_index) {
        const UserDictionaryStorage::UserDictionaryEntry &entry =
            dic.entries(entry_index);
        if (!UserDictionaryUtil::IsValidEntry(*user_pos_, entry)) {
          continue;
        }

        string normalized, reading;
        UserDictionaryUtil::NormalizeReading(entry.key(), &normalized);
        // We cannot call NormalizeVoiceSoundMark inside NormalizeReading,
        // because the normalization is user-visible.
        // http://b/2480844
        Util::NormalizeVoicedSoundMark(normalized, &reading);

        // The POS value is folded into the fingerprint as a single char, so
        // it has to fit in one byte.
        DCHECK_LE(0, entry.pos());
        MOZC_CLANG_PUSH_WARNING();
#if MOZC_CLANG_HAS_WARNING(tautological-constant-out-of-range-compare)
        MOZC_CLANG_DISABLE_WARNING(tautological-constant-out-of-range-compare);
#endif  // MOZC_CLANG_HAS_WARNING(tautological-constant-out-of-range-compare)
        DCHECK_LE(entry.pos(), 255);
        MOZC_CLANG_POP_WARNING();

        const uint64 fp = Util::Fingerprint(reading +
                                            "\t" +
                                            entry.value() +
                                            "\t" +
                                            static_cast<char>(entry.pos()));
        const bool is_new = fingerprints.insert(fp).second;
        if (!is_new) {
          VLOG(1) << "Found dup item";
          continue;
        }

        // "抑制単語" (suppression word): goes to the suppression dictionary
        // instead of the token index.
        if (entry.pos() == user_dictionary::UserDictionary::SUPPRESSION_WORD) {
          suppression_dictionary_->AddEntry(reading, entry.value());
          continue;
        }

        expanded.clear();
        user_pos_->GetTokens(
            reading, entry.value(),
            UserDictionaryUtil::GetStringPosType(entry.pos()), &expanded);
        for (vector<UserPOS::Token>::const_iterator it = expanded.begin();
             it != expanded.end(); ++it) {
          UserPOS::Token *owned = new UserPOS::Token(*it);
          Util::StripWhiteSpaces(entry.comment(), &owned->comment);
          this->push_back(owned);
        }
      }
    }

    // Sort first by key and then by POS ID.
    sort(this->begin(), this->end(), OrderByKeyThenById());

    suppression_dictionary_->UnLock();

    VLOG(1) << this->size() << " user dic entries loaded";
    usage_stats::UsageStats::SetInteger("UserRegisteredWord",
                                        static_cast<int>(this->size()));
  }

 private:
  const UserPOSInterface *user_pos_;
  SuppressionDictionary *suppression_dictionary_;
};
// Thread that reloads the user dictionary storage from disk and hands the
// result to the owning UserDictionary.  It can optionally add one
// auto-registered entry to the storage before the reload.
class UserDictionaryReloader : public Thread {
 public:
  explicit UserDictionaryReloader(UserDictionary *dic)
      : auto_register_mode_(false), dic_(dic) {
    DCHECK(dic_);
  }

  virtual ~UserDictionaryReloader() {
    Join();
  }

  // Records a one-shot auto-registration request for (|key|, |value|, |pos|)
  // and starts the thread.  The request is consumed by the next Run().
  void StartAutoRegistration(const string &key,
                             const string &value,
                             user_dictionary::UserDictionary::PosType pos) {
    {
      scoped_lock l(&mutex_);
      auto_register_mode_ = true;
      key_ = key;
      value_ = value;
      pos_ = pos;
    }
    Start();
  }

  // Starts the thread for a plain reload.
  void StartReload() {
    Start();
  }

  // Loads the storage file, applies a pending auto-registration request if
  // there is one, and passes the storage to UserDictionary::Load().
  virtual void Run() {
    // Consume the one-shot auto-registration request up front.  Previously
    // the flag was only reset on some exit paths, so a failed storage load
    // left it set and a later plain reload would register the stale
    // key/value/pos.
    bool auto_register = false;
    {
      scoped_lock l(&mutex_);
      auto_register = auto_register_mode_;
      auto_register_mode_ = false;
    }

    scoped_ptr<UserDictionaryStorage> storage(new UserDictionaryStorage(
        Singleton<UserDictionaryFileManager>::get()->GetFileName()));

    // Load from file
    if (!storage->Load()) {
      return;
    }

    if (storage->ConvertSyncDictionariesToNormalDictionaries()) {
      LOG(INFO) << "Syncable dictionaries are converted to normal dictionaries";
      // Persist the conversion; skipped when the storage lock is unavailable.
      if (storage->Lock()) {
        storage->Save();
        storage->UnLock();
      }
    }

    if (auto_register &&
        !storage->AddToAutoRegisteredDictionary(key_, value_, pos_)) {
      LOG(ERROR) << "failed to execute AddToAutoRegisteredDictionary";
      return;
    }

    dic_->Load(*(storage.get()));
  }

 private:
  Mutex mutex_;
  // True while an auto-registration request is pending; guarded by mutex_.
  bool auto_register_mode_;
  UserDictionary *dic_;
  // Parameters of the pending auto-registration request.  Only meaningful
  // after StartAutoRegistration() has been called.
  string key_;
  string value_;
  user_dictionary::UserDictionary::PosType pos_;

  DISALLOW_COPY_AND_ASSIGN(UserDictionaryReloader);
};
// Constructs the dictionary around |user_pos|, |pos_matcher| and
// |suppression_dictionary|.  The dictionary starts with an empty token index
// and a reloader bound to this object, and the constructor finishes by
// calling Reload() to request an initial load through the reloader.
// NOTE(review): user_pos_ is accessed through .get(), so it is presumably a
// smart pointer that takes ownership of |user_pos| -- confirm in the header.
UserDictionary::UserDictionary(const UserPOSInterface *user_pos,
const POSMatcher *pos_matcher,
SuppressionDictionary *suppression_dictionary)
: ALLOW_THIS_IN_INITIALIZER_LIST(
reloader_(new UserDictionaryReloader(this))),
user_pos_(user_pos),
pos_matcher_(pos_matcher),
suppression_dictionary_(suppression_dictionary),
tokens_(new TokensIndex(user_pos_.get(), suppression_dictionary)),
mutex_(new ReaderWriterMutex) {
DCHECK(user_pos_.get());
DCHECK(pos_matcher_);
DCHECK(suppression_dictionary_);
// Request the initial load through the reloader.
Reload();
}
// Joins the reloader before deleting the token index: the reloader's Run()
// calls Load() on this object, which replaces tokens_, so the index must not
// be deleted while a reload may still be in progress.
UserDictionary::~UserDictionary() {
reloader_->Join();
delete tokens_;
}
// Always returns false; |key| is not consulted.
bool UserDictionary::HasKey(StringPiece key) const {
// TODO(noriyukit): Currently, we don't support HasKey() for user dictionary
// because we need to search tokens linearly, which might be slow in extreme
// cases where 100K entries exist.
return false;
}
// Always returns false; |value| is not consulted.
bool UserDictionary::HasValue(StringPiece value) const {
// TODO(noriyukit): Currently, we don't support HasValue() for user dictionary
// because we need to search tokens linearly, which might be slow in extreme
// cases where 100K entries exist. Note: HasValue() method is used only in
// UserHistoryPredictor for privacy sensitivity check.
return false;
}
// Reports every user dictionary entry whose key starts with |key| to
// |callback|.  Nothing is reported for an empty key, an empty dictionary or
// while incognito mode is enabled.  The kana-modifier-insensitive flag is
// ignored.
void UserDictionary::LookupPredictive(
    StringPiece key,
    bool,  // use_kana_modifier_insensitive_lookup
    Callback *callback) const {
  scoped_reader_lock l(mutex_.get());

  if (key.empty()) {
    VLOG(2) << "string of length zero is passed.";
    return;
  }
  if (tokens_->empty() || GET_CONFIG(incognito_mode)) {
    return;
  }

  // Tokens are sorted by key, so all keys sharing the prefix |key| form a
  // contiguous run that begins at the lower bound of |key|.
  UserPOS::Token probe;
  key.CopyToString(&probe.key);
  vector<UserPOS::Token *>::const_iterator it =
      lower_bound(tokens_->begin(), tokens_->end(), &probe, OrderByKey());

  Token token;
  for (; it != tokens_->end() && Util::StartsWith((*it)->key, key); ++it) {
    const UserPOS::Token &entry = **it;

    switch (callback->OnKey(entry.key)) {
      case Callback::TRAVERSE_DONE:
        return;
      case Callback::TRAVERSE_NEXT_KEY:
      case Callback::TRAVERSE_CULL:
        // Skip this token and move on to the next one.
        continue;
      default:
        break;
    }

    FillTokenFromUserPOSToken(entry, &token);
    // Override POS IDs for suggest only words.
    if (pos_matcher_->IsSuggestOnlyWord(entry.id)) {
      token.lid = token.rid = pos_matcher_->GetUnknownId();
    }
    if (callback->OnToken(entry.key, entry.key, token) ==
        Callback::TRAVERSE_DONE) {
      return;
    }
  }
}
// Reports every user dictionary entry whose key is a prefix of |key|
// (including |key| itself) to |callback|.  Suggest-only words are skipped.
// Nothing is reported for an empty key, an empty dictionary or while
// incognito mode is enabled.
// UserDictionary doesn't support kana modifier insensitive lookup, so the
// second argument is ignored.
void UserDictionary::LookupPrefix(
    StringPiece key, bool /*use_kana_modifier_insensitive_lookup*/,
    Callback *callback) const {
  scoped_reader_lock l(mutex_.get());

  if (key.empty()) {
    LOG(WARNING) << "string of length zero is passed.";
    return;
  }
  if (tokens_->empty()) {
    return;
  }
  if (GET_CONFIG(incognito_mode)) {
    return;
  }

  // Find the starting point for iteration over dictionary contents: the
  // first token whose key is not less than the first character of |key|.
  // Util::OneCharLen() is given only a pointer, not the size, so its result
  // is clamped to key.size().  Otherwise a key that ends in the middle of a
  // multi-byte character would make assign() read past the end of |key|.
  const size_t first_char_len =
      std::min<size_t>(Util::OneCharLen(key.data()), key.size());
  UserPOS::Token key_token;
  key_token.key.assign(key.data(), first_char_len);
  vector<UserPOS::Token *>::const_iterator it =
      lower_bound(tokens_->begin(), tokens_->end(), &key_token, OrderByKey());

  Token token;
  for (; it != tokens_->end(); ++it) {
    // Keys are sorted; once a key compares greater than |key| no later key
    // can be a prefix of it.
    if ((*it)->key > key) {
      break;
    }
    if (pos_matcher_->IsSuggestOnlyWord((*it)->id)) {
      continue;
    }
    if (!Util::StartsWith(key, (*it)->key)) {
      continue;
    }
    switch (callback->OnKey((*it)->key)) {
      case Callback::TRAVERSE_DONE:
        return;
      case Callback::TRAVERSE_NEXT_KEY:
        continue;
      case Callback::TRAVERSE_CULL:
        LOG(FATAL) << "UserDictionary doesn't support culling.";
        break;
      default:
        break;
    }
    FillTokenFromUserPOSToken(**it, &token);
    switch (callback->OnToken((*it)->key, (*it)->key, token)) {
      case Callback::TRAVERSE_DONE:
        return;
      case Callback::TRAVERSE_CULL:
        LOG(FATAL) << "UserDictionary doesn't support culling.";
        break;
      default:
        break;
    }
  }
}
// Reports every user dictionary entry whose key equals |key| to |callback|,
// skipping suggest-only words.  Nothing is reported for an empty key, an
// empty dictionary or while incognito mode is enabled.
void UserDictionary::LookupExact(StringPiece key, Callback *callback) const {
  scoped_reader_lock l(mutex_.get());

  if (key.empty()) {
    return;
  }
  if (tokens_->empty()) {
    return;
  }
  if (GET_CONFIG(incognito_mode)) {
    return;
  }

  typedef vector<UserPOS::Token *>::const_iterator TokenIterator;

  // Tokens are sorted by key, so the matches form one contiguous range.
  UserPOS::Token probe;
  key.CopyToString(&probe.key);
  const pair<TokenIterator, TokenIterator> matches =
      equal_range(tokens_->begin(), tokens_->end(), &probe, OrderByKey());
  if (matches.first == matches.second) {
    return;
  }

  // The key is announced once, before any of its tokens.
  if (callback->OnKey(key) != Callback::TRAVERSE_CONTINUE) {
    return;
  }

  Token token;
  for (TokenIterator it = matches.first; it != matches.second; ++it) {
    const UserPOS::Token &entry = **it;
    if (!pos_matcher_->IsSuggestOnlyWord(entry.id)) {
      FillTokenFromUserPOSToken(entry, &token);
      if (callback->OnToken(key, key, token) != Callback::TRAVERSE_CONTINUE) {
        return;
      }
    }
  }
}
// Reverse lookup does nothing in this dictionary: none of the arguments are
// used and |callback| is never invoked.
void UserDictionary::LookupReverse(StringPiece str,
NodeAllocatorInterface *allocator,
Callback *callback) const {
// The function returns without doing anything on either branch.
if (GET_CONFIG(incognito_mode)) {
return;
}
}
// Looks for an entry with the given |key| and |value| that has a non-empty
// comment.  On success the comment of the first such entry is stored in
// |comment| and true is returned; otherwise |comment| is left untouched and
// false is returned.  Always fails for an empty key or in incognito mode.
bool UserDictionary::LookupComment(StringPiece key, StringPiece value,
                                   string *comment) const {
  if (key.empty()) {
    return false;
  }
  if (GET_CONFIG(incognito_mode)) {
    return false;
  }

  scoped_reader_lock l(mutex_.get());
  if (tokens_->empty()) {
    return false;
  }

  typedef vector<UserPOS::Token *>::const_iterator TokenIterator;

  UserPOS::Token probe;
  key.CopyToString(&probe.key);
  const pair<TokenIterator, TokenIterator> matches =
      equal_range(tokens_->begin(), tokens_->end(), &probe, OrderByKey());

  // Set the comment that was found first.
  for (TokenIterator it = matches.first; it != matches.second; ++it) {
    const UserPOS::Token &entry = **it;
    if (entry.comment.empty() || !(entry.value == value)) {
      continue;
    }
    comment->assign(entry.comment);
    return true;
  }
  return false;
}
// Starts a reload through the reloader unless one is already running.
// Returns true if a reload was started and false if the reloader was busy.
bool UserDictionary::Reload() {
  const bool busy = reloader_->IsRunning();
  if (!busy) {
    // TokensIndex::Load() expects the suppression dictionary to be locked
    // and unlocks it once loading is done.
    suppression_dictionary_->Lock();
    DCHECK(suppression_dictionary_->IsLocked());
    reloader_->StartReload();
  }
  return !busy;
}
namespace {
// Lookup callback that records whether any visited token has the given
// value.  Traversal is stopped at the first match.
// The value is held as a StringPiece, so the referenced string must outlive
// the callback.
class FindValueCallback : public DictionaryInterface::Callback {
 public:
  explicit FindValueCallback(StringPiece value)
      : value_(value), found_(false) {}

  virtual ResultType OnToken(StringPiece,  // key
                             StringPiece,  // actual_key
                             const Token &token) {
    if (!(token.value == value_)) {
      return TRAVERSE_CONTINUE;
    }
    found_ = true;
    return TRAVERSE_DONE;
  }

  // True once a token with the searched value has been seen.
  bool found() const { return found_; }

 private:
  const StringPiece value_;
  bool found_;
};
} // namespace
bool UserDictionary::AddToAutoRegisteredDictionary(
const string &key, const string &value,
user_dictionary::UserDictionary::PosType pos) {
if (reloader_->IsRunning()) {
return false;
}
FindValueCallback callback(value);
LookupExact(key, &callback);
if (callback.found()) {
// Already registered.
return false;
}
suppression_dictionary_->Lock();
DCHECK(suppression_dictionary_->IsLocked());
reloader_->StartAutoRegistration(key, value, pos);
return true;
}
// Joins the reloader, i.e. waits for a reload in progress (if any) to finish.
void UserDictionary::WaitForReloader() {
reloader_->Join();
}
// Installs |new_tokens| as the current token index and deletes the previous
// one.  The pointer is exchanged under the writer lock; the old index is
// destroyed after the lock has been released.
void UserDictionary::Swap(TokensIndex *new_tokens) {
  DCHECK(new_tokens);
  TokensIndex *retired = NULL;
  {
    scoped_writer_lock l(mutex_.get());
    retired = tokens_;
    tokens_ = new_tokens;
  }
  delete retired;
}
// Builds a new token index from |storage| and makes it the current one.
// Always returns true.
bool UserDictionary::Load(
    const user_dictionary::UserDictionaryStorage &storage) {
#ifdef OS_ANDROID
  const size_t kVeryBigUserDictionarySize = 5000;
#else
  const size_t kVeryBigUserDictionarySize = 100000;
#endif

  bool current_is_very_big = false;
  {
    scoped_reader_lock l(mutex_.get());
    current_is_very_big = tokens_->size() >= kVeryBigUserDictionarySize;
  }

  // If UserDictionary is pretty big, we first remove the
  // current dictionary to save memory usage.
  if (current_is_very_big) {
    Swap(new TokensIndex(user_pos_.get(), suppression_dictionary_));
  }

  TokensIndex *loaded = new TokensIndex(user_pos_.get(),
                                        suppression_dictionary_);
  loaded->Load(storage);
  Swap(loaded);
  return true;
}
// Overrides the user dictionary file name held by the
// UserDictionaryFileManager singleton, which the reloader queries for the
// file to load.  An empty |filename| restores the default file name.
void UserDictionary::SetUserDictionaryName(const string &filename) {
Singleton<UserDictionaryFileManager>::get()->SetFileName(filename);
}
} // namespace mozc