blob: 27b657d27c6477217f4eb9a5181a14cdf671913d [file] [log] [blame]
// Copyright 2010-2014, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dictionary/user_dictionary_importer.h"
#ifdef OS_WIN
#include <windows.h>
#ifdef HAS_MSIME_HEADER
#include <msime.h>
#endif // HAS_MSIME_HEADER
#endif // OS_WIN
#include <algorithm>
#include <cstring>
#include <map>
#include <set>
#include <string>
#include <vector>
#include "base/compiler_specific.h"
#include "base/mmap.h"
#include "base/number_util.h"
#include "base/port.h"
#include "base/system_util.h"
#include "base/util.h"
#include "base/win_util.h"
#include "dictionary/user_dictionary_util.h"
namespace mozc {
using user_dictionary::UserDictionary;
using user_dictionary::UserDictionaryCommandStatus;
namespace {
// Returns a fingerprint of |entry| computed over its key, value and POS,
// joined with tab characters. The POS is appended as a single byte, so it is
// expected to be in the range [0, 255] (checked in debug builds).
uint64 EntryFingerprint(const UserDictionary::Entry &entry) {
  DCHECK_LE(0, entry.pos());
  MOZC_CLANG_PUSH_WARNING();
#if MOZC_CLANG_HAS_WARNING(tautological-constant-out-of-range-compare)
  MOZC_CLANG_DISABLE_WARNING(tautological-constant-out-of-range-compare);
#endif  // MOZC_CLANG_HAS_WARNING(tautological-constant-out-of-range-compare)
  DCHECK_LE(entry.pos(), 255);
  MOZC_CLANG_POP_WARNING();

  // Build "<key>\t<value>\t<pos byte>" step by step.
  string fingerprint_source = entry.key();
  fingerprint_source += "\t";
  fingerprint_source += entry.value();
  fingerprint_source += "\t";
  fingerprint_source += static_cast<char>(entry.pos());
  return Util::Fingerprint(fingerprint_source);
}
void NormalizePOS(const string &input, string *output) {
string tmp;
output->clear();
Util::FullWidthAsciiToHalfWidthAscii(input, &tmp);
Util::HalfWidthKatakanaToFullWidthKatakana(tmp, output);
}
// A data type to hold conversion rules of POSes. Each record maps the POS name
// used by a third party IME to a Mozc POS.
// NOTE(review): |mozc_pos| is an enum value, not a string. ConvertEntryInternal()
// rejects an entry when the mapped |mozc_pos| does not satisfy
// UserDictionary::PosType_IsValid(); presumably that is how POSes that should
// be ignored are marked in the mapping table -- confirm against pos_map.h.
struct POSMap {
const char *source_pos; // POS string of a third party IME.
UserDictionary::PosType mozc_pos; // POS of Mozc.
};
// Include actual POS mapping rules defined outside the file.
#include "dictionary/pos_map.h"
// A functor for searching an array of POSMap for the given POS. The class is
// used with std::lower_bound().
class POSMapCompare {
public:
bool operator() (const POSMap &l_pos_map, const POSMap &r_pos_map) const {
return (strcmp(l_pos_map.source_pos, r_pos_map.source_pos) < 0);
}
};
// Converts a raw entry of a third party IME into a Mozc user dictionary entry
// using the given POS mapping table.
//
// |pos_map| points to |map_size| records which are searched with
// lower_bound() and POSMapCompare. |to| is cleared first. Returns false when
// |to| is NULL, when |from.pos| is empty, when the normalized POS is not in
// the table, when the mapped Mozc POS is not a valid PosType, or when the
// converted entry does not pass UserDictionaryUtil::ValidateEntry().
bool ConvertEntryInternal(
    const POSMap *pos_map,
    size_t map_size,
    const UserDictionaryImporter::RawEntry &from,
    UserDictionary::Entry *to) {
  if (to == NULL) {
    LOG(ERROR) << "Null pointer is passed.";
    return false;
  }

  to->Clear();

  if (from.pos.empty()) {
    return false;
  }

  // Normalize POS (remove full width ascii and half width katakana).
  string normalized_pos;
  NormalizePOS(from.pos, &normalized_pos);

  // ATOK's POS has a special trailing marker for distinguishing
  // auto-registered words from manually-registered words. Drop it.
  if (!normalized_pos.empty()) {
    const char last_char = normalized_pos[normalized_pos.size() - 1];
    if (last_char == '$' || last_char == '*') {
      normalized_pos.erase(normalized_pos.size() - 1);
    }
  }

  // Binary-search the mapping table for the normalized POS.
  POSMap probe;
  probe.source_pos = normalized_pos.c_str();
  probe.mozc_pos = static_cast<UserDictionary::PosType>(0);

  const POSMap *const map_end = pos_map + map_size;
  const POSMap *const hit =
      lower_bound(pos_map, map_end, probe, POSMapCompare());
  const bool matched =
      (hit != map_end) && (strcmp(hit->source_pos, probe.source_pos) == 0);
  if (!matched) {
    LOG(WARNING) << "Invalid POS is passed: " << from.pos;
    return false;
  }

  // The mapping exists but does not point to a usable Mozc POS.
  if (!UserDictionary::PosType_IsValid(hit->mozc_pos)) {
    to->clear_key();
    to->clear_value();
    to->clear_pos();
    return false;
  }

  // Store the reading in its normalized form.
  string normalized_key;
  UserDictionaryUtil::NormalizeReading(from.key, &normalized_key);
  to->set_key(normalized_key);
  to->set_value(from.value);
  to->set_pos(hit->mozc_pos);

  // Copy the comment only when there is one.
  if (!from.comment.empty()) {
    to->set_comment(from.comment);
  }

  // The entry is accepted only when it passes validation.
  return UserDictionaryUtil::ValidateEntry(*to) ==
         UserDictionaryCommandStatus::USER_DICTIONARY_COMMAND_SUCCESS;
}
} // namespace
#if defined(OS_WIN) && defined(HAS_MSIME_HEADER)
namespace {
// Number of IMEWRD records in the buffer that MSIMEImportIterator hands to
// IFEDictionary::GetWords() and IFEDictionary::NextWords().
const size_t kBufferSize = 256;
// ProgID of MS-IME Japanese. It is resolved to a CLSID with CLSIDFromProgID()
// in CreateIFEDictionary().
const wchar_t kVersionIndependentProgIdForMSIME[] = L"MSIME.Japan";
// Interface identifier of user dictionary in MS-IME.
// {019F7153-E6DB-11d0-83C3-00C04FDDB82E}
const GUID kIidIFEDictionary = {
0x19f7153, 0xe6db, 0x11d0, {0x83, 0xc3, 0x0, 0xc0, 0x4f, 0xdd, 0xb8, 0x2e}
};
// Creates an in-process IFEDictionary instance for MS-IME Japanese.
// Returns nullptr when the ProgID cannot be resolved to a CLSID or when
// CoCreateInstance() fails. The returned pointer is a raw interface pointer;
// the caller decides how it is released.
IFEDictionary *CreateIFEDictionary() {
  // On Windows 7 and prior, multiple versions of MS-IME can be installed
  // side-by-side. As far as we've observed, the latest version will be chosen
  // with version-independent ProgId.
  CLSID msime_class_id = GUID_NULL;
  const HRESULT lookup_result =
      ::CLSIDFromProgID(kVersionIndependentProgIdForMSIME, &msime_class_id);
  if (FAILED(lookup_result)) {
    LOG(ERROR) << "CLSIDFromProgID() failed: " << lookup_result;
    return nullptr;
  }

  IFEDictionary *dictionary = nullptr;
  const HRESULT create_result =
      ::CoCreateInstance(msime_class_id,
                         nullptr,
                         CLSCTX_INPROC_SERVER,
                         kIidIFEDictionary,
                         reinterpret_cast<void **>(&dictionary));
  if (FAILED(create_result)) {
    LOG(ERROR) << "CoCreateInstance() failed: " << create_result;
    return nullptr;
  }

  VLOG(1) << "Can create IFEDictionary successfully";
  return dictionary;
}
// Holds an IFEDictionary pointer and, when the pointer is non-NULL, calls
// Close() followed by Release() on it at destruction time.
//
// The wrapper is not copyable: a copy would hold the same raw pointer and the
// dictionary would be closed and released once per copy.
class ScopedIFEDictionary {
 public:
  // Takes over |dic|, which may be NULL.
  explicit ScopedIFEDictionary(IFEDictionary *dic)
      : dic_(dic) {}

  ~ScopedIFEDictionary() {
    if (dic_ != NULL) {
      dic_->Close();
      dic_->Release();
    }
  }

  // Accessors. operator*() and operator->() must not be used when the held
  // pointer is NULL; check get() first.
  IFEDictionary &operator*() const { return *dic_; }
  IFEDictionary *operator->() const { return dic_; }
  IFEDictionary *get() const { return dic_; }

 private:
  // Declared but intentionally never defined, so that an accidental copy
  // fails at compile time (or link time from inside the class).
  ScopedIFEDictionary(const ScopedIFEDictionary &);
  ScopedIFEDictionary &operator=(const ScopedIFEDictionary &);

  IFEDictionary *dic_;
};
// Iterator for MS-IME user dictionary.
//
// The constructor opens the MS-IME user dictionary through IFEDictionary,
// builds a table from MS-IME POS ids to UTF-8 POS names, and requests the
// first batch of user-registered words into |buf_|. Next() hands the buffered
// words out one at a time and requests further batches with NextWords() while
// the dictionary reports IFED_S_MORE_ENTRIES.
//
// If any step of the construction fails, |result_| is left as E_FAIL and
// IsAvailable() returns false.
class MSIMEImportIterator
    : public UserDictionaryImporter::InputIteratorInterface {
 public:
  MSIMEImportIterator()
      // Listed in member declaration order, which is the order in which the
      // members are actually initialized.
      : buf_(kBufferSize),
        dic_(CreateIFEDictionary()),
        result_(E_FAIL),
        size_(0),
        index_(0) {
    if (dic_.get() == NULL) {
      LOG(ERROR) << "IFEDictionaryFactory returned NULL";
      return;
    }

    // open user dictionary
    const HRESULT open_result = dic_->Open(NULL, NULL);
    if (S_OK != open_result) {
      // Report the code returned by Open(); |result_| is still the initial
      // E_FAIL here and would carry no information.
      LOG(ERROR) << "Cannot open user dictionary: " << open_result;
      return;
    }

    POSTBL *pos_table = NULL;
    int pos_size = 0;
    result_ = dic_->GetPosTable(&pos_table, &pos_size);
    if (S_OK != result_ || pos_table == NULL || pos_size == 0) {
      // Report the code returned by GetPosTable() before overwriting it.
      LOG(ERROR) << "Cannot get POS table: " << result_;
      result_ = E_FAIL;
      return;
    }

    // Remember the UTF-8 name of every POS id. The names in the table are
    // converted with Util::SJISToUTF8().
    string name;
    for (int i = 0; i < pos_size; ++i) {
      Util::SJISToUTF8(reinterpret_cast<char *>(pos_table->szName), &name);
      pos_map_.insert(make_pair(pos_table->nPos, name));
      ++pos_table;
    }

    // extract all words registered by user.
    // Don't use auto-registered words, since Mozc may not be able to
    // handle auto_registered words correctly, and user is basically
    // unaware of auto-registered words.
    result_ = dic_->GetWords(NULL, NULL, NULL,
                             IFED_POS_ALL,
                             IFED_SELECT_ALL,
                             IFED_REG_USER,  // | FED_REG_AUTO
                             reinterpret_cast<UCHAR *>(&buf_[0]),
                             kBufferSize * sizeof(IMEWRD),
                             &size_);
  }

  // Returns true while the last dictionary call reported either S_OK or
  // IFED_S_MORE_ENTRIES.
  bool IsAvailable() const {
    return result_ == IFED_S_MORE_ENTRIES || result_ == S_OK;
  }

  // Fills |entry| with the next word and returns true, or returns false when
  // there is nothing more to read or an error occurred.
  //
  // A true return with an empty |entry| is possible: it happens for a record
  // that is skipped (NULL reading/display, unknown POS id) and right after a
  // new batch has been requested. Callers are expected to skip such entries.
  //
  // NOTE: Without "UserDictionaryImporter::", Visual C++ 2008 somehow fails
  // to look up the type name.
  bool Next(UserDictionaryImporter::RawEntry *entry) {
    if (!IsAvailable()) {
      LOG(ERROR) << "Iterator is not available";
      return false;
    }
    if (entry == NULL) {
      LOG(ERROR) << "Entry is NULL";
      return false;
    }
    entry->Clear();

    if (index_ < size_) {
      const IMEWRD &word = buf_[index_];

      if (word.pwchReading == NULL || word.pwchDisplay == NULL) {
        ++index_;
        LOG(ERROR) << "pwchDisplay or pwchReading is NULL";
        return true;
      }

      // set key/value
      Util::WideToUTF8(word.pwchReading, &entry->key);
      Util::WideToUTF8(word.pwchDisplay, &entry->value);

      // set POS
      map<int, string>::const_iterator it = pos_map_.find(word.nPos1);
      if (it == pos_map_.end()) {
        // Log the id of the record being skipped *before* advancing the
        // index; afterwards |index_| refers to the next record and may even
        // be one past the end of the buffer.
        LOG(ERROR) << "Unknown POS id: " << word.nPos1;
        ++index_;
        entry->Clear();
        return true;
      }
      entry->pos = it->second;

      // set comment
      if (word.pvComment != NULL) {
        if (word.uct == IFED_UCT_STRING_SJIS) {
          Util::SJISToUTF8(
              reinterpret_cast<const char *>(word.pvComment),
              &entry->comment);
        } else if (word.uct == IFED_UCT_STRING_UNICODE) {
          Util::WideToUTF8(
              reinterpret_cast<const wchar_t *>(word.pvComment),
              &entry->comment);
        }
      }

      ++index_;
      return true;
    }

    // The current batch is exhausted.
    if (result_ == S_OK) {
      // S_OK means that the last batch was the final one.
      return false;
    }
    if (result_ == IFED_S_MORE_ENTRIES) {
      result_ = dic_->NextWords(reinterpret_cast<UCHAR *>(&buf_[0]),
                                kBufferSize * sizeof(IMEWRD),
                                &size_);
      if (result_ == E_FAIL) {
        LOG(ERROR) << "NextWords() failed";
        return false;
      }
      // |entry| is empty here; the words of the new batch are returned by
      // the following calls.
      index_ = 0;
      return true;
    }
    return false;
  }

 private:
  vector<IMEWRD> buf_;           // Batch buffer of kBufferSize records.
  ScopedIFEDictionary dic_;      // MS-IME dictionary; may hold NULL.
  map<int, string> pos_map_;     // MS-IME POS id -> UTF-8 POS name.
  HRESULT result_;               // Result of the last dictionary call.
  ULONG size_;                   // Number of valid records in |buf_|.
  ULONG index_;                  // Index of the next record to return.
};
} // namespace
#endif // OS_WIN && HAS_MSIME_HEADER
// Imports the MS-IME user dictionary into |user_dic|.
// The import is only compiled in when both OS_WIN and HAS_MSIME_HEADER are
// defined; every other build returns IMPORT_NOT_SUPPORTED.
UserDictionaryImporter::ErrorType UserDictionaryImporter::ImportFromMSIME(
    UserDictionary *user_dic) {
  DCHECK(user_dic);
#if defined(OS_WIN) && defined(HAS_MSIME_HEADER)
  MSIMEImportIterator msime_iter;
  return ImportFromIterator(&msime_iter, user_dic);
#else
  return IMPORT_NOT_SUPPORTED;
#endif  // OS_WIN && HAS_MSIME_HEADER
}
// Reads raw entries from |iter|, converts them and appends the new ones to
// |user_dic|.
//
// Returns IMPORT_FATAL when either argument is NULL, IMPORT_TOO_MANY_WORDS as
// soon as the dictionary has reached UserDictionaryUtil::max_entry_size(),
// IMPORT_INVALID_ENTRIES when at least one entry could not be converted, and
// IMPORT_NO_ERROR otherwise. Entries whose fingerprint is already present
// (either in |user_dic| or earlier in the same import) are silently skipped.
UserDictionaryImporter::ErrorType UserDictionaryImporter::ImportFromIterator(
    InputIteratorInterface *iter, UserDictionary *user_dic) {
  if (iter == NULL || user_dic == NULL) {
    LOG(ERROR) << "iter or user_dic is NULL";
    return IMPORT_FATAL;
  }

  const size_t max_size = UserDictionaryUtil::max_entry_size();

  // Fingerprints of everything that is already registered.
  set<uint64> known_fingerprints;
  for (size_t i = 0; i < user_dic->entries_size(); ++i) {
    known_fingerprints.insert(EntryFingerprint(user_dic->entries(i)));
  }

  ErrorType status = IMPORT_NO_ERROR;
  UserDictionary::Entry converted;
  RawEntry raw;
  while (iter->Next(&raw)) {
    if (user_dic->entries_size() >= max_size) {
      LOG(WARNING) << "Too many words in one dictionary";
      return IMPORT_TOO_MANY_WORDS;
    }

    // Empty entry is just skipped. It could be annoying if we show a
    // warning dialog when these empty candidates exist.
    const bool is_blank =
        raw.key.empty() && raw.value.empty() && raw.comment.empty();
    if (is_blank) {
      continue;
    }

    if (!ConvertEntry(raw, &converted)) {
      LOG(WARNING) << "Entry is not valid";
      status = IMPORT_INVALID_ENTRIES;
      continue;
    }

    // Don't register a word if it is already in the current dictionary.
    const bool is_new =
        known_fingerprints.insert(EntryFingerprint(converted)).second;
    if (!is_new) {
      continue;
    }

    UserDictionary::Entry *added = user_dic->add_entries();
    DCHECK(added);
    added->CopyFrom(converted);
  }

  return status;
}
// Imports a text dictionary read line by line from |iter| into |user_dic|.
// |ime_type| is the format requested by the caller; it is reconciled with the
// format guessed from the input by TextInputIterator. Returns
// IMPORT_NOT_SUPPORTED when no usable format could be determined.
UserDictionaryImporter::ErrorType
UserDictionaryImporter::ImportFromTextLineIterator(
    IMEType ime_type,
    TextLineIteratorInterface *iter,
    UserDictionary *user_dic) {
  TextInputIterator entry_iter(ime_type, iter);
  const bool format_unknown = (entry_iter.ime_type() == NUM_IMES);
  if (format_unknown) {
    return IMPORT_NOT_SUPPORTED;
  }
  return ImportFromIterator(&entry_iter, user_dic);
}
// Creates a line iterator over |data|, starting at its first byte.
UserDictionaryImporter::StringTextLineIterator::StringTextLineIterator(
    StringPiece data)
    : data_(data, 0),
      position_(0) {
}
// Nothing to do in the destructor body.
UserDictionaryImporter::StringTextLineIterator::~StringTextLineIterator() {
}
// Returns true while the read position has not reached the end of the data.
bool UserDictionaryImporter::StringTextLineIterator::IsAvailable() const {
  const bool reached_end = !(position_ < data_.length());
  return !reached_end;
}
// Copies the next line (without its terminator) into |line| and advances the
// read position past the terminator. "\n", "\r" and "\r\n" are all accepted
// as line terminators. The last line does not need a terminator. Returns
// false when no data is left.
bool UserDictionaryImporter::StringTextLineIterator::Next(string *line) {
  if (!IsAvailable()) {
    return false;
  }

  // Scan forward to the first CR or LF, or to the end of the data.
  size_t eol = position_;
  while (eol < data_.length() && data_[eol] != '\n' && data_[eol] != '\r') {
    ++eol;
  }

  data_.substr(position_, eol - position_).CopyToString(line);

  if (eol == data_.length()) {
    // No terminator was found: this was the last line.
    position_ = data_.length();
    return true;
  }

  // Handles CR/LF issue: a CR immediately followed by LF is one terminator.
  const StringPiece crlf("\r\n");
  const bool is_crlf = (data_.substr(eol, 2).compare(crlf) == 0);
  position_ = is_crlf ? (eol + 2) : (eol + 1);
  return true;
}
// Rewinds the iterator so that the next call of Next() returns the first line
// again.
void UserDictionaryImporter::StringTextLineIterator::Reset() {
  position_ = 0;
}
// Wraps the line iterator |iter| and decides which dictionary format is used.
// The first line is read to guess the format, the line iterator is reset, and
// the guess is combined with the requested |ime_type| by
// DetermineFinalIMEType(). When |iter| has no data, the type stays NUM_IMES.
UserDictionaryImporter::TextInputIterator::TextInputIterator(
    IMEType ime_type,
    TextLineIteratorInterface *iter)
    : ime_type_(NUM_IMES), iter_(iter) {
  CHECK(iter_);
  if (iter_->IsAvailable()) {
    IMEType guessed_type = NUM_IMES;
    string first_line;
    const bool has_first_line = iter_->Next(&first_line);
    if (has_first_line) {
      guessed_type = GuessIMEType(first_line);
      // Rewind so that the first line is also seen by Next().
      iter_->Reset();
    }
    ime_type_ = DetermineFinalIMEType(ime_type, guessed_type);
    VLOG(1) << "Setting type to: " << static_cast<int>(ime_type_);
  }
}
// Nothing to do in the destructor body.
UserDictionaryImporter::TextInputIterator::~TextInputIterator() {
}
// Returns true when the underlying line iterator still has data and a
// concrete dictionary format (neither IME_AUTO_DETECT nor NUM_IMES) has been
// determined.
bool UserDictionaryImporter::TextInputIterator::IsAvailable() const {
  DCHECK(iter_);
  if (!iter_->IsAvailable()) {
    return false;
  }
  const bool format_undecided =
      (ime_type_ == IME_AUTO_DETECT || ime_type_ == NUM_IMES);
  return !format_undecided;
}
// Parses lines until one yields an entry, stores it in |entry| and returns
// true. Empty lines, comment lines and lines with fewer than three fields are
// skipped. Returns false when the iterator is not available, |entry| is NULL,
// the format is unknown, or the input is exhausted.
//
// MSIME, ATOK and MOZC lines are tab separated (key, value, POS and an
// optional comment); KOTOERI lines are CSV (key, value, POS).
bool UserDictionaryImporter::TextInputIterator::Next(RawEntry *entry) {
  DCHECK(iter_);
  if (!IsAvailable()) {
    LOG(ERROR) << "iterator is not available";
    return false;
  }
  if (entry == NULL) {
    LOG(ERROR) << "Entry is NULL";
    return false;
  }
  entry->Clear();

  const bool is_msime_or_atok = (ime_type_ == MSIME || ime_type_ == ATOK);
  const bool is_tab_separated = (is_msime_or_atok || ime_type_ == MOZC);
  const bool is_csv = (ime_type_ == KOTOERI);

  string line;
  while (iter_->Next(&line)) {
    Util::ChopReturns(&line);

    // Skip empty lines.
    if (line.empty()) {
      continue;
    }

    // Skip comment lines. The comment marker depends on the format.
    bool is_comment = false;
    if (is_msime_or_atok) {
      is_comment = (line[0] == '!');
    } else if (ime_type_ == MOZC) {
      is_comment = (line[0] == '#');
    } else if (is_csv) {
      is_comment = (line.find("//") == 0);
    }
    if (is_comment) {
      continue;
    }

    VLOG(2) << line;

    if (!is_tab_separated && !is_csv) {
      LOG(ERROR) << "Unknown format: " << static_cast<int>(ime_type_);
      return false;
    }

    vector<string> fields;
    if (is_tab_separated) {
      Util::SplitStringAllowEmpty(line, "\t", &fields);
    } else {
      Util::SplitCSV(line, &fields);
    }
    if (fields.size() < 3) {
      continue;  // Ignore this line.
    }

    entry->key = fields[0];
    entry->value = fields[1];
    entry->pos = fields[2];
    // Only the tab separated formats carry a comment column.
    if (is_tab_separated && fields.size() >= 4) {
      entry->comment = fields[3];
    }
    return true;
  }

  return false;
}
// Converts |from| into |to| using the POS mapping table kPOSMap.
// See ConvertEntryInternal() for the conditions under which false is returned.
bool UserDictionaryImporter::ConvertEntry(
    const RawEntry &from, UserDictionary::Entry *to) {
  const size_t map_size = arraysize(kPOSMap);
  return ConvertEntryInternal(kPOSMap, map_size, from, to);
}
// Guesses the dictionary format from one line (normally the first line of a
// file). Returns NUM_IMES when the format cannot be determined.
//
//  - A line starting with "!microsoft ime" (case-insensitive) -> MSIME.
//  - "!!dicut<N>" with N >= 11, or "!!atok_tango_text_header" -> ATOK.
//  - A line enclosed in double quotes that contains no tab -> KOTOERI.
//  - A line starting with '#' or containing a tab -> MOZC.
UserDictionaryImporter::IMEType
UserDictionaryImporter::GuessIMEType(StringPiece line) {
  if (line.empty()) {
    return NUM_IMES;
  }

  // Header keywords are matched case-insensitively.
  string lowered = line.as_string();
  Util::LowerString(&lowered);

  if (lowered.find("!microsoft ime") == 0) {
    return MSIME;
  }

  // Old ATOK format (!!DICUT10) is not supported for now
  // http://b/2455897
  const size_t kDicutPrefixLength = 7;  // Length of "!!dicut".
  if (lowered.find("!!dicut") == 0 && lowered.size() > kDicutPrefixLength) {
    const string version = lowered.substr(kDicutPrefixLength);
    return (NumberUtil::SimpleAtoi(version) >= 11) ? ATOK : NUM_IMES;
  }

  if (lowered.find("!!atok_tango_text_header") == 0) {
    return ATOK;
  }

  const char first_char = line[0];
  const char last_char = line[line.size() - 1];
  const bool has_tab = (line.find("\t") != string::npos);

  if (first_char == '"' && last_char == '"' && !has_tab) {
    return KOTOERI;
  }

  if (first_char == '#' || has_tab) {
    return MOZC;
  }

  return NUM_IMES;
}
// Combines the format requested by the user with the format guessed from the
// input and returns the format to use, or NUM_IMES when they are
// incompatible.
UserDictionaryImporter::IMEType UserDictionaryImporter::DetermineFinalIMEType(
    IMEType user_ime_type, IMEType guessed_ime_type) {
  switch (user_ime_type) {
    case IME_AUTO_DETECT:
      // Trust guessed type.
      return guessed_ime_type;
    case MOZC:
      // MOZC is compatible with MS-IME and ATOK.
      // Even if the auto detection failed, try to use Mozc format.
      return (guessed_ime_type != KOTOERI) ? user_ime_type : NUM_IMES;
    default:
      // ATOK, MS-IME and Kotoeri can be detected with 100% accuracy, so the
      // guess has to agree with the request.
      return (guessed_ime_type == user_ime_type) ? user_ime_type : NUM_IMES;
  }
}
// Guesses the encoding of |str|.
//
// Returns UTF16 when |str| starts with a UTF-16 byte order mark (FF FE or
// FE FF) and UTF8 when it starts with the UTF-8 byte order mark (EF BB BF).
// Otherwise the bytes are decoded as UTF-8 and two ratios are computed over
// the size of |str|: the bytes that belong to decoded characters (lead byte
// plus continuation bytes in the range 0x80-0xBF), and the bytes of characters
// that are white space or have a known script type. UTF8 is returned when the
// ratios are at least 0.9 and 0.5 respectively, SHIFT_JIS otherwise. An empty
// |str| yields SHIFT_JIS.
UserDictionaryImporter::EncodingType
UserDictionaryImporter::GuessEncodingType(StringPiece str) {
  // Nothing to inspect. Return the fallback explicitly instead of dividing by
  // a zero size below.
  if (str.empty()) {
    return SHIFT_JIS;
  }

  // Unicode BOM.
  if (str.size() >= 2 &&
      ((static_cast<uint8>(str[0]) == 0xFF &&
        static_cast<uint8>(str[1]) == 0xFE) ||
       (static_cast<uint8>(str[0]) == 0xFE &&
        static_cast<uint8>(str[1]) == 0xFF))) {
    return UTF16;
  }

  // UTF-8 BOM.
  if (str.size() >= 3 &&
      static_cast<uint8>(str[0]) == 0xEF &&
      static_cast<uint8>(str[1]) == 0xBB &&
      static_cast<uint8>(str[2]) == 0xBF) {
    return UTF8;
  }

  // Count valid UTF8 characters.
  // TODO(taku): Improve the accuracy by making a DFA.
  const char *begin = str.data();
  const char *end = str.data() + str.size();
  size_t valid_utf8 = 0;
  size_t valid_script = 0;
  while (begin < end) {
    size_t mblen = 0;
    const char32 ucs4 = Util::UTF8ToUCS4(begin, end, &mblen);
    if (mblen == 0) {
      break;
    }
    ++valid_utf8;
    for (size_t i = 1; i < mblen; ++i) {
      // Compare as an unsigned byte. Where plain char is signed, a byte of
      // 0x80 or above is negative and would never fall into this range.
      const uint8 trail_byte = static_cast<uint8>(begin[i]);
      if (trail_byte >= 0x80 && trail_byte <= 0xBF) {
        ++valid_utf8;
      }
    }

    // "\n\r\t " or Japanese code point
    if (ucs4 == 0x000A || ucs4 == 0x000D ||
        ucs4 == 0x0020 || ucs4 == 0x0009 ||
        Util::GetScriptType(ucs4) != Util::UNKNOWN_SCRIPT) {
      valid_script += mblen;
    }

    begin += mblen;
  }

  // TODO(taku): No theoretical justification for these parameters.
  if (1.0 * valid_utf8 / str.size() >= 0.9 &&
      1.0 * valid_script / str.size() >= 0.5) {
    return UTF8;
  }

  return SHIFT_JIS;
}
// Guesses the encoding of the file |filename| from at most its first 1024
// bytes. Returns NUM_ENCODINGS when the file cannot be opened.
UserDictionaryImporter::EncodingType
UserDictionaryImporter::GuessFileEncodingType(const string &filename) {
  Mmap mapped_file;
  if (!mapped_file.Open(filename.c_str(), "r")) {
    LOG(ERROR) << "cannot open: " << filename;
    return NUM_ENCODINGS;
  }

  // Only the head of the file is inspected.
  const size_t kMaxCheckSize = 1024;
  const size_t file_size = static_cast<size_t>(mapped_file.size());
  const size_t check_size =
      (file_size < kMaxCheckSize) ? file_size : kMaxCheckSize;

  const StringPiece head(static_cast<const char *>(mapped_file.begin()),
                         check_size);
  return GuessEncodingType(head);
}
} // namespace mozc