src/dictionary/user_dictionary_importer.cc - mozc - Git at Google

 // Copyright 2010-2015, Google Inc.
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
 //
 //     * Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
 //     * Redistributions in binary form must reproduce the above
 // copyright notice, this list of conditions and the following disclaimer
 // in the documentation and/or other materials provided with the
 // distribution.
 //     * Neither the name of Google Inc. nor the names of its
 // contributors may be used to endorse or promote products derived from
 // this software without specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #include "dictionary/user_dictionary_importer.h"

 #ifdef OS_WIN
 #include <windows.h>
 #ifdef HAS_MSIME_HEADER
 #include <msime.h>
 #endif  // HAS_MSIME_HEADER
 #endif  // OS_WIN

 #include <algorithm>
 #include <cstring>
 #include <map>
 #include <set>
 #include <string>
 #include <vector>

 #include "base/compiler_specific.h"
 #include "base/mmap.h"
 #include "base/number_util.h"
 #include "base/port.h"
 #include "base/system_util.h"
 #include "base/util.h"
 #include "base/win_util.h"
 #include "dictionary/user_dictionary_util.h"

 namespace mozc {

 using user_dictionary::UserDictionary;
 using user_dictionary::UserDictionaryCommandStatus;

 namespace {

 // Returns a fingerprint computed over the key, the value and the POS of
 // |entry|, joined with tab characters. The POS is appended as a single byte,
 // which is why it is DCHECKed to lie within [0, 255].
 uint64 EntryFingerprint(const UserDictionary::Entry &entry) {
   DCHECK_LE(0, entry.pos());
 MOZC_CLANG_PUSH_WARNING();
 #if MOZC_CLANG_HAS_WARNING(tautological-constant-out-of-range-compare)
 MOZC_CLANG_DISABLE_WARNING(tautological-constant-out-of-range-compare);
 #endif  // MOZC_CLANG_HAS_WARNING(tautological-constant-out-of-range-compare)
   DCHECK_LE(entry.pos(), 255);
 MOZC_CLANG_POP_WARNING();
   // Narrow the POS to one byte so that it occupies exactly one character of
   // the fingerprinted string.
   const char pos_byte = static_cast<char>(entry.pos());
   return Util::Fingerprint(
       entry.key() + "\t" + entry.value() + "\t" + pos_byte);
 }

 void NormalizePOS(const string &input, string *output) {
   string tmp;
   output->clear();
   Util::FullWidthAsciiToHalfWidthAscii(input, &tmp);
   Util::HalfWidthKatakanaToFullWidthKatakana(tmp, output);
 }

 // A data type to hold one conversion rule from a third party IME's POS to a
 // Mozc POS. When mozc_pos is not a valid UserDictionary::PosType value (see
 // the PosType_IsValid() check in ConvertEntryInternal()), words of that POS
 // are rejected instead of being imported.
 struct POSMap {
   const char *source_pos;  // POS string of a third party IME.
   UserDictionary::PosType mozc_pos;  // POS of Mozc.
 };

 // Include actual POS mapping rules defined outside the file.
 #include "dictionary/pos_map.h"

 // A functor for searching an array of POSMap for the given POS. The class is
 // used with std::lower_bound().
 class POSMapCompare {
  public:
   bool operator() (const POSMap &l_pos_map, const POSMap &r_pos_map) const {
     return (strcmp(l_pos_map.source_pos, r_pos_map.source_pos) < 0);
   }
 };

 // Converts |from|, a raw entry of a third party IME, into the Mozc entry |to|
 // by translating its POS through |pos_map|.
 //
 // |pos_map| points to |map_size| rules and is binary-searched with
 // POSMapCompare, so the rules are expected to be ordered by source_pos.
 // Returns false when |to| is NULL, when the POS is empty, unknown or mapped
 // to an invalid Mozc POS, or when the converted entry fails validation.
 bool ConvertEntryInternal(
     const POSMap *pos_map,
     size_t map_size,
     const UserDictionaryImporter::RawEntry &from,
     UserDictionary::Entry *to) {
   if (to == NULL) {
     LOG(ERROR) << "Null pointer is passed.";
     return false;
   }

   to->Clear();

   if (from.pos.empty()) {
     return false;
   }

   // Normalize POS (remove full width ascii and half width katakana).
   string normalized_pos;
   NormalizePOS(from.pos, &normalized_pos);

   // ATOK's POS carries a trailing marker ('$' or '*') that distinguishes
   // auto-registered words from manually-registered ones. Drop the marker.
   if (!normalized_pos.empty()) {
     const char last_char = normalized_pos[normalized_pos.size() - 1];
     if (last_char == '$' || last_char == '*') {
       normalized_pos.erase(normalized_pos.size() - 1);
     }
   }

   // Only source_pos takes part in the comparison; mozc_pos is a placeholder.
   POSMap probe;
   probe.source_pos = normalized_pos.c_str();
   probe.mozc_pos = static_cast<UserDictionary::PosType>(0);

   // Binary search for the rule whose source_pos equals the normalized POS.
   const POSMap *const map_end = pos_map + map_size;
   const POSMap *const hit =
       lower_bound(pos_map, map_end, probe, POSMapCompare());
   const bool is_exact_match =
       (hit != map_end) && (strcmp(hit->source_pos, probe.source_pos) == 0);
   if (!is_exact_match) {
     LOG(WARNING) << "Invalid POS is passed: " << from.pos;
     return false;
   }

   // A rule with an invalid Mozc POS marks words that must not be imported.
   if (!UserDictionary::PosType_IsValid(hit->mozc_pos)) {
     to->clear_key();
     to->clear_value();
     to->clear_pos();
     return false;
   }

   // The reading is stored in its normalized form.
   string normalized_key;
   UserDictionaryUtil::NormalizeReading(from.key, &normalized_key);
   to->set_key(normalized_key);
   to->set_value(from.value);
   to->set_pos(hit->mozc_pos);

   // The comment is optional and copied only when present.
   if (!from.comment.empty()) {
     to->set_comment(from.comment);
   }

   // The conversion succeeds only if the resulting entry validates.
   return UserDictionaryUtil::ValidateEntry(*to) ==
          UserDictionaryCommandStatus::USER_DICTIONARY_COMMAND_SUCCESS;
 }

 }  // namespace

 #if defined(OS_WIN) && defined(HAS_MSIME_HEADER)
 namespace {

 // Number of IMEWRD records held by the buffer that MSIMEImportIterator passes
 // to GetWords() / NextWords().
 const size_t kBufferSize = 256;

 // ProgID of MS-IME Japanese. It is resolved to a CLSID with CLSIDFromProgID().
 const wchar_t kVersionIndependentProgIdForMSIME[] = L"MSIME.Japan";

 // Interface identifier of user dictionary in MS-IME.
 // {019F7153-E6DB-11d0-83C3-00C04FDDB82E}
 const GUID kIidIFEDictionary = {
   0x19f7153, 0xe6db, 0x11d0, {0x83, 0xc3, 0x0, 0xc0, 0x4f, 0xdd, 0xb8, 0x2e}
 };

 // Creates the IFEDictionary COM object of MS-IME Japanese.
 // Returns nullptr when the ProgID cannot be resolved or when the object
 // cannot be instantiated. On success the caller receives the interface
 // pointer produced by CoCreateInstance().
 IFEDictionary *CreateIFEDictionary() {
   // On Windows 7 and prior, multiple versions of MS-IME can be installed
   // side-by-side. As far as we've observed, the latest version will be chosen
   // with version-independent ProgId.
   CLSID msime_class_id = GUID_NULL;
   const HRESULT lookup_result = ::CLSIDFromProgID(
       kVersionIndependentProgIdForMSIME, &msime_class_id);
   if (FAILED(lookup_result)) {
     LOG(ERROR) << "CLSIDFromProgID() failed: " << lookup_result;
     return nullptr;
   }

   IFEDictionary *dictionary = nullptr;
   const HRESULT create_result = ::CoCreateInstance(
       msime_class_id,
       nullptr,
       CLSCTX_INPROC_SERVER,
       kIidIFEDictionary,
       reinterpret_cast<void **>(&dictionary));
   if (FAILED(create_result)) {
     LOG(ERROR) << "CoCreateInstance() failed: " << create_result;
     return nullptr;
   }

   VLOG(1) << "Can create IFEDictionary successfully";
   return dictionary;
 }

 // Owns an IFEDictionary pointer and, if it is non-NULL, calls Close() and
 // Release() on it when the wrapper is destroyed.
 //
 // The wrapper is not copyable: a copy would hold the same raw pointer and
 // both destructors would close and release the same object.
 class ScopedIFEDictionary {
  public:
   explicit ScopedIFEDictionary(IFEDictionary *dic)
       : dic_(dic) {}

   ~ScopedIFEDictionary() {
     if (dic_ != NULL) {
       dic_->Close();
       dic_->Release();
     }
   }

   // Accessors to the wrapped interface. get() may return NULL; operator*
   // and operator-> must only be used when it does not.
   IFEDictionary & operator*() const { return *dic_; }
   IFEDictionary* operator->() const { return dic_; }
   IFEDictionary* get() const { return dic_; }

  private:
   // Declared but intentionally left undefined to forbid copying.
   ScopedIFEDictionary(const ScopedIFEDictionary &);
   void operator=(const ScopedIFEDictionary &);

   IFEDictionary *dic_;
 };

 // Iterator over the user-registered words of the MS-IME user dictionary.
 //
 // The constructor opens the dictionary, builds a POS id -> POS name table
 // and fetches the first batch of words. Next() then hands the words out one
 // by one as RawEntry records.
 //
 // Next() may return true with an empty (cleared) |entry|: this happens when
 // a record is skipped (NULL reading/display, unknown POS id) and when the
 // internal buffer has just been refilled.
 class MSIMEImportIterator
     : public UserDictionaryImporter::InputIteratorInterface {
  public:
   // The initializers follow the declaration order of the members (buf_
   // precedes dic_), which is the order in which they are really initialized.
   MSIMEImportIterator()
       : buf_(kBufferSize), dic_(CreateIFEDictionary()),
         result_(E_FAIL), size_(0), index_(0) {
     if (dic_.get() == NULL) {
       LOG(ERROR) << "IFEDictionaryFactory returned NULL";
       return;
     }

     // Open the user dictionary. On failure result_ keeps its initial E_FAIL
     // so that IsAvailable() reports false.
     const HRESULT open_result = dic_->Open(NULL, NULL);
     if (S_OK != open_result) {
       LOG(ERROR) << "Cannot open user dictionary: " << open_result;
       return;
     }

     POSTBL *pos_table = NULL;
     int pos_size = 0;
     result_ = dic_->GetPosTable(&pos_table, &pos_size);
     if (S_OK != result_ || pos_table == NULL || pos_size == 0) {
       LOG(ERROR) << "Cannot get POS table: " << result_;
       result_ = E_FAIL;
       return;
     }

     // Build the POS id -> UTF-8 POS name table. Names are converted from
     // Shift_JIS.
     string name;
     for (int i = 0; i < pos_size; ++i) {
       Util::SJISToUTF8(reinterpret_cast<char *>(pos_table[i].szName), &name);
       pos_map_.insert(make_pair(pos_table[i].nPos, name));
     }

     // extract all words registered by user.
     // Don't use auto-registered words, since Mozc may not be able to
     // handle auto_registered words correctly, and user is basically
     // unaware of auto-registered words.
     result_ = dic_->GetWords(NULL, NULL, NULL,
                              IFED_POS_ALL,
                              IFED_SELECT_ALL,
                              IFED_REG_USER,  // | FED_REG_AUTO
                              reinterpret_cast<UCHAR *>(&buf_[0]),
                              kBufferSize * sizeof(IMEWRD),
                              &size_);
   }

   // True while the last dictionary call reported either a complete result
   // (S_OK) or that more entries are pending (IFED_S_MORE_ENTRIES).
   bool IsAvailable() const {
     return result_ == IFED_S_MORE_ENTRIES || result_ == S_OK;
   }

   // Stores the next word into |entry|. Returns false when the iterator is
   // not available, when |entry| is NULL, when all words have been consumed
   // or when fetching the next batch fails.
   //
   // NOTE: Without "UserDictionaryImporter::", Visual C++ 2008 somehow fails
   //     to look up the type name.
   bool Next(UserDictionaryImporter::RawEntry *entry) {
     if (!IsAvailable()) {
       LOG(ERROR) << "Iterator is not available";
       return false;
     }

     if (entry == NULL) {
       LOG(ERROR) << "Entry is NULL";
       return false;
     }
     entry->Clear();

     if (index_ < size_) {
       // Bind the current record before advancing, so that every access
       // below (including the error logs) refers to the record being
       // processed and never to the slot after it.
       const IMEWRD &word = buf_[index_];
       ++index_;

       if (word.pwchReading == NULL || word.pwchDisplay == NULL) {
         LOG(ERROR) << "pwchDisplay or pwchReading is NULL";
         return true;
       }

       // set key/value
       Util::WideToUTF8(word.pwchReading, &entry->key);
       Util::WideToUTF8(word.pwchDisplay, &entry->value);

       // set POS
       const map<int, string>::const_iterator it = pos_map_.find(word.nPos1);
       if (it == pos_map_.end()) {
         LOG(ERROR) << "Unknown POS id: " << word.nPos1;
         entry->Clear();
         return true;
       }
       entry->pos = it->second;

       // set comment; comments of any other type are ignored.
       if (word.pvComment != NULL) {
         if (word.uct == IFED_UCT_STRING_SJIS) {
           Util::SJISToUTF8(
               reinterpret_cast<const char *>(word.pvComment),
               &entry->comment);
         } else if (word.uct == IFED_UCT_STRING_UNICODE) {
           Util::WideToUTF8(
               reinterpret_cast<const wchar_t *>(word.pvComment),
               &entry->comment);
         }
       }
       return true;
     }

     // The buffer is drained.
     if (result_ == S_OK) {
       // The last batch was the final one.
       return false;
     }
     if (result_ == IFED_S_MORE_ENTRIES) {
       // Refill the buffer. |entry| stays empty for this call; the words of
       // the new batch are returned by the following calls.
       result_ = dic_->NextWords(reinterpret_cast<UCHAR *>(&buf_[0]),
                                 kBufferSize * sizeof(IMEWRD),
                                 &size_);
       if (result_ == E_FAIL) {
         LOG(ERROR) << "NextWords() failed";
         return false;
       }
       index_ = 0;
       return true;
     }

     return false;
   }

  private:
   vector<IMEWRD> buf_;         // Batch of words filled by the dictionary.
   ScopedIFEDictionary dic_;    // MS-IME dictionary; may wrap NULL.
   map<int, string> pos_map_;   // POS id -> POS name in UTF-8.
   HRESULT result_;             // Result of the last dictionary call.
   ULONG size_;                 // Number of valid records in buf_.
   ULONG index_;                // Index of the next record to return.
 };

 }  // namespace
 #endif  // OS_WIN && HAS_MSIME_HEADER

 // Imports the words of the MS-IME user dictionary into |user_dic|.
 // The import is compiled in only when both OS_WIN and HAS_MSIME_HEADER are
 // defined; every other build reports IMPORT_NOT_SUPPORTED.
 UserDictionaryImporter::ErrorType UserDictionaryImporter::ImportFromMSIME(
     UserDictionary *user_dic) {
   DCHECK(user_dic);
 #if defined(OS_WIN) && defined(HAS_MSIME_HEADER)
   MSIMEImportIterator msime_iter;
   return ImportFromIterator(&msime_iter, user_dic);
 #else  // OS_WIN && HAS_MSIME_HEADER
   return IMPORT_NOT_SUPPORTED;
 #endif  // OS_WIN && HAS_MSIME_HEADER
 }

 // Appends the entries produced by |iter| to |user_dic|.
 //
 // Entries that are already in |user_dic|, or that were added earlier in the
 // same import, are skipped silently; they are recognized by their
 // EntryFingerprint(). Returns:
 //   IMPORT_FATAL            if |iter| or |user_dic| is NULL,
 //   IMPORT_TOO_MANY_WORDS   if the dictionary is full when a further entry
 //                           is read (the import stops there),
 //   IMPORT_INVALID_ENTRIES  if at least one entry could not be converted,
 //   IMPORT_NO_ERROR         otherwise.
 UserDictionaryImporter::ErrorType UserDictionaryImporter::ImportFromIterator(
     InputIteratorInterface *iter, UserDictionary *user_dic) {
   if (iter == NULL || user_dic == NULL) {
     LOG(ERROR) << "iter or user_dic is NULL";
     return IMPORT_FATAL;
   }

   const size_t max_size = UserDictionaryUtil::max_entry_size();

   // Fingerprints of everything that is currently in the dictionary.
   set<uint64> known_fingerprints;
   const int initial_size = user_dic->entries_size();
   for (int i = 0; i < initial_size; ++i) {
     known_fingerprints.insert(EntryFingerprint(user_dic->entries(i)));
   }

   ErrorType status = IMPORT_NO_ERROR;
   RawEntry raw_entry;
   UserDictionary::Entry converted;
   while (iter->Next(&raw_entry)) {
     if (static_cast<size_t>(user_dic->entries_size()) >= max_size) {
       LOG(WARNING) << "Too many words in one dictionary";
       return IMPORT_TOO_MANY_WORDS;
     }

     // Empty entry is just skipped. It could be annoying if we show a
     // warning dialog when these empty candidates exist.
     const bool is_empty = raw_entry.key.empty() &&
                           raw_entry.value.empty() &&
                           raw_entry.comment.empty();
     if (is_empty) {
       continue;
     }

     if (!ConvertEntry(raw_entry, &converted)) {
       LOG(WARNING) << "Entry is not valid";
       status = IMPORT_INVALID_ENTRIES;
       continue;
     }

     // Don't register a word if it is already in the current dictionary.
     const uint64 fingerprint = EntryFingerprint(converted);
     const bool is_new = known_fingerprints.insert(fingerprint).second;
     if (!is_new) {
       continue;
     }

     UserDictionary::Entry *added = user_dic->add_entries();
     DCHECK(added);
     added->CopyFrom(converted);
   }

   return status;
 }

 // Imports entries from text lines produced by |iter| into |user_dic|.
 // The lines are wrapped in a TextInputIterator built for |ime_type|; if that
 // iterator ends up with NUM_IMES as its IME type, the import is refused with
 // IMPORT_NOT_SUPPORTED.
 UserDictionaryImporter::ErrorType
 UserDictionaryImporter::ImportFromTextLineIterator(
     IMEType ime_type,
     TextLineIteratorInterface *iter,
     UserDictionary *user_dic) {
   TextInputIterator entry_iter(ime_type, iter);
   const bool is_supported = (entry_iter.ime_type() != NUM_IMES);
   if (!is_supported) {
     return IMPORT_NOT_SUPPORTED;
   }
   return ImportFromIterator(&entry_iter, user_dic);
 }

 // Initializes data_ from |data| (starting at offset 0) and places the read
 // position at the beginning of the text.
 // NOTE(review): whether data_ copies |data| or only refers to it depends on
 // the member's declared type, which is not visible in this file.
 UserDictionaryImporter::StringTextLineIterator::StringTextLineIterator(
     StringPiece data) : data_(data, 0), position_(0) {}

 // Nothing to do here; the body is empty.
 UserDictionaryImporter::StringTextLineIterator::~StringTextLineIterator() {}

 // True while the read position has not reached the end of the text.
 bool UserDictionaryImporter::StringTextLineIterator::IsAvailable() const {
   const size_t total_length = data_.length();
   return position_ < total_length;
 }

 // Copies the next line, without its terminator, into |line| and moves the
 // read position past it. "\n", "\r" and "\r\n" are each treated as one line
 // terminator; text after the last terminator forms a final line.
 // Returns false when no text is left.
 bool UserDictionaryImporter::StringTextLineIterator::Next(string *line) {
   if (!IsAvailable()) {
     return false;
   }

   // Find the first line terminator at or after the read position.
   size_t line_end = position_;
   while (line_end < data_.length() &&
          data_[line_end] != '\n' && data_[line_end] != '\r') {
     ++line_end;
   }

   const StringPiece next_line = data_.substr(position_, line_end - position_);
   next_line.CopyToString(line);

   if (line_end == data_.length()) {
     // No terminator: this was the last line.
     position_ = data_.length();
     return true;
   }

   // Handles CR/LF issue: "\r\n" is skipped as a whole.
   const StringPiece crlf("\r\n");
   const StringPiece terminator = data_.substr(line_end, 2);
   const size_t terminator_length = (terminator.compare(crlf) == 0) ? 2 : 1;
   position_ = line_end + terminator_length;
   return true;
 }

 // Rewinds the read position to the beginning of the text, so that the next
 // call of Next() starts again from the first line.
 void UserDictionaryImporter::StringTextLineIterator::Reset() {
   position_ = 0;
 }

 // Wraps |iter| and decides which IME format its lines are parsed as.
 //
 // The first line of |iter| is read to guess the format, then |iter| is
 // rewound. The final type combines the requested |ime_type| with the guess
 // through DetermineFinalIMEType(). If |iter| has no data at all, the type
 // stays NUM_IMES.
 UserDictionaryImporter::TextInputIterator::TextInputIterator(
     IMEType ime_type,
     TextLineIteratorInterface *iter)
     : ime_type_(NUM_IMES), iter_(iter) {
   CHECK(iter_);
   if (!iter_->IsAvailable()) {
     return;
   }

   string first_line;
   IMEType guessed_type = NUM_IMES;
   const bool has_first_line = iter_->Next(&first_line);
   if (has_first_line) {
     guessed_type = GuessIMEType(first_line);
     // The inspected line must be seen again by Next().
     iter_->Reset();
   }

   ime_type_ = DetermineFinalIMEType(ime_type, guessed_type);

   VLOG(1) << "Setting type to: " << static_cast<int>(ime_type_);
 }

 // Nothing to do here; the body is empty.
 UserDictionaryImporter::TextInputIterator::~TextInputIterator() {}

 // True when the wrapped line iterator still has data and the IME type has
 // been settled to a concrete format (neither IME_AUTO_DETECT nor NUM_IMES).
 bool UserDictionaryImporter::TextInputIterator::IsAvailable() const {
   DCHECK(iter_);
   if (!iter_->IsAvailable()) {
     return false;
   }
   const bool is_unresolved =
       (ime_type_ == IME_AUTO_DETECT || ime_type_ == NUM_IMES);
   return !is_unresolved;
 }

 // Parses the next usable line into |entry|.
 //
 // Empty lines, comment lines and lines with fewer than three fields are
 // skipped. MSIME, ATOK and MOZC lines are tab separated (key, value, POS and
 // an optional comment); KOTOERI lines are CSV (key, value, POS).
 // Returns false when the iterator is not available, when |entry| is NULL,
 // when the IME type is not one of the above or when the input is exhausted.
 bool UserDictionaryImporter::TextInputIterator::Next(RawEntry *entry) {
   DCHECK(iter_);
   if (!IsAvailable()) {
     LOG(ERROR) << "iterator is not available";
     return false;
   }

   if (entry == NULL) {
     LOG(ERROR) << "Entry is NULL";
     return false;
   }

   entry->Clear();

   string line;
   while (iter_->Next(&line)) {
     Util::ChopReturns(&line);
     // Skip empty lines.
     if (line.empty()) {
       continue;
     }

     // Skip comment lines. The comment prefix depends on the format.
     bool is_comment = false;
     switch (ime_type_) {
       case MSIME:
       case ATOK:
         is_comment = (line[0] == '!');
         break;
       case MOZC:
         is_comment = (line[0] == '#');
         break;
       case KOTOERI:
         is_comment = (line.find("//") == 0);
         break;
       default:
         break;
     }
     if (is_comment) {
       continue;
     }

     VLOG(2) << line;

     // Split the line into fields according to the format.
     vector<string> fields;
     const bool is_tab_separated =
         (ime_type_ == MSIME || ime_type_ == ATOK || ime_type_ == MOZC);
     if (is_tab_separated) {
       Util::SplitStringAllowEmpty(line, "\t", &fields);
     } else if (ime_type_ == KOTOERI) {
       Util::SplitCSV(line, &fields);
     } else {
       LOG(ERROR) << "Unknown format: " << static_cast<int>(ime_type_);
       return false;
     }

     if (fields.size() < 3) {
       continue;  // Ignore this line.
     }

     entry->key = fields[0];
     entry->value = fields[1];
     entry->pos = fields[2];
     // Only the tab separated formats carry a comment column.
     if (is_tab_separated && fields.size() >= 4) {
       entry->comment = fields[3];
     }
     return true;
   }

   return false;
 }

 // Converts |from| into |to| using the POS mapping table kPOSMap.
 // Returns the result of ConvertEntryInternal().
 bool UserDictionaryImporter::ConvertEntry(
     const RawEntry &from, UserDictionary::Entry *to) {
   const size_t pos_map_size = arraysize(kPOSMap);
   return ConvertEntryInternal(kPOSMap, pos_map_size, from, to);
 }

 // Guesses the dictionary format from |line|, which is meant to be the first
 // line of the file. The checks are case-insensitive where a header keyword
 // is involved:
 //   "!Microsoft IME..."            -> MSIME
 //   "!!DICUT<n>..." with n >= 11   -> ATOK (older versions -> NUM_IMES)
 //   "!!ATOK_TANGO_TEXT_HEADER..."  -> ATOK
 //   "..." in double quotes, no tab -> KOTOERI
 //   leading '#' or any tab         -> MOZC
 // Anything else, including an empty line, yields NUM_IMES.
 UserDictionaryImporter::IMEType
 UserDictionaryImporter::GuessIMEType(StringPiece line) {
   if (line.empty()) {
     return NUM_IMES;
   }

   string lowered = line.as_string();
   Util::LowerString(&lowered);

   if (lowered.find("!microsoft ime") == 0) {
     return MSIME;
   }

   // Old ATOK format (!!DICUT10) is not supported for now
   // http://b/2455897
   if (lowered.find("!!dicut") == 0 && lowered.size() > 7) {
     // The version number follows the 7 characters of "!!dicut".
     const string version(lowered, 7, lowered.size() - 7);
     return (NumberUtil::SimpleAtoi(version) >= 11) ? ATOK : NUM_IMES;
   }

   if (lowered.find("!!atok_tango_text_header") == 0) {
     return ATOK;
   }

   const char first_char = *line.begin();
   const char last_char = *line.rbegin();
   const bool has_tab = (line.find("\t") != string::npos);

   if (first_char == '"' && last_char == '"' && !has_tab) {
     return KOTOERI;
   }

   if (first_char == '#' || has_tab) {
     return MOZC;
   }

   return NUM_IMES;
 }

 // Combines the IME type requested by the user with the type guessed from the
 // data. Returns NUM_IMES when the two cannot be reconciled.
 UserDictionaryImporter::IMEType UserDictionaryImporter::DetermineFinalIMEType(
     IMEType user_ime_type, IMEType guessed_ime_type) {
   // Auto detection: trust the guessed type.
   if (user_ime_type == IME_AUTO_DETECT) {
     return guessed_ime_type;
   }

   // MOZC is compatible with MS-IME and ATOK.
   // Even if the auto detection failed, try to use Mozc format. Only data
   // that was detected as Kotoeri is refused.
   if (user_ime_type == MOZC) {
     return (guessed_ime_type != KOTOERI) ? user_ime_type : NUM_IMES;
   }

   // ATOK, MS-IME and Kotoeri can be detected with 100% accuracy, so the
   // request is honored only when the guess agrees with it.
   return (guessed_ime_type == user_ime_type) ? user_ime_type : NUM_IMES;
 }

 // Guesses the character encoding of |str|.
 //
 // Returns UTF16 when |str| starts with a UTF-16 byte order mark (FF FE or
 // FE FF) and UTF8 when it starts with the UTF-8 byte order mark (EF BB BF).
 // Otherwise |str| is decoded as UTF-8 and reported as UTF8 only when
 //   - at least 90% of its bytes were counted as well-formed UTF-8, and
 //   - at least 50% of its bytes belong to "\n\r\t " or to characters whose
 //     script type is known.
 // Everything else, including empty input, is reported as SHIFT_JIS.
 UserDictionaryImporter::EncodingType
 UserDictionaryImporter::GuessEncodingType(StringPiece str) {
   // Unicode BOM.
   if (str.size() >= 2 &&
       ((static_cast<uint8>(str[0]) == 0xFF &&
         static_cast<uint8>(str[1]) == 0xFE) ||
        (static_cast<uint8>(str[0]) == 0xFE &&
         static_cast<uint8>(str[1]) == 0xFF))) {
     return UTF16;
   }

   // UTF-8 BOM.
   if (str.size() >= 3 &&
       static_cast<uint8>(str[0]) == 0xEF &&
       static_cast<uint8>(str[1]) == 0xBB &&
       static_cast<uint8>(str[2]) == 0xBF) {
     return UTF8;
   }

   // Nothing to measure. This is the value the ratio test below would yield
   // as well, but without dividing by zero.
   if (str.empty()) {
     return SHIFT_JIS;
   }

   // Count valid UTF8 characters.
   // TODO(taku): Improve the accuracy by making a DFA.
   const char *begin = str.data();
   const char *end = str.data() + str.size();
   size_t valid_utf8 = 0;
   size_t valid_script = 0;
   while (begin < end) {
     size_t mblen = 0;
     const char32 ucs4 = Util::UTF8ToUCS4(begin, end, &mblen);
     if (mblen == 0) {
       break;
     }
     // One for the leading byte ...
     ++valid_utf8;
     // ... and one for every trailing byte of the form 10xxxxxx. The byte is
     // inspected as an unsigned value: plain char may be signed, in which
     // case bytes >= 0x80 are negative and would never fall in this range.
     for (size_t i = 1; i < mblen; ++i) {
       const uint8 trail_byte = static_cast<uint8>(begin[i]);
       if (trail_byte >= 0x80 && trail_byte <= 0xBF) {
         ++valid_utf8;
       }
     }

     // "\n\r\t " or Japanese code point
     if (ucs4 == 0x000A || ucs4 == 0x000D ||
         ucs4 == 0x0020 || ucs4 == 0x0009 ||
         Util::GetScriptType(ucs4) != Util::UNKNOWN_SCRIPT) {
       valid_script += mblen;
     }

     begin += mblen;
   }

   // TODO(taku): No theoretical justification for these parameters.
   if (1.0 * valid_utf8 / str.size() >= 0.9 &&
       1.0 * valid_script / str.size() >= 0.5) {
     return UTF8;
   }

   return SHIFT_JIS;
 }

 // Guesses the encoding of the file |filename| from at most its first 1024
 // bytes. Returns NUM_ENCODINGS when the file cannot be opened.
 UserDictionaryImporter::EncodingType
 UserDictionaryImporter::GuessFileEncodingType(const string &filename) {
   Mmap file_map;
   if (!file_map.Open(filename.c_str(), "r")) {
     LOG(ERROR) << "cannot open: " << filename;
     return NUM_ENCODINGS;
   }

   // Only a prefix of the file is inspected.
   const size_t kMaxCheckSize = 1024;
   const size_t file_size = static_cast<size_t>(file_map.size());
   const size_t check_size =
       (file_size < kMaxCheckSize) ? file_size : kMaxCheckSize;
   const StringPiece head(static_cast<const char *>(file_map.begin()),
                          check_size);
   return GuessEncodingType(head);
 }

 }  // namespace mozc
	// Copyright 2010-2015, Google Inc.
	// All rights reserved.
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions are
	// met:
	//
	// * Redistributions of source code must retain the above copyright
	// notice, this list of conditions and the following disclaimer.
	// * Redistributions in binary form must reproduce the above
	// copyright notice, this list of conditions and the following disclaimer
	// in the documentation and/or other materials provided with the
	// distribution.
	// * Neither the name of Google Inc. nor the names of its
	// contributors may be used to endorse or promote products derived from
	// this software without specific prior written permission.
	//
	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	#include "dictionary/user_dictionary_importer.h"

	#ifdef OS_WIN
	#include <windows.h>
	#ifdef HAS_MSIME_HEADER
	#indlude <msime.h>
	#endif // HAS_MSIME_HEADER
	#endif // OS_WIN

	#include <algorithm>
	#include <map>
	#include <set>
	#include <string>
	#include <vector>

	#include "base/compiler_specific.h"
	#include "base/mmap.h"
	#include "base/number_util.h"
	#include "base/port.h"
	#include "base/system_util.h"
	#include "base/util.h"
	#include "base/win_util.h"
	#include "dictionary/user_dictionary_util.h"

	namespace mozc {

	using user_dictionary::UserDictionary;
	using user_dictionary::UserDictionaryCommandStatus;

	namespace {

	// Returns a fingerprint computed over the key, the value and the POS of
	// |entry|, joined with tab characters. The POS is appended as a single byte,
	// which is why it is DCHECKed to lie within [0, 255].
	uint64 EntryFingerprint(const UserDictionary::Entry &entry) {
	  DCHECK_LE(0, entry.pos());
	MOZC_CLANG_PUSH_WARNING();
	#if MOZC_CLANG_HAS_WARNING(tautological-constant-out-of-range-compare)
	MOZC_CLANG_DISABLE_WARNING(tautological-constant-out-of-range-compare);
	#endif // MOZC_CLANG_HAS_WARNING(tautological-constant-out-of-range-compare)
	  DCHECK_LE(entry.pos(), 255);
	MOZC_CLANG_POP_WARNING();
	  // Narrow the POS to one byte so that it occupies exactly one character of
	  // the fingerprinted string.
	  const char pos_byte = static_cast<char>(entry.pos());
	  return Util::Fingerprint(
	      entry.key() + "\t" + entry.value() + "\t" + pos_byte);
	}

	void NormalizePOS(const string &input, string *output) {
	string tmp;
	output->clear();
	Util::FullWidthAsciiToHalfWidthAscii(input, &tmp);
	Util::HalfWidthKatakanaToFullWidthKatakana(tmp, output);
	}

	// A data type to hold one conversion rule from a third party IME's POS to a
	// Mozc POS. When mozc_pos is not a valid UserDictionary::PosType value (see
	// the PosType_IsValid() check in ConvertEntryInternal()), words of that POS
	// are rejected instead of being imported.
	struct POSMap {
	const char *source_pos; // POS string of a third party IME.
	UserDictionary::PosType mozc_pos; // POS of Mozc.
	};

	// Include actual POS mapping rules defined outside the file.
	#include "dictionary/pos_map.h"

	// A functor for searching an array of POSMap for the given POS. The class is
	// used with std::lower_bound().
	class POSMapCompare {
	public:
	bool operator() (const POSMap &l_pos_map, const POSMap &r_pos_map) const {
	return (strcmp(l_pos_map.source_pos, r_pos_map.source_pos) < 0);
	}
	};

	// Converts |from|, a raw entry of a third party IME, into the Mozc entry |to|
	// by translating its POS through |pos_map|.
	//
	// |pos_map| points to |map_size| rules and is binary-searched with
	// POSMapCompare, so the rules are expected to be ordered by source_pos.
	// Returns false when |to| is NULL, when the POS is empty, unknown or mapped
	// to an invalid Mozc POS, or when the converted entry fails validation.
	bool ConvertEntryInternal(
	    const POSMap *pos_map,
	    size_t map_size,
	    const UserDictionaryImporter::RawEntry &from,
	    UserDictionary::Entry *to) {
	  if (to == NULL) {
	    LOG(ERROR) << "Null pointer is passed.";
	    return false;
	  }

	  to->Clear();

	  if (from.pos.empty()) {
	    return false;
	  }

	  // Normalize POS (remove full width ascii and half width katakana)
	  string pos;
	  NormalizePOS(from.pos, &pos);

	  // ATOK's POS has a special marker for distinguishing auto-registered
	  // words/manually-registered words. Remove the mark here.
	  // The reverse iterator is dereferenced to inspect the last character.
	  // TODO(yukawa): Use string::back once C++11 is enabled on Mac.
	  if (!pos.empty() && (*pos.rbegin() == '$' || *pos.rbegin() == '*')) {
	    // TODO(matsuzakit): Use pop_back instead when C++11 is ready on Android.
	    pos.resize(pos.size() - 1);
	  }

	  // Only source_pos takes part in the comparison; mozc_pos is a placeholder.
	  POSMap key;
	  key.source_pos = pos.c_str();
	  key.mozc_pos = static_cast<UserDictionary::PosType>(0);

	  // Search for mapping for the given POS.
	  const POSMap *found = lower_bound(pos_map, pos_map + map_size,
	                                    key, POSMapCompare());
	  if (found == pos_map + map_size ||
	      strcmp(found->source_pos, key.source_pos) != 0) {
	    LOG(WARNING) << "Invalid POS is passed: " << from.pos;
	    return false;
	  }
	  // A rule with an invalid Mozc POS marks words that must not be imported.
	  if (!UserDictionary::PosType_IsValid(found->mozc_pos)) {
	    to->clear_key();
	    to->clear_value();
	    to->clear_pos();
	    return false;
	  }

	  to->set_key(from.key);
	  to->set_value(from.value);
	  to->set_pos(found->mozc_pos);

	  // Normalize reading.
	  string normalized_key;
	  UserDictionaryUtil::NormalizeReading(to->key(), &normalized_key);
	  to->set_key(normalized_key);

	  // Copy comment.
	  if (!from.comment.empty()) {
	    to->set_comment(from.comment);
	  }

	  // Validation.
	  if (UserDictionaryUtil::ValidateEntry(*to) !=
	      UserDictionaryCommandStatus::USER_DICTIONARY_COMMAND_SUCCESS) {
	    return false;
	  }

	  return true;
	}

	} // namespace

	#if defined(OS_WIN) && defined(HAS_MSIME_HEADER)
	namespace {

	// Number of IMEWRD records held by the buffer that MSIMEImportIterator passes
	// to GetWords() / NextWords().
	const size_t kBufferSize = 256;

	// ProgID of MS-IME Japanese. It is resolved to a CLSID with CLSIDFromProgID().
	const wchar_t kVersionIndependentProgIdForMSIME[] = L"MSIME.Japan";

	// Interface identifier of user dictionary in MS-IME.
	// {019F7153-E6DB-11d0-83C3-00C04FDDB82E}
	const GUID kIidIFEDictionary = {
	0x19f7153, 0xe6db, 0x11d0, {0x83, 0xc3, 0x0, 0xc0, 0x4f, 0xdd, 0xb8, 0x2e}
	};

	// Creates the IFEDictionary COM object of MS-IME Japanese.
	// Returns nullptr when the ProgID cannot be resolved or when the object
	// cannot be instantiated. On success the caller receives the interface
	// pointer produced by CoCreateInstance().
	IFEDictionary *CreateIFEDictionary() {
	  // On Windows 7 and prior, multiple versions of MS-IME can be installed
	  // side-by-side. As far as we've observed, the latest version will be chosen
	  // with version-independent ProgId.
	  CLSID msime_class_id = GUID_NULL;
	  const HRESULT lookup_result = ::CLSIDFromProgID(
	      kVersionIndependentProgIdForMSIME, &msime_class_id);
	  if (FAILED(lookup_result)) {
	    LOG(ERROR) << "CLSIDFromProgID() failed: " << lookup_result;
	    return nullptr;
	  }

	  IFEDictionary *dictionary = nullptr;
	  const HRESULT create_result = ::CoCreateInstance(
	      msime_class_id,
	      nullptr,
	      CLSCTX_INPROC_SERVER,
	      kIidIFEDictionary,
	      reinterpret_cast<void **>(&dictionary));
	  if (FAILED(create_result)) {
	    LOG(ERROR) << "CoCreateInstance() failed: " << create_result;
	    return nullptr;
	  }

	  VLOG(1) << "Can create IFEDictionary successfully";
	  return dictionary;
	}

	// Owns an IFEDictionary pointer and, if it is non-NULL, calls Close() and
	// Release() on it when the wrapper is destroyed.
	//
	// The wrapper is not copyable: a copy would hold the same raw pointer and
	// both destructors would close and release the same object.
	class ScopedIFEDictionary {
	 public:
	  explicit ScopedIFEDictionary(IFEDictionary *dic)
	      : dic_(dic) {}

	  ~ScopedIFEDictionary() {
	    if (dic_ != NULL) {
	      dic_->Close();
	      dic_->Release();
	    }
	  }

	  // Accessors to the wrapped interface. get() may return NULL; operator*
	  // and operator-> must only be used when it does not.
	  IFEDictionary & operator*() const { return *dic_; }
	  IFEDictionary* operator->() const { return dic_; }
	  IFEDictionary* get() const { return dic_; }

	 private:
	  // Declared but intentionally left undefined to forbid copying.
	  ScopedIFEDictionary(const ScopedIFEDictionary &);
	  void operator=(const ScopedIFEDictionary &);

	  IFEDictionary *dic_;
	};

	// Iterator over the user-registered words of the MS-IME user dictionary.
	//
	// The constructor opens the dictionary, builds a POS id -> POS name table
	// and fetches the first batch of words. Next() then hands the words out one
	// by one as RawEntry records.
	//
	// Next() may return true with an empty (cleared) |entry|: this happens when
	// a record is skipped (NULL reading/display, unknown POS id) and when the
	// internal buffer has just been refilled.
	class MSIMEImportIterator
	    : public UserDictionaryImporter::InputIteratorInterface {
	 public:
	  // The initializers follow the declaration order of the members (buf_
	  // precedes dic_), which is the order in which they are really initialized.
	  MSIMEImportIterator()
	      : buf_(kBufferSize), dic_(CreateIFEDictionary()),
	        result_(E_FAIL), size_(0), index_(0) {
	    if (dic_.get() == NULL) {
	      LOG(ERROR) << "IFEDictionaryFactory returned NULL";
	      return;
	    }

	    // Open the user dictionary. On failure result_ keeps its initial E_FAIL
	    // so that IsAvailable() reports false.
	    const HRESULT open_result = dic_->Open(NULL, NULL);
	    if (S_OK != open_result) {
	      LOG(ERROR) << "Cannot open user dictionary: " << open_result;
	      return;
	    }

	    POSTBL *pos_table = NULL;
	    int pos_size = 0;
	    result_ = dic_->GetPosTable(&pos_table, &pos_size);
	    if (S_OK != result_ || pos_table == NULL || pos_size == 0) {
	      LOG(ERROR) << "Cannot get POS table: " << result_;
	      result_ = E_FAIL;
	      return;
	    }

	    // Build the POS id -> UTF-8 POS name table. Names are converted from
	    // Shift_JIS.
	    string name;
	    for (int i = 0; i < pos_size; ++i) {
	      Util::SJISToUTF8(reinterpret_cast<char *>(pos_table[i].szName), &name);
	      pos_map_.insert(make_pair(pos_table[i].nPos, name));
	    }

	    // extract all words registered by user.
	    // Don't use auto-registered words, since Mozc may not be able to
	    // handle auto_registered words correctly, and user is basically
	    // unaware of auto-registered words.
	    result_ = dic_->GetWords(NULL, NULL, NULL,
	                             IFED_POS_ALL,
	                             IFED_SELECT_ALL,
	                             IFED_REG_USER,  // | FED_REG_AUTO
	                             reinterpret_cast<UCHAR *>(&buf_[0]),
	                             kBufferSize * sizeof(IMEWRD),
	                             &size_);
	  }

	  // True while the last dictionary call reported either a complete result
	  // (S_OK) or that more entries are pending (IFED_S_MORE_ENTRIES).
	  bool IsAvailable() const {
	    return result_ == IFED_S_MORE_ENTRIES || result_ == S_OK;
	  }

	  // Stores the next word into |entry|. Returns false when the iterator is
	  // not available, when |entry| is NULL, when all words have been consumed
	  // or when fetching the next batch fails.
	  //
	  // NOTE: Without "UserDictionaryImporter::", Visual C++ 2008 somehow fails
	  //     to look up the type name.
	  bool Next(UserDictionaryImporter::RawEntry *entry) {
	    if (!IsAvailable()) {
	      LOG(ERROR) << "Iterator is not available";
	      return false;
	    }

	    if (entry == NULL) {
	      LOG(ERROR) << "Entry is NULL";
	      return false;
	    }
	    entry->Clear();

	    if (index_ < size_) {
	      // Bind the current record before advancing, so that every access
	      // below (including the error logs) refers to the record being
	      // processed and never to the slot after it.
	      const IMEWRD &word = buf_[index_];
	      ++index_;

	      if (word.pwchReading == NULL || word.pwchDisplay == NULL) {
	        LOG(ERROR) << "pwchDisplay or pwchReading is NULL";
	        return true;
	      }

	      // set key/value
	      Util::WideToUTF8(word.pwchReading, &entry->key);
	      Util::WideToUTF8(word.pwchDisplay, &entry->value);

	      // set POS
	      const map<int, string>::const_iterator it = pos_map_.find(word.nPos1);
	      if (it == pos_map_.end()) {
	        LOG(ERROR) << "Unknown POS id: " << word.nPos1;
	        entry->Clear();
	        return true;
	      }
	      entry->pos = it->second;

	      // set comment; comments of any other type are ignored.
	      if (word.pvComment != NULL) {
	        if (word.uct == IFED_UCT_STRING_SJIS) {
	          Util::SJISToUTF8(
	              reinterpret_cast<const char *>(word.pvComment),
	              &entry->comment);
	        } else if (word.uct == IFED_UCT_STRING_UNICODE) {
	          Util::WideToUTF8(
	              reinterpret_cast<const wchar_t *>(word.pvComment),
	              &entry->comment);
	        }
	      }
	      return true;
	    }

	    // The buffer is drained.
	    if (result_ == S_OK) {
	      // The last batch was the final one.
	      return false;
	    }
	    if (result_ == IFED_S_MORE_ENTRIES) {
	      // Refill the buffer. |entry| stays empty for this call; the words of
	      // the new batch are returned by the following calls.
	      result_ = dic_->NextWords(reinterpret_cast<UCHAR *>(&buf_[0]),
	                                kBufferSize * sizeof(IMEWRD),
	                                &size_);
	      if (result_ == E_FAIL) {
	        LOG(ERROR) << "NextWords() failed";
	        return false;
	      }
	      index_ = 0;
	      return true;
	    }

	    return false;
	  }

	 private:
	  vector<IMEWRD> buf_;         // Batch of words filled by the dictionary.
	  ScopedIFEDictionary dic_;    // MS-IME dictionary; may wrap NULL.
	  map<int, string> pos_map_;   // POS id -> POS name in UTF-8.
	  HRESULT result_;             // Result of the last dictionary call.
	  ULONG size_;                 // Number of valid records in buf_.
	  ULONG index_;                // Index of the next record to return.
	};

	} // namespace
	#endif // OS_WIN && HAS_MSIME_HEADER

	// Imports the words of the MS-IME user dictionary into |user_dic|.
	// The import is compiled in only when both OS_WIN and HAS_MSIME_HEADER are
	// defined; every other build reports IMPORT_NOT_SUPPORTED.
	UserDictionaryImporter::ErrorType UserDictionaryImporter::ImportFromMSIME(
	    UserDictionary *user_dic) {
	  DCHECK(user_dic);
	#if defined(OS_WIN) && defined(HAS_MSIME_HEADER)
	  MSIMEImportIterator msime_iter;
	  return ImportFromIterator(&msime_iter, user_dic);
	#else // OS_WIN && HAS_MSIME_HEADER
	  return IMPORT_NOT_SUPPORTED;
	#endif // OS_WIN && HAS_MSIME_HEADER
	}

	// Appends every valid, not-yet-registered entry produced by |iter| to
	// |user_dic|.
	//
	// Returns:
	//   IMPORT_FATAL            if |iter| or |user_dic| is NULL.
	//   IMPORT_TOO_MANY_WORDS   as soon as |user_dic| reaches the maximum number
	//                           of entries; entries added so far are kept.
	//   IMPORT_INVALID_ENTRIES  if at least one entry could not be converted;
	//                           the remaining entries are still imported.
	//   IMPORT_NO_ERROR         otherwise.
	UserDictionaryImporter::ErrorType UserDictionaryImporter::ImportFromIterator(
	    InputIteratorInterface *iter, UserDictionary *user_dic) {
	  if (iter == NULL || user_dic == NULL) {
	    LOG(ERROR) << "iter or user_dic is NULL";
	    return IMPORT_FATAL;
	  }

	  const size_t max_size = UserDictionaryUtil::max_entry_size();

	  ErrorType ret = IMPORT_NO_ERROR;

	  // Fingerprints of the entries already stored in |user_dic|. Entries that
	  // are imported below are added as well, so duplicates inside the imported
	  // data are dropped too.
	  set<uint64> existent_entries;
	  for (size_t i = 0; i < user_dic->entries_size(); ++i) {
	    existent_entries.insert(EntryFingerprint(user_dic->entries(i)));
	  }

	  UserDictionary::Entry entry;
	  RawEntry raw_entry;
	  while (iter->Next(&raw_entry)) {
	    if (user_dic->entries_size() >= max_size) {
	      LOG(WARNING) << "Too many words in one dictionary";
	      return IMPORT_TOO_MANY_WORDS;
	    }

	    if (raw_entry.key.empty() &&
	        raw_entry.value.empty() &&
	        raw_entry.comment.empty()) {
	      // Empty entry is just skipped. It could be annoying if we show a
	      // warning dialog when these empty candidates exist.
	      continue;
	    }

	    if (!ConvertEntry(raw_entry, &entry)) {
	      LOG(WARNING) << "Entry is not valid";
	      ret = IMPORT_INVALID_ENTRIES;
	      continue;
	    }

	    // Don't register a word if it is already in the current dictionary.
	    if (!existent_entries.insert(EntryFingerprint(entry)).second) {
	      continue;
	    }

	    UserDictionary::Entry *new_entry = user_dic->add_entries();
	    DCHECK(new_entry);
	    new_entry->CopyFrom(entry);
	  }

	  return ret;
	}

	// Imports entries from the text lines produced by |iter| into |user_dic|,
	// interpreting them according to |ime_type|.
	// Returns IMPORT_NOT_SUPPORTED when no usable IME type could be determined;
	// otherwise returns the result of ImportFromIterator().
	UserDictionaryImporter::ErrorType
	UserDictionaryImporter::ImportFromTextLineIterator(
	    IMEType ime_type,
	    TextLineIteratorInterface *iter,
	    UserDictionary *user_dic) {
	  TextInputIterator entry_iter(ime_type, iter);
	  const bool is_supported = (entry_iter.ime_type() != NUM_IMES);
	  if (is_supported) {
	    return ImportFromIterator(&entry_iter, user_dic);
	  }
	  return IMPORT_NOT_SUPPORTED;
	}

	// Creates a line iterator over |data|; reading starts at offset 0.
	UserDictionaryImporter::StringTextLineIterator::StringTextLineIterator(
	    StringPiece data)
	    : data_(data, 0),
	      position_(0) {
	}

	// Nothing to release explicitly.
	UserDictionaryImporter::StringTextLineIterator::~StringTextLineIterator() {
	}

	// Returns true while the read position has not reached the end of the data.
	bool UserDictionaryImporter::StringTextLineIterator::IsAvailable() const {
	  const bool has_unread_data = position_ < data_.length();
	  return has_unread_data;
	}

	// Copies the next line, without its terminator, into |line| and moves the
	// read position past that terminator. LF, CR and CR LF are all treated as a
	// single line terminator. Data after the last terminator is returned as the
	// final line. Returns false when no data is left.
	bool UserDictionaryImporter::StringTextLineIterator::Next(string *line) {
	  if (!IsAvailable()) {
	    return false;
	  }

	  // Locate the first line terminator at or after the current position.
	  const size_t length = data_.length();
	  size_t eol = position_;
	  while (eol < length && data_[eol] != '\n' && data_[eol] != '\r') {
	    ++eol;
	  }

	  const StringPiece current_line = data_.substr(position_, eol - position_);
	  current_line.CopyToString(line);

	  if (eol == length) {
	    // No terminator was found: the rest of the data was the last line.
	    position_ = length;
	    return true;
	  }

	  // Handles CR/LF issue: a CR LF pair is consumed as one terminator.
	  const StringPiece crlf("\r\n");
	  const StringPiece terminator = data_.substr(eol, 2);
	  if (terminator.compare(crlf) == 0) {
	    position_ = eol + 2;
	  } else {
	    position_ = eol + 1;
	  }
	  return true;
	}

	// Moves the read position back to the beginning of the data.
	void UserDictionaryImporter::StringTextLineIterator::Reset() {
	  position_ = 0;
	}

	// Wraps |iter| and decides which IME format its lines are parsed as.
	// The first line is read to guess the format and |iter| is then reset.
	// The final type combines the requested |ime_type| with that guess.
	// When |iter| has no data, the type stays NUM_IMES.
	UserDictionaryImporter::TextInputIterator::TextInputIterator(
	    IMEType ime_type,
	    TextLineIteratorInterface *iter)
	    : ime_type_(NUM_IMES), iter_(iter) {
	  CHECK(iter_);
	  if (iter_->IsAvailable()) {
	    string first_line;
	    IMEType detected_type = NUM_IMES;
	    if (iter_->Next(&first_line)) {
	      detected_type = GuessIMEType(first_line);
	      // Rewind so that the first line is parsed again as a regular line.
	      iter_->Reset();
	    }

	    ime_type_ = DetermineFinalIMEType(ime_type, detected_type);

	    VLOG(1) << "Setting type to: " << static_cast<int>(ime_type_);
	  }
	}

	// Nothing to release explicitly.
	UserDictionaryImporter::TextInputIterator::~TextInputIterator() {
	}

	// Returns true when the underlying line iterator still has data and a
	// concrete IME type (neither IME_AUTO_DETECT nor NUM_IMES) has been chosen.
	bool UserDictionaryImporter::TextInputIterator::IsAvailable() const {
	  DCHECK(iter_);
	  if (!iter_->IsAvailable()) {
	    return false;
	  }
	  return ime_type_ != IME_AUTO_DETECT && ime_type_ != NUM_IMES;
	}

	// Parses the next usable line into |entry|.
	// Empty lines, comment lines and lines with fewer than three fields are
	// skipped. Returns true when |entry| was filled. Returns false when the
	// iterator is not available, |entry| is NULL, the input is exhausted, or
	// the IME type is not one of the known text formats.
	bool UserDictionaryImporter::TextInputIterator::Next(RawEntry *entry) {
	  DCHECK(iter_);
	  if (!IsAvailable()) {
	    LOG(ERROR) << "iterator is not available";
	    return false;
	  }

	  if (entry == NULL) {
	    LOG(ERROR) << "Entry is NULL";
	    return false;
	  }

	  entry->Clear();

	  string line;
	  while (iter_->Next(&line)) {
	    Util::ChopReturns(&line);
	    // Skip empty lines.
	    if (line.empty()) {
	      continue;
	    }

	    // Skip comment lines. The comment marker depends on the format.
	    // TODO(yukawa): Use string::front once C++11 is enabled on Mac.
	    const bool is_msime_or_atok = (ime_type_ == MSIME || ime_type_ == ATOK);
	    const bool is_comment =
	        (is_msime_or_atok && line[0] == '!') ||
	        (ime_type_ == MOZC && line[0] == '#') ||
	        (ime_type_ == KOTOERI && line.find("//") == 0);
	    if (is_comment) {
	      continue;
	    }

	    VLOG(2) << line;

	    vector<string> fields;
	    if (is_msime_or_atok || ime_type_ == MOZC) {
	      // Tab separated: key, value, POS and an optional comment.
	      Util::SplitStringAllowEmpty(line, "\t", &fields);
	      if (fields.size() < 3) {
	        continue;  // Ignore this line.
	      }
	      entry->key = fields[0];
	      entry->value = fields[1];
	      entry->pos = fields[2];
	      if (fields.size() >= 4) {
	        entry->comment = fields[3];
	      }
	      return true;
	    }

	    if (ime_type_ == KOTOERI) {
	      // CSV: key, value, POS.
	      Util::SplitCSV(line, &fields);
	      if (fields.size() < 3) {
	        continue;  // Ignore this line.
	      }
	      entry->key = fields[0];
	      entry->value = fields[1];
	      entry->pos = fields[2];
	      return true;
	    }

	    LOG(ERROR) << "Unknown format: " << static_cast<int>(ime_type_);
	    return false;
	  }

	  return false;
	}

	// Converts |from| into |to| using the kPOSMap table.
	// Returns the result of ConvertEntryInternal().
	bool UserDictionaryImporter::ConvertEntry(
	    const RawEntry &from, UserDictionary::Entry *to) {
	  const size_t pos_map_size = arraysize(kPOSMap);
	  return ConvertEntryInternal(kPOSMap, pos_map_size, from, to);
	}

	// Guesses the IME that exported a dictionary file from one |line| of it
	// (the caller in this file passes the first line).
	//
	// Checks, in this order (header comparisons are case-insensitive):
	//   - starts with "!Microsoft IME"            -> MSIME
	//   - starts with "!!DICUT" plus a version    -> ATOK if version >= 11,
	//                                                NUM_IMES otherwise
	//   - starts with "!!ATOK_TANGO_TEXT_HEADER"  -> ATOK
	//   - enclosed in double quotes, no tab       -> KOTOERI
	//   - starts with '#' or contains a tab       -> MOZC
	// Returns NUM_IMES when |line| is empty or nothing matches.
	UserDictionaryImporter::IMEType
	UserDictionaryImporter::GuessIMEType(StringPiece line) {
	  if (line.empty()) {
	    return NUM_IMES;
	  }

	  string lower = line.as_string();
	  Util::LowerString(&lower);

	  if (lower.find("!microsoft ime") == 0) {
	    return MSIME;
	  }

	  // Old ATOK format (!!DICUT10) is not supported for now
	  // http://b/2455897
	  if (lower.find("!!dicut") == 0 && lower.size() > 7) {
	    // The version number follows the 7-character "!!dicut" prefix.
	    const string version(lower, 7, lower.size() - 7);
	    if (NumberUtil::SimpleAtoi(version) >= 11) {
	      return ATOK;
	    } else {
	      return NUM_IMES;
	    }
	  }

	  if (lower.find("!!atok_tango_text_header") == 0) {
	    return ATOK;
	  }

	  // |line| is known to be non-empty here, so dereferencing begin() and
	  // rbegin() is safe. The iterators must be dereferenced: comparing the
	  // iterators themselves with a character is not what is intended.
	  if (*line.begin() == '"' && *line.rbegin() == '"' &&
	      line.find("\t") == string::npos) {
	    return KOTOERI;
	  }

	  if (*line.begin() == '#' || line.find("\t") != string::npos) {
	    return MOZC;
	  }

	  return NUM_IMES;
	}

	// Combines the IME type requested by the user with the type guessed from
	// the data. Returns NUM_IMES when the two cannot be reconciled.
	UserDictionaryImporter::IMEType UserDictionaryImporter::DetermineFinalIMEType(
	    IMEType user_ime_type, IMEType guessed_ime_type) {
	  if (user_ime_type == IME_AUTO_DETECT) {
	    // Trust guessed type.
	    return guessed_ime_type;
	  }

	  if (user_ime_type == MOZC) {
	    // MOZC is compatible with MS-IME and ATOK.
	    // Even if the auto detection failed, try to use Mozc format.
	    if (guessed_ime_type == KOTOERI) {
	      return NUM_IMES;
	    }
	    return user_ime_type;
	  }

	  // ATOK, MS-IME and Kotoeri can be detected with 100% accuracy, so the
	  // guess has to agree with what the user specified.
	  if (guessed_ime_type != user_ime_type) {
	    return NUM_IMES;
	  }
	  return user_ime_type;
	}

	// Guesses the text encoding of |str|.
	//
	// Returns UTF16 when |str| starts with a UTF-16 byte order mark and UTF8
	// when it starts with a UTF-8 byte order mark. Otherwise |str| is decoded
	// as UTF-8 and UTF8 is returned when enough of its bytes look like valid
	// UTF-8 and belong to whitespace or to characters with a known script type.
	// Everything else, including empty input, is reported as SHIFT_JIS.
	UserDictionaryImporter::EncodingType
	UserDictionaryImporter::GuessEncodingType(StringPiece str) {
	  // UTF-16 BOM (FF FE or FE FF).
	  if (str.size() >= 2 &&
	      ((static_cast<uint8>(str[0]) == 0xFF &&
	        static_cast<uint8>(str[1]) == 0xFE) ||
	       (static_cast<uint8>(str[0]) == 0xFE &&
	        static_cast<uint8>(str[1]) == 0xFF))) {
	    return UTF16;
	  }

	  // UTF-8 BOM.
	  if (str.size() >= 3 &&
	      static_cast<uint8>(str[0]) == 0xEF &&
	      static_cast<uint8>(str[1]) == 0xBB &&
	      static_cast<uint8>(str[2]) == 0xBF) {
	    return UTF8;
	  }

	  // Nothing to measure. Returning here keeps the ratios below from being
	  // computed as 0 / 0.
	  if (str.empty()) {
	    return SHIFT_JIS;
	  }

	  // Count valid UTF8 characters.
	  // TODO(taku): Improve the accuracy by making a DFA.
	  const char *begin = str.data();
	  const char *end = str.data() + str.size();
	  size_t valid_utf8 = 0;    // Bytes that look like part of valid UTF-8.
	  size_t valid_script = 0;  // Bytes of whitespace / known-script characters.
	  while (begin < end) {
	    size_t mblen = 0;
	    const char32 ucs4 = Util::UTF8ToUCS4(begin, end, &mblen);
	    if (mblen == 0) {
	      break;
	    }
	    // The leading byte is always counted.
	    ++valid_utf8;
	    for (size_t i = 1; i < mblen; ++i) {
	      // Compare as unsigned: where plain char is signed, a byte >= 0x80 is
	      // negative and "begin[i] >= 0x80" could never be true, so no
	      // continuation byte would be counted.
	      const uint8 trail = static_cast<uint8>(begin[i]);
	      if (trail >= 0x80 && trail <= 0xBF) {
	        ++valid_utf8;
	      }
	    }

	    // "\n\r\t " or Japanese code point
	    if (ucs4 == 0x000A || ucs4 == 0x000D ||
	        ucs4 == 0x0020 || ucs4 == 0x0009 ||
	        Util::GetScriptType(ucs4) != Util::UNKNOWN_SCRIPT) {
	      valid_script += mblen;
	    }

	    begin += mblen;
	  }

	  // TODO(taku): No theoretical justification for these parameters.
	  if (1.0 * valid_utf8 / str.size() >= 0.9 &&
	      1.0 * valid_script / str.size() >= 0.5) {
	    return UTF8;
	  }

	  return SHIFT_JIS;
	}

	// Guesses the encoding of the file |filename| from at most its first 1024
	// bytes. Returns NUM_ENCODINGS when the file cannot be opened.
	UserDictionaryImporter::EncodingType
	UserDictionaryImporter::GuessFileEncodingType(const string &filename) {
	  Mmap mapped_file;
	  if (!mapped_file.Open(filename.c_str(), "r")) {
	    LOG(ERROR) << "cannot open: " << filename;
	    return NUM_ENCODINGS;
	  }

	  // Only the head of the file is inspected.
	  const size_t kMaxCheckSize = 1024;
	  const size_t file_size = static_cast<size_t>(mapped_file.size());
	  const size_t check_size =
	      (file_size < kMaxCheckSize) ? file_size : kMaxCheckSize;
	  const char *head = static_cast<const char *>(mapped_file.begin());
	  return GuessEncodingType(StringPiece(head, check_size));
	}

	} // namespace mozc