| // Copyright 2010-2015, Google Inc. |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| // NOTE(tabata): This code is used mainly by louds trie builder to build |
| // dictionary. Please check error handling, if you want to include |
| // this to run within client. |
| |
| #include "dictionary/text_dictionary_loader.h" |
| |
| #include <algorithm> |
| #include <limits> |
| #include <string> |
| #include <vector> |
| |
| #include "base/file_stream.h" |
| #include "base/flags.h" |
| #include "base/iterator_adapter.h" |
| #include "base/logging.h" |
| #include "base/multifile.h" |
| #include "base/number_util.h" |
| #include "base/stl_util.h" |
| #include "base/string_piece.h" |
| #include "base/util.h" |
| #include "dictionary/dictionary_token.h" |
| #include "dictionary/pos_matcher.h" |
| |
| DEFINE_int32(tokens_reserve_size, 1400000, |
| "Reserve the specified size of token buffer in advance."); |
| |
| namespace mozc { |
| namespace dictionary { |
| namespace { |
| |
| // Functor to sort a sequence of Tokens first by value and then by key. |
| struct OrderByValueThenByKey { |
| bool operator()(const Token *l, const Token *r) const { |
| const int comp = l->value.compare(r->value); |
| return comp == 0 ? (l->key < r->key) : (comp < 0); |
| } |
| }; |
| |
| // Provides a view of Token iterator as a pair of value and key. Used to look |
| // up tokens from a sorted range of Token pointers using value and key. |
| struct AsValueAndKey : public AdapterBase<pair<StringPiece, StringPiece> > { |
| value_type operator()(vector<Token *>::const_iterator iter) const { |
| const Token *token = *iter; |
| return pair<StringPiece, StringPiece>(token->value, token->key); |
| } |
| }; |
| |
| // Provides a view of Token iterator as a value string. Used to look up tokens |
| // from a sorted range of Tokens using value. |
| struct AsValue : public AdapterBase<StringPiece> { |
| value_type operator()(vector<Token *>::const_iterator iter) const { |
| return StringPiece((*iter)->value); |
| } |
| }; |
| |
| // Parses one line of reading correction file. Since the result is returned as |
| // StringPieces, |line| needs to outlive |value_key|. |
| void ParseReadingCorrectionTSV(const string &line, |
| pair<StringPiece, StringPiece> *value_key) { |
| // Format: value\terror\tcorrect |
| SplitIterator<SingleDelimiter> iter(line, "\t"); |
| CHECK(!iter.Done()); |
| value_key->first = iter.Get(); |
| iter.Next(); |
| CHECK(!iter.Done()); |
| value_key->second = iter.Get(); |
| } |
| |
| // Helper function to parse an integer from a string. |
| inline bool SafeStrToInt(StringPiece s, int *n) { |
| uint32 u32 = 0; |
| const bool ret = NumberUtil::SafeStrToUInt32(s, &u32); |
| *n = u32; |
| return ret; |
| } |
| |
| // Helper functions to get const iterators. |
| inline vector<Token *>::const_iterator CBegin(const vector<Token *> &tokens) { |
| return tokens.begin(); |
| } |
| |
| inline vector<Token *>::const_iterator CEnd(const vector<Token *> &tokens) { |
| return tokens.end(); |
| } |
| |
| } // namespace |
| |
| TextDictionaryLoader::TextDictionaryLoader(const POSMatcher &pos_matcher) |
| : pos_matcher_(&pos_matcher) { |
| } |
| |
| TextDictionaryLoader::~TextDictionaryLoader() { |
| Clear(); |
| } |
| |
| bool TextDictionaryLoader::RewriteSpecialToken(Token *token, |
| StringPiece label) const { |
| CHECK(token); |
| if (label.empty()) { |
| return true; |
| } |
| if (Util::StartsWith(label, "SPELLING_CORRECTION")) { |
| token->attributes = Token::SPELLING_CORRECTION; |
| return true; |
| } |
| if (Util::StartsWith(label, "ZIP_CODE")) { |
| token->lid = pos_matcher_->GetZipcodeId(); |
| token->rid = pos_matcher_->GetZipcodeId(); |
| return true; |
| } |
| if (Util::StartsWith(label, "ENGLISH")) { |
| // TODO(noriyukit): Might be better to use special POS for english words. |
| token->lid = pos_matcher_->GetIsolatedWordId(); |
| token->rid = pos_matcher_->GetIsolatedWordId(); |
| return true; |
| } |
| LOG(ERROR) << "Unknown special label: " << label; |
| return false; |
| } |
| |
| void TextDictionaryLoader::Load(const string &dictionary_filename, |
| const string &reading_correction_filename) { |
| LoadWithLineLimit(dictionary_filename, reading_correction_filename, -1); |
| } |
| |
| void TextDictionaryLoader::LoadWithLineLimit( |
| const string &dictionary_filename, |
| const string &reading_correction_filename, |
| int limit) { |
| Clear(); |
| |
| // Roughly allocate buffers for Token pointers. |
| if (limit < 0) { |
| tokens_.reserve(FLAGS_tokens_reserve_size); |
| limit = numeric_limits<int>::max(); |
| } else { |
| tokens_.reserve(limit); |
| } |
| |
| // Read system dictionary. |
| { |
| InputMultiFile file(dictionary_filename); |
| string line; |
| while (limit > 0 && file.ReadLine(&line)) { |
| Util::ChopReturns(&line); |
| Token *token = ParseTSVLine(line); |
| if (token) { |
| tokens_.push_back(token); |
| --limit; |
| } |
| } |
| LOG(INFO) << tokens_.size() << " tokens from " << dictionary_filename; |
| } |
| |
| if (reading_correction_filename.empty() || limit <= 0) { |
| return; |
| } |
| |
| // Prepare for loading reading corrections. We sort |tokens_| first by value |
| // and then by key so that we can perform the following operations both in |
| // O(log(N)), where N is the size of tokens. |
| // 1. Checking the existence of any key-value pairs: This can be done by |
| // binary-searching for a pair of value and key. |
| // 2. Accessing all the tokens that have the same value: Since tokens are |
| // also sorted in order of value, this can be done by finding a range of |
| // tokens that have the same value. |
| sort(tokens_.begin(), tokens_.end(), OrderByValueThenByKey()); |
| |
| vector<Token *> reading_correction_tokens; |
| LoadReadingCorrectionTokens(reading_correction_filename, tokens_, |
| &limit, &reading_correction_tokens); |
| const size_t tokens_size = tokens_.size(); |
| tokens_.resize(tokens_size + reading_correction_tokens.size()); |
| for (size_t i = 0; i < reading_correction_tokens.size(); ++i) { |
| // |tokens_| takes the ownership of each allocated token. |
| tokens_[tokens_size + i] = reading_correction_tokens[i]; |
| } |
| } |
| |
| // Loads reading correction data into |tokens|. The second argument is used to |
| // determine costs of reading correction tokens and must be sorted by |
| // OrderByValueThenByKey(). The output tokens are newly allocated and the |
| // caller is responsible to delete them. |
| void TextDictionaryLoader::LoadReadingCorrectionTokens( |
| const string &reading_correction_filename, |
| const vector<Token *> &ref_sorted_tokens, |
| int *limit, |
| vector<Token *> *tokens) { |
| // Load reading correction entries. |
| int reading_correction_size = 0; |
| |
| InputMultiFile file(reading_correction_filename); |
| string line; |
| while (file.ReadLine(&line)) { |
| if (line.empty() || line[0] == '#') { |
| continue; |
| } |
| |
| // Parse TSV line in a pair of value and key (Note: first element is value |
| // and the second key). |
| Util::ChopReturns(&line); |
| pair<StringPiece, StringPiece> value_key; |
| ParseReadingCorrectionTSV(line, &value_key); |
| |
| // Filter the entry if this key value pair already exists in the system |
| // dictionary. |
| if (binary_search( |
| MakeIteratorAdapter(ref_sorted_tokens.begin(), AsValueAndKey()), |
| MakeIteratorAdapter(ref_sorted_tokens.end(), AsValueAndKey()), |
| value_key)) { |
| VLOG(1) << "System dictionary has the same key-value: " << line; |
| continue; |
| } |
| |
| // Since reading correction entries lack POS and cost, we recover those |
| // fields from a token in the system dictionary that has the same value. |
| // Since multple tokens may have the same value, from such tokens, we |
| // select the one that has the maximum cost. |
| typedef vector<Token *>::const_iterator TokenIterator; |
| typedef IteratorAdapter<TokenIterator, AsValue> AsValueIterator; |
| typedef pair<AsValueIterator, AsValueIterator> Range; |
| Range range = equal_range( |
| MakeIteratorAdapter(CBegin(ref_sorted_tokens), AsValue()), |
| MakeIteratorAdapter(CEnd(ref_sorted_tokens), AsValue()), |
| value_key.first); |
| TokenIterator begin = range.first.base(), end = range.second.base(); |
| if (begin == end) { |
| VLOG(1) << "Cannot find the value in system dicitonary - ignored:" |
| << line; |
| continue; |
| } |
| // Now [begin, end) contains all the tokens that have the same value as |
| // this reading correction entry. Next, find the token that has the |
| // maximum cost in [begin, end). Note that linear search is sufficiently |
| // fast here because the size of the range is small. |
| const Token *max_cost_token = *begin; |
| for (++begin; begin != end; ++begin) { |
| if ((*begin)->cost > max_cost_token->cost) { |
| max_cost_token = *begin; |
| } |
| } |
| |
| // The cost is calculated as -log(prob) * 500. |
| // We here assume that the wrong reading appear with 1/100 probability |
| // of the original (correct) reading. |
| const int kCostPenalty = 2302; // -log(1/100) * 500; |
| scoped_ptr<Token> token(new Token); |
| value_key.second.CopyToString(&token->key); |
| token->value = max_cost_token->value; |
| token->lid = max_cost_token->lid; |
| token->rid = max_cost_token->rid; |
| token->cost = max_cost_token->cost + kCostPenalty; |
| // We don't set SPELLING_CORRECTION. The entries in reading_correction |
| // data are also stored in rewriter/correction_rewriter.cc. |
| // reading_correction_rewriter annotates the spelling correction |
| // notations. |
| token->attributes = Token::NONE; |
| tokens->push_back(token.release()); |
| ++reading_correction_size; |
| if (--*limit <= 0) { |
| break; |
| } |
| } |
| LOG(INFO) << reading_correction_size << " tokens from " |
| << reading_correction_filename; |
| } |
| |
| void TextDictionaryLoader::Clear() { |
| STLDeleteElements(&tokens_); |
| } |
| |
| void TextDictionaryLoader::CollectTokens(vector<Token *> *res) const { |
| DCHECK(res); |
| res->reserve(res->size() + tokens_.size()); |
| res->insert(res->end(), tokens_.begin(), tokens_.end()); |
| } |
| |
| Token *TextDictionaryLoader::ParseTSVLine(StringPiece line) const { |
| vector<StringPiece> columns; |
| Util::SplitStringUsing(line, "\t", &columns); |
| return ParseTSV(columns); |
| } |
| |
| Token *TextDictionaryLoader::ParseTSV( |
| const vector<StringPiece> &columns) const { |
| CHECK_LE(5, columns.size()) << "Lack of columns: " << columns.size(); |
| |
| scoped_ptr<Token> token(new Token); |
| |
| // Parse key, lid, rid, cost, value. |
| Util::NormalizeVoicedSoundMark(columns[0], &token->key); |
| CHECK(SafeStrToInt(columns[1], &token->lid)) |
| << "Wrong lid: " << columns[1]; |
| CHECK(SafeStrToInt(columns[2], &token->rid)) |
| << "Wrong rid: " << columns[2]; |
| CHECK(SafeStrToInt(columns[3], &token->cost)) |
| << "Wrong cost: " << columns[3]; |
| Util::NormalizeVoicedSoundMark(columns[4], &token->value); |
| |
| // Optionally, label (SPELLING_CORRECTION, ZIP_CODE, etc.) may be provided in |
| // column 6. |
| if (columns.size() > 5) { |
| CHECK(RewriteSpecialToken(token.get(), columns[5])) |
| << "Invalid label: " << columns[5]; |
| } |
| return token.release(); |
| } |
| |
| } // namespace dictionary |
| } // namespace mozc |