| // Copyright 2010-2014, Google Inc. |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| #include "rewriter/emoticon_rewriter.h" |
| |
| #include <algorithm> |
| #include <cstring> |
| #include <string> |
| #include <vector> |
| |
| #include "base/logging.h" |
| #include "base/singleton.h" |
| #include "base/util.h" |
| #include "config/config.pb.h" |
| #include "config/config_handler.h" |
| #include "converter/conversion_request.h" |
| #include "converter/segments.h" |
| #include "rewriter/embedded_dictionary.h" |
| #include "rewriter/rewriter_interface.h" |
| #include "session/commands.pb.h" |
| |
| namespace mozc { |
| namespace { |
| |
| #include "rewriter/emoticon_rewriter_data.h" |
| |
| class EmoticonDictionary { |
| public: |
| EmoticonDictionary() |
| : dic_(new EmbeddedDictionary(kEmoticonData_token_data, |
| kEmoticonData_token_size)) {} |
| |
| ~EmoticonDictionary() {} |
| |
| EmbeddedDictionary *GetDictionary() const { |
| return dic_.get(); |
| } |
| |
| private: |
| scoped_ptr<EmbeddedDictionary> dic_; |
| }; |
| |
| class ValueCostCompare { |
| public: |
| bool operator() (const EmbeddedDictionary::Value *a, |
| const EmbeddedDictionary::Value *b) const { |
| return a->cost < b->cost; |
| } |
| }; |
| |
| class IsEqualValue { |
| public: |
| bool operator() (const EmbeddedDictionary::Value *a, |
| const EmbeddedDictionary::Value *b) const { |
| return strcmp(a->value, b->value) == 0; |
| } |
| }; |
| |
| // Insert Emoticon into the |segment| |
| // Top |initial_insert_size| candidates are inserted from |initial_insert_pos|. |
| // Remained candidates are added to the buttom. |
| void InsertCandidates(const EmbeddedDictionary::Value *value, |
| size_t value_size, |
| size_t initial_insert_pos, |
| size_t initial_insert_size, |
| bool is_no_learning, |
| Segment *segment) { |
| if (segment->candidates_size() == 0) { |
| LOG(WARNING) << "candiadtes_size is 0"; |
| return; |
| } |
| |
| const Segment::Candidate &base_candidate = segment->candidate(0); |
| size_t offset = min(initial_insert_pos, segment->candidates_size()); |
| |
| // Sort values by cost just in case |
| vector<const EmbeddedDictionary::Value *> sorted_value; |
| for (size_t i = 0; i < value_size; ++i) { |
| sorted_value.push_back(&value[i]); |
| } |
| |
| sort(sorted_value.begin(), sorted_value.end(), ValueCostCompare()); |
| |
| // after sorting the valeus by |cost|, adjacent candidates |
| // will have the same value. It is almost OK to use std::unique to |
| // remove dup entries, it is not a perfect way though. |
| sorted_value.erase(unique(sorted_value.begin(), |
| sorted_value.end(), |
| IsEqualValue()), |
| sorted_value.end()); |
| |
| for (size_t i = 0; i < sorted_value.size(); ++i) { |
| Segment::Candidate *c = NULL; |
| |
| if (i < initial_insert_size) { |
| c = segment->insert_candidate(offset); |
| ++offset; |
| } else { |
| c = segment->push_back_candidate(); |
| } |
| |
| if (c == NULL) { |
| LOG(ERROR) << "cannot insert candidate at " << offset; |
| continue; |
| } |
| |
| c->Init(); |
| // TODO(taku): set an appropriate POS here. |
| c->lid = sorted_value[i]->lid; |
| c->rid = sorted_value[i]->rid; |
| c->cost = base_candidate.cost; |
| c->value = sorted_value[i]->value; |
| c->content_value = sorted_value[i]->value; |
| c->key = base_candidate.key; |
| c->content_key = base_candidate.content_key; |
| // no full/half width normalizations |
| c->attributes |= Segment::Candidate::NO_VARIANTS_EXPANSION; |
| c->attributes |= Segment::Candidate::CONTEXT_SENSITIVE; |
| if (is_no_learning) { |
| c->attributes |= Segment::Candidate::NO_LEARNING; |
| } |
| |
| // "顔文字"; |
| const char kBaseEmoticonDescription[] |
| = "\xE9\xA1\x94\xE6\x96\x87\xE5\xAD\x97"; |
| |
| if (sorted_value[i]->description == NULL) { |
| c->description = kBaseEmoticonDescription; |
| } else { |
| string description = kBaseEmoticonDescription; |
| description.append(" "); |
| description.append(sorted_value[i]->description); |
| c->description = description; |
| } |
| } |
| } |
| |
| bool RewriteCandidate(Segments *segments) { |
| bool modified = false; |
| for (size_t i = 0; i < segments->conversion_segments_size(); ++i) { |
| const string &key = segments->conversion_segment(i).key(); |
| if (key.empty()) { |
| LOG(ERROR) << "Key is empty"; |
| continue; |
| } |
| bool is_no_learning = false; |
| const EmbeddedDictionary::Value *value = NULL; |
| size_t value_size = 0; |
| size_t initial_insert_size = 0; |
| size_t initial_insert_pos = 0; |
| |
| // TODO(taku): Emoticon dictionary does not always include "facemark". |
| // Displaying non-facemarks with "かおもじ" is not always correct. |
| // We have to distinguish pure facemarks and other symbol marks. |
| |
| // "かおもじ" |
| if (key == "\xE3\x81\x8B\xE3\x81\x8A\xE3\x82\x82\xE3\x81\x98") { |
| // When key is "かおもじ", default candidate size should be small enough. |
| // It is safe to expand all candidates at this time. |
| const EmbeddedDictionary::Token *token |
| = Singleton<EmoticonDictionary>::get()->GetDictionary()->AllToken(); |
| CHECK(token); |
| // set large value(100) so that all candidates are pushed to the bottom |
| value = token->value; |
| value_size = token->value_size; |
| initial_insert_pos = 100; |
| initial_insert_size = token->value_size; |
| // "かお" |
| } else if (key == "\xE3\x81\x8B\xE3\x81\x8A") { |
| // When key is "かお", expand all candidates in conservative way. |
| const EmbeddedDictionary::Token *token |
| = Singleton<EmoticonDictionary>::get()->GetDictionary()->AllToken(); |
| CHECK(token); |
| // first 6 candidates are inserted at 4 th position. |
| // Other candidates are pushed to the buttom. |
| value = token->value; |
| value_size = token->value_size; |
| initial_insert_pos = 4; |
| initial_insert_size = 6; |
| } else if (key == "\xE3\x81\xB5\xE3\x81\x8F\xE3\x82\x8F" |
| "\xE3\x82\x89\xE3\x81\x84") { // "ふくわらい" |
| // Choose one emoticon randomly from the dictionary. |
| // TODO(taku): want to make it "generate" more funny emoticon. |
| const EmbeddedDictionary::Token *token |
| = Singleton<EmoticonDictionary>::get()->GetDictionary()->AllToken(); |
| CHECK(token); |
| uint32 n = 0; |
| // use secure random not to predict the next emoticon. |
| Util::GetRandomSequence(reinterpret_cast<char *>(&n), sizeof(n)); |
| value = token->value + n % token->value_size; |
| value_size = 1; |
| initial_insert_pos = 4; |
| initial_insert_size = 1; |
| is_no_learning = true; // do not learn this candidate. |
| } else { |
| const EmbeddedDictionary::Token *token |
| = Singleton<EmoticonDictionary>::get()->GetDictionary()->Lookup(key); |
| // by default, insert canidate at 7 th position. |
| if (token != NULL) { |
| value = token->value; |
| value_size = token->value_size; |
| initial_insert_pos = 6; |
| initial_insert_size = token == NULL ? 0 : token->value_size; |
| } |
| } |
| |
| if (value == NULL || value_size == 0) { |
| continue; |
| } |
| |
| InsertCandidates(value, value_size, |
| initial_insert_pos, |
| initial_insert_size, |
| is_no_learning, |
| segments->mutable_conversion_segment(i)); |
| modified = true; |
| } |
| |
| return modified; |
| } |
| } // namespace |
| |
| EmoticonRewriter::EmoticonRewriter() {} |
| |
| EmoticonRewriter::~EmoticonRewriter() {} |
| |
| int EmoticonRewriter::capability(const ConversionRequest &request) const { |
| if (request.request().mixed_conversion()) { |
| return RewriterInterface::ALL; |
| } |
| return RewriterInterface::CONVERSION; |
| } |
| |
| bool EmoticonRewriter::Rewrite(const ConversionRequest &request, |
| Segments *segments) const { |
| if (!GET_CONFIG(use_emoticon_conversion)) { |
| VLOG(2) << "no use_emoticon_conversion"; |
| return false; |
| } |
| return RewriteCandidate(segments); |
| } |
| } // namespace mozc |