blob: d3cbdc96047f61673a989f0c9bac22a1080623e8 [file] [log] [blame]
// Copyright 2010-2015, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "converter/lattice.h"
#include <sstream>
#include <string>
#include <vector>
#include "base/logging.h"
#include "base/port.h"
#include "base/singleton.h"
#include "base/util.h"
#include "converter/node.h"
#include "converter/node_allocator.h"
namespace mozc {
namespace {
Node *InitBOSNode(Lattice *lattice, uint16 length) {
Node *bos_node = lattice->NewNode();
DCHECK(bos_node);
bos_node->rid = 0; // 0 is reserved for EOS/BOS
bos_node->lid = 0;
bos_node->key.clear();
bos_node->value = "BOS";
bos_node->node_type = Node::BOS_NODE;
bos_node->wcost = 0;
bos_node->cost = 0;
bos_node->begin_pos = length;
bos_node->end_pos = length;
bos_node->enext = NULL;
return bos_node;
}
Node *InitEOSNode(Lattice *lattice, uint16 length) {
Node *eos_node = lattice->NewNode();
DCHECK(eos_node);
eos_node->rid = 0; // 0 is reserved for EOS/BOS
eos_node->lid = 0;
eos_node->key.clear();
eos_node->value = "EOS";
eos_node->node_type = Node::EOS_NODE;
eos_node->wcost = 0;
eos_node->cost = 0;
eos_node->begin_pos = length;
eos_node->end_pos = length;
eos_node->bnext = NULL;
return eos_node;
}
bool PathContainsString(const Node *node, size_t begin_pos, size_t end_pos,
const string &str) {
CHECK(node);
for (; node->prev != NULL; node = node->prev) {
if (node->begin_pos == begin_pos && node->end_pos == end_pos &&
node->value == str) {
return true;
}
}
return false;
}
string GetDebugStringForNode(const Node *node, const Node *prev_node) {
CHECK(node);
stringstream os;
os << "[con:" << node->cost - (prev_node ? prev_node->cost : 0) -
node->wcost << "]";
os << "[lid:" << node->lid << "]";
os << "\"" << node->value << "\"";
os << "[wcost:" << node->wcost << "]";
os << "[cost:" << node->cost << "]";
os << "[rid:" << node->rid << "]";
return os.str();
}
string GetDebugStringForPath(const Node *end_node) {
CHECK(end_node);
stringstream os;
vector<const Node *> node_vector;
for (const Node *node = end_node; node; node = node->prev) {
node_vector.push_back(node);
}
const Node *prev_node = NULL;
for (int i = static_cast<int>(node_vector.size()) - 1; i >= 0; --i) {
const Node *node = node_vector[i];
os << GetDebugStringForNode(node, prev_node);
prev_node = node;
}
return os.str();
}
string GetCommonPrefix(const string &str1, const string &str2) {
vector<string> split1, split2;
Util::SplitStringToUtf8Chars(str1, &split1);
Util::SplitStringToUtf8Chars(str2, &split2);
string common_prefix = "";
for (int i = 0; i < min(split1.size(), split2.size()); ++i) {
if (split1[i] == split2[i]) {
common_prefix += split1[i];
} else {
break;
}
}
return common_prefix;
}
} // namespace
struct LatticeDisplayNodeInfo {
size_t display_node_begin_pos_;
size_t display_node_end_pos_;
string display_node_str_;
};
Lattice::Lattice() : history_end_pos_(0), node_allocator_(new NodeAllocator) {}
Lattice::~Lattice() {}
NodeAllocator *Lattice::node_allocator() const {
return node_allocator_.get();
}
Node *Lattice::NewNode() {
return node_allocator_->NewNode();
}
Node *Lattice::begin_nodes(size_t pos) const {
return begin_nodes_[pos];
}
Node *Lattice::end_nodes(size_t pos) const {
return end_nodes_[pos];
}
void Lattice::SetKey(StringPiece key) {
Clear();
key.CopyToString(&key_);
const size_t size = key.size();
begin_nodes_.resize(size + 4);
end_nodes_.resize(size + 4);
cache_info_.resize(size + 4);
fill(begin_nodes_.begin(), begin_nodes_.end(),
static_cast<Node *>(NULL));
fill(end_nodes_.begin(), end_nodes_.end(),
static_cast<Node *>(NULL));
fill(cache_info_.begin(), cache_info_.end(), 0);
end_nodes_[0] = InitBOSNode(this,
static_cast<uint16>(0));
begin_nodes_[key_.size()] =
InitEOSNode(this, static_cast<uint16>(key_.size()));
}
Node *Lattice::bos_nodes() const {
return end_nodes_[0];
}
Node *Lattice::eos_nodes() const {
return begin_nodes_[key_.size()];
}
void Lattice::Insert(size_t pos, Node *node) {
for (Node *rnode = node; rnode != NULL; rnode = rnode->bnext) {
const size_t end_pos = min(rnode->key.size() + pos, key_.size());
rnode->begin_pos = static_cast<uint16>(pos);
rnode->end_pos = static_cast<uint16>(end_pos);
rnode->prev = NULL;
rnode->next = NULL;
rnode->cost = 0;
rnode->enext = end_nodes_[end_pos];
end_nodes_[end_pos] = rnode;
}
if (begin_nodes_[pos] == NULL) {
begin_nodes_[pos] = node;
} else {
for (Node *rnode = node; rnode != NULL; rnode = rnode->bnext) {
if (rnode->bnext == NULL) {
rnode->bnext = begin_nodes_[pos];
begin_nodes_[pos] = node;
break;
}
}
}
}
const string &Lattice::key() const {
return key_;
}
bool Lattice::has_lattice() const {
return !begin_nodes_.empty();
}
void Lattice::Clear() {
key_.clear();
begin_nodes_.clear();
end_nodes_.clear();
node_allocator_->Free();
cache_info_.clear();
history_end_pos_ = 0;
}
void Lattice::SetDebugDisplayNode(size_t begin_pos, size_t end_pos,
const string &str) {
LatticeDisplayNodeInfo *info = Singleton<LatticeDisplayNodeInfo>::get();
info->display_node_begin_pos_ = begin_pos;
info->display_node_end_pos_ = end_pos;
info->display_node_str_ = str;
}
void Lattice::ResetDebugDisplayNode() {
LatticeDisplayNodeInfo *info = Singleton<LatticeDisplayNodeInfo>::get();
info->display_node_str_.clear();
}
void Lattice::set_history_end_pos(size_t pos) {
history_end_pos_ = pos;
}
size_t Lattice::history_end_pos() const {
return history_end_pos_;
}
void Lattice::UpdateKey(const string &new_key) {
const string old_key = key_;
const string common_prefix = GetCommonPrefix(new_key, old_key);
// if the length of common prefix is too short, call SetKey
if (common_prefix.size() <= old_key.size() / 2) {
SetKey(new_key);
return;
}
// if node_allocator has many nodes, then clean up
const size_t size_threshold = node_allocator_->max_nodes_size();
if (node_allocator_->node_count() > size_threshold) {
SetKey(new_key);
return;
}
// erase the suffix of old_key so that the key becomes common_prefix
ShrinkKey(common_prefix.size());
// add a suffix so that the key becomes new_key
AddSuffix(new_key.substr(common_prefix.size()));
}
void Lattice::AddSuffix(const string &suffix_key) {
if (suffix_key.empty()) {
return;
}
const size_t old_size = key_.size();
const size_t new_size = key_.size() + suffix_key.size();
// update begin_nodes and end_nodes
begin_nodes_.resize(new_size + 4);
end_nodes_.resize(new_size + 4);
fill(begin_nodes_.begin() + old_size, begin_nodes_.end(),
static_cast<Node *>(NULL));
fill(end_nodes_.begin() + old_size + 1, end_nodes_.end(),
static_cast<Node *>(NULL));
end_nodes_[0] = InitBOSNode(this,
static_cast<uint16>(0));
begin_nodes_[new_size] =
InitEOSNode(this, static_cast<uint16>(new_size));
// update cache_info
cache_info_.resize(new_size + 4, 0);
// update key
key_ += suffix_key;
}
void Lattice::ShrinkKey(const size_t new_len) {
const size_t old_len = key_.size();
CHECK_LE(new_len, old_len);
if (new_len == old_len) {
return;
}
// erase nodes whose end position exceeds new_len
for (size_t i = 0; i < new_len; ++i) {
Node *begin = begin_nodes_[i];
if (begin == NULL) {
continue;
}
for (Node *prev = begin, *curr = begin->bnext; curr != NULL; ) {
CHECK(prev);
if (curr->end_pos > new_len) {
prev->bnext = curr->bnext;
curr = curr->bnext;
} else {
prev = prev->bnext;
curr = curr->bnext;
}
}
if (begin->end_pos > new_len) {
begin_nodes_[i] = begin->bnext;
}
}
// update begin_nodes and end_nodes
for (size_t i = new_len; i <= old_len; ++i) {
begin_nodes_[i] = NULL;
}
for (size_t i = new_len + 1; i <= old_len; ++i) {
end_nodes_[i] = NULL;
}
begin_nodes_[new_len] =
InitEOSNode(this, static_cast<uint16>(new_len));
// update cache_info
for (size_t i = 0; i < new_len; ++i) {
cache_info_[i] = min(cache_info_[i], new_len - i);
}
fill(cache_info_.begin() + new_len, cache_info_.end(), 0);
// update key
key_.erase(new_len);
}
size_t Lattice::cache_info(const size_t pos) const {
CHECK_LE(pos, key_.size());
return cache_info_[pos];
}
void Lattice::SetCacheInfo(const size_t pos, const size_t len) {
CHECK_LE(pos, key_.size());
cache_info_[pos] = len;
}
void Lattice::ResetNodeCost() {
for (size_t i = 0; i <= key_.size(); ++i) {
if (begin_nodes_[i] != NULL) {
Node *prev = NULL;
for (Node *node = begin_nodes_[i]; node != NULL; node = node->bnext) {
// do not process BOS / EOS nodes
if (node->node_type == Node::BOS_NODE ||
node->node_type == Node::EOS_NODE) {
continue;
}
// if the node has ENABLE_CACHE attribute, then revert its wcost.
// Otherwise, erase the node from the lattice.
if (node->attributes & Node::ENABLE_CACHE) {
node->wcost = node->raw_wcost;
} else {
if (node == begin_nodes_[i]) {
if (node->bnext == NULL) {
begin_nodes_[i] = NULL;
} else {
begin_nodes_[i] = node->bnext;
}
} else {
CHECK(prev);
CHECK_EQ(prev->bnext, node);
prev->next = node->bnext;
}
}
// traverse a next node
prev = node;
}
}
if (end_nodes_[i] != NULL) {
Node *prev = NULL;
for (Node *node = end_nodes_[i]; node != NULL; node = node->enext) {
if (node->node_type == Node::BOS_NODE ||
node->node_type == Node::EOS_NODE) {
continue;
}
if (node->attributes & Node::ENABLE_CACHE) {
node->wcost = node->raw_wcost;
} else {
if (node == end_nodes_[i]) {
if (node->enext == NULL) {
end_nodes_[i] = NULL;
} else {
end_nodes_[i] = node->enext;
}
} else {
CHECK(prev);
CHECK_EQ(prev->enext, node);
prev->next = node->enext;
}
}
prev = node;
}
}
}
}
string Lattice::DebugString() const {
stringstream os;
if (!has_lattice()) {
return "";
}
vector<const Node *> best_path_nodes;
const Node *node = eos_nodes();
// Print the best path
os << "Best path: ";
os << GetDebugStringForPath(node);
os << endl;
LatticeDisplayNodeInfo *info = Singleton<LatticeDisplayNodeInfo>::get();
if (info->display_node_str_.empty()) {
return os.str();
}
for (; node != NULL; node = node->prev) {
best_path_nodes.push_back(node);
}
// Print tha path that contains the designated node
for (vector<const Node *>::const_iterator it = best_path_nodes.begin();
it != best_path_nodes.end(); ++it) {
const Node *best_path_node = *it;
if (best_path_node->begin_pos < info->display_node_end_pos_) {
break;
}
for (const Node *prev_node = end_nodes(best_path_node->begin_pos);
prev_node; prev_node = prev_node->enext) {
if (!PathContainsString(prev_node,
info->display_node_begin_pos_,
info->display_node_end_pos_,
info->display_node_str_)) {
continue;
}
os << "The path " << GetDebugStringForPath(prev_node) <<
" ( + connection cost + wcost: " << best_path_node->wcost << ")"
<< endl << "was defeated"
<< " by the path " << endl
<< GetDebugStringForPath(best_path_node->prev)
<< " connecting to the node "
<< GetDebugStringForNode(best_path_node, best_path_node->prev)
<< endl;
}
}
return os.str();
}
} // namespace mozc