// blob: 95c1bd7087e512a12a3bd06661667ace0c944584 [file] [log] [blame]
// Copyright 2010-2015, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "data_manager/packed/packed_data_manager.h"
#include <memory>
#include "base/logging.h"
#include "base/mmap.h"
#include "base/protobuf/coded_stream.h"
#include "base/protobuf/gzip_stream.h"
#include "base/protobuf/zero_copy_stream_impl.h"
#include "converter/boundary_struct.h"
#include "data_manager/data_manager_interface.h"
#include "data_manager/packed/system_dictionary_data.pb.h"
#include "data_manager/packed/system_dictionary_format_version.h"
#include "dictionary/pos_matcher.h"
#include "dictionary/suffix_dictionary_token.h"
#include "rewriter/correction_rewriter.h"
#include "rewriter/counter_suffix.h"
#include "rewriter/embedded_dictionary.h"
#ifndef NO_USAGE_REWRITER
#include "rewriter/usage_rewriter_data_structs.h"
#endif // NO_USAGE_REWRITER
// Command-line flag read by PackedDataManager::GetUserPosManager(), which
// passes its value to Mmap::Open() as the path of the data to load.
DEFINE_string(dataset, "", "The dataset tag of the POS data.");
using std::unique_ptr;
namespace mozc {
namespace packed {
namespace {
// The protobuf library's default total bytes limit is 64MB.  Our big
// dictionary is about 50MB, so the default is sufficient and is passed
// through unchanged.
const size_t kDefaultTotalBytesLimit = 64 * 1024 * 1024;
// POSMatcher subclass that simply forwards the given rule id table and range
// tables to the POSMatcher constructor.  The tables are not copied here, so
// the caller keeps ownership of them.
class PackedPOSMatcher : public POSMatcher {
 public:
  PackedPOSMatcher(const uint16 *const rule_id_table,
                   const Range *const *const range_table)
      : POSMatcher(rule_id_table, range_table) {}
};
// File-local owner of the registered PackedDataManager.  It is replaced by
// RegisterPackedDataManager() and read by GetPackedDataManager() and
// PackedDataManager::GetUserPosManager().
unique_ptr<PackedDataManager> g_data_manager;
} // namespace
// Implementation of PackedDataManager.  It parses a SystemDictionaryData
// protobuf and converts its contents into the plain arrays and byte ranges
// handed out by the Get*() accessors.  Many of those arrays hold raw pointers
// into strings owned by |system_dictionary_data_|.
class PackedDataManager::Impl {
public:
Impl();
~Impl();
// Parses an uncompressed serialized SystemDictionaryData and builds the
// tables.  Returns false on parse error or format version mismatch.
bool Init(const string &system_dictionary_data);
// Same as Init() but the input is read through a GzipInputStream.
bool InitWithZippedData(const string &zipped_system_dictionary_data);
// Returns product_version() of the parsed protobuf.
string GetDictionaryVersion();
const UserPOS::POSToken *GetUserPOSData() const;
const POSMatcher *GetPOSMatcher() const;
const uint8 *GetPosGroupData() const;
void GetConnectorData(const char **data, size_t *size) const;
void GetSegmenterData(
size_t *l_num_elements, size_t *r_num_elements,
const uint16 **l_table, const uint16 **r_table,
size_t *bitarray_num_bytes, const char **bitarray_data,
const BoundaryData **boundary_data) const;
void GetSystemDictionaryData(const char **data, int *size) const;
void GetSuffixDictionaryData(const SuffixToken **data,
size_t *size) const;
void GetReadingCorrectionData(const ReadingCorrectionItem **array,
size_t *size) const;
void GetCollocationData(const char **array, size_t *size) const;
void GetCollocationSuppressionData(const char **array,
size_t *size) const;
void GetSuggestionFilterData(const char **data, size_t *size) const;
void GetSymbolRewriterData(const EmbeddedDictionary::Token **data,
size_t *size) const;
#ifndef NO_USAGE_REWRITER
void GetUsageRewriterData(
const ConjugationSuffix **base_conjugation_suffix,
const ConjugationSuffix **conjugation_suffix_data,
const int **conjugation_suffix_data_index,
const UsageDictItem **usage_data_value) const;
#endif // NO_USAGE_REWRITER
const uint16 *GetRuleIdTableForTest() const;
const void *GetRangeTablesForTest() const;
void GetCounterSuffixSortedArray(const CounterSuffixEntry **array,
size_t *size) const;
private:
// Non-const struct of POSMatcher::Range
// Elements are written field by field during initialization and then exposed
// through a reinterpret_cast to POSMatcher::Range *.
struct Range {
uint16 lower;
uint16 upper;
};
// Shared second half of Init() and InitWithZippedData(): checks the format
// version and fills every member below from |system_dictionary_data_|.
bool InitializeWithSystemDictionaryData();
// One entry per pos_tokens element of the protobuf.
unique_ptr<UserPOS::POSToken[]> pos_token_;
// Backing storage for the conjugation_form pointers stored in |pos_token_|.
unique_ptr<UserPOS::ConjugationType[]> conjugation_array_;
// Copy of pos_matcher_data().rule_id_table.
unique_ptr<uint16[]> rule_id_table_;
// One pointer per range table, each pointing into |range_table_items_|.
unique_ptr<POSMatcher::Range *[]> range_tables_;
// All ranges of all tables, each table followed by a {0xFFFF, 0xFFFF} entry.
unique_ptr<Range[]> range_table_items_;
unique_ptr<BoundaryData[]> boundary_data_;
unique_ptr<SuffixToken[]> suffix_tokens_;
unique_ptr<ReadingCorrectionItem[]> reading_corrections_;
// Values of segmenter_data().compressed_l_size / compressed_r_size.
size_t compressed_l_size_;
size_t compressed_r_size_;
unique_ptr<uint16[]> compressed_lid_table_;
unique_ptr<uint16[]> compressed_rid_table_;
// Values of all symbol dictionary tokens plus one trailing all-zero entry.
unique_ptr<EmbeddedDictionary::Value[]> symbol_data_values_;
// Number of symbol dictionary tokens, not counting the trailing entry.
size_t symbol_data_token_size_;
// Symbol dictionary tokens plus one trailing entry whose key is NULL.
unique_ptr<EmbeddedDictionary::Token[]> symbol_data_tokens_;
// Built on top of |rule_id_table_| and |range_tables_|.
unique_ptr<POSMatcher> pos_matcher_;
// The parsed protobuf; owns the strings referenced by the arrays above.
unique_ptr<SystemDictionaryData> system_dictionary_data_;
#ifndef NO_USAGE_REWRITER
unique_ptr<ConjugationSuffix[]> base_conjugation_suffix_;
unique_ptr<ConjugationSuffix[]> conjugation_suffix_data_;
// Entry i is the offset in |conjugation_suffix_data_| of the first suffix of
// conjugation i; there is one extra entry holding the total count.
unique_ptr<int[]> conjugation_suffix_data_index_;
// Usage items followed by one trailing entry with NULL strings and zero ids.
unique_ptr<UsageDictItem[]> usage_data_value_;
#endif // NO_USAGE_REWRITER
// Left null when the protobuf has no counter_suffix_data entries.
unique_ptr<CounterSuffixEntry[]> counter_suffix_data_;
};
// Zero-initializes the size fields.  The unique_ptr members are left to their
// default (empty) state.
PackedDataManager::Impl::Impl()
    : compressed_l_size_(0), compressed_r_size_(0), symbol_data_token_size_(0) {}
// Nothing to do explicitly: every resource is held by a unique_ptr member.
PackedDataManager::Impl::~Impl() {}
// Parses |system_dictionary_data| as a serialized SystemDictionaryData and, on
// success, builds the in-memory tables.  Returns false if parsing fails or if
// InitializeWithSystemDictionaryData() rejects the data.
bool PackedDataManager::Impl::Init(const string &system_dictionary_data) {
  system_dictionary_data_.reset(new SystemDictionaryData);
  const bool parsed =
      system_dictionary_data_->ParseFromString(system_dictionary_data);
  if (parsed) {
    return InitializeWithSystemDictionaryData();
  }
  LOG(ERROR) << "System dictionary data protobuf format error!";
  return false;
}
bool PackedDataManager::Impl::InitWithZippedData(
const string &zipped_system_dictionary_data) {
protobuf::io::ArrayInputStream input(zipped_system_dictionary_data.data(),
zipped_system_dictionary_data.size());
protobuf::io::GzipInputStream gzip_stream(&input);
protobuf::io::CodedInputStream coded_stream(&gzip_stream);
// Disables the total bytes warning.
coded_stream.SetTotalBytesLimit(kDefaultTotalBytesLimit, -1);
system_dictionary_data_.reset(new SystemDictionaryData);
if (!system_dictionary_data_->ParseFromCodedStream(&coded_stream)) {
LOG(ERROR) << "System dictionary data protobuf format error!";
return false;
}
return InitializeWithSystemDictionaryData();
}
// Returns a copy of the product_version field of the parsed protobuf.
string PackedDataManager::Impl::GetDictionaryVersion() {
  const string &version = system_dictionary_data_->product_version();
  return version;
}
bool PackedDataManager::Impl::InitializeWithSystemDictionaryData() {
// Checks format version.
if (system_dictionary_data_->format_version() !=
kSystemDictionaryFormatVersion) {
LOG(ERROR) << "System dictionary data format version miss match! "
<< " expected:" << kSystemDictionaryFormatVersion
<< " actual:" << system_dictionary_data_->format_version();
return false;
}
// Makes UserPOS data.
pos_token_.reset(
new UserPOS::POSToken[system_dictionary_data_->pos_tokens_size()]);
size_t conjugation_count = 0;
for (size_t i = 0; i < system_dictionary_data_->pos_tokens_size(); ++i) {
conjugation_count +=
system_dictionary_data_->pos_tokens(i).conjugation_forms_size();
}
conjugation_array_.reset(new UserPOS::ConjugationType[conjugation_count]);
size_t conjugation_index = 0;
for (size_t i = 0; i < system_dictionary_data_->pos_tokens_size(); ++i) {
const SystemDictionaryData::PosToken &pos_token =
system_dictionary_data_->pos_tokens(i);
if (pos_token.has_pos()) {
pos_token_[i].pos = pos_token.pos().data();
} else {
pos_token_[i].pos = NULL;
}
pos_token_[i].conjugation_size =
pos_token.conjugation_forms_size();
pos_token_[i].conjugation_form = &conjugation_array_[conjugation_index];
if (pos_token.conjugation_forms_size() == 0) {
pos_token_[i].conjugation_form = NULL;
}
for (size_t j = 0; j < pos_token.conjugation_forms_size(); ++j) {
const SystemDictionaryData::PosToken::ConjugationType &conjugation_form =
pos_token.conjugation_forms(j);
if (conjugation_form.has_key_suffix()) {
conjugation_array_[conjugation_index].key_suffix =
conjugation_form.key_suffix().data();
} else {
conjugation_array_[conjugation_index].key_suffix = NULL;
}
if (conjugation_form.has_value_suffix()) {
conjugation_array_[conjugation_index].value_suffix =
conjugation_form.value_suffix().data();
} else {
conjugation_array_[conjugation_index].value_suffix = NULL;
}
conjugation_array_[conjugation_index].id = conjugation_form.id();
++conjugation_index;
}
}
// Makes POSMatcher data.
rule_id_table_.reset(
new uint16[
system_dictionary_data_->pos_matcher_data().rule_id_table_size()]);
for (size_t i = 0;
i < system_dictionary_data_->pos_matcher_data().rule_id_table_size();
++i) {
rule_id_table_[i] =
system_dictionary_data_->pos_matcher_data().rule_id_table(i);
}
const SystemDictionaryData::PosMatcherData &pos_matcher_data =
system_dictionary_data_->pos_matcher_data();
range_tables_.reset(
new POSMatcher::Range*[pos_matcher_data.range_tables_size()]);
size_t range_count = 0;
for (size_t i = 0; i < pos_matcher_data.range_tables_size(); ++i) {
range_count += pos_matcher_data.range_tables(i).ranges_size();
}
range_table_items_.reset(
new Range[range_count + pos_matcher_data.range_tables_size()]);
size_t range_index = 0;
for (size_t i = 0; i < pos_matcher_data.range_tables_size(); ++i) {
const SystemDictionaryData::PosMatcherData::RangeTable &table =
pos_matcher_data.range_tables(i);
range_tables_[i] =
reinterpret_cast<POSMatcher::Range *>(&range_table_items_[range_index]);
for (size_t j = 0; j < table.ranges_size(); ++j) {
const SystemDictionaryData::PosMatcherData::RangeTable::Range &range =
table.ranges(j);
range_table_items_[range_index].lower = range.lower();
range_table_items_[range_index].upper = range.upper();
++range_index;
}
range_table_items_[range_index].lower = static_cast<uint16>(0xFFFF);
range_table_items_[range_index].upper = static_cast<uint16>(0xFFFF);
++range_index;
}
// Makes boundary data.
boundary_data_.reset(
new BoundaryData[system_dictionary_data_->boundary_data_size()]);
for (size_t i = 0; i < system_dictionary_data_->boundary_data_size(); ++i) {
const SystemDictionaryData::BoundaryData &boundary_data =
system_dictionary_data_->boundary_data(i);
boundary_data_[i].prefix_penalty = boundary_data.prefix_penalty();
boundary_data_[i].suffix_penalty = boundary_data.suffix_penalty();
}
// Makes suffix data.
suffix_tokens_.reset(
new SuffixToken[system_dictionary_data_->suffix_tokens_size()]);
for (size_t i = 0;
i < system_dictionary_data_->suffix_tokens_size();
++i) {
const SystemDictionaryData::SuffixToken &suffix_token =
system_dictionary_data_->suffix_tokens(i);
if (suffix_token.has_key()) {
suffix_tokens_[i].key = suffix_token.key().data();
} else {
suffix_tokens_[i].key = NULL;
}
if (suffix_token.has_value()) {
suffix_tokens_[i].value = suffix_token.value().data();
} else {
suffix_tokens_[i].value = NULL;
}
suffix_tokens_[i].lid = suffix_token.lid();
suffix_tokens_[i].rid = suffix_token.rid();
suffix_tokens_[i].wcost = suffix_token.wcost();
}
// Makes reading correction data.
reading_corrections_.reset(
new ReadingCorrectionItem[
system_dictionary_data_->reading_corrections_size()]);
for (size_t i = 0;
i < system_dictionary_data_->reading_corrections_size();
++i) {
const SystemDictionaryData::ReadingCorrectionItem &item =
system_dictionary_data_->reading_corrections(i);
if (item.has_value()) {
reading_corrections_[i].value = item.value().data();
} else {
reading_corrections_[i].value = NULL;
}
if (item.has_error()) {
reading_corrections_[i].error = item.error().data();
} else {
reading_corrections_[i].error = NULL;
}
if (item.has_correction()) {
reading_corrections_[i].correction = item.correction().data();
} else {
reading_corrections_[i].correction = NULL;
}
}
// Makes segment data.
const SystemDictionaryData::SegmenterData &segmenter_data =
system_dictionary_data_->segmenter_data();
compressed_l_size_ = segmenter_data.compressed_l_size();
compressed_r_size_ = segmenter_data.compressed_r_size();
compressed_lid_table_.reset(
new uint16[segmenter_data.compressed_lid_table_size()]);
for (size_t i = 0; i < segmenter_data.compressed_lid_table_size(); ++i) {
compressed_lid_table_[i] = segmenter_data.compressed_lid_table(i);
}
compressed_rid_table_.reset(
new uint16[segmenter_data.compressed_rid_table_size()]);
for (size_t i = 0; i < segmenter_data.compressed_rid_table_size(); ++i) {
compressed_rid_table_[i] = segmenter_data.compressed_rid_table(i);
}
// Makes symbol dictionary data.
const SystemDictionaryData::EmbeddedDictionary &symbol_dictionary =
system_dictionary_data_->symbol_dictionary();
size_t symbol_value_count = 0;
for (size_t i = 0; i < symbol_dictionary.tokens_size(); ++i) {
symbol_value_count += symbol_dictionary.tokens(i).values_size();
}
symbol_data_values_.reset(
new EmbeddedDictionary::Value[symbol_value_count + 1]);
symbol_data_token_size_ = symbol_dictionary.tokens_size();
symbol_data_tokens_.reset(
new EmbeddedDictionary::Token[symbol_data_token_size_ + 1]);
EmbeddedDictionary::Value *value_ptr = symbol_data_values_.get();
EmbeddedDictionary::Token *token_ptr = symbol_data_tokens_.get();
for (size_t i = 0; i < symbol_dictionary.tokens_size(); ++i) {
const SystemDictionaryData::EmbeddedDictionary::Token &token =
symbol_dictionary.tokens(i);
token_ptr->key = token.key().data();
token_ptr->value = value_ptr;
token_ptr->value_size = token.values_size();
++token_ptr;
for (size_t j = 0; j < token.values_size(); ++j) {
const SystemDictionaryData::EmbeddedDictionary::Value &value =
token.values(j);
if (value.has_value()) {
value_ptr->value = value.value().data();
} else {
value_ptr->value = NULL;
}
if (value.has_description()) {
value_ptr->description = value.description().data();
} else {
value_ptr->description = NULL;
}
if (value.has_additional_description()) {
value_ptr->additional_description =
value.additional_description().data();
} else {
value_ptr->additional_description = NULL;
}
value_ptr->lid = value.lid();
value_ptr->rid = value.rid();
value_ptr->cost = value.cost();
++value_ptr;
}
}
value_ptr->value = NULL;
value_ptr->description = NULL;
value_ptr->additional_description = NULL;
value_ptr->lid = 0;
value_ptr->rid = 0;
value_ptr->cost = 0;
token_ptr->key = NULL;
token_ptr->value = symbol_data_values_.get();
token_ptr->value_size = symbol_value_count;
// Makes POSMatcher.
pos_matcher_.reset(
new PackedPOSMatcher(rule_id_table_.get(), range_tables_.get()));
#ifndef NO_USAGE_REWRITER
// Makes Usge rewriter data.
const SystemDictionaryData::UsageRewriterData &usage_rewriter_data =
system_dictionary_data_->usage_rewriter_data();
const size_t conjugation_num = usage_rewriter_data.conjugations_size();
base_conjugation_suffix_.reset(new ConjugationSuffix[conjugation_num]);
conjugation_suffix_data_index_.reset(new int[conjugation_num + 1]);
size_t suffix_data_num = 0;
conjugation_suffix_data_index_[0] = 0;
for (size_t i = 0; i < conjugation_num; ++i) {
const SystemDictionaryData::UsageRewriterData::Conjugation &conjugation =
usage_rewriter_data.conjugations(i);
base_conjugation_suffix_[i].value_suffix =
conjugation.base_suffix().value_suffix().data();
base_conjugation_suffix_[i].key_suffix =
conjugation.base_suffix().key_suffix().data();
suffix_data_num +=
usage_rewriter_data.conjugations(i).conjugation_suffixes_size();
conjugation_suffix_data_index_[i + 1] = suffix_data_num;
}
conjugation_suffix_data_.reset(new ConjugationSuffix[suffix_data_num]);
size_t conjugation_suffix_id = 0;
for (size_t i = 0; i < conjugation_num; ++i) {
const SystemDictionaryData::UsageRewriterData::Conjugation &conjugation =
usage_rewriter_data.conjugations(i);
for (size_t j = 0; j < conjugation.conjugation_suffixes_size(); ++j) {
conjugation_suffix_data_[conjugation_suffix_id].value_suffix =
conjugation.conjugation_suffixes(j).value_suffix().data();
conjugation_suffix_data_[conjugation_suffix_id].key_suffix =
conjugation.conjugation_suffixes(j).key_suffix().data();
++conjugation_suffix_id;
}
}
usage_data_value_.reset(
new UsageDictItem[usage_rewriter_data.usage_data_values_size() + 1]);
for (size_t i = 0; i < usage_rewriter_data.usage_data_values_size(); ++i) {
const SystemDictionaryData::UsageRewriterData::UsageDictItem &item =
usage_rewriter_data.usage_data_values(i);
usage_data_value_[i].id = item.id();
usage_data_value_[i].key = item.key().data();
usage_data_value_[i].value = item.value().data();
usage_data_value_[i].conjugation_id = item.conjugation_id();
usage_data_value_[i].meaning = item.meaning().data();
}
UsageDictItem *last_item =
&usage_data_value_[usage_rewriter_data.usage_data_values_size()];
last_item->id = 0;
last_item->key = NULL;
last_item->value = NULL;
last_item->conjugation_id = 0;
last_item->meaning = NULL;
#endif // NO_USAGE_REWRITER
// Makes counter suffix sorted array.
{
const size_t size =
system_dictionary_data_->counter_suffix_data_size();
if (size > 0) {
counter_suffix_data_.reset(new CounterSuffixEntry[size]);
for (size_t i = 0; i < size; ++i) {
counter_suffix_data_[i].suffix =
system_dictionary_data_->counter_suffix_data(i).data();
counter_suffix_data_[i].size =
system_dictionary_data_->counter_suffix_data(i).size();
}
}
}
return true;
}
// Returns the POSToken array built during initialization.  The array stays
// owned by this object.
const UserPOS::POSToken *PackedDataManager::Impl::GetUserPOSData() const {
  const UserPOS::POSToken *const tokens = pos_token_.get();
  return tokens;
}
// Returns the POSMatcher built during initialization.  The matcher stays
// owned by this object.
const POSMatcher *PackedDataManager::Impl::GetPOSMatcher() const {
  const POSMatcher *const matcher = pos_matcher_.get();
  return matcher;
}
// Returns the bytes of the protobuf's lid_group_data field, viewed as uint8.
const uint8 *PackedDataManager::Impl::GetPosGroupData() const {
  const string &lid_group = system_dictionary_data_->lid_group_data();
  return reinterpret_cast<const uint8 *>(lid_group.data());
}
void PackedDataManager::Impl::GetConnectorData(
const char **data,
size_t *size) const {
*data = system_dictionary_data_->connection_data().data();
*size = system_dictionary_data_->connection_data().size();
}
void PackedDataManager::Impl::GetSegmenterData(
size_t *l_num_elements, size_t *r_num_elements,
const uint16 **l_table, const uint16 **r_table,
size_t *bitarray_num_bytes, const char **bitarray_data,
const BoundaryData **boundary_data) const {
*l_num_elements = compressed_l_size_;
*r_num_elements = compressed_r_size_;
*l_table = compressed_lid_table_.get();
*r_table = compressed_rid_table_.get();
*bitarray_num_bytes =
system_dictionary_data_->segmenter_data().bit_array_data().size();
*bitarray_data =
system_dictionary_data_->segmenter_data().bit_array_data().data();
*boundary_data = boundary_data_.get();
}
void PackedDataManager::Impl::GetSystemDictionaryData(
const char **data,
int *size) const {
*data = system_dictionary_data_->dictionary_data().data();
*size = system_dictionary_data_->dictionary_data().size();
}
// Stores the suffix token array built during initialization in |*data| and
// the number of suffix_tokens entries of the protobuf in |*size|.
void PackedDataManager::Impl::GetSuffixDictionaryData(const SuffixToken **data,
                                                      size_t *size) const {
  *size = system_dictionary_data_->suffix_tokens().size();
  *data = suffix_tokens_.get();
}
// Stores the reading correction array built during initialization in |*array|
// and the number of reading_corrections entries of the protobuf in |*size|.
void PackedDataManager::Impl::GetReadingCorrectionData(
    const ReadingCorrectionItem **array, size_t *size) const {
  *size = system_dictionary_data_->reading_corrections().size();
  *array = reading_corrections_.get();
}
void PackedDataManager::Impl::GetCollocationData(
const char **array,
size_t *size) const {
*array = system_dictionary_data_->collocation_data().data();
*size = system_dictionary_data_->collocation_data().size();
}
void PackedDataManager::Impl::GetCollocationSuppressionData(
const char **array,
size_t *size) const {
*array = system_dictionary_data_->collocation_suppression_data().data();
*size = system_dictionary_data_->collocation_suppression_data().size();
}
void PackedDataManager::Impl::GetSuggestionFilterData(
const char **data,
size_t *size) const {
*data = system_dictionary_data_->suggestion_filter_data().data();
*size = system_dictionary_data_->suggestion_filter_data().size();
}
// Stores the symbol dictionary token array in |*data| and the number of
// tokens (excluding the extra trailing entry) in |*size|.
void PackedDataManager::Impl::GetSymbolRewriterData(
    const EmbeddedDictionary::Token **data, size_t *size) const {
  *size = symbol_data_token_size_;
  *data = symbol_data_tokens_.get();
}
#ifndef NO_USAGE_REWRITER
// Hands out the four usage rewriter arrays built during initialization.  All
// of them stay owned by this object.
void PackedDataManager::Impl::GetUsageRewriterData(
    const ConjugationSuffix **base_conjugation_suffix,
    const ConjugationSuffix **conjugation_suffix_data,
    const int **conjugation_suffix_data_index,
    const UsageDictItem **usage_data_value) const {
  *usage_data_value = usage_data_value_.get();
  *conjugation_suffix_data_index = conjugation_suffix_data_index_.get();
  *conjugation_suffix_data = conjugation_suffix_data_.get();
  *base_conjugation_suffix = base_conjugation_suffix_.get();
}
#endif // NO_USAGE_REWRITER
// Test-only accessor for the rule id table copied during initialization.
const uint16 *PackedDataManager::Impl::GetRuleIdTableForTest() const {
  const uint16 *const table = rule_id_table_.get();
  return table;
}
// Test-only accessor for the array of range table pointers, returned as an
// untyped pointer.
const void *PackedDataManager::Impl::GetRangeTablesForTest() const {
  const void *const tables = range_tables_.get();
  return tables;
}
// Stores the counter suffix array in |*array| and the number of
// counter_suffix_data entries of the protobuf in |*size|.  |*array| is NULL
// when there are no entries, because the array is only allocated for a
// non-zero count.
void PackedDataManager::Impl::GetCounterSuffixSortedArray(
    const CounterSuffixEntry **array, size_t *size) const {
  *size = system_dictionary_data_->counter_suffix_data_size();
  *array = counter_suffix_data_.get();
}
// Creates a manager without an implementation object; Init() or
// InitWithZippedData() creates it.
PackedDataManager::PackedDataManager() {}
// No explicit cleanup is performed here.
PackedDataManager::~PackedDataManager() {}
// Replaces the implementation object with a fresh one initialized from the
// serialized |system_dictionary_data|.  On failure the implementation object
// is dropped again and false is returned.
bool PackedDataManager::Init(const string &system_dictionary_data) {
  manager_impl_.reset(new Impl());
  const bool initialized = manager_impl_->Init(system_dictionary_data);
  if (!initialized) {
    LOG(ERROR) << "PackedDataManager initialization error";
    manager_impl_.reset();
  }
  return initialized;
}
// Replaces the implementation object with a fresh one initialized from the
// gzip-compressed |zipped_system_dictionary_data|.  On failure the
// implementation object is dropped again and false is returned.
bool PackedDataManager::InitWithZippedData(
    const string &zipped_system_dictionary_data) {
  manager_impl_.reset(new Impl());
  const bool initialized =
      manager_impl_->InitWithZippedData(zipped_system_dictionary_data);
  if (!initialized) {
    LOG(ERROR) << "PackedDataManager initialization error";
    manager_impl_.reset();
  }
  return initialized;
}
// Forwards to the implementation object, which must already exist.
string PackedDataManager::GetDictionaryVersion() {
  const string version = manager_impl_->GetDictionaryVersion();
  return version;
}
// Forwards to the implementation object, which must already exist.
const UserPOS::POSToken *PackedDataManager::GetUserPOSData() const {
  const UserPOS::POSToken *const tokens = manager_impl_->GetUserPOSData();
  return tokens;
}
// Returns the registered manager.  If none is registered yet, loads one from
// the file named by FLAGS_dataset and registers it first.
//
// This is fatal (LOG(FATAL) / CHECK) when FLAGS_dataset is empty, when the
// file cannot be opened, or when no manager is registered after the load
// attempt.
PackedDataManager *PackedDataManager::GetUserPosManager() {
  if (g_data_manager.get() == NULL) {
    LOG(INFO) << "PackedDataManager::GetUserPosManager null!";
    LOG(INFO) << "FLAGS_dataset: [" << FLAGS_dataset << "]";
    if (!FLAGS_dataset.empty()) {
      unique_ptr<PackedDataManager> candidate(new PackedDataManager);
      string packed_bytes;
      {
        // The mapping is scoped to this block; its contents are copied into
        // |packed_bytes| before it goes away.
        Mmap mapped_file;
        CHECK(mapped_file.Open(FLAGS_dataset.c_str(), "r"));
        packed_bytes.assign(mapped_file.begin(), mapped_file.size());
      }
      if (candidate->Init(packed_bytes)) {
        // Ownership moves to the global registry.
        RegisterPackedDataManager(candidate.release());
      }
    } else {
      LOG(FATAL) << "PackedDataManager::GetUserPosManager ERROR!";
    }
  }
  CHECK(g_data_manager.get()) << "PackedDataManager::GetUserPosManager ERROR!";
  return g_data_manager.get();
}
// Forwards to the implementation object, which must already exist.
const POSMatcher *PackedDataManager::GetPOSMatcher() const {
  const POSMatcher *const matcher = manager_impl_->GetPOSMatcher();
  return matcher;
}
// Forwards to the implementation object, which must already exist.
const uint8 *PackedDataManager::GetPosGroupData() const {
  const uint8 *const group_data = manager_impl_->GetPosGroupData();
  return group_data;
}
// Forwards to the implementation object, which fills |*data| and |*size|.
void PackedDataManager::GetConnectorData(const char **data,
                                         size_t *size) const {
  manager_impl_->GetConnectorData(data, size);
}
// Forwards to the implementation object, which fills every output parameter.
void PackedDataManager::GetSegmenterData(
    size_t *l_num_elements, size_t *r_num_elements,
    const uint16 **l_table, const uint16 **r_table,
    size_t *bitarray_num_bytes, const char **bitarray_data,
    const BoundaryData **boundary_data) const {
  manager_impl_->GetSegmenterData(l_num_elements, r_num_elements,
                                  l_table, r_table,
                                  bitarray_num_bytes, bitarray_data,
                                  boundary_data);
}
// Forwards to the implementation object, which fills |*data| and |*size|.
void PackedDataManager::GetSystemDictionaryData(const char **data,
                                                int *size) const {
  manager_impl_->GetSystemDictionaryData(data, size);
}
// Forwards to the implementation object, which fills |*data| and |*size|.
void PackedDataManager::GetSuffixDictionaryData(const SuffixToken **data,
                                                size_t *size) const {
  manager_impl_->GetSuffixDictionaryData(data, size);
}
// Forwards to the implementation object, which fills |*array| and |*size|.
void PackedDataManager::GetReadingCorrectionData(
    const ReadingCorrectionItem **array, size_t *size) const {
  manager_impl_->GetReadingCorrectionData(array, size);
}
// Forwards to the implementation object, which fills |*array| and |*size|.
void PackedDataManager::GetCollocationData(const char **array,
                                           size_t *size) const {
  manager_impl_->GetCollocationData(array, size);
}
// Forwards to the implementation object, which fills |*array| and |*size|.
void PackedDataManager::GetCollocationSuppressionData(const char **array,
                                                      size_t *size) const {
  manager_impl_->GetCollocationSuppressionData(array, size);
}
// Forwards to the implementation object, which fills |*data| and |*size|.
void PackedDataManager::GetSuggestionFilterData(const char **data,
                                                size_t *size) const {
  manager_impl_->GetSuggestionFilterData(data, size);
}
// Forwards to the implementation object, which fills |*data| and |*size|.
void PackedDataManager::GetSymbolRewriterData(
    const EmbeddedDictionary::Token **data, size_t *size) const {
  manager_impl_->GetSymbolRewriterData(data, size);
}
#ifndef NO_USAGE_REWRITER
// Forwards to the implementation object, which fills all four output
// pointers.
void PackedDataManager::GetUsageRewriterData(
    const ConjugationSuffix **base_conjugation_suffix,
    const ConjugationSuffix **conjugation_suffix_data,
    const int **conjugation_suffix_data_index,
    const UsageDictItem **usage_data_value) const {
  manager_impl_->GetUsageRewriterData(
      base_conjugation_suffix, conjugation_suffix_data,
      conjugation_suffix_data_index, usage_data_value);
}
#endif // NO_USAGE_REWRITER
// Forwards to the implementation object, which fills |*array| and |*size|.
void PackedDataManager::GetCounterSuffixSortedArray(
    const CounterSuffixEntry **array,
    size_t *size) const {
  manager_impl_->GetCounterSuffixSortedArray(array, size);
}
// Test-only: forwards to the implementation object.
const uint16 *PackedDataManager::GetRuleIdTableForTest() const {
  const uint16 *const table = manager_impl_->GetRuleIdTableForTest();
  return table;
}
// Test-only: forwards to the implementation object.
const void *PackedDataManager::GetRangeTablesForTest() const {
  const void *const tables = manager_impl_->GetRangeTablesForTest();
  return tables;
}
// Makes |packed_data_manager| the registered manager.  The file-local
// unique_ptr takes ownership of it, and any previously registered manager is
// deleted by the reset.
void RegisterPackedDataManager(PackedDataManager *packed_data_manager) {
  g_data_manager.reset(packed_data_manager);
}
// Returns the registered manager, or NULL if none has been registered.
// Ownership stays with the registry.
PackedDataManager *GetPackedDataManager() {
  PackedDataManager *const registered = g_data_manager.get();
  return registered;
}
} // namespace packed
} // namespace mozc