// blob: f29a779cc5cc4b820d9cfe1b11a26f7716714eb8 [file] [log] [blame]
// Copyright 2010-2015, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "data_manager/packed/system_dictionary_data_packer.h"
#include <string>
#include "base/codegen_bytearray_stream.h"
#include "base/file_stream.h"
#include "base/protobuf/gzip_stream.h"
#include "base/protobuf/zero_copy_stream_impl.h"
#include "base/logging.h"
#include "base/version.h"
#include "converter/boundary_struct.h"
#include "data_manager/packed/system_dictionary_data.pb.h"
#include "data_manager/packed/system_dictionary_format_version.h"
#include "dictionary/suffix_dictionary_token.h"
#include "dictionary/pos_group.h"
#include "dictionary/pos_matcher.h"
#include "dictionary/user_pos.h"
#include "rewriter/correction_rewriter.h"
#include "rewriter/counter_suffix.h"
#include "rewriter/embedded_dictionary.h"
#ifndef NO_USAGE_REWRITER
#include "rewriter/usage_rewriter_data_structs.h"
#endif // NO_USAGE_REWRITER
using mozc::dictionary::POSMatcher;
using mozc::dictionary::SuffixToken;
using mozc::dictionary::UserPOS;
namespace mozc {
namespace packed {
// Allocates a fresh SystemDictionaryData message and stamps it with
// |version| as the product version and kSystemDictionaryFormatVersion as
// the format version.
SystemDictionaryDataPacker::SystemDictionaryDataPacker(const string &version) {
  system_dictionary_.reset(new SystemDictionaryData());
  SystemDictionaryData &data = *system_dictionary_;
  data.set_format_version(kSystemDictionaryFormatVersion);
  data.set_product_version(version);
}
// Has no explicit work to do; the body is empty.
SystemDictionaryDataPacker::~SystemDictionaryDataPacker() {}
// Appends one PosToken message for each of the |token_count| entries in
// |pos_token_data|, together with all of the entry's conjugation forms.
// String fields (pos, key_suffix, value_suffix) whose source pointer is null
// are left unset; the conjugation id is always copied.
void SystemDictionaryDataPacker::SetPosTokens(
    const UserPOS::POSToken *pos_token_data,
    size_t token_count) {
  for (size_t token_index = 0; token_index < token_count; ++token_index) {
    const UserPOS::POSToken &source = pos_token_data[token_index];
    SystemDictionaryData::PosToken *packed_token =
        system_dictionary_->add_pos_tokens();
    if (source.pos) {
      packed_token->set_pos(source.pos);
    }

    for (size_t form_index = 0; form_index < source.conjugation_size;
         ++form_index) {
      SystemDictionaryData::PosToken::ConjugationType *packed_form =
          packed_token->add_conjugation_forms();
      packed_form->set_id(source.conjugation_form[form_index].id);
      if (source.conjugation_form[form_index].key_suffix) {
        packed_form->set_key_suffix(
            source.conjugation_form[form_index].key_suffix);
      }
      if (source.conjugation_form[form_index].value_suffix) {
        packed_form->set_value_suffix(
            source.conjugation_form[form_index].value_suffix);
      }
    }
  }
}
// Copies the POS matcher tables into the message.
//   rule_id_table / rule_id_table_count: array of rule ids and its length.
//   range_tables / range_tables_count: array of pointers to range arrays.
// Each range array is not length-prefixed: it is walked until an entry whose
// |lower| equals 0xFFFF is reached, and that terminating entry is not copied.
void SystemDictionaryDataPacker::SetPosMatcherData(
    const uint16 *rule_id_table,
    size_t rule_id_table_count,
    const POSMatcher::Range *const *range_tables,
    size_t range_tables_count) {
  SystemDictionaryData::PosMatcherData *matcher =
      system_dictionary_->mutable_pos_matcher_data();

  for (size_t rule_index = 0; rule_index < rule_id_table_count; ++rule_index) {
    matcher->add_rule_id_table(rule_id_table[rule_index]);
  }

  for (size_t table_index = 0; table_index < range_tables_count;
       ++table_index) {
    SystemDictionaryData::PosMatcherData::RangeTable *packed_table =
        matcher->add_range_tables();
    const POSMatcher::Range *entry = range_tables[table_index];
    while (entry->lower != static_cast<uint16>(0xFFFF)) {
      SystemDictionaryData::PosMatcherData::RangeTable::Range *packed_range =
          packed_table->add_ranges();
      packed_range->set_lower(entry->lower);
      packed_range->set_upper(entry->upper);
      ++entry;
    }
  }
}
void SystemDictionaryDataPacker::SetBoundaryData(
const BoundaryData *boundary_data,
size_t boundary_data_count) {
for (size_t i = 0; i < boundary_data_count; ++i) {
SystemDictionaryData::BoundaryData *boundary =
system_dictionary_->add_boundary_data();
boundary->set_prefix_penalty(boundary_data[i].prefix_penalty);
boundary->set_suffix_penalty(boundary_data[i].suffix_penalty);
}
}
// Forwards the raw buffer |lid_group_data| of |lid_group_data_size| bytes to
// the message's lid_group_data field.
void SystemDictionaryDataPacker::SetLidGroupData(const void *lid_group_data,
                                                 size_t lid_group_data_size) {
  system_dictionary_->set_lid_group_data(lid_group_data, lid_group_data_size);
}
void SystemDictionaryDataPacker::SetSuffixTokens(
const SuffixToken *suffix_tokens,
size_t suffix_tokens_count) {
for (size_t i = 0; i < suffix_tokens_count; ++i) {
SystemDictionaryData::SuffixToken *suffix_token =
system_dictionary_->add_suffix_tokens();
if (suffix_tokens[i].key) {
suffix_token->set_key(suffix_tokens[i].key);
}
if (suffix_tokens[i].value) {
suffix_token->set_value(suffix_tokens[i].value);
}
suffix_token->set_lid(suffix_tokens[i].lid);
suffix_token->set_rid(suffix_tokens[i].rid);
suffix_token->set_wcost(suffix_tokens[i].wcost);
}
}
void SystemDictionaryDataPacker::SetReadingCorretions(
const ReadingCorrectionItem *reading_corrections,
size_t reading_corrections_count) {
for (size_t i = 0; i < reading_corrections_count; ++i) {
SystemDictionaryData::ReadingCorrectionItem *item =
system_dictionary_->add_reading_corrections();
if (reading_corrections[i].value) {
item->set_value(reading_corrections[i].value);
}
if (reading_corrections[i].error) {
item->set_error(reading_corrections[i].error);
}
if (reading_corrections[i].correction) {
item->set_correction(reading_corrections[i].correction);
}
}
}
void SystemDictionaryDataPacker::SetSegmenterData(
size_t compressed_l_size,
size_t compressed_r_size,
const uint16 *compressed_lid_table,
size_t compressed_lid_table_size,
const uint16 *compressed_rid_table,
size_t compressed_rid_table_size,
const char *segmenter_bit_array_data,
size_t segmenter_bit_array_data_size) {
SystemDictionaryData::SegmenterData *segmenter =
system_dictionary_->mutable_segmenter_data();
segmenter->set_compressed_l_size(compressed_l_size);
segmenter->set_compressed_r_size(compressed_r_size);
for (size_t i = 0; i < compressed_lid_table_size; ++i) {
segmenter->add_compressed_lid_table(compressed_lid_table[i]);
}
for (size_t i = 0; i < compressed_rid_table_size; ++i) {
segmenter->add_compressed_rid_table(compressed_rid_table[i]);
}
segmenter->set_bit_array_data(segmenter_bit_array_data,
segmenter_bit_array_data_size);
}
// Forwards the raw buffer |suggestion_filter_data| of
// |suggestion_filter_data_size| bytes to the message's
// suggestion_filter_data field.
void SystemDictionaryDataPacker::SetSuggestionFilterData(
    const void *suggestion_filter_data, size_t suggestion_filter_data_size) {
  system_dictionary_->set_suggestion_filter_data(suggestion_filter_data,
                                                 suggestion_filter_data_size);
}
// Forwards the raw buffer |connection_data| of |connection_data_size| bytes
// to the message's connection_data field.
void SystemDictionaryDataPacker::SetConnectionData(
    const void *connection_data, size_t connection_data_size) {
  system_dictionary_->set_connection_data(connection_data,
                                          connection_data_size);
}
// Forwards the raw buffer |dictionary_data| of |dictionary_data_size| bytes
// to the message's dictionary_data field.
void SystemDictionaryDataPacker::SetDictionaryData(
    const void *dictionary_data, size_t dictionary_data_size) {
  system_dictionary_->set_dictionary_data(dictionary_data,
                                          dictionary_data_size);
}
// Forwards the raw buffer |collocation_data| of |collocation_data_size|
// bytes to the message's collocation_data field.
void SystemDictionaryDataPacker::SetCollocationData(
    const void *collocation_data, size_t collocation_data_size) {
  system_dictionary_->set_collocation_data(collocation_data,
                                           collocation_data_size);
}
// Forwards the raw buffer |collocation_suppression_data| of
// |collocation_suppression_data_size| bytes to the message's
// collocation_suppression_data field.
void SystemDictionaryDataPacker::SetCollocationSuppressionData(
    const void *collocation_suppression_data,
    size_t collocation_suppression_data_size) {
  system_dictionary_->set_collocation_suppression_data(
      collocation_suppression_data, collocation_suppression_data_size);
}
// Copies the symbol rewriter's embedded dictionary into the message.
//   token_data: array of |token_size| tokens; each token carries a key and
//               |value_size| values.
// For every value, lid / rid / cost are always copied. Every C-string field
// (the token key, and each value's value / description /
// additional_description) is copied only when its source pointer is
// non-null, so a null pointer is never handed to a string setter.
void SystemDictionaryDataPacker::SetSymbolRewriterData(
    const mozc::EmbeddedDictionary::Token *token_data,
    size_t token_size) {
  SystemDictionaryData::EmbeddedDictionary *symbol_dictionary =
      system_dictionary_->mutable_symbol_dictionary();
  for (size_t i = 0; i < token_size; ++i) {
    const mozc::EmbeddedDictionary::Token &token_source = token_data[i];
    SystemDictionaryData::EmbeddedDictionary::Token *token =
        symbol_dictionary->add_tokens();
    // Guard the key the same way as every other C-string field in this file:
    // passing a null const char* to a string setter is undefined behaviour.
    if (token_source.key) {
      token->set_key(token_source.key);
    }
    for (size_t j = 0; j < token_source.value_size; ++j) {
      const mozc::EmbeddedDictionary::Value &value_data =
          token_source.value[j];
      SystemDictionaryData::EmbeddedDictionary::Value *value =
          token->add_values();
      if (value_data.value) {
        value->set_value(value_data.value);
      }
      if (value_data.description) {
        value->set_description(value_data.description);
      }
      if (value_data.additional_description) {
        value->set_additional_description(value_data.additional_description);
      }
      value->set_lid(value_data.lid);
      value->set_rid(value_data.rid);
      value->set_cost(value_data.cost);
    }
  }
}
#ifndef NO_USAGE_REWRITER
void SystemDictionaryDataPacker::SetUsageRewriterData(
int conjugation_num,
const ConjugationSuffix *base_conjugation_suffix,
const ConjugationSuffix *conjugation_suffix_data,
const int *conjugation_suffix_data_index,
size_t usage_data_size,
const UsageDictItem *usage_data_value) {
SystemDictionaryData::UsageRewriterData *usage_rewriter_data =
system_dictionary_->mutable_usage_rewriter_data();
for (size_t i = 0; i < conjugation_num; ++i) {
SystemDictionaryData::UsageRewriterData::Conjugation *conjugation =
usage_rewriter_data->add_conjugations();
if (base_conjugation_suffix[i].value_suffix) {
conjugation->mutable_base_suffix()->set_value_suffix(
base_conjugation_suffix[i].value_suffix);
}
if (base_conjugation_suffix[i].key_suffix) {
conjugation->mutable_base_suffix()->set_key_suffix(
base_conjugation_suffix[i].key_suffix);
}
for (size_t j = conjugation_suffix_data_index[i];
j < conjugation_suffix_data_index[i + 1];
++j) {
SystemDictionaryData::UsageRewriterData::ConjugationSuffix *suffix =
conjugation->add_conjugation_suffixes();
if (conjugation_suffix_data[j].value_suffix) {
suffix->set_value_suffix(conjugation_suffix_data[j].value_suffix);
}
if (conjugation_suffix_data[j].key_suffix) {
suffix->set_key_suffix(conjugation_suffix_data[j].key_suffix);
}
}
}
for (size_t i = 0; i < usage_data_size; ++i) {
SystemDictionaryData::UsageRewriterData::UsageDictItem *item =
usage_rewriter_data->add_usage_data_values();
item->set_id(usage_data_value[i].id);
if (usage_data_value[i].key) {
item->set_key(usage_data_value[i].key);
}
if (usage_data_value[i].value) {
item->set_value(usage_data_value[i].value);
}
item->set_conjugation_id(usage_data_value[i].conjugation_id);
if (usage_data_value[i].meaning) {
item->set_meaning(usage_data_value[i].meaning);
}
}
}
#endif // NO_USAGE_REWRITER
void SystemDictionaryDataPacker::SetCounterSuffixSortedArray(
const CounterSuffixEntry *suffix_array, size_t size) {
for (size_t i = 0; i < size; ++i) {
const string suffix_str(suffix_array[i].suffix, suffix_array[i].size);
system_dictionary_->add_counter_suffix_data(suffix_str);
}
}
bool SystemDictionaryDataPacker::Output(const string &file_path,
bool use_gzip) {
OutputFileStream output(file_path.c_str(),
ios::out | ios::binary | ios::trunc);
if (use_gzip) {
protobuf::io::OstreamOutputStream zero_copy_output(&output);
protobuf::io::GzipOutputStream gzip_stream(&zero_copy_output);
if (!system_dictionary_->SerializeToZeroCopyStream(&gzip_stream) ||
!gzip_stream.Close()) {
LOG(ERROR) << "Serialize to gzip stream failed.";
}
} else {
if (!system_dictionary_->SerializeToOstream(&output)) {
LOG(ERROR) << "Failed to write data to " << file_path;
return false;
}
}
return true;
}
bool SystemDictionaryDataPacker::OutputHeader(
const string &file_path,
bool use_gzip) {
scoped_ptr<ostream> output_stream(
new mozc::OutputFileStream(file_path.c_str(), ios::out | ios::trunc));
CodeGenByteArrayOutputStream *codegen_stream;
output_stream.reset(
codegen_stream = new mozc::CodeGenByteArrayOutputStream(
output_stream.release(),
mozc::codegenstream::OWN_STREAM));
codegen_stream->OpenVarDef("PackedSystemDictionary");
if (use_gzip) {
protobuf::io::OstreamOutputStream zero_copy_output(output_stream.get());
protobuf::io::GzipOutputStream gzip_stream(&zero_copy_output);
if (!system_dictionary_->SerializeToZeroCopyStream(&gzip_stream) ||
!gzip_stream.Close()) {
LOG(ERROR) << "Serialize to gzip stream failed.";
}
} else {
if (!system_dictionary_->SerializeToOstream(output_stream.get())) {
LOG(ERROR) << "Failed to write data to " << file_path;
return false;
}
}
return true;
}
} // namespace packed
} // namespace mozc