src/rewriter/gen_usage_rewriter_dictionary_main.cc - mozc - Git at Google

 // Copyright 2010-2014, Google Inc.
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
 //
 //     * Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
 //     * Redistributions in binary form must reproduce the above
 // copyright notice, this list of conditions and the following disclaimer
 // in the documentation and/or other materials provided with the
 // distribution.
 //     * Neither the name of Google Inc. nor the names of its
 // contributors may be used to endorse or promote products derived from
 // this software without specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 // Usage dictionary generator:
 // % gen_usage_rewriter_dictionary_main
 //    --usage_data_file=usage_data.txt
 //    --cforms_file=cforms.def
 //    --output=output_header

 #include <algorithm>
 #include <iostream>
 #include <map>
 #include <set>
 #include <string>
 #include <vector>

 #include "base/file_stream.h"
 #include "base/logging.h"
 #include "base/util.h"

 DEFINE_string(usage_data_file, "", "usage data file");
 DEFINE_string(cforms_file, "", "cforms file");
 DEFINE_string(output, "", "output header file");

 namespace mozc {
 namespace {
 // One row of the cforms file as stored by LoadConjugation(): a single
 // inflected form belonging to some conjugation type.
 struct ConjugationType {
   // Name of the form (second field of the row).
   string form;
   // Third field of the row; a "*" in the file is stored as an empty string.
   string value_suffix;
   // Fourth field of the row; a "*" in the file is stored as an empty string.
   string key_suffix;
 };

 // One entry of the usage data file as stored by LoadUsage().
 struct UsageItem {
   // First field of the entry; "*" is stored as an empty string.
   string key;
   // Second field of the entry; "*" is stored as an empty string.
   string value;
   // Third field of the entry; "*" is stored as an empty string. Used as the
   // lookup key into the conjugation maps filled by LoadConjugation().
   string conjugation;
   // Sequential id LoadUsage() assigns to each distinct |conjugation| value,
   // in order of first appearance.
   int conjugation_id;
   // Fourth field of the entry after Util::StringReplace() has turned the
   // two-character sequence backslash + 'n' into a newline character.
   string meaning;
 };

 // Ordering predicate for sort(): true when |l| has a strictly smaller key
 // than |r|. Only the key takes part in the comparison.
 bool UsageItemKeynameCmp(const UsageItem& l, const UsageItem& r) {
   return l.key.compare(r.key) < 0;
 }

 // Load cforms_file
 void LoadConjugation(const string &filename,
                      map<string, vector<ConjugationType> > *output,
                      map<string, ConjugationType> *baseform_map) {
   InputFileStream ifs(filename.c_str());
   CHECK(ifs.good());

   string line;
   vector<string> fields;
   while (!getline(ifs, line).fail()) {
     if (line.empty() || line[0] == '#') {
       continue;
     }
     fields.clear();
     Util::SplitStringUsing(line, "\t ", &fields);
     CHECK_GE(fields.size(), 4)  << "format error: " << line;

     ConjugationType tmp;
     tmp.form = fields[1];
     tmp.value_suffix = ((fields[2] == "*") ? "" : fields[2]);
     tmp.key_suffix   = ((fields[3] == "*") ? "" : fields[3]);
     (*output)[fields[0]].push_back(tmp);   // insert

     if (tmp.form == "\xE5\x9F\xBA\xE6\x9C\xAC\xE5\xBD\xA2") {  // 基本形
       (*baseform_map)[fields[0]] = tmp;
     }
   }
 }

 // Load usage_data_file
 void LoadUsage(const string &filename,
                vector<UsageItem> *usage_entries,
                vector<string> *conjugation_list) {
   InputFileStream ifs(filename.c_str());

   if (!ifs.good()) {
     LOG(WARNING) << "Can't open file:" << filename;
     return;
   }

   string line;
   vector<string> fields;
   map<string, int> conjugation_id_map;

   int conjugation_id = 0;
   while (!getline(ifs, line).fail()) {
     if (line.empty() || line[0] == '#') {
       // starting with '#' is a comment line.
       continue;
     }
     fields.clear();
     Util::SplitStringAllowEmpty(line, "\t", &fields);
     CHECK_GE(fields.size(), 4) << "format error: " << line;

     UsageItem item;
     item.key = ((fields[0] == "*") ? "" : fields[0]);
     item.value = ((fields[1] == "*") ? "" : fields[1]);
     item.conjugation = ((fields[2] == "*") ? "" : fields[2]);
     string tmp = ((fields[3] == "*") ? "" : fields[3]);
     Util::StringReplace(tmp, "\\n", "\n", true, &item.meaning);

     map<string, int>::iterator it = conjugation_id_map.find(item.conjugation);
     if (it == conjugation_id_map.end()) {
       conjugation_id_map.insert(
         pair<string, int>(item.conjugation, conjugation_id));
       item.conjugation_id = conjugation_id;
       conjugation_list->push_back(item.conjugation);
       ++conjugation_id;
     } else {
       item.conjugation_id = it->second;
     }
     usage_entries->push_back(item);
   }
 }

 // remove "基本形"'s conjugation suffix
 void RemoveBaseformConjugationSuffix(
   const map<string, ConjugationType> &baseform_map,
   vector<UsageItem> *usage_entries) {
   for (vector<UsageItem>::iterator usage_itr = usage_entries->begin();
       usage_itr != usage_entries->end(); ++usage_itr) {
     const map<string, ConjugationType>::const_iterator baseform_itr =
       baseform_map.find(usage_itr->conjugation);
     if (baseform_itr == baseform_map.end()) {
       continue;
     }
     const ConjugationType &type = baseform_itr->second;

     if (usage_itr->key.length() <= type.key_suffix.length()) {
       LOG(WARNING) << "key:[" << usage_itr->key << "] is not longer then "
                    << "baseform.key_suffix  of \"" << usage_itr->conjugation
                    << "\" : [" << type.key_suffix << "]";
     }
     if (usage_itr->value.length() <= type.value_suffix.length()) {
       LOG(WARNING) << "value:[" << usage_itr->value << "] is not longer then "
                    << "baseform.value_suffix  of \"" << usage_itr->conjugation
                    << "\" : [" << type.value_suffix << "]";
     }

     usage_itr->key.erase(usage_itr->key.length() - type.key_suffix.length());
     usage_itr->value.erase(
         usage_itr->value.length() - type.value_suffix.length());
   }
 }

 void Convert() {
   // Load cforms_file
   map<string, vector<ConjugationType> > inflection_map;
   map<string, ConjugationType> baseform_map;
   LoadConjugation(FLAGS_cforms_file, &inflection_map, &baseform_map);

   // Load usage_data_file
   vector<UsageItem> usage_entries;
   vector<string> conjugation_list;
   LoadUsage(FLAGS_usage_data_file, &usage_entries, &conjugation_list);

   ostream *ofs = &cout;
   if (!FLAGS_output.empty()) {
     ofs = new OutputFileStream(FLAGS_output.c_str());
   }

   *ofs << "// This header file is generated by "
        << "gen_usage_rewriter_dictionary_main."
        << endl;

   // Output kConjugationNum
   *ofs << "static const int kConjugationNum = " <<
           conjugation_list.size() << ";" << endl;

   // Output kBaseConjugationSuffix
   *ofs << "static const ConjugationSuffix kBaseConjugationSuffix[] = {" << endl;
   for (size_t i = 0; i < conjugation_list.size(); ++i) {
     string value_suffix, key_suffix;
     Util::Escape(baseform_map[conjugation_list[i]].value_suffix, &value_suffix);
     Util::Escape(baseform_map[conjugation_list[i]].key_suffix, &key_suffix);
       *ofs << "  {\"" << value_suffix << "\", \"" << key_suffix << "\"},  "
         << "// " << conjugation_list[i] << endl;
   }
   *ofs << "};" << endl;

   // Output kConjugationSuffixData
   vector<int> conjugation_index(conjugation_list.size() + 1);
   *ofs << "static const ConjugationSuffix kConjugationSuffixData[] = {" << endl;
   int out_count = 0;
   for (size_t i = 0; i < conjugation_list.size(); ++i) {
     vector<ConjugationType> conjugations = inflection_map[conjugation_list[i]];
     conjugation_index[i] = out_count;
     if (conjugations.size() == 0) {
       *ofs << "  // " << i << ": (" << out_count << "-" << out_count
            << "): no conjugations" << endl;
       *ofs << "  {\"\",\"\"}," << endl;
       ++out_count;
     } else {
       typedef pair<string, string> StrPair;
       set<StrPair> key_and_value_suffix_set;
       for (size_t j = 0; j < conjugations.size(); ++j) {
         StrPair key_and_value_suffix(conjugations[j].value_suffix,
                                     conjugations[j].key_suffix);
         key_and_value_suffix_set.insert(key_and_value_suffix);
       }
       *ofs << "  // " << i << ": (" << out_count << "-"
            << (out_count + key_and_value_suffix_set.size()-1)
            << "): " << conjugation_list[i] << endl << " ";
       set<StrPair>::iterator itr;
       for (itr = key_and_value_suffix_set.begin();
           itr != key_and_value_suffix_set.end(); ++itr) {
         string value_suffix, key_suffix;
         Util::Escape(itr->first, &value_suffix);
         Util::Escape(itr->second, &key_suffix);
         *ofs << " {\"" << value_suffix <<
                 "\", \"" << key_suffix << "\"},";
         ++out_count;
       }
       *ofs << endl;
     }
   }
   *ofs << "};" << endl;
   conjugation_index[conjugation_list.size()] = out_count;

   // Output kConjugationSuffixDataIndex
   *ofs << "static const int kConjugationSuffixDataIndex[] = {";
   for (size_t i = 0; i < conjugation_index.size(); ++i) {
     if (i != 0) {
       *ofs << ", ";
     }
     *ofs << conjugation_index[i];
   }
   *ofs << "};" << endl;

   RemoveBaseformConjugationSuffix(baseform_map, &usage_entries);
   sort(usage_entries.begin(), usage_entries.end(), UsageItemKeynameCmp);

   // Output kUsageDataSize
   *ofs << "static const size_t kUsageDataSize = "
        << usage_entries.size() << ";" << endl;

   // Output kUsageData_value
   *ofs << "static const UsageDictItem kUsageData_value[] = {" << endl;
   int32 usage_id = 0;
   for (vector<UsageItem>::iterator i = usage_entries.begin();
       i != usage_entries.end(); i++) {
     string key, value, meaning;
     Util::Escape(i->key, &key);
     Util::Escape(i->value, &value);
     Util::Escape(i->meaning, &meaning);
     *ofs <<  "  {" << usage_id << ", \"" << key << "\", "
         << "\"" << value << "\", "
         << "" << i->conjugation_id << ", "
         << "\"" << meaning << "\"}, // "
         << i->value << "(" << i->key << ")" << endl;
     ++usage_id;
   }
   *ofs << "  { 0, NULL, NULL, 0, NULL }" << endl;
   *ofs << "};" << endl;

   if (ofs != &cout) {
     delete ofs;
   }
 }

 }  // namespace
 }  // namespace mozc

 // Program entry point: hands the command line to InitGoogle() and then runs
 // the generator. Always returns 0 when Convert() returns.
 int main(int argc, char **argv) {
   // NOTE(review): the trailing `true` presumably tells InitGoogle() to strip
   // the flags it parsed from argv -- confirm against its declaration.
   InitGoogle(argv[0], &argc, &argv, true);
   mozc::Convert();
   return 0;
 }
	// Copyright 2010-2014, Google Inc.
	// All rights reserved.
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions are
	// met:
	//
	// * Redistributions of source code must retain the above copyright
	// notice, this list of conditions and the following disclaimer.
	// * Redistributions in binary form must reproduce the above
	// copyright notice, this list of conditions and the following disclaimer
	// in the documentation and/or other materials provided with the
	// distribution.
	// * Neither the name of Google Inc. nor the names of its
	// contributors may be used to endorse or promote products derived from
	// this software without specific prior written permission.
	//
	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	// Usage dictionary generator:
	// % gen_usage_rewriter_dictionary_main
	// --usage_data_file=usage_data.txt
	// --cforms_file=cforms.def
	// --output=output_header

	#include <algorithm>
	#include <iostream>
	#include <map>
	#include <set>
	#include <string>
	#include <vector>

	#include "base/file_stream.h"
	#include "base/logging.h"
	#include "base/util.h"

	DEFINE_string(usage_data_file, "", "usage data file");
	DEFINE_string(cforms_file, "", "cforms file");
	DEFINE_string(output, "", "output header file");

	namespace mozc {
	namespace {
	// A single inflected form of a conjugation type, filled in from one row of
	// the cforms file by LoadConjugation().
	struct ConjugationType {
	// Form name, copied from the second field of the row.
	string form;
	// Third field of the row, with "*" replaced by an empty string.
	string value_suffix;
	// Fourth field of the row, with "*" replaced by an empty string.
	string key_suffix;
	};

	// A single usage entry, filled in from one line of the usage data file by
	// LoadUsage().
	struct UsageItem {
	// First field of the line, with "*" replaced by an empty string.
	string key;
	// Second field of the line, with "*" replaced by an empty string.
	string value;
	// Third field of the line, with "*" replaced by an empty string. Serves as
	// the lookup key into the maps produced by LoadConjugation().
	string conjugation;
	// Id given by LoadUsage() to each distinct |conjugation| value; ids are
	// handed out sequentially in order of first appearance.
	int conjugation_id;
	// Fourth field of the line, with backslash + 'n' sequences replaced by a
	// newline character through Util::StringReplace().
	string meaning;
	};

	bool UsageItemKeynameCmp(const UsageItem& l, const UsageItem& r) {
	return l.key < r.key;
	}

	// Load cforms_file
	//
	// Parses |filename| line by line. Lines that are empty or start with '#' are
	// skipped. Every other line is split on tabs and spaces into at least four
	// fields: conjugation type, form, value suffix, key suffix. A suffix of "*"
	// is stored as an empty string.
	//
	// Each parsed row is appended to (*output)[conjugation type]. Rows whose
	// form is the base form ("基本形") are also stored in |baseform_map| under the
	// conjugation type, overwriting an earlier base-form row of that type.
	//
	// CHECK-fails if the stream is not good after opening or if a row has fewer
	// than four fields.
	void LoadConjugation(const string &filename,
	                     map<string, vector<ConjugationType> > *output,
	                     map<string, ConjugationType> *baseform_map) {
	  InputFileStream ifs(filename.c_str());
	  CHECK(ifs.good());

	  string line;
	  vector<string> fields;
	  while (!getline(ifs, line).fail()) {
	    if (line.empty() || line[0] == '#') {
	      continue;
	    }
	    fields.clear();
	    Util::SplitStringUsing(line, "\t ", &fields);
	    CHECK_GE(fields.size(), 4) << "format error: " << line;

	    ConjugationType tmp;
	    tmp.form = fields[1];
	    tmp.value_suffix = ((fields[2] == "*") ? "" : fields[2]);
	    tmp.key_suffix = ((fields[3] == "*") ? "" : fields[3]);
	    (*output)[fields[0]].push_back(tmp);  // insert

	    // The literal is the UTF-8 encoding of "基本形" (base form).
	    if (tmp.form == "\xE5\x9F\xBA\xE6\x9C\xAC\xE5\xBD\xA2") {
	      (*baseform_map)[fields[0]] = tmp;
	    }
	  }
	}

	// Load usage_data_file
	//
	// Parses the tab-separated file |filename|. Lines that are empty or start
	// with '#' are skipped. Every other line must have at least four fields
	// (empty fields are preserved): key, value, conjugation, meaning. A field
	// equal to "*" becomes an empty string, and backslash + 'n' inside the
	// meaning is replaced with a newline via Util::StringReplace().
	//
	// The first time a conjugation value is seen it gets the next sequential id
	// (starting at 0) and is appended to |conjugation_list|. Every entry is
	// appended to |usage_entries| with its conjugation's id.
	//
	// If the stream is not good after opening, a warning is logged and the
	// outputs are left untouched. CHECK-fails on a line with too few fields.
	void LoadUsage(const string &filename,
	               vector<UsageItem> *usage_entries,
	               vector<string> *conjugation_list) {
	  InputFileStream ifs(filename.c_str());

	  if (!ifs.good()) {
	    LOG(WARNING) << "Can't open file:" << filename;
	    return;
	  }

	  string line;
	  vector<string> fields;
	  map<string, int> conjugation_id_map;

	  int conjugation_id = 0;
	  while (!getline(ifs, line).fail()) {
	    if (line.empty() || line[0] == '#') {
	      // starting with '#' is a comment line.
	      continue;
	    }
	    fields.clear();
	    Util::SplitStringAllowEmpty(line, "\t", &fields);
	    CHECK_GE(fields.size(), 4) << "format error: " << line;

	    UsageItem item;
	    item.key = ((fields[0] == "*") ? "" : fields[0]);
	    item.value = ((fields[1] == "*") ? "" : fields[1]);
	    item.conjugation = ((fields[2] == "*") ? "" : fields[2]);
	    string tmp = ((fields[3] == "*") ? "" : fields[3]);
	    Util::StringReplace(tmp, "\\n", "\n", true, &item.meaning);

	    map<string, int>::iterator it = conjugation_id_map.find(item.conjugation);
	    if (it == conjugation_id_map.end()) {
	      // First occurrence: register the name and hand out a fresh id.
	      conjugation_id_map.insert(
	          pair<string, int>(item.conjugation, conjugation_id));
	      item.conjugation_id = conjugation_id;
	      conjugation_list->push_back(item.conjugation);
	      ++conjugation_id;
	    } else {
	      item.conjugation_id = it->second;
	    }
	    usage_entries->push_back(item);
	  }
	}

	// remove "基本形"'s conjugation suffix
	void RemoveBaseformConjugationSuffix(
	const map<string, ConjugationType> &baseform_map,
	vector<UsageItem> *usage_entries) {
	for (vector<UsageItem>::iterator usage_itr = usage_entries->begin();
	usage_itr != usage_entries->end(); ++usage_itr) {
	const map<string, ConjugationType>::const_iterator baseform_itr =
	baseform_map.find(usage_itr->conjugation);
	if (baseform_itr == baseform_map.end()) {
	continue;
	}
	const ConjugationType &type = baseform_itr->second;

	if (usage_itr->key.length() <= type.key_suffix.length()) {
	LOG(WARNING) << "key:[" << usage_itr->key << "] is not longer then "
	<< "baseform.key_suffix of \"" << usage_itr->conjugation
	<< "\" : [" << type.key_suffix << "]";
	}
	if (usage_itr->value.length() <= type.value_suffix.length()) {
	LOG(WARNING) << "value:[" << usage_itr->value << "] is not longer then "
	<< "baseform.value_suffix of \"" << usage_itr->conjugation
	<< "\" : [" << type.value_suffix << "]";
	}

	usage_itr->key.erase(usage_itr->key.length() - type.key_suffix.length());
	usage_itr->value.erase(
	usage_itr->value.length() - type.value_suffix.length());
	}
	}

	void Convert() {
	// Load cforms_file
	map<string, vector<ConjugationType> > inflection_map;
	map<string, ConjugationType> baseform_map;
	LoadConjugation(FLAGS_cforms_file, &inflection_map, &baseform_map);

	// Load usage_data_file
	vector<UsageItem> usage_entries;
	vector<string> conjugation_list;
	LoadUsage(FLAGS_usage_data_file, &usage_entries, &conjugation_list);

	ostream *ofs = &cout;
	if (!FLAGS_output.empty()) {
	ofs = new OutputFileStream(FLAGS_output.c_str());
	}

	*ofs << "// This header file is generated by "
	<< "gen_usage_rewriter_dictionary_main."
	<< endl;

	// Output kConjugationNum
	*ofs << "static const int kConjugationNum = " <<
	conjugation_list.size() << ";" << endl;

	// Output kBaseConjugationSuffix
	*ofs << "static const ConjugationSuffix kBaseConjugationSuffix[] = {" << endl;
	for (size_t i = 0; i < conjugation_list.size(); ++i) {
	string value_suffix, key_suffix;
	Util::Escape(baseform_map[conjugation_list[i]].value_suffix, &value_suffix);
	Util::Escape(baseform_map[conjugation_list[i]].key_suffix, &key_suffix);
	*ofs << " {\"" << value_suffix << "\", \"" << key_suffix << "\"}, "
	<< "// " << conjugation_list[i] << endl;
	}
	*ofs << "};" << endl;

	// Output kConjugationSuffixData
	vector<int> conjugation_index(conjugation_list.size() + 1);
	*ofs << "static const ConjugationSuffix kConjugationSuffixData[] = {" << endl;
	int out_count = 0;
	for (size_t i = 0; i < conjugation_list.size(); ++i) {
	vector<ConjugationType> conjugations = inflection_map[conjugation_list[i]];
	conjugation_index[i] = out_count;
	if (conjugations.size() == 0) {
	*ofs << " // " << i << ": (" << out_count << "-" << out_count
	<< "): no conjugations" << endl;
	*ofs << " {\"\",\"\"}," << endl;
	++out_count;
	} else {
	typedef pair<string, string> StrPair;
	set<StrPair> key_and_value_suffix_set;
	for (size_t j = 0; j < conjugations.size(); ++j) {
	StrPair key_and_value_suffix(conjugations[j].value_suffix,
	conjugations[j].key_suffix);
	key_and_value_suffix_set.insert(key_and_value_suffix);
	}
	*ofs << " // " << i << ": (" << out_count << "-"
	<< (out_count + key_and_value_suffix_set.size()-1)
	<< "): " << conjugation_list[i] << endl << " ";
	set<StrPair>::iterator itr;
	for (itr = key_and_value_suffix_set.begin();
	itr != key_and_value_suffix_set.end(); ++itr) {
	string value_suffix, key_suffix;
	Util::Escape(itr->first, &value_suffix);
	Util::Escape(itr->second, &key_suffix);
	*ofs << " {\"" << value_suffix <<
	"\", \"" << key_suffix << "\"},";
	++out_count;
	}
	*ofs << endl;
	}
	}
	*ofs << "};" << endl;
	conjugation_index[conjugation_list.size()] = out_count;

	// Output kConjugationSuffixDataIndex
	*ofs << "static const int kConjugationSuffixDataIndex[] = {";
	for (size_t i = 0; i < conjugation_index.size(); ++i) {
	if (i != 0) {
	*ofs << ", ";
	}
	*ofs << conjugation_index[i];
	}
	*ofs << "};" << endl;

	RemoveBaseformConjugationSuffix(baseform_map, &usage_entries);
	sort(usage_entries.begin(), usage_entries.end(), UsageItemKeynameCmp);

	// Output kUsageDataSize
	*ofs << "static const size_t kUsageDataSize = "
	<< usage_entries.size() << ";" << endl;

	// Output kUsageData_value
	*ofs << "static const UsageDictItem kUsageData_value[] = {" << endl;
	int32 usage_id = 0;
	for (vector<UsageItem>::iterator i = usage_entries.begin();
	i != usage_entries.end(); i++) {
	string key, value, meaning;
	Util::Escape(i->key, &key);
	Util::Escape(i->value, &value);
	Util::Escape(i->meaning, &meaning);
	*ofs << " {" << usage_id << ", \"" << key << "\", "
	<< "\"" << value << "\", "
	<< "" << i->conjugation_id << ", "
	<< "\"" << meaning << "\"}, // "
	<< i->value << "(" << i->key << ")" << endl;
	++usage_id;
	}
	*ofs << " { 0, NULL, NULL, 0, NULL }" << endl;
	*ofs << "};" << endl;

	if (ofs != &cout) {
	delete ofs;
	}
	}

	} // namespace
	} // namespace mozc

	// Program entry point: passes the command line to InitGoogle(), runs
	// mozc::Convert(), and returns 0.
	int main(int argc, char **argv) {
	// NOTE(review): the final `true` argument presumably makes InitGoogle()
	// remove parsed flags from argv -- confirm against its declaration.
	InitGoogle(argv[0], &argc, &argv, true);
	mozc::Convert();
	return 0;
	}