| // Copyright 2010-2015, Google Inc. |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| // Single Kanji dictionary generator: |
| // % gen_symbol_ewriter_dictionary_main |
| // --sorting_table=sorting_table_file |
| // --ordering_rule=ordering_rule_file |
| // --input=input.tsv --output=output_header |
| |
| #include <climits> |
| #include <map> |
| #include <vector> |
| #include <set> |
| #include <string> |
| #include <algorithm> |
| |
| #include "base/file_stream.h" |
| #include "base/file_util.h" |
| #include "base/logging.h" |
| #include "base/util.h" |
| #include "rewriter/dictionary_generator.h" |
| #include "rewriter/embedded_dictionary.h" |
| |
| DEFINE_string(sorting_table, "", "sorting table file"); |
| DEFINE_string(ordering_rule, "", "sorting order file"); |
| DEFINE_string(input, "", "symbol dictionary file"); |
| DEFINE_string(output, "", "output header file"); |
| |
| namespace mozc { |
| namespace { |
| |
| void GetSortingMap(const string &auto_file, |
| const string &rule_file, |
| map<string, uint16> *sorting_map) { |
| CHECK(sorting_map); |
| sorting_map->clear(); |
| string line; |
| int sorting_key = 0; |
| InputFileStream rule_ifs(rule_file.c_str()); |
| CHECK(rule_ifs.good()); |
| while (!getline(rule_ifs, line).fail()) { |
| if (line.empty() || line[0] == '#') { |
| continue; |
| } |
| sorting_map->insert(make_pair(line, sorting_key)); |
| ++sorting_key; |
| } |
| |
| InputFileStream auto_ifs(auto_file.c_str()); |
| CHECK(auto_ifs.good()); |
| |
| while (!getline(auto_ifs, line).fail()) { |
| if (line.empty() || line[0] == '#') { |
| continue; |
| } |
| vector<string> fields; |
| Util::SplitStringUsing(line, "\t ", &fields); |
| CHECK_GE(fields.size(), 2); |
| const char32 ucs4 = strtol(fields[1].c_str(), NULL, 16); |
| string utf8; |
| Util::UCS4ToUTF8(ucs4, &utf8); |
| if (sorting_map->find(utf8) != sorting_map->end()) { |
| // ordered by rule |
| continue; |
| } |
| sorting_map->insert(make_pair(utf8, sorting_key)); |
| ++sorting_key; |
| } |
| } |
| |
| void AddSymbolToDictionary(const string &pos, |
| const string &value, |
| const vector<string> &keys, |
| const string &description, |
| const string &additional_description, |
| const map<string, uint16> &sorting_map, |
| rewriter::DictionaryGenerator *dictionary) { |
| // use first char of value as sorting key. |
| const string first_value = Util::SubString(value, 0, 1); |
| map<string, uint16>::const_iterator itr = sorting_map.find(first_value); |
| uint16 sorting_key = 0; |
| if (itr == sorting_map.end()) { |
| DLOG(WARNING) << first_value << " is not defined in sorting map."; |
| // If the character is platform-dependent, put the character at the last. |
| const Util::CharacterSet cset = Util::GetCharacterSet(value); |
| if (cset >= Util::JISX0212) { |
| sorting_key = USHRT_MAX; |
| } |
| } else { |
| sorting_key = itr->second; |
| } |
| |
| for (size_t i = 0; i < keys.size(); ++i) { |
| const string &key = keys[i]; |
| |
| rewriter::Token token; |
| token.set_sorting_key(sorting_key); |
| token.set_key(key); |
| token.set_value(value); |
| token.set_pos(pos); |
| token.set_description(description); |
| token.set_additional_description(additional_description); |
| dictionary->AddToken(token); |
| |
| string fw_key; |
| Util::HalfWidthAsciiToFullWidthAscii(key, &fw_key); |
| if (fw_key != key) { |
| token.set_key(fw_key); |
| dictionary->AddToken(token); |
| } |
| } |
| } |
| |
| // Read dic: |
| void MakeDictionary(const string &symbol_dictionary_file, |
| const string &sorting_map_file, |
| const string &ordering_rule_file, |
| rewriter::DictionaryGenerator *dictionary) { |
| set<string> seen; |
| map<string, uint16> sorting_map; |
| GetSortingMap(sorting_map_file, ordering_rule_file, &sorting_map); |
| |
| InputFileStream ifs(symbol_dictionary_file.c_str()); |
| CHECK(ifs.good()); |
| |
| string line; |
| CHECK(!getline(ifs, line).fail()); // get first line |
| |
| vector<string> fields; |
| while (!getline(ifs, line).fail()) { |
| fields.clear(); |
| // Format: |
| // POS <tab> value <tab> readings(space delimitered) <tab> |
| // description <tab> memo |
| Util::SplitStringAllowEmpty(line, "\t", &fields); |
| if (fields.size() < 3 || |
| (fields[1].empty() && fields[2].empty())) { |
| VLOG(3) << "invalid format. skip line: " << line; |
| continue; |
| } |
| string pos = fields[0]; |
| Util::UpperString(&pos); |
| const string &value = fields[1]; |
| if (seen.find(value) != seen.end()) { |
| LOG(WARNING) << "already inserted: " << value; |
| continue; |
| } else { |
| seen.insert(value); |
| } |
| string keys_str; |
| // \xE3\x80\x80 is full width space |
| Util::StringReplace(fields[2], "\xE3\x80\x80", " ", true, &keys_str); |
| vector<string> keys; |
| Util::SplitStringUsing(keys_str, " ", &keys); |
| const string &description = (fields.size()) > 3 ? fields[3] : ""; |
| const string &additional_description = (fields.size()) > 4 ? fields[4] : ""; |
| AddSymbolToDictionary(pos, value, keys, description, |
| additional_description, |
| sorting_map, dictionary); |
| } |
| // Add space as a symbol |
| vector<string> keys_space; |
| keys_space.push_back(" "); |
| // "記号", "空白" |
| AddSymbolToDictionary("\xe8\xa8\x98\xe5\x8f\xb7", " ", keys_space, |
| "\xe7\xa9\xba\xe7\x99\xbd", "", sorting_map, |
| dictionary); |
| } |
| } // namespace |
| } // namespace mozc |
| |
| int main(int argc, char **argv) { |
| InitGoogle(argv[0], &argc, &argv, true); |
| |
| if ((FLAGS_input.empty() || |
| FLAGS_sorting_table.empty() || |
| FLAGS_ordering_rule.empty()) && argc > 3) { |
| FLAGS_input = argv[1]; |
| FLAGS_sorting_table = argv[2]; |
| FLAGS_ordering_rule = argv[3]; |
| } |
| |
| const string tmp_text_file = FLAGS_output + ".txt"; |
| static const char kHeaderName[] = "SymbolData"; |
| |
| mozc::rewriter::DictionaryGenerator dictionary; |
| mozc::MakeDictionary(FLAGS_input, |
| FLAGS_sorting_table, |
| FLAGS_ordering_rule, |
| &dictionary); |
| dictionary.Output(tmp_text_file); |
| mozc::EmbeddedDictionary::Compile(kHeaderName, |
| tmp_text_file, |
| FLAGS_output); |
| mozc::FileUtil::Unlink(tmp_text_file); |
| |
| return 0; |
| } |