blob: 24799152aec64eeac8a1e5f73ad10eac43b921b7 [file] [log] [blame]
# -*- coding: utf-8 -*-
# Copyright 2010-2015, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""Single kanji dictionary generator.
How to run this script:
gen_single_kanji_rewriter_data.py
--single_kanji_file=single_kanji.tsv
--variant_file=variant_rule.txt
--output=single_kanji_data.h
"""
__author__ = "hidehiko"
import optparse
from build_tools import code_generator_util
from rewriter import embedded_dictionary_compiler
# Noun prefix entries for the embedded "NounPrefixData" dictionary.
# Each entry is [key, value, rank].  GenNounPrefix() converts every entry
# into one embedded_dictionary_compiler.Token and groups the tokens by key,
# so several entries may share the same key (e.g. 'だい').
NOUN_PREFIX = [
    ['お', 'お', 1],
    ['ご', 'ご', 1],
    # Intentionally not registered:
    # ['ご', '誤'],  # 誤 isn't in the ipadic.
    # ['み', 'み'],  # seems to be rare.
    ['もと', 'もと', 1],
    ['だい', '代', 1],
    ['てい', '低', 0],
    ['もと', '元', 1],
    ['ぜん', '全', 0],
    ['さい', '再', 0],
    ['しょ', '初', 1],
    ['はつ', '初', 0],
    ['ぜん', '前', 1],
    ['かく', '各', 1],
    ['どう', '同', 1],
    ['だい', '大', 1],
    ['おお', '大', 1],
    ['とう', '当', 1],
    ['ご', '御', 1],
    ['お', '御', 1],
    ['しん', '新', 1],
    ['さい', '最', 1],
    ['み', '未', 0],
    ['ほん', '本', 1],
    ['む', '無', 0],
    ['だい', '第', 1],
    ['とう', '等', 1],
    ['やく', '約', 1],
    ['ひ', '被', 1],
    ['ちょう', '超', 1],
    ['ちょう', '長', 1],
    ['なが', '長', 1],
    ['ひ', '非', 1],
    ['こう', '高', 1]
]
def ReadSingleKanji(stream):
  """Parses single kanji dictionary data from stream.

  The stream is filtered through code_generator_util.SkipLineComment and
  then through code_generator_util.ParseColumnStream with num_column=2.

  Args:
    stream: The input to parse.  main() passes an open text file.

  Returns:
    A list of the parsed entries sorted by their first column (the key).
    Entries with equal keys keep their input order.
  """
  stream = code_generator_util.SkipLineComment(stream)
  stream = code_generator_util.ParseColumnStream(stream, num_column=2)
  # For binary search by |key|, sort outputs here.  A key function is used
  # instead of a cmp function: cmp-style sorting and the cmp() builtin do not
  # exist in Python 3, while key= works on both Python 2 and 3.  The sort is
  # stable either way, so the resulting order is unchanged.
  return sorted(stream, key=lambda entry: entry[0])
def ReadVariant(stream):
  """Parses variant data from stream.

  The stream is filtered through code_generator_util.SkipLineComment and
  code_generator_util.ParseColumnStream, then interpreted row by row:

  * A row with one token declares a new variant type.
  * A row with two tokens is a (target, original) pair that belongs to the
    most recently declared variant type.  Such rows are ignored while no
    variant type has been declared yet.
  * Rows with any other number of tokens are ignored.

  Args:
    stream: The input to parse.  main() passes an open text file.

  Returns:
    A (variant_types, variant_items) tuple.  variant_types is the list of
    type names in declaration order.  variant_items is a list of
    [target, original, type_index] lists sorted by target, where type_index
    is the index into variant_types.
  """
  variant_types = []
  variant_items = []
  stream = code_generator_util.SkipLineComment(stream)
  stream = code_generator_util.ParseColumnStream(stream)
  for tokens in stream:
    if len(tokens) == 1:
      variant_types.append(tokens[0])
    elif len(tokens) == 2 and variant_types:
      (target, original) = tokens
      # The item refers to the variant type declared most recently.
      variant_items.append([target, original, len(variant_types) - 1])
  # For binary search by |target|, sort variant items here.  A key function
  # is used instead of a cmp function: cmp-style sorting and the cmp() builtin
  # do not exist in Python 3, while key= works on both Python 2 and 3.  The
  # sort is stable either way, so the resulting order is unchanged.
  variant_items.sort(key=lambda item: item[0])
  return (variant_types, variant_items)
def GenNounPrefix():
  """Builds the token map for the noun prefix embedded dictionary.

  Returns:
    A dict that maps each key found in NOUN_PREFIX to the list of
    embedded_dictionary_compiler.Token objects created for that key, in
    NOUN_PREFIX order.
  """
  tokens_by_key = {}
  for prefix in NOUN_PREFIX:
    # Falsy (e.g. empty) keys and values are normalized to None.
    key = prefix[0] or None
    value = prefix[1] or None
    if key not in tokens_by_key:
      tokens_by_key[key] = []
    token = embedded_dictionary_compiler.Token(
        key, value, None, None, 0, 0, prefix[2])
    tokens_by_key[key].append(token)
  return tokens_by_key
def WriteSingleKanji(outputs, stream):
  """Writes the kSingleKanjis C++ array to stream.

  Args:
    outputs: An iterable of (key, values) pairs, such as the list returned
        by ReadSingleKanji().
    stream: A writable object; only its write() method is used.
  """
  emit = stream.write
  emit('static const SingleKanjiList kSingleKanjis[] = {\n')
  for key_text, values_text in outputs:
    # A plain-text comment line precedes each escaped initializer row.
    emit(' // %s, %s\n' % (key_text, values_text))
    escaped_row = code_generator_util.FormatWithCppEscape(
        ' { %s, %s },\n', key_text, values_text)
    emit(escaped_row)
  emit('};\n')
def WriteVariantInfo(variant_info, stream):
  """Writes the kKanjiVariantTypes and kKanjiVariants C++ arrays to stream.

  Args:
    variant_info: A (variant_types, variant_items) pair, such as the tuple
        returned by ReadVariant().  Each item is a
        (target, original, type_index) triple.
    stream: A writable object; only its write() method is used.
  """
  type_names, items = variant_info
  emit = stream.write

  # Table of variant type names.  Each escaped initializer is followed by a
  # plain-text comment on the same output line.
  emit('static const char *kKanjiVariantTypes[] = {\n')
  for type_name in type_names:
    escaped_name = code_generator_util.FormatWithCppEscape(' %s,', type_name)
    emit(escaped_name)
    emit(' // %s\n' % type_name)
  emit('};\n')

  # Table of variant items, laid out the same way.
  emit('static const KanjiVariantItem kKanjiVariants[] = {\n')
  for target, original, type_index in items:
    escaped_item = code_generator_util.FormatWithCppEscape(
        ' { %s, %s, %d },', target, original, type_index)
    emit(escaped_item)
    emit(' // %s, %s, %d\n' % (target, original, type_index))
  emit('};\n')
def _ParseOptions():
  """Parses and validates the command line flags of this script.

  The flags are read from sys.argv.  --single_kanji_file, --variant_file and
  --output are all required.

  Returns:
    The optparse.Values object holding the single_kanji_file, variant_file
    and output attributes.

  Raises:
    SystemExit: If a required flag is missing or empty.  optparse prints a
        usage message and exits with status 2.
  """
  parser = optparse.OptionParser()
  parser.add_option('--single_kanji_file', dest='single_kanji_file',
                    help='Single kanji file')
  parser.add_option('--variant_file', dest='variant_file',
                    help='Variant rule file')
  parser.add_option('--output', dest='output', help='Output header file.')
  options = parser.parse_args()[0]
  # main() opens every one of these paths unconditionally, so report a
  # missing flag here with a usage message instead of letting open(None)
  # fail later with an obscure error.
  for flag in ('single_kanji_file', 'variant_file', 'output'):
    if not getattr(options, flag):
      parser.error('--%s is required.' % flag)
  return options
def main():
  """Generates the single kanji data header.

  Reads the single kanji file and the variant rule file named on the command
  line, then writes the single kanji table, the variant tables and the
  compiled 'NounPrefixData' embedded dictionary to the output file.
  """
  opts = _ParseOptions()

  # Each input is read and closed before the next file is opened.
  with open(opts.single_kanji_file, 'r') as kanji_in:
    kanji_entries = ReadSingleKanji(kanji_in)

  with open(opts.variant_file, 'r') as variant_in:
    variants = ReadVariant(variant_in)

  prefix_tokens = GenNounPrefix()

  with open(opts.output, 'w') as header_out:
    WriteSingleKanji(kanji_entries, header_out)
    WriteVariantInfo(variants, header_out)
    embedded_dictionary_compiler.Compile(
        'NounPrefixData', prefix_tokens, header_out)
# Run the generator only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
  main()