blob: 2bc927e28bcc8cb5f84aad1ee3a1cea0167ce936 [file] [log] [blame]
# -*- coding: utf-8 -*-
# Copyright 2010-2015, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""A tool to generate POS matcher."""
__author__ = "taku"
import optparse
import re
import sys
from dictionary import pos_util
def OutputPosMatcherData(pos_matcher, output):
"""Generates the data used by POSMatcher.
Two data arrays are generated:
1) const uint16 kRuleIdTable[]
This contains POS ID for each rule in pos_matcher_rule.def. The data is
used by the method Get<RuleName>() generated by this script. Each array
index corresponds to one rule name in its declared order. Namely, if
pos_matcher_rule.def contain three rules, say
Rule0 Regexp0
Rule1 Regexp1
Rule2 Regexp2
Then kRuleIdTable[0] contains the result of GetRule0(), etc.
2) const POSMatcher::Range kRangeTable[]
Each element is a pointer to another array containing ranges of POS IDs
whose union is the set of all POS IDs that match the regexp. Each array
of ranges ends with the endmark { 0xFFFF, 0xFFFF }.
Generated data can be passed to POSMatcher, which is also generated by
this script, to get a POSMatcher corresponding to given data set.
"""
# Generate kRuleIdTable[].
output.write('const uint16 kRuleIdTable[] = {\n')
for rule_name in pos_matcher.GetRuleNameList():
output.write(
' %(id)4d, // %(rule_name)s "%(original_pattern)s"\n'
% { 'id': pos_matcher.GetId(rule_name),
'rule_name': rule_name,
'original_pattern': pos_matcher.GetOriginalPattern(rule_name) })
output.write(' static_cast<uint16>(0xFFFF),\n')
output.write('};\n')
# Generate arrays of ranges each of which will be an element of kRangeTable[].
output.write('namespace {\n')
for rule_name in pos_matcher.GetRuleNameList():
output.write(
'// %(rule_name)s "%(original_pattern)s"\n'
'const ::mozc::POSMatcher::Range kRangeTable_%(rule_name)s[] = {\n'
% { 'rule_name': rule_name,
'original_pattern': pos_matcher.GetOriginalPattern(rule_name) })
for id_range in pos_matcher.GetRange(rule_name):
output.write(' { %4d, %4d },\n' % id_range)
# End mark for this array of ranges.
output.write(
' { static_cast<uint16>(0xFFFF), static_cast<uint16>(0xFFFF) },\n'
'};\n')
output.write('} // namespace\n')
# Generate kRangeTable[].
output.write(
'const ::mozc::POSMatcher::Range *const kRangeTables[%d] = {\n'
% (len(pos_matcher.GetRuleNameList()) + 1))
for rule_name in pos_matcher.GetRuleNameList():
output.write(' kRangeTable_%s,\n' % rule_name)
output.write(' NULL,\n')
output.write('};\n')
def OutputPosMatcherHeader(pos_matcher, output):
"""Generates the definition of POSMatcher class.
POSMatcher is independent of the actual input data but just provides logic
for POS matching. To use a generated class, it's required to pass two arrays,
kRuleIdTable[] and kRangeTables[], to the constructor of POSMatcher.
"""
output.write(
'#ifndef MOZC_DICTIONARY_POS_MATCHER_H_\n'
'#define MOZC_DICTIONARY_POS_MATCHER_H_\n'
'#include "./base/port.h"\n'
'namespace mozc {\n'
'class POSMatcher {\n'
' public:\n'
' struct Range {\n'
' uint16 lower;\n'
' uint16 upper;\n'
' };\n')
# Helper function to generate Get<RuleName>Id() method from rule name and its
# corresponding index.
def _GenerateGetMethod(rule_name, index):
return (' inline uint16 Get%(rule_name)sId() const {\n'
' return rule_id_table_[%(index)d];\n'
' }' % { 'rule_name': rule_name, 'index': index })
# Helper function to generate Is<RuleName>(uint16 id) method from rule name
# and its corresponding index. The generated function checks if the given id
# belongs to some range in kRangeTable[index] = kRangeTable_RuleName[].
def _GenerateIsMethod(rule_name, index):
return (' inline bool Is%(rule_name)s(uint16 id) const {\n'
' for (const Range *range = range_table_[%(index)d];\n'
' range->lower != static_cast<uint16>(0xFFFF); ++range) {\n'
' if (id >= range->lower && id <= range->upper) {\n'
' return true;\n'
' }\n'
' }\n'
' return false;\n'
' }' % { 'rule_name': rule_name, 'index': index })
# Generate Get<RuleName>Id() and Is<RuleName>(uint16 id) for each rule.
for i, rule_name in enumerate(pos_matcher.GetRuleNameList()):
output.write(
' // %(rule_name)s "%(original_pattern)s"\n'
'%(get_method)s\n'
'%(is_method)s\n' % {
'rule_name': rule_name,
'original_pattern': pos_matcher.GetOriginalPattern(rule_name),
'get_method': _GenerateGetMethod(rule_name, i),
'is_method': _GenerateIsMethod(rule_name, i) })
# Constructor takes two pointers to arrays generated by OutputPosMatcherData()
# function.
output.write(
' public:\n'
' POSMatcher(const uint16 *const rule_id_table,\n'
' const Range *const *const range_table)\n'
' : rule_id_table_(rule_id_table),\n'
' range_table_(range_table) {}\n'
' private:\n'
' const uint16 *const rule_id_table_;\n'
' const Range *const *const range_table_;\n'
'};\n'
'} // namespace mozc\n'
'#endif // MOZC_DICTIONARY_POS_MATCHER_H_\n')
def ParseOptions():
parser = optparse.OptionParser()
parser.add_option('--id_file', dest='id_file', help='Path to id.def')
parser.add_option('--special_pos_file', dest='special_pos_file',
help='Path to special_pos.def')
parser.add_option('--pos_matcher_rule_file', dest='pos_matcher_rule_file',
help='Path to pos_matcher_rule.def')
parser.add_option('--output_pos_matcher_data',
dest='output_pos_matcher_data',
default='',
help='Path to the output header file of pos matcher data.')
parser.add_option('--output_pos_matcher_h',
dest='output_pos_matcher_h',
default='',
help='Path to the output header file of POSMatcher.')
return parser.parse_args()[0]
def main():
options = ParseOptions()
if options.output_pos_matcher_h:
# To generate a header file of POSMatcher, you don't need to specify
# --id_file and --special_pos_file because empty POS database sufficies.
pos_database = pos_util.PosDataBase()
pos_matcher = pos_util.PosMatcher(pos_database)
pos_matcher.Parse(options.pos_matcher_rule_file)
with open(options.output_pos_matcher_h, 'w') as stream:
OutputPosMatcherHeader(pos_matcher, stream)
if options.output_pos_matcher_data:
pos_database = pos_util.PosDataBase()
pos_database.Parse(options.id_file, options.special_pos_file)
pos_matcher = pos_util.PosMatcher(pos_database)
pos_matcher.Parse(options.pos_matcher_rule_file)
with open(options.output_pos_matcher_data, 'w') as stream:
OutputPosMatcherData(pos_matcher, stream)
if __name__ == "__main__":
main()