blob: 8c32af7aa68155b0cfbeea46061a35195ecade42 [file] [log] [blame]
# -*- coding: utf-8 -*-
# Copyright 2010-2015, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""Generate symbol data file (.java) from .tsv file.
In this file, "symbol" means both "emoticon (e.g. \(^o^)/)"
and "symbol character (e.g. ♪)".
The generated .java file is used by the Android version.
The first line of the .tsv file is ignored because it holds the column labels.
"""
__author__ = "matsuzakit"
import io
import optparse
import sys
import unicodedata
# Java package in which the generated class is declared.
PACKAGE_NAME = 'org.mozc.android.inputmethod.japanese'
# Skeleton of the generated .java file.  WriteOut fills the three '%s'
# placeholders, in order, with the package name, the class name and the
# class body (the String[] declarations joined by newlines).
TEMPLATE_CLASS = """package %s;
public class %s {
%s
}
"""
def ParseSymbolFile(file_name, value_column, category_column,
                    expand_variant_tags):
  """Parses a symbol .tsv file and returns a tag -> symbols dictionary.

  The first line of the file is a label row and is skipped.  Lines that are
  too short to contain both columns, or whose value column is empty, are
  ignored.

  Args:
    file_name: Path of the UTF-8 encoded, tab separated input file.
    value_column: 0-origin index of the column holding the symbol.
    category_column: 0-origin index of the column holding the tags of the
      symbol, separated by single spaces.
    expand_variant_tags: Container of tag names.  For these tags the NFKC
      normalized form of a symbol is appended right after the symbol itself
      whenever the two differ.

  Returns:
    A dict mapping each tag to the list of its symbols in file order.
  """
  tag2symbol = {}
  # The context manager guarantees the file is closed, even on errors.
  with io.open(file_name, encoding='utf-8') as symbol_file:
    # Skip the first line, which is used as label.
    next(symbol_file, None)
    for line in symbol_file:
      # Strip only the line terminator.  A bare rstrip() would also remove
      # symbols that consist of whitespace (e.g. U+3000) when the value
      # column is the last one on the line.
      line_parts = line.rstrip(u'\r\n').split(u'\t')
      if len(line_parts) <= max(value_column, category_column):
        continue
      symbol = line_parts[value_column]
      if not symbol:
        continue
      normalized = unicodedata.normalize('NFKC', symbol)
      for tag in line_parts[category_column].split(u' '):
        if not tag:
          # Consecutive spaces produce empty tags; ignore them.
          continue
        symbols = tag2symbol.setdefault(tag, [])
        symbols.append(symbol)
        if tag in expand_variant_tags and symbol != normalized:
          symbols.append(normalized)
  return tag2symbol
def GetStringArrayOfSymbols(tag_name, original_symbols, ordering_rule_list):
  """Generates the Java declaration of a String[] holding the symbols.

  Args:
    tag_name: Tag name; the generated array is named '<tag_name>_VALUES'.
    original_symbols: List of symbol strings belonging to the tag.
    ordering_rule_list: List of symbols defining the preferred order.  When
      it is empty the symbols are emitted in their original order.
      Otherwise the symbols are sorted by these rules:
        1. Symbols on the ordering rule list come first.
        2. Symbols that are both on the list are ordered by their (first)
           position on the list.
        3. The remaining symbols are ordered by character code.

  Returns:
    The Java source text of the array declaration, lines joined by '\\n'.
  """
  lines = [' public static final String[] %s_VALUES = new String[]{'
           % tag_name]
  if ordering_rule_list:
    # Precompute the first position of every rule symbol once, instead of
    # scanning the list on every comparison.
    rank = {}
    for position, rule_symbol in enumerate(ordering_rule_list):
      rank.setdefault(rule_symbol, position)

    def SortKey(symbol):
      """Returns a key that sorts listed symbols first, then by code."""
      if symbol in rank:
        return (0, rank[symbol], symbol)
      return (1, 0, symbol)

    symbols = sorted(original_symbols, key=SortKey)
  else:
    symbols = original_symbols

  _ESCAPE = (u'"', u'\\')

  def EscapeChar(char):
    """Returns the Java unicode escape sequence(s) for one character."""
    code_point = ord(char)
    if code_point > 0xFFFF:
      # Java's \uXXXX takes exactly four hex digits, so characters outside
      # the BMP must be written as a UTF-16 surrogate pair.
      offset = code_point - 0x10000
      return '\\u%04x\\u%04x' % (0xD800 + (offset >> 10),
                                 0xDC00 + (offset & 0x3FF))
    # The prefix is spelled '\\u005c' so that it is always the six
    # characters of a Java unicode escape, regardless of how the running
    # Python version interprets '\u' inside a plain string literal.
    prefix = '\\u005c' if char in _ESCAPE else ''
    return '%s\\u%04x' % (prefix, code_point)

  for symbol in symbols:
    # Escape characters (defined above) have to be escaped by backslashes.
    # e.g.
    #   input  : \(・∀・")
    #   output : \u005c\u005c\u0028\u30fb\u2200\u30fb\u005c\u0022\u0029
    # Note that the results of unescaping '\u####' by the Java compiler are
    # treated the same as raw characters.  Thus:
    #   - '\u005c' is treated as '\' and the next character is escaped by it.
    #   - '\u0022' is treated as '"' and terminates the string literal.
    # That is not what we want, so a '\' (written as '\u005c') is placed
    # before such characters in order to escape them.
    escaped = ''.join(EscapeChar(char) for char in symbol)
    # The white space is quick fix for the backslash at the tail of symbol.
    lines.append(' "%s", // %s ' % (escaped, symbol))
  lines.append(' };')
  return '\n'.join(lines)
def WriteOut(output, tag2symbol, class_name, ordering_rule_list):
  """Writes the generated Java class to a file.

  Args:
    output: Path of the .java file to write (UTF-8 encoded).
    tag2symbol: Dict mapping each tag to the list of its symbols.
    class_name: Name of the generated Java class.
    ordering_rule_list: Ordering rules passed to GetStringArrayOfSymbols.
  """
  # Emit the arrays in sorted tag order so that the generated file does not
  # depend on dictionary iteration order.
  body = [GetStringArrayOfSymbols(tag, tag2symbol[tag], ordering_rule_list)
          for tag in sorted(tag2symbol)]
  # Joining with a unicode separator keeps the content a text string even
  # when there are no tags at all, which io's text mode requires.
  content = TEMPLATE_CLASS % (PACKAGE_NAME, class_name, u'\n'.join(body))
  with io.open(output, 'w', encoding='utf-8') as out_file:
    out_file.write(content)
def ParseOption():
  """Parses the command line options.

  Returns:
    The optparse values object with the attributes input, output,
    class_name, ordering_rule, value_column, category_column (None when not
    given) and expand_variant_tags (a list, empty when not given).
  """
  parser = optparse.OptionParser()
  parser.add_option('--input', dest='input', help='Input file (.tsv)')
  parser.add_option('--output', dest='output', help='Output file (.java)')
  parser.add_option('--class_name', dest='class_name',
                    help='Generated class\'s name')
  parser.add_option('--ordering_rule', dest='ordering_rule',
                    help='(optional) Ordering rule file')
  parser.add_option('--value_column', dest='value_column', type='int',
                    help='Column number of the values (0-origin)')
  parser.add_option('--category_column', dest='category_column', type='int',
                    help='Column number of the categories (0-origin)')
  # Note the trailing space in the first literal: adjacent string literals
  # are concatenated as-is.
  parser.add_option('--expand_variant_tag',
                    action='append', default=[], dest='expand_variant_tags',
                    help='(optional, repeatedly definable) Category name '
                    'of which values\' variants are expanded')
  # Positional arguments are not used; return only the option values.
  return parser.parse_args()[0]
def CreateOrderingRuleList(file_name):
  """Reads an ordering rule file into a list of symbols.

  Lines starting with '# ' and empty lines are skipped.  Every other line,
  with its line terminator removed, becomes one entry.

  Args:
    file_name: Path of the UTF-8 encoded ordering rule file.

  Returns:
    The list of rule entries in file order.
  """
  # The context manager guarantees the file is closed, even on errors.
  with io.open(file_name, encoding='utf-8') as rule_file:
    # Do not forget such line of which content is ' '.
    # Such line has to be appended into the list, so only the line
    # terminator is stripped, never other whitespace.
    return [line.rstrip(u'\r\n') for line in rule_file
            if not line.startswith((u'# ', u'\n'))]
def main():
  """Command line entry point: converts the .tsv input into a .java file.

  Prints a message and exits with status 1 when a mandatory option
  (--input, --output, --class_name, --value_column, --category_column) is
  missing.
  """
  options = ParseOption()
  # The column options are compared against None because 0 is a valid index.
  if not (options.input and options.output and options.class_name and
          options.value_column is not None and
          options.category_column is not None):
    # A single parenthesized string prints identically with the Python 2
    # print statement and the Python 3 print function.
    print('Some options cannot be omitted. See --help.')
    sys.exit(1)
  tag2symbol = ParseSymbolFile(options.input,
                               options.value_column,
                               options.category_column,
                               options.expand_variant_tags)
  if options.ordering_rule is not None:
    ordering_rule_list = CreateOrderingRuleList(options.ordering_rule)
  else:
    # Without a rule file the symbols keep their original order.
    ordering_rule_list = []
  WriteOut(options.output, tag2symbol, options.class_name, ordering_rule_list)
# Run the generator only when this file is executed as a script.
if __name__ == '__main__':
  main()