blob: 1e93043691d1a3aab068d9b1145d14bb97938f06 [file] [log] [blame]
# -*- coding: utf-8 -*-
# Copyright 2010-2015, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
__author__ = "taku"
import itertools
import optparse
import re
import string
import sys
# Maps a category name to the 2-bit value stored for a code point in the
# generated bitmap. Only JISX0208, JISX0212 and JISX0213 are encoded this way.
# Any category missing from this table (i.e. ASCII, CP932, JISX0201 and
# UNICODE_ONLY) is stored as the bit pattern '00' and is distinguished by
# separate heuristic checks in the generated code instead.
CATEGORY_BITMAP = {
'JISX0208': 1,
'JISX0212': 2,
'JISX0213': 3,
}
def IsValidUCS2(n):
  """Tells whether n lies within the UCS2 range [0, 0xFFFF]."""
  ucs2_max = 0xFFFF
  return (0 <= n) and (n <= ucs2_max)
def IsValidUCS4(n):
  """Tells whether n lies within the UCS4 range [0, 0x7FFFFFFF]."""
  ucs4_max = 0x7FFFFFFF
  return (0 <= n) and (n <= ucs4_max)
class CodePointCategorizer(object):
  """Categorizer of ucs4 code points.

  Loads one set of code points per character-set mapping file and answers
  which category a given code point belongs to. When a code point appears in
  more than one table, the first match in the priority-ordered table list
  wins.
  """

  # Matches a mapping column of the form '0xXXXX' and captures the four
  # upper-case hex digits.
  _UCS2_PATTERN = re.compile(r'^0x([0-9A-F]{4})')
  # UCS4 pattern supports only JIS X 0213.
  # Note that some JIS X 0213 characters are described as 'U+xxxx+xxxx',
  # and this pattern ignores the latter '+xxxx' part intentionally.
  _UCS4_PATTERN = re.compile(r'^U\+([0-9A-F]+)')

  def __init__(self, cp932file, jisx0201file, jisx0208file,
               jisx0212file, jisx0213file):
    """Loads every code point table.

    Args:
      cp932file: path of the CP932 mapping file.
      jisx0201file: path of the JIS X 0201 mapping file.
      jisx0208file: path of the JIS X 0208 mapping file.
      jisx0212file: path of the JIS X 0212 mapping file.
      jisx0213file: path of the JIS X 0213 mapping file.
    """
    self._cp932 = CodePointCategorizer._LoadCP932(cp932file)
    self._jisx0201 = CodePointCategorizer._LoadJISX0201(jisx0201file)
    self._jisx0208 = CodePointCategorizer._LoadJISX0208(jisx0208file)
    self._jisx0212 = CodePointCategorizer._LoadJISX0212(jisx0212file)
    self._jisx0213 = CodePointCategorizer._LoadJISX0213(jisx0213file)
    self._exceptions = CodePointCategorizer._LoadExceptions()

    # Make a list of code point tables in the priority order.
    self._table_list = [
        ('JISX0208', self._exceptions),  # Vendor-specific code.
        ('JISX0201', self._jisx0201),
        ('JISX0208', self._jisx0208),
        ('JISX0212', self._jisx0212),
        ('JISX0213', self._jisx0213),
        ('CP932', self._cp932)]

  @staticmethod
  def _LoadTable(filename, column_index, pattern, validater):
    """Reads a whitespace-separated mapping file into a set of code points.

    Lines starting with '#' are skipped, as are lines which have no column at
    column_index (e.g. blank lines).

    Args:
      filename: path of the mapping file.
      column_index: 0-based index of the column holding the code point.
      pattern: compiled regex whose first group captures the hex digits.
      validater: predicate called with the parsed integer; the code point is
        kept only if it returns a true value.

    Returns:
      A set of integer code points.
    """
    result = set()
    # The 'with' block guarantees that the file is closed even on errors.
    with open(filename) as table_file:
      for line in table_file:
        if line.startswith('#'):
          # Skip a comment line.
          continue
        columns = line.split()
        if len(columns) <= column_index:
          # Blank or short line: there is no mapping column to parse.
          continue
        match = pattern.search(columns[column_index])
        if match:
          ucs = int(match.group(1), 16)
          if validater(ucs):
            result.add(ucs)
    return result

  @staticmethod
  def _LoadCP932(filename):
    """Loads the CP932 table; the code point is taken from column 1."""
    return CodePointCategorizer._LoadTable(
        filename, 1, CodePointCategorizer._UCS2_PATTERN, IsValidUCS2)

  @staticmethod
  def _LoadJISX0201(filename):
    """Loads the JIS X 0201 table; the code point is taken from column 1."""
    return CodePointCategorizer._LoadTable(
        filename, 1, CodePointCategorizer._UCS2_PATTERN, IsValidUCS2)

  @staticmethod
  def _LoadJISX0208(filename):
    """Loads the JIS X 0208 table; the code point is taken from column 2."""
    result = CodePointCategorizer._LoadTable(
        filename, 2, CodePointCategorizer._UCS2_PATTERN, IsValidUCS2)
    result.update([
        0xFF3C,  # (FULLWIDTH REVERSE SOLIDUS) should be in JISX0208
        0xFF0D,  # (FULLWIDTH HYPHEN MINUS) should be in JISX0208
    ])
    return result

  @staticmethod
  def _LoadJISX0212(filename):
    """Loads the JIS X 0212 table; the code point is taken from column 1."""
    return CodePointCategorizer._LoadTable(
        filename, 1, CodePointCategorizer._UCS2_PATTERN, IsValidUCS2)

  @staticmethod
  def _LoadJISX0213(filename):
    """Loads the JIS X 0213 table; the code point is taken from column 1."""
    return CodePointCategorizer._LoadTable(
        filename, 1, CodePointCategorizer._UCS4_PATTERN, IsValidUCS4)

  # The following chars have different mappings on Windows and Mac.
  # Technically, they are platform dependent characters, but Mozc treats
  # them as normal characters defined in JISX0208.
  @staticmethod
  def _LoadExceptions():
    """Returns the code points which are always categorized as JISX0208."""
    # Treat Unicode Japanese incompatible characters as JISX0208.
    return set([
        0x00A5,  # YEN SIGN
        0x203E,  # OVERLINE
        0x301C,  # WAVE DASH
        0xFF5E,  # FULL WIDTH TILDE
        0x2016,  # DOUBLE VERTICAL LINE
        0x2225,  # PARALLEL TO
        0x2212,  # MINUS SIGN
        0xFF0D,  # FULL WIDTH HYPHEN MINUS
        0x00A2,  # CENT SIGN
        0xFFE0,  # FULL WIDTH CENT SIGN
        0x00A3,  # POUND SIGN
        0xFFE1,  # FULL WIDTH POUND SIGN
        0x00AC,  # NOT SIGN
        0xFFE2,  # FULL WIDTH NOT SIGN
    ])

  def GetCategory(self, codepoint):
    """Returns category name of the codepoint, or None for invalid input."""
    if not IsValidUCS4(codepoint):
      return None

    # Special handling for ascii code point.
    if codepoint <= 0x007F:
      return 'ASCII'

    # Then look for loaded table list in the order.
    for name, table in self._table_list:
      if codepoint in table:
        return name

    # Not found in any tables, so return "UNICODE_ONLY" as a fallback.
    return 'UNICODE_ONLY'

  def MaxCodePoint(self):
    """Returns the max of code points in the loaded table."""
    return max(max(table) for _, table in self._table_list)
def GroupConsecutiveCodepoints(codepoint_list):
  """Splits a sorted codepoint list into runs of consecutive code points.

  A code point joins the current run only if it is exactly one greater than
  the previous code point; otherwise it starts a new run.
  """
  runs = []
  for value in codepoint_list:
    if runs and value == runs[-1][-1] + 1:
      # Extends the run which is currently being built.
      runs[-1].append(value)
    else:
      runs.append([value])
  return runs
def FindCodePoint(category_list, category_name):
  """Collects the indices of category_list whose entry equals category_name.

  The index of each entry in category_list is treated as its code point, so
  the result is in ascending code point order.
  """
  matched = []
  for codepoint, category in enumerate(category_list):
    if category == category_name:
      matched.append(codepoint)
  return matched
def ParseOptions():
  """Parses the command line and returns the resulting option values."""
  # Each entry is (flag, destination attribute, help text).
  option_specs = (
      ('--cp932file', 'cp932file',
       'File path for the unicode\'s CP932.TXT file'),
      ('--jisx0201file', 'jisx0201file',
       'File path for the unicode\'s JIS0201.TXT file'),
      ('--jisx0208file', 'jisx0208file',
       'File path for the unicode\'s JIS0208.TXT file'),
      ('--jisx0212file', 'jisx0212file',
       'File path for the unicode\'s JIS0212.TXT file'),
      ('--jisx0213file', 'jisx0213file',
       'File path for the unicode\'s jisx0213-2004-std.txt file'),
      ('--output', 'output',
       'output file path. If not specified, output to stdout.'),
  )
  parser = optparse.OptionParser()
  for flag, dest, help_text in option_specs:
    parser.add_option(flag, dest=dest, help=help_text)
  # Positional arguments are intentionally discarded.
  options, _ = parser.parse_args()
  return options
def GenerateCategoryBitmap(category_list, name):
  r"""Generates bitmap data code.

  The generated data looks something like:

  namespace {
  const char name[] =
      "\xXX\xXX\xXX\xXX...\xXX"
      "\xXX\xXX\xXX\xXX...\xXX"
      "\xXX\xXX\xXX\xXX...\xXX"
      "\xXX\xXX\xXX\xXX...\xXX"
  ;
  }  // namespace

  Args:
    category_list: a list of categories. The index of each entry is its code
      point.
    name: a bitmap name.

  Returns:
    A list of lines of the generated code.
  """
  # Create bitmap list.
  # Encode each code point category to 2-bits.
  # The result should be a byte (8-bits), so group each consecutive
  # (at most) four code points.
  # The key uses explicit indexing and floor division, so that it behaves the
  # same on both Python 2 and Python 3.
  bit_list = []
  for _, group in itertools.groupby(enumerate(category_list),
                                    lambda item: item[0] // 4):
    # Fill bits from LSB to MSB for each group.
    bits = 0
    for index, (_, category) in enumerate(group):
      # Categories which are not in CATEGORY_BITMAP are encoded as 0.
      bits |= CATEGORY_BITMAP.get(category, 0) << (index * 2)
    bit_list.append(bits)

  # Header.
  lines = ['namespace {\n',
           'const char %s[] =\n' % name]

  # Output the content. Each line would have (at most) 16 bytes.
  for _, group in itertools.groupby(enumerate(bit_list),
                                    lambda item: item[0] // 16):
    line = [' \"']
    for _, bits in group:
      line.append('\\x%02X' % bits)
    line.append('\"\n')
    lines.append(''.join(line))

  # Footer.
  lines.extend([';\n',
                '} // namespace\n'])
  return lines
def GenerateIfStatement(codepoint_list, var_name, num_indent, return_type):
  """Generates an if statement for given arguments.

  This method generates an if statement, which checks if the value of
  the given 'var_name' variable is in 'codepoint_list'
  and returns 'return_type' if contained. The condition expression would be
  a simple range-based linear check. Code points are emitted as decimal
  literals.

  The generated code would be something like:
    if (var_name == NNNN ||  // for a single element list.
        (NNNN <= var_name && var_name <= NNNN) ||  // for range check.
          :
        (NNNN <= var_name && var_name <= NNNN)) {
      return RETURN_TYPE;
    }

  Args:
    codepoint_list: a sorted list of code points.
    var_name: a variable name to be checked.
    num_indent: the indent depth.
    return_type: a return category type which should be returned if
      'var_name' is in the 'codepoint_list'

  Returns:
    A list of lines of the generated if statement. The list is empty if
    'codepoint_list' is empty, because no value can match in that case.
  """
  if not codepoint_list:
    # An empty condition would produce 'if () {', which is not valid code.
    return []

  conditions = []
  for codepoint_group in GroupConsecutiveCodepoints(codepoint_list):
    if len(codepoint_group) == 1:
      conditions.append('%s == %d' % (var_name, codepoint_group[0]))
    else:
      conditions.append(
          '(%d <= %s && %s <= %d)' % (codepoint_group[0], var_name,
                                      var_name, codepoint_group[-1]))

  # Each condition goes on its own line, aligned under the first one.
  condition = (' ||\n' + ' ' * (num_indent + 4)).join(conditions)
  lines = [''.join([' ' * num_indent, 'if (', condition, ') {\n']),
           ' ' * (num_indent + 2) + 'return %s;\n' % return_type,
           ' ' * num_indent + '}\n']
  return lines
def GenerateSwitchStatement(codepoint_list, var_name, num_indent, return_type):
  """Builds a switch-case statement which returns on listed code points.

  The statement switches on the 'var_name' variable, has one case label per
  entry of 'codepoint_list', and a single shared 'return return_type;'.

  The generated code would be something like:
    switch (var_name) {
      case 0xXXXXXXXX:
      case 0xXXXXXXXX:
        :
      case 0xXXXXXXXX:
        return RETURN_TYPE;
    }

  Args:
    codepoint_list: a sorted list of code points.
    var_name: a variable name to be checked.
    num_indent: the indent depth.
    return_type: a return category type which should be returned if
      'var_name' is in the 'codepoint_list'

  Returns:
    A list of lines of the generated switch-case statement.
  """
  outer_indent = ' ' * num_indent
  case_indent = ' ' * (num_indent + 2)
  body_indent = ' ' * (num_indent + 4)

  opening = outer_indent + ('switch (%s) {\n' % var_name)
  case_labels = [case_indent + ('case 0x%08X:\n' % codepoint)
                 for codepoint in codepoint_list]
  closing = [body_indent + ('return %s;\n' % return_type),
             outer_indent + '}\n']
  return [opening] + case_labels + closing
def GenerateGetCharacterSet(category_list, bitmap_name, bitmap_size):
  """Generates function body of a Util::GetCharacterSet method.

  Args:
    category_list: a list of categories. The index of each entry is its code
      point.
    bitmap_name: the name of the bitmap array which the generated code reads.
    bitmap_size: the number of code points covered by the bitmap. Code points
      at or above this value are checked by a switch-case statement instead.

  Returns:
    A list of lines of the generated function.
  """
  lines = []

  # Function header.
  lines.append('Util::CharacterSet Util::GetCharacterSet(char32 ucs4) {\n')

  # First, check if the given code is valid or not. If not, returns
  # UNICODE_ONLY as a fallback.
  # TODO(komatsu): add INVALID instead of UNICODE_ONLY.
  lines.extend([' if (ucs4 > 0x10FFFF) {\n',
                ' return UNICODE_ONLY;\n',
                ' }\n'])
  lines.append('\n')

  # Check if the argument is ASCII or not.
  lines.extend([' if (ucs4 <= 0x7F) {\n',
                ' return ASCII;\n',
                ' }\n'])
  lines.append('\n')

  # Check if the argument is JIS0201.
  # We check this by a range-based if statement, because almost all JISX0201
  # code points are consecutive.
  lines.extend(GenerateIfStatement(
      FindCodePoint(category_list, 'JISX0201'), 'ucs4', 2, 'JISX0201'))
  lines.append('\n')

  # Check if the argument is CP932.
  # Check by a switch-case statement as CP932 code points are discrete.
  lines.extend(GenerateSwitchStatement(
      FindCodePoint(category_list, 'CP932'), 'ucs4', 2, 'CP932'))
  lines.append('\n')

  # Bitmap lookup.
  # TODO(hidehiko): the bitmap has two huge 0-bits ranges. Reduce them.
  # Sort by the bit value so that the case labels are emitted in a stable
  # order. items() is available on both Python 2 and Python 3.
  category_map = sorted(
      (bits, category) for category, bits in CATEGORY_BITMAP.items())
  lines.extend([
      ' if (ucs4 < %d) {\n' % bitmap_size,
      # Refer to the bitmap by the given name rather than a hard-coded one.
      ' switch ((%s[ucs4 >> 2] >> ((ucs4 & 3) * 2)) & 3) {\n' % bitmap_name])
  lines.extend((' case %d: return %s;\n' % item) for item in category_map)
  lines.extend([' }\n',
                ' return UNICODE_ONLY;\n',
                ' }\n',
                '\n'])

  # For codepoint >= bitmap_size.
  # Remaining category should be only JISX0213 or UNICODE_ONLY.
  # The number of JISX0213 code points is very small, so we just check them
  # by a switch-case statement.
  lines.extend(GenerateSwitchStatement(
      [codepoint for codepoint in FindCodePoint(category_list, 'JISX0213')
       if codepoint >= bitmap_size],
      'ucs4', 2, 'JISX0213'))

  # Returns UNICODE_ONLY as a last resort.
  lines.extend([
      ' return UNICODE_ONLY;\n',
      '}\n'])
  return lines
def GenerateCharacterSetHeader(category_list):
  """Builds the lines of the character_set.h file from category_list."""
  bitmap_name = "kCategoryBitmap"
  bitmap_size = 65536

  # File header comments.
  banner = ['// This file is generated by base/gen_character_set.py\n',
            '// Do not edit me!\n',
            '\n']

  # The 2-bits bitmap distinguishes JISX0208, JISX0212 and JISX0213 for
  # code point 0 to 65535 (inclusive).
  bitmap_lines = GenerateCategoryBitmap(
      category_list[:bitmap_size], bitmap_name)

  # The Util::GetCharacterSet method follows the bitmap.
  method_lines = GenerateGetCharacterSet(
      category_list, bitmap_name, bitmap_size)

  return banner + bitmap_lines + ['\n'] + method_lines
def main():
  """Generates the character set header and writes it out.

  The header is written to the file given by --output, or to stdout if the
  option is not specified.
  """
  options = ParseOptions()

  # Generates lines of the header file.
  categorizer = CodePointCategorizer(options.cp932file,
                                     options.jisx0201file,
                                     options.jisx0208file,
                                     options.jisx0212file,
                                     options.jisx0213file)
  # range() is used instead of xrange() so that this runs on both Python 2
  # and Python 3.
  category_list = [
      categorizer.GetCategory(codepoint)
      for codepoint in range(categorizer.MaxCodePoint() + 1)]
  generated_character_set_header = GenerateCharacterSetHeader(category_list)

  # Write the result.
  if options.output:
    # The 'with' block closes the file even if writing fails.
    with open(options.output, 'w') as output:
      output.writelines(generated_character_set_header)
  else:
    sys.stdout.writelines(generated_character_set_header)


if __name__ == "__main__":
  main()