src/base/gen_character_set.py - mozc - Git at Google

 # -*- coding: utf-8 -*-
 # Copyright 2010-2015, Google Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are
 # met:
 #
 #     * Redistributions of source code must retain the above copyright
 # notice, this list of conditions and the following disclaimer.
 #     * Redistributions in binary form must reproduce the above
 # copyright notice, this list of conditions and the following disclaimer
 # in the documentation and/or other materials provided with the
 # distribution.
 #     * Neither the name of Google Inc. nor the names of its
 # contributors may be used to endorse or promote products derived from
 # this software without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 __author__ = "taku"

 import itertools
 import optparse
 import re
 import string
 import sys


 # Two-bit codes stored in the generated bitmap, one code per code point.
 # Only JISX0208, JISX0212 and JISX0213 get a non-zero code. Every other
 # category (i.e. ASCII, CP932, JISX0201 and UNICODE_ONLY) is stored as '00'
 # bits and is told apart by separate heuristics in the generated code.
 CATEGORY_BITMAP = dict(JISX0208=1, JISX0212=2, JISX0213=3)


 def IsValidUCS2(n):
   """Tells whether n lies in the UCS2 range, i.e. 0 to 0xFFFF inclusive."""
   return n >= 0 and n <= 0xFFFF


 def IsValidUCS4(n):
   """Tells whether n lies in the UCS4 range, i.e. 0 to 0x7FFFFFFF inclusive."""
   return n >= 0 and n <= 0x7FFFFFFF


 class CodePointCategorizer(object):
   """Categorizer of ucs4 code points.

   Loads the code point sets of CP932, JIS X 0201, JIS X 0208, JIS X 0212 and
   JIS X 0213 from mapping files and tells which category a code point
   belongs to.
   """

   # Captures the four hex digits which follow a leading '0x'.
   _UCS2_PATTERN = re.compile(r'^0x([0-9A-F]{4})')

   # UCS4 pattern supports only JIS X 0213.
   # Note that some JIS X 0213 characters are described as 'U+xxxx+xxxx',
   # and this pattern intentionally ignores the latter '+xxxx' part.
   _UCS4_PATTERN = re.compile(r'^U\+([0-9A-F]+)')

   def __init__(self, cp932file, jisx0201file, jisx0208file,
                jisx0212file, jisx0213file):
     """Loads all tables.

     Args:
       cp932file: file path of the CP932 mapping file.
       jisx0201file: file path of the JIS X 0201 mapping file.
       jisx0208file: file path of the JIS X 0208 mapping file.
       jisx0212file: file path of the JIS X 0212 mapping file.
       jisx0213file: file path of the JIS X 0213 mapping file.
     """
     self._cp932 = CodePointCategorizer._LoadCP932(cp932file)
     self._jisx0201 = CodePointCategorizer._LoadJISX0201(jisx0201file)
     self._jisx0208 = CodePointCategorizer._LoadJISX0208(jisx0208file)
     self._jisx0212 = CodePointCategorizer._LoadJISX0212(jisx0212file)
     self._jisx0213 = CodePointCategorizer._LoadJISX0213(jisx0213file)
     self._exceptions = CodePointCategorizer._LoadExceptions()

     # Make a list of code point tables in the priority order: GetCategory
     # returns the name of the first table which contains the code point.
     self._table_list = [
         ('JISX0208', self._exceptions),  # Vendor specific code.
         ('JISX0201', self._jisx0201),
         ('JISX0208', self._jisx0208),
         ('JISX0212', self._jisx0212),
         ('JISX0213', self._jisx0213),
         ('CP932', self._cp932)]

   @staticmethod
   def _LoadTable(filename, column_index, pattern, validater):
     """Collects the code points listed in one column of a mapping file.

     Lines starting with '#' and lines which have no column at column_index
     (e.g. blank lines) are ignored.

     Args:
       filename: path of the mapping file to read.
       column_index: index of the whitespace-separated column to parse.
       pattern: compiled regex whose first group captures hex digits.
       validater: predicate taking the parsed integer; only the values for
         which it returns true are kept.
     Returns: a set of the accepted code points.
     """
     result = set()
     # Use 'with' so that the file is closed even if parsing raises.
     with open(filename) as table_file:
       for line in table_file:
         if line.startswith('#'):
           # Skip a comment line.
           continue

         columns = line.split()
         if len(columns) <= column_index:
           # Blank or truncated line; there is nothing to parse.
           continue

         match = pattern.search(columns[column_index])
         if match:
           ucs = int(match.group(1), 16)
           if validater(ucs):
             result.add(ucs)

     return result

   @staticmethod
   def _LoadCP932(filename):
     """Loads the UCS2 code points in the second column of the file."""
     return CodePointCategorizer._LoadTable(
         filename, 1, CodePointCategorizer._UCS2_PATTERN, IsValidUCS2)

   @staticmethod
   def _LoadJISX0201(filename):
     """Loads the UCS2 code points in the second column of the file."""
     return CodePointCategorizer._LoadTable(
         filename, 1, CodePointCategorizer._UCS2_PATTERN, IsValidUCS2)

   @staticmethod
   def _LoadJISX0208(filename):
     """Loads the UCS2 code points in the third column, plus two extras."""
     result = CodePointCategorizer._LoadTable(
         filename, 2, CodePointCategorizer._UCS2_PATTERN, IsValidUCS2)
     result.update([
         0xFF3C,  # (FULLWIDTH REVERSE SOLIDUS) should be in JISX0208
         0xFF0D,  # (FULLWIDTH HYPHEN-MINUS) should be in JISX0208
         ])
     return result

   @staticmethod
   def _LoadJISX0212(filename):
     """Loads the UCS2 code points in the second column of the file."""
     return CodePointCategorizer._LoadTable(
         filename, 1, CodePointCategorizer._UCS2_PATTERN, IsValidUCS2)

   @staticmethod
   def _LoadJISX0213(filename):
     """Loads the 'U+xxxx' code points in the second column of the file."""
     return CodePointCategorizer._LoadTable(
         filename, 1, CodePointCategorizer._UCS4_PATTERN, IsValidUCS4)

   # The following chars have different mappings on Windows and Mac.
   # Technically, they are platform dependent characters, but Mozc treats
   # them as normal characters defined in JISX0208.
   @staticmethod
   def _LoadExceptions():
     """Returns the code points which are always categorized as JISX0208."""
     # Treat Unicode Japanese incompatible characters as JISX0208.
     return set([
         0x00A5,  # YEN SIGN
         0x203E,  # OVERLINE
         0x301C,  # WAVE DASH
         0xFF5E,  # FULLWIDTH TILDE
         0x2016,  # DOUBLE VERTICAL LINE
         0x2225,  # PARALLEL TO
         0x2212,  # MINUS SIGN
         0xFF0D,  # FULLWIDTH HYPHEN-MINUS
         0x00A2,  # CENT SIGN
         0xFFE0,  # FULLWIDTH CENT SIGN
         0x00A3,  # POUND SIGN
         0xFFE1,  # FULLWIDTH POUND SIGN
         0x00AC,  # NOT SIGN
         0xFFE2,  # FULLWIDTH NOT SIGN
         ])

   def GetCategory(self, codepoint):
     """Returns category name of the codepoint, or None for invalid input."""
     if not IsValidUCS4(codepoint):
       return None

     # Special handling for ascii code point.
     if codepoint <= 0x007F:
       return 'ASCII'

     # Then look for loaded table list in the order.
     for name, table in self._table_list:
       if codepoint in table:
         return name

     # Not found in any tables, so return "UNICODE_ONLY" as a fallback.
     return 'UNICODE_ONLY'

   def MaxCodePoint(self):
     """Returns the max of code points in the loaded tables."""
     return max(max(table) for _, table in self._table_list)


 def GroupConsecutiveCodepoints(codepoint_list):
   """Splits a sorted code point sequence into runs of consecutive values.

   Args:
     codepoint_list: code points in ascending order.
   Returns: a list of lists. Within each inner list every element is exactly
     one greater than the element before it.
   """
   groups = []
   for codepoint in codepoint_list:
     if groups and groups[-1][-1] + 1 == codepoint:
       # Continues the run which is currently being built.
       groups[-1].append(codepoint)
     else:
       # The very first code point, or a gap: start a new run.
       groups.append([codepoint])
   return groups


 def FindCodePoint(category_list, category_name):
   """Lists the code points whose category equals category_name.

   Args:
     category_list: a sequence in which the index is the code point and the
       value is its category.
     category_name: the category to look for.
   Returns: a list of the matching indices, in ascending order.
   """
   found = []
   for codepoint, category in enumerate(category_list):
     if category == category_name:
       found.append(codepoint)
   return found


 def ParseOptions():
   """Parses the command line and returns the resulting option values.

   Returns: the optparse values object. Its attributes are cp932file,
     jisx0201file, jisx0208file, jisx0212file, jisx0213file and output.
     Positional arguments are discarded.
   """
   # (destination name, help text) of every supported '--<name>' option.
   option_table = (
       ('cp932file', 'File path for the unicode\'s CP932.TXT file'),
       ('jisx0201file', 'File path for the unicode\'s JIS0201.TXT file'),
       ('jisx0208file', 'File path for the unicode\'s JIS0208.TXT file'),
       ('jisx0212file', 'File path for the unicode\'s JIS0212.TXT file'),
       ('jisx0213file',
        'File path for the unicode\'s jisx0213-2004-std.txt '
        'file'),
       ('output',
        'output file path. If not specified, '
        'output to stdout.'),
   )

   parser = optparse.OptionParser()
   for dest, help_text in option_table:
     parser.add_option('--' + dest, dest=dest, help=help_text)

   options, _ = parser.parse_args()
   return options


 def GenerateCategoryBitmap(category_list, name):
   r"""Generates bitmap data code.

   Each category is encoded into 2 bits by CATEGORY_BITMAP (categories which
   are not listed there are encoded as 0), and each group of (at most) four
   consecutive code points is packed into one byte, from LSB to MSB.

   The generated data looks something like:
   namespace {
   const char name[] =
       "\xXX\xXX\xXX\xXX...\xXX"
       "\xXX\xXX\xXX\xXX...\xXX"
       "\xXX\xXX\xXX\xXX...\xXX"
       "\xXX\xXX\xXX\xXX...\xXX"
   ;
   }  // namespace

   Args:
     category_list: a list of categories, indexed by code point.
     name: a bitmap name.
   Returns: a list of lines of the generated bitmap definition.
   """
   # Chunk by slicing instead of grouping with "lambda (index, _): index / 4".
   # Tuple-unpacking lambdas are a syntax error on Python 3 and '/' is true
   # division there, while slicing behaves the same on Python 2 and 3.
   bit_list = []
   for start in range(0, len(category_list), 4):
     # Fill bits from LSB to MSB for each group.
     bits = 0
     for index, category in enumerate(category_list[start:start + 4]):
       bits |= CATEGORY_BITMAP.get(category, 0) << (index * 2)
     bit_list.append(bits)

   # Header.
   lines = ['namespace {\n',
            'const char %s[] =\n' % name]

   # Output the content. Each line would have (at most) 16 bytes.
   for start in range(0, len(bit_list), 16):
     encoded = ''.join(
         '\\x%02X' % bits for bits in bit_list[start:start + 16])
     lines.append('    "%s"\n' % encoded)

   lines.extend([';\n',
                 '}  // namespace\n'])

   return lines


 def GenerateIfStatement(codepoint_list, var_name, num_indent, return_type):
   """Generates an if statement for given arguments.

   This method generates an if statement, which checks if the value of
   the given 'var_name' variable is in 'codepoint_list'
   and returns 'return_type' if contained. The condition expression would be
   a simple range-based linear check. Code points are written as decimal
   literals.

   The generated code would be something like:

   if (var_name == N ||    // for a single element list.
       (N <= var_name && var_name <= N) ||   // for range check.
          :
       (N <= var_name && var_name <= N)) {
     return RETURN_TYPE;
   }

   Args:
     codepoint_list: a sorted list of code points.
     var_name: a variable name to be checked.
     num_indent: the indent depth.
     return_type: a return category type which should be returned if
       'var_name' is in the 'codepoint_list'
   Returns: a list of lines of the generated if statement, or an empty list
     if 'codepoint_list' is empty.
   """
   if not codepoint_list:
     # Nothing can match. An empty condition would produce 'if () {', which
     # is not valid C++, so generate no statement at all.
     return []

   conditions = []
   for codepoint_group in GroupConsecutiveCodepoints(codepoint_list):
     if len(codepoint_group) == 1:
       conditions.append('%s == %d' % (var_name, codepoint_group[0]))
     else:
       conditions.append(
           '(%d <= %s && %s <= %d)' % (codepoint_group[0], var_name,
                                       var_name, codepoint_group[-1]))

   # Continuation lines are aligned just after the opening 'if ('.
   condition = (' ||\n' + ' ' * (num_indent + 4)).join(conditions)
   lines = [''.join([' ' * num_indent, 'if (', condition, ') {\n']),
            ' ' * (num_indent + 2) + 'return %s;\n' % return_type,
            ' ' * num_indent + '}\n']
   return lines


 def GenerateSwitchStatement(codepoint_list, var_name, num_indent, return_type):
   """Builds a switch-case statement which tests membership of a variable.

   The statement switches on 'var_name', has one case label per element of
   'codepoint_list' and a single 'return return_type;' after the labels.

   The generated code would be something like:
   switch (var_name) {
     case 0xXXXXXXXX:
     case 0xXXXXXXXX:
       :
     case 0xXXXXXXXX:
       return RETURN_TYPE;
   }

   Args:
     codepoint_list: a sorted list of code points.
     var_name: a variable name to be checked.
     num_indent: the indent depth.
     return_type: a return category type which should be returned if
       'var_name' is in the 'codepoint_list'
   Returns: a list of lines of the generated switch-case statement.
   """
   outer_indent = ' ' * num_indent
   case_indent = ' ' * (num_indent + 2)
   body_indent = ' ' * (num_indent + 4)

   lines = [outer_indent + 'switch (%s) {\n' % var_name]
   lines.extend(case_indent + 'case 0x%08X:\n' % codepoint
                for codepoint in codepoint_list)
   lines.append(body_indent + 'return %s;\n' % return_type)
   lines.append(outer_indent + '}\n')
   return lines


 def GenerateGetCharacterSet(category_list, bitmap_name, bitmap_size):
   """Generates the definition of a Util::GetCharacterSet method.

   Args:
     category_list: a list of categories, indexed by code point.
     bitmap_name: the name of the bitmap array which the generated code
       looks up.
     bitmap_size: the number of code points covered by the bitmap. JISX0213
       code points at or above it are checked by a switch-case statement.
   Returns: a list of lines of the generated function.
   """
   lines = []

   # Function header.
   lines.append('Util::CharacterSet Util::GetCharacterSet(char32 ucs4) {\n')

   # First, check if the given code is valid or not. If not, returns
   # UNICODE_ONLY as a fallback.
   # TODO(komatsu): add INVALID instead of UNICODE_ONLY.
   lines.extend(['  if (ucs4 > 0x10FFFF) {\n',
                 '    return UNICODE_ONLY;\n',
                 '  }\n',
                 '\n'])

   # Check if the argument is ASCII or not.
   lines.extend(['  if (ucs4 <= 0x7F) {\n',
                 '    return ASCII;\n',
                 '  }\n',
                 '\n'])

   # Check if the argument is JISX0201.
   # We check this by a range-based if statement, because almost all
   # JISX0201 code points are consecutive.
   lines.extend(GenerateIfStatement(
       FindCodePoint(category_list, 'JISX0201'), 'ucs4', 2, 'JISX0201'))
   lines.append('\n')

   # Check if the argument is CP932.
   # Check by a switch-case statement as CP932 code points are discrete.
   lines.extend(GenerateSwitchStatement(
       FindCodePoint(category_list, 'CP932'), 'ucs4', 2, 'CP932'))
   lines.append('\n')

   # Bitmap lookup.
   # TODO(hidehiko): the bitmap has two huge 0-bits ranges. Reduce them.
   # 'items()' is used because 'iteritems()' does not exist on Python 3.
   category_map = sorted(
       (bits, category) for category, bits in CATEGORY_BITMAP.items())

   # The array name comes from 'bitmap_name', so that the lookup always
   # refers to the bitmap the caller actually generated.
   lines.extend([
       '  if (ucs4 < %d) {\n' % bitmap_size,
       '    switch ((%s[ucs4 >> 2] >> ((ucs4 & 3) * 2)) & 3) {\n'
       % bitmap_name])
   lines.extend('      case %d: return %s;\n' % item for item in category_map)
   lines.extend(['    }\n',
                 '    return UNICODE_ONLY;\n',
                 '  }\n',
                 '\n'])

   # For codepoint >= bitmap_size.
   # Remaining category should be only JISX0213 or UNICODE_ONLY.
   # The number of such JISX0213 code points is small, so we just check them
   # by a switch-case statement.
   lines.extend(GenerateSwitchStatement(
       [codepoint for codepoint in FindCodePoint(category_list, 'JISX0213')
        if codepoint >= bitmap_size],
       'ucs4', 2, 'JISX0213'))

   # Returns UNICODE_ONLY as a last resort.
   lines.extend(['  return UNICODE_ONLY;\n',
                 '}\n'])

   return lines


 def GenerateCharacterSetHeader(category_list):
   """Builds the whole content of the generated character_set.h file.

   Args:
     category_list: a list of categories, indexed by code point.
   Returns: a list of lines: a banner comment, the bitmap definition and the
     Util::GetCharacterSet method.
   """
   bitmap_name = "kCategoryBitmap"
   bitmap_size = 65536

   # File header comments.
   banner = ['// This file is generated by base/gen_character_set.py\n',
             '// Do not edit me!\n',
             '\n']

   # The 2-bits bitmap is used to check JISX0208, JISX0212 and JISX0213 for
   # code point 0 to 65535 (inclusive).
   bitmap = GenerateCategoryBitmap(category_list[:bitmap_size], bitmap_name)

   # The Util::GetCharacterSet method follows the bitmap.
   method = GenerateGetCharacterSet(category_list, bitmap_name, bitmap_size)

   return banner + bitmap + ['\n'] + method


 def main():
   """Generates the character set header and writes it out.

   The result goes to the file given by --output, or to stdout if the option
   is not specified.
   """
   options = ParseOptions()

   # Generates lines of the header file.
   categorizer = CodePointCategorizer(options.cp932file,
                                      options.jisx0201file,
                                      options.jisx0208file,
                                      options.jisx0212file,
                                      options.jisx0213file)
   # 'range' is used because 'xrange' does not exist on Python 3.
   category_list = [
       categorizer.GetCategory(codepoint)
       for codepoint in range(categorizer.MaxCodePoint() + 1)]
   generated_character_set_header = GenerateCharacterSetHeader(category_list)

   # Write the result.
   if options.output:
     # 'with' closes the file even if writing raises.
     with open(options.output, 'w') as output:
       output.writelines(generated_character_set_header)
   else:
     sys.stdout.writelines(generated_character_set_header)


 # Run the generator only when executed as a script, not when imported.
 if __name__ == "__main__":
   main()
	# -*- coding: utf-8 -*-
	# Copyright 2010-2015, Google Inc.
	# All rights reserved.
	#
	# Redistribution and use in source and binary forms, with or without
	# modification, are permitted provided that the following conditions are
	# met:
	#
	# * Redistributions of source code must retain the above copyright
	# notice, this list of conditions and the following disclaimer.
	# * Redistributions in binary form must reproduce the above
	# copyright notice, this list of conditions and the following disclaimer
	# in the documentation and/or other materials provided with the
	# distribution.
	# * Neither the name of Google Inc. nor the names of its
	# contributors may be used to endorse or promote products derived from
	# this software without specific prior written permission.
	#
	# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	__author__ = "taku"

	import itertools
	import optparse
	import re
	import string
	import sys


	# Two-bit codes stored in the generated bitmap, one code per code point.
	# Only JISX0208, JISX0212 and JISX0213 get a non-zero code. Every other
	# category (i.e. ASCII, CP932, JISX0201 and UNICODE_ONLY) is stored as '00'
	# bits and is told apart by separate heuristics in the generated code.
	CATEGORY_BITMAP = dict(JISX0208=1, JISX0212=2, JISX0213=3)


	def IsValidUCS2(n):
	  """Returns True if n is a valid code in UCS2 (0 to 0xFFFF inclusive)."""
	  return 0 <= n <= 0xFFFF


	def IsValidUCS4(n):
	  """Returns True if n is a valid code in UCS4 (0 to 0x7FFFFFFF inclusive)."""
	  return 0 <= n <= 0x7FFFFFFF


	class CodePointCategorizer(object):
	  """Categorizer of ucs4 code points.

	  Loads the code point sets of CP932, JIS X 0201, JIS X 0208, JIS X 0212 and
	  JIS X 0213 from mapping files and tells which category a code point
	  belongs to.
	  """

	  # Captures the four hex digits which follow a leading '0x'.
	  _UCS2_PATTERN = re.compile(r'^0x([0-9A-F]{4})')

	  # UCS4 pattern supports only JIS X 0213.
	  # Note that some JIS X 0213 characters are described as 'U+xxxx+xxxx',
	  # and this pattern intentionally ignores the latter '+xxxx' part.
	  _UCS4_PATTERN = re.compile(r'^U\+([0-9A-F]+)')

	  def __init__(self, cp932file, jisx0201file, jisx0208file,
	               jisx0212file, jisx0213file):
	    """Loads all tables.

	    Args:
	      cp932file: file path of the CP932 mapping file.
	      jisx0201file: file path of the JIS X 0201 mapping file.
	      jisx0208file: file path of the JIS X 0208 mapping file.
	      jisx0212file: file path of the JIS X 0212 mapping file.
	      jisx0213file: file path of the JIS X 0213 mapping file.
	    """
	    self._cp932 = CodePointCategorizer._LoadCP932(cp932file)
	    self._jisx0201 = CodePointCategorizer._LoadJISX0201(jisx0201file)
	    self._jisx0208 = CodePointCategorizer._LoadJISX0208(jisx0208file)
	    self._jisx0212 = CodePointCategorizer._LoadJISX0212(jisx0212file)
	    self._jisx0213 = CodePointCategorizer._LoadJISX0213(jisx0213file)
	    self._exceptions = CodePointCategorizer._LoadExceptions()

	    # Make a list of code point tables in the priority order: GetCategory
	    # returns the name of the first table which contains the code point.
	    self._table_list = [
	        ('JISX0208', self._exceptions),  # Vendor specific code.
	        ('JISX0201', self._jisx0201),
	        ('JISX0208', self._jisx0208),
	        ('JISX0212', self._jisx0212),
	        ('JISX0213', self._jisx0213),
	        ('CP932', self._cp932)]

	  @staticmethod
	  def _LoadTable(filename, column_index, pattern, validater):
	    """Collects the code points listed in one column of a mapping file.

	    Lines starting with '#' and lines which have no column at column_index
	    (e.g. blank lines) are ignored.

	    Args:
	      filename: path of the mapping file to read.
	      column_index: index of the whitespace-separated column to parse.
	      pattern: compiled regex whose first group captures hex digits.
	      validater: predicate taking the parsed integer; only the values for
	        which it returns true are kept.
	    Returns: a set of the accepted code points.
	    """
	    result = set()
	    # Use 'with' so that the file is closed even if parsing raises.
	    with open(filename) as table_file:
	      for line in table_file:
	        if line.startswith('#'):
	          # Skip a comment line.
	          continue

	        columns = line.split()
	        if len(columns) <= column_index:
	          # Blank or truncated line; there is nothing to parse.
	          continue

	        match = pattern.search(columns[column_index])
	        if match:
	          ucs = int(match.group(1), 16)
	          if validater(ucs):
	            result.add(ucs)

	    return result

	  @staticmethod
	  def _LoadCP932(filename):
	    """Loads the UCS2 code points in the second column of the file."""
	    return CodePointCategorizer._LoadTable(
	        filename, 1, CodePointCategorizer._UCS2_PATTERN, IsValidUCS2)

	  @staticmethod
	  def _LoadJISX0201(filename):
	    """Loads the UCS2 code points in the second column of the file."""
	    return CodePointCategorizer._LoadTable(
	        filename, 1, CodePointCategorizer._UCS2_PATTERN, IsValidUCS2)

	  @staticmethod
	  def _LoadJISX0208(filename):
	    """Loads the UCS2 code points in the third column, plus two extras."""
	    result = CodePointCategorizer._LoadTable(
	        filename, 2, CodePointCategorizer._UCS2_PATTERN, IsValidUCS2)
	    result.update([
	        0xFF3C,  # (FULLWIDTH REVERSE SOLIDUS) should be in JISX0208
	        0xFF0D,  # (FULLWIDTH HYPHEN-MINUS) should be in JISX0208
	        ])
	    return result

	  @staticmethod
	  def _LoadJISX0212(filename):
	    """Loads the UCS2 code points in the second column of the file."""
	    return CodePointCategorizer._LoadTable(
	        filename, 1, CodePointCategorizer._UCS2_PATTERN, IsValidUCS2)

	  @staticmethod
	  def _LoadJISX0213(filename):
	    """Loads the 'U+xxxx' code points in the second column of the file."""
	    return CodePointCategorizer._LoadTable(
	        filename, 1, CodePointCategorizer._UCS4_PATTERN, IsValidUCS4)

	  # The following chars have different mappings on Windows and Mac.
	  # Technically, they are platform dependent characters, but Mozc treats
	  # them as normal characters defined in JISX0208.
	  @staticmethod
	  def _LoadExceptions():
	    """Returns the code points which are always categorized as JISX0208."""
	    # Treat Unicode Japanese incompatible characters as JISX0208.
	    return set([
	        0x00A5,  # YEN SIGN
	        0x203E,  # OVERLINE
	        0x301C,  # WAVE DASH
	        0xFF5E,  # FULLWIDTH TILDE
	        0x2016,  # DOUBLE VERTICAL LINE
	        0x2225,  # PARALLEL TO
	        0x2212,  # MINUS SIGN
	        0xFF0D,  # FULLWIDTH HYPHEN-MINUS
	        0x00A2,  # CENT SIGN
	        0xFFE0,  # FULLWIDTH CENT SIGN
	        0x00A3,  # POUND SIGN
	        0xFFE1,  # FULLWIDTH POUND SIGN
	        0x00AC,  # NOT SIGN
	        0xFFE2,  # FULLWIDTH NOT SIGN
	        ])

	  def GetCategory(self, codepoint):
	    """Returns category name of the codepoint, or None for invalid input."""
	    if not IsValidUCS4(codepoint):
	      return None

	    # Special handling for ascii code point.
	    if codepoint <= 0x007F:
	      return 'ASCII'

	    # Then look for loaded table list in the order.
	    for name, table in self._table_list:
	      if codepoint in table:
	        return name

	    # Not found in any tables, so return "UNICODE_ONLY" as a fallback.
	    return 'UNICODE_ONLY'

	  def MaxCodePoint(self):
	    """Returns the max of code points in the loaded tables."""
	    return max(max(table) for _, table in self._table_list)


	def GroupConsecutiveCodepoints(codepoint_list):
	  """Splits a sorted code point sequence into runs of consecutive values.

	  Args:
	    codepoint_list: code points in ascending order.
	  Returns: a list of lists. Within each inner list every element is exactly
	    one greater than the element before it.
	  """
	  groups = []
	  for codepoint in codepoint_list:
	    if groups and groups[-1][-1] + 1 == codepoint:
	      # Continues the run which is currently being built.
	      groups[-1].append(codepoint)
	    else:
	      # The very first code point, or a gap: start a new run.
	      groups.append([codepoint])
	  return groups


	def FindCodePoint(category_list, category_name):
	  """Returns a list of code points which belong to the given category_name.

	  Args:
	    category_list: a sequence in which the index is the code point and the
	      value is its category.
	    category_name: the category to look for.
	  Returns: a list of the matching indices, in ascending order.
	  """
	  return [codepoint for codepoint, category in enumerate(category_list)
	          if category == category_name]


	def ParseOptions():
	  """Parses the command line and returns the resulting option values.

	  Returns: the optparse values object. Its attributes are cp932file,
	    jisx0201file, jisx0208file, jisx0212file, jisx0213file and output.
	    Positional arguments are discarded.
	  """
	  # (destination name, help text) of every supported '--<name>' option.
	  option_table = (
	      ('cp932file', 'File path for the unicode\'s CP932.TXT file'),
	      ('jisx0201file', 'File path for the unicode\'s JIS0201.TXT file'),
	      ('jisx0208file', 'File path for the unicode\'s JIS0208.TXT file'),
	      ('jisx0212file', 'File path for the unicode\'s JIS0212.TXT file'),
	      ('jisx0213file',
	       'File path for the unicode\'s jisx0213-2004-std.txt '
	       'file'),
	      ('output',
	       'output file path. If not specified, '
	       'output to stdout.'),
	  )

	  parser = optparse.OptionParser()
	  for dest, help_text in option_table:
	    parser.add_option('--' + dest, dest=dest, help=help_text)

	  options, _ = parser.parse_args()
	  return options


	def GenerateCategoryBitmap(category_list, name):
	  r"""Generates bitmap data code.

	  Each category is encoded into 2 bits by CATEGORY_BITMAP (categories which
	  are not listed there are encoded as 0), and each group of (at most) four
	  consecutive code points is packed into one byte, from LSB to MSB.

	  The generated data looks something like:
	  namespace {
	  const char name[] =
	      "\xXX\xXX\xXX\xXX...\xXX"
	      "\xXX\xXX\xXX\xXX...\xXX"
	      "\xXX\xXX\xXX\xXX...\xXX"
	      "\xXX\xXX\xXX\xXX...\xXX"
	  ;
	  }  // namespace

	  Args:
	    category_list: a list of categories, indexed by code point.
	    name: a bitmap name.
	  Returns: a list of lines of the generated bitmap definition.
	  """
	  # Chunk by slicing instead of grouping with "lambda (index, _): index / 4".
	  # Tuple-unpacking lambdas are a syntax error on Python 3 and '/' is true
	  # division there, while slicing behaves the same on Python 2 and 3.
	  bit_list = []
	  for start in range(0, len(category_list), 4):
	    # Fill bits from LSB to MSB for each group.
	    bits = 0
	    for index, category in enumerate(category_list[start:start + 4]):
	      bits |= CATEGORY_BITMAP.get(category, 0) << (index * 2)
	    bit_list.append(bits)

	  # Header.
	  lines = ['namespace {\n',
	           'const char %s[] =\n' % name]

	  # Output the content. Each line would have (at most) 16 bytes.
	  for start in range(0, len(bit_list), 16):
	    encoded = ''.join(
	        '\\x%02X' % bits for bits in bit_list[start:start + 16])
	    lines.append('    "%s"\n' % encoded)

	  lines.extend([';\n',
	                '}  // namespace\n'])

	  return lines


	def GenerateIfStatement(codepoint_list, var_name, num_indent, return_type):
	  """Generates an if statement for given arguments.

	  This method generates an if statement, which checks if the value of
	  the given 'var_name' variable is in 'codepoint_list'
	  and returns 'return_type' if contained. The condition expression would be
	  a simple range-based linear check. Code points are written as decimal
	  literals.

	  The generated code would be something like:

	  if (var_name == N ||    // for a single element list.
	      (N <= var_name && var_name <= N) ||   // for range check.
	         :
	      (N <= var_name && var_name <= N)) {
	    return RETURN_TYPE;
	  }

	  Args:
	    codepoint_list: a sorted list of code points.
	    var_name: a variable name to be checked.
	    num_indent: the indent depth.
	    return_type: a return category type which should be returned if
	      'var_name' is in the 'codepoint_list'
	  Returns: a list of lines of the generated if statement, or an empty list
	    if 'codepoint_list' is empty.
	  """
	  if not codepoint_list:
	    # Nothing can match. An empty condition would produce 'if () {', which
	    # is not valid C++, so generate no statement at all.
	    return []

	  conditions = []
	  for codepoint_group in GroupConsecutiveCodepoints(codepoint_list):
	    if len(codepoint_group) == 1:
	      conditions.append('%s == %d' % (var_name, codepoint_group[0]))
	    else:
	      conditions.append(
	          '(%d <= %s && %s <= %d)' % (codepoint_group[0], var_name,
	                                      var_name, codepoint_group[-1]))

	  # Continuation lines are aligned just after the opening 'if ('.
	  condition = (' ||\n' + ' ' * (num_indent + 4)).join(conditions)
	  lines = [''.join([' ' * num_indent, 'if (', condition, ') {\n']),
	           ' ' * (num_indent + 2) + 'return %s;\n' % return_type,
	           ' ' * num_indent + '}\n']
	  return lines


	def GenerateSwitchStatement(codepoint_list, var_name, num_indent, return_type):
	  """Builds a switch-case statement which tests membership of a variable.

	  The statement switches on 'var_name', has one case label per element of
	  'codepoint_list' and a single 'return return_type;' after the labels.

	  The generated code would be something like:
	  switch (var_name) {
	    case 0xXXXXXXXX:
	    case 0xXXXXXXXX:
	      :
	    case 0xXXXXXXXX:
	      return RETURN_TYPE;
	  }

	  Args:
	    codepoint_list: a sorted list of code points.
	    var_name: a variable name to be checked.
	    num_indent: the indent depth.
	    return_type: a return category type which should be returned if
	      'var_name' is in the 'codepoint_list'
	  Returns: a list of lines of the generated switch-case statement.
	  """
	  outer_indent = ' ' * num_indent
	  case_indent = ' ' * (num_indent + 2)
	  body_indent = ' ' * (num_indent + 4)

	  lines = [outer_indent + 'switch (%s) {\n' % var_name]
	  lines.extend(case_indent + 'case 0x%08X:\n' % codepoint
	               for codepoint in codepoint_list)
	  lines.append(body_indent + 'return %s;\n' % return_type)
	  lines.append(outer_indent + '}\n')
	  return lines


	def GenerateGetCharacterSet(category_list, bitmap_name, bitmap_size):
	  """Generates the definition of a Util::GetCharacterSet method.

	  Args:
	    category_list: a list of categories, indexed by code point.
	    bitmap_name: the name of the bitmap array which the generated code
	      looks up.
	    bitmap_size: the number of code points covered by the bitmap. JISX0213
	      code points at or above it are checked by a switch-case statement.
	  Returns: a list of lines of the generated function.
	  """
	  lines = []

	  # Function header.
	  lines.append('Util::CharacterSet Util::GetCharacterSet(char32 ucs4) {\n')

	  # First, check if the given code is valid or not. If not, returns
	  # UNICODE_ONLY as a fallback.
	  # TODO(komatsu): add INVALID instead of UNICODE_ONLY.
	  lines.extend(['  if (ucs4 > 0x10FFFF) {\n',
	                '    return UNICODE_ONLY;\n',
	                '  }\n',
	                '\n'])

	  # Check if the argument is ASCII or not.
	  lines.extend(['  if (ucs4 <= 0x7F) {\n',
	                '    return ASCII;\n',
	                '  }\n',
	                '\n'])

	  # Check if the argument is JISX0201.
	  # We check this by a range-based if statement, because almost all
	  # JISX0201 code points are consecutive.
	  lines.extend(GenerateIfStatement(
	      FindCodePoint(category_list, 'JISX0201'), 'ucs4', 2, 'JISX0201'))
	  lines.append('\n')

	  # Check if the argument is CP932.
	  # Check by a switch-case statement as CP932 code points are discrete.
	  lines.extend(GenerateSwitchStatement(
	      FindCodePoint(category_list, 'CP932'), 'ucs4', 2, 'CP932'))
	  lines.append('\n')

	  # Bitmap lookup.
	  # TODO(hidehiko): the bitmap has two huge 0-bits ranges. Reduce them.
	  # 'items()' is used because 'iteritems()' does not exist on Python 3.
	  category_map = sorted(
	      (bits, category) for category, bits in CATEGORY_BITMAP.items())

	  # The array name comes from 'bitmap_name', so that the lookup always
	  # refers to the bitmap the caller actually generated.
	  lines.extend([
	      '  if (ucs4 < %d) {\n' % bitmap_size,
	      '    switch ((%s[ucs4 >> 2] >> ((ucs4 & 3) * 2)) & 3) {\n'
	      % bitmap_name])
	  lines.extend('      case %d: return %s;\n' % item for item in category_map)
	  lines.extend(['    }\n',
	                '    return UNICODE_ONLY;\n',
	                '  }\n',
	                '\n'])

	  # For codepoint >= bitmap_size.
	  # Remaining category should be only JISX0213 or UNICODE_ONLY.
	  # The number of such JISX0213 code points is small, so we just check them
	  # by a switch-case statement.
	  lines.extend(GenerateSwitchStatement(
	      [codepoint for codepoint in FindCodePoint(category_list, 'JISX0213')
	       if codepoint >= bitmap_size],
	      'ucs4', 2, 'JISX0213'))

	  # Returns UNICODE_ONLY as a last resort.
	  lines.extend(['  return UNICODE_ONLY;\n',
	                '}\n'])

	  return lines


	def GenerateCharacterSetHeader(category_list):
	  """Generates lines of character_set.h file.

	  Args:
	    category_list: a list of categories, indexed by code point.
	  Returns: a list of lines: a banner comment, the bitmap definition and the
	    Util::GetCharacterSet method.
	  """
	  bitmap_name = "kCategoryBitmap"
	  bitmap_size = 65536

	  lines = []

	  # File header comments.
	  lines.extend(['// This file is generated by base/gen_character_set.py\n',
	                '// Do not edit me!\n',
	                '\n'])

	  # We use 2-bits bitmap to check JISX0208, JISX0212 and JISX0213, for
	  # code point 0 to 65535 (inclusive).
	  lines.extend(
	      GenerateCategoryBitmap(category_list[:bitmap_size], bitmap_name))
	  lines.append('\n')

	  # Then add Util::GetCharacterSet method.
	  lines.extend(
	      GenerateGetCharacterSet(category_list, bitmap_name, bitmap_size))

	  return lines


	def main():
	  """Generates the character set header and writes it out.

	  The result goes to the file given by --output, or to stdout if the option
	  is not specified.
	  """
	  options = ParseOptions()

	  # Generates lines of the header file.
	  categorizer = CodePointCategorizer(options.cp932file,
	                                     options.jisx0201file,
	                                     options.jisx0208file,
	                                     options.jisx0212file,
	                                     options.jisx0213file)
	  # 'range' is used because 'xrange' does not exist on Python 3.
	  category_list = [
	      categorizer.GetCategory(codepoint)
	      for codepoint in range(categorizer.MaxCodePoint() + 1)]
	  generated_character_set_header = GenerateCharacterSetHeader(category_list)

	  # Write the result.
	  if options.output:
	    # 'with' closes the file even if writing raises.
	    with open(options.output, 'w') as output:
	      output.writelines(generated_character_set_header)
	  else:
	    sys.stdout.writelines(generated_character_set_header)


	# Run the generator only when executed as a script, not when imported.
	if __name__ == "__main__":
	  main()