| # -*- coding: utf-8 -*- |
| # Copyright 2010-2014, Google Inc. |
| # All rights reserved. |
| # |
| # Redistribution and use in source and binary forms, with or without |
| # modification, are permitted provided that the following conditions are |
| # met: |
| # |
| # * Redistributions of source code must retain the above copyright |
| # notice, this list of conditions and the following disclaimer. |
| # * Redistributions in binary form must reproduce the above |
| # copyright notice, this list of conditions and the following disclaimer |
| # in the documentation and/or other materials provided with the |
| # distribution. |
| # * Neither the name of Google Inc. nor the names of its |
| # contributors may be used to endorse or promote products derived from |
| # this software without specific prior written permission. |
| # |
| # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| """Generator script for connection data.""" |
| |
| __author__ = "hidehiko" |
| |
| import cStringIO as StringIO |
| import itertools |
| import logging |
| import optparse |
| import os |
| import struct |
| import sys |
| |
| from build_tools import code_generator_util |
| |
| INVALID_COST = 30000 |
| INVALID_1BYTE_COST = 255 |
| RESOLUTION_FOR_1BYTE = 64 |
| FILE_MAGIC = '\xAB\xCD' |
| |
| FALSE_VALUES = ['f', 'false', '0'] |
| TRUE_VALUES = ['t', 'true', '1'] |
| |
| |
| def ParseBoolFlag(value): |
| if value is None: |
| return False |
| |
| value = value.lower() |
| if value in TRUE_VALUES: |
| return True |
| if value in FALSE_VALUES: |
| return False |
| |
| # Unknown value. |
| logging.critical('Unknown boolean flag: %s', value) |
| sys.exit(1) |
| |
| |
| def GetPosSize(filepath): |
| # The pos-size should be equal to the number of lines. |
| # TODO(hidehiko): Merge this method with pos_util in dictionary. |
| with open(filepath, 'r') as stream: |
| stream = code_generator_util.SkipLineComment(stream) |
| # Count the number of lines. |
| return sum(1 for _ in stream) |
| |
| |
| def ParseConnectionFile(text_connection_file, pos_size, special_pos_size): |
| # The result is a square matrix. |
| mat_size = pos_size + special_pos_size |
| |
| matrix = [[0] * mat_size for _ in xrange(mat_size)] |
| with open(text_connection_file) as stream: |
| stream = code_generator_util.SkipLineComment(stream) |
| # The first line contains the matrix column/row size. |
| size = stream.next().rstrip() |
| assert (int(size) == pos_size), '%s != %d' % (size, pos_size) |
| |
| for array_index, cost in enumerate(stream): |
| cost = int(cost.rstrip()) |
| rid = array_index / pos_size |
| lid = array_index % pos_size |
| if rid == 0 and lid == 0: |
| cost = 0 |
| matrix[rid][lid] = cost |
| |
| # Fill INVALID_COST in matrix elements for special POS. |
| for rid in xrange(pos_size, mat_size): |
| for lid in xrange(1, mat_size): # Skip EOS |
| matrix[rid][lid] = INVALID_COST |
| |
| for lid in xrange(pos_size, mat_size): |
| for rid in xrange(1, mat_size): # Skip BOS |
| matrix[rid][lid] = INVALID_COST |
| |
| return matrix |
| |
| |
| def CreateModeValueList(matrix): |
| """Create a list of modes of each row.""" |
| result = [] |
| for row in matrix: |
| m = {} |
| for cost in row: |
| if cost == INVALID_COST: |
| # Heuristically, we do not compress INVALID_COST. |
| continue |
| m[cost] = m.get(cost, 0) + 1 |
| mode_value = max(m.iteritems(), key=lambda (_, count): count)[0] |
| result.append(mode_value) |
| return result |
| |
| |
| def CompressMatrixByModeValue(matrix, mode_value_list): |
| # To compress the data size, we hold mode values for each row in a separate |
| # list, and fill None into the matrix if it equals to the corresponding |
| # mode value. |
| assert len(matrix) == len(mode_value_list) |
| for row, mode_value in itertools.izip(matrix, mode_value_list): |
| for index in xrange(len(row)): |
| if row[index] == mode_value: |
| row[index] = None |
| |
| |
| def OutputBitList(bit_list, stream): |
| # Make sure that the bit list is aligned to the byte boundary. |
| assert len(bit_list) % 8 == 0 |
| for bits in code_generator_util.SplitChunk(bit_list, 8): |
| byte = 0 |
| for bit_index, bit in enumerate(bits): |
| if bit: |
| # Fill in LSB to MSB order. |
| byte |= (1 << bit_index) |
| stream.write(struct.pack('B', byte)) |
| |
| |
| def BuildBinaryData(matrix, mode_value_list, use_1byte_cost): |
| # To compress the connection data, we use two-level succinct bit vector. |
| # |
| # The basic idea to compress the rid-lid matrix is compressing each row as |
| # follows: |
| # find the mode value of the row, and set the cells containins the value |
| # empty, thus we get a sparse array. |
| # We can compress sparse array by using succinct bit vector. |
| # (Please see also storage/louds/simple_succinct_bit_vector_index and |
| # storage/louds/bit_vector_based_array.) |
| # In addition, we compress the bit vector, too. Fortunately the distribution |
| # of bits is biased, so we group consecutive 8-bits and create another |
| # bit vector, named chunk-bits; |
| # - if no bits are 1, the corresponding bit is 0, otherwise 1. |
| # By using the bit vector, we can compact the original bit vector by skipping |
| # consecutive eight 0-bits. We can calculate the actual bit position in |
| # the compact bit vector by using Rank1 operation on chunk-bits. |
| # |
| # The file format is as follows: |
| # FILE_MAGIC (\xAB\xCD): 2bytes |
| # Resolution: 2bytes |
| # Num rids: 2bytes |
| # Num lids: 2bytes |
| # A list of mode values: 2bytes * rids (aligned to 32bits) |
| # A list of row data. |
| # |
| # The row data format is as follows: |
| # The size of compact bits in bytes: 2bytes |
| # The size of values in bytes: 2bytes |
| # chunk_bits, compact_bits, followed by values. |
| |
| if use_1byte_cost: |
| resolution = RESOLUTION_FOR_1BYTE |
| else: |
| resolution = 1 |
| stream = StringIO.StringIO() |
| |
| # Output header. |
| stream.write(FILE_MAGIC) |
| matrix_size = len(matrix) |
| assert 0 <= matrix_size <= 65535 |
| stream.write(struct.pack('<HHH', resolution, matrix_size, matrix_size)) |
| |
| # Output mode value list. |
| for value in mode_value_list: |
| assert 0 <= value <= 65536 |
| stream.write(struct.pack('<H', value)) |
| |
| # 4 bytes alignment. |
| if len(mode_value_list) % 2: |
| stream.write('\x00\x00') |
| |
| # Process each row: |
| for row in matrix: |
| chunk_bits = [] |
| compact_bits = [] |
| values = [] |
| |
| for chunk in code_generator_util.SplitChunk(row, 8): |
| if all(cost is None for cost in chunk): |
| # All bits are 0, so output 0-chunk bit. |
| chunk_bits.append(False) |
| continue |
| |
| chunk_bits.append(True) |
| for cost in chunk: |
| if cost is None: |
| compact_bits.append(False) |
| else: |
| compact_bits.append(True) |
| if use_1byte_cost: |
| if cost == INVALID_COST: |
| cost = INVALID_1BYTE_COST |
| else: |
| cost /= resolution |
| assert cost != INVALID_1BYTE_COST |
| values.append(cost) |
| |
| # 4 bytes alignment. |
| while len(chunk_bits) % 32: |
| chunk_bits.append(False) |
| while len(compact_bits) % 32: |
| compact_bits.append(False) |
| if use_1byte_cost: |
| while len(values) % 4: |
| values.append(0) |
| values_size = len(values) |
| else: |
| while len(values) % 2: |
| values.append(0) |
| values_size = len(values) * 2 |
| |
| # Output the bits for a row. |
| stream.write(struct.pack('<HH', len(compact_bits) / 8, values_size)) |
| OutputBitList(chunk_bits, stream) |
| OutputBitList(compact_bits, stream) |
| if use_1byte_cost: |
| for value in values: |
| assert 0 <= value <= 255 |
| stream.write(struct.pack('<B', value)) |
| else: |
| for value in values: |
| assert 0 <= value <= 65535 |
| stream.write(struct.pack('<H', value)) |
| |
| return stream.getvalue() |
| |
| |
| def ParseOptions(): |
| parser = optparse.OptionParser() |
| parser.add_option('--text_connection_file', dest='text_connection_file') |
| parser.add_option('--id_file', dest='id_file') |
| parser.add_option('--special_pos_file', dest='special_pos_file') |
| parser.add_option('--target_compiler', dest='target_compiler') |
| parser.add_option('--use_1byte_cost', dest='use_1byte_cost') |
| parser.add_option('--binary_output_file', dest='binary_output_file') |
| parser.add_option('--header_output_file', dest='header_output_file') |
| return parser.parse_args()[0] |
| |
| |
| def main(): |
| options = ParseOptions() |
| |
| pos_size = GetPosSize(options.id_file) |
| special_pos_size = GetPosSize(options.special_pos_file) |
| matrix = ParseConnectionFile( |
| options.text_connection_file, pos_size, special_pos_size) |
| mode_value_list = CreateModeValueList(matrix) |
| CompressMatrixByModeValue(matrix, mode_value_list) |
| binary = BuildBinaryData( |
| matrix, mode_value_list, ParseBoolFlag(options.use_1byte_cost)) |
| |
| if options.binary_output_file: |
| dirpath = os.path.dirname(options.binary_output_file) |
| if not os.path.exists(dirpath): |
| os.makedirs(dirpath) |
| with open(options.binary_output_file, 'wb') as stream: |
| stream.write(binary) |
| |
| if options.header_output_file: |
| dirpath = os.path.dirname(options.header_output_file) |
| if not os.path.exists(dirpath): |
| os.makedirs(dirpath) |
| with open(options.header_output_file, 'wb') as stream: |
| code_generator_util.WriteCppDataArray( |
| binary, 'ConnectionData', options.target_compiler, stream) |
| |
| |
| if __name__ == '__main__': |
| main() |