blob: f0586baf5ed3c42fad0b83fbfd4a55b05c75c9d9 [file] [log] [blame]
# -*- coding: utf-8 -*-
# Copyright 2010-2015, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
A tool to generate segmenter-code from human-readable rule file
"""
__author__ = "taku"
import sys
import re
HEADER="""
namespace {
const int kLSize = %d;
const int kRSize = %d;
bool IsBoundaryInternal(uint16 rid, uint16 lid) {
// BOS * or * EOS true
if (rid == 0 || lid == 0) { return true; }"""
FOOTER=""" return true; // default
}
} // namespace
"""
def ReadPOSID(id_file, special_pos_file):
pos = {}
max_id = 0
for line in open(id_file, "r"):
fields = line.split()
pos[fields[1]] = fields[0]
max_id = max(int(fields[0]), max_id)
max_id = max_id + 1
for line in open(special_pos_file, "r"):
if len(line) <= 1 or line[0] == '#':
continue
fields = line.split()
pos[fields[0]] = ("%d" % max_id)
max_id = max_id + 1
return pos
def PatternToRegexp(pattern):
return pattern.replace("*", "[^,]+")
def GetRange(pos, pattern, name):
if pattern == "*":
return ""
pat = re.compile(PatternToRegexp(pattern))
min = -1;
max = -1;
keys = pos.keys()
keys.sort()
range = []
for p in keys:
id = pos[p]
if pat.match(p):
if min == -1:
min = id
max = id
else:
max = id
else:
if min != -1:
range.append([min, max])
min = -1
if min != -1:
range.append([min, max])
tmp = []
for r in range:
if r[0] == r[1]:
tmp.append("(%s == %s)" % (name, r[0]))
else:
tmp.append("(%s >= %s && %s <= %s)" % (name, r[0], name, r[1]))
if len(tmp) == 0:
print "FATAL: No rule fiind %s" % (pattern)
sys.exit(-1)
return " || ".join(tmp)
def main():
pos = ReadPOSID(sys.argv[1], sys.argv[2])
print HEADER % (len(pos.keys()), len(pos.keys()))
for line in open(sys.argv[3], "r"):
if len(line) <= 1 or line[0] == '#':
continue
(l, r, result) = line.split()
result = result.lower()
lcond = GetRange(pos, l, "rid") or "true";
rcond = GetRange(pos, r, "lid") or "true";
print " // %s %s %s" % (l, r, result)
print " if ((%s) && (%s)) { return %s; }" % (lcond, rcond, result)
print FOOTER
if __name__ == "__main__":
main()