| # -*- coding: utf-8 -*- |
| # Copyright 2010-2015, Google Inc. |
| # All rights reserved. |
| # |
| # Redistribution and use in source and binary forms, with or without |
| # modification, are permitted provided that the following conditions are |
| # met: |
| # |
| # * Redistributions of source code must retain the above copyright |
| # notice, this list of conditions and the following disclaimer. |
| # * Redistributions in binary form must reproduce the above |
| # copyright notice, this list of conditions and the following disclaimer |
| # in the documentation and/or other materials provided with the |
| # distribution. |
| # * Neither the name of Google Inc. nor the names of its |
| # contributors may be used to endorse or promote products derived from |
| # this software without specific prior written permission. |
| # |
| # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| """Zip code dictionary generator. |
| |
| The tool for generating zip code dictionary. |
| Input files are shift-jis csv. |
| Output lines will be printed as utf-8. |
| |
| usage: |
| ./gen_zip_code_seed.py --zip_code=zip_code.csv --jigyosyo=jigyosyo.csv |
| > zip_code_seed.tsv |
| |
| Zip code sample input line: |
| 01101,"060 ","0600007","ホッカイドウ", |
| "サッポロシチュウオウク","キタ7ジョウニシ","北海道", |
| "札幌市中央区","北七条西",0 |
| |
| Jigyosyo zip code sample input line: |
| 01101,"サツポロシチユウオウクヤクシヨ", |
| "札幌市中央区役所","北海道","札幌市中央区", |
| "南三条西","11丁目","0608612","060 ","札幌",0,0,0 |
| """ |
| |
# Module author tag.
__author__ = "toshiyuki"
| |
| import codecs |
| import optparse |
| import re |
| import sys |
| import unicodedata |
| |
| |
# Label written as the last column of every line emitted by ZipEntry.Output().
ZIP_CODE_LABEL = 'ZIP_CODE'
# Cost value written as the fourth column of every line emitted by
# ZipEntry.Output().
ZIP_CODE_COST = 7000
| |
| |
class ZipEntry(object):
    """Container class for one zip code entry.

    Attributes:
      zip_code: Zip code string without a hyphen, e.g. u'0600007'.
      address: Unicode address text associated with the zip code.
    """

    def __init__(self, zip_code, address):
        self.zip_code = zip_code
        self.address = address

    def FormatZip(self, zip_code):
        """Return zip_code with a hyphen inserted after the third character.

        Args:
          zip_code: Zip code string without a hyphen.

        Returns:
          The zip code in XXX-XXXX format.
        """
        return '-'.join([zip_code[0:3], zip_code[3:]])

    def Output(self):
        """Write this entry to stdout as one UTF-8 encoded line.

        The line is tab separated and holds, in order: the formatted zip
        code, '0', '0', ZIP_CODE_COST, the NFKC-normalized address and
        ZIP_CODE_LABEL.
        """
        zip_code = self.FormatZip(self.zip_code)
        address = unicodedata.normalize('NFKC', self.address)
        line = '\t'.join([zip_code, '0', '0', str(ZIP_CODE_COST),
                          address, ZIP_CODE_LABEL])
        # The print statement is Python 2 only.  Writing the encoded bytes
        # ourselves produces the same output on Python 2 and, through the
        # underlying binary stream, UTF-8 output on Python 3 regardless of
        # the locale.
        stream = getattr(sys.stdout, 'buffer', sys.stdout)
        stream.write(line.encode('utf-8') + b'\n')
| |
| |
def ProcessZipCodeCSV(file_name):
    """Read the zip code CSV, merge continued rows and output every entry.

    Args:
      file_name: Path of the zip code CSV file.
    """
    for row in MergeCSV(ReadCSV(file_name)):
        # Columns: 2 = zip code, 6 = prefecture, 7 = city, 8 = town area.
        zip_code, pref, city, town_area = row[2], row[6], row[7], row[8]
        for zip_entry in ReadZipCodeEntries(zip_code, pref, city, town_area):
            zip_entry.Output()
| |
| |
def ProcessJigyosyoCSV(file_name):
    """Read the jigyosyo CSV and output one entry per row.

    Args:
      file_name: Path of the jigyosyo CSV file.
    """
    for row in ReadCSV(file_name):
        # Columns: 7 = zip code, 3..5 = address levels, 2 = office name.
        zip_code, name = row[7], row[2]
        level1, level2, level3 = row[3], row[4], row[5]
        ReadJigyosyoEntry(zip_code, level1, level2, level3, name).Output()
| |
| |
def ReadCSV(file_name):
    """Read a Shift_JIS encoded CSV file into a list of rows.

    Args:
      file_name: Path of the CSV file to read.

    Returns:
      A list with one item per input line; each item is the list of cells
      returned by GetCells() for that line.
    """
    # Do not use csv reader module because it does not support unicode.
    # Bytes that cannot be decoded are replaced instead of raising.
    # The file is opened in a with-block so the handle is always closed.
    with codecs.open(file_name, 'r', 'shift_jis', errors='replace') as src:
        return [GetCells(line) for line in src]
| |
| |
| def GetCells(line): |
| """Get cells.""" |
| # [A, B, C, ..] from "A","B ",C,.. |
| return [column.strip('"').strip() for column in line.strip().split(',')] |
| |
| |
def MergeCSV(csv_lines):
    """Merge rows that continue the previous row.

    When the flag says a zip code has no multiple entries but several lines
    exist for that zip code, the later lines are continuations and their
    town area text is appended to the previously kept row.

    Args:
      csv_lines: List of rows (lists of cells) from the zip code CSV.

    Returns:
      A list of the kept rows.  Kept rows are the same list objects as in
      csv_lines and are modified in place when a continuation is merged.
    """
    seen_count = {}
    merged = []
    for row in csv_lines:
        code = row[2]
        seen_count[code] = seen_count.get(code, 0) + 1
        if ShouldMerge(seen_count, row):
            # Column 8 is the town area.
            merged[-1][8] += row[8]
            continue
        merged.append(row)
    return merged
| |
| |
def ShouldMerge(zip_count, entry):
    """Tell whether entry should be merged into the previous entry.

    Args:
      zip_count: Dict mapping a zip code to the number of rows seen so far.
      entry: Row (list of cells) being examined.

    Returns:
      True when the zip code was already seen and column 12 is not '1'
      (presumably the "zip code covers multiple entries" flag -- confirm
      against the CSV specification), or when the row is one of the special
      cases recognized by ShouldMergeSpecial().
    """
    is_multi = entry[12] == '1'
    if zip_count[entry[2]] > 1 and not is_multi:
        return True
    return ShouldMergeSpecial(entry)
| |
| |
class SpecialMergeZip(object):
    """Describes one zip code whose rows need special merge handling.

    Attributes:
      zip_code: Zip code string the special case applies to.
      pref: Prefecture name the row must have.
      city: City name the row must have.
      towns: List of town names; a row whose town area starts with one of
        them is not treated as a continued line.
    """

    def __init__(self, zip_code, pref, city, towns):
        # Fields that identify the affected rows.
        self.zip_code, self.pref, self.city = zip_code, pref, city
        # Town names that mark the beginning of a new row.
        self.towns = towns
| |
| |
# Zip codes that ShouldMergeSpecial() checks in addition to the regular
# merge rule.  A row with the same zip code, prefecture and city whose town
# area does not start with one of the listed towns is treated as a
# continuation of the previous row.
_SPECIAL_CASES = [
    SpecialMergeZip(u'5900111', u'大阪府', u'堺市中区', [u'三原台']),
    SpecialMergeZip(u'8710046', u'大分県', u'中津市',
                    [u'金谷', u'西堀端', u'東堀端', u'古金谷']),
    SpecialMergeZip(u'9218046', u'石川県', u'金沢市',
                    [u'大桑町', u'三小牛町']),
]
| |
| |
def ShouldMergeSpecial(entry):
    """Tell whether entry is a continued line of a known special case.

    Args:
      entry: Row (list of cells) from the zip code CSV.

    Returns:
      True when the row's zip code, prefecture and city equal those of an
      item in _SPECIAL_CASES and its town area looks like a continued line
      for that item; False otherwise.
    """
    zip_code, pref, city, town_area = entry[2], entry[6], entry[7], entry[8]
    return any(
        zip_code == case.zip_code and
        pref == case.pref and
        city == case.city and
        ContinuedLine(town_area, case.towns)
        for case in _SPECIAL_CASES)
| |
| |
def ContinuedLine(level3, towns):
    """Tell whether level3 looks like the continuation of a previous line.

    Args:
      level3: Town area text of the row.
      towns: Town names that start a new, non-continued row.

    Returns:
      False when level3 starts with any of the given town names, True
      otherwise (including when towns is empty).
    """
    starts_new_row = any(level3.startswith(town) for town in towns)
    return not starts_new_row
| |
| |
def ReadZipCodeEntries(zip_code, level1, level2, level3):
    """Build the ZipEntry objects for one zip code row.

    Args:
      zip_code: Zip code string of the row.
      level1: First address level (prefecture).
      level2: Second address level (city).
      level3: Town area text; it may expand to several town names.

    Returns:
      A list with one ZipEntry per town name returned by ParseTownName(),
      each holding level1, level2 and the town name concatenated.
    """
    entries = []
    for town in ParseTownName(level3):
        full_address = u''.join((level1, level2, town))
        entries.append(ZipEntry(zip_code, full_address))
    return entries
| |
| |
def ReadJigyosyoEntry(zip_code, level1, level2, level3, name):
    """Build the ZipEntry for one jigyosyo row.

    Args:
      zip_code: Zip code string of the row.
      level1: First address level.
      level2: Second address level.
      level3: Third address level.
      name: Name of the office.

    Returns:
      A ZipEntry whose address is the three levels concatenated, followed by
      a space and the office name.
    """
    address_parts = (level1, level2, level3, u' ', name)
    return ZipEntry(zip_code, u''.join(address_parts))
| |
| |
# Parenthesis characters that delimit additional information in a town
# name.  Both the ASCII and the full-width (U+FF08 / U+FF09) forms are
# accepted.
_OPEN_PARENS = (u'(', u'\uff08')
_CLOSE_PARENS = (u')', u'\uff09')
# Everything from the first opening to the last closing parenthesis.
_PAREN_NOTE_RE = re.compile(u'[(\uff08].*[)\uff09]', re.U)


def ParseTownName(level3):
    """Expand a town area field into the list of town names it stands for.

    Args:
      level3: Town area text of a (merged) zip code row.

    Returns:
      A list of town name strings.  [''] is returned for the placeholder row
      that covers addresses not listed elsewhere.

    Raises:
      AssertionError: If level3 has an opening parenthesis without any
        closing one, i.e. a continued line was not merged.
    """
    if u'以下に掲載がない場合' in level3:
        return ['']

    # Raised explicitly (instead of using an assert statement) so that the
    # check is still performed when Python runs with -O.
    if not CanParseAddress(level3):
        # On Python 2 the text is encoded so that the message prints safely.
        shown = level3.encode('utf-8') if str is bytes else level3
        raise AssertionError('failed to be merged %s' % shown)

    # We ignore additional information here.  The parentheses are matched
    # literally; an unescaped "(.*)" would be a capture group that matches
    # (and therefore erases) the whole town name.
    level3 = _PAREN_NOTE_RE.sub(u'', level3)

    # For 地割, we have these cases.
    #  XX1地割
    #  XX1地割、XX2地割、
    #  XX1地割〜XX2地割、
    #  XX第1地割
    #  XX第1地割、XX第2地割、
    #  XX第1地割〜XX第2地割、
    # We simply use XX for them.
    chiwari_match = re.match(u'(\\D*?)第?\\d+地割.*', level3, re.U)
    if chiwari_match:
        return [chiwari_match.group(1)]

    # For "、"
    #  XX町YY、ZZ
    #   -> XX町YY and (XX町)ZZ
    #  YY、ZZ
    #   -> YY and ZZ
    # Both groups are optional, so this pattern matches every string.
    chou_match = re.match(u'(.*町)?(.*)', level3, re.U)
    chou = chou_match.group(1) or u''
    rests = chou_match.group(2)
    return [chou + rest for rest in rests.split(u'、')]


def CanParseAddress(address):
    """Return true for a valid address.

    Args:
      address: Town area text to check.

    Returns:
      True when address has no opening parenthesis, or has at least one
      closing parenthesis; False when a parenthesis is opened but never
      closed.
    """
    has_open = any(paren in address for paren in _OPEN_PARENS)
    has_close = any(paren in address for paren in _CLOSE_PARENS)
    return not has_open or has_close
| |
| |
def ParseOptions():
    """Parse command line options.

    Returns:
      The optparse options object.  Its `zip_code` and `jigyosyo` attributes
      hold the CSV paths given on the command line, or '' when the option
      was omitted.
    """
    parser = optparse.OptionParser(usage='Usage: %prog [options]')
    parser.add_option('--zip_code', dest='zip_code',
                      action='store', default='',
                      help='specify zip code csv file path.')
    # The help text used to be a copy of the --zip_code one.
    parser.add_option('--jigyosyo', dest='jigyosyo',
                      action='store', default='',
                      help='specify jigyosyo csv file path.')
    options, unused_args = parser.parse_args()
    return options
| |
| |
def main():
    """Generate dictionary lines for every CSV given on the command line.

    The zip code CSV is processed first, then the jigyosyo CSV; an option
    that was not given is skipped.

    Returns:
      0, used as the process exit status.
    """
    options = ParseOptions()
    jobs = (
        (options.zip_code, ProcessZipCodeCSV),
        (options.jigyosyo, ProcessJigyosyoCSV),
    )
    for csv_path, process in jobs:
        if csv_path:
            process(csv_path)
    return 0
| |
| |
# When run as a script, execute main() and use its return value as the
# process exit status.
if __name__ == '__main__':
    sys.exit(main())