#! /usr/bin/python
"""Convert address tables saved from Word as HTML into vCard 3.0 files.

Every non-empty ``<td>`` cell of each input file becomes one vCard.  The
first text chunk of a cell is used as the name and the following chunks as
the address.  For ``some/file.html`` the cards are written to
``some/file.vcf`` (or ``some/file-1.vcf``, ``some/file-2.vcf``, ... when that
name is already taken).

Usage: pass one or more HTML file paths on the command line.
"""

import os
import re
import sys

try:
    # Python 2: the SGML-based parser this script was written against.
    import formatter
    import htmllib
except ImportError:
    # Python 3 has no htmllib; fall back to html.parser.
    formatter = None
    htmllib = None
    from html.entities import name2codepoint as _name2codepoint
    from html.parser import HTMLParser as _Py3HTMLParser

_PY2 = sys.version_info[0] == 2


class _CellCollector(object):
    """Parser-independent logic that gathers the text chunks of <td> cells.

    Attributes:
        accum: list of completed cells; each cell is a list of cleaned,
            non-skipped text chunks in document order.
        goop: chunks of the cell currently being read, or None while the
            parser is outside any <td>.
    """

    def _initCells(self):
        self.accum = []
        self.goop = None

    def start_td(self, attrs):
        """Begin collecting a new cell (any unfinished cell is discarded)."""
        self.goop = []

    def end_td(self):
        """Finish the current cell, keeping it only if it collected text."""
        # Truthiness also covers goop being None (a stray </td>), which the
        # html.parser fallback reports but htmllib does not.
        if self.goop:
            self.accum.append(self.goop)
        self.goop = None

    def handle_data(self, data):
        """Clean one text chunk and append it to the current cell.

        Chunks outside a <td>, whitespace-only chunks, a lone non-breaking
        space, a leading 'The', and an '&' or 'Family' arriving as the
        second chunk are all dropped.
        """
        if self.goop is None:
            return
        data = data.strip()
        if not data:
            return
        data = data.replace('\r', '')
        # NOTE(review): as written this replaces a space with a space, i.e.
        # it is a no-op.  Presumably the first argument was originally some
        # other whitespace (double space / NBSP) -- confirm against the
        # original script before changing it.
        data = data.replace(' ', ' ')
        data = re.sub(',$', '', data)
        if len(data) == 1 and ord(data[0]) == 160:
            return
        if not self.goop and data == 'The':
            return
        if len(self.goop) == 1 and data in ('&', 'Family'):
            return
        self.goop.append(data.strip())


if htmllib is not None:

    class WordAddressParser(_CellCollector, htmllib.HTMLParser):
        """Collect table-cell text from Word HTML (Python 2, htmllib)."""

        def __init__(self, verbose=0):
            # htmllib insists on a formatter; send its output nowhere.
            sink = formatter.AbstractFormatter(formatter.NullWriter())
            htmllib.HTMLParser.__init__(self, sink, verbose)
            self._initCells()

else:

    class WordAddressParser(_CellCollector, _Py3HTMLParser):
        """Collect table-cell text from Word HTML (Python 3, html.parser)."""

        def __init__(self, verbose=0):
            # convert_charrefs=False makes entities arrive as separate
            # events, so '&amp;' reaches handle_data() as its own '&' chunk,
            # which the skip rules in handle_data() rely on.
            _Py3HTMLParser.__init__(self, convert_charrefs=False)
            self.verbose = verbose  # accepted for signature compatibility
            self._initCells()

        def handle_starttag(self, tag, attrs):
            if tag == 'td':
                self.start_td(attrs)

        def handle_endtag(self, tag):
            if tag == 'td':
                self.end_td()

        def handle_entityref(self, name):
            """Forward a known named entity as its own text chunk."""
            codepoint = _name2codepoint.get(name)
            if codepoint is not None:
                self.handle_data(chr(codepoint))

        def handle_charref(self, name):
            """Forward a numeric character reference as its own text chunk."""
            try:
                if name[:1] in ('x', 'X'):
                    text = chr(int(name[1:], 16))
                else:
                    text = chr(int(name))
            except (ValueError, OverflowError):
                return
            self.handle_data(text)


# The named groups below had lost their '<name>' part, which made every
# pattern fail to compile.  'data' and 'zip' are the names the code looks up;
# 'city' and 'state' are descriptive names for groups only read positionally.
#
# '.*' is greedy, so the optional ' Family' suffix is never split off here;
# writeVCardContents() accounts for that with its "'Family' not in" test.
trimTF = re.compile(r'^(The ){0,1}(?P<data>.*)( Family){0,1}$')
cityStateZip = re.compile(
    r'^(?P<city>[^,]+), (?P<state>.+(?= \d)) (?P<zip>[\d-]+)$')
cityState = re.compile(r'^(?P<city>[^,]+), (?P<state>.+(?!,))$')
# Formerly named 'zip', which shadowed the builtin.
zipOnly = re.compile(r'^(?P<zip>[\d-]+)$')
ADRHead = 'ADR;type=HOME:;;'


def writeVCardContents(outFile, x):
    """Write the FN, N and ADR lines of one card to *outFile*.

    Args:
        outFile: object with a ``write(str)`` method.
        x: non-empty list of text chunks from one table cell.  ``x[0]`` is
            the name; with three chunks the rest is street and
            "City, State Zip"; with four chunks it is street, "City, State"
            and zip.  Any other layout is written as-is, ';'-separated.

    Raises:
        IndexError: if *x* is empty.
    """
    familyName = trimTF.match(x[0]).group('data')
    outFile.write("FN:%s\n" % familyName)

    # Only an exact "First Last" pair is turned into a structured name.
    splitFN = familyName.split(' ')
    if len(splitFN) == 2 and 'Family' not in splitFN:
        outFile.write("N:%s;%s;;;\n" % (splitFN[1], splitFN[0]))

    # One or two chunks: no usable address.
    if len(x) < 3:
        return

    outFile.write(ADRHead)

    if len(x) == 3:
        outFile.write("%s;" % x[1])
        match = cityStateZip.match(x[2])
        if match:
            outFile.write(';'.join(match.groups()))
        else:
            outFile.write(x[2])
        outFile.write('\n')
        return

    if len(x) == 4:
        cityStateMatch = cityState.match(x[2])
        zipMatch = zipOnly.match(x[3])
        if cityStateMatch and zipMatch:
            outFile.write('%s;%s;%s;\n' % (
                x[1],
                ';'.join(cityStateMatch.groups()),
                zipMatch.group('zip'),
            ))
            return

    # Unrecognised layout: keep every remaining chunk as its own component.
    outFile.write("%s\n" % ';'.join(x[1:]))


def _openText(path, mode):
    """Open *path* as text with a one-byte-per-character mapping.

    Python 2 handled the files as plain byte strings.  On Python 3, latin-1
    reproduces that: any byte sequence decodes, character 160 is the
    non-breaking space handle_data() checks for, and text is written back
    with the same byte values it was read with.
    """
    if _PY2:
        return open(path, mode)
    return open(path, mode, encoding='latin-1')


def convertFile(inFilePath):
    """Convert one HTML file into a .vcf file next to it.

    An existing output file is never overwritten; a numeric suffix
    (``-1``, ``-2``, ...) is added until an unused name is found.

    Args:
        inFilePath: path of the HTML file to read.

    Returns:
        The path of the .vcf file that was written.
    """
    parser = WordAddressParser(verbose=1)
    with _openText(inFilePath, 'r') as inFile:
        parser.feed(inFile.read())
    parser.close()

    outFilePathNoExt = os.path.splitext(inFilePath)[0]
    outFilePath = outFilePathNoExt + ".vcf"
    i = 1
    while os.path.exists(outFilePath):
        outFilePath = '%s-%d.vcf' % (outFilePathNoExt, i)
        i += 1

    with _openText(outFilePath, 'w') as outFile:
        for x in parser.accum:
            outFile.write("BEGIN:VCARD\n")
            outFile.write("VERSION:3.0\n")
            writeVCardContents(outFile, x)
            outFile.write("END:VCARD\n")
    return outFilePath


if __name__ == '__main__':
    for f in sys.argv[1:]:
        convertFile(f)