#! /usr/bin/python
import htmllib
import formatter
import sys
import os
import re
class WordAddressParser(htmllib.HTMLParser):
def __init__(self, verbose=0):
f = formatter.AbstractFormatter(formatter.NullWriter())
htmllib.HTMLParser.__init__(self, f, verbose)
self.accum = []
self.goop = None
def start_td(self, attrs):
self.goop = []
def end_td(self):
if len(self.goop):
self.accum.append(self.goop)
self.goop = None
def handle_data(self, data):
if self.goop is not None:
data = data.strip()
if len(data) is 0: return
data = data.replace('\r', '')
data = data.replace(' ', ' ')
data = re.sub(',$', '', data)
if (len(data) is 1) and (ord(data[0]) == 160): return
if (len(self.goop) == 0) and (data == 'The'): return
if (len(self.goop) == 1) and (data == '&'): return
if (len(self.goop) == 1) and (data == 'Family'): return
self.goop.append(data.strip())
# Patterns used to pick apart the name and address lines of one entry.
# Every named group needs an explicit <name>; "(?P" followed directly by the
# sub-pattern is rejected by re.compile.

# Optional leading "The " and optional trailing " Family" around the name.
# NOTE(review): the 'data' group is greedy, so a trailing " Family" is always
# captured inside it and the final optional group never matches anything.
trimTF = re.compile(r'^(The ){0,1}(?P<data>.*)( Family){0,1}$')

# "City, State 12345" on one line; the lookahead ends the state part just
# before the space that precedes the first digit of the postal code.
cityStateZip = re.compile(r'^(?P<city>[^,]+), (?P<state>.+(?= \d)) (?P<zip>[\d-]+)$')

# "City, State" with no postal code.
cityState = re.compile(r'^(?P<city>[^,]+), (?P<state>.+(?!,))$')

# A line made up only of digits and hyphens.
# (The name shadows the builtin zip(); kept because other code refers to it.)
zip = re.compile(r'^(?P<zip>[\d-]+)$')

# Start of a vCard ADR line; the two empty components after the colon are
# the post-office-box and extended-address slots of the vCard 3.0 layout.
ADRHead = 'ADR;type=HOME:;;'
def writeVCardContents(outFile, x):
    """Write the FN, N and ADR lines for one entry to outFile.

    outFile -- object with a write() method.
    x       -- non-empty list of strings; x[0] is the name and any further
               items are address lines.

    The FN line is always written. An N line is added when the name is
    exactly two words and neither is 'Family'. An ADR line is written only
    when there are at least two address lines.
    """
    # trimTF cannot match a name with an embedded newline ('.' stops at
    # '\n'); fall back to the raw text instead of failing on None.group().
    nameMatch = trimTF.match(x[0])
    familyName = nameMatch.group('data') if nameMatch else x[0]
    outFile.write("FN:%s\n" % familyName)

    splitFN = familyName.split(' ')
    if len(splitFN) == 2 and 'Family' not in splitFN:
        # "First Last" is emitted as "Last;First".
        outFile.write("N:%s;%s;;;\n" % (splitFN[1], splitFN[0]))

    fieldCount = len(x)
    if fieldCount < 3:
        # Name only, or name plus a single line: no address is written.
        return

    outFile.write(ADRHead)

    if fieldCount == 3:
        # Street line followed by a combined "City, State Zip" line.
        outFile.write("%s;" % x[1])
        match = cityStateZip.match(x[2])
        if match:
            outFile.write(';'.join(match.groups()))
        else:
            outFile.write(x[2])
        outFile.write('\n')
        return

    if fieldCount == 4:
        # Street, "City, State", then the postal code on its own line.
        cityStateMatch = cityState.match(x[2])
        zipMatch = zip.match(x[3])
        if cityStateMatch and zipMatch:
            out = '%s;%s;%s;\n' % (x[1],
                                   ';'.join(cityStateMatch.groups()),
                                   zipMatch.group('zip'))
            outFile.write(out)
            return

    # Unrecognised layout: emit the remaining lines joined as-is.
    outFile.write("%s\n" % ';'.join(x[1:]))
def convertFile(inFilePath):
    """Parse the HTML file at inFilePath and write its cells as vCards.

    The output goes next to the input with the extension replaced by
    ".vcf". If that file already exists, "-1", "-2", ... is appended to the
    base name until an unused path is found, so nothing is overwritten.
    """
    parser = WordAddressParser(verbose=1)
    # Context managers close both files even if parsing or writing raises.
    with open(inFilePath) as inFile:
        parser.feed(inFile.read())
    parser.close()

    outFilePathNoExt = os.path.splitext(inFilePath)[0]
    outFilePath = outFilePathNoExt + ".vcf"
    i = 1
    while os.path.exists(outFilePath):
        outFilePath = '%s-%d.vcf' % (outFilePathNoExt, i)
        i += 1

    with open(outFilePath, 'w') as outFile:
        for x in parser.accum:
            outFile.write("BEGIN:VCARD\n")
            outFile.write("VERSION:3.0\n")
            writeVCardContents(outFile, x)
            outFile.write("END:VCARD\n")
# Convert every file named on the command line, but only when run as a
# script, so that importing this module does not trigger any conversion.
if __name__ == "__main__":
    for f in sys.argv[1:]:
        convertFile(f)