[[[ Add the --skip-regexps option, and drive it. (There are also some debug statements, because this all started with trying to figure out why dcsops-dev@ wasn't getting into my addresses. This fix doesn't work, and anyway I'm not sure that it's the right approach.) ]]] Index: mailaprop.py =================================================================== --- mailaprop.py (revision 5638) +++ mailaprop.py (working copy) @@ -288,6 +288,8 @@ sending to it; otherwise just increase its regular count.""" key_addr = addr.lower() # canonicalize the key, as usual # Sanity check -- this should never fail. + if addr.find("dcsops-dev@") != -1: + sys.stderr.write("DEBUG: take(): in update(): '%s <%s>'\n" % (name, addr)) if self.key_addr != key_addr: raise AddressDifference("'%s' and '%s' differ by more than case" % (self.addr, other_ah.addr)) @@ -331,6 +333,8 @@ # it matches more than once, something is wrong, and we # will raise a FullAddressDuplication exception. matched = False + if addr.find("dcsops-dev@") != -1: + sys.stderr.write("DEBUG: take(): past first test in update(): '%s <%s>'\n" % (name, addr)) for other_full_addr in self.full_addrs.keys(): # If they differ only by case, pick the better one or combine. 
if candidate_full_addr.lower() == other_full_addr.lower(): @@ -351,6 +355,17 @@ new_full_addr = self.make_full_addr(new_addr, new_name) new_sent_count = other_sent_count + incr_sent new_recv_count = other_recv_count + incr_recv + if addr.find("dcsops-dev@") != -1: + sys.stderr.write("DEBUG: take(): update(): start del seq:\n") + sys.stderr.write(" candidate: '%s'\n" % candidate_full_addr) + sys.stderr.write(" other_fa: '%s'\n" % other_full_addr) + sys.stderr.write(" new_fa: '%s'\n" % new_full_addr) + sys.stderr.write(" new_name: '%s'\n" % new_name) + sys.stderr.write(" new_addr: '%s'\n" % new_addr) + sys.stderr.write(" new_date: '%s'\n" % new_date) + sys.stderr.write(" new_sent_ct: '%s'\n" % new_sent_count) + sys.stderr.write(" new_recv_ct: '%s'\n" % new_recv_count) + sys.stderr.write(" new_full_addr: '%s'\n" % new_full_addr) del self.full_addrs[other_full_addr] if self.full_addrs.has_key(candidate_full_addr): del self.full_addrs[candidate_full_addr] @@ -357,6 +372,8 @@ self.full_addrs[new_full_addr] = \ [new_date, new_sent_count, new_recv_count,] if not matched: + if addr.find("dcsops-dev@") != -1: + sys.stderr.write("DEBUG: take(): update(): not matched '%s <%s>'\n" % (name, addr)) self.full_addrs[candidate_full_addr] = \ [date, incr_sent, incr_recv,] @@ -371,15 +388,24 @@ key_addr = addr.lower() ah = self.get(key_addr) if ah is None: + if addr.find("dcsops-dev@") != -1: + sys.stderr.write("DEBUG: take(): ah is None: '%s <%s>'\n" % (name, addr)) self[key_addr] = AddressHistory(name, addr, date, sent_to) else: + if addr.find("dcsops-dev@") != -1: + sys.stderr.write("DEBUG: take(): about to update(): '%s <%s>'\n" % (name, addr)) ah.update(name, addr, date, sent_to) reversed_unquoted_name_re = re.compile("([^, ]+), +([^, ]+)") -def absorb_message(msg, addresses, restricteds): +def absorb_message(msg, addresses, skip_regexps, restricteds): """File email.Message MSG into AddressBook ADDRESSES appropriately. + +SKIP_REGEXPS is a list of compiled regular expressions. 
Any address +that matches any of the regular expressions will be skipped, that is, +it will not be placed into ADDRESSES. + RESTRICTEDS is a nested dictionary whose keys are lower-cased raw email addresses and whose values are subdictionaries, with each subdictionary's keys being the lower-cased permissible names (the values @@ -391,6 +417,8 @@ bccs = msg.get_all('bcc', [ ]) raw_date = msg.get_all('date', None) for name, addr in email.Utils.getaddresses(froms + tos + ccs + bccs): + if addr.find("dcsops-dev@") != -1: + sys.stderr.write("DEBUG: found '%s <%s>'\n" % (name, addr)) # Certain special cases can be eliminated right out of the gate. if (name.find("via StreetEasy") >= 0 or addr.find("via StreetEasy") >= 0 # Anyone named Viagra has already changed their name by now, right? @@ -411,6 +439,8 @@ or addr.find("@unknown.email") >= 0 or addr.find("notify@twitter.com") >= 0 or addr.find("@postmaster.twitter.com") >= 0): + if addr.find("dcsops-dev@") != -1: + sys.stderr.write("DEBUG: continued here '%s <%s>'\n" % (name, addr)) continue # Clean up the name. name = \ @@ -468,9 +498,13 @@ sent_to = False for sender_addr in froms: if sender_addr.find("kfogel@") != -1: + if addr.find("dcsops-dev@") != -1: + sys.stderr.write("DEBUG: sent to '%s <%s>'\n" % (name, addr)) sent_to = True break # Okay, ready for prime time. 
+ if addr.find("dcsops-dev@") != -1: + sys.stderr.write("DEBUG: about to test restricteds: '%s <%s>'\n" % (name, addr)) if (restricteds.has_key(addr.lower()) and not restricteds[addr.lower()].has_key(name.lower())): sys.stderr.write("DEBUG: restricted out: '%s <%s>'\n" @@ -477,7 +511,12 @@ % (name, addr)) pass elif addr.find("@") != -1: - addresses.take(addr, name, date, sent_to) + if addr.find("dcsops-dev@") != -1: + sys.stderr.write("DEBUG: about to take: '%s <%s>'\n" % (name, addr)) + if not any(skip_re.match(addr) for skip_re in skip_regexps): + addresses.take(addr, name, date, sent_to) + else: + sys.stderr.write("DEBUG: skip_re match: '%s'\n" % addr) def main(): @@ -494,9 +533,14 @@ # See absorb_message() for the format of this dictionary. restricteds = {} + # List of compiled regular expressions. If an address matches any + # of these, it is skipped; see absorb_message() for details. + skip_regexps = [] + try: (opts, args) = getopt.getopt(sys.argv[1:], "", - [ "restricteds=", ]) + [ "restricteds=", + "skip-regexps=",]) except getopt.GetoptError, err: sys.stderr.write(str(err)) sys.stderr.write("\n") @@ -521,6 +565,11 @@ ignored_directories.append(optarg) elif opt in ("--ignore-contained"): ignored_if_containing.append(optarg) + elif opt in ("--skip-regexps"): + with open(optarg) as f: + skip_regexps = [re.compile(x) for x in f.readlines()] + for r in skip_regexps: + sys.stderr.write("DEBUG: skip regexp: '%s'\n" % r) if len(args) < 1: roots = (".",) @@ -535,7 +584,7 @@ if msg_start_re.match(line): if msg_str: msg = p.parsestr(msg_str) - absorb_message(msg, addresses, restricteds) + absorb_message(msg, addresses, skip_regexps, restricteds) msg_str = line else: msg_str += line @@ -543,7 +592,7 @@ # Polish off the last message. if msg_str: msg = p.parsestr(msg_str) - absorb_message(msg, addresses, restricteds) + absorb_message(msg, addresses, skip_regexps, restricteds) # Print out the elisp core. 
def elisp_addr(ah): """Return the Elisp expression for AddressHistory AH.""" Index: run-mailaprop.sh =================================================================== --- run-mailaprop.sh (revision 5638) +++ run-mailaprop.sh (working copy) @@ -60,8 +60,11 @@ MAILDIR=${HOME}/mail echo "Parsing mboxes..." - find ${MAILDIR} -type f -regex ".*/[0-9]+$\|.*outwent" -print \ - | xargs cat | ${MAILAPROP} >> ${OUTFILE_TMP} + find ${MAILDIR} -type f -regex ".*/[0-9]+$\|.*outwent" -print \ + | xargs cat \ + | ${MAILAPROP} \ + --skip-regexps ${HOME}/private/mailaprop/problem-addresses.txt \ + >> ${OUTFILE_TMP} echo "Done." fi