#!/usr/bin/env python

import re
import sys
import string
import urllib2
import MySQLdb
import _mysql_exceptions


# See http://www.shearersoftware.com/software/developers/htmlfilter/\
# HTMLFilter.py.txt for the original source of this.
def HTMLDecode(value):
  """Decoder for strings containing HTML entities. ("&gt;" becomes ">", etc.)
    
  Parses numeric entities in hex and decimal, as well as all entities
  listed in Python's standard htmlentitydefs module.
  
  On a Unicode-capable Python, the input and output are assumed to be
  Unicode strings.  (This is because Python complains when
  concatenating Unicode strings with non-Unicode strings containing
  characters outside 7-bit ASCII. Numeric entities are converted to
  Unicode characters, so the combination of those and 8-bit input
  strings would result in an error.)
  
  CHECK: should newlines & whitespace be collapsed?  This would reduce
  the fidelity of attribute values--bad for form element preset
  values, where browsers tend to respect whitespace.
  
  """
  entityStart = string.find(value, '&')
  if entityStart != -1: # only run bulk of code if entities are present
    # Ideally we'd grab the charset from the mail headers too and use
    # it as a guide here.
    preferUnicodeToISO8859 = 1 # (outputEncoding is not 'iso-8859-1')
    prevOffset = 0
    valueParts = []
    import htmlentitydefs
    while entityStart != -1:

      valueParts.append(value[prevOffset:entityStart])
      entityEnd = string.find(value, ';', entityStart+1)

      if entityEnd == -1:
        entityEnd = entityStart
        entity = '&'
      else:
        entity = value[entityStart:entityEnd+1]
        if len(entity) < 4 or entity[1] != '#':
          entity = htmlentitydefs.entitydefs.get(entity[1:-1],entity)
        if len(entity) == 1:
          if (preferUnicodeToISO8859
              and ord(entity) > 127
              and hasattr(entity, 'decode')):
            entity = entity.decode('iso-8859-1')
        else:
          if len(entity) >= 4 and entity[1] == '#':
            if entity[2] in ('X','x'):
              entityCode = int(entity[3:-1], 16)
            else:
              entityCode = int(entity[2:-1])
            if entityCode > 255:
              entity = unichr(entityCode)
            else:
              entity = chr(entityCode)
              if preferUnicodeToISO8859 and hasattr(entity, 'decode'):
                entity = entity.decode('iso-8859-1')
        valueParts.append(entity)
      prevOffset = entityEnd+1
      entityStart = string.find(value, '&', prevOffset)
    valueParts.append(value[prevOffset:])
    value = string.join(valueParts, '')
  return value


class Database:
  def __init__(self, db, user, password):
    self.conn = MySQLdb.connect(user=user, passwd=password, db=db)

  def do_query(self, query):
    """Run QUERY and return the cursor afterwards."""
    cursor = self.conn.cursor()
    cursor.execute(query)
    return cursor;

def sql_stringify(str):
  """Make a string SQL-safe, and enclose it in single quotes."""
  # TODO: What, besides escaping single-quotes, do we need to do here?
  return "'" + str.replace("'", "''") + "'"


class MessageInBrief:
  """A summary of an email message.  A summary is just some headers,
  a message url, and a thread url.  The only headers we care about are
  'From', 'Date', 'Subject', and 'Message-ID'."""

  def __init__(self, db, headers, msg_url, msg_id,
               thread_url=None, thread_id=None):
    """DB is the database in which to store messages.
    HEADERS is a dictionary mapping header names to string values.
    MSG_URL is whence this message may be retrieved.
    MSG_ID is the Message-ID (i.e., cleaned header value) of this message.
    THREAD_URL is a url for this message's thread, and THREAD_ID is a
    unique identifying string (usually not a url) for that thread.
    """
    self.db = db
    self.headers = headers
    self.date = 0   # The "Date: " header, converted to an SQL date
    self.msg_url = msg_url
    self.msg_id = msg_id
    self.thread_url = thread_url
    self.thread_id = thread_id

  def sql_header_convert(self, str):
    """Convert a raw mail header name into a msgfind SQL column name."""
    return str.replace("-", "_") + "_header"

  def store(self):
    """Store this message in the database."""
    q = 'insert into messages '
    q += '(message_url, message_id, thread_url, thread_id, '
    q += ', '.join([self.sql_header_convert(x) for x in self.headers.keys()])
    if self.thread_url:
      q += ") VALUES ('%s', '%s', '%s', '%s', " \
           % (self.msg_url, self.msg_id, self.thread_url, self.thread_id)
    else:
      q += ") VALUES ('%s', '%s', NULL, " % (self.msg_url, self.msg_id)
    q += ', '.join([sql_stringify(x) for x in self.headers.values()])
    q += ');'
    self.db.do_query(q).close()

  def __str__(self):
    ret = self.msg_url + "\n"
    if self.thread_url:
      ret += self.thread_url + " (%s)\n" % self.thread_id
    for header_name in self.headers.keys():
      ret += header_name.capitalize() + ": " + self.headers[header_name] + "\n"
    ret += "Message-ID: " + self.msg_id + "\n"
    return ret + "\n"


# A beginning of an attempt to abstract mailing lists.  This is still
# quite specific to tigris.org lists, but we'll abstract more as we
# grow the ability to handle more lists.
class MailingList:

  raw_mode_suffix = "&raw=true"

  # A dictionary mapping downcased header names to regular expression
  # objects that match those headers.
  # FIXME: These should be parsed out of create-msgfind.sql, really.
  header_match_flags = re.MULTILINE | re.IGNORECASE
  desired_headers = {
    "from"       : re.compile("^from: (.*)",       header_match_flags),
    "to"         : re.compile("^to: (.*)",         header_match_flags),
    "cc"         : re.compile("^cc: (.*)",         header_match_flags),
    "subject"    : re.compile("^subject: (.*)",    header_match_flags),
    "date"       : re.compile("^date: (.*)",       header_match_flags),
    "reply-to"   : re.compile("^reply-to: (.*)",   header_match_flags),
    "message-id" : re.compile("^message-id: (.*)", header_match_flags),
    }

  # Regular expression used to extract a thread ID from a thread URL.
  thread_id_re = re.compile(".*=([0-9]+)$")

  # A thread URL line looks like this:
  #    <a href="http://subversion.tigris.org/servlets/BrowseList?\
  #    list=dev&amp;by=thread&amp;from=388915">Browse this thread</a>
  thread_url_line_re = re.compile('.*<a href="([^"]+)">'
                                  'Browse this thread</a>.*')
  
  def __init__(self, name, db):
    """NAME is the name of the mailing list, e.g., 'foo' in 'foo@bar'.
    DB is the database where message records and the counter live."""
    self.name = name
    self.msg_url_template = "http://subversion.tigris.org/servlets/" \
                            "ReadMsg?list=%s&msgNo=%%d" % name
    self.db = db
    self.next_message_number = -1   # uninitialized until read from db

  def store_next_message_number(self):
    """Store the current next message number to the DB.  If the next
    message number is 0, initialize the counter."""
    if self.next_message_number == 0:
      q = "insert into mailing_lists "                    \
          + "(name, next_message_number) "  \
          + "VALUES "                               \
          + "('%s', %d);" % (self.name, self.next_message_number)
    else:
      q = "update mailing_lists set "                                \
          + "next_message_number=%d where name='%s';"  \
          % (self.next_message_number, self.name)
    self.db.do_query(q).close()
    
  def _next_message_number(self):
    "Return the next message number for this mailing list."
    if self.next_message_number == -1:
      q = "select next_message_number from mailing_lists where " \
          "name = '%s';" % self.name
      counter_cursor = self.db.do_query(q)
      if counter_cursor.rowcount == 0:
        self.next_message_number = 0
        self.store_next_message_number()
      else:
        row = counter_cursor.fetchone()
        self.next_message_number = row[0]
      counter_cursor.close()
    # Okay, we're now in sync with the DB, no matter what.  Return.
    n = self.next_message_number
    self.next_message_number += 1
    return n

  def next_message(self):
    """Store and return a MessageInBrief object for the next message."""
    while True:
      next_message_number = self._next_message_number()
      msg_url = self.msg_url_template % next_message_number
      msg_url_raw = msg_url + self.raw_mode_suffix
      raw_page = urllib2.urlopen(msg_url_raw)
      raw_page = raw_page.read()
      msg_id = None
      thread_url = None
      thread_id = None

      # Grab the thread URL/ID.
      m = self.thread_url_line_re.search(raw_page)
      if m:
        thread_url = m.group(1)
        m = self.thread_id_re.match(thread_url)
        if m: thread_id = m.group(1)

      # Grab the message headers.
      # ### TODO: Doing one-line matches means we don't grab
      # multi-line header values properly.  That needs to be fixed.
      found_headers = { }
      for header_name in self.desired_headers.keys():
        matcher = self.desired_headers[header_name]
        m = matcher.search(raw_page)
        if m:
          found_headers[header_name] = HTMLDecode(m.group(1))
          # For one particular header, Message-ID, we do some data
          # cleaning.  Since we have a principle that any database
          # column whose name ends in "_header" contains exactly the
          # value found in that header, we store msg_id in its own
          # variable, which eventually, after being passed around a
          # bit, is manually mapped to its own column.
          if header_name == "message-id":
            s = found_headers[header_name]
            if s.find("<") >= 0:
              msg_id = s[s.find("<") + 1:s.find(">")]
              del found_headers[header_name]

      if msg_id is None:
        # We can't handle a message with no ID, so just skip it.
        continue

      if (len(found_headers) > 0):
        mb = MessageInBrief(self.db, found_headers,
                            msg_url, msg_id, thread_url, thread_id)
        # Update the message counter for this list.
        mb.store()
        # ### TODO: atomicity concern between storing message and
        # storing incremented message number.  They should happen
        # together in one database transaction, but currently don't.
        # Another way to solve it would be to make sure never to store
        # a record for the same Message-ID twice.
        self.store_next_message_number()
        return mb
      else:
        # ### TODO: This conditional will make development go faster,
        # but I don't think it needs to stay in production.
        # Ordinarily, we can just sleep unconditionally.
        if self.next_message_number > 105930:
          sys.sleep(5)

# ### TODO: Get this stuff from a conf file.
db = Database("msgfind", "msgfindrw", "SECRET")
mailing_list = MailingList('dev', db)

while True:
  mb = mailing_list.next_message()
  print mb
