#!/usr/bin/env python

'''
Usage:
   $ scanley.py index  INDEX_DIRECTORY TARGET1 [TARGET2 ...]
   $ scanley.py search INDEX_DIRECTORY SEARCH_STRING1 [SEARCH_STRING2 ...]

Todo: specify output format.

Oh yeah, and, implement Scanley :-).
'''

# See class 'Index' for the index layout.
#
# =====================
#  How to do a search:
# =====================
#
# The algorithm for finding the phrase "fish food for free" is:
#
#    1. Find intersection of FIDS for all words in the phrase.
#    2. Take the posn lists for "fish" and "food", save all the ones
#       where "food" has a BYTE_POS equal to one of next byte posns
#       for "fish", discard all the rest.
#    3. Shift -- do the same with "food"->"for", then "for"->"free".
#
# The FIDS still standing contain the phrase, and the byte_posn of
# "fish" in each FID's list is the start of the phrase in that file.
#
# Rank the results by number of times the phrase occurs in that file,
# of course.  This will require some more tuple-catching, TBD.
#
# -------------------------------------------------------------------
# Idea for allowing colons, hyphens, at-signs, etc: just record them
# like words in the POSNS table.  When they appear in the search
# string, e.g., "jrandom@foo.com", then make sure that
#
#           - "@" starts at BYTE_POS("jrandom") + len("jrandom")
#           - "foo" starts at BYTE_POS("jrandom") + len("jrandom") + 1
#           - "." starts at BYTE_POS("foo") + len("foo")
#           - "com" starts at BYTE_POS("foo") + len("foo") + 1
#
# That way, search strings that don't include the magic chars won't
# fail to include results that they probably ought to include, but
# search strings that *do* have them won't get any false positives.
# -------------------------------------------------------------------


import os
import re
import sys
import stat
import string
import marshal
import getopt

# Make sure this Python is recent enough.
if sys.hexversion < 0x2000000:
  sys.stderr.write("'%s: Python 2.0 or higher required, "
                   "see www.python.org.\n" % error_prefix)
  sys.exit(1)

# DBM module selection

# 1. If we have bsddb3, it is probably newer than bsddb. Fake bsddb = bsddb3,
#    so that the dbhash module used by anydbm will use bsddb3.
try:
  import bsddb3
  sys.modules['bsddb'] = sys.modules['bsddb3']
except ImportError:
  pass

# 2. These DBM modules are not good enough.
import anydbm
if (anydbm._defaultmod.__name__ == 'dumbdbm'
    or anydbm._defaultmod.__name__ == 'dbm'):
  print 'ERROR: your installation of Python does not contain a suitable'
  print '  DBM module. This script cannot continue.'
  print '  to solve: see http://python.org/doc/current/lib/module-anydbm.html'
  print '  for details.'
  sys.exit(1)

# 3. If we are using the old bsddb185 module, then try prefer gdbm instead.
#    Unfortunately, gdbm appears not to be trouble free, either.
if hasattr(anydbm._defaultmod, 'bsddb') \
    and not hasattr(anydbm._defaultmod.bsddb, '__version__'):
  try:
    gdbm = __import__('gdbm')
  except ImportError:
    sys.stderr.write(warning_prefix +
        ': The version of the bsddb module found on your\n'
        'computer has been reported to malfunction on some datasets,\n'
        'causing KeyError exceptions.  You may wish to upgrade your Python\n'
        'to version 2.3 or later.\n')
  else:
    anydbm._defaultmod = gdbm


# Warnings and errors start with these strings.  They are typically
# followed by a colon and a space, as in "%s: " ==> "WARNING: ".
warning_prefix = "WARNING"
error_prefix = "ERROR"


## Database abstractions. ##

# Always use these constants for opening databases.
DB_OPEN_READ = 'r'
DB_OPEN_NEW = 'n'

# A wrapper for anydbm that uses the marshal module to store items as
# strings.  From http://svn.collab.net/repos/cvs2svn/trunk/cvs2vsn.
class Database:
  def __init__(self, filename, mode):
    self.filename = filename
    # pybsddb3 has a bug which prevents it from working with
    # Berkeley DB 4.2 if you open the db with 'n' ("new").  This
    # causes the DB_TRUNCATE flag to be passed, which is disallowed
    # for databases protected by lock and transaction support
    # (bsddb databases use locking from bsddb version 4.2.4 onwards).
    #
    # Therefore, manually perform the removal (we can do this, because
    # we know that for bsddb - but *not* anydbm in general - the database
    # consists of one file with the name we specify, rather than several
    # based on that name).
    if mode == 'n' and anydbm._defaultmod.__name__ == 'dbhash':
      if os.path.isfile(filename):
        os.unlink(filename)
      mode = 'c'

    self.db = anydbm.open(filename, mode)

  def has_key(self, key):
    return self.db.has_key(key)

  def __getitem__(self, key):
    return marshal.loads(self.db[key])

  def __setitem__(self, key, value):
    self.db[key] = marshal.dumps(value)

  def __delitem__(self, key):
    del self.db[key]

  def get(self, key, default):
    if self.has_key(key):
      return self.__getitem__(key)
    return default

  def len(self):
    return len(self.db)

  def represent(self):
    print "Representing '%s':" % self.filename
    for key in self.db.keys():
      print "   '%s' ==>" % key, marshal.loads(self.db[key])
      print "\n"
    print ""


## Text scanning. ##

class ScannerNoWords:
  """Exception class for when Scanner() finds no words in the stream."""
  pass

class Scanner:
  """Scan a search string or a text stream."""
  def __init__(self, fp=None):
    """Return a new Scanner.  If FP is not None, it is the stream
    which is to be used by successive calls to self.inhale()."""

    # How we decide what's a word and what's not.
    self.word_re = re.compile("[a-zA-Z0-9]")

    # State-saving variables for self.scan().
    self.scan_started = 0
    self.fp = fp
    self.last_ch = ""

  def is_word_char(self, ch):
    """Return true if CH is a word character, false otherwise."""
    if self.word_re.match(ch):
      return 1
    else:
      return 0

  def divide(self, str):
    """Return a list of all the words and all the single-character
    non-whitespace word delimiters in STR.  The latter is any single
    non-whitespace character that is the only thing dividing two
    words.  The elements of the returned list are in the same order
    they are in STR.  For example

       'Fish food is what J. Random and J.S. Bach eat.'

    would return

       ['Fish', 'food', 'is', 'what', 'J', 'Random', 'and',
        'J', '.', 'S', 'Bach', 'eat']'

    ### TODO: It might be more useful to also include single
    ### non-whitespace chars that occur immediately after a word (like
    ### the '.' after 'S' above).  But then again, perhaps not:
    ### if the search string parsing omits them, and the indices omit
    ### them likewise, then a search for 'J.S. Bach' would correctly
    ### find all the positives, it just might, in some weird edge
    ### cases, get some false negatives.
    """
    this_str = ""
    sep_str  = ""
    final_list = [ ]
    for ch in str:
      if self.is_word_char(ch):
        if len(sep_str) == 1 and not sep_str.isspace():
          final_list.append(sep_str)
        sep_str = ""
        this_str = this_str + ch
      else:
        if this_str:
          final_list.append(this_str)
          this_str = ""
        sep_str = sep_str + ch
    if this_str and sep_str:
      sys.stderr.write(error_prefix, "divide('%s')" % str)
      sys.exit(1)
    elif this_str:
      final_list.append(this_str)
    return final_list

  def scan(self):
    """Return a tuple (WORD, OFFSET, SEP_STRING, NEXT_OFFSET).

         - WORD is the next word in the self.fp stream
         - OFFSET is WORD's offset
         - SEP_STRING is everything from the end of WORD to NEXT_OFFSET
         - NEXT_OFFSET is the offset of the next word after WORD

     If this is the first word in the file, then SEP_STRING is "".
     If this is the last word in the file, then NEXT_OFFSET is -1.
     If there are no words in the stream, throw ScannerNoWords.
     """
    word = None
    sep_string = ""
    if not self.scan_started:
      # Start out assuming we're not in a word.
      self.scan_started = 1
      word_start_offset = None
      while 1:
        ch = self.fp.read(1)
        if not ch:
          break
        elif self.is_word_char(ch):
          word = ch
          break
    else:  # Just pick up where we left off.
      word = self.last_ch

    # One way or another, we're in a word now.
    word_start_offset = self.fp.tell() - 1
    while 1:
      ch = self.fp.read(1)
      if not self.is_word_char(ch):
        word_end_offset = self.fp.tell() - 1
        sep_string = ch
        break
      else:
        word = word + ch

    # Now we're out of the word, read till start of next word.
    while 1:
      ch = self.fp.read(1)
      if not ch:
        next_word_offset = -1
        break
      elif self.is_word_char(ch):
        next_word_offset = self.fp.tell() - 1
        self.last_ch = ch
        break
      else:
        sep_string = sep_string + ch
      
    if word is None:
      raise ScannerNoWords

    return (word, word_start_offset, sep_string, next_word_offset)
  

## Index abstraction. ##

class Index:
  """A Scanley index."""
  def __init__(self, index_dir):
    # The directory where all the index files live.
    self.index_dir = index_dir

    # The file that identifies this directory as a Scanley index dir.
    self.fmt_file = os.path.join(index_dir, "scanley.fmt")

    db_flag = 'INVALID'  # DB_OPEN_READ | DB_OPEN_NEW

    if os.path.exists(index_dir):
      if not os.path.isdir(index_dir):
        sys.stderr.write("%s: '%s' exists but is not a directory\n" \
                         % (error_prefix, index_dir))
        sys.exit(1)
      else:  # it is a directory
        if os.path.exists(self.fmt_file):
          # TODO: Someday, check the Scanley index format number or
          # something.   For now, the existence of the file is good enough
          # TODO: initialize databases according to the directory.
          db_flag = DB_OPEN_READ
        elif len(os.listdir(index_dir)) > 0:
          sys.stderr.write("%s: '%s' exists but not owned by Scanley\n" \
                           % (error_prefix, index_dir))
          sys.exit(1)
        else:
          self.own()
          db_flag = DB_OPEN_NEW
    else:  # the dir does not exist, so create it
      os.mkdir(index_dir)
      self.own()
      db_flag = DB_OPEN_NEW

    # Open all the databases.

    # 'words' table:  WORD ==> [FID, FID, FID, ...]
    #
    # Map (case-flattened) words to File IDs.  Each FID appears only
    # once, no matter how many times the word appears in that file.
    self.words_db = Database(os.path.join(index_dir, "words.db"), db_flag)

    # 'posns' table:   FID:WORD ==> [BYTE_POS, BYTE_POS, ...]
    #
    # List all the offsets at which WORD is found in file FID.
    self.posns_db = Database(os.path.join(index_dir, "posns.db"), db_flag)

    # 'nexts' table:  FID:WORD ==> [BYTE_POS, BYTE_POS, ...]
    #
    # List the offset of the *next* word after the corresponding
    # occurence of WORD in file FID.  The correspondence is with
    # FID:WORD's list in the 'words' table.  Therefore, the two lists
    # always have the same number of elements, except when a word is
    # the last word in the file, in which case the offset of the next
    # word is -1.
    self.nexts_db = Database(os.path.join(index_dir, "nexts.db"), db_flag)

    # 'fids' table:  FID  ==> PATH
    # 
    # Map FID numbers to actual file paths.  The special key "next"
    # always maps to the next available FID.  PATHs are as in the
    # 'files' table.
    self.fids_db = Database(os.path.join(index_dir, "fids.db"), db_flag)
    if not self.fids_db.has_key("next"):
      self.fids_db["next"] = '%x' % 1L  # no "next" means it's ok to write

    # 'files' table:  PATH ==> [FID, MODTIME, SIZE, MD5SUM]
    #
    # Map file PATHs to information about the files.  The PATHs are
    # not necessarily absolute, they are just whatever the caller
    # used; thus, it is the caller's decision whether or not the
    # target data shall be relocatable.
    #
    # TODO: md5sum is currently omitted.  It will be implemented when
    # we actually read the file contents; just don't want to read over
    # them twice.
    self.files_db = Database(os.path.join(index_dir, "files.db"), db_flag)

  def own(self):
    """Mark the index dir as being owned by Scanley, by creating
    the format file."""
    idf = open(self.fmt_file, "w")
    idf.write("0\nThis directory holds a Scanley index.\n")
    idf.close()

  def next_fid(self):
    """Return the next available FID and bump."""
    this = self.fids_db["next"]
    self.fids_db["next"] = '%x' % (int(this, 16) + 1)
    return this

  def inhale(self, path):
    """Index file PATH."""
    if self.files_db.has_key(path):
      # TODO: this won't always be an error of course, but for now we
      # don't know how to rebuild indexes.
      sys.stderr.write("%s: '%s' is already indexed\n" % (error_prefix, path))
      sys.exit(1)
    st = os.stat(path)
    fid = self.next_fid()
    self.files_db[path] = [fid, st[stat.ST_SIZE], st[stat.ST_MTIME]]
    self.fids_db[fid] = path

    # Scan the file, filling in the other tables.
    seen_words = { }
    scanner = Scanner(file(path))
    while 1:
      try:
        word, offset, sep_string, next_offset = scanner.scan()
      except (ScannerNoWords):
        return
      word = string.lower(word)
      # Record the FID in the words table.
      if not seen_words.has_key(word):
        if self.words_db.has_key(word):
          fids = self.words_db[word]
        else:
          fids = [ ]
        fids.append(fid)
        self.words_db[word] = fids
      # Record the word's offset in the posns and nexts table.
      key = "%s:%s" % (fid, word)
      if not seen_words.has_key(word):
        self.posns_db[key] = [offset]
        self.nexts_db[key] = [next_offset]
      else:
        posns = self.posns_db[key]
        posns.append(offset)
        self.posns_db[key] = posns
        nexts = self.nexts_db[key]
        nexts.append(next_offset)
        self.nexts_db[key] = nexts

      seen_words[word] = 1
      if next_offset == -1:
        break
      
    ### TODO: We can use the seen_words dictionary to implement
    ### incremental index updates.  With the current scheme, an
    ### incremental update would need to be able to update all entries
    ### that involve a given file.  To find those entries, we need to
    ### know all the words that were in the file as of the last time
    ### it was indexed.  Fortunately, at this point, seen_words
    ### contains exactly those words!  If we save them in a table
    ### mapping FID ==> (WORD, WORD, WORD ...), then we can get them
    ### when we need to updat.  (The values in that table would be
    ### huge, of course, but the table wouldn't be used for searches,
    ### so that doesn't really matter.)
    ###
    ### To save space, we could store FID ==> BLOOM_FILTER, then
    ### iterate over the keys in the 'words' table, checking against
    ### the Bloom Filter to see if the word is probably in the file,
    ### then fetch the word's value (i.e., list of FIDs) to confirm or
    ### deny.

    # TODO: for debugging:
    if 0:
      for db in (self.words_db, self.posns_db, self.nexts_db,
                 self.files_db, self.fids_db):
        db.represent()

  def _intersection(self, lists):
    """Return, as a list, the intersection of list of lists LIST.

    Each sublist of LIST is a list of elements whose presence can be
    queried with the 'in' operator.  The return value is a new list
    of all the elements common to all of the original sublists,
    with no ordering guarantee.

    Neither LISTS nor any sublist thereof may be empty.
    If there is no intersection, return the empty list."""
    ret_list = [ ]
    start_set = lists[0]
    rest_of = lists[1:]
    for elt in start_set:
      so_far_so_good = 1
      for sublist in rest_of:
        if not elt in sublist:
          so_far_so_good = 0
      if so_far_so_good:
        ret_list.append(elt)
    return ret_list
    

  def exhale(self, search_data):
    """Perform a search and return the results as a list.

    The input SEARCH_DATA is a list of strings, either words or single
    non-whitespace characters, of the form returned by Scanner.divide().

    The output is a list of lists: [[file, offset, ...], ...], where
    each file is a path relative to the data root for this Index, and
    each offset (there can be more than one per path) is an offset
    where the desired string starts in that file."""
    scanner = Scanner()
    fids = [ ]
    for datum in search_data:
      # Each datum is either a word or a single, non-whitespace char.
      # We ignore the chars for now, as we don't yet have the code to
      # include them in searches.
      if len(datum) == 1 and not scanner.is_word_char(datum):
        pass
      else:
        fids.append(self.words_db[datum])
    good_fids = self._intersection(fids)

    ret = [ ]
    for fid in good_fids:
      fid_ret = [self.fids_db[fid]]
      start = search_data[0]
      key = fid + ':' + start
      # Start out full, then filter out the ones we disqualify.
      initial_start_offsets = self.posns_db[key]
      initial_next_offsets  = self.nexts_db[key]

      if not len(initial_start_offsets) == len(initial_next_offsets):
        sys.stderr.write(error_prefix + ": bad index?\n")
        sys.exit(1)

      # Chase a trail starting from each of the initial start
      # offsets.  If we make it to the end of the search_data, then
      # it's a hit, otherwise it's a miss.
      for i in range(0, len(initial_start_offsets)):
        disqualified = 0
        last_next = initial_next_offsets[i]
        for datum in search_data[1:]:
          if len(datum) == 1 and not scanner.is_word_char(datum):
            pass
          else:
            key = fid + ':' + datum
            these_posns = self.posns_db[key]
            these_nexts = self.nexts_db[key]
            local_hit = 0
            for j in range(0, len(these_posns)):
              if these_posns[i] == last_next:
                local_hit = 1
                last_next = these_nexts[j]
            if local_hit == 0:
              disqualified = 1
              break

        if not disqualified:
          fid_ret.append(initial_start_offsets[i])

      if len(fid_ret) > 1:
        ret.append(fid_ret)

    return ret


def index(index_directory, targets, ignores, idx=None, verbose=0):
  """Index every file in TARGETS, record the results in INDEX_DIRECTORY.

  TARGETS is a list of file and directory paths; directories are
  indexed by indexing every file under them, recursively.

  If INDEX_DIRECTORY does not exist, create it.  If it does exist, it
  must be either empty or already a Scanley index.

  IGNORES is a list of compiled regular expression objects.  Any file
  matching one of the expressions will not be indexed, and any
  matching directory will not be descended into.

  IDX is used only by recursive calls.  When IDX is not None, it is
  the Index() object created by the root caller and passed down
  through all recursions."""
  if idx is None:
    idx = Index(index_directory)

  for target in targets:

    # Arrgh, find a way to do this with [x for x in blah] syntax, this
    # is ridiculous.
    ignore_this = None
    for pat in ignores:
      if pat.search(target):
        ignore_this = 1

    if not ignore_this:
      if os.path.isdir(target):
        for entry in os.listdir(target):
          index(index_directory, [os.path.join(target, entry)],
                ignores, idx, verbose)
      else:
        if verbose:
          print "Inhaling '%s'..." % target,
          sys.stdout.flush()
        idx.inhale(target)
        if verbose:
          print "done."
          sys.stdout.flush()


## Searching. ##

def search(index_dir, search_strings):
  """Return a list of search results, the results of searching for
  SEARCH_STRINGS in INDEX_DIRECTORY.  Each result is a sublist of the
  form [[file, offset, ...], ...], with the same semantics as the
  return value of Index.exhale().

  TODO: For now, print the raw results as well."""
  idx = Index(index_dir)
  results = [ ]

  # Gather the results.
  scanner = Scanner()
  for str in search_strings:
    compiled_form = scanner.divide(str)
    result = idx.exhale(compiled_form)
    if result:
      results.append(result)
  
  # Present the results.
  for result in results:
    ### TODO: Need to present the results nicely, of course.
    print "Search result:"
    print "   ==> ", result
    print ""

  return results


## Invocation. ##

def main():
  command = ""
  ignores = [ ]
  verbose = 0

  try:
    opts, args = getopt.getopt(sys.argv[1:], 'I:v', ["ignore=", "verbose"])
  except getopt.GetoptError, e:
    sys.stderr.write(error_prefix + ': ' + str(e) + '\n\n')
    sys.exit(1)
  for opt, value in opts:
    if (opt == '--ignore') or (opt == '-I'):
      ignores.append(value)
    elif (opt == '--verbose') or (opt == '-v'):
      verbose = 1

  command = args[0]
  args = args[1:]
    
  if command == "index":
    index(args[0], args[1:], [re.compile(x) for x in ignores], None, verbose)
  elif command == "search":
    search(args[0], args[1:])
  else:
    sys.stderr.write(error_prefix + ": unknown command '%s'\n" % command)
    sys.exit(1)


if __name__ == '__main__':
  main()