#!/usr/bin/python2
# This is -*- Python -*-

import string
import sys
import xml.sax
import xml.sax.handler

# Digit alphabet for the base-62 codes produced by encode_number().
encoding = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"


def encode_number(n):
    """Return the textual form of token code *n* used in the dumped stream.

    Codes 0 and 1 are written as the single characters "[" and "]".
    Every other code is written as exactly three base-62 digits taken from
    ``encoding``, most significant digit first.

    Raises ValueError when *n* is negative or needs more than three digits;
    such values used to wrap around silently and alias another code.
    """
    if n == 0:
        return "["
    if n == 1:
        return "]"

    base = len(encoding)
    if n < 0 or n >= base ** 3:
        raise ValueError("token code out of range: %r" % (n,))

    # Floor division is spelled out so the result is an integer index on
    # every Python version.
    low = n % base
    mid = (n // base) % base
    high = (n // (base * base)) % base
    return encoding[high] + encoding[mid] + encoding[low]


class GnoetryDoc:
    """Accumulates a token stream and writes it out as a "Gnoetry/0.1" file.

    The text is held as a list of integer codes.  Codes 0 and 1 are the
    start and stop markers; every distinct token string gets its own code,
    numbered from 2 upwards in order of first appearance.
    """

    _START = 0
    _STOP = 1
    # First code handed out to a real token.  The original table literal
    # used the same key for both markers, which collapsed it to a single
    # entry and gave the first real token the stop marker's code.
    _FIRST_TOKEN_CODE = 2
    # Names written next to the two reserved codes in the token table.
    # NOTE(review): both names are empty in the source; they may originally
    # have been distinct marker strings -- confirm against the file reader.
    _RESERVED_NAMES = ("", "")

    def __init__(self):
        self.__props = {"Format": "Gnoetry/0.1"}
        self.__table = {}   # token string -> code (real tokens only)
        self.__text = []    # sequence of codes, markers included

    def set_property(self, key, value):
        """Set a header property that dump() writes as "key: value"."""
        self.__props[key] = value

    def append_token(self, tok):
        """Append *tok* to the text, stripped and lower-cased.

        Empty tokens, including ones that are empty only after stripping,
        are ignored.  A progress line goes to stderr every 5000 entries.
        """
        if not tok:
            return
        tok = tok.strip().lower()
        if not tok:
            return

        next_code = len(self.__table) + self._FIRST_TOKEN_CODE
        code = self.__table.setdefault(tok, next_code)
        self.__text.append(code)

        count = len(self.__text)
        if count % 5000 == 0:
            sys.stderr.write("%d tokens...\n" % count)

    def append_start(self):
        """Append a start marker unless the text already ends with one."""
        if self.__text and self.__text[-1] == self._START:
            return
        self.__text.append(self._START)

    def append_stop(self):
        """Append a stop marker unless the text ends with a start or stop."""
        if self.__text and self.__text[-1] <= self._STOP:
            return
        self.__text.append(self._STOP)

    def dump(self):
        """Write the properties, the token table and the encoded text to stdout.

        A trailing start marker is removed from the stored text first.  The
        "Tokens" and "Length" properties are refreshed before being written.
        """
        write = sys.stdout.write

        # Drop any trailing start token (an empty document has none).
        if self.__text and self.__text[-1] == self._START:
            self.__text = self.__text[:-1]

        # Make sure props are up-to-date; the count includes both markers.
        self.__props["Tokens"] = len(self.__table) + self._FIRST_TOKEN_CODE
        self.__props["Length"] = len(self.__text)

        for key, val in self.__props.items():
            write("%s: %s\n" % (key, val))
        write("\n")

        # Token table, ordered by code; spaces inside a token become "_".
        table = [(self._START, self._RESERVED_NAMES[0]),
                 (self._STOP, self._RESERVED_NAMES[1])]
        for tok, code in self.__table.items():
            table.append((code, tok.replace(" ", "_")))
        table.sort(key=lambda entry: entry[0])
        for code, tok in table:
            write("%x %s\n" % (code, tok))

        # Encoded text: keep adding codes to a line until it is longer than
        # 76 characters, then start a new line.
        encoded_stream = [encode_number(code) for code in self.__text]
        total = len(encoded_stream)
        i = 0
        while i < total:
            linelen = 0
            step = 0
            while i + step < total and linelen <= 76:
                linelen += len(encoded_stream[i + step])
                step += 1
            write("".join(encoded_stream[i:i + step]) + "\n")
            i += step

    def stream(self):
        """Write the text to stdout as "<token>" words, one line per stop marker.

        Words on a line are separated by single spaces.  Start markers
        produce no output.  Words after the last stop marker are written
        as a final line.
        """
        reverse = {}
        for tok, code in self.__table.items():
            reverse[code] = tok

        words = []
        for code in self.__text:
            if code == self._STOP:
                sys.stdout.write(" ".join(words) + "\n")
                words = []
            elif code > self._STOP:
                words.append("<%s>" % reverse[code])
        if words:
            sys.stdout.write(" ".join(words) + "\n")


###############################################################################
# Words ending in "." that do not end a sentence.
abbrev_list = ("dr.", "mr.", "ms.", "mrs.", "m.")


def is_abbrev(word):
    """Return True if *word*, compared case-insensitively, is in abbrev_list."""
    return word.lower() in abbrev_list


def is_terminal(word):
    """Return True if *word* ends a sentence.

    A word is terminal when it ends in "?" or "!", or ends in "." and is
    not a known abbreviation.  An empty word is never terminal.
    """
    if not word:
        return False
    if word.endswith("."):
        return not is_abbrev(word)
    return word[-1] in ("?", "!")


class TextScanner:
    """Base scanner: splits chunks of text into sentences of tokens.

    The tokens are fed into a GnoetryDoc owned by the scanner.  Subclasses
    provide process(), which reads the input and calls process_chunk().
    """

    def __init__(self):
        self.__gdoc = GnoetryDoc()

    def get_document(self):
        """Return the document the scanner is filling."""
        return self.__gdoc

    def process_chunk(self, chunk):
        """Tokenise *chunk* on whitespace and append it to the document.

        Double quotes are removed and parentheses are turned into commas.
        A terminal token loses its last character and closes the current
        sentence.  Blank chunks are ignored.
        """
        chunk = chunk.strip()
        if not chunk:
            return

        chunk = chunk.replace('"', "")    # remove quotes
        chunk = chunk.replace(")", ", ")  # map parens to commas
        chunk = chunk.replace("(", ", ")  # map parens to commas

        doc = self.get_document()
        doc.append_start()
        for token in chunk.split():
            if is_terminal(token):
                # Drop the sentence-ending punctuation, then close the
                # sentence and open the next one.
                doc.append_token(token[:-1])
                doc.append_stop()
                doc.append_start()
            else:
                doc.append_token(token)
        doc.append_stop()

    def process(self):
        """Read the input; must be overridden by subclasses."""
        # An assert would be stripped under -O and silently do nothing.
        raise NotImplementedError("subclasses must implement process()")


class TextScanner_XML(TextScanner,
                      xml.sax.handler.ContentHandler,
                      xml.sax.handler.DTDHandler,
                      xml.sax.handler.EntityResolver,
                      xml.sax.handler.ErrorHandler):
    """Scanner that reads an XML file through SAX.

    Only character data that is inside both a <para> and a <bookbody>
    element is collected; each outermost <para> becomes one chunk.
    """

    def __init__(self, filename):
        TextScanner.__init__(self)
        self.__para = 0       # nesting depth of open <para> elements
        self.__bookbody = 0   # nesting depth of open <bookbody> elements
        self.__pending = []   # text pieces collected for the current para
        self.__filename = filename

        doc = self.get_document()
        doc.set_property("SourceFile", filename)
        doc.set_property("TextScanner", "XML")

    def startElement(self, name, attrs):
        """SAX callback: track entry into <para> and <bookbody>."""
        if name == "para":
            self.__para += 1
        elif name == "bookbody":
            self.__bookbody += 1

    def endElement(self, name):
        """SAX callback: flush the collected text when the outermost <para> closes."""
        if name == "para":
            self.__para -= 1
            if self.__para == 0:
                self.__process_pending()
        elif name == "bookbody":
            self.__bookbody -= 1

    def characters(self, content):
        """SAX callback: collect non-blank text found inside para and bookbody."""
        if self.__para > 0 and self.__bookbody > 0:
            # Content that is not already a native str is encoded to UTF-8.
            if not isinstance(content, str):
                content = content.encode("utf-8")
            line = content.strip()
            if line:
                self.__pending.append(line)

    def __process_pending(self):
        """Join the collected pieces with spaces and tokenise the result."""
        if not self.__pending:
            return

        line = " ".join(self.__pending)

        # clean up the results of &apos; entities
        line = line.replace("s ' ", "s' ")
        line = line.replace(" ' s ", "'s ")

        # NOTE: further clean-ups were present here but disabled: stripping
        # quote characters, mapping "--" and "---" to a " *mdash* " token,
        # and re-joining words hyphenated across lines.  The disabled code
        # noted these should probably use regexps.

        self.process_chunk(line)
        self.__pending = []

    def process(self):
        """Parse the XML file, feeding its text into the document."""
        parser = xml.sax.make_parser()
        parser.setContentHandler(self)
        parser.parse(self.__filename)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s FILE.xml\n" % sys.argv[0])
        sys.exit(1)

    scanner = TextScanner_XML(sys.argv[1])
    scanner.process()
    scanner.get_document().dump()