#!/usr/bin/python
import sys, os, string, re, tempfile, os.path

# Fix anchor names in cvsbook.html to be compatible with all browsers,
# by using underscores in place of spaces or "%20" codes.
#
# Author: Dave Makower <davemak@shootinggallery.com>.
# 
# (Thanks, Dave, first for noticing the problem, and then writing this
# script to solve it!  -kff)


##################################
# Initialize
##################################

# Precompiled patterns (raw strings, so backslashes reach the regex engine
# unchanged).

# A %XX hex escape, or any single character outside [A-Za-z0-9._-].
FIX_NAME_REGEX = re.compile(r'(%[0-9A-Fa-f][0-9A-Fa-f])|([^A-Za-z0-9\.\-_])')

# An <a ...> tag carrying a name="..." attribute; group 1 is the name value.
FIND_NAME_REGEX = re.compile(r'<a\s+[^>]*name="([^"]*)"[^>]*>')

# An <a ...> tag with name="..." or href="#...", split into five groups:
# tag opener, preceding attributes (optional), attribute prefix, the anchor
# name itself, and the remainder of the tag.
REPLACE_NAME_REGEX = re.compile(
    r'(<a\s+)([^>]*\s)?(name="|href="#)([^"]*)("[^>]*>)')

# Original anchor name -> replacement name.  Pre-seeded with one entry
# (ugly hack): the name '(dir)' maps to '_top'.
old2new = {'(dir)': '_top'}

# Replacement name -> original anchor name; starts out empty.
new2old = {}

##################################
# replace all bad chars with underscore
##################################
def fixName(name):
    """Return NAME with every match of FIX_NAME_REGEX replaced by '_'."""
    return re.sub(FIX_NAME_REGEX, '_', name)

##################################
# ensure uniqueness of names
##################################
def makeUnique(name):
    """Return a variant of NAME that is not yet a key of new2old.

    NAME itself is returned when it is unused; otherwise the smallest
    positive integer N is appended such that NAME + str(N) is unused.
    The result is not recorded here; the caller registers it in new2old.
    """
    counter = 0
    retval = name
    # `in` and str() replace dict.has_key() and backtick-repr, both of which
    # were removed in Python 3; the result is the same on Python 2.
    while retval in new2old:
        counter = counter + 1
        retval = name + str(counter)
    return retval

##################################
# called from within regex substitution
##################################
def replaceNames(matchObj):
    """Rebuild one matched <a ...> tag with its anchor name replaced.

    MATCHOBJ is a match of REPLACE_NAME_REGEX, whose five groups are the
    tag opener, any preceding attributes, the attribute prefix (name=" or
    href="#), the anchor name, and the rest of the tag.  Groups that did
    not participate in the match are treated as empty strings.

    The replacement name is looked up in old2new.  A name with no entry
    there (e.g. an href pointing at an anchor never seen in the first
    pass) is sanitized with fixName() instead of raising KeyError.
    """
    opener, otherAttrs, attrPrefix, oldName, rest = matchObj.groups('')
    try:
        newName = old2new[oldName]
    except KeyError:
        # Unknown target: still emit a browser-safe name rather than abort.
        newName = fixName(oldName)
    return opener + otherAttrs + attrPrefix + newName + rest

##################################
# main method, essentially
##################################

# Usage: script [INFILE|-] [OUTFILE]
# With no arguments (or '-' as INFILE) input is read from stdin; without
# OUTFILE the result is written to stdout.
src = sys.stdin
dest = sys.stdout

try:
    # Read-only mode suffices: the input is never written through this
    # handle, so no write permission on INFILE should be required.
    if len(sys.argv) > 1 and sys.argv[1] != '-':
        src = open(sys.argv[1], 'r')

    # Keep the input in memory for both passes.  readlines() loads all of it
    # anyway, so no temporary file (and no race-prone tempfile.mktemp()
    # name) is needed to re-read input that came from stdin.
    lines = src.readlines()

    # Release the input before the output is opened, so that the same path
    # may be given as both INFILE and OUTFILE.
    if src is not sys.stdin:
        src.close()

    # first pass: gather all anchor names and prepare replacements
    for line in lines:
        for name in FIND_NAME_REGEX.findall(line):
            newName = makeUnique(fixName(name))
            old2new[name] = newName
            new2old[newName] = name

    # open output (or use stdout)
    if len(sys.argv) > 2:
        dest = open(sys.argv[2], "w")

    # second pass: replace all occurrences of <a href="#..."> and
    # <a name="...">
    for line in lines:
        dest.write(REPLACE_NAME_REGEX.sub(replaceNames, line))

finally:
    # Close only what this script opened; never close stdin/stdout.
    if (src is not sys.stdin) and not src.closed:
        src.close()

    if (dest is not sys.stdout) and not dest.closed:
        dest.close()
