/* This is -*- C -*- */
/* vim: set sw=2: */
/* $Id$ */

/*
 * text.c
 *
 * Copyright (C) 2003 The Free Software Foundation, Inc.
 *
 */

/*
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA.
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include "text.h"

#include <stdio.h>

static gboolean
line_is_empty (const gchar *str)
{
  while (*str) {
    if (!isspace ((gint) *str) && *str != '\n')
      return FALSE;
    ++str;
  }
  return TRUE;
}

/*
 * Read the "Key: value" header found at the top of text->filename and
 * copy the recognised fields into TEXT.
 *
 * The header ends at the first line that is empty (whitespace only) or at
 * end of file.  Keys are matched case-insensitively.  Recognised keys are
 * Format, Title, Author, AuthorSort, Source, Copyright, Lingua (ignored),
 * Tokens and Length; any other key is silently skipped.
 *
 * If a string-valued key occurs more than once the last occurrence wins
 * and the earlier copy is released.
 *
 * Failure to open the file is fatal (g_error), as it was with the
 * assertion this replaces, but the check no longer disappears when
 * assertions are compiled out.
 */
static void
text_load_header (Text *text)
{
  FILE *in;
  char buffer[4096];

  in = fopen (text->filename, "r");
  if (in == NULL) {
    g_error ("Can't open \"%s\"", text->filename);
    return;
  }

  while (fgets (buffer, sizeof buffer, in) && !line_is_empty (buffer)) {

    gchar *key = buffer;
    gchar *val;
    gchar *eol;

    /* Chop the line at its newline.  fgets always NUL-terminates, so the
       scan cannot run off the end of the buffer. */
    for (eol = buffer; *eol && *eol != '\n'; ++eol)
      ;
    *eol = '\0';

    /* Skip all leading whitespace, not just the first character. */
    while (*key && isspace ((guchar) *key))
      ++key;

    /* Split at the first ':'; the value starts after any whitespace that
       follows it.  Without a ':' val ends up on the terminating NUL. */
    val = key;
    while (*val && *val != ':')
      ++val;
    if (*val == ':') {
      *val = '\0';
      ++val;
      while (*val && isspace ((guchar) *val))
        ++val;
    }

    if (!*key || !*val) {
      if (*key)
        g_message ("Key \"%s\" has empty value.", key);
      continue;
    }

    if (!g_strcasecmp (key, "Format")) {
      gint maj, min;

      if (sscanf (val, "Gnoetry/%d.%d", &maj, &min) == 2) {
        text->format_major_version = maj;
        text->format_minor_version = min;
      } else {
        g_message ("Ill-formed format \"%s\"", val);
      }

    } else if (!g_strcasecmp (key, "Title")) {

      /* Release any earlier value so a repeated key does not leak. */
      g_free (text->title);
      text->title = g_strdup (val);

    } else if (!g_strcasecmp (key, "Author")) {

      g_free (text->author);
      text->author = g_strdup (val);

    } else if (!g_strcasecmp (key, "AuthorSort")) {

      g_free (text->sort_author);
      text->sort_author = g_strdup (val);

    } else if (!g_strcasecmp (key, "Source")) {

      g_free (text->source);
      text->source = g_strdup (val);

    } else if (!g_strcasecmp (key, "Copyright")) {

      g_free (text->copyright);
      text->copyright = g_strdup (val);

    } else if (!g_strcasecmp (key, "Lingua")) {

      /* ignored */

    } else if (!g_strcasecmp (key, "Tokens")) {

      text->token_count = atoi (val);

    } else if (!g_strcasecmp (key, "Length")) {

      text->text_length = atoi (val);

    }
    /* Unknown keys are deliberately ignored. */
  }

  fclose (in);
}
/*
 * Map one base-62 digit to its numeric value:
 *   '0'..'9' ->  0.. 9
 *   'a'..'z' -> 10..35
 *   'A'..'Z' -> 36..61
 * Any other character trips g_assert_not_reached(); if that is compiled
 * out the result is -1.
 */
static gint
char2num (gchar x)
{
  gint value = -1;

  if (x >= '0' && x <= '9')
    value = (gint)x - (gint)'0';
  else if (x >= 'a' && x <= 'z')
    value = 10 + (gint)x - (gint)'a';
  else if (x >= 'A' && x <= 'Z')
    value = 36 + (gint)x - (gint)'A';
  else
    g_assert_not_reached ();

  return value;
}

static gint
chars2code (const gchar *c)
{
  return char2num (c[0])*62*62 + char2num (c[1])*62 + char2num (c[2]);
}


/*
 * Load the body of text->filename into text->token_stream.
 *
 * The file is expected to hold, in order:
 *   1. the header, terminated by an empty line (skipped here);
 *   2. text->token_count lines of "<hex code> <word>", where '_' in the
 *      word stands for a space;
 *   3. the text itself: '[' is token code 0, ']' is token code 1, and
 *      every other token is written as three base-62 digits.
 *
 * text->token_count and text->text_length must already have been set from
 * the header.  Does nothing (beyond a GLib warning) if the stream has
 * already been loaded.
 *
 * Malformed input is fatal: every check on file data uses g_error, so the
 * checks survive builds where assertions are compiled out.  A token that
 * straddles the end of the 4096-byte line buffer is reported as truncated
 * rather than being read past the end of the buffer.
 */
static void
text_load (Text *text)
{
  FILE *in;
  char buffer[4096];
  Token **token_table;
  int i;

  g_return_if_fail (text->token_stream == NULL);

  /* The counts come straight from the header; refuse values that would
     turn into absurd allocation sizes. */
  if (text->token_count < 0 || text->text_length < 0) {
    g_error ("Bad token count or length for \"%s\"", text->filename);
    return;
  }

  in = fopen (text->filename, "r");
  if (in == NULL) {
    g_error ("Can't open \"%s\"", text->filename);
    return;
  }

  /* Skip past the header */
  while (fgets (buffer, sizeof buffer, in) && ! line_is_empty (buffer))
    ;

  token_table = g_new0 (Token *, text->token_count);
  text->token_stream = g_new0 (Token *, text->text_length);

  /* Scan the token table */
  for (i = 0; i < text->token_count; ++i) {
    unsigned int code = 0;   /* %x stores through an unsigned int pointer */
    char word[128];
    char *c;

    /* The field width keeps an over-long word from overflowing word[]. */
    if (! fgets (buffer, sizeof buffer, in)
        || sscanf (buffer, "%x %127s", &code, word) != 2) {
      g_error ("Read error on \"%s\": %s", text->filename, buffer);
      return;
    }

    if (code >= (unsigned int) text->token_count
        || token_table[code] != NULL) {
      g_error ("Bad or duplicate token code %x in \"%s\"",
               code, text->filename);
      return;
    }

    for (c = word; *c; ++c) {
      if (*c == '_')
        *c = ' ';
    }

    token_table[code] = token_lookup (word);
  }

  /* Next, read in the text */
  i = 0;
  while (fgets (buffer, sizeof buffer, in)) {
    const gchar *p = buffer;

    while (*p && *p != '\n') {
      gint code;

      if (*p == '[') {
        code = 0;
        ++p;
      } else if (*p == ']') {
        code = 1;
        ++p;
      } else {
        /* chars2code reads three characters; make sure they exist. */
        if (p[1] == '\0' || p[1] == '\n' || p[2] == '\0' || p[2] == '\n') {
          g_error ("Truncated token in \"%s\"", text->filename);
          return;
        }
        code = chars2code (p);
        p += 3;
      }

      if (code < 0 || code >= text->token_count) {
        g_error ("Token code %d out of range in \"%s\"",
                 code, text->filename);
        return;
      }
      /* Never write past the stream the header told us to allocate. */
      if (i >= text->text_length) {
        g_error ("\"%s\" holds more tokens than its header declares",
                 text->filename);
        return;
      }

      text->token_stream[i] = token_table[code];
      ++i;
    }
  }

  if (i != text->text_length) {
    g_error ("\"%s\" holds %d tokens but its header declares %d",
             text->filename, i, text->text_length);
    return;
  }

  fclose (in);

  g_free (token_table);
}

/* ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** */

Text *
text_new (const char *filename)
{
  Text *text = g_new0 (Text, 1);
  text->filename = g_strdup (filename);

  text_load_header (text);

  return text;
}

/*
 * Return the number of tokens in TEXT, or -1 if TEXT is NULL.
 * The token stream is loaded first if it has not been loaded yet.
 */
int
text_length (Text *text)
{
  gboolean loaded;

  g_return_val_if_fail (text != NULL, -1);

  loaded = (text->token_stream != NULL);
  if (! loaded)
    text_load (text);

  return text->text_length;
}

/*
 * Return the I-th token of TEXT, loading the token stream on first use.
 *
 * Returns NULL (with a GLib warning) if TEXT is NULL or I lies outside
 * [0, text->text_length).
 */
Token *
text_get_token (Text *text, int i)
{
  g_return_val_if_fail (text != NULL, NULL);
  g_return_val_if_fail (i >= 0, NULL);

  /* Check the upper bound before loading.  text_length is already set
     when the header is read, and for a zero-length text the stream
     allocation is NULL, which would otherwise trip the assertion below
     instead of reporting a bad index. */
  g_return_val_if_fail (i < text->text_length, NULL);

  if (text->token_stream == NULL)
    text_load (text);
  g_assert (text->token_stream != NULL);

  return text->token_stream[i];
}

/*
 * Print the header information held in TEXT via g_print.
 *
 * String fields that were never set by the header are NULL; they are
 * printed as "(null)" explicitly, because handing a NULL pointer to a
 * "%s" conversion is undefined in standard C.
 */
void
text_info_dump (Text *text)
{
  static const char missing[] = "(null)";

  g_return_if_fail (text != NULL);

  g_print (" Filename: %s\n", text->filename ? text->filename : missing);
  g_print ("   Format: %d.%d\n",
           text->format_major_version, text->format_minor_version);
  g_print ("    Title: %s\n", text->title ? text->title : missing);
  g_print ("   Author: %s\n", text->author ? text->author : missing);
  g_print ("   Source: %s\n", text->source ? text->source : missing);
  g_print ("Copyright: %s\n", text->copyright ? text->copyright : missing);
  g_print ("   Tokens: %d\n", text->token_count);
  g_print ("   Length: %d\n", text->text_length);
}

/* ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** */

/* Python Type Magic */

/*
 * Python-side wrapper object: the standard Python object header followed
 * by a pointer to the wrapped Text.
 *
 * NOTE(review): PyObject_HEAD normally expands to complete member
 * declarations, so the ';' after it is probably redundant -- confirm
 * against the Python headers this is built with.
 */
typedef struct _PyText PyText;
struct _PyText {
  PyObject_HEAD;
  Text *text;   /* wrapped Text; nothing in this file frees it */
};

/*
 * Python method "length": return the wrapped text's length in tokens as
 * a Python integer.  ARGS is not examined.  The call into text_length
 * is made between Py_BEGIN_ALLOW_THREADS and Py_END_ALLOW_THREADS.
 */
static PyObject *
py_text_length (PyObject *self, PyObject *args)
{
  PyText *py_text = (PyText *) self;
  Text *wrapped = py_text->text;
  int n_tokens;

  Py_BEGIN_ALLOW_THREADS;
  n_tokens = text_length (wrapped);
  Py_END_ALLOW_THREADS;

  return Py_BuildValue ("i", n_tokens);
}

/*
 * Python method "get_token": takes one integer index and returns the
 * token at that position, converted with token_to_py.  Returns NULL if
 * the argument tuple does not parse as a single integer.
 *
 * NOTE(review): text_get_token returns NULL for an out-of-range index;
 * how token_to_py treats a NULL token is not visible here -- confirm.
 */
static PyObject *
py_text_get_token (PyObject *self, PyObject *args)
{
  Text *wrapped;
  Token *found;
  int index;

  if (! PyArg_ParseTuple (args, "i", &index))
    return NULL;

  wrapped = ((PyText *) self)->text;

  Py_BEGIN_ALLOW_THREADS;
  found = text_get_token (wrapped, index);
  Py_END_ALLOW_THREADS;

  return token_to_py (found);
}

/*
 * Python method "info_dump": print the wrapped text's header information
 * through text_info_dump and return None.  ARGS is not examined.
 */
static PyObject *
py_text_info_dump (PyObject *self, PyObject *args)
{
  Text *wrapped = ((PyText *) self)->text;
  PyObject *none = Py_None;

  text_info_dump (wrapped);

  /* The caller receives its own reference to None. */
  Py_INCREF (none);
  return none;
}

/*
 * Method table for the Python "Text" type; py_text_getattr looks names up
 * in it.  Every entry is registered as METH_VARARGS, although only
 * py_text_get_token parses its argument tuple.  The all-NULL entry ends
 * the table.
 */
static PyMethodDef py_text_methods[] = {
  { "length", py_text_length, METH_VARARGS,
    "Get the length of a text, in tokens." },
  { "get_token", py_text_get_token, METH_VARARGS,
    "Get a specific token within the text." },
  { "info_dump", py_text_info_dump, METH_VARARGS,
    "Dump info." },
  {NULL, NULL, 0, NULL}
};

/*
 * tp_getattr handler: resolve NAME against py_text_methods and return
 * whatever Py_FindMethod yields for OBJ.
 */
static PyObject *
py_text_getattr(PyObject *obj, char *name)
{
    PyObject *attr = Py_FindMethod(py_text_methods, obj, name);

    return attr;
}

/*
 * tp_dealloc handler: release the Python wrapper object itself.  The
 * Text it points to is left untouched.
 */
static void
py_text_dealloc(PyObject* self)
{
    PyObject *wrapper = self;

    PyObject_Del(wrapper);
}

/*
 * Type object for the Python "Text" wrapper.  The initializer is
 * positional, so the order of the entries must not change.  Only
 * deallocation and attribute lookup are provided; every other slot listed
 * here is NULL, and slots past tp_hash are left zero-initialized.
 *
 * NOTE(review): the header is initialized with a NULL type pointer; the
 * code that fills it in (if any) is not visible in this file -- confirm.
 */
static PyTypeObject py_text_type_info = {
  PyObject_HEAD_INIT(NULL)
  0,                /*ob_size*/
  "Text",           /*tp_name*/
  sizeof(PyText),   /*tp_basicsize*/
  0,                /*tp_itemsize*/
  py_text_dealloc,  /*tp_dealloc*/
  NULL,             /*tp_print*/
  py_text_getattr,  /*tp_getattr*/
  NULL,             /*tp_setattr*/
  NULL,             /*tp_compare*/
  NULL,             /*tp_repr*/
  NULL,             /*tp_as_number*/
  NULL,             /*tp_as_sequence*/
  NULL,             /*tp_as_mapping*/
  NULL,             /*tp_hash */
};

/* ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** */

/*
 * Wrap TEXT in a new Python "Text" object.
 *
 * Returns the new object, or NULL if the wrapper could not be allocated
 * (PyObject_New has then already set the Python error).  The wrapper
 * stores the pointer only; it does not copy TEXT.
 */
PyObject *
text_to_py (Text *text)
{
  PyText *py_text = PyObject_New(PyText, &py_text_type_info);

  /* Allocation can fail; do not write through a NULL wrapper. */
  if (py_text == NULL)
    return NULL;

  py_text->text = text;
  return (PyObject *) py_text;
}

/*
 * Return the Text stored in the Python wrapper OBJ.  OBJ is cast to
 * PyText without any type check, so the caller must pass a wrapper
 * created by this file.
 */
Text *
text_from_py (PyObject *obj)
{
  PyText *wrapper = (PyText *) obj;

  return wrapper->text;
}

/* ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** */

/*
 * Python-callable constructor: takes a filename string, builds a Text
 * from it with text_new, and returns it wrapped in a Python "Text"
 * object.
 *
 * Returns NULL with a Python error set if the argument is not a single
 * string or if the wrapper object cannot be allocated.
 */
PyObject *
py_text_new (PyObject *self, PyObject *args)
{
  PyText *py_text;
  char *filename;
  Text *text;

  if (! PyArg_ParseTuple (args, "s", &filename))
    return NULL;

  text = text_new (filename);

  py_text = PyObject_New(PyText, &py_text_type_info);
  if (py_text == NULL) {
    /* NOTE(review): no Text destructor is visible in this file, so the
       Text created above cannot be released on this path. */
    return NULL;
  }

  py_text->text = text;
  return (PyObject *) py_text;
}