/////////////////////////////////////////////////////////////////////////// 
/* 
  Copyright 2001,2004 Ronald S. Burkey <info@sandroid.org>
 
  This file is part of GutenMark. 
 
  GutenMark is free software; you can redistribute it and/or modify 
  it under the terms of the GNU General Public License as published by 
  the Free Software Foundation; either version 2 of the License, or 
  (at your option) any later version. 
 
  GutenMark is distributed in the hope that it will be useful, 
  but WITHOUT ANY WARRANTY; without even the implied warranty of 
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
  GNU General Public License for more details. 
 
  You should have received a copy of the GNU General Public License 
  along with GutenMark; if not, write to the Free Software 
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 
  Filename:	CreateWordlist.c 
  Purpose:	Creates an in-memory wordlist from a text file. 
  Mods:         11/18/01 RSB    Began. 
                12/01/01 RSB    Fixed a bug in which a newly allocated 
                                Wordlist isn't cleared. 
		12/13/01 RSB	Added LikelyName.
		12/15/01 RSB	Extended LikelyName with "asked", "replied",
				"answered", etc.
		01/21/04 RSB	Removed an unsigned to signed comparison
				that was always true.				
*/

/////////////////////////////////////////////////////////////////////////// 
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "libGutenSpell.h"

//------------------------------------------------------------------------ 
// Classifies one character code according to whether it may appear inside
// a "word".  The result is WORD_NOT when it may not; otherwise it is one of
// WORD_NORMAL / WORD_DIACRITICAL combined with one of WORD_LOWER /
// WORD_UPPER / WORD_PUNCT.
//
//   'a'-'z', 'A'-'Z'      WORD_NORMAL      | WORD_LOWER / WORD_UPPER
//   apostrophe            WORD_NORMAL      | WORD_PUNCT
//   173 (soft hyphen)     WORD_DIACRITICAL | WORD_PUNCT
//   192-222 except 215    WORD_DIACRITICAL | WORD_UPPER
//   223-255 except 247    WORD_DIACRITICAL | WORD_LOWER
//
// Codes 215 and 247 are excluded from the upper range (presumably the
// multiplication and division signs of the 8-bit character set -- confirm).
int
IsWordChar (unsigned char c)
{
  int Flags = WORD_NOT;

  if ('a' <= c && c <= 'z')
    Flags = WORD_NORMAL | WORD_LOWER;
  else if ('A' <= c && c <= 'Z')
    Flags = WORD_NORMAL | WORD_UPPER;
  else if (c == '\'')
    Flags = WORD_NORMAL | WORD_PUNCT;
  else if (c == 173)
    Flags = WORD_DIACRITICAL | WORD_PUNCT;
  else if (c >= 192 && c != 215 && c != 247)
    {
      // Within the upper range, 223 is where the lower-case codes begin.
      Flags = WORD_DIACRITICAL | (c >= 223 ? WORD_LOWER : WORD_UPPER);
    }
  return (Flags);
}

//------------------------------------------------------------------------ 
// Releases an in-memory Wordlist: every spelling-block buffer on the
// Words->Buf chain is freed, followed by the Wordlist itself.  Passing
// NULL is allowed and does nothing.
void
DestroyWordlist (Wordlist * Words)
{
  struct SpellBlockBuffer *Block;

  if (Words != NULL)
    {
      Block = Words->Buf;
      while (Block != NULL)
	{
	  // Grab the link before the block holding it disappears.
	  struct SpellBlockBuffer *Following = Block->Next;

	  free (Block);
	  Block = Following;
	}
      free (Words);
    }
}

//------------------------------------------------------------------------ 
// Copies the string s (including its terminating NUL) into the Wordlist's
// current spelling-block buffer.  If the current block does not have
// enough room left, a new zeroed block is allocated and pushed onto the
// front of the Words->Buf chain, and the string is stored there instead.
//
// Returns a pointer to the stored copy, or NULL if a new block could not
// be allocated or if the string is too long to fit in any block.  On
// failure the Wordlist is left unchanged.  The stored copy belongs to the
// Wordlist and is released by DestroyWordlist.
char *
AllocSpellString (Wordlist * Words, const char *s)
{
  struct SpellBlockBuffer *NewBuf;
  size_t Length;
  int Needed;
  char *Dest;

  Length = strlen (s) + 1;

  // The fit test below treats SPELLBLOCKSIZE as the capacity of a block.
  // A string larger than that cannot be stored even in a brand-new block,
  // so refuse it here rather than overrun the buffer when copying.
  if (Length > (size_t) SPELLBLOCKSIZE)
    return (NULL);
  Needed = (int) Length;

  if (Words->Buf->Position + Needed > SPELLBLOCKSIZE)
    {
      // Not enough room in the current block; start a new one.  Its
      // Position is zero courtesy of calloc.
      NewBuf = calloc (1, sizeof *NewBuf);
      if (NewBuf == NULL)
	return (NULL);
      NewBuf->Next = Words->Buf;
      Words->Buf = NewBuf;
    }

  Dest = &Words->Buf->Buffer[Words->Buf->Position];
  memcpy (Dest, s, Length);
  Words->Buf->Position += Needed;
  return (Dest);
}

//------------------------------------------------------------------------ 
// Words that, when they appear next to a capitalized word (as in
// "said Tom" or "Tom said"), suggest that the capitalized word is a name.
static const char *NameIndicators[] = {
  "said", "asked", "answered", "replied", "inquired", "stated",
  "exclaimed"
};
#define NUM_NAMEINDICATORS (sizeof(NameIndicators)/sizeof(const char *))

// Returns non-zero if s is one of the NameIndicators.  The comparison
// ignores case when IgnoreCase is non-zero and is exact otherwise.
static int
IsNameIndicator (const char *s, int IgnoreCase)
{
  size_t k;

  for (k = 0; k < NUM_NAMEINDICATORS; k++)
    {
      if (IgnoreCase)
	{
	  if (!strcasecmp (s, NameIndicators[k]))
	    return (1);
	}
      else if (!strcmp (s, NameIndicators[k]))
	return (1);
    }
  return (0);
}

// Returns non-zero if ch is one of the punctuation characters that are
// taken to end a sentence.
static int
IsSentenceEnd (int ch)
{
  return (ch == '.' || ch == '!' || ch == '?' || ch == ':');
}

//------------------------------------------------------------------------ 
// Creates an in-memory Wordlist structure from 7-bit or 8-bit ASCII 
// text data stored in a file.  In the latter case, the upper 128 character 
// codes are as in HTML 4.0.  By seeking to some point in the file before 
// starting, you can avoid parsing the earlier parts of the file. 
//
// Text is read from its current position up to end-of-file.  For each
// word found, the record located by SearchWordlist is either updated
// (occurrence Count, which saturates rather than wrapping, and the
// NotAtBeginning flag) or newly inserted at the index SearchWordlist
// reports.  Records adjacent to a name indicator get LikelyName set.
//
// Returns a pointer to the Wordlist, which the caller releases with
// DestroyWordlist, or NULL on error.  The errors are: out of memory, a
// word of MAXWORDLENGTH or more characters, or more than MAXUNIQUEWORDS
// distinct words.  On error everything allocated here is freed.
Wordlist *
CreateWordlist (FILE * Text)
{
  Wordlist *Words;
  int ch, i, k, Next, Flags, Matched, Sentence, EndsSentence;
  char Word[MAXWORDLENGTH], Normalized[MAXWORDLENGTH];
  char *LastWord = NULL;	// Normalized form of the preceding word.
  int LastWordIndex = 0;	// Its record index; valid if LastWord != NULL.

  // Initial allocation of the in-memory structures.  calloc is used so
  // that every record starts out cleared.
  Words = calloc (1, sizeof *Words);
  if (Words == NULL)
    goto Error;
  Words->Buf = calloc (1, sizeof *Words->Buf);
  if (Words->Buf == NULL)
    goto Error;

  // Loop on the input-file contents. 
  Sentence = 1;
  while (EOF != (ch = getc (Text)))
    {

      // If a word ever appears NOT at the beginning of a sentence, we need 
      // to set a flag for it in the wordlist.  The mechanism for detecting 
      // beginnings of sentences is not extremely accurate, but it doesn't
      // need to be for the purposes of GutenMark. 
      if (IsSentenceEnd (ch))
	Sentence = 1;

      // Beginning of a new word?  (A word may not start with an apostrophe.)
      if (!IsWordChar (ch) || ch == '\'')
	continue;

      // Fetch the word data into memory.  On leaving the loop, ch holds
      // the character that terminated the word (or EOF), and that
      // character has been consumed from the stream.
      Word[0] = ch;
      for (i = 1; i < (int) sizeof (Word); i++)
	{
	  ch = getc (Text);
	  if (ch == EOF)
	    break;
	  if (!IsWordChar (ch))
	    break;
	  if (ch == '\'')
	    {
	      // An apostrophe belongs to the word only when a letter
	      // follows it, so peek at the next character.  EOF must be
	      // tested explicitly:  converted to unsigned char it would
	      // otherwise be mistaken for a letter.
	      Next = getc (Text);
	      if (Next == EOF)
		Flags = 0;
	      else
		{
		  Flags = IsWordChar (Next);
		  ungetc (Next, Text);
		}
	      if (Flags == 0 || 0 != (Flags & WORD_PUNCT))
		break;
	    }
	  Word[i] = ch;
	}

      // The word was too long to hold.  This is treated as a fatal error
      // for the whole wordlist rather than skipping the word.
      if (i >= (int) sizeof (Word))
	goto Error;
      Word[i] = '\0';		// Terminate the string. 

      // The terminating character never reaches the test at the top of
      // the loop, so check here whether it ends the sentence.  (When the
      // word ended at an apostrophe, the character after it was pushed
      // back and will be examined by the outer loop instead.)
      EndsSentence = IsSentenceEnd (ch);

      if (0 == DiacriticalNormalize (Word, Normalized, MAXWORDLENGTH))
	{
	  // The word could not be normalized, so it is not recorded and
	  // there is no record index for it.  Forget the preceding word
	  // so that the name heuristic does not pair words across it.
	  LastWord = NULL;
	  if (EndsSentence)
	    Sentence = 1;
	  continue;
	}

      i = SearchWordlist (Words, Normalized, Word, &Matched);
      if (Matched)
	{
	  // Maintain a count of the number of times this word has
	  // appeared.  If the increment wraps to zero, step back so the
	  // count stays at its maximum.
	  Words->Words[i].Count++;
	  if (!Words->Words[i].Count)
	    Words->Words[i].Count--;
	  Words->Words[i].NotAtBeginning |= !Sentence;
	}
      else
	{
	  // The word is new.  Make space for it at index i in the record
	  // array by shifting the later records up by one.
	  if (Words->NumWords >= MAXUNIQUEWORDS)
	    goto Error;
	  for (k = Words->NumWords; k > i; k--)
	    Words->Words[k] = Words->Words[k - 1];

	  // The preceding word's record moved if it was at or after i.
	  if (LastWord != NULL && LastWordIndex >= i)
	    LastWordIndex++;

	  Words->Words[i].Count = 1;
	  Words->Words[i].NotAtBeginning = !Sentence;

	  // Set up the pointer to the Normalized word.  If the following
	  // or the preceding record has a Normalized word that matches,
	  // we can simply reuse its space.  Otherwise, we have to
	  // allocate the space.  At this point (when i < NumWords) the i
	  // record still contains the same data as the i+1 record.
	  if (i < Words->NumWords
	      && !strcmp (Words->Words[i].Normalized, Normalized))
	    {
	      // The copied pointer is already the one we want.
	    }
	  else if (i > 0
		   && !strcmp (Words->Words[i - 1].Normalized, Normalized))
	    Words->Words[i].Normalized = Words->Words[i - 1].Normalized;
	  else
	    {
	      Words->Words[i].Normalized =
		AllocSpellString (Words, Normalized);
	      if (Words->Words[i].Normalized == NULL)
		goto Error;
	    }

	  // Set up the pointer to the Full word.  If the Full word is the
	  // same as the Normalized word, we can share space.
	  if (!strcmp (Word, Normalized))
	    Words->Words[i].Full = Words->Words[i].Normalized;
	  else
	    {
	      Words->Words[i].Full = AllocSpellString (Words, Word);
	      if (Words->Words[i].Full == NULL)
		goto Error;
	    }

	  // The slot may hold a copy of the following record; a new word
	  // must not inherit that record's name flag.
	  Words->Words[i].LikelyName = 0;

	  // Okay, we've successfully added the word! 
	  Words->NumWords++;
	}

      // We can get a clue if this is a name -- or at least if it's
      // a name that appears often in the text (and if the text is
      // English) if it appears as part of something like "said Word" 
      // or "Word said".
      if (LastWord != NULL)
	{
	  if (isupper ((unsigned char) Words->Words[i].Normalized[0]))
	    {
	      // "said Word":  the indicator may itself be capitalized.
	      if (IsNameIndicator (LastWord, 1))
		Words->Words[i].LikelyName = 1;
	    }
	  else if (isupper ((unsigned char) *LastWord)
		   && !Words->Words[LastWordIndex].LikelyName)
	    {
	      // "Word said":  the indicator must match exactly.
	      if (IsNameIndicator (Words->Words[i].Normalized, 0))
		Words->Words[LastWordIndex].LikelyName = 1;
	    }
	}
      LastWord = Words->Words[i].Normalized;
      LastWordIndex = i;

      // The next word starts a sentence only if this one was directly
      // followed by sentence-ending punctuation.
      Sentence = EndsSentence;
    }
  return (Words);

Error:
  DestroyWordlist (Words);
  return (NULL);
}

//--------------------------------------------------------------------- 
// Test driver, compiled only when TESTMAIN_WORDLIST is defined.  Builds a
// wordlist from standard input and prints one line per record (Normalized
// form, Full form, NotAtBeginning flag) followed by the number of unique
// words.  Exits with status 1 if the wordlist cannot be created.
#ifdef TESTMAIN_WORDLIST
int
main (void)
{
  Wordlist *List;
  int n;

  List = CreateWordlist (stdin);
  if (List == NULL)
    {
      printf ("Cannot create wordlist.\n");
      return (1);
    }

  n = 0;
  while (n < List->NumWords)
    {
      SpellRecord *Rec = &List->Words[n];

      printf ("%s %s %d\n", Rec->Normalized, Rec->Full,
	      Rec->NotAtBeginning);
      n++;
    }
  printf ("%d unique words.\n", List->NumWords);
  return (0);
}
#endif /* TESTMAIN_WORDLIST */
