/*=====================================================================
                =======   COPYRIGHT NOTICE   =======
Copyright (C) 1996, Carnegie Mellon University, Cambridge University,
Ronald Rosenfeld and Philip Clarkson.

All rights reserved.

This software is made available for research purposes only.  It may be
redistributed freely for this purpose, in full or in part, provided
that this entire copyright notice is included on any copies of this
software and applications and derivations thereof.

This software is provided on an "as is" basis, without warranty of any
kind, either expressed or implied, as to any matter including, but not
limited to warranty of fitness of purpose, or merchantability, or
results obtained from use of this software.
======================================================================*/


/* Type definitions and constants for general n-gram models */

#ifndef _NGRAM_H_
#define _NGRAM_H_

#include "rr_libs/sih.h"
#include "toolkit.h"

/* Default values for tunable parameters.  Judging by the names, these
   presumably initialise the like-named fields of ng_t
   (count_table_size, oov_fraction, disc_range, min_alpha, max_alpha,
   out_of_range_alphas) -- confirm against the code that uses them. */
#define DEFAULT_COUNT_TABLE_SIZE    65535
#define DEFAULT_OOV_FRACTION        0.5
#define DEFAULT_DISC_RANGE_1        1
#define DEFAULT_DISC_RANGE_REST     7
/* The negative literal is parenthesised so that the macro always
   expands to a single, self-contained expression wherever it is used. */
#define DEFAULT_MIN_ALPHA           (-3.2)
#define DEFAULT_MAX_ALPHA           2.5
#define DEFAULT_OUT_OF_RANGE_ALPHAS 10000

/* Discounting methods.  Presumably the values taken by the
   discounting_method field of ng_t, whose comment refers to the
   #defines at the top of this file. */
#define GOOD_TURING  1
#define ABSOLUTE     2
#define LINEAR       3
#define WITTEN_BELL  4

/* NOTE(review): the meaning of these three constants is not evident
   from this header; they look like a set of mutually exclusive mode
   selectors -- confirm against the code that uses them. */
#define SPECIFIED   1
#define BUFFER      2
#define TWO_PASSES  3

/* NOTE(review): purpose not stated in this header -- confirm against
   the code that uses it. */
#define KEY  65000

/* Vocabulary types.  Presumably the values taken by the vocab_type
   field of ng_t, whose comment refers to the #defines at the top of
   this file. */
#define CLOSED_VOCAB  0
#define OPEN_VOCAB_1  1
#define OPEN_VOCAB_2  2

/* Word id.  Spelled with a double underscore, since id_t is already
   defined on some platforms. */
typedef unsigned short id__t;

/* A count as read in, rather than its index in the count table. */
typedef int count_t;

/* A count's index in the count table. */
typedef unsigned short count_ind_t;

/* Element type of the bo_weight (back-off weight) arrays in ng_t. */
typedef unsigned short bo_weight_t;

/* Element type of the cutoffs array in ng_t. */
typedef unsigned short cutoff_t;

/* Element type of the table_sizes array in ng_t. */
typedef int table_size_t;

/* Element type of the ind (index list) arrays in ng_t. */
typedef unsigned short index__t;

/* Element type of the discount ratio arrays in ng_t. */
typedef double disc_val_t;

/* Element type of the unigram probability arrays in ng_t. */
typedef double uni_probs_t;

/* Element type of the ptr_table arrays in ng_t. */
typedef int ptr_tab_t;

/* Element type of the bo_weight4 (4 byte back-off weight) arrays in
   ng_t.  NOTE(review): the name assumes float occupies four bytes,
   which the C standard does not guarantee. */
typedef float four_byte_t;


/* One n-gram: a list of word ids together with its count. */
typedef struct {
  unsigned short  n;          /* Presumably the number of ids held in
                                 id_array -- confirm */
  id__t          *id_array;   /* The word ids making up the n-gram */
  count_t         count;      /* The count, as read in */
} ngram;

/* A table of counts together with its size. */
typedef struct {
  unsigned short  count_table_size;  /* Presumably the number of entries
                                        in counts_array -- confirm */
  int            *counts_array;      /* The counts themselves */
} count_table_t;

/* All of the data describing an n-gram language model: the model
   order, the vocabulary, the n-gram tree tables, back-off weights,
   count tables, discounting parameters, unigram statistics, the
   input and output files, and miscellaneous settings. */
typedef struct {

  /* Language model type */

  unsigned short n;                /* n=3 for trigram, n=4 for 4-gram etc. */
  int            version;          /* NOTE(review): not documented in this
                                      header; presumably a model/format
                                      version number -- confirm */

  /* Vocabulary stuff */

  sih_t          *vocab_ht;      /* Vocabulary hash table */
  unsigned short vocab_size;     /* Vocabulary size */
  char           **vocab;        /* Array of vocabulary words */
  unsigned short no_of_ccs;      /* Number of context cues */

  /* Tree */

  table_size_t   *table_sizes;   /* Pointer to table size array */
  id__t          **word_id;      /* Pointer to array of id lists */
  count_ind_t    **count;        /* Pointer to array of count lists
                                    (actually indices in a count table) */
  count_ind_t    *marg_counts;   /* Array of marginal counts for the
                                    unigrams. The normal unigram counts
                                    differ in that context cues have
                                    zero counts there, but not here */
  int            **count4;       /* Alternative method of storing the counts,
                                    using 4 bytes. Not normally allocated */
  int            *marg_counts4;  /* Ditto */
  bo_weight_t    **bo_weight;    /* Pointer to array of back-off weights */
  four_byte_t    **bo_weight4;   /* Pointer to array of 4 byte
                                    back-off weights. Only one of
                                    bo_weight and bo_weight4 will be
                                    allocated */
  index__t       **ind;          /* Pointer to array of index lists */
  

  /* Two-byte alpha stuff */

  double         min_alpha;      /* The minimum alpha in the table */
  double         max_alpha;      /* The maximum alpha in the table */
  unsigned short out_of_range_alphas;  /* The maximum number of out of range
                                          alphas that we are going to allow. */
  double         *alpha_array;   /* NOTE(review): not documented in this
                                    header; presumably the alpha values
                                    that the two-byte back-off weights
                                    refer to -- confirm */
  unsigned short size_of_alpha_array;  /* Presumably the number of entries
                                          in alpha_array -- confirm */

  /* Count table */

  count_ind_t    count_table_size; /* Have same size for each count table */
  count_t        **count_table;    /* Pointer to array of count tables */

  /* Index lookup tables */

  ptr_tab_t      **ptr_table;     /* Pointer to the tables used for compact
                                     representation of the indices */
  unsigned short *ptr_table_size; /* Pointer to array of pointer-table sizes
                                     (the original comment read "pointer
                                     tables"; the field name suggests
                                     sizes -- confirm) */

  /* Discounting and cutoffs - note: some of these may not be used,
     depending on the discounting technique used. */

  unsigned short discounting_method;     /* See #define stuff at the top of
                                            this file */
  cutoff_t       *cutoffs;               /* Array of cutoffs */
  int            **freq_of_freq;         /* Array of frequency of frequency
                                            information  */
  unsigned short *fof_size;              /* The sizes of the above arrays */
  unsigned short *disc_range;            /* Pointer to array of discounting
                                            ranges - typically will be
                                            fof_size - 1, but can be reduced
                                            further if stats are anomalous */
  disc_val_t     **gt_disc_ratio;        /* The discounted values of the
                                            counts */
  disc_val_t     *lin_disc_ratio;        /* The linear discounting ratio */
  double         *abs_disc_const;        /* The constant required for
                                            absolute discounting */

  /* Unigram statistics */

  uni_probs_t    *uni_probs;             /* Probs for each unigram */
  uni_probs_t    *uni_log_probs;         /* Log probs for each unigram */
  flag           *context_cue;           /* True if word with this id is
                                            a context cue */
  int            n_unigrams;             /* Total number of unigrams in
                                            the training data */
  int            min_unicount;           /* Count to which infrequent unigrams
                                            will be bumped up */
  /* Input files */

  char           *id_gram_filename;  /* The filename of the id-gram file */
  FILE           *id_gram_fp;        /* The file pointer of the id-gram file */
  char           *vocab_filename;    /* The filename of the vocabulary file */
  char           *context_cues_filename; /* The filename of the context cues
                                            file */
  FILE           *context_cues_fp;       /* The file pointer of the context
                                            cues file */

  /* Output files */

  flag           write_arpa;      /* True if the language model is to be
                                     written out in arpa format */
  char           *arpa_filename;  /* The filename of the arpa format LM */
  FILE           *arpa_fp;        /* The file pointer of the arpa format LM */
  flag           write_bin;       /* True if the language model is to be
                                     written out in binary format */
  char           *bin_filename;   /* The filename of the bin format LM */
  FILE           *bin_fp;         /* The file pointer of the bin format LM */

  /* Misc */

  int            *num_kgrams;     /* Array indicating how many
                                     2-grams, ... ,n-grams, have been
                                     processed so far */

  unsigned short vocab_type;      /* see #define stuff at the top */

  unsigned short first_id;        /* 0 if we have open vocab, 1 if we have
                                     a closed vocab. */

  /* Once the tree has been constructed, the tables are indexed from 0
     to (num_kgrams[i]-1). */

  /* 1-gram tables are indexed from 0 to ng.vocab_size. */

  double         zeroton_fraction; /* cap on prob(zeroton) as fraction of
                                      P(singleton) */
  double         oov_fraction;     /* NOTE(review): not documented in this
                                      header; see DEFAULT_OOV_FRACTION --
                                      confirm meaning against its users */
  flag           four_byte_alphas; /* Presumably true when the 4 byte
                                      back-off weights (bo_weight4) are in
                                      use -- confirm */
  flag           four_byte_counts; /* Presumably true when the 4 byte
                                      counts (count4, marg_counts4) are in
                                      use -- confirm */

} ng_t;

#endif