/*===================================================================== ======= COPYRIGHT NOTICE ======= Copyright (C) 1996, Carnegie Mellon University, Cambridge University, Ronald Rosenfeld and Philip Clarkson. All rights reserved. This software is made available for research purposes only. It may be redistributed freely for this purpose, in full or in part, provided that this entire copyright notice is included on any copies of this software and applications and derivations thereof. This software is provided on an "as is" basis, without warranty of any kind, either expressed or implied, as to any matter including, but not limited to warranty of fitness of purpose, or merchantability, or results obtained from use of this software. ======================================================================*/
/* copyright (C) Roni Rosenfeld, 1990, 1991, 1992, 1993 */
/* Edited by Kristie Seymore, 4/16/97 */
/* Edited further by Philip Clarkson April 1997, in order to provide
   consistency with the rest of the toolkit */
/* interpolate_v2:
   Find maximum-likelihood weights for interpolating several probabilistic
   models, where the models are described by their output on a common set of
   items (.fprobs file), previously derived from a common text file.

   "-test_last nnn" means that the last nnn items of each model will be used
   for testing.
   "-test_all" means _all_ items will be used for testing.
   "-test_first nnn" means that the first nnn items of each model will be
   used for testing and the remaining items of each model will be used for
   training.
   "-cv" means to run in cross-validation mode, where each half of the items
   is used in turn for testing while the other half is used for training,
   and the resulting PPs will be combined to give an overall PP value.

   If a partition of the items is induced via a "tags" mapping, separate
   weights will be calculated for each tag.
   In "test" mode, run through the main loop only once, to compute the
   test PP.
   The initial lambdas may be read from a file.
   For non-test-mode, they default to 1/#-of-models.

   Roni Rosenfeld, 3/1/93
   Adapted from the version I wrote for my "lm" software. */
/* NOTE(review): this copy of the file looks damaged. The header names after
   four of the #include directives are missing, and so are several loop
   conditions and loop bodies in eval and main, so it will not compile as it
   stands. The code below is left untouched; restore it from a pristine copy
   before changing any logic.
   NOTE(review): main is declared void and, at the very end of the file,
   calls exit((int) test_pp) only when n_test_items>0, so the process exit
   status carries a truncated perplexity - confirm whether callers rely on
   this before changing it. */
#include #include #include #include #include "rr_libs/general.h" #include "pc_libs/pc_general.h" #include "toolkit.h" #define ITEM_T float #define ITEM_FORMAT "%f" #define MCAPTION 20
/* updateArgs: delete one entry from the argument vector.
     pargc  - in/out argument count; decremented by one.
     argv   - argument vector; the entries after index rm_cnt each move
              down one slot.
     rm_cnt - index of the entry to drop (it is used as a position, not as
              a count, despite its name).
   The slot argv[*pargc] is not rewritten, so the old last pointer is still
   stored there after the call. */
void updateArgs( int *pargc, char **argv, int rm_cnt ) { int i ; /* one argument fewer from now on */ (*pargc)-- ; /* close the gap: shift the tail of argv down by one */ for( i = rm_cnt ; i < *pargc ; i++ ) argv[i] = argv[i+1] ; }
/* eval: report the perplexity (PP) of the interpolated model over the items
   from_item..to_item (the item count used is to_item-from_item+1).

   What is visible in this copy:
     - the total PP is exp(-total_logprobs / (to_item-from_item+1)); it is
       logged through pc_message at level 2 and stored in *p_new_pp;
     - a per-tag PP, exp(-sum_logprobs[itag] / n_in_tag[itag]), appears to
       be printed as well;
     - when iter_no > 1 the relative improvement 1 - new_pp/old_pp is logged,
       otherwise only a newline is logged.

   Array layouts, as annotated on the declarations in main:
   lambdas[model][tag], fractions[model][tag], model_probs[model][item],
   tag_of[item], prob_components[model], sum_logprobs[tag].

   NOTE(review): the loop that accumulates the log-probabilities is not
   intact in this copy (the text following the first for (itag=0; itag is
   missing), so how fractions, prob_components, captions and probs_fp are
   used cannot be confirmed from here - TODO restore from the original. */
void eval(double *sum_logprobs, double **fractions, int *tag_of, int *n_in_tag, double *prob_components, double **lambdas, ITEM_T **model_probs, int ntags, int from_item, int to_item, int nmodels, char **captions, double *p_new_pp, int iter_no, double old_pp, int verbosity, FILE *probs_fp) { int itag, iitem, tag, imodel; double total_prob, total_logprobs, new_pp; for (itag=0; itag PP=%f\n", exp( -sum_logprobs[itag] / n_in_tag[itag])); } } new_pp = exp(-total_logprobs/(to_item-from_item+1)); pc_message(verbosity,2,"\t\t\t=============> TOTAL PP = %g",new_pp); if (iter_no>1) { pc_message(verbosity,2," (down %.4f)\n",(1.0-(new_pp/old_pp))); } else { pc_message(verbosity,2,"\n"); } *p_new_pp = new_pp; } void main (int argc, char **argv) { int use_tags=0; int use_captions=0; int pure_test_mode=0; int first_part = 0; int default_lambdas=1; int verbosity=1; int n_test_items=0; int n_train_items=0; int cv=0; int Mprobs = 60000; int write_lambdas = 0; double stop_ratio = 0.999; static char *rname = "interpolate"; char *tags_filename; char *captions_filename; char *lambdas_filename; char *write_lambdas_filename; FILE *tags_fp; FILE *captions_fp; FILE *lambdas_fp; FILE *probs_fp=NULL; FILE *write_lambdas_fp = NULL; char **model_filenames; /* model_filenames[model] */ FILE **model_fps; /* model_fps[model] */ Boolean *model_fixed_lambdas; /*model_fixed_lambdas[model] */ ITEM_T **model_probs; /* probs[model][item] */ int *tag_of; 
/* tag_of[item] */ double *prob_components; /* prob_components[model] */ double **lambdas; /* lambdas[model][tag] */ double **fractions; /* fractions[model][tag] */ double *sum_logprobs; /* sum_logprobs[tag] */ int *n_train_in_tag; /* n_in_tag[tag] */ int *n_test_in_tag; /* n_in_tag[tag] */ int nmodels=0; int imodel; int ntags; int itag; int tag; int nitems; int iitem; int iter_no; int half_point = 0; int iter_num; int first_test_items=0; int second_test_items=0; double old_pp=0.0; double new_pp; double test_pp; float dummyf; double first_part_pp=0.0; double second_part_pp=0.0; double total_pp; double sum_logprob_1; double sum_logprob_2; double total_logprob; char **captions; FILE *fp; ITEM_T *pitem; int scanfrc; int nnewitems; int temp_test_items; char *write_fprobs_filename; int i; /* Allocate memory for model data */ model_filenames = (char **) rr_malloc(argc * (sizeof(char *))); model_fixed_lambdas = (Boolean *) rr_malloc(argc * (sizeof(Boolean))); /* Process command line */ report_version(&argc,argv); if (argc == 1 || pc_flagarg(&argc,argv,"-help")) { fprintf(stderr,"Usage : interpolate +[-] model1.fprobs +[-] model2.fprobs ... 
\n"); fprintf(stderr," [ -test_all | -test_first n | -test_last n | -cv ]\n"); fprintf(stderr," [ -tags .tags ]\n"); fprintf(stderr," [ -captions .captions ]\n"); fprintf(stderr," [ -out_lambdas .lambdas ]\n"); fprintf(stderr," [ -in_lambdas .lambdas ]\n"); fprintf(stderr," [ -stop_ratio 0.999 ]\n"); fprintf(stderr," [ -probs .fprobs ]\n"); fprintf(stderr," [ -max_probs 6000000 ]\n"); exit(1); } /* Grab all the model filename */ i = 0; while (i 0) { probs_fp = rr_oopen(write_fprobs_filename); } else { pc_message(verbosity,2,"Warning : -write option ignored, as none of the data is used for testing.\n"); } } Mprobs = pc_intarg(&argc,argv,"-max_probs",6000000); pc_report_unk_args(&argc,argv,verbosity); if (nmodels==0) quit(-1,"%s: no models specified\n",rname); if (pure_test_mode && default_lambdas) quit(-1,"%s: in pure test mode, initial lambdas must be supplied\n",rname); if (stop_ratio<0.0 || stop_ratio >1.0) quit(-1,"%s: illegal stop_ratio (%f) - must be a fraction\n", rname,stop_ratio); if (cv && pure_test_mode) { quit(-1,"%s : Error - cannot specify both -cv and -test_all.\n",rname); } if (cv && n_test_items != 0) { quit(-1,"%s : Error - cannot specify both -cv and -test_first or -test_last.\n", rname); } if (pure_test_mode && n_test_items != 0) { quit(-1,"%s : Error - cannot specify both -test_all and -test_first or -test_last.\n", rname); } model_fps = (FILE **) rr_malloc(nmodels * sizeof(FILE *)); model_probs = (ITEM_T **) rr_malloc(nmodels * sizeof(ITEM_T *)); lambdas = (double **) rr_malloc(nmodels * sizeof(double *)); fractions = (double **) rr_malloc(nmodels * sizeof(double *)); prob_components = (double *) rr_malloc(nmodels * sizeof(double)); nitems = -1; pc_message(verbosity,2,"%s : Reading the probability streams....",rname); fflush(stderr); for (imodel=0; imodelMprobs) quit(-1, "%s: more than %d probs on %s\n",rname,Mprobs,model_filenames[imodel]); if (imodel==0) nitems = nnewitems; else if (nnewitems != nitems) quit(-1,"%s: model '%s' has %d probs, 
but model '%s' has %d probs\n", rname,model_filenames[0],nitems,model_filenames[imodel],nnewitems); fclose(model_fps[imodel]); } pc_message(verbosity,2,"Done.\n"); fflush(stderr); if (n_test_items >= nitems) quit(-1,"%s: \"-test_last %d\" was specified, but there are only %d items\n", rname, n_test_items, nitems); if (pure_test_mode) n_test_items=nitems; if (cv) half_point = (int) (nitems/2); if (write_lambdas == 1) { write_lambdas_fp = rr_oopen(write_lambdas_filename); } for (iter_num = 1; iter_num <= 2; iter_num++) { if (cv && iter_num == 1) { n_test_items = nitems - half_point; first_part = 0; } if (cv && iter_num == 2) { n_test_items = half_point; first_part = 1; } n_train_items = nitems - n_test_items; if (n_train_items>0 && n_test_items>0) { if (first_part) { pc_message(verbosity,2, "%s: %d models will be interpolated using the last %d data items\n", rname, nmodels, n_train_items); pc_message(verbosity,2, " The first %d data items will be used for testing\n", n_test_items); } else { pc_message(verbosity,2, "%s: %d models will be interpolated using the first %d data items\n", rname, nmodels, n_train_items); pc_message(verbosity,2, " The last %d data items will be used for testing\n", n_test_items); } } else { if (n_train_items>0) { pc_message(verbosity,2, "%s: %d models will be interpolated using %d data items\n", rname, nmodels, n_train_items); } else { if (n_test_items>0) { pc_message(verbosity,2, "%s: %d models will be tested using %d data items\n", rname, nmodels, n_test_items); } else { if (cv) { pc_message(verbosity,2, "%s: %d models will be tested using cross validation\n", rname, nmodels); } } } } if (!default_lambdas) pc_message(verbosity,2,"%s: %sweights will be read from \"%s\"\n", rname,(n_train_items ? "initial " : ""), (strcmp(lambdas_filename,"-")==0) ? 
"stdin" : lambdas_filename); for (imodel=0; imodelmaxtag) maxtag = tag_of[iitem]; } if (fscanf(tags_fp,"%d",&tag_of[iitem]) != EOF) quit(-1,"%s: %s contains more than %d items\n", rname, tags_filename, nitems); ntags = maxtag+1; pc_message(verbosity,2,"%s: data is partitioned into %d tags\n", rname, ntags); } else { ntags = 1; for (iitem=0; iitem 1e-8) quit(-1,"%s: weights for tag #%d sum to %g, not to 1\n", rname, itag, sum_lambdas); } if (fscanf(lambdas_fp,"%f",&dummyf) != EOF) quit(-1,"%s: too many numbers found in '%s'\n", rname, lambdas_filename); rr_iclose(lambdas_fp); } /* TRAINING: iterate the EM step */ new_pp = 10e98; iter_no = 1; while (n_train_items>0 && (iter_no==1 || (new_pp/old_pp < stop_ratio))) { old_pp = new_pp; /* re-estimate lambdas before all but the first iteration */ if (iter_no > 1) { for (itag=0; itag0) { fprintf(stderr,"\n"); for (itag=0; itag0) { fprintf(stderr,"\nNOW TESTING ...\n"); if (first_part) { /* Train on last part and test on first part */ eval(sum_logprobs, fractions, tag_of, n_test_in_tag, prob_components, lambdas, model_probs, ntags, 0, n_test_items-1, nmodels,captions, &test_pp, 1, 0.0, verbosity, probs_fp); fprintf(stderr,"\n"); } else { /* Train on first part and test on last part */ eval(sum_logprobs, fractions, tag_of, n_test_in_tag, prob_components, lambdas, model_probs, ntags, n_train_items, nitems-1, nmodels,captions, &test_pp, 1, 0.0, verbosity, probs_fp); fprintf(stderr,"\n"); } } if (iter_num == 1) { first_part_pp = test_pp; first_test_items = n_test_items; } else if (iter_num == 2) { second_part_pp = test_pp; second_test_items = n_test_items; } /* Free all memory allocated in the loop */ free (tag_of); for (tag=0; tag Total PP = %f\n", total_pp); } if (n_test_items>0) exit((int) test_pp); }