#!/usr/bin/env python

# file: $(ISIP)/exp/tuh_eeg/exp_0022/scripts/rescore_p1_conf.py
#
# revision history:
#  20150619 (JP): initial version
#
# usage:
#  rescore_p1_conf.py -i rscdir -r reflist -p pfile -o ofile -d dfile
#
# arguments:
#  -i (--input): rescoring directory containing hyp elab files (input)
#  -r (--ref):   reference list containing ref lab files (input)
#  -p (--param): sweep parameter file (input)
#  -o (--output): output scoring file (output)
#  -d (--details): detailed scoring report (output)
#
# This script generates confusion matrices and error rates.
#------------------------------------------------------------------------------

# import isip modules
#
import autoeeg

# import required modules
#
import os
import sys
import getopt
import subprocess

# main: rescore_p1_conf.py
#
def main(argv):

    # purpose:
    #  score every hypothesis .elab file found under the rescoring
    #  directory against its reference transcription, accumulating one
    #  confusion matrix per sweep value. a summary report (-o) and a
    #  per-error detail report (-d) are written.
    #
    # arguments:
    #  argv: the command line arguments, excluding the program name
    #
    # return: none. the process exits with status -1 on a usage error,
    #  a failed file search, or a ref/hyp label mismatch.
    #

    # parse and validate the command line
    #
    rscdir_a, reflist_a, pfile_a, ofile_a, dfile_a = _parse_args(argv)

    # load the sweep parameter file to get the range:
    #  note that we do this simply to get the number of sweep values
    #  so that we can preallocate space for confusion matrices
    #
    prm = autoeeg.ParamSweep().load_sweep_parameters(pfile_a)
    num_mats = int(prm.nswps)

    # load the reference transcriptions into memory:
    #  these are .lab files. we postprocess this to get a model list
    #
    ref_keys, ref_labels = autoeeg.isip_get_ref_labels(reflist_a)
    num_models, model_list = autoeeg.isip_get_model_list(ref_labels)

    # create one num_models x num_models confusion matrix per sweep value:
    #  rows are indexed by the reference label, columns by the hypothesis
    #
    cnf = [[[0] * num_models for _ in range(num_models)]
           for _ in range(num_mats)]

    # load the hypothesis list into memory
    #
    hyplist = _find_hyp_files(rscdir_a)

    # create the output directories:
    #  the report files don't necessarily have to be in the same directory.
    #  a bare file name has no directory component, so there is nothing
    #  to create in that case.
    #
    for path in (ofile_a, dfile_a):
        odir = os.path.dirname(path)
        if odir:
            autoeeg.isip_mkdir(odir)

    # create a file to hold detailed scoring information:
    #  the with statement closes it even when scoring aborts via sys.exit
    #
    with open(dfile_a, "w") as fd:

        # loop over the hypothesis files
        #
        for hyp in hyplist:

            # find the corresponding ref file
            #
            hyp_key = _make_hyp_key(hyp)
            try:
                ind_ref = ref_keys.index(hyp_key)
            except ValueError:
                print("*> error: no reference found for hypothesis")
                print("   hyp file = %s" % hyp)
                print("   key = %s" % hyp_key)
                sys.exit(-1)

            # read the hypothesis file into memory and score it
            #
            hyp_labels = autoeeg.isip_get_hyp_labels(hyp, num_mats)
            _score_file(hyp, ref_labels[ind_ref], hyp_labels, prm,
                        model_list, cnf, fd)

    # write the summary report
    #
    _write_report(ofile_a, prm, num_models, model_list, cnf)


# usage message shared by the help option and argument validation
#
_USAGE = "usage: rescore_p1_conf.py -i rsc_dir -r ref_list -p param_file " \
    "-o out_file -d details_file"


# _parse_args: parse the command line and return the tuple
#  (rscdir, reflist, pfile, ofile, dfile). exits with status -1 after
#  printing a message if help is requested, an option is invalid, or a
#  required option is missing.
#
def _parse_args(argv):

    # define the command line options:
    #  -h takes no argument, every other option requires one
    #
    try:
        opts, _ = getopt.getopt(
            argv, "hi:r:p:o:d:",
            ["help", "input=", "ref=", "param=", "output=", "details="])

    # error handling for command line options:
    #  getopt reports both unknown options and missing option arguments
    #
    except getopt.GetoptError as err:
        print("*> %s: %s" % (sys.argv[0], err))
        sys.exit(-1)

    # map long option names onto their short equivalents
    #
    short_names = {"--input": "-i", "--ref": "-r", "--param": "-p",
                   "--output": "-o", "--details": "-d"}
    values = {"-i": "", "-r": "", "-p": "", "-o": "", "-d": ""}

    # collect the option values
    #
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(_USAGE)
            sys.exit(-1)
        values[short_names.get(opt, opt)] = arg

    # every option is needed before any processing can start
    #
    required = ("-i", "-r", "-p", "-o", "-d")
    missing = [name for name in required if not values[name]]
    if missing:
        print("*> %s: missing required option(s): %s" %
              (sys.argv[0], " ".join(missing)))
        print(_USAGE)
        sys.exit(-1)

    return tuple(values[name] for name in required)


# _find_hyp_files: return the sorted list of *_ch*.elab paths found
#  under rscdir. exits with status -1 if the find command fails.
#
def _find_hyp_files(rscdir):

    # run find without a shell:
    #  the pattern reaches find verbatim instead of being expanded
    #  against the current directory, and directory names containing
    #  spaces or shell metacharacters are passed through safely
    #
    cmd = ["find", rscdir, "-name", "*_ch*.elab"]
    task = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            universal_newlines=True)
    res, _ = task.communicate()
    if task.returncode != 0:
        print("*> error: file search failed")
        print("   directory = %s" % rscdir)
        print("   status = %d" % task.returncode)
        sys.exit(-1)

    # one path per line:
    #  sort so that files are processed in a deterministic order
    #
    return sorted(res.splitlines())


# _make_hyp_key: build the key used to locate the reference for a
#  hypothesis file: the name of the directory holding the file followed
#  by the "_ch..." portion of the file name with ".elab" removed.
#
def _make_hyp_key(hyp):
    hyp_parts = hyp.split("/")
    key_grp = hyp_parts[len(hyp_parts) - 2]
    base = os.path.basename(hyp_parts[len(hyp_parts) - 1])
    ind2 = len(base) - len(".elab")
    ind1 = base.rfind("_ch", 0, ind2)
    return key_grp + base[ind1:ind2]


# _score_file: score one hypothesis file against its reference for every
#  sweep value, updating the confusion matrices in cnf in place and
#  writing one record to fd for each misclassified label. exits with
#  status -1 if a reference label has no matching hypothesis label.
#
def _score_file(hyp, ref, hyp_labels, prm, model_list, cnf, fd):

    # loop over sweep parameters
    #
    for i in range(int(prm.nswps)):
        swp_val = prm.min + prm.inc * i
        hyp_i = hyp_labels[i]

        # loop over the reference transcription:
        #  note that scoring is done relative to the reference
        #  transcription.
        #
        for j in range(ref.num_labels):

            # only score non-null reference labels
            #
            if ref.label[j] == "(null)":
                continue

            # search for the timestamp of the reference in the
            # hypothesis, then check the stop time also - both must match
            #
            ind0 = autoeeg.isip_first_string(hyp_i.start, ref.start[j])
            if (ind0 < 0) or (hyp_i.stop[ind0] != ref.stop[j]):
                print("*> error: label mismatch")
                print("   hyp file = %s" % hyp)
                print("   ref file = %s" % ref.model_name)
                print("   param sweep = %f (%d)" % (swp_val, i))

                # a negative index means no hypothesis label was found,
                # so there is no hypothesis entry to display
                #
                if ind0 < 0:
                    print("   hyp = (no label with this start time)")
                else:
                    print("   hyp = %s %s %s" %
                          (hyp_i.start[ind0], hyp_i.stop[ind0],
                           hyp_i.label[ind0]))
                print("   ref = %s %s %s" %
                      (ref.start[j], ref.stop[j], ref.label[j]))
                sys.exit(-1)

            # get the index for each label in the model list
            #
            try:
                ind1 = model_list.index(ref.label[j])
                ind2 = model_list.index(hyp_i.label[ind0])
            except ValueError:
                print("*> error: label not in model list")
                print("   hyp file = %s" % hyp)
                print("   ref file = %s" % ref.model_name)
                print("   hyp label = %s" % hyp_i.label[ind0])
                print("   ref label = %s" % ref.label[j])
                sys.exit(-1)
            cnf[i][ind1][ind2] += 1

            # output some basic detailed scoring information
            #
            if ind1 != ind2:
                fd.write("sweep_value = %f\n" % swp_val)
                fd.write("ref: %s\n" % ref.model_name)
                fd.write("hyp: %s\n" % hyp)
                fd.write("ref: %s %s %s %f\n" %
                         (ref.start[j], ref.stop[j],
                          ref.label[j], ref.llk[j]))
                fd.write("hyp: %s %s %s %f\n" %
                         (hyp_i.start[ind0], hyp_i.stop[ind0],
                          hyp_i.label[ind0], hyp_i.llk[ind0]))
                fd.write("\n")


# _write_report: write the summary report to ofile: the model list, the
#  sweep range, the penalty labels and weights, and one confusion matrix
#  table per sweep value.
#
def _write_report(ofile, prm, num_models, model_list, cnf):

    models = [model_list[k] for k in range(num_models)]

    with open(ofile, "w") as fo:

        # write the header:
        #  join copes with an empty model list
        #
        fo.write("models: %s\n" % ", ".join("%s" % m for m in models))
        fo.write("sweep: %f : %f : %f\n" % (prm.min, prm.max, prm.inc))
        fo.write("penalty  labels: ")
        for k in range(len(prm.lbl)):
            fo.write("%s " % prm.lbl[k])
        fo.write("\n")
        fo.write("penalty weights: ")
        for k in range(len(prm.lbl)):
            fo.write("%s " % prm.wgt[k])
        fo.write("\n\n")

        # display the results in a table, one per sweep value
        #
        for i in range(len(cnf)):
            swp_val = prm.min + prm.inc * i
            fo.write("sweep_value = %f\n" % swp_val)
            fo.write("Ref/Hyp: ")
            fo.write("".join("%8s" % m for m in models))
            fo.write("\n")
            for j in range(num_models):
                fo.write("%8s " % models[j])
                fo.write("".join("%8d" % cnf[i][j][k]
                                 for k in range(num_models)))
                fo.write("\n")
            fo.write("\n")

# exit gracefully
#

# begin gracefully
#
if __name__ == "__main__":

    # pass everything after the program name to main
    #
    cli_args = sys.argv[1:]
    main(cli_args)

#
# end of file
#