#!/usr/bin/env python
"""Combine several ``.hyp`` hypothesis files by per-sample majority vote.

Usage: python majority_vote.py hyp_dir ref_dir out_dir

For every (dimensionality, split) pair listed in ``_DATASETS`` the script
collects the ``*.hyp`` files in ``hyp_dir`` whose file names contain both
tags, takes the first column of each as integer labels, computes the mode
across files for every row, and writes the voted label next to the feature
columns read from ``ref_dir`` into ``out_dir``.
"""
import glob
import os
import sys

import numpy as np
import scipy.stats

# (dimensionality tag, split tag, data file name under ref_dir/<dim>/).
# The 5d eval data lives in 'eval_anonymized.txt' rather than 'eval.txt'.
_DATASETS = (
    ('2d', 'train', 'train.txt'),
    ('2d', 'dev', 'dev.txt'),
    ('2d', 'eval', 'eval.txt'),
    ('5d', 'train', 'train.txt'),
    ('5d', 'dev', 'dev.txt'),
    ('5d', 'eval', 'eval_anonymized.txt'),
)


def _select_files(hyp_files, dim, split):
    """Return the paths whose *file name* contains both ``dim`` and ``split``.

    Only the basename is inspected so that tags occurring in the directory
    part of the path (e.g. ``/home/dev/...``) cannot cause false matches.
    """
    selected = []
    for path in hyp_files:
        name = os.path.basename(path)
        if dim in name and split in name:
            selected.append(path)
    return selected


def _get_labels(filename):
    """Print ``filename`` and return its first column as an int32 vector."""
    print(filename)
    # ndmin=2 keeps an explicit column axis for single-column and single-row
    # files alike, so column 0 is always the label column.
    return np.loadtxt(filename, ndmin=2)[:, 0].astype(np.int32)


def _majority_vote(files, dim, split):
    """Return the per-row mode of the labels in ``files`` as an (N, 1) array.

    Raises:
        ValueError: if ``files`` is empty, or (from ``np.vstack``) if the
            files do not all contain the same number of rows.
    """
    if not files:
        raise ValueError(
            "no .hyp files found for the %s %s set" % (dim, split))
    labels = np.vstack([_get_labels(path) for path in files])
    # Depending on the SciPy version the mode comes back as (1, N) or (N,);
    # reshape explicitly so it can be concatenated as a column either way.
    return np.reshape(scipy.stats.mode(labels, axis=0)[0], (-1, 1))


def main(argv):
    """Write one majority-vote ``.hyp`` file per entry of ``_DATASETS``.

    Args:
        argv: sequence ``[hyp_dir, ref_dir, out_dir]``.
            ``hyp_dir`` holds the input ``*.hyp`` files, ``ref_dir`` holds
            ``2d/`` and ``5d/`` sub-directories with the data files, and
            ``out_dir`` receives ``majority_vote_<dim>_maj_<split>.hyp``.

    Raises:
        ValueError: if a dataset has no matching hyp files or the row counts
            of the hyp files and the data file disagree.
    """
    hyp_dir, ref_dir, out_dir = argv[0], argv[1], argv[2]
    hyp_files = glob.glob(os.path.join(hyp_dir, '*.hyp'))

    # Phase 1: vote on every dataset before touching the data files.
    votes = [
        _majority_vote(_select_files(hyp_files, dim, split), dim, split)
        for dim, split, _ in _DATASETS
    ]

    # Phase 2: attach the feature columns (everything but column 0).
    outputs = []
    for (dim, split, ref_name), vote in zip(_DATASETS, votes):
        data = np.loadtxt(os.path.join(ref_dir, dim, ref_name), ndmin=2)
        hyp = np.concatenate([vote, data[:, 1:]], axis=1)
        outputs.append((dim, split, hyp))

    # Phase 3: write results only once every dataset has been assembled.
    for dim, split, hyp in outputs:
        # Integer-looking label followed by one '%.6f' per feature column.
        fmt = ['%1.0f'] + ['%.6f'] * (hyp.shape[1] - 1)
        out_name = 'majority_vote_%s_maj_%s.hyp' % (dim, split)
        np.savetxt(os.path.join(out_dir, out_name), hyp, fmt=fmt)


if __name__ == "__main__":
    if len(sys.argv) != 4:
        sys.exit("usage: python majority_vote.py hyp_dir ref_dir out_dir")
    main(sys.argv[1:])