#!/usr/bin/env python import sys, os, re, shutil import subprocess, shlex, glob import argparse import logging import random, time import cdec.score import gzip, itertools #mira run script #requires pycdec to be built, since it is used for scoring hypothesis #translations. #matplotlib must be installed for graphing to work #email option requires mail #scoring function using pycdec scoring def fast_score(hyps, refs, metric): scorer = cdec.score.Scorer(metric) logging.info('loaded {0} references for scoring with {1}\n'.format( len(refs), metric)) if metric=='BLEU': logging.warning('BLEU is ambiguous, assuming IBM_BLEU\n') metric = 'IBM_BLEU' elif metric=='COMBI': logging.warning('COMBI metric is no longer supported, switching to ' 'COMB:TER=-0.5;BLEU=0.5\n') metric = 'COMB:TER=-0.5;BLEU=0.5' stats = sum(scorer(r).evaluate(h) for h,r in itertools.izip(hyps,refs)) logging.info(stats.detail+'\n') return stats.score #create new parallel input file in output directory in sgml format def enseg(devfile, newfile, gprefix): try: dev = open(devfile) new = open(newfile, 'w') except IOError, msg: logging.error('Error opening source file') raise i = 0 for line in dev: (src, refs) = line.split(' ||| ', 1) if re.match('\s* tags, ' 'yout must include a zero based id attribute') sys.exit() else: sgml = '2: logging.error('Error: working directory {0} already exists\n'.format( args.output_dir)) sys.exit() else: os.mkdir(args.output_dir) if args.grammar_prefix: if not os.path.isabs(args.grammar_prefix): args.grammar_prefix = os.path.abspath(args.grammar_prefix) script = open(args.output_dir+'/rerun_mira.sh','w') script.write('cd {0}\n'.format(os.getcwd())) script.write(' '.join(sys.argv)+'\n') script.close() #create weights.0 file from initial weights file if args.weights: shutil.copy(args.weights,os.path.join(args.output_dir,'weights.0')) else: #if no weights given, use Glue 0 as default weights = open(args.output_dir+'/weights.0','w') weights.write('Glue 0\n') weights.close() args.weights = args.output_dir+'/weights.0' #create mira ini file shutil.copy(args.config,'{0}/kbest_cut_mira.ini'.format(args.output_dir)) newdev = args.output_dir+'/dev.input' dev_size = enseg(args.devset, newdev, args.grammar_prefix) args.devset = newdev write_config(args) args.weights, hope_best_fear = optimize(args, script_dir, dev_size) graph_file = graph(args.output_dir, hope_best_fear, args.metric) dev_results, dev_bleu = evaluate(args.devset, args.weights, args.config, script_dir, args.output_dir) if args.test: if args.test_config: test_results, test_bleu = evaluate(args.test, args.weights, args.test_config, script_dir, args.output_dir) else: test_results, test_bleu = evaluate(args.test, args.weights, args.config, script_dir, args.output_dir) else: test_results = '' test_bleu = '' logging.info(dev_results+'\n') logging.info(test_results) write_report(graph_file, dev_results, dev_bleu, test_results, test_bleu, args) logging.info('A graph of the best/hope/fear scores over the iterations ' 'has been saved to {}\n'.format(graph_file)) print 'final weights:\n{}\n'.format(args.weights) #graph of hope/best/fear metric values across all iterations def graph(output_dir, hope_best_fear, metric): try: import matplotlib.pyplot as plt except ImportError: logging.error('Error importing matplotlib. Graphing disabled.\n') return '' max_y = float(max(hope_best_fear['best']))*1.5 plt.plot(hope_best_fear['best'], label='best') plt.plot(hope_best_fear['hope'], label='hope') plt.plot(hope_best_fear['fear'], label='fear') plt.axis([0,len(hope_best_fear['fear'])-1,0,max_y]) plt.xlabel('Iteration') plt.ylabel(metric) plt.legend() graph_file = output_dir+'/mira.pdf' plt.savefig(graph_file) return graph_file #evaluate a given test set using decode-and-evaluate.pl def evaluate(testset, weights, ini, script_dir, out_dir): evaluator = '{}/../utils/decode-and-evaluate.pl'.format(script_dir) try: p = subprocess.Popen([evaluator, '-c', ini, '-w', weights, '-i', testset, '-d', out_dir], stdout=subprocess.PIPE) results, err = p.communicate() bleu, results = results.split('\n',1) except subprocess.CalledProcessError: logging.error('Evalutation of {} failed'.format(testset)) results = '' bleu = '' return results, bleu #print a report to out_dir/mira.results #send email with results if email was given def write_report(graph_file, dev_results, dev_bleu, test_results, test_bleu, args): features, top, bottom = weight_stats(args.weights) top = [f+' '+str(w) for f,w in top] bottom = [f+' '+str(w) for f,w in bottom] subject = 'MIRA {0} {1:7}'.format(os.path.basename(args.devset), dev_bleu) if args.test: subject += ' {0} {1:7}'.format(os.path.basename(args.test), test_bleu) message = ('MIRA has finished running. '+ 'The final weights can be found at \n{}\n'.format(args.weights)+ 'Average weights across all iterations '+ '\n{}/weights.average\n'.format(args.output_dir)+ 'Weights were calculated for {} features\n\n'.format(features)+ '5 highest weights:\n{}\n\n'.format('\n'.join(top))+ '5 lowest weights:\n{}\n'.format('\n'.join(bottom))) if dev_results: message += '\nEvaluation: dev set\n{}'.format(dev_results) if test_results: message += '\nEvaluation: test set\n{}'.format(test_results) out = open(args.output_dir+'/mira.results','w') out.write(message) out.close() if args.email: email_process = subprocess.Popen(['mail', '-s', subject, '-a', graph_file, args.email], stdin = subprocess.PIPE) email_process.communicate(message) #feature weights stats for report def weight_stats(weight_file): f = open(weight_file) features = [] for line in f: feat, weight = line.strip().split() features.append((feat,float(weight))) features.sort(key=lambda a: a[1], reverse=True) return len(features), features[:5], features[-5:] #create source and refs files from parallel devset #TODO remove when kbest_cut_mira changed to take parallel input def split_devset(dev, outdir): parallel = open(dev) source = open(outdir+'/source.input','w') refs = open(outdir+'/refs.input', 'w') references = [] for line in parallel: s,r = line.strip().split(' ||| ',1) source.write(s+'\n') refs.write(r+'\n') references.append(r) source.close() refs.close() return (outdir+'/source.input', outdir+'/refs.input') def optimize(args, script_dir, dev_size): parallelize = script_dir+'/../utils/parallelize.pl' decoder = script_dir+'/kbest_cut_mira' (source, refs) = split_devset(args.devset, args.output_dir) port = random.randint(15000,50000) num_features = 0 last_p_score = 0 best_score_iter = -1 best_score = -1 i = 0 hope_best_fear = {'hope':[],'best':[],'fear':[]} #main optimization loop while i best_score: best_score_iter = i best_score = dec_score new_weights_file = '{}/weights.{}'.format(args.output_dir, i+1) last_weights_file = '{}/weights.{}'.format(args.output_dir, i) i += 1 weight_files = weightdir+'/weights.mira-pass*.*[0-9].gz' average_weights(new_weights_file, weight_files) logging.info('\nBEST ITER: {} :: {}\n\n'.format( best_score_iter, best_score)) weights_final = args.output_dir+'/weights.final' shutil.copy(last_weights_file, weights_final) average_final_weights(args.output_dir) return weights_final, hope_best_fear #TODO #create a weights file with the average of the weights from each iteration def average_final_weights(out_dir): logging.info('Average of weights from each iteration\n') weight_files = glob.glob(out_dir+'/weights.[1-9]*') features = {} for path in weight_files: weights = open(path) for line in weights: f, w = line.strip().split(' ', 1) if f in features: features[f] += float(w) else: features[f] = float(w) weights.close() out = open(out_dir+'/weights.average','w') for f in iter(features): out.write('{} {}\n'.format(f,features[f]/len(weight_files))) logging.info('An average weights file can be found at' '\n{}\n'.format(out_dir+'/weights.average')) #create gzipped version of given file with name filename.gz # and delete original file def gzip_file(filename): gzip_file = gzip.open(filename+'.gz','wb') f = open(filename) gzip_file.writelines(f) f.close() gzip_file.close() os.remove(filename) #average the weights for a given pass def average_weights(new_weights, weight_files): logging.info('AVERAGE {} {}\n'.format(new_weights, weight_files)) feature_weights = {} total_mult = 0.0 for path in glob.glob(weight_files): score = gzip.open(path) mult = 0 logging.info('FILE {}\n'.format(path)) msg, ran, mult = score.readline().strip().split(' ||| ') logging.info('Processing {} {}'.format(ran, mult)) for line in score: f,w = line.split(' ',1) if f in feature_weights: feature_weights[f]+= float(mult)*float(w) else: feature_weights[f] = float(mult)*float(w) total_mult += float(mult) score.close() #write new weights to outfile out = open(new_weights, 'w') for f in iter(feature_weights): avg = feature_weights[f]/total_mult logging.info('{} {} {} ||| Printing {} {}\n'.format(f,feature_weights[f], total_mult, f, avg)) out.write('{} {}\n'.format(f,avg)) def write_config(args): config = ('\n' 'DECODER: ' '/usr0/home/eschling/cdec/training/mira/kbest_cut_mira\n' 'INI FILE: '+args.config+'\n' 'WORKING DIRECTORY: '+args.output_dir+'\n' 'DEVSET: '+args.devset+'\n' 'EVAL METRIC: '+args.metric+'\n' 'MAX ITERATIONS: '+str(args.max_iterations)+'\n' 'DECODE NODES: '+str(args.jobs)+'\n' 'INITIAL WEIGHTS: '+args.weights+'\n') if args.grammar_prefix: config += 'GRAMMAR PREFIX: '+str(args.grammar_prefix)+'\n' if args.test: config += 'TEST SET: '+args.test+'\n' if args.test_config: config += 'TEST CONFIG: '+args.test_config+'\n' if args.email: config += 'EMAIL: '+args.email+'\n' logging.info(config) if __name__=='__main__': main()