From 967d6ca6cc284bd1d292dee0c1374bce7052a872 Mon Sep 17 00:00:00 2001 From: Eva Schlinger Date: Fri, 7 Jun 2013 13:20:15 -0400 Subject: python script to run mira --- python/pkg/cdec/score.py | 2 +- training/mira/mira.py | 530 ++++++++++++++++++++++++++++++++++ training/utils/decode-and-evaluate.pl | 15 +- 3 files changed, 543 insertions(+), 4 deletions(-) create mode 100755 training/mira/mira.py diff --git a/python/pkg/cdec/score.py b/python/pkg/cdec/score.py index 829dfdfd..657b4547 100644 --- a/python/pkg/cdec/score.py +++ b/python/pkg/cdec/score.py @@ -1 +1 @@ -from _cdec import BLEU, TER, CER, SSK, QCRI, Metric +from _cdec import BLEU, TER, CER, SSK, QCRI, Metric, Scorer diff --git a/training/mira/mira.py b/training/mira/mira.py new file mode 100755 index 00000000..0ad65da0 --- /dev/null +++ b/training/mira/mira.py @@ -0,0 +1,530 @@ +#!/usr/bin/env python +import sys, os, re, shutil +import subprocess, shlex, glob +import argparse +import logging +import random, time +import cdec.score +import gzip, itertools + +#mira run script +#requires pycdec to be built, since it is used for scoring hypothesis +#translations. +#matplotlib must be installed for graphing to work +#email option requires mail + +#scoring function using pycdec scoring +def fast_score(hyps, refs, metric): + scorer = cdec.score.Scorer(metric) + logging.info('loaded {0} references for scoring with {1}\n'.format( + len(refs), metric)) + if metric=='BLEU': + logging.warning('BLEU is ambiguous, assuming IBM_BLEU\n') + metric = 'IBM_BLEU' + elif metric=='COMBI': + logging.warning('COMBI metric is no longer supported, switching to ' + 'COMB:TER=-0.5;BLEU=0.5\n') + metric = 'COMB:TER=-0.5;BLEU=0.5' + stats = sum(scorer(r).evaluate(h) for h,r in itertools.izip(hyps,refs)) + logging.info(stats.detail+'\n') + return stats.score + +#create new parallel input file in output directory in sgml format +def enseg(devfile, newfile, gprefix): + try: + dev = open(devfile) + new = open(newfile, 'w') + except IOError, msg: + logging.error('Error opening source file') + raise + + i = 0 + for line in dev: + (src, refs) = line.split(' ||| ', 1) + if re.match('\s* tags, ' + 'yout must include a zero based id attribute') + sys.exit() + else: + sgml = '2: + logging.error('Error: working directory {0} already exists\n'.format( + args.output_dir)) + sys.exit() + else: + os.mkdir(args.output_dir) + + if args.grammar_prefix: + if not os.path.isabs(args.grammar_prefix): + args.grammar_prefix = os.path.abspath(args.grammar_prefix) + + script = open(args.output_dir+'/rerun_mira.sh','w') + script.write('cd {0}\n'.format(os.getcwd())) + script.write(' '.join(sys.argv)+'\n') + script.close() + + #create weights.0 file from initial weights file + if args.weights: + shutil.copy(args.weights,os.path.join(args.output_dir,'weights.0')) + else: #if no weights given, use Glue 0 as default + weights = open(args.output_dir+'/weights.0','w') + weights.write('Glue 0\n') + weights.close() + args.weights = args.output_dir+'/weights.0' + + #create mira ini file + shutil.copy(args.config,'{0}/kbest_cut_mira.ini'.format(args.output_dir)) + + newdev = args.output_dir+'/dev.input' + dev_size = enseg(args.devset, newdev, args.grammar_prefix) + args.devset = newdev + + write_config(args) + args.weights, hope_best_fear = optimize(args, script_dir, dev_size) + + graph_file = graph(args.output_dir, hope_best_fear, args.metric) + + dev_results, dev_bleu = evaluate(args.devset, args.weights, args.config, + script_dir, args.output_dir) + if args.test: + if args.test_config: + test_results, test_bleu = evaluate(args.test, args.weights, + args.test_config, script_dir, args.output_dir) + else: + test_results, test_bleu = evaluate(args.test, args.weights, args.config, + script_dir, args.output_dir) + else: + test_results = '' + test_bleu = '' + logging.info(dev_results+'\n') + logging.info(test_results) + + write_report(graph_file, dev_results, dev_bleu, test_results, test_bleu, args) + + logging.info('A graph of the best/hope/fear scores over the iterations ' + 'has been saved to {}\n'.format(graph_file)) + + print 'final weights:\n{}\n'.format(args.weights) + +#graph of hope/best/fear metric values across all iterations +def graph(output_dir, hope_best_fear, metric): + try: + import matplotlib.pyplot as plt + except ImportError: + logging.error('Error importing matplotlib. Graphing disabled.\n') + return '' + max_y = float(max(hope_best_fear['best']))*1.5 + plt.plot(hope_best_fear['best'], label='best') + plt.plot(hope_best_fear['hope'], label='hope') + plt.plot(hope_best_fear['fear'], label='fear') + plt.axis([0,len(hope_best_fear['fear'])-1,0,max_y]) + plt.xlabel('Iteration') + plt.ylabel(metric) + plt.legend() + graph_file = output_dir+'/mira.pdf' + plt.savefig(graph_file) + return graph_file + +#evaluate a given test set using decode-and-evaluate.pl +def evaluate(testset, weights, ini, script_dir, out_dir): + evaluator = '{}/../utils/decode-and-evaluate.pl'.format(script_dir) + try: + p = subprocess.Popen([evaluator, '-c', ini, '-w', weights, '-i', testset, + '-d', out_dir], stdout=subprocess.PIPE) + results, err = p.communicate() + bleu, results = results.split('\n',1) + except subprocess.CalledProcessError: + logging.error('Evalutation of {} failed'.format(testset)) + results = '' + bleu = '' + return results, bleu + +#print a report to out_dir/mira.results +#send email with results if email was given +def write_report(graph_file, dev_results, dev_bleu, + test_results, test_bleu, args): + features, top, bottom = weight_stats(args.weights) + top = [f+' '+str(w) for f,w in top] + bottom = [f+' '+str(w) for f,w in bottom] + subject = 'MIRA {0} {1:7}'.format(os.path.basename(args.devset), dev_bleu) + if args.test: + subject += ' {0} {1:7}'.format(os.path.basename(args.test), test_bleu) + + message = ('MIRA has finished running. '+ + 'The final weights can be found at \n{}\n'.format(args.weights)+ + 'Average weights across all iterations '+ + '\n{}/weights.average\n'.format(args.output_dir)+ + 'Weights were calculated for {} features\n\n'.format(features)+ + '5 highest weights:\n{}\n\n'.format('\n'.join(top))+ + '5 lowest weights:\n{}\n'.format('\n'.join(bottom))) + + if dev_results: + message += '\nEvaluation: dev set\n{}'.format(dev_results) + if test_results: + message += '\nEvaluation: test set\n{}'.format(test_results) + + out = open(args.output_dir+'/mira.results','w') + out.write(message) + out.close() + + if args.email: + email_process = subprocess.Popen(['mail', '-s', subject, '-a', + graph_file, args.email], stdin = subprocess.PIPE) + email_process.communicate(message) + +#feature weights stats for report +def weight_stats(weight_file): + f = open(weight_file) + features = [] + for line in f: + feat, weight = line.strip().split() + features.append((feat,float(weight))) + features.sort(key=lambda a: a[1], reverse=True) + return len(features), features[:5], features[-5:] + +#create source and refs files from parallel devset +#TODO remove when kbest_cut_mira changed to take parallel input +def split_devset(dev, outdir): + parallel = open(dev) + source = open(outdir+'/source.input','w') + refs = open(outdir+'/refs.input', 'w') + references = [] + for line in parallel: + s,r = line.strip().split(' ||| ',1) + source.write(s+'\n') + refs.write(r+'\n') + references.append(r) + source.close() + refs.close() + return (outdir+'/source.input', outdir+'/refs.input') + +def optimize(args, script_dir, dev_size): + parallelize = script_dir+'/../utils/parallelize.pl' + decoder = script_dir+'/kbest_cut_mira' + (source, refs) = split_devset(args.devset, args.output_dir) + port = random.randint(15000,50000) + num_features = 0 + last_p_score = 0 + best_score_iter = -1 + best_score = -1 + i = 0 + hope_best_fear = {'hope':[],'best':[],'fear':[]} + #main optimization loop + while i best_score: + best_score_iter = i + best_score = dec_score + + new_weights_file = '{}/weights.{}'.format(args.output_dir, i+1) + last_weights_file = '{}/weights.{}'.format(args.output_dir, i) + i += 1 + weight_files = weightdir+'/weights.mira-pass*.*[0-9].gz' + average_weights(new_weights_file, weight_files) + + logging.info('\nBEST ITER: {} :: {}\n\n'.format( + best_score_iter, best_score)) + weights_final = args.output_dir+'/weights.final' + shutil.copy(last_weights_file, weights_final) + average_final_weights(args.output_dir) + + return weights_final, hope_best_fear + +#TODO +#create a weights file with the average of the weights from each iteration +def average_final_weights(out_dir): + logging.info('Average of weights from each iteration\n') + weight_files = glob.glob(out_dir+'/weights.[1-9]*') + features = {} + for path in weight_files: + weights = open(path) + for line in weights: + f, w = line.strip().split(' ', 1) + if f in features: + features[f] += float(w) + else: + features[f] = float(w) + weights.close() + + out = open(out_dir+'/weights.average','w') + for f in iter(features): + out.write('{} {}\n'.format(f,features[f]/len(weight_files))) + logging.info('An average weights file can be found at' + '\n{}\n'.format(out_dir+'/weights.average')) + +#create gzipped version of given file with name filename.gz +# and delete original file +def gzip_file(filename): + gzip_file = gzip.open(filename+'.gz','wb') + f = open(filename) + gzip_file.writelines(f) + f.close() + gzip_file.close() + os.remove(filename) + +#average the weights for a given pass +def average_weights(new_weights, weight_files): + logging.info('AVERAGE {} {}\n'.format(new_weights, weight_files)) + feature_weights = {} + total_mult = 0.0 + for path in glob.glob(weight_files): + score = gzip.open(path) + mult = 0 + logging.info('FILE {}\n'.format(path)) + msg, ran, mult = score.readline().strip().split(' ||| ') + logging.info('Processing {} {}'.format(ran, mult)) + for line in score: + f,w = line.split(' ',1) + if f in feature_weights: + feature_weights[f]+= float(mult)*float(w) + else: + feature_weights[f] = float(mult)*float(w) + total_mult += float(mult) + score.close() + + #write new weights to outfile + out = open(new_weights, 'w') + for f in iter(feature_weights): + avg = feature_weights[f]/total_mult + logging.info('{} {} {} ||| Printing {} {}\n'.format(f,feature_weights[f], + total_mult, f, avg)) + out.write('{} {}\n'.format(f,avg)) + +def write_config(args): + config = ('\n' + 'DECODER: ' + '/usr0/home/eschling/cdec/training/mira/kbest_cut_mira\n' + 'INI FILE: '+args.config+'\n' + 'WORKING DIRECTORY: '+args.output_dir+'\n' + 'DEVSET: '+args.devset+'\n' + 'EVAL METRIC: '+args.metric+'\n' + 'MAX ITERATIONS: '+str(args.max_iterations)+'\n' + 'DECODE NODES: '+str(args.jobs)+'\n' + 'INITIAL WEIGHTS: '+args.weights+'\n') + if args.grammar_prefix: + config += 'GRAMMAR PREFIX: '+str(args.grammar_prefix)+'\n' + if args.test: + config += 'TEST SET: '+args.test+'\n' + if args.test_config: + config += 'TEST CONFIG: '+args.test_config+'\n' + if args.email: + config += 'EMAIL: '+args.email+'\n' + + logging.info(config) + +if __name__=='__main__': + main() diff --git a/training/utils/decode-and-evaluate.pl b/training/utils/decode-and-evaluate.pl index 1a332c08..c69e77dc 100755 --- a/training/utils/decode-and-evaluate.pl +++ b/training/utils/decode-and-evaluate.pl @@ -39,6 +39,7 @@ my $weights; my $use_make = 1; my $useqsub; my $cpbin=1; +my $base_dir; # Process command-line options if (GetOptions( "jobs=i" => \$jobs, @@ -47,6 +48,7 @@ if (GetOptions( "input=s" => \$test_set, "config=s" => \$config, "weights=s" => \$weights, + "dir=s" => \$base_dir, ) == 0 || @ARGV!=0 || $help) { print_help(); exit; @@ -68,7 +70,9 @@ my @tf = localtime(time); my $tname = basename($test_set); $tname =~ s/\.(sgm|sgml|xml)$//i; my $dir = "eval.$tname." . sprintf('%d%02d%02d-%02d%02d%02d', 1900+$tf[5], $tf[4], $tf[3], $tf[2], $tf[1], $tf[0]); - +if ($base_dir) { + $dir = $base_dir.'/'.$dir +} my $time = unchecked_output("date"); check_call("mkdir -p $dir"); @@ -103,11 +107,12 @@ print STDERR "\nOUTPUT: $test_trans\n\n"; my $bleu = check_output("cat $test_trans | $SCORER $refs -m ibm_bleu"); chomp $bleu; print STDERR "BLEU: $bleu\n"; +print STDOUT "BLEU: $bleu\n"; my $ter = check_output("cat $test_trans | $SCORER $refs -m ter"); chomp $ter; print STDERR " TER: $ter\n"; open TR, ">$dir/test.scores" or die "Can't write $dir/test.scores: $!"; -print TR < - Directory for intermediate and output files. + Base directory where directory with evaluation results + will be located. Job control options: -- cgit v1.2.3