summaryrefslogtreecommitdiff
path: root/training/mira
diff options
context:
space:
mode:
Diffstat (limited to 'training/mira')
-rwxr-xr-xtraining/mira/mira.py533
1 files changed, 533 insertions, 0 deletions
diff --git a/training/mira/mira.py b/training/mira/mira.py
new file mode 100755
index 00000000..f031c313
--- /dev/null
+++ b/training/mira/mira.py
@@ -0,0 +1,533 @@
+#!/usr/bin/env python
+import sys, os, re, shutil
+import subprocess, shlex, glob
+import argparse
+import logging
+import random, time
+import cdec.score
+import gzip, itertools
+
+#mira run script
+#requires pycdec to be built, since it is used for scoring hypothesis
+#translations.
+#matplotlib must be installed for graphing to work
+#email option requires mail
+
#scoring function using pycdec scoring
def fast_score(hyps, refs, metric):
    """Score hypothesis translations against their references with pycdec.

    Args:
        hyps: sequence of hypothesis strings, one per segment.
        refs: sequence parallel to ``hyps``; each item is the list of
            reference strings for that segment.
        metric: metric name handed to ``cdec.score.Scorer``. The aliases
            'BLEU' and 'COMBI' are rewritten to 'IBM_BLEU' and
            'COMB:TER=-0.5;BLEU=0.5' first.

    Returns:
        The ``score`` attribute of the summed per-segment statistics.
    """
    # Resolve the aliases BEFORE building the scorer; previously the scorer
    # was created from the raw name, so the rewrite below had no effect.
    if metric == 'BLEU':
        logging.warning('BLEU is ambiguous, assuming IBM_BLEU\n')
        metric = 'IBM_BLEU'
    elif metric == 'COMBI':
        logging.warning('COMBI metric is no longer supported, switching to '
                        'COMB:TER=-0.5;BLEU=0.5\n')
        metric = 'COMB:TER=-0.5;BLEU=0.5'
    scorer = cdec.score.Scorer(metric)
    logging.info('loaded {0} references for scoring with {1}\n'.format(
        len(refs), metric))
    # zip() works on both Python 2 and 3 (itertools.izip is Python 2 only).
    # NOTE(review): assumes hyps is non-empty; an empty input makes sum()
    # return the int 0, which has no .detail/.score -- confirm with callers.
    stats = sum(scorer(r).evaluate(h) for h, r in zip(hyps, refs))
    logging.info(stats.detail+'\n')
    return stats.score
+
#build the sgml text for a single line of the parallel dev set
def _enseg_line(line, index, gprefix):
    """Return the output line for one 'src ||| refs' input line.

    Lines whose source already starts with a <seg tag are passed through
    unchanged (they must carry a numeric id attribute); all others are
    wrapped in <seg id="index" [grammar="..."]>.
    Exits with status 1 on a missing id or a missing grammar file.
    Raises ValueError if the line contains no ' ||| ' separator.
    """
    src, refs = line.split(' ||| ', 1)
    if re.match(r'\s*<seg', src):
        if not re.search('id="[0-9]+"', src):
            logging.error('When using segments with pre-generated <seg> tags, '
                          'yout must include a zero based id attribute')
            sys.exit(1)
        return line

    sgml = '<seg id="{0}"'.format(index)
    if gprefix:
        #TODO check if grammar files gzipped or not
        if os.path.exists('{}.{}.gz'.format(gprefix, index)):
            sgml += ' grammar="{0}.{1}.gz"'.format(gprefix, index)
        elif os.path.exists('{}.{}'.format(gprefix, index)):
            sgml += ' grammar="{}.{}"'.format(gprefix, index)
        else:
            logging.error('Could not find grammar files with prefix '
                          '{}\n'.format(gprefix))
            sys.exit(1)
    # refs still carries the line's trailing newline, so none is added here
    return sgml + '>{0}</seg> ||| {1}'.format(src, refs)


#create new parallel input file in output directory in sgml format
def enseg(devfile, newfile, gprefix):
    """Write an sgml-tagged copy of the parallel dev set.

    Args:
        devfile: path of the input file, one 'src ||| ref1 ||| ref2' per line.
        newfile: path of the file to create.
        gprefix: prefix of per-sentence grammar files ('<prefix>.<n>[.gz]'),
            or a false value when no grammar attribute should be added.

    Returns:
        The number of lines read from devfile.

    Raises:
        IOError: if either file cannot be opened (logged, then re-raised).
        SystemExit: with status 1 when a pre-tagged segment has no id or a
            grammar file is missing (previously the status was 0).
    """
    try:
        dev = open(devfile)
    except IOError:
        logging.error('Error opening source file')
        raise
    # 'with' guarantees both handles are closed even on the error exits
    with dev:
        try:
            new = open(newfile, 'w')
        except IOError:
            logging.error('Error opening output file')
            raise
        with new:
            count = 0
            for line in dev:
                new.write(_enseg_line(line, count, gprefix))
                count += 1
    return count
+
def main():
    """Command-line entry point for a MIRA tuning run.

    Parses the options, prepares the working directory (rerun script,
    weights.0, copy of the decoder config, sgml dev set), runs the
    optimization loop, graphs the hope/best/fear scores, evaluates the
    final weights on the dev set (and the test set, if given), writes the
    report and prints the path of the final weights.
    """
    #set logging to write all info messages to stderr
    logging.basicConfig(level=logging.INFO)
    script_dir = os.path.dirname(os.path.abspath(sys.argv[0]))

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-d', '--devset', required=True,
                        help='dev set input file in parallel. '
                             'format: src ||| ref1 ||| ref2')
    parser.add_argument('-c', '--config', required=True,
                        help='decoder configuration file')
    parser.add_argument('-w', '--weights',
                        help='initial weights file')
    parser.add_argument('-j', '--jobs', type=int, default=1,
                        help='number of decoder processes to run in parallel')
    parser.add_argument('-o', '--output-dir', metavar='DIR',
                        help='directory for intermediate and output files. '
                             'defaults to mira.(devset name).(time)')
    parser.add_argument('-e', '--email',
                        help='email address to send result report')
    parser.add_argument('-t', '--test',
                        help='test set to decode and evaluate')
    parser.add_argument('--test-config',
                        help='config file for testing. the config file used '
                             'for tuning feature weights will be used by default.')
    parser.add_argument('-m', '--metric', default='ibm_bleu',
                        help='metric to optimize. Example values: '
                             'ibm_bleu, nist_bleu, Koehn_bleu, TER, Combi')
    parser.add_argument('--max-iterations', type=int, default=10, metavar='N',
                        help='maximum number of iterations to run')
    parser.add_argument('--optimizer', type=int, default=2, choices=range(1,6),
                        help='learning method to use for weight update.'
                             ' Choices: 1) SGD, 2) PA MIRA with Selection from Cutting'
                             ' Plane, 3) Cutting Plane MIRA, 4) PA MIRA,'
                             ' 5) nbest MIRA with hope, fear, and model constraints')
    parser.add_argument('--metric-scale', type=int, default=1, metavar='N',
                        help='scale MT loss by this amount when computing'
                             ' hope/fear candidates')
    parser.add_argument('-k', '--kbest-size', type=int, default=250, metavar='N',
                        help='size of k-best list to extract from forest')
    parser.add_argument('--update-size', type=int, metavar='N',
                        help='size of k-best list to use for update. defaults to '
                             'equal kbest-size (applies to optimizer 5)')
    parser.add_argument('--step-size', type=float, default=0.01,
                        help='controls aggresiveness of update')
    parser.add_argument('--hope', type=int, default=1, choices=range(1,3),
                        help='how to select hope candidate. options: '
                             '1) model score - cost, 2) min cost')
    parser.add_argument('--fear', type=int, default=1, choices=range(1,4),
                        help='how to select fear candidate. options: '
                             '1) model score + cost, 2) max cost, 3) max score')
    parser.add_argument('--sent-approx', action='store_true',
                        help='use smoothed sentence-level MT metric')
    parser.add_argument('--no-pseudo', action='store_true',
                        help="don't use pseudo document to approximate MT metric")
    parser.add_argument('--no-unique', action='store_true',
                        help="don't extract unique k-best from forest")
    parser.add_argument('-g', '--grammar-prefix', metavar='PATH',
                        help='path to sentence specific grammar files')
    parser.add_argument('--pass-suffix',
                        help='multipass decoding iteration. see documentation '
                             'at www.cdec-decoder.org for more information')
    args = parser.parse_args()

    args.metric = args.metric.upper()

    if not args.update_size:
        args.update_size = args.kbest_size

    #TODO fix path to match decode+evaluate (python month 1-12 instead of 0-11)
    #if an output directory isn't specified, create a unique directory name
    #of the form mira.(devset).YYYYMMDD-HHMMSS
    if not args.output_dir:
        # Use only the file name of the dev set: with the full path
        # ('data/dev.txt') the name would contain a '/' and point into a
        # directory that does not exist.
        devset_name = os.path.splitext(os.path.basename(args.devset))[0]
        args.output_dir = 'mira.{0}.{1}'.format(
            devset_name, time.strftime('%Y%m%d-%H%M%S'))

    if not os.path.isabs(args.output_dir):
        args.output_dir = os.path.abspath(args.output_dir)
    if os.path.exists(args.output_dir):
        # NOTE(review): os.listdir() never returns '.' and '..', so this
        # tolerates up to two pre-existing entries -- confirm the threshold.
        if len(os.listdir(args.output_dir)) > 2:
            logging.error('Error: working directory {0} already exists\n'.format(
                args.output_dir))
            sys.exit(1)  # non-zero status: this is a failure
    else:
        os.makedirs(args.output_dir)

    if args.grammar_prefix:
        if not os.path.isabs(args.grammar_prefix):
            args.grammar_prefix = os.path.abspath(args.grammar_prefix)

    #record how to repeat this run
    with open(args.output_dir+'/rerun_mira.sh', 'w') as script:
        script.write('cd {0}\n'.format(os.getcwd()))
        script.write(' '.join(sys.argv)+'\n')

    #create weights.0 file from initial weights file
    if args.weights:
        shutil.copy(args.weights, os.path.join(args.output_dir, 'weights.0'))
    else: #if no weights given, use Glue 0 as default
        with open(args.output_dir+'/weights.0', 'w') as weights:
            weights.write('Glue 0\n')
    args.weights = args.output_dir+'/weights.0'

    #create mira ini file
    shutil.copy(args.config, '{0}/kbest_cut_mira.ini'.format(args.output_dir))

    newdev = args.output_dir+'/dev.input'
    dev_size = enseg(args.devset, newdev, args.grammar_prefix)
    args.devset = newdev

    write_config(args)
    args.weights, hope_best_fear = optimize(args, script_dir, dev_size)

    graph_file = graph(args.output_dir, hope_best_fear, args.metric)

    dev_results, dev_bleu = evaluate(args.devset, args.weights, args.config,
                                     script_dir, args.output_dir)
    if args.test:
        #fall back to the tuning config when no test config was given
        test_config = args.test_config if args.test_config else args.config
        test_results, test_bleu = evaluate(args.test, args.weights,
                                           test_config, script_dir,
                                           args.output_dir)
    else:
        test_results = ''
        test_bleu = ''
    logging.info(dev_results+'\n')
    logging.info(test_results)

    write_report(graph_file, dev_results, dev_bleu, test_results, test_bleu, args)

    if graph_file:
        logging.info('A graph of the best/hope/fear scores over the iterations '
                     'has been saved to {}\n'.format(graph_file))

    # single-argument print(...) produces the same output on Python 2 and 3
    print('final weights:\n{}\n'.format(args.weights))
+
#graph of hope/best/fear metric values across all iterations
def graph(output_dir, hope_best_fear, metric):
    """Plot the per-iteration best/hope/fear scores to <output_dir>/mira.pdf.

    Args:
        output_dir: directory in which mira.pdf is written.
        hope_best_fear: dict with keys 'hope', 'best' and 'fear', each a list
            of scores with one entry per iteration.
        metric: metric name, used as the y-axis label.

    Returns:
        The path of the saved graph, or '' when there is nothing to plot or
        matplotlib cannot be imported.
    """
    best = hope_best_fear['best']
    if not best:
        # max() on an empty list would raise ValueError (zero iterations run)
        logging.error('No scores to graph. Graphing disabled.\n')
        return ''
    try:
        import matplotlib
        # Only savefig() is used, so select the non-interactive Agg backend;
        # it does not need a display.
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
    except ImportError:
        logging.error('Error importing matplotlib. Graphing disabled.\n')
        return ''
    max_y = float(max(best))*1.5
    figure = plt.figure()
    plt.plot(best, label='best')
    plt.plot(hope_best_fear['hope'], label='hope')
    plt.plot(hope_best_fear['fear'], label='fear')
    plt.axis([0, len(hope_best_fear['fear'])-1, 0, max_y])
    plt.xlabel('Iteration')
    plt.ylabel(metric)
    plt.legend()
    graph_file = output_dir+'/mira.pdf'
    plt.savefig(graph_file)
    plt.close(figure)  # release the figure instead of leaving it in pyplot
    return graph_file
+
#evaluate a given test set using decode-and-evaluate.pl
def evaluate(testset, weights, ini, script_dir, out_dir):
    """Decode and score a test set with <script_dir>/../utils/decode-and-evaluate.pl.

    Args:
        testset: path of the set to decode (passed as -i).
        weights: path of the weights file (passed as -w).
        ini: path of the decoder config (passed as -c).
        script_dir: directory of this script; the evaluator is looked up
            relative to it.
        out_dir: working directory handed to the evaluator (passed as -d).

    Returns:
        A ``(results, bleu)`` tuple: ``bleu`` is the first line of the
        evaluator's stdout and ``results`` is everything after it. Both are
        '' when the evaluator cannot be started or exits with a non-zero
        status.
    """
    evaluator = '{}/../utils/decode-and-evaluate.pl'.format(script_dir)
    cmd = [evaluator, '-c', ini, '-w', weights, '-i', testset, '-d', out_dir]
    try:
        # universal_newlines=True yields text (not bytes) on Python 3 as well
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                             universal_newlines=True)
        output, _ = p.communicate()
    except OSError as err:
        # Popen raises OSError (never CalledProcessError) when the evaluator
        # is missing or not executable
        logging.error('Evaluation of {0} failed: {1}'.format(testset, err))
        return '', ''
    if p.returncode:
        logging.error('Evaluation of {0} failed with exit code {1}'.format(
            testset, p.returncode))
        return '', ''
    # partition() copes with output that has no newline at all
    bleu, _, results = output.partition('\n')
    return results, bleu
+
#print a report to out_dir/mira.results
#send email with results if email was given
def write_report(graph_file, dev_results, dev_bleu,
                 test_results, test_bleu, args):
    """Write the run summary to <output_dir>/mira.results and optionally mail it.

    Args:
        graph_file: path of the score graph to attach, or '' for none.
        dev_results: evaluation text for the dev set ('' to omit).
        dev_bleu: dev-set score used in the mail subject.
        test_results: evaluation text for the test set ('' to omit).
        test_bleu: test-set score used in the mail subject.
        args: parsed options; reads weights, devset, test, output_dir, email.
    """
    features, top, bottom = weight_stats(args.weights)
    top = [f+' '+str(w) for f, w in top]
    bottom = [f+' '+str(w) for f, w in bottom]
    subject = 'MIRA {0} {1:7}'.format(os.path.basename(args.devset), dev_bleu)
    if args.test:
        subject += ' {0} {1:7}'.format(os.path.basename(args.test), test_bleu)

    message = ('MIRA has finished running. '+
               'The final weights can be found at \n{}\n'.format(args.weights)+
               'Average weights across all iterations '+
               '\n{}/weights.average\n'.format(args.output_dir)+
               'Weights were calculated for {} features\n\n'.format(features)+
               '5 highest weights:\n{}\n\n'.format('\n'.join(top))+
               '5 lowest weights:\n{}\n'.format('\n'.join(bottom)))

    if dev_results:
        message += '\nEvaluation: dev set\n{}'.format(dev_results)
    if test_results:
        message += '\nEvaluation: test set\n{}'.format(test_results)

    with open(args.output_dir+'/mira.results', 'w') as out:
        out.write(message)

    if args.email:
        cmd = ['mail', '-s', subject]
        if graph_file:
            cmd += ['-a', graph_file]
        try:
            # text-mode stdin so the str message is accepted on Python 3 too
            email_process = subprocess.Popen(cmd+[args.email],
                                             stdin=subprocess.PIPE,
                                             universal_newlines=True)
            email_process.communicate(message)
        except (OSError, IOError) as err:
            # The report is already on disk; a missing or failing 'mail'
            # program must not abort the run.
            logging.error('Unable to send report to {0}: {1}\n'.format(
                args.email, err))
+
#feature weights stats for report
def weight_stats(weight_file):
    """Summarize a weights file for the report.

    Each non-empty line is expected to hold 'feature weight' separated by
    whitespace. Blank lines and lines starting with '#' are skipped.

    Args:
        weight_file: path of the weights file.

    Returns:
        ``(count, top, bottom)`` where ``count`` is the number of features
        and ``top`` / ``bottom`` are the first / last five
        ``(feature, weight)`` tuples after sorting by descending weight.

    Raises:
        ValueError: if a remaining line does not have exactly two fields or
            the weight is not a number.
    """
    features = []
    with open(weight_file) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            feat, weight = line.split()
            features.append((feat, float(weight)))
    features.sort(key=lambda a: a[1], reverse=True)
    return len(features), features[:5], features[-5:]
+
#create source and refs files from parallel devset
#TODO remove when kbest_cut_mira changed to take parallel input
def split_devset(dev, outdir):
    """Split a parallel dev set into separate source and reference files.

    Every line of ``dev`` is stripped and split at the first ' ||| ': the
    left part goes to <outdir>/source.input and the rest (all references,
    still ' ||| '-separated) to <outdir>/refs.input.

    Args:
        dev: path of the parallel dev set.
        outdir: directory in which the two files are created.

    Returns:
        ``(source_path, refs_path)``.

    Raises:
        ValueError: if a line contains no ' ||| ' separator.
    """
    source_path = outdir+'/source.input'
    refs_path = outdir+'/refs.input'
    # 'with' closes all three handles even if a malformed line raises
    with open(dev) as parallel:
        with open(source_path, 'w') as source:
            with open(refs_path, 'w') as refs:
                for line in parallel:
                    s, r = line.strip().split(' ||| ', 1)
                    source.write(s+'\n')
                    refs.write(r+'\n')
    return (source_path, refs_path)
+
def optimize(args, script_dir, dev_size):
    """Run the MIRA optimization loop.

    For each of ``args.max_iterations`` iterations the dev set is decoded
    with kbest_cut_mira (through parallelize.pl), the hope/best/fear
    translations are written out and scored, and the weights produced by the
    decoder processes are averaged into weights.<i+1>.

    Args:
        args: parsed options; reads devset, output_dir, max_iterations,
            config, metric, metric_scale, update_size, kbest_size, optimizer,
            hope, fear, step_size, no_unique, sent_approx, no_pseudo, jobs.
        script_dir: directory of this script; the decoder and parallelize.pl
            are looked up relative to it.
        dev_size: number of dev sentences; one output line is expected for
            each of them.

    Returns:
        ``(weights_final, hope_best_fear)``: the path of weights.final and a
        dict with the per-iteration 'hope', 'best' and 'fear' scores.

    Raises:
        SystemExit: when the decoder fails or does not produce one line per
            dev sentence after all retries.
    """
    parallelize = script_dir+'/../utils/parallelize.pl'
    decoder = script_dir+'/kbest_cut_mira'
    (source, refs) = split_devset(args.devset, args.output_dir)
    port = random.randint(15000,50000)
    best_score_iter = -1
    best_score = -1
    i = 0
    hope_best_fear = {'hope':[],'best':[],'fear':[]}
    # defined up front so that max_iterations == 0 still yields weights.final
    last_weights_file = '{}/weights.{}'.format(args.output_dir, 0)

    # the references do not change between iterations: read them once
    with open(refs) as ref_file:
        references = [line.split(' ||| ') for line in
                      ref_file.read().strip().split('\n')]

    #main optimization loop
    while i < args.max_iterations:
        logging.info('\n\nITERATION {}\n========\n'.format(i))
        logging.info('using port {}\n'.format(port))

        #iteration specific files
        runfile = args.output_dir+'/run.raw.'+str(i)
        logdir = args.output_dir+'/logs.'+str(i)
        decoderlog = logdir+'/decoder.sentserver.log.'+str(i)
        weightdir = args.output_dir+'/weights.pass'+str(i)
        os.mkdir(logdir)
        os.mkdir(weightdir)

        logging.info('RUNNING DECODER AT {}'.format(time.asctime()))
        weightsfile = args.output_dir+'/weights.'+str(i)
        logging.info('ITER {}\n'.format(i))
        curr_pass = '0{}'.format(i)
        decoder_cmd = ('{0} -c {1} -w {2} -r{3} -m {4} -s {5} -b {6} -k {7} -o {8}'
                       ' -p {9} -O {10} -D {11} -h {12} -f {13} -C {14}').format(
                           decoder, args.config, weightsfile, refs, args.metric,
                           args.metric_scale, args.update_size, args.kbest_size,
                           args.optimizer, curr_pass, weightdir, args.output_dir,
                           args.hope, args.fear, args.step_size)
        if not args.no_unique:
            decoder_cmd += ' -u'
        if args.sent_approx:
            decoder_cmd += ' -a'
        if not args.no_pseudo:
            decoder_cmd += ' -e'

        #always use fork
        parallel_cmd = '{0} --use-fork -e {1} -j {2} --'.format(
            parallelize, logdir, args.jobs)

        cmd = parallel_cmd + ' ' + decoder_cmd
        logging.info('COMMAND: \n{}\n'.format(cmd))

        num_topbest = 0
        with open(decoderlog, 'w') as dlog:
            with open(runfile, 'w') as runf:
                for _attempt in range(6):
                    # Start every attempt with an empty run file; otherwise a
                    # retry appends to the previous output and the line count
                    # can never match.
                    runf.seek(0)
                    runf.truncate()
                    #call decoder through parallelize.pl, feeding it the
                    #source file directly (no 'cat' child left unreaped)
                    with open(source) as src:
                        exit_code = subprocess.call(shlex.split(cmd),
                                                    stderr=dlog, stdout=runf,
                                                    stdin=src)

                    if exit_code:
                        logging.error('Failed with exit code {}\n'.format(exit_code))
                        sys.exit(exit_code)

                    try:
                        with open(runfile) as f:
                            num_topbest = sum(1 for line in f)
                    except IOError:
                        logging.error('Unable to open {}\n'.format(runfile))
                        sys.exit(1)

                    if num_topbest == dev_size:
                        break
                    logging.warning('Incorrect number of top best. '
                                    'Waiting for distributed filesystem and retrying.')
                    time.sleep(10)

                if dev_size != num_topbest:
                    # format() instead of str + int, which raised TypeError
                    logging.error("Dev set contains {0} sentences, but we don't "
                                  "have topbest for all of these. Decoder failure? "
                                  " Check {1}\n".format(dev_size, decoderlog))
                    sys.exit(1)

        #write best, hope, and fear translations
        hopes = []
        bests = []
        fears = []
        with open(runfile) as run:
            with open(runfile+'.H', 'w') as H:
                with open(runfile+'.B', 'w') as B:
                    with open(runfile+'.F', 'w') as F:
                        for line in run:
                            # drop the newline so 'fear' does not carry it
                            # into the .F file and into the scorer
                            hope, best, fear = line.rstrip('\n').split(' ||| ')
                            hopes.append(hope)
                            bests.append(best)
                            fears.append(fear)
                            H.write('{}\n'.format(hope))
                            B.write('{}\n'.format(best))
                            F.write('{}\n'.format(fear))

        #gzip runfiles and log files to save space
        gzip_file(runfile)
        gzip_file(decoderlog)

        #get score for best hypothesis translations, hope and fear translations
        dec_score = fast_score(bests, references, args.metric)
        dec_score_h = fast_score(hopes, references, args.metric)
        dec_score_f = fast_score(fears, references, args.metric)

        # each score goes under its own key (hope and best were swapped)
        hope_best_fear['hope'].append(dec_score_h)
        hope_best_fear['best'].append(dec_score)
        hope_best_fear['fear'].append(dec_score_f)
        logging.info('DECODER SCORE: {0} HOPE: {1} FEAR: {2}\n'.format(
            dec_score, dec_score_h, dec_score_f))
        if dec_score > best_score:
            best_score_iter = i
            best_score = dec_score

        new_weights_file = '{}/weights.{}'.format(args.output_dir, i+1)
        last_weights_file = '{}/weights.{}'.format(args.output_dir, i)
        i += 1
        weight_files = weightdir+'/weights.mira-pass*.*[0-9].gz'
        average_weights(new_weights_file, weight_files)

    logging.info('\nBEST ITER: {} :: {}\n\n'.format(
        best_score_iter, best_score))
    weights_final = args.output_dir+'/weights.final'
    # weights.final is the file used for the last decoding pass
    shutil.copy(last_weights_file, weights_final)
    average_final_weights(args.output_dir)

    return weights_final, hope_best_fear
+
#TODO
#create a weights file with the average of the weights from each iteration
def average_final_weights(out_dir):
    """Average the per-iteration weights into <out_dir>/weights.average.

    Reads every file matching <out_dir>/weights.[1-9]* (so weights.0 is not
    included), sums each feature's weight and divides by the number of files
    read. A feature missing from a file therefore counts as 0 for that file.
    Features are written in sorted order, one 'feature weight' per line.

    Args:
        out_dir: working directory holding the weights.<n> files.
    """
    logging.info('Average of weights from each iteration\n')
    weight_files = glob.glob(out_dir+'/weights.[1-9]*')
    totals = {}
    for path in weight_files:
        with open(path) as weights:
            for line in weights:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines
                f, w = line.split(' ', 1)
                totals[f] = totals.get(f, 0.0) + float(w)

    average_path = out_dir+'/weights.average'
    # 'with' makes sure the averages are flushed to disk before returning
    with open(average_path, 'w') as out:
        for f in sorted(totals):
            out.write('{} {}\n'.format(f, totals[f]/len(weight_files)))
    logging.info('An average weights file can be found at'
                 '\n{}\n'.format(average_path))
+
#create gzipped version of given file with name filename.gz
# and delete original file
def gzip_file(filename):
    """Compress ``filename`` to ``filename + '.gz'`` and remove the original.

    The data is copied as raw bytes, so the content is preserved exactly
    under both Python 2 and Python 3. The original is only deleted after
    both files have been closed.

    Args:
        filename: path of the file to compress.
    """
    with open(filename, 'rb') as plain:
        with gzip.open(filename+'.gz', 'wb') as packed:
            shutil.copyfileobj(plain, packed)
    os.remove(filename)
+
#average the weights for a given pass
def average_weights(new_weights, weight_files):
    """Write the weighted average of one pass's weight files.

    Every gzipped file matching ``weight_files`` starts with a header line
    'msg ||| ran ||| mult'; the remaining lines are 'feature weight'. Each
    weight is multiplied by the file's ``mult``, summed per feature, and the
    sum is divided by the total of all ``mult`` values.

    Args:
        new_weights: path of the weights file to write.
        weight_files: glob pattern selecting the gzipped per-process files.

    Raises:
        ZeroDivisionError: if features were read but all multipliers are 0.
    """
    def as_text(raw):
        # gzip.open() yields bytes on Python 3 and str on Python 2
        return raw if isinstance(raw, str) else raw.decode('utf-8')

    logging.info('AVERAGE {} {}\n'.format(new_weights, weight_files))
    feature_weights = {}
    total_mult = 0.0
    paths = glob.glob(weight_files)
    if not paths:
        logging.warning('No weight files match {}\n'.format(weight_files))
    for path in paths:
        logging.info('FILE {}\n'.format(path))
        score = gzip.open(path)
        try:
            msg, ran, mult = as_text(score.readline()).strip().split(' ||| ')
            logging.info('Processing {} {}'.format(ran, mult))
            mult = float(mult)
            for line in score:
                line = as_text(line)
                if not line.strip():
                    continue  # tolerate blank lines
                f, w = line.split(' ', 1)
                feature_weights[f] = feature_weights.get(f, 0.0) + mult*float(w)
        finally:
            score.close()
        total_mult += mult

    #write new weights to outfile
    with open(new_weights, 'w') as out:
        for f in feature_weights:
            avg = feature_weights[f]/total_mult
            logging.info('{} {} {} ||| Printing {} {}\n'.format(
                f, feature_weights[f], total_mult, f, avg))
            out.write('{} {}\n'.format(f, avg))
+
def write_config(args):
    """Log a summary of the run configuration at INFO level.

    Args:
        args: parsed options; reads config, output_dir, devset, metric,
            max_iterations, jobs, weights, grammar_prefix, test, test_config
            and email.
    """
    # Report the decoder that sits next to this script (the location is
    # derived from sys.argv[0]) instead of a hard-coded developer path.
    script_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    decoder = script_dir+'/kbest_cut_mira'

    config = ('\n'
              'DECODER: '+decoder+'\n'
              'INI FILE: '+args.config+'\n'
              'WORKING DIRECTORY: '+args.output_dir+'\n'
              'DEVSET: '+args.devset+'\n'
              'EVAL METRIC: '+args.metric+'\n'
              'MAX ITERATIONS: '+str(args.max_iterations)+'\n'
              'DECODE NODES: '+str(args.jobs)+'\n'
              'INITIAL WEIGHTS: '+str(args.weights)+'\n')
    #optional settings are only listed when they were given
    if args.grammar_prefix:
        config += 'GRAMMAR PREFIX: '+str(args.grammar_prefix)+'\n'
    if args.test:
        config += 'TEST SET: '+args.test+'\n'
    if args.test_config:
        config += 'TEST CONFIG: '+args.test_config+'\n'
    if args.email:
        config += 'EMAIL: '+args.email+'\n'

    logging.info(config)
+
+#run the tuning pipeline only when executed as a script, not when imported
+if __name__=='__main__':
+ main()