diff options
Diffstat (limited to 'training')
| -rwxr-xr-x | training/mira/mira.py | 533 | ||||
| -rwxr-xr-x | training/utils/decode-and-evaluate.pl | 15 | 
2 files changed, 545 insertions, 3 deletions
#!/usr/bin/env python
"""MIRA tuning driver for the cdec decoder (training/mira/mira.py).

Runs kbest_cut_mira over a development set through parallelize.pl for a
number of iterations, averages the weights written by every pass, scores the
hope/best/fear translations with pycdec, and finally evaluates the tuned
weights with decode-and-evaluate.pl.

Requirements:
  * pycdec must be built, since it is used for scoring hypothesis
    translations.
  * matplotlib must be installed for graphing to work.
  * the email option requires the ``mail`` command.

Companion change (same commit, training/utils/decode-and-evaluate.pl): the
Perl script gains a ``--dir <dir>`` option naming the base directory under
which its ``eval.<name>.<timestamp>`` directory is created, and it now prints
``BLEU: <score>`` followed by the score report on STDOUT.  ``evaluate()``
below depends on both.
"""
import argparse
import glob
import gzip
import itertools
import logging
import os
import random
import re
import shlex
import shutil
import subprocess
import sys
import time

try:
    import cdec.score
except ImportError:
    # Keep the module importable without pycdec; main() and fast_score()
    # report the missing dependency explicitly.
    cdec = None

try:
    from shlex import quote as _shell_quote
except ImportError:  # Python 2
    from pipes import quote as _shell_quote

# Lazy pairing on Python 2, builtin zip (already lazy) on Python 3.
_izip = getattr(itertools, 'izip', zip)

_SEG_TAG = re.compile(r'\s*<seg')
_SEG_ID = re.compile(r'id="[0-9]+"')

# Number of times the decoder is run before giving up on an iteration.
_MAX_DECODE_ATTEMPTS = 6


def _to_text(raw):
    """Return *raw* as ``str`` (decodes bytes read from binary files)."""
    return raw if isinstance(raw, str) else raw.decode('utf-8')


def fast_score(hyps, refs, metric):
    """Score hypotheses against references using pycdec.

    hyps   -- sequence of hypothesis strings
    refs   -- sequence of reference lists, parallel to *hyps*
    metric -- metric name; 'BLEU' and 'COMBI' are rewritten to the names
              logged in the warnings below before the scorer is created
    Returns the ``score`` attribute of the summed statistics.
    Raises ImportError when pycdec is not available.
    """
    if cdec is None:
        raise ImportError('pycdec (cdec.score) is required for scoring')
    # Normalise the metric name *before* building the scorer, otherwise the
    # substitution has no effect.
    if metric == 'BLEU':
        logging.warning('BLEU is ambiguous, assuming IBM_BLEU\n')
        metric = 'IBM_BLEU'
    elif metric == 'COMBI':
        logging.warning('COMBI metric is no longer supported, switching to '
                        'COMB:TER=-0.5;BLEU=0.5\n')
        metric = 'COMB:TER=-0.5;BLEU=0.5'
    scorer = cdec.score.Scorer(metric)
    logging.info('loaded {0} references for scoring with {1}\n'.format(
        len(refs), metric))
    stats = sum(scorer(r).evaluate(h) for h, r in _izip(hyps, refs))
    logging.info(stats.detail + '\n')
    return stats.score


def enseg(devfile, newfile, gprefix):
    """Create a parallel input file in sgml format.

    Every line of *devfile* must look like ``src ||| ref1 ||| ref2``.  A
    source that already starts with a ``<seg`` tag is copied unchanged (it
    must carry a numeric ``id`` attribute); any other source is wrapped in
    ``<seg id="N">`` where N is the zero-based line number, with a
    ``grammar`` attribute when *gprefix* is given.

    Returns the number of lines written.  Exits with status 1 when a
    pre-tagged segment has no id or a grammar file is missing; raises
    IOError when a file cannot be opened and ValueError for a line without
    the `` ||| `` separator.
    """
    try:
        dev = open(devfile)
        try:
            new = open(newfile, 'w')
        except IOError:
            dev.close()
            raise
    except IOError:
        logging.error('Error opening source file')
        raise

    count = 0
    with dev, new:
        for line in dev:
            try:
                src, refs = line.split(' ||| ', 1)
            except ValueError:
                raise ValueError(
                    'line {0} of {1} has no " ||| " separator'.format(
                        count + 1, devfile))
            if _SEG_TAG.match(src):
                if not _SEG_ID.search(src):
                    logging.error(
                        'When using segments with pre-generated <seg> tags, '
                        'yout must include a zero based id attribute')
                    sys.exit(1)
                new.write(line)
            else:
                sgml = '<seg id="{0}"'.format(count)
                if gprefix:
                    # Per-sentence grammars may or may not be gzipped.
                    if os.path.exists('{}.{}.gz'.format(gprefix, count)):
                        sgml += ' grammar="{0}.{1}.gz"'.format(gprefix, count)
                    elif os.path.exists('{}.{}'.format(gprefix, count)):
                        sgml += ' grammar="{}.{}"'.format(gprefix, count)
                    else:
                        logging.error('Could not find grammar files with '
                                      'prefix {}\n'.format(gprefix))
                        sys.exit(1)
                sgml += '>{0}</seg> ||| {1}'.format(src, refs)
                new.write(sgml)
            count += 1
    return count


def _build_parser():
    """Build the command-line parser for the tuning script."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-d', '--devset', required=True,
                        help='dev set input file in parallel. '
                        'format: src ||| ref1 ||| ref2')
    parser.add_argument('-c', '--config', required=True,
                        help='decoder configuration file')
    parser.add_argument('-w', '--weights',
                        help='initial weights file')
    parser.add_argument('-j', '--jobs', type=int, default=1,
                        help='number of decoder processes to run in parallel')
    parser.add_argument('-o', '--output-dir', metavar='DIR',
                        help='directory for intermediate and output files. '
                        'defaults to mira.(devset name).(time)')
    parser.add_argument('-e', '--email',
                        help='email address to send result report')
    parser.add_argument('-t', '--test',
                        help='test set to decode and evaluate')
    parser.add_argument('--test-config',
                        help='config file for testing. the config file used '
                        'for tuning feature weights will be used by default.')
    parser.add_argument('-m', '--metric', default='ibm_bleu',
                        help='metric to optimize. Example values: '
                             'ibm_bleu, nist_bleu, Koehn_bleu, TER, Combi')
    parser.add_argument('--max-iterations', type=int, default=10,
                        metavar='N',
                        help='maximum number of iterations to run')
    parser.add_argument('--optimizer', type=int, default=2,
                        choices=range(1, 6),
                        help='learning method to use for weight update.'
                        ' Choices: 1) SGD, 2) PA MIRA with Selection from '
                        'Cutting'
                        ' Plane, 3) Cutting Plane MIRA, 4) PA MIRA,'
                        ' 5) nbest MIRA with hope, fear, and model '
                        'constraints')
    parser.add_argument('--metric-scale', type=int, default=1, metavar='N',
                        help='scale MT loss by this amount when computing'
                        ' hope/fear candidates')
    parser.add_argument('-k', '--kbest-size', type=int, default=250,
                        metavar='N',
                        help='size of k-best list to extract from forest')
    parser.add_argument('--update-size', type=int, metavar='N',
                        help='size of k-best list to use for update. '
                        'defaults to '
                        'equal kbest-size (applies to optimizer 5)')
    parser.add_argument('--step-size', type=float, default=0.01,
                        help='controls aggresiveness of update')
    parser.add_argument('--hope', type=int, default=1, choices=range(1, 3),
                        help='how to select hope candidate. options: '
                        '1) model score - cost, 2) min cost')
    parser.add_argument('--fear', type=int, default=1, choices=range(1, 4),
                        help='how to select fear candidate. options: '
                        '1) model score + cost, 2) max cost, 3) max score')
    parser.add_argument('--sent-approx', action='store_true',
                        help='use smoothed sentence-level MT metric')
    parser.add_argument('--no-pseudo', action='store_true',
                        help="don't use pseudo document to approximate MT "
                        "metric")
    parser.add_argument('--no-unique', action='store_true',
                        help="don't extract unique k-best from forest")
    parser.add_argument('-g', '--grammar-prefix', metavar='PATH',
                        help='path to sentence specific grammar files')
    parser.add_argument('--pass-suffix',
                        help='multipass decoding iteration. see '
                             'documentation '
                             'at www.cdec-decoder.org for more information')
    return parser


def _default_output_dir(devset, now=None):
    """Return ``mira.(devset name).YYYYMMDD-HHMMSS`` for *devset*.

    Only the base name of the dev set is used so that a dev set given with
    a directory component does not produce a nested, non-existent path.
    """
    # TODO fix path to match decode+evaluate (python month 1-12 instead of
    # 0-11)
    t = time.localtime() if now is None else now
    stem = os.path.splitext(os.path.basename(devset))[0]
    return 'mira.{0}.{1}{2:02}{3:02}-{4:02}{5:02}{6:02}'.format(
        stem, t[0], t[1], t[2], t[3], t[4], t[5])


def main():
    """Parse options, run MIRA tuning, evaluate and report the results."""
    # set logging to write all info messages to stderr
    logging.basicConfig(level=logging.INFO)
    if cdec is None:
        # Fail before any decoding work is done.
        logging.error('pycdec (cdec.score) could not be imported; it is '
                      'required for scoring\n')
        sys.exit(1)
    script_dir = os.path.dirname(os.path.abspath(sys.argv[0]))

    args = _build_parser().parse_args()
    args.metric = args.metric.upper()

    if not args.update_size:
        args.update_size = args.kbest_size

    # if an output directory isn't specified, create a unique directory name
    if not args.output_dir:
        args.output_dir = _default_output_dir(args.devset)
    args.output_dir = os.path.abspath(args.output_dir)

    if os.path.exists(args.output_dir):
        if len(os.listdir(args.output_dir)) > 2:
            logging.error('Error: working directory {0} already '
                          'exists\n'.format(args.output_dir))
            sys.exit(1)
    else:
        os.makedirs(args.output_dir)

    if args.grammar_prefix:
        args.grammar_prefix = os.path.abspath(args.grammar_prefix)

    # Record how to re-run this job; quote arguments so that paths with
    # spaces or shell metacharacters survive.
    with open(args.output_dir + '/rerun_mira.sh', 'w') as script:
        script.write('cd {0}\n'.format(_shell_quote(os.getcwd())))
        script.write(' '.join(_shell_quote(a) for a in sys.argv) + '\n')

    # create weights.0 file from initial weights file
    if args.weights:
        shutil.copy(args.weights, os.path.join(args.output_dir, 'weights.0'))
    else:  # if no weights given, use Glue 0 as default
        with open(args.output_dir + '/weights.0', 'w') as weights:
            weights.write('Glue 0\n')
        args.weights = args.output_dir + '/weights.0'

    # create mira ini file
    shutil.copy(args.config,
                '{0}/kbest_cut_mira.ini'.format(args.output_dir))

    newdev = args.output_dir + '/dev.input'
    dev_size = enseg(args.devset, newdev, args.grammar_prefix)
    args.devset = newdev

    write_config(args)
    args.weights, hope_best_fear = optimize(args, script_dir, dev_size)

    graph_file = graph(args.output_dir, hope_best_fear, args.metric)

    dev_results, dev_bleu = evaluate(args.devset, args.weights, args.config,
                                     script_dir, args.output_dir)
    if args.test:
        test_config = args.test_config if args.test_config else args.config
        test_results, test_bleu = evaluate(args.test, args.weights,
                                           test_config, script_dir,
                                           args.output_dir)
    else:
        test_results = ''
        test_bleu = ''
    logging.info(dev_results + '\n')
    logging.info(test_results)

    write_report(graph_file, dev_results, dev_bleu, test_results, test_bleu,
                 args)

    if graph_file:
        logging.info('A graph of the best/hope/fear scores over the '
                     'iterations '
                     'has been saved to {}\n'.format(graph_file))

    sys.stdout.write('final weights:\n{}\n\n'.format(args.weights))


def graph(output_dir, hope_best_fear, metric):
    """Plot hope/best/fear metric values across all iterations.

    Writes ``output_dir/mira.pdf`` and returns its path.  Returns '' when
    matplotlib cannot be imported or there are no scores to plot.
    """
    try:
        import matplotlib
        # Only a PDF is written, so a display is never needed.
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
    except ImportError:
        logging.error('Error importing matplotlib. Graphing disabled.\n')
        return ''
    if not hope_best_fear['best']:
        return ''
    max_y = float(max(hope_best_fear['best'])) * 1.5
    plt.plot(hope_best_fear['best'], label='best')
    plt.plot(hope_best_fear['hope'], label='hope')
    plt.plot(hope_best_fear['fear'], label='fear')
    plt.axis([0, len(hope_best_fear['fear']) - 1, 0, max_y])
    plt.xlabel('Iteration')
    plt.ylabel(metric)
    plt.legend()
    graph_file = output_dir + '/mira.pdf'
    plt.savefig(graph_file)
    return graph_file


def evaluate(testset, weights, ini, script_dir, out_dir):
    """Evaluate a test set using decode-and-evaluate.pl.

    The script is expected at ``script_dir/../utils/decode-and-evaluate.pl``
    and is called with *out_dir* as its ``-d`` base directory.  Its standard
    output is split at the first newline: the first line is returned as the
    BLEU string, the remainder as the report.

    Returns ``(results, bleu)``; both are '' when the script cannot be
    started, exits with a non-zero status, or prints nothing usable.
    """
    evaluator = '{}/../utils/decode-and-evaluate.pl'.format(script_dir)
    failure = 'Evalutation of {} failed'.format(testset)
    try:
        proc = subprocess.Popen(
            [evaluator, '-c', ini, '-w', weights, '-i', testset,
             '-d', out_dir],
            stdout=subprocess.PIPE, universal_newlines=True)
        output, _ = proc.communicate()
    except OSError:
        # Popen raises OSError (not CalledProcessError) when the script is
        # missing or not executable.
        logging.error(failure)
        return '', ''
    if proc.returncode != 0 or '\n' not in output:
        logging.error(failure)
        return '', ''
    bleu, results = output.split('\n', 1)
    return results, bleu


def write_report(graph_file, dev_results, dev_bleu,
                 test_results, test_bleu, args):
    """Write a report to ``output_dir/mira.results`` and optionally mail it.

    The mail is sent through the ``mail`` command when ``args.email`` is
    set; *graph_file*, when non-empty, is attached with ``-a``.  A failure
    to start ``mail`` is logged and does not abort the run.
    """
    features, top, bottom = weight_stats(args.weights)
    top = [f + ' ' + str(w) for f, w in top]
    bottom = [f + ' ' + str(w) for f, w in bottom]
    subject = 'MIRA {0} {1:7}'.format(os.path.basename(args.devset),
                                      dev_bleu)
    if args.test:
        subject += ' {0} {1:7}'.format(os.path.basename(args.test),
                                       test_bleu)

    message = ('MIRA has finished running. ' +
               'The final weights can be found at \n{}\n'.format(
                   args.weights) +
               'Average weights across all iterations ' +
               '\n{}/weights.average\n'.format(args.output_dir) +
               'Weights were calculated for {} features\n\n'.format(
                   features) +
               '5 highest weights:\n{}\n\n'.format('\n'.join(top)) +
               '5 lowest weights:\n{}\n'.format('\n'.join(bottom)))

    if dev_results:
        message += '\nEvaluation: dev set\n{}'.format(dev_results)
    if test_results:
        message += '\nEvaluation: test set\n{}'.format(test_results)

    with open(args.output_dir + '/mira.results', 'w') as out:
        out.write(message)

    if args.email:
        cmd = ['mail', '-s', subject]
        if graph_file:
            cmd += ['-a', graph_file]
        try:
            # universal_newlines lets communicate() accept a text message.
            email_process = subprocess.Popen(cmd + [args.email],
                                             stdin=subprocess.PIPE,
                                             universal_newlines=True)
            email_process.communicate(message)
        except OSError:
            logging.error('Unable to run mail; report was not sent\n')


def weight_stats(weight_file):
    """Return feature weight statistics for the report.

    Reads ``name weight`` lines (blank lines are ignored) and returns
    ``(count, five highest, five lowest)`` where the two lists hold
    ``(name, weight)`` tuples sorted by descending weight.
    """
    features = []
    with open(weight_file) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue
            feat, weight = fields
            features.append((feat, float(weight)))
    features.sort(key=lambda pair: pair[1], reverse=True)
    return len(features), features[:5], features[-5:]


# TODO remove when kbest_cut_mira changed to take parallel input
def split_devset(dev, outdir):
    """Create source and refs files from a parallel devset.

    Splits every ``src ||| refs`` line of *dev* into
    ``outdir/source.input`` and ``outdir/refs.input`` and returns the two
    paths as ``(source, refs)``.
    """
    source_path = outdir + '/source.input'
    refs_path = outdir + '/refs.input'
    with open(dev) as parallel, \
            open(source_path, 'w') as source, \
            open(refs_path, 'w') as refs:
        for line in parallel:
            s, r = line.strip().split(' ||| ', 1)
            source.write(s + '\n')
            refs.write(r + '\n')
    return (source_path, refs_path)


def _decode_devset(cmd, source, runfile, decoderlog, dev_size):
    """Run the decoder until it yields one output line per dev sentence.

    *cmd* is the argument list to execute with *source* on stdin.  The run
    file is rewritten on every attempt so a retry never accumulates lines
    from an earlier attempt; decoder stderr of all attempts goes to
    *decoderlog*.  Exits the program when the decoder fails or the line
    count still differs from *dev_size* after the last attempt.
    """
    num_topbest = 0
    with open(decoderlog, 'w') as dlog:
        for _ in range(_MAX_DECODE_ATTEMPTS):
            # call decoder through parallelize.pl
            with open(source) as src, open(runfile, 'w') as runf:
                exit_code = subprocess.call(cmd, stdin=src, stdout=runf,
                                            stderr=dlog)
            if exit_code:
                logging.error('Failed with exit code {}\n'.format(exit_code))
                sys.exit(exit_code)

            try:
                with open(runfile) as f:
                    num_topbest = sum(1 for _line in f)
            except IOError:
                logging.error('Unable to open {}\n'.format(runfile))
                sys.exit(1)

            if num_topbest == dev_size:
                return
            logging.warning('Incorrect number of top best. '
                            'Waiting for distributed filesystem and '
                            'retrying.')
            time.sleep(10)

    logging.error("Dev set contains " + str(dev_size) + " sentences, but we "
                  "don't "
                  "have topbest for all of these. Decoder failure? "
                  " Check " + decoderlog + '\n')
    sys.exit(1)


def _split_run_file(runfile):
    """Split ``hope ||| best ||| fear`` lines of *runfile*.

    Writes the three columns to ``runfile.H``, ``runfile.B`` and
    ``runfile.F`` (one translation per line) and returns
    ``(hopes, bests, fears)``.
    """
    hopes, bests, fears = [], [], []
    with open(runfile) as run, \
            open(runfile + '.H', 'w') as hope_out, \
            open(runfile + '.B', 'w') as best_out, \
            open(runfile + '.F', 'w') as fear_out:
        for line in run:
            # Drop the line terminator first, otherwise it stays attached to
            # the fear translation.
            hope, best, fear = line.rstrip('\n').split(' ||| ')
            hopes.append(hope)
            bests.append(best)
            fears.append(fear)
            hope_out.write('{}\n'.format(hope))
            best_out.write('{}\n'.format(best))
            fear_out.write('{}\n'.format(fear))
    return hopes, bests, fears


def optimize(args, script_dir, dev_size):
    """Run the main MIRA optimization loop.

    For every iteration the dev set is decoded with kbest_cut_mira through
    parallelize.pl, the hope/best/fear outputs are scored, and the weights
    written by the pass are averaged into ``weights.<i+1>``.

    Returns ``(weights_final, hope_best_fear)`` where *weights_final* is the
    path ``output_dir/weights.final`` (a copy of the weights file used in
    the last iteration) and *hope_best_fear* maps 'hope', 'best' and 'fear'
    to the per-iteration scores.
    """
    parallelize = script_dir + '/../utils/parallelize.pl'
    decoder = script_dir + '/kbest_cut_mira'
    (source, refs) = split_devset(args.devset, args.output_dir)
    # NOTE(review): the port is only logged here; nothing in this script
    # passes it on to the decoder.
    port = random.randint(15000, 50000)
    best_score_iter = -1
    best_score = -1
    i = 0
    hope_best_fear = {'hope': [], 'best': [], 'fear': []}
    # Defined up front so that zero iterations still yield final weights.
    last_weights_file = '{}/weights.{}'.format(args.output_dir, 0)

    # main optimization loop
    while i < args.max_iterations:
        logging.info('\n\nITERATION {}\n========\n'.format(i))
        logging.info('using port {}\n'.format(port))

        # iteration specific files
        runfile = args.output_dir + '/run.raw.' + str(i)
        logdir = args.output_dir + '/logs.' + str(i)
        decoderlog = logdir + '/decoder.sentserver.log.' + str(i)
        weightdir = args.output_dir + '/weights.pass' + str(i)
        os.mkdir(logdir)
        os.mkdir(weightdir)

        logging.info('RUNNING DECODER AT {}'.format(time.asctime()))
        weightsfile = args.output_dir + '/weights.' + str(i)
        logging.info('ITER {}\n'.format(i))
        curr_pass = '0{}'.format(i)
        # Built as an argument list so paths containing spaces stay intact.
        decoder_cmd = [decoder, '-c', args.config, '-w', weightsfile,
                       '-r{}'.format(refs), '-m', args.metric,
                       '-s', str(args.metric_scale),
                       '-b', str(args.update_size),
                       '-k', str(args.kbest_size),
                       '-o', str(args.optimizer),
                       '-p', curr_pass, '-O', weightdir,
                       '-D', args.output_dir,
                       '-h', str(args.hope), '-f', str(args.fear),
                       '-C', str(args.step_size)]
        if not args.no_unique:
            decoder_cmd.append('-u')
        if args.sent_approx:
            decoder_cmd.append('-a')
        if not args.no_pseudo:
            decoder_cmd.append('-e')

        # always use fork
        parallel_cmd = [parallelize, '--use-fork', '-e', logdir,
                        '-j', str(args.jobs), '--']

        cmd = parallel_cmd + decoder_cmd
        logging.info('COMMAND: \n{}\n'.format(' '.join(cmd)))

        _decode_devset(cmd, source, runfile, decoderlog, dev_size)

        # write best, hope, and fear translations
        hopes, bests, fears = _split_run_file(runfile)

        # gzip runfiles and log files to save space
        gzip_file(runfile)
        gzip_file(decoderlog)

        with open(refs) as ref_file:
            references = [line.split(' ||| ') for line in
                          ref_file.read().strip().split('\n')]
        # get score for best hypothesis translations, hope and fear
        # translations
        dec_score = fast_score(bests, references, args.metric)
        dec_score_h = fast_score(hopes, references, args.metric)
        dec_score_f = fast_score(fears, references, args.metric)

        hope_best_fear['hope'].append(dec_score_h)
        hope_best_fear['best'].append(dec_score)
        hope_best_fear['fear'].append(dec_score_f)
        logging.info('DECODER SCORE: {0} HOPE: {1} FEAR: {2}\n'.format(
            dec_score, dec_score_h, dec_score_f))
        if dec_score > best_score:
            best_score_iter = i
            best_score = dec_score

        new_weights_file = '{}/weights.{}'.format(args.output_dir, i + 1)
        last_weights_file = '{}/weights.{}'.format(args.output_dir, i)
        i += 1
        weight_files = weightdir + '/weights.mira-pass*.*[0-9].gz'
        average_weights(new_weights_file, weight_files)

    logging.info('\nBEST ITER: {} :: {}\n\n'.format(
        best_score_iter, best_score))
    weights_final = args.output_dir + '/weights.final'
    shutil.copy(last_weights_file, weights_final)
    average_final_weights(args.output_dir)

    return weights_final, hope_best_fear


# TODO
def average_final_weights(out_dir):
    """Write the average of the weights from each iteration.

    Averages every ``out_dir/weights.[1-9]*`` file feature by feature (the
    sum is divided by the number of files, so a feature missing from a file
    counts as 0 there) and writes ``out_dir/weights.average`` with features
    in sorted order.
    """
    logging.info('Average of weights from each iteration\n')
    weight_files = [path for path in glob.glob(out_dir + '/weights.[1-9]*')
                    if os.path.isfile(path)]
    totals = {}
    for path in weight_files:
        with open(path) as weights:
            for line in weights:
                line = line.strip()
                if not line:
                    continue
                feat, value = line.split(' ', 1)
                totals[feat] = totals.get(feat, 0.0) + float(value)

    average_path = out_dir + '/weights.average'
    with open(average_path, 'w') as out:
        for feat in sorted(totals):
            out.write('{} {}\n'.format(feat,
                                       totals[feat] / len(weight_files)))
    logging.info('An average weights file can be found at'
                 '\n{}\n'.format(average_path))


def gzip_file(filename):
    """Create ``filename.gz`` from *filename* and delete the original.

    The data is copied in binary mode, so the content is preserved byte for
    byte.
    """
    with open(filename, 'rb') as plain:
        with gzip.open(filename + '.gz', 'wb') as packed:
            shutil.copyfileobj(plain, packed)
    os.remove(filename)


def average_weights(new_weights, weight_files):
    """Average the weights for a given pass.

    *weight_files* is a glob pattern of gzipped files.  The first line of
    each file has three `` ||| ``-separated fields, the last of which is a
    multiplier; every following ``name weight`` line is weighted by that
    multiplier.  The multiplier-weighted mean of every feature is written
    to *new_weights* in sorted feature order.
    """
    logging.info('AVERAGE {} {}\n'.format(new_weights, weight_files))
    feature_weights = {}
    total_mult = 0.0
    paths = glob.glob(weight_files)
    if not paths:
        logging.warning('No weight files match {}\n'.format(weight_files))
    for path in paths:
        logging.info('FILE {}\n'.format(path))
        with gzip.open(path, 'rb') as score:
            header = _to_text(score.readline()).strip()
            _msg, ran, mult = header.split(' ||| ')
            logging.info('Processing {} {}'.format(ran, mult))
            mult = float(mult)
            for raw in score:
                line = _to_text(raw)
                if not line.strip():
                    continue
                feat, value = line.split(' ', 1)
                feature_weights[feat] = (feature_weights.get(feat, 0.0) +
                                         mult * float(value))
        total_mult += mult

    # write new weights to outfile
    with open(new_weights, 'w') as out:
        for feat in sorted(feature_weights):
            avg = feature_weights[feat] / total_mult
            logging.info('{} {} {} ||| Printing {} {}\n'.format(
                feat, feature_weights[feat], total_mult, feat, avg))
            out.write('{} {}\n'.format(feat, avg))


def write_config(args):
    """Log the configuration of this tuning run.

    The decoder path is derived from the location of this script, the same
    way optimize() locates kbest_cut_mira.
    """
    decoder = os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])),
                           'kbest_cut_mira')
    config = ('\n'
              'DECODER: ' + decoder + '\n'
              'INI FILE: ' + args.config + '\n'
              'WORKING DIRECTORY: ' + args.output_dir + '\n'
              'DEVSET: ' + args.devset + '\n'
              'EVAL METRIC: ' + args.metric + '\n'
              'MAX ITERATIONS: ' + str(args.max_iterations) + '\n'
              'DECODE NODES: ' + str(args.jobs) + '\n'
              'INITIAL WEIGHTS: ' + str(args.weights) + '\n')
    if args.grammar_prefix:
        config += 'GRAMMAR PREFIX: ' + str(args.grammar_prefix) + '\n'
    if args.test:
        config += 'TEST SET: ' + args.test + '\n'
    if args.test_config:
        config += 'TEST CONFIG: ' + args.test_config + '\n'
    if args.email:
        config += 'EMAIL: ' + args.email + '\n'

    logging.info(config)


if __name__ == '__main__':
    main()
