diff options
Diffstat (limited to 'gi')
| -rwxr-xr-x | gi/pipeline/OLD.evaluation-pipeline.pl | 277 | 
1 files changed, 277 insertions, 0 deletions
#!/usr/bin/perl -w
# gi/pipeline/OLD.evaluation-pipeline.pl
#
# Given an induced grammar for a whole corpus, this script:
#   1. filters and scores the grammar for the dev and the test set,
#   2. writes a cdec configuration for each set,
#   3. writes a randomized initial weights file,
#   4. tunes the weights with dist-vest.pl (MERT) unless tuned weights exist,
#   5. decodes the test set with cdec through parallelize.pl,
#   6. runs the evaluation script configured for the language pair.
#
# Usage: OLD.evaluation-pipeline.pl [--data DIR] [--xfeats] language-pair grammar
use strict;
use Getopt::Long;
use Cwd;
my $CWD = getcwd;

my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }

# Features that always get an entry in the initial weights file.
my @DEFAULT_FEATS = qw(
  LogRuleCount SingletonRule LexE2F LexF2E WordPenalty
  LogFCount LanguageModel Glue GlueTop PassThrough SingletonF
);

# Base value per feature; write_random_weights_file() scales each one by a
# random factor. Features missing here (e.g. SingletonF) fall back to 0.0001.
my %init_weights = qw(
  LogRuleCount 0.2
  LexE2F -0.3
  LexF2E -0.3
  LogFCount 0.1
  WordPenalty -1.5
  LanguageModel 1.2
  Glue -1.0
  GlueTop 0.00001
  PassThrough -10.0
  SingletonRule -0.1
  X_EGivenF -0.3
  X_FGivenE -0.3
  X_LogECount -1
  X_LogFCount -0.1
  X_LogRuleCount 0.3
  X_SingletonE -0.1
  X_SingletonF -0.1
  X_SingletonRule -0.5
);

# External tools, located relative to this script.
my $CDEC = "$SCRIPT_DIR/../../decoder/cdec";
my $PARALLELIZE = "$SCRIPT_DIR/../../vest/parallelize.pl";
my $EXTOOLS = "$SCRIPT_DIR/../../extools";
die "Can't find extools: $EXTOOLS" unless -e $EXTOOLS && -d $EXTOOLS;
my $VEST = "$SCRIPT_DIR/../../vest";
die "Can't find vest: $VEST" unless -e $VEST && -d $VEST;
my $DISTVEST = "$VEST/dist-vest.pl";
my $FILTSCORE = "$EXTOOLS/filter_score_grammar";
my $ADDXFEATS = "$SCRIPT_DIR/scripts/xfeats.pl";
assert_exec($CDEC, $PARALLELIZE, $FILTSCORE, $DISTVEST, $ADDXFEATS);

# Read the corpora configuration. Each non-blank, non-comment line holds
# whitespace-separated fields:
#   name path corpus lm xgrammar dev devref test testeval
my $config = "$SCRIPT_DIR/clsp.config";
print STDERR "CORPORA CONFIGURATION: $config\n";
open my $conf_fh, '<', $config or die "Can't read $config: $!";
my %paths;
my %corpora;
my %lms;
my %devs;
my %devrefs;
my %tests;
my %testevals;
my %xgrammars;
print STDERR "       LANGUAGE PAIRS:";
while (my $line = <$conf_fh>) {
  chomp $line;
  # Trim first so that indented comments and whitespace-only lines are skipped.
  $line =~ s/^\s+//;
  $line =~ s/\s+$//;
  next if $line eq '';
  next if $line =~ /^#/;
  my ($name, $path, $corpus, $lm, $xgrammar, $dev, $devref, @xtests) = split /\s+/, $line;
  $paths{$name} = $path;
  $corpora{$name} = $corpus;
  $lms{$name} = $lm;
  $xgrammars{$name} = $xgrammar;
  $devs{$name} = $dev;
  $devrefs{$name} = $devref;
  $tests{$name} = $xtests[0];
  $testevals{$name} = $xtests[1];
  print STDERR " $name";
}
close $conf_fh;
print STDERR "\n";

# Not referenced anywhere else in this script; the language pairs actually
# accepted are the names found in the configuration file.
my %langpairs = map { $_ => 1 } qw( btec zhen fbis aren uren nlfr );

my $DEFAULT_DATA_DIR = '/export/ws10smt/data';
my $outdir = "$CWD/exp";
my $help;
my $XFEATS;
my $EXTRA_FILTER = '';
my $dataDir = $DEFAULT_DATA_DIR;
if (GetOptions(
        "data=s" => \$dataDir,
        "xfeats" => \$XFEATS,
        "help"   => \$help,
) == 0 || @ARGV!=2 || $help) {
        print_help();
        exit;
}
my $lp = $ARGV[0];
my $grammar = $ARGV[1];
print STDERR "   CORPUS REPO: $dataDir\n";
print STDERR " LANGUAGE PAIR: $lp\n";
die "I don't know about that language pair\n" unless $paths{$lp};

# Fail early, with a readable message, on a configuration line that is too
# short, instead of passing undef into mydircat().
for my $required (
  [ 'corpus',             \%corpora   ],
  [ 'language model',     \%lms       ],
  [ 'dev set',            \%devs      ],
  [ 'test set',           \%tests     ],
  [ 'test eval script',   \%testevals ],
) {
  my ($what, $table) = @$required;
  my $value = $table->{$lp};
  die "No $what configured for $lp in $config\n"
    unless defined $value && length $value;
}

# An absolute path in the configuration overrides the corpus repository root.
my $corpdir = "$dataDir";
if ($paths{$lp} =~ /^\//) { $corpdir = $paths{$lp}; } else { $corpdir .= '/' . $paths{$lp}; }
die "I can't find the corpora directory: $corpdir" unless -d $corpdir;
print STDERR "       GRAMMAR: $grammar\n";
my $LANG_MODEL = mydircat($corpdir, $lms{$lp});
print STDERR "            LM: $LANG_MODEL\n";
my $CORPUS = mydircat($corpdir, $corpora{$lp});
die "Can't find corpus: $CORPUS" unless -f $CORPUS;

my $dev = mydircat($corpdir, $devs{$lp});
my $drefs = $devrefs{$lp};
die "Can't find dev: $dev\n" unless -f $dev;
die "Dev refs not set" unless $drefs;
$drefs = mydircat($corpdir, $drefs);

my $test = mydircat($corpdir, $tests{$lp});
my $teval = mydircat($corpdir, $testevals{$lp});
die "Can't find test: $test\n" unless -f $test;
assert_exec($teval);

if ($XFEATS) {
  die "No x-grammar configured for $lp in $config\n"
    unless defined $xgrammars{$lp} && length $xgrammars{$lp};
  my $xgram = mydircat($corpdir, $xgrammars{$lp});
  die "Can't find x-grammar: $xgram" unless -f $xgram;
  # Extra pipeline stage inserted by filter() before the final gzip.
  $EXTRA_FILTER = "$ADDXFEATS $xgram |";
  print STDERR "ADDING X-FEATS FROM $xgram\n";
}

# The output directory is shared by the dev and test steps; create it once.
system('mkdir', '-p', $outdir) == 0
  or die "Can't create output directory $outdir\n";

# MAKE DEV
print STDERR "\nFILTERING FOR dev...\n";
print STDERR "DEV: $dev (REFS=$drefs)\n";
my $devgrammar = filter($grammar, $dev, 'dev', $outdir);
my $devini = mydircat($outdir, "cdec-dev.ini");
write_cdec_ini($devini, $devgrammar);


# MAKE TEST
print STDERR "\nFILTERING FOR test...\n";
print STDERR "TEST: $test (EVAL=$teval)\n";
my $testgrammar = filter($grammar, $test, 'test', $outdir);
my $testini = mydircat($outdir, "cdec-test.ini");
write_cdec_ini($testini, $testgrammar);


# CREATE INIT WEIGHTS
print STDERR "\nCREATING INITIAL WEIGHTS FILE: weights.init\n";
my $weights = mydircat($outdir, "weights.init");
write_random_weights_file($weights);


# VEST
print STDERR "\nMINIMUM ERROR TRAINING\n";
my $tuned_weights = mydircat($outdir, 'weights.tuned');
if (-f $tuned_weights) {
  print STDERR "TUNED WEIGHTS $tuned_weights EXISTS: REUSING\n";
} else {
  my $cmd = "$DISTVEST --ref-files=$drefs --source-file=$dev --weights $weights $devini";
  print STDERR "MERT COMMAND: $cmd\n";
  `rm -rf $outdir/vest 2> /dev/null`;
  chdir $outdir or die "Can't chdir to $outdir: $!";
  # The MERT command's standard output is used as the path of the tuned
  # weights file, which is then copied to weights.tuned.
  $weights = `$cmd`;
  die "MERT reported non-zero exit code" unless $? == 0;
  chomp $weights;
  safesystem($tuned_weights, "cp $weights $tuned_weights");
  print STDERR "TUNED WEIGHTS: $tuned_weights\n";
  die "$tuned_weights is missing!" unless -f $tuned_weights;
}

# DECODE
print STDERR "\nDECODE TEST SET\n";
my $decolog = mydircat($outdir, "test-decode.log");
my $testtrans = mydircat($outdir, "test.trans");
my $cmd = "cat $test | $PARALLELIZE -j 20 -e $decolog -- $CDEC -c $testini -w $tuned_weights > $testtrans";
safesystem($testtrans, $cmd) or die "Failed to decode test set!";


# EVALUATE
print STDERR "\nEVALUATE TEST SET\n";
print STDERR "TEST: $testtrans\n";
$cmd = "$teval $testtrans";
safesystem(undef, $cmd) or die "Failed to evaluate!";
exit 0;


# write_random_weights_file($file, @extras)
#
# Writes one "feature weight" line to $file for every default feature, every
# name in @extras and, when --xfeats is on, every X_* feature. Each weight is
# the feature's %init_weights value scaled by a random factor in [0, 1.6).
# A feature with no initial weight (or whose weight came out as 0) is written
# as 0.0001 and reported on STDERR. Dies if $file cannot be written.
sub write_random_weights_file {
  my ($file, @extras) = @_;
  open my $out, '>', $file or die "Can't write $file: $!";
  my @feats = (@DEFAULT_FEATS, @extras);
  if ($XFEATS) {
    push @feats, qw(
      X_LogRuleCount X_LogECount X_LogFCount X_EGivenF X_FGivenE X_SingletonRule X_SingletonE X_SingletonF
    );
  }
  for my $feat (@feats) {
    my $base = $init_weights{$feat};
    # Guard against features without an entry so no undef is multiplied.
    my $w = defined $base ? $base * rand(1.6) : 0;
    if ($w == 0) { $w = 0.0001; print STDERR "WARNING: $feat had no initial weight!\n"; }
    print {$out} "$feat $w\n";
  }
  close $out or die "Can't close $file: $!";
}

# filter($grammar, $set, $name, $outdir)
#
# Produces "$outdir/$name.scfg.gz" by piping the gzipped $grammar through
# filter_score_grammar for the sentences in $set (plus the optional x-feats
# stage). An existing output file is reused. Returns the output path; dies
# if the pipeline fails.
sub filter {
  my ($grammar, $set, $name, $outdir) = @_;
  my $outgrammar = mydircat($outdir, "$name.scfg.gz");
  if (-f $outgrammar) {
    print STDERR "$outgrammar exists - REUSING!\n";
    return $outgrammar;
  }
  my $cmd = "gunzip -c $grammar | $FILTSCORE -c $CORPUS -t $set | $EXTRA_FILTER gzip > $outgrammar";
  safesystem($outgrammar, $cmd) or die "Can't filter and score grammar!";
  return $outgrammar;
}

# mydircat($base, $suffix)
#
# Joins $base and $suffix with a single '/'. An absolute $suffix is returned
# unchanged. Runs of slashes in the joined result are collapsed to one.
sub mydircat {
  my ($base, $suffix) = @_;
  return $suffix if $suffix =~ m{^/};
  my $res = $base . '/' . $suffix;
  $res =~ s{/{2,}}{/}g;
  return $res;
}

# write_cdec_ini($filename, $grammar_path)
#
# Writes a cdec configuration file that points at $grammar_path and at the
# language model selected for this run. Dies if the file cannot be written.
# NOTE(review): the glue grammar path is fixed and does not follow --data.
sub write_cdec_ini {
  my ($filename, $grammar_path) = (@_);
  open my $ini, '>', $filename or die "Can't write $filename: $!";
  print {$ini} <<EOT;
formalism=scfg
cubepruning_pop_limit=100
add_pass_through_rules=true
scfg_extra_glue_grammar=/export/ws10smt/data/glue/glue.scfg.gz
grammar=$grammar_path
feature_function=WordPenalty
feature_function=LanguageModel -o 3 $LANG_MODEL
EOT
  close $ini or die "Can't close $filename: $!";
}

# print_help()
#
# Prints the usage message to STDERR.
sub print_help {
  print STDERR<<EOT;

Usage: $0 [OPTIONS] language-pair grammar.bidir.gz

Given an induced grammar for an entire corpus (i.e., generated by
local-gi-pipeline.pl), filter and featurize it for a dev and test set,
run MERT, report scores.

Options:
  --data DIR   corpus repository root (default: $DEFAULT_DATA_DIR)
  --xfeats     pipe the filtered grammars through scripts/xfeats.pl with the
               language pair's x-grammar and add the X_* initial weights
  --help       print this message

EOT
}

# _remove_output($output)
#
# Deletes the (possibly partial) output of a failed command, if one was
# named and exists.
sub _remove_output {
  my ($output) = @_;
  return unless defined $output && -e $output;
  print STDERR "Removing $output\n";
  system('rm', '-rf', $output);
  return;
}

# safesystem($output, @command)
#
# Runs @command with system(). $output is the file the command is expected
# to create, or undef; it is removed whenever the command fails.
# Returns true when the command exits with status 0 and false for any other
# exit status. Exits the script with status 1 if the command could not be
# started or was killed by a signal.
sub safesystem {
  my $output = shift @_;
  my $shown = "@_";
  print STDERR "Executing: $shown\n";
  system(@_);
  # Save the status before any cleanup command can overwrite $?.
  my $status = $?;
  if ($status == -1) {
      print STDERR "ERROR: Failed to execute: $shown\n  $!\n";
      _remove_output($output);
      exit(1);
  }
  if ($status & 127) {
      # The command text goes in as an argument, never as part of the format.
      printf STDERR "ERROR: Execution of: %s\n  died with signal %d, %s coredump\n",
          $shown, ($status & 127), ($status & 128) ? 'with' : 'without';
      _remove_output($output);
      exit(1);
  }
  my $exitcode = $status >> 8;
  if ($exitcode) {
    print STDERR "Exit code: $exitcode\n";
    _remove_output($output);
  }
  return ! $exitcode;
}

# assert_exec(@files)
#
# Dies unless every path in @files exists and is executable.
sub assert_exec {
  my @files = @_;
  for my $file (@files) {
    die "Can't find $file - did you run make?\n" unless -e $file;
    die "Can't execute $file" unless -x $file;
  }
}
