diff options
| author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-05 18:42:44 +0000 | 
|---|---|---|
| committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-05 18:42:44 +0000 | 
| commit | c6185ff9d09a9c4f012f8cec5a91ac0ad0ef002a (patch) | |
| tree | 68c6fd3e9fa377590cdc10b05c0ed95169ead7f1 | |
| parent | 863c1e258f1da790456b166ddedb1ce61f614d4b (diff) | |
More work on the exp script (gi/pipeline/evaluation-pipeline.pl).
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@131 ec762483-ff6d-05da-a07a-a48fb63a330f
| -rw-r--r-- | gi/pipeline/config.eval | 4 | ||||
| -rwxr-xr-x | gi/pipeline/evaluation-pipeline.pl | 100 | 
2 files changed, 97 insertions, 7 deletions
| diff --git a/gi/pipeline/config.eval b/gi/pipeline/config.eval index 4419de9f..9f8da238 100644 --- a/gi/pipeline/config.eval +++ b/gi/pipeline/config.eval @@ -1,5 +1,5 @@ -# name path aligned corpus dev dev-refs test1 test1-refs ... -btec btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh devtest/devset3.lc.en* +# name path aligned-corpus LM dev dev-refs test1 testt-eval.sh ... +btec btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh  fbis chinese-english.fbis corpus.zh-en.al  zhen chinese-english corpus.zh-en.al  aren arabic-english corpus.ar-en.al diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl index a7cc20bc..d4b2dc76 100755 --- a/gi/pipeline/evaluation-pipeline.pl +++ b/gi/pipeline/evaluation-pipeline.pl @@ -1,9 +1,19 @@  #!/usr/bin/perl -w  use strict;  use Getopt::Long; +use Cwd; +my $CWD = getcwd;  my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } +my $EXTOOLS = "$SCRIPT_DIR/../../extools"; +die "Can't find extools: $EXTOOLS" unless -e $EXTOOLS && -d $EXTOOLS; +my $VEST = "$SCRIPT_DIR/../../vest"; +die "Can't find vest: $VEST" unless -e $VEST && -d $VEST; +my $DISTVEST = "$VEST/dist-vest.pl"; +my $FILTSCORE = "$EXTOOLS/filter_score_grammar"; +assert_exec($FILTSCORE, $DISTVEST); +  my %init_weights = qw(    EGivenF -0.3    FGivenE -0.3 @@ -21,11 +31,12 @@ my %init_weights = qw(  my $config = "$SCRIPT_DIR/config.eval";  open CONF, "<$config" or die "Can't read $config: $!";  my %paths; +my %corpora;  my %lms;  my %devs;  my %devrefs;  my %tests; -my %testrefs; +my %testevals;  print STDERR "LANGUAGE PAIRS:";  while(<CONF>) {    chomp; @@ -33,19 +44,21 @@ while(<CONF>) {    next if /^\s*$/;    s/^\s+//;    s/\s+$//; -  my ($name, $path, $lm, $dev, $devref, @xtests) = split /\s+/; +  my ($name, $path, $corpus, $lm, $dev, 
$devref, @xtests) = split /\s+/;    $paths{$name} = $path; +  $corpora{$name} = $corpus;    $lms{$name} = $lm;    $devs{$name} = $dev;    $devrefs{$name} = $devref;    $tests{$name} = $xtests[0]; -  $testrefs{$name} = $xtests[1]; +  $testevals{$name} = $xtests[1];    print STDERR " $name";  }  print STDERR "\n";  my %langpairs = map { $_ => 1 } qw( btec zhen fbis aren uren nlfr ); +my $outdir = "$CWD/exp";  my $help;  my $dataDir = '/export/ws10smt/data';  if (GetOptions( @@ -63,8 +76,57 @@ my $corpdir = "$dataDir";  if ($paths{$lp} =~ /^\//) { $corpdir = $paths{$lp}; } else { $corpdir .= '/' . $paths{$lp}; }  die "I can't find the corpora directory: $corpdir" unless -d $corpdir;  print STDERR "       GRAMMAR: $grammar\n"; -my $LANG_MODEL = $corpdir . '/' . $lms{$lp}; +my $LANG_MODEL = mydircat($corpdir, $lms{$lp});  print STDERR "            LM: $LANG_MODEL\n"; +my $CORPUS = mydircat($corpdir, $corpora{$lp}); +die "Can't find corpus: $CORPUS" unless -f $CORPUS; + +my $dev = mydircat($corpdir, $devs{$lp}); +my $drefs = $devrefs{$lp}; +die "Can't find dev: $dev\n" unless -f $dev; +die "Dev refs not set" unless $drefs; +$drefs = mydircat($corpdir, $drefs); + +my $test = mydircat($corpdir, $tests{$lp}); +my $teval = mydircat($corpdir, $testevals{$lp}); +die "Can't find test: $test\n" unless -f $test; +assert_exec($teval); + +# MAKE DEV +print STDERR "\nFILTERING FOR dev...\n"; +print STDERR "DEV: $dev (REFS=$drefs)\n"; +`mkdir -p $outdir`; +my $devgrammar = filter($grammar, $dev, 'dev', $outdir); +my $devini = mydircat($outdir, "cdec-dev.ini"); +write_cdec_ini($devini, $devgrammar); + + +# MAKE TEST +print STDERR "\nFILTERING FOR test...\n"; +print STDERR "TEST: $test (EVAL=$teval)\n"; +`mkdir -p $outdir`; +my $testgrammar = filter($grammar, $test, 'test', $outdir); +my $testini = mydircat($outdir, "cdec-test.ini"); +write_cdec_ini($testini, $testgrammar); + + +sub filter { +  my ($grammar, $set, $name, $outdir) = @_; +  my $outgrammar = mydircat($outdir, 
"$name.scfg.gz"); +  if (-f $outgrammar) { print STDERR "$outgrammar exists - REUSING!\n"; } else { +    my $cmd = "gunzip -c $grammar | $FILTSCORE -c $CORPUS -t $dev | gzip > $outgrammar"; +    safesystem($cmd) or die "Can't filter and score grammar!"; +  } +  return $outgrammar; +} + +sub mydircat { + my ($base, $suffix) = @_; + if ($suffix =~ /^\//) { return $suffix; } + my $res = $base . '/' . $suffix; + $res =~ s/\/\//\//g; + return $res; +}  sub write_cdec_ini {    my ($filename, $grammar_path) = (@_); @@ -73,7 +135,7 @@ sub write_cdec_ini {  formalism=scfg  cubepruning_pop_limit=100  add_pass_through_rules=true -scfg_extra_glue_grammar=/export/ws10smt/cdyer/glue.scfg.gz +scfg_extra_glue_grammar=/export/ws10smt/data/glue/glue.scfg.gz  grammar=$grammar_path  feature_function=WordPenalty  feature_function=LanguageModel -o 3 $LANG_MODEL @@ -92,3 +154,31 @@ run MERT, report scores.  EOT  } + +sub safesystem { +  print STDERR "Executing: @_\n"; +  system(@_); +  if ($? == -1) { +      print STDERR "ERROR: Failed to execute: @_\n  $!\n"; +      exit(1); +  } +  elsif ($? & 127) { +      printf STDERR "ERROR: Execution of: @_\n  died with signal %d, %s coredump\n", +          ($? & 127),  ($? & 128) ? 'with' : 'without'; +      exit(1); +  } +  else { +    my $exitcode = $? >> 8; +    print STDERR "Exit code: $exitcode\n" if $exitcode; +    return ! $exitcode; +  } +} + +sub assert_exec { +  my @files = @_; +  for my $file (@files) { +    die "Can't find $file - did you run make?\n" unless -e $file; +    die "Can't execute $file" unless -e $file; +  } +}; + | 
