diff options
| -rw-r--r-- | gi/pipeline/clsp.config | 9 | ||||
| -rw-r--r-- | gi/pipeline/config.eval | 8 | ||||
| -rwxr-xr-x | gi/pipeline/evaluation-pipeline.pl | 85 | ||||
| -rwxr-xr-x | gi/pipeline/local-gi-pipeline.pl | 2 | ||||
| -rwxr-xr-x | vest/dist-vest.pl | 4 | 
5 files changed, 71 insertions, 37 deletions
| diff --git a/gi/pipeline/clsp.config b/gi/pipeline/clsp.config new file mode 100644 index 00000000..cd0f9d65 --- /dev/null +++ b/gi/pipeline/clsp.config @@ -0,0 +1,9 @@ +# THIS FILE GIVES THE LOCATIONS OF THE CORPORA USED +# name path aligned-corpus LM xfeats.grammar dev dev-refs test1 testt-eval.sh ... +btec /export/ws10smt/data/btec/ split.zh-en.al lm/en.3gram.lm.gz xgrammar/grammar.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh +fbis /export/ws10smt/data/chinese-english.fbis corpus.zh-en.al +zhen /export/ws10smt/data/chinese-english corpus.zh-en.al +aren /export/ws10smt/data/arabic-english corpus.ar-en.al +uren /export/ws10smt/data/urdu-english corpus.ur-en.al +nlfr /export/ws10smt/data/dutch-french corpus.nl-fr.al + diff --git a/gi/pipeline/config.eval b/gi/pipeline/config.eval deleted file mode 100644 index 9f8da238..00000000 --- a/gi/pipeline/config.eval +++ /dev/null @@ -1,8 +0,0 @@ -# name path aligned-corpus LM dev dev-refs test1 testt-eval.sh ... -btec btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh eval-devset3.sh -fbis chinese-english.fbis corpus.zh-en.al -zhen chinese-english corpus.zh-en.al -aren arabic-english corpus.ar-en.al -uren urdu-english corpus.ur-en.al -nlfr dutch-french corpus.nl-fr.al - diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl index c27cd37e..bbab2d64 100755 --- a/gi/pipeline/evaluation-pipeline.pl +++ b/gi/pipeline/evaluation-pipeline.pl @@ -8,20 +8,28 @@ my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR  my @DEFAULT_FEATS = qw(    LogRuleCount SingletonRule LexE2F LexF2E WordPenalty -  LanguageModel Glue GlueTop PassThrough); +  LogFCount LanguageModel Glue GlueTop PassThrough SingletonF +);  my %init_weights = qw(    LogRuleCount 0.2 -  SingletonRule -0.6    LexE2F -0.3    LexF2E -0.3 +  LogFCount 0.1    WordPenalty -1.5    LanguageModel 1.2    Glue -1.0    GlueTop 0.00001    PassThrough -10.0 +  SingletonRule -0.1    X_EGivenF -0.3    X_FGivenE -0.3 +  X_LogECount -1 +  X_LogFCount -0.1 +  X_LogRuleCount 0.3 +  X_SingletonE -0.1 +  X_SingletonF -0.1 +  X_SingletonRule -0.5  );  my $CDEC = "$SCRIPT_DIR/../../decoder/cdec"; @@ -32,9 +40,11 @@ my $VEST = "$SCRIPT_DIR/../../vest";  die "Can't find vest: $VEST" unless -e $VEST && -d $VEST;  my $DISTVEST = "$VEST/dist-vest.pl";  my $FILTSCORE = "$EXTOOLS/filter_score_grammar"; -assert_exec($CDEC, $PARALLELIZE, $FILTSCORE, $DISTVEST); +my $ADDXFEATS = "$SCRIPT_DIR/scripts/xfeats.pl"; +assert_exec($CDEC, $PARALLELIZE, $FILTSCORE, $DISTVEST, $ADDXFEATS); -my $config = "$SCRIPT_DIR/config.eval"; +my $config = "$SCRIPT_DIR/clsp.config"; +print STDERR "CORPORA CONFIGURATION: $config\n";  open CONF, "<$config" or die "Can't read $config: $!";  my %paths;  my %corpora; @@ -43,17 +53,19 @@ my %devs;  my %devrefs;  my %tests;  my %testevals; -print STDERR "LANGUAGE PAIRS:"; +my %xgrammars; +print STDERR "       LANGUAGE PAIRS:";  while(<CONF>) {    chomp;    next if /^#/;    next if /^\s*$/;    s/^\s+//;    s/\s+$//; -  my ($name, $path, $corpus, $lm, $dev, $devref, @xtests) = split /\s+/; +  my ($name, $path, $corpus, $lm, $xgrammar, $dev, $devref, @xtests) = split /\s+/;    $paths{$name} = $path;    $corpora{$name} = $corpus;    $lms{$name} = $lm; +  $xgrammars{$name} = $xgrammar;    $devs{$name} = $dev;    $devrefs{$name} = $devref;    $tests{$name} = $xtests[0]; @@ -67,16 +79,15 @@ my %langpairs = map { $_ => 1 } qw( btec zhen fbis aren uren nlfr );  my $outdir = "$CWD/exp";  my $help;  my $XFEATS; +my $EXTRA_FILTER = '';  my $dataDir = '/export/ws10smt/data';  if (GetOptions( -        "xfeats" => \$XFEATS,          "data=s" => \$dataDir, +        "xfeats" => \$XFEATS,  ) == 0 || @ARGV!=2 || $help) {          print_help();          exit;  } -if ($XFEATS) { die "TODO: implement adding of X-features\n"; } -  my $lp = $ARGV[0];  my $grammar = $ARGV[1];  print STDERR "   CORPUS REPO: $dataDir\n"; @@ -102,6 +113,13 @@ my $teval = mydircat($corpdir, $testevals{$lp});  die "Can't find test: $test\n" unless -f $test;  assert_exec($teval); +if ($XFEATS) { +  my $xgram = mydircat($corpdir, $xgrammars{$lp}); +  die "Can't find x-grammar: $xgram" unless -f $xgram; +  $EXTRA_FILTER = "$ADDXFEATS $xgram |"; +  print STDERR "ADDING X-FEATS FROM $xgram\n"; +} +  # MAKE DEV  print STDERR "\nFILTERING FOR dev...\n";  print STDERR "DEV: $dev (REFS=$drefs)\n"; @@ -128,31 +146,34 @@ write_random_weights_file($weights);  # VEST  print STDERR "\nMINIMUM ERROR TRAINING\n"; -my $cmd = "$DISTVEST --ref-files=$drefs --source-file=$dev --weights $weights $devini"; -print STDERR "MERT COMMAND: $cmd\n"; -`rm -rf $outdir/vest 2> /dev/null`; -chdir $outdir or die "Can't chdir to $outdir: $!"; -$weights = `$cmd`; -die "MERT reported non-zero exit code" unless $? == 0;  my $tuned_weights = mydircat($outdir, 'weights.tuned'); -`cp $weights $tuned_weights`; -print STDERR "TUNED WEIGHTS: $tuned_weights\n"; -die "$tuned_weights is missing!" unless -f $tuned_weights; - +if (-f $tuned_weights) { +  print STDERR "TUNED WEIGHTS $tuned_weights EXISTS: REUSING\n"; +} else { +  my $cmd = "$DISTVEST --ref-files=$drefs --source-file=$dev --weights $weights $devini"; +  print STDERR "MERT COMMAND: $cmd\n"; +  `rm -rf $outdir/vest 2> /dev/null`; +  chdir $outdir or die "Can't chdir to $outdir: $!"; +  $weights = `$cmd`; +  die "MERT reported non-zero exit code" unless $? == 0; +  `cp $weights $tuned_weights`; +  print STDERR "TUNED WEIGHTS: $tuned_weights\n"; +  die "$tuned_weights is missing!" unless -f $tuned_weights; +}  # DECODE  print STDERR "\nDECODE TEST SET\n";  my $decolog = mydircat($outdir, "test-decode.log");  my $testtrans = mydircat($outdir, "test.trans"); -$cmd = "cat $test | $PARALLELIZE -j 20 -e $decolog -- $CDEC -c $testini -w $tuned_weights > $testtrans"; -safesystem($cmd) or die "Failed to decode test set!"; +my $cmd = "cat $test | $PARALLELIZE -j 20 -e $decolog -- $CDEC -c $testini -w $tuned_weights > $testtrans"; +safesystem($testtrans, $cmd) or die "Failed to decode test set!";  # EVALUATE  print STDERR "\nEVALUATE TEST SET\n";  print STDERR "TEST: $testtrans\n";  $cmd = "$teval $testtrans"; -safesystem($cmd) or die "Failed to evaluate!"; +safesystem(undef, $cmd) or die "Failed to evaluate!";  exit 0; @@ -160,10 +181,16 @@ sub write_random_weights_file {    my ($file, @extras) = @_;    open F, ">$file" or die "Can't write $file: $!";    my @feats = (@DEFAULT_FEATS, @extras); -  if ($XFEATS) { push @feats, "X_FGivenE"; push @feats, "X_EGivenF"; } +  if ($XFEATS) { +    my @xfeats = qw( +      X_LogRuleCount X_LogECount X_LogFCount X_EGivenF X_FGivenE X_SingletonRule X_SingletonE X_SingletonF +    ); +    @feats = (@feats, @xfeats); +  }    for my $feat (@feats) {      my $r = rand(1.6);      my $w = $init_weights{$feat} * $r; +    if ($w == 0) { $w = 0.0001; print STDERR "WARNING: $feat had no initial weight!\n"; }      print F "$feat $w\n";    }    close F; @@ -173,8 +200,8 @@ sub filter {    my ($grammar, $set, $name, $outdir) = @_;    my $outgrammar = mydircat($outdir, "$name.scfg.gz");    if (-f $outgrammar) { print STDERR "$outgrammar exists - REUSING!\n"; } else { -    my $cmd = "gunzip -c $grammar | $FILTSCORE -c $CORPUS -t $dev | gzip > $outgrammar"; -    safesystem($cmd) or die "Can't filter and score grammar!"; +    my $cmd = "gunzip -c $grammar | $FILTSCORE -c $CORPUS -t $set | $EXTRA_FILTER gzip > $outgrammar"; +    safesystem($outgrammar, $cmd) or die "Can't filter and score grammar!";    }    return $outgrammar;  } @@ -215,20 +242,26 @@ EOT  }  sub safesystem { +  my $output = shift @_;    print STDERR "Executing: @_\n";    system(@_);    if ($? == -1) {        print STDERR "ERROR: Failed to execute: @_\n  $!\n"; +      if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; }        exit(1);    }    elsif ($? & 127) {        printf STDERR "ERROR: Execution of: @_\n  died with signal %d, %s coredump\n",            ($? & 127),  ($? & 128) ? 'with' : 'without'; +      if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; }        exit(1);    }    else {      my $exitcode = $? >> 8; -    print STDERR "Exit code: $exitcode\n" if $exitcode; +    if ($exitcode) { +      print STDERR "Exit code: $exitcode\n"; +      if (defined $output && -e $output) { printf STDERR "Removing $output\n"; `rm -rf $output`; } +    }      return ! $exitcode;    }  } diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl index 6e396165..9ae7fd95 100755 --- a/gi/pipeline/local-gi-pipeline.pl +++ b/gi/pipeline/local-gi-pipeline.pl @@ -14,7 +14,7 @@ my $ITEMS_IN_MEMORY = 10000000;  # cache size in extractors  my $NUM_TOPICS = 50;  my $NUM_SAMPLES = 1000;  my $CONTEXT_SIZE = 1; -my $BIDIR = 1; +my $BIDIR = 0;  my $TOPICS_CONFIG = "pyp-topics.conf";  my $EXTOOLS = "$SCRIPT_DIR/../../extools"; diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl index 86d2b7e0..cee21014 100755 --- a/vest/dist-vest.pl +++ b/vest/dist-vest.pl @@ -374,9 +374,9 @@ while (1){  	print STDERR "\n==========\n";  } -print STDERR "\nFINAL WEIGHTS: $dir/$lastWeightsFile\n(Use -w <this file> with the decoder)\n\n"; +print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n"; -print STDOUT "$dir/$lastWeightsFile\n"; +print STDOUT "$lastWeightsFile\n";  exit 0; | 
