diff options
Diffstat (limited to 'gi/pipeline')
-rw-r--r-- | gi/pipeline/config.eval | 8 | ||||
-rwxr-xr-x | gi/pipeline/evaluation-pipeline.pl | 94 |
2 files changed, 102 insertions, 0 deletions
#!/usr/bin/perl -w
#
# evaluation-pipeline.pl
#
# Usage: evaluation-pipeline.pl [OPTIONS] language-pair grammar.bidir.gz
#
# Reads config.eval (which must live in the same directory as this script),
# resolves the corpus directory and language model for the requested
# language pair, and reports them on STDERR.  write_cdec_ini() can then be
# used to emit a decoder configuration for a grammar.
#
# config.eval format: one language pair per line, whitespace separated.
# Blank lines and lines whose first non-blank character is '#' are ignored.
#
#   name path aligned-corpus lm dev dev-refs test1 test1-refs ...
#
# Trailing columns may be omitted.  Entries shipped with this script:
#
#   btec btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh devtest/devset3.lc.en*
#   fbis chinese-english.fbis corpus.zh-en.al
#   zhen chinese-english corpus.zh-en.al
#   aren arabic-english corpus.ar-en.al
#   uren urdu-english corpus.ur-en.al
#   nlfr dutch-french corpus.nl-fr.al
#
use strict;
use warnings;
use Getopt::Long;

# Directory containing this script; also appended to @INC so that sibling
# modules can be loaded.
my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }

# Starting feature weights (feature name => weight).  Declared here but not
# yet referenced anywhere else in this script.
my %init_weights = qw(
  EGivenF -0.3
  FGivenE -0.3
  LexE2F -0.3
  LexF2E -0.3
  WordPenalty -1.5
  LanguageModel 1.2
  Glue -1.0
  GlueTop 0.00001
  PassThrough -10.0
  X_EGivenF -0.3
  X_FGivenE -0.3
);

# Per-language-pair settings, keyed by the pair's short name.
my %paths;     # corpus directory (absolute, or relative to --data)
my %corpora;   # aligned corpus file name
my %lms;       # language model, relative to the corpus directory
my %devs;      # dev set source side
my %devrefs;   # dev set references
my %tests;     # first test set source side
my %testrefs;  # first test set references

my $config = "$SCRIPT_DIR/config.eval";
open my $conf_fh, '<', $config or die "Can't read $config: $!";
print STDERR "LANGUAGE PAIRS:";
while (my $line = <$conf_fh>) {
  chomp $line;
  # Trim before testing for comments/blank lines so that indented comment
  # lines are skipped as well.
  $line =~ s/^\s+//;
  $line =~ s/\s+$//;
  next if $line eq '';
  next if $line =~ /^#/;
  # The third column is the aligned corpus; the language model comes fourth.
  my ($name, $path, $corpus, $lm, $dev, $devref, @xtests) = split /\s+/, $line;
  $paths{$name} = $path;
  $corpora{$name} = $corpus;
  $lms{$name} = $lm;
  $devs{$name} = $dev;
  $devrefs{$name} = $devref;
  # Only the first test set / reference pair is recorded for now.
  $tests{$name} = $xtests[0];
  $testrefs{$name} = $xtests[1];
  print STDERR " $name";
}
close $conf_fh;
print STDERR "\n";

my $help;
my $dataDir = '/export/ws10smt/data';
if (GetOptions(
  "data=s" => \$dataDir,
  "help" => \$help,
) == 0 || @ARGV!=2 || $help) {
  print_help();
  exit;
}
my ($lp, $grammar) = @ARGV;
print STDERR " CORPUS REPO: $dataDir\n";
print STDERR " LANGUAGE PAIR: $lp\n";
die "I don't know about that language pair\n" unless $paths{$lp};

# An absolute path in the config is used as-is; anything else is taken
# relative to the corpus repository.
my $corpdir = ($paths{$lp} =~ m{^/}) ? $paths{$lp} : $dataDir . '/' . $paths{$lp};
die "I can't find the corpora directory: $corpdir" unless -d $corpdir;
print STDERR " GRAMMAR: $grammar\n";

# Several config entries carry no language model column; fail clearly
# instead of building a path from an undefined value.
die "No language model configured for '$lp' in $config\n" unless defined $lms{$lp};
my $LANG_MODEL = $corpdir . '/' . $lms{$lp};
print STDERR " LM: $LANG_MODEL\n";

# write_cdec_ini($filename, $grammar_path)
#
# Writes a decoder configuration to $filename that points at the grammar
# $grammar_path and at the file-level $LANG_MODEL.  Dies if the file cannot
# be opened or fully written.
sub write_cdec_ini {
  my ($filename, $grammar_path) = @_;
  open my $ini_fh, '>', $filename or die "Can't write $filename: $!";
  print {$ini_fh} <<EOT;
formalism=scfg
cubepruning_pop_limit=100
add_pass_through_rules=true
scfg_extra_glue_grammar=/export/ws10smt/cdyer/glue.scfg.gz
grammar=$grammar_path
feature_function=WordPenalty
feature_function=LanguageModel -o 3 $LANG_MODEL
EOT
  # Buffered write errors only surface at close time.
  close $ini_fh or die "Can't close $filename: $!";
}

# print_help()
#
# Prints the usage message to STDERR.  Does not exit; the caller decides.
sub print_help {
  print STDERR<<EOT;

Usage: $0 [OPTIONS] language-pair grammar.bidir.gz

Given an induced grammar for an entire corpus (i.e., generated by
local-gi-pipeline.pl), filter and featurize it for a dev and test set,
run MERT, report scores.

Options:
  --data DIR   root directory of the corpus repository
  --help       print this message

EOT
}