From b308a215063f45c3f89c54ed3fefbbe132aef1d0 Mon Sep 17 00:00:00 2001
From: redpony
Date: Mon, 5 Jul 2010 16:31:39 +0000
Subject: stub for evaluation pipeline

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@129 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 gi/pipeline/config.eval            |  8 ++++
 gi/pipeline/evaluation-pipeline.pl | 94 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+)
 create mode 100644 gi/pipeline/config.eval
 create mode 100755 gi/pipeline/evaluation-pipeline.pl

(limited to 'gi')

diff --git a/gi/pipeline/config.eval b/gi/pipeline/config.eval
new file mode 100644
index 00000000..4419de9f
--- /dev/null
+++ b/gi/pipeline/config.eval
@@ -0,0 +1,8 @@
+# name path aligned-corpus lm dev dev-refs test1 test1-refs ...
+btec btec/ split.zh-en.al lm/en.3gram.lm.gz devtest/devset1_2.zh devtest/devset1_2.lc.en* devtest/devset3.zh devtest/devset3.lc.en*
+fbis chinese-english.fbis corpus.zh-en.al
+zhen chinese-english corpus.zh-en.al
+aren arabic-english corpus.ar-en.al
+uren urdu-english corpus.ur-en.al
+nlfr dutch-french corpus.nl-fr.al
+
diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl
new file mode 100755
index 00000000..a7cc20bc
--- /dev/null
+++ b/gi/pipeline/evaluation-pipeline.pl
@@ -0,0 +1,94 @@
+#!/usr/bin/perl -w
+# Stub driver for the evaluation pipeline.
+#
+# Usage: evaluation-pipeline.pl [--data DIR] [--help] LANGPAIR GRAMMAR
+#
+# Reads config.eval from the directory this script lives in, looks up the
+# requested language pair, resolves its corpus directory and language model
+# path, and reports them on STDERR.
+use strict;
+use Getopt::Long;
+
+# Directory containing this script; it is also appended to @INC.
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
+
+# Feature name => initial weight (not referenced in the visible part of the script).
+my %init_weights = qw(
+  EGivenF -0.3
+  FGivenE -0.3
+  LexE2F -0.3
+  LexF2E -0.3
+  WordPenalty -1.5
+  LanguageModel 1.2
+  Glue -1.0
+  GlueTop 0.00001
+  PassThrough -10.0
+  X_EGivenF -0.3
+  X_FGivenE -0.3
+);
+
+# Per-language-pair settings, each keyed by the name in the first config column.
+my $config = "$SCRIPT_DIR/config.eval";
+open my $conf_fh, '<', $config or die "Can't read $config: $!";
+my %paths;
+my %corpora;
+my %lms;
+my %devs;
+my %devrefs;
+my %tests;
+my %testrefs;
+print STDERR "LANGUAGE PAIRS:";
+while (<$conf_fh>) {
+  chomp;
+  next if /^#/;      # comment line
+  next if /^\s*$/;   # blank line
+  s/^\s+//;
+  s/\s+$//;
+  # Columns: name path aligned-corpus lm dev dev-refs test1 test1-refs ...
+  # The aligned-corpus column has to be consumed here; without it every later
+  # field shifts left by one and the corpus file is mistaken for the LM.
+  # Rows may stop early, in which case the trailing fields are undef.
+  my ($name, $path, $corpus, $lm, $dev, $devref, @xtests) = split /\s+/;
+  $paths{$name} = $path;
+  $corpora{$name} = $corpus;
+  $lms{$name} = $lm;
+  $devs{$name} = $dev;
+  $devrefs{$name} = $devref;
+  # Only the first test set / reference pair is kept.
+  $tests{$name} = $xtests[0];
+  $testrefs{$name} = $xtests[1];
+  print STDERR " $name";
+}
+close $conf_fh;
+print STDERR "\n";
+
+my %langpairs = map { $_ => 1 } qw( btec zhen fbis aren uren nlfr );
+
+my $help;
+my $dataDir = '/export/ws10smt/data';
+# NOTE(review): print_help() is not defined in the visible part of this
+# script -- presumably it appears further down; confirm.
+if (GetOptions(
+  "data=s" => \$dataDir,
+  "help" => \$help,
+) == 0 || @ARGV!=2 || $help) {
+  print_help();
+  exit;
+}
+my $lp = $ARGV[0];
+my $grammar = $ARGV[1];
+print STDERR " CORPUS REPO: $dataDir\n";
+print STDERR " LANGUAGE PAIR: $lp\n";
+die "I don't know about that language pair\n" unless defined $paths{$lp};
+# An absolute path in the config is used as is; a relative one is taken
+# relative to the data directory.
+my $corpdir = "$dataDir";
+if ($paths{$lp} =~ /^\//) { $corpdir = $paths{$lp}; } else { $corpdir .= '/' . $paths{$lp}; }
+die "I can't find the corpora directory: $corpdir" unless -d $corpdir;
+print STDERR " GRAMMAR: $grammar\n";
+# Rows without an lm column leave $lms{$lp} undef; stop with a clear message
+# rather than building a path that ends in the bare corpus directory.
+die "No language model is configured for $lp in $config\n" unless defined $lms{$lp};
+my $LANG_MODEL = $corpdir . '/' . $lms{$lp};
+print STDERR " LM: $LANG_MODEL\n";
+
+sub write_cdec_ini {
+  my ($filename, $grammar_path) = (@_);
+  open CDECINI, ">$filename" or die "Can't write $filename: $!";
+  print CDECINI <