summaryrefslogtreecommitdiff
path: root/gi/pipeline/evaluation-pipeline.pl
blob: a7cc20bcdd1a06ba9956bc4820ed50b212f8b6d8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/perl -w
use strict;
use Getopt::Long;

# Directory that contains this script.  It is computed at compile time and
# appended to @INC so that later use/require statements also search it.
my $SCRIPT_DIR;
BEGIN {
  use Cwd qw/ abs_path /;
  use File::Basename;
  my $script_path = abs_path($0);
  $SCRIPT_DIR = dirname($script_path);
  push @INC, $SCRIPT_DIR;
}

# Initial weights, keyed by feature name.  The values are kept as quoted
# strings so their textual form (e.g. 0.00001) stays exactly as written.
my %init_weights = (
  EGivenF       => '-0.3',
  FGivenE       => '-0.3',
  LexE2F        => '-0.3',
  LexF2E        => '-0.3',
  WordPenalty   => '-1.5',
  LanguageModel => '1.2',
  Glue          => '-1.0',
  GlueTop       => '0.00001',
  PassThrough   => '-10.0',
  X_EGivenF     => '-0.3',
  X_FGivenE     => '-0.3',
);

# Load the language-pair descriptions from config.eval, which is looked up in
# the script's own directory.  Every line that is neither blank nor a comment
# holds whitespace-separated fields:
#
#   name  path  lm  dev  devref  [test  [testref]]
#
# Each field is stored in its own lookup table, keyed by the pair's name.  The
# names are echoed to STDERR as they are read.
my $config = "$SCRIPT_DIR/config.eval";
# Three-argument open with a lexical handle: the file name can never be
# mistaken for a mode, and the handle is private to this file.
open my $conf_fh, '<', $config or die "Can't read $config: $!";
my %paths;
my %lms;
my %devs;
my %devrefs;
my %tests;
my %testrefs;
print STDERR "LANGUAGE PAIRS:";
while (my $line = <$conf_fh>) {
  chomp $line;
  # Trim before testing for comments so that an indented "# ..." line is
  # skipped instead of being parsed as a language pair called "#".
  $line =~ s/^\s+//;
  $line =~ s/\s+$//;
  next if $line eq '';
  next if $line =~ /^#/;
  my ($name, $path, $lm, $dev, $devref, @xtests) = split /\s+/, $line;
  $paths{$name} = $path;
  $lms{$name} = $lm;
  $devs{$name} = $dev;
  $devrefs{$name} = $devref;
  # The test set and its reference are optional; they stay undef when the
  # line does not supply them.
  $tests{$name} = $xtests[0];
  $testrefs{$name} = $xtests[1];
  print STDERR " $name";
}
close $conf_fh;
print STDERR "\n";

# Set of language-pair identifiers, stored as hash keys with the value 1 so
# that membership can be tested with a plain lookup.
my %langpairs;
$langpairs{$_} = 1 for qw( btec zhen fbis aren uren nlfr );

# Command-line handling.
#
#   --data DIR    root of the corpus repository (default /export/ws10smt/data)
#   --help, -h    print the usage message and exit successfully
#
# Exactly two positional arguments must remain after option parsing; they are
# read from @ARGV further down.
my $help;
my $dataDir = '/export/ws10smt/data';
my $options_ok = GetOptions(
        "data=s" => \$dataDir,
        # Registered so that --help is a recognised option and sets $help.
        "help|h" => \$help,
);
if ($help) {
        print_help();
        exit 0;
}
if (!$options_ok || @ARGV != 2) {
        print_help();
        # A bad invocation must be visible to the calling shell.
        exit 1;
}
# Positional arguments: the language pair name and the grammar file.
my ($lp, $grammar) = @ARGV[0, 1];
print STDERR "   CORPUS REPO: $dataDir\n",
             " LANGUAGE PAIR: $lp\n";
unless ($paths{$lp}) {
  die "I don't know about that language pair\n";
}

# A configured path starting with "/" is used as is; any other path is
# appended to the corpus repository root.
my $corpdir = ($paths{$lp} =~ m{^/}) ? $paths{$lp} : "$dataDir/$paths{$lp}";
-d $corpdir or die "I can't find the corpora directory: $corpdir";
print STDERR "       GRAMMAR: $grammar\n";

# The language model file name from the config is joined onto the corpus
# directory.
my $LANG_MODEL = "$corpdir/$lms{$lp}";
print STDERR "            LM: $LANG_MODEL\n";

# Write a cdec .ini configuration file.
#
#   $filename      path of the file to create; an existing file is overwritten
#   $grammar_path  value written for the "grammar=" setting
#
# The language model path comes from the file-level $LANG_MODEL.  Dies if the
# file cannot be opened for writing or if closing it reports an error.
sub write_cdec_ini {
  my ($filename, $grammar_path) = (@_);
  # Three-argument open keeps characters in $filename from being read as an
  # open mode; the lexical handle does not leak into the global namespace.
  open my $ini_fh, '>', $filename or die "Can't write $filename: $!";
  print {$ini_fh} <<EOT;
formalism=scfg
cubepruning_pop_limit=100
add_pass_through_rules=true
scfg_extra_glue_grammar=/export/ws10smt/cdyer/glue.scfg.gz
grammar=$grammar_path
feature_function=WordPenalty
feature_function=LanguageModel -o 3 $LANG_MODEL
EOT
  # Output is buffered, so a failed write may only be reported by close.
  close $ini_fh or die "Can't write $filename: $!";
}

# Print the usage summary for this script to STDERR.  Takes no arguments.
sub print_help {
  my $usage_text = <<"EOT";

Usage: $0 [OPTIONS] language-pair grammar.bidir.gz

Given an induced grammar for an entire corpus (i.e., generated by
local-gi-pipeline.pl), filter and featurize it for a dev and test set,
run MERT, report scores.

EOT
  print STDERR $usage_text;
}