1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
|
#!/usr/bin/perl -w
use strict;
use Getopt::Long;
my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
# Initial weight for each feature name (presumably the starting point for
# tuning -- the consumer of this table is outside this section; confirm).
# Values are deliberately kept as strings, spelled exactly as below, rather
# than numeric literals: a numeric 0.00001 would stringify as 1e-05 and
# -1.0 as -1.
my %init_weights = (
    'EGivenF'       => '-0.3',
    'FGivenE'       => '-0.3',
    'LexE2F'        => '-0.3',
    'LexF2E'        => '-0.3',
    'WordPenalty'   => '-1.5',
    'LanguageModel' => '1.2',
    'Glue'          => '-1.0',
    'GlueTop'       => '0.00001',
    'PassThrough'   => '-10.0',
    'X_EGivenF'     => '-0.3',
    'X_FGivenE'     => '-0.3',
);
# Load the per-language-pair settings from config.eval, which is looked up in
# the same directory as this script.  Every line that is neither blank nor a
# comment is split on whitespace into:
#
#   name  path  lm  dev  devref  [test  [testref  ...]]
#
# Each field is stored in the matching hash, keyed by the language-pair name.
# Only the first two of the trailing fields are kept (test set, test refs);
# any further fields are ignored.  The names are echoed to STDERR as they are
# read.
my $config = "$SCRIPT_DIR/config.eval";
my %paths;
my %lms;
my %devs;
my %devrefs;
my %tests;
my %testrefs;
{
    # Three-argument open with a lexical handle: the open mode can never be
    # taken from the file name, and the handle does not leak into package
    # scope.
    open my $conf_fh, '<', $config or die "Can't read $config: $!";
    print STDERR "LANGUAGE PAIRS:";
    while (my $line = <$conf_fh>) {
        chomp $line;

        # Trim before testing for comments so that an indented "# ..." line
        # is skipped instead of being parsed as a language pair named "#".
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        next if $line eq '' or $line =~ /^#/;

        my ($name, $path, $lm, $dev, $devref, @xtests) = split /\s+/, $line;
        $paths{$name}    = $path;
        $lms{$name}      = $lm;
        $devs{$name}     = $dev;
        $devrefs{$name}  = $devref;
        $tests{$name}    = $xtests[0];
        $testrefs{$name} = $xtests[1];
        print STDERR " $name";
    }
    print STDERR "\n";
    close $conf_fh;
}
# Set of language-pair names.  NOTE(review): nothing in the visible part of
# the script consults this table -- the pair given on the command line is
# validated against %paths (loaded from config.eval) instead.  Kept in case
# code outside this section uses it.
my %langpairs = map { $_ => 1 } qw( btec zhen fbis aren uren nlfr );

# Command-line handling.
#   --data DIR   root of the corpus repository (default below)
#   --help, -h   show the usage message
# Exactly two positional arguments are required: language pair and grammar.
my $help;
my $dataDir = '/export/ws10smt/data';
my $options_ok = GetOptions(
    "data=s" => \$dataDir,
    # Previously $help was tested but never bound to an option, so --help was
    # only handled via Getopt::Long's "Unknown option" error path.
    "help|h" => \$help,
);
if (!$options_ok || @ARGV != 2 || $help) {
    print_help();
    # NOTE(review): exits with status 0 even when the arguments were invalid;
    # left unchanged because callers may depend on it.
    exit;
}
my ($lp, $grammar) = @ARGV;
print STDERR "   CORPUS REPO: $dataDir\n";
print STDERR " LANGUAGE PAIR: $lp\n";
die "I don't know about that language pair\n" unless $paths{$lp};

# A configured path starting with "/" is used as-is; anything else is taken
# relative to the corpus repository root.
my $corpdir = ($paths{$lp} =~ m{^/})
            ? $paths{$lp}
            : "$dataDir/$paths{$lp}";
die "I can't find the corpora directory: $corpdir" unless -d $corpdir;
print STDERR "       GRAMMAR: $grammar\n";

# The language model file name from the config is relative to the corpus
# directory.
my $LANG_MODEL = $corpdir . '/' . $lms{$lp};
print STDERR "            LM: $LANG_MODEL\n";
# Write a cdec configuration file.
#
#   $filename      path of the ini file to create (overwritten if present)
#   $grammar_path  value written on the "grammar=" line
#
# The language-model path on the LanguageModel line comes from the file-level
# $LANG_MODEL.  Dies if the file cannot be opened, or if closing it fails
# (buffered write errors such as a full disk only surface at close).
sub write_cdec_ini {
    my ($filename, $grammar_path) = @_;

    # Three-argument open with a lexical handle: the file name can never be
    # interpreted as an open mode, and no global bareword handle is shared
    # between calls.
    open my $ini_fh, '>', $filename or die "Can't write $filename: $!";
    print {$ini_fh} <<EOT;
formalism=scfg
cubepruning_pop_limit=100
add_pass_through_rules=true
scfg_extra_glue_grammar=/export/ws10smt/cdyer/glue.scfg.gz
grammar=$grammar_path
feature_function=WordPenalty
feature_function=LanguageModel -o 3 $LANG_MODEL
EOT
    close $ini_fh or die "Can't write $filename: $!";
}
# Print the usage message to STDERR.  Takes no arguments; the program name
# shown in the message is whatever $0 holds when the sub is called.
sub print_help {
    my $usage_text = <<"EOT";
Usage: $0 [OPTIONS] language-pair grammar.bidir.gz
Given an induced grammar for an entire corpus (i.e., generated by
local-gi-pipeline.pl), filter and featurize it for a dev and test set,
run MERT, report scores.
EOT
    print STDERR $usage_text;
}
|