summaryrefslogtreecommitdiff
path: root/training/utils/decode-and-evaluate.pl
diff options
context:
space:
mode:
authorMichael Denkowski <michael.j.denkowski@gmail.com>2012-12-22 16:01:23 -0500
committerMichael Denkowski <michael.j.denkowski@gmail.com>2012-12-22 16:01:23 -0500
commit597d89c11db53e91bc011eab70fd613bbe6453e8 (patch)
tree83c87c07d1ff6d3ee4e3b1626f7eddd49c61095b /training/utils/decode-and-evaluate.pl
parent65e958ff2678a41c22be7171456a63f002ef370b (diff)
parent201af2acd394415a05072fbd53d42584875aa4b4 (diff)
Merge branch 'master' of git://github.com/redpony/cdec
Diffstat (limited to 'training/utils/decode-and-evaluate.pl')
-rwxr-xr-xtraining/utils/decode-and-evaluate.pl246
1 files changed, 246 insertions, 0 deletions
diff --git a/training/utils/decode-and-evaluate.pl b/training/utils/decode-and-evaluate.pl
new file mode 100755
index 00000000..1a332c08
--- /dev/null
+++ b/training/utils/decode-and-evaluate.pl
@@ -0,0 +1,246 @@
+#!/usr/bin/env perl
+use strict;
+my @ORIG_ARGV=@ARGV;
+use Cwd qw(getcwd);
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; }
+
+# Skip local config (used for distributing jobs) if we're running in local-only mode
+use LocalConfig;
+use Getopt::Long;
+use File::Basename qw(basename);
+my $QSUB_CMD = qsub_args(mert_memory());
+
+require "libcall.pl";
+
+# Default settings
+my $default_jobs = env_default_jobs();
+my $bin_dir = $SCRIPT_DIR;
+die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
+my $FAST_SCORE="$bin_dir/../../mteval/fast_score";
+die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
+my $parallelize = "$bin_dir/parallelize.pl";
+my $libcall = "$bin_dir/libcall.pl";
+my $sentserver = "$bin_dir/sentserver";
+my $sentclient = "$bin_dir/sentclient";
+my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm";
+
+my $SCORER = $FAST_SCORE;
+my $cdec = "$bin_dir/../../decoder/cdec";
+die "Can't find decoder in $cdec" unless -x $cdec;
+die "Can't find $parallelize" unless -x $parallelize;
+die "Can't find $libcall" unless -e $libcall;
+my $decoder = $cdec;
+my $jobs = $default_jobs; # number of decode nodes
+my $pmem = "9g";
+my $help = 0;
+my $config;
+my $test_set;
+my $weights;
+my $use_make = 1;
+my $useqsub;
+my $cpbin=1;
+# Process command-line options
+if (GetOptions(
+ "jobs=i" => \$jobs,
+ "help" => \$help,
+ "qsub" => \$useqsub,
+ "input=s" => \$test_set,
+ "config=s" => \$config,
+ "weights=s" => \$weights,
+) == 0 || @ARGV!=0 || $help) {
+ print_help();
+ exit;
+}
+
+if ($useqsub) {
+ $use_make = 0;
+ die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
+}
+
+my @missing_args = ();
+
+if (!defined $test_set) { push @missing_args, "--input"; }
+if (!defined $config) { push @missing_args, "--config"; }
+if (!defined $weights) { push @missing_args, "--weights"; }
+die "Please specify missing arguments: " . join (', ', @missing_args) . "\nUse --help for more information.\n" if (@missing_args);
+
+my @tf = localtime(time);
+my $tname = basename($test_set);
+$tname =~ s/\.(sgm|sgml|xml)$//i;
+my $dir = "eval.$tname." . sprintf('%d%02d%02d-%02d%02d%02d', 1900+$tf[5], $tf[4], $tf[3], $tf[2], $tf[1], $tf[0]);
+
+my $time = unchecked_output("date");
+
+check_call("mkdir -p $dir");
+
+split_devset($test_set, "$dir/test.input.raw", "$dir/test.refs");
+my $refs = "-r $dir/test.refs";
+my $newsrc = "$dir/test.input";
+enseg("$dir/test.input.raw", $newsrc);
+my $src_file = $newsrc;
+open F, "<$src_file" or die "Can't read $src_file: $!"; close F;
+
+my $test_trans="$dir/test.trans";
+my $logdir="$dir/logs";
+my $decoderLog="$logdir/decoder.sentserver.log";
+check_call("mkdir -p $logdir");
+
+#decode
+print STDERR "RUNNING DECODER AT ";
+print STDERR unchecked_output("date");
+my $decoder_cmd = "$decoder -c $config --weights $weights";
+my $pcmd;
+if ($use_make) {
+ $pcmd = "cat $src_file | $parallelize --workdir $dir --use-fork -p $pmem -e $logdir -j $jobs --";
+} else {
+ $pcmd = "cat $src_file | $parallelize --workdir $dir -p $pmem -e $logdir -j $jobs --";
+}
+my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $test_trans";
+check_bash_call($cmd);
+print STDERR "DECODER COMPLETED AT ";
+print STDERR unchecked_output("date");
+print STDERR "\nOUTPUT: $test_trans\n\n";
+my $bleu = check_output("cat $test_trans | $SCORER $refs -m ibm_bleu");
+chomp $bleu;
+print STDERR "BLEU: $bleu\n";
+my $ter = check_output("cat $test_trans | $SCORER $refs -m ter");
+chomp $ter;
+print STDERR " TER: $ter\n";
+open TR, ">$dir/test.scores" or die "Can't write $dir/test.scores: $!";
+print TR <<EOT;
+### SCORE REPORT #############################################################
+ OUTPUT=$test_trans
+ SCRIPT INPUT=$test_set
+ DECODER INPUT=$src_file
+ REFERENCES=$dir/test.refs
+------------------------------------------------------------------------------
+ BLEU=$bleu
+ TER=$ter
+##############################################################################
+EOT
+close TR;
+my $sr = unchecked_output("cat $dir/test.scores");
+print STDERR "\n\n$sr\n(A copy of this report can be found in $dir/test.scores)\n\n";
+exit 0;
+
+sub enseg {
+ my $src = shift;
+ my $newsrc = shift;
+ open(SRC, $src);
+ open(NEWSRC, ">$newsrc");
+ my $i=0;
+ while (my $line=<SRC>){
+ chomp $line;
+ if ($line =~ /^\s*<seg/i) {
+ if($line =~ /id="[0-9]+"/) {
+ print NEWSRC "$line\n";
+ } else {
+ die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+ }
+ } else {
+ print NEWSRC "<seg id=\"$i\">$line</seg>\n";
+ }
+ $i++;
+ }
+ close SRC;
+ close NEWSRC;
+}
+
+sub print_help {
+ my $executable = basename($0); chomp $executable;
+ print << "Help";
+
+Usage: $executable [options] <ini file>
+
+ $executable --config cdec.ini --weights weights.txt [--jobs N] [--qsub] <testset.in-ref>
+
+Options:
+
+ --help
+ Print this message and exit.
+
+ --config <file>
+ A path to the cdec.ini file.
+
+ --weights <file>
+ A file specifying feature weights.
+
+ --dir <dir>
+ Directory for intermediate and output files.
+
+Job control options:
+
+ --jobs <I>
+ Number of decoder processes to run in parallel. [default=$default_jobs]
+
+ --qsub
+ Use qsub to run jobs in parallel (qsub must be configured in
+ environment/LocalEnvironment.pm)
+
+ --pmem <N>
+ Amount of physical memory requested for parallel decoding jobs
+ (used with qsub requests only)
+
+Help
+}
+
+sub convert {
+ my ($str) = @_;
+ my @ps = split /;/, $str;
+ my %dict = ();
+ for my $p (@ps) {
+ my ($k, $v) = split /=/, $p;
+ $dict{$k} = $v;
+ }
+ return %dict;
+}
+
+
+
+sub cmdline {
+ return join ' ',($0,@ORIG_ARGV);
+}
+
+#buggy: last arg gets quoted sometimes?
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]};
+my $shell_escape_in_quote=qr{[\\"\$`!]};
+
+sub escape_shell {
+ my ($arg)=@_;
+ return undef unless defined $arg;
+ if ($arg =~ /$is_shell_special/) {
+ $arg =~ s/($shell_escape_in_quote)/\\$1/g;
+ return "\"$arg\"";
+ }
+ return $arg;
+}
+
+sub escaped_shell_args {
+ return map {local $_=$_;chomp;escape_shell($_)} @_;
+}
+
+sub escaped_shell_args_str {
+ return join ' ',&escaped_shell_args(@_);
+}
+
+sub escaped_cmdline {
+ return "$0 ".&escaped_shell_args_str(@ORIG_ARGV);
+}
+
+sub split_devset {
+ my ($infile, $outsrc, $outref) = @_;
+ open F, "<$infile" or die "Can't read $infile: $!";
+ open S, ">$outsrc" or die "Can't write $outsrc: $!";
+ open R, ">$outref" or die "Can't write $outref: $!";
+ while(<F>) {
+ chomp;
+ my ($src, @refs) = split /\s*\|\|\|\s*/;
+ die "Malformed devset line: $_\n" unless scalar @refs > 0;
+ print S "$src\n";
+ print R join(' ||| ', @refs) . "\n";
+ }
+ close R;
+ close S;
+ close F;
+}
+