diff options
author | Michael Denkowski <michael.j.denkowski@gmail.com> | 2012-12-22 16:01:23 -0500 |
---|---|---|
committer | Michael Denkowski <michael.j.denkowski@gmail.com> | 2012-12-22 16:01:23 -0500 |
commit | 597d89c11db53e91bc011eab70fd613bbe6453e8 (patch) | |
tree | 83c87c07d1ff6d3ee4e3b1626f7eddd49c61095b /training/utils/decode-and-evaluate.pl | |
parent | 65e958ff2678a41c22be7171456a63f002ef370b (diff) | |
parent | 201af2acd394415a05072fbd53d42584875aa4b4 (diff) |
Merge branch 'master' of git://github.com/redpony/cdec
Diffstat (limited to 'training/utils/decode-and-evaluate.pl')
-rwxr-xr-x | training/utils/decode-and-evaluate.pl | 246 |
1 files changed, 246 insertions, 0 deletions
diff --git a/training/utils/decode-and-evaluate.pl b/training/utils/decode-and-evaluate.pl new file mode 100755 index 00000000..1a332c08 --- /dev/null +++ b/training/utils/decode-and-evaluate.pl @@ -0,0 +1,246 @@ +#!/usr/bin/env perl +use strict; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use File::Basename qw(basename); +my $QSUB_CMD = qsub_args(mert_memory()); + +require "libcall.pl"; + +# Default settings +my $default_jobs = env_default_jobs(); +my $bin_dir = $SCRIPT_DIR; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $parallelize = "$bin_dir/parallelize.pl"; +my $libcall = "$bin_dir/libcall.pl"; +my $sentserver = "$bin_dir/sentserver"; +my $sentclient = "$bin_dir/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm"; + +my $SCORER = $FAST_SCORE; +my $cdec = "$bin_dir/../../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $jobs = $default_jobs; # number of decode nodes +my $pmem = "9g"; +my $help = 0; +my $config; +my $test_set; +my $weights; +my $use_make = 1; +my $useqsub; +my $cpbin=1; +# Process command-line options +if (GetOptions( + "jobs=i" => \$jobs, + "help" => \$help, + "qsub" => \$useqsub, + "input=s" => \$test_set, + "config=s" => \$config, + "weights=s" => \$weights, +) == 0 || @ARGV!=0 || $help) { + print_help(); + exit; +} + +if ($useqsub) { + $use_make = 0; + die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); +} + +my @missing_args = (); + +if (!defined $test_set) { push @missing_args, "--input"; } +if (!defined $config) { push @missing_args, "--config"; } +if (!defined $weights) { push @missing_args, "--weights"; } +die "Please specify missing arguments: " . join (', ', @missing_args) . "\nUse --help for more information.\n" if (@missing_args); + +my @tf = localtime(time); +my $tname = basename($test_set); +$tname =~ s/\.(sgm|sgml|xml)$//i; +my $dir = "eval.$tname." . sprintf('%d%02d%02d-%02d%02d%02d', 1900+$tf[5], $tf[4], $tf[3], $tf[2], $tf[1], $tf[0]); + +my $time = unchecked_output("date"); + +check_call("mkdir -p $dir"); + +split_devset($test_set, "$dir/test.input.raw", "$dir/test.refs"); +my $refs = "-r $dir/test.refs"; +my $newsrc = "$dir/test.input"; +enseg("$dir/test.input.raw", $newsrc); +my $src_file = $newsrc; +open F, "<$src_file" or die "Can't read $src_file: $!"; close F; + +my $test_trans="$dir/test.trans"; +my $logdir="$dir/logs"; +my $decoderLog="$logdir/decoder.sentserver.log"; +check_call("mkdir -p $logdir"); + +#decode +print STDERR "RUNNING DECODER AT "; +print STDERR unchecked_output("date"); +my $decoder_cmd = "$decoder -c $config --weights $weights"; +my $pcmd; +if ($use_make) { + $pcmd = "cat $src_file | $parallelize --workdir $dir --use-fork -p $pmem -e $logdir -j $jobs --"; +} else { + $pcmd = "cat $src_file | $parallelize --workdir $dir -p $pmem -e $logdir -j $jobs --"; +} +my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $test_trans"; +check_bash_call($cmd); +print STDERR "DECODER COMPLETED AT "; +print STDERR unchecked_output("date"); +print STDERR "\nOUTPUT: $test_trans\n\n"; +my $bleu = check_output("cat $test_trans | $SCORER $refs -m ibm_bleu"); +chomp $bleu; +print STDERR "BLEU: $bleu\n"; +my $ter = check_output("cat $test_trans | $SCORER $refs -m ter"); +chomp $ter; +print STDERR " TER: $ter\n"; +open TR, ">$dir/test.scores" or die "Can't write $dir/test.scores: $!"; +print TR <<EOT; +### SCORE REPORT ############################################################# + OUTPUT=$test_trans + SCRIPT INPUT=$test_set + DECODER INPUT=$src_file + REFERENCES=$dir/test.refs +------------------------------------------------------------------------------ + BLEU=$bleu + TER=$ter +############################################################################## +EOT +close TR; +my $sr = unchecked_output("cat $dir/test.scores"); +print STDERR "\n\n$sr\n(A copy of this report can be found in $dir/test.scores)\n\n"; +exit 0; + +sub enseg { + my $src = shift; + my $newsrc = shift; + open(SRC, $src); + open(NEWSRC, ">$newsrc"); + my $i=0; + while (my $line=<SRC>){ + chomp $line; + if ($line =~ /^\s*<seg/i) { + if($line =~ /id="[0-9]+"/) { + print NEWSRC "$line\n"; + } else { + die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute"; + } + } else { + print NEWSRC "<seg id=\"$i\">$line</seg>\n"; + } + $i++; + } + close SRC; + close NEWSRC; +} + +sub print_help { + my $executable = basename($0); chomp $executable; + print << "Help"; + +Usage: $executable [options] <ini file> + + $executable --config cdec.ini --weights weights.txt [--jobs N] [--qsub] <testset.in-ref> + +Options: + + --help + Print this message and exit. + + --config <file> + A path to the cdec.ini file. + + --weights <file> + A file specifying feature weights. + + --dir <dir> + Directory for intermediate and output files. + +Job control options: + + --jobs <I> + Number of decoder processes to run in parallel. [default=$default_jobs] + + --qsub + Use qsub to run jobs in parallel (qsub must be configured in + environment/LocalEnvironment.pm) + + --pmem <N> + Amount of physical memory requested for parallel decoding jobs + (used with qsub requests only) + +Help +} + +sub convert { + my ($str) = @_; + my @ps = split /;/, $str; + my %dict = (); + for my $p (@ps) { + my ($k, $v) = split /=/, $p; + $dict{$k} = $v; + } + return %dict; +} + + + +sub cmdline { + return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? +my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} + +sub escaped_shell_args { + return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { + return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { + return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} + +sub split_devset { + my ($infile, $outsrc, $outref) = @_; + open F, "<$infile" or die "Can't read $infile: $!"; + open S, ">$outsrc" or die "Can't write $outsrc: $!"; + open R, ">$outref" or die "Can't write $outref: $!"; + while(<F>) { + chomp; + my ($src, @refs) = split /\s*\|\|\|\s*/; + die "Malformed devset line: $_\n" unless scalar @refs > 0; + print S "$src\n"; + print R join(' ||| ', @refs) . "\n"; + } + close R; + close S; + close F; +} + |