diff options
author | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2012-11-18 13:35:42 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2012-11-18 13:35:42 -0500 |
commit | 1b8181bf0d6e9137e6b9ccdbe414aec37377a1a9 (patch) | |
tree | 33e5f3aa5abff1f41314cf8f6afbd2c2c40e4bfd /dpmert/decode-and-evaluate.pl | |
parent | 7c4665949fb93fb3de402e4ce1d19bef67850d05 (diff) |
major restructure of the training code
Diffstat (limited to 'dpmert/decode-and-evaluate.pl')
-rwxr-xr-x | dpmert/decode-and-evaluate.pl | 246 |
1 files changed, 0 insertions, 246 deletions
diff --git a/dpmert/decode-and-evaluate.pl b/dpmert/decode-and-evaluate.pl deleted file mode 100755 index fe765d00..00000000 --- a/dpmert/decode-and-evaluate.pl +++ /dev/null @@ -1,246 +0,0 @@ -#!/usr/bin/env perl -use strict; -my @ORIG_ARGV=@ARGV; -use Cwd qw(getcwd); -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } - -# Skip local config (used for distributing jobs) if we're running in local-only mode -use LocalConfig; -use Getopt::Long; -use File::Basename qw(basename); -my $QSUB_CMD = qsub_args(mert_memory()); - -require "libcall.pl"; - -# Default settings -my $default_jobs = env_default_jobs(); -my $bin_dir = $SCRIPT_DIR; -die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; -my $FAST_SCORE="$bin_dir/../mteval/fast_score"; -die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; -my $parallelize = "$bin_dir/parallelize.pl"; -my $libcall = "$bin_dir/libcall.pl"; -my $sentserver = "$bin_dir/sentserver"; -my $sentclient = "$bin_dir/sentclient"; -my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; - -my $SCORER = $FAST_SCORE; -my $cdec = "$bin_dir/../decoder/cdec"; -die "Can't find decoder in $cdec" unless -x $cdec; -die "Can't find $parallelize" unless -x $parallelize; -die "Can't find $libcall" unless -e $libcall; -my $decoder = $cdec; -my $jobs = $default_jobs; # number of decode nodes -my $pmem = "9g"; -my $help = 0; -my $config; -my $test_set; -my $weights; -my $use_make = 1; -my $useqsub; -my $cpbin=1; -# Process command-line options -if (GetOptions( - "jobs=i" => \$jobs, - "help" => \$help, - "qsub" => \$useqsub, - "input=s" => \$test_set, - "config=s" => \$config, - "weights=s" => \$weights, -) == 0 || @ARGV!=0 || $help) { - print_help(); - exit; -} - -if ($useqsub) { - $use_make = 0; - die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); -} - -my @missing_args = (); - -if (!defined $test_set) { push @missing_args, "--input"; } -if (!defined $config) { push @missing_args, "--config"; } -if (!defined $weights) { push @missing_args, "--weights"; } -die "Please specify missing arguments: " . join (', ', @missing_args) . "\nUse --help for more information.\n" if (@missing_args); - -my @tf = localtime(time); -my $tname = basename($test_set); -$tname =~ s/\.(sgm|sgml|xml)$//i; -my $dir = "eval.$tname." . sprintf('%d%02d%02d-%02d%02d%02d', 1900+$tf[5], $tf[4], $tf[3], $tf[2], $tf[1], $tf[0]); - -my $time = unchecked_output("date"); - -check_call("mkdir -p $dir"); - -split_devset($test_set, "$dir/test.input.raw", "$dir/test.refs"); -my $refs = "-r $dir/test.refs"; -my $newsrc = "$dir/test.input"; -enseg("$dir/test.input.raw", $newsrc); -my $src_file = $newsrc; -open F, "<$src_file" or die "Can't read $src_file: $!"; close F; - -my $test_trans="$dir/test.trans"; -my $logdir="$dir/logs"; -my $decoderLog="$logdir/decoder.sentserver.log"; -check_call("mkdir -p $logdir"); - -#decode -print STDERR "RUNNING DECODER AT "; -print STDERR unchecked_output("date"); -my $decoder_cmd = "$decoder -c $config --weights $weights"; -my $pcmd; -if ($use_make) { - $pcmd = "cat $src_file | $parallelize --workdir $dir --use-fork -p $pmem -e $logdir -j $jobs --"; -} else { - $pcmd = "cat $src_file | $parallelize --workdir $dir -p $pmem -e $logdir -j $jobs --"; -} -my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $test_trans"; -check_bash_call($cmd); -print STDERR "DECODER COMPLETED AT "; -print STDERR unchecked_output("date"); -print STDERR "\nOUTPUT: $test_trans\n\n"; -my $bleu = check_output("cat $test_trans | $SCORER $refs -m ibm_bleu"); -chomp $bleu; -print STDERR "BLEU: $bleu\n"; -my $ter = check_output("cat $test_trans | $SCORER $refs -m ter"); -chomp $ter; -print STDERR " TER: $ter\n"; -open TR, ">$dir/test.scores" or die "Can't write $dir/test.scores: $!"; -print TR <<EOT; -### SCORE REPORT ############################################################# - OUTPUT=$test_trans - SCRIPT INPUT=$test_set - DECODER INPUT=$src_file - REFERENCES=$dir/test.refs ------------------------------------------------------------------------------- - BLEU=$bleu - TER=$ter -############################################################################## -EOT -close TR; -my $sr = unchecked_output("cat $dir/test.scores"); -print STDERR "\n\n$sr\n(A copy of this report can be found in $dir/test.scores)\n\n"; -exit 0; - -sub enseg { - my $src = shift; - my $newsrc = shift; - open(SRC, $src); - open(NEWSRC, ">$newsrc"); - my $i=0; - while (my $line=<SRC>){ - chomp $line; - if ($line =~ /^\s*<seg/i) { - if($line =~ /id="[0-9]+"/) { - print NEWSRC "$line\n"; - } else { - die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute"; - } - } else { - print NEWSRC "<seg id=\"$i\">$line</seg>\n"; - } - $i++; - } - close SRC; - close NEWSRC; -} - -sub print_help { - my $executable = basename($0); chomp $executable; - print << "Help"; - -Usage: $executable [options] <ini file> - - $executable --config cdec.ini --weights weights.txt [--jobs N] [--qsub] <testset.in-ref> - -Options: - - --help - Print this message and exit. - - --config <file> - A path to the cdec.ini file. - - --weights <file> - A file specifying feature weights. - - --dir <dir> - Directory for intermediate and output files. - -Job control options: - - --jobs <I> - Number of decoder processes to run in parallel. [default=$default_jobs] - - --qsub - Use qsub to run jobs in parallel (qsub must be configured in - environment/LocalEnvironment.pm) - - --pmem <N> - Amount of physical memory requested for parallel decoding jobs - (used with qsub requests only) - -Help -} - -sub convert { - my ($str) = @_; - my @ps = split /;/, $str; - my %dict = (); - for my $p (@ps) { - my ($k, $v) = split /=/, $p; - $dict{$k} = $v; - } - return %dict; -} - - - -sub cmdline { - return join ' ',($0,@ORIG_ARGV); -} - -#buggy: last arg gets quoted sometimes? -my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; -my $shell_escape_in_quote=qr{[\\"\$`!]}; - -sub escape_shell { - my ($arg)=@_; - return undef unless defined $arg; - if ($arg =~ /$is_shell_special/) { - $arg =~ s/($shell_escape_in_quote)/\\$1/g; - return "\"$arg\""; - } - return $arg; -} - -sub escaped_shell_args { - return map {local $_=$_;chomp;escape_shell($_)} @_; -} - -sub escaped_shell_args_str { - return join ' ',&escaped_shell_args(@_); -} - -sub escaped_cmdline { - return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); -} - -sub split_devset { - my ($infile, $outsrc, $outref) = @_; - open F, "<$infile" or die "Can't read $infile: $!"; - open S, ">$outsrc" or die "Can't write $outsrc: $!"; - open R, ">$outref" or die "Can't write $outref: $!"; - while(<F>) { - chomp; - my ($src, @refs) = split /\s*\|\|\|\s*/; - die "Malformed devset line: $_\n" unless scalar @refs > 0; - print S "$src\n"; - print R join(' ||| ', @refs) . "\n"; - } - close R; - close S; - close F; -} - |