diff options
author | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2012-11-14 20:33:51 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2012-11-14 20:33:51 -0500 |
commit | 7928695272b000de7142b91e05959a8fab6b1d2a (patch) | |
tree | 59fdff666e938512a34f772f04a1a247704a246f /dpmert | |
parent | 41ec6ee5146c92cdb1c279267a5058fe42f8a644 (diff) |
major mert clean up, stuff for simple system demo
Diffstat (limited to 'dpmert')
-rwxr-xr-x | dpmert/decode-and-evaluate.pl | 246 | ||||
-rwxr-xr-x | dpmert/dpmert.pl | 237 | ||||
-rwxr-xr-x | dpmert/parallelize.pl | 6 |
3 files changed, 316 insertions, 173 deletions
diff --git a/dpmert/decode-and-evaluate.pl b/dpmert/decode-and-evaluate.pl new file mode 100755 index 00000000..fe765d00 --- /dev/null +++ b/dpmert/decode-and-evaluate.pl @@ -0,0 +1,246 @@ +#!/usr/bin/env perl +use strict; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use File::Basename qw(basename); +my $QSUB_CMD = qsub_args(mert_memory()); + +require "libcall.pl"; + +# Default settings +my $default_jobs = env_default_jobs(); +my $bin_dir = $SCRIPT_DIR; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $parallelize = "$bin_dir/parallelize.pl"; +my $libcall = "$bin_dir/libcall.pl"; +my $sentserver = "$bin_dir/sentserver"; +my $sentclient = "$bin_dir/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; + +my $SCORER = $FAST_SCORE; +my $cdec = "$bin_dir/../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $jobs = $default_jobs; # number of decode nodes +my $pmem = "9g"; +my $help = 0; +my $config; +my $test_set; +my $weights; +my $use_make = 1; +my $useqsub; +my $cpbin=1; +# Process command-line options +if (GetOptions( + "jobs=i" => \$jobs, + "help" => \$help, + "qsub" => \$useqsub, + "input=s" => \$test_set, + "config=s" => \$config, + "weights=s" => \$weights, +) == 0 || @ARGV!=0 || $help) { + print_help(); + exit; +} + +if ($useqsub) { + $use_make = 0; + die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); +} + +my @missing_args = (); + +if (!defined $test_set) { push @missing_args, "--input"; } +if (!defined $config) { push @missing_args, "--config"; } +if (!defined $weights) { push @missing_args, "--weights"; } +die "Please specify missing arguments: " . join (', ', @missing_args) . "\nUse --help for more information.\n" if (@missing_args); + +my @tf = localtime(time); +my $tname = basename($test_set); +$tname =~ s/\.(sgm|sgml|xml)$//i; +my $dir = "eval.$tname." . sprintf('%d%02d%02d-%02d%02d%02d', 1900+$tf[5], $tf[4], $tf[3], $tf[2], $tf[1], $tf[0]); + +my $time = unchecked_output("date"); + +check_call("mkdir -p $dir"); + +split_devset($test_set, "$dir/test.input.raw", "$dir/test.refs"); +my $refs = "-r $dir/test.refs"; +my $newsrc = "$dir/test.input"; +enseg("$dir/test.input.raw", $newsrc); +my $src_file = $newsrc; +open F, "<$src_file" or die "Can't read $src_file: $!"; close F; + +my $test_trans="$dir/test.trans"; +my $logdir="$dir/logs"; +my $decoderLog="$logdir/decoder.sentserver.log"; +check_call("mkdir -p $logdir"); + +#decode +print STDERR "RUNNING DECODER AT "; +print STDERR unchecked_output("date"); +my $decoder_cmd = "$decoder -c $config --weights $weights"; +my $pcmd; +if ($use_make) { + $pcmd = "cat $src_file | $parallelize --workdir $dir --use-fork -p $pmem -e $logdir -j $jobs --"; +} else { + $pcmd = "cat $src_file | $parallelize --workdir $dir -p $pmem -e $logdir -j $jobs --"; +} +my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $test_trans"; +check_bash_call($cmd); +print STDERR "DECODER COMPLETED AT "; +print STDERR unchecked_output("date"); +print STDERR "\nOUTPUT: $test_trans\n\n"; +my $bleu = check_output("cat $test_trans | $SCORER $refs -m ibm_bleu"); +chomp $bleu; +print STDERR "BLEU: $bleu\n"; +my $ter = check_output("cat $test_trans | $SCORER $refs -m ter"); +chomp $ter; +print STDERR " TER: $ter\n"; +open TR, ">$dir/test.scores" or die "Can't write $dir/test.scores: $!"; +print TR <<EOT; +### SCORE REPORT ############################################################# + OUTPUT=$test_trans + SCRIPT INPUT=$test_set + DECODER INPUT=$src_file + REFERENCES=$dir/test.refs +------------------------------------------------------------------------------ + BLEU=$bleu + TER=$ter +############################################################################## +EOT +close TR; +my $sr = unchecked_output("cat $dir/test.scores"); +print STDERR "\n\n$sr\n(A copy of this report can be found in $dir/test.scores)\n\n"; +exit 0; + +sub enseg { + my $src = shift; + my $newsrc = shift; + open(SRC, $src); + open(NEWSRC, ">$newsrc"); + my $i=0; + while (my $line=<SRC>){ + chomp $line; + if ($line =~ /^\s*<seg/i) { + if($line =~ /id="[0-9]+"/) { + print NEWSRC "$line\n"; + } else { + die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute"; + } + } else { + print NEWSRC "<seg id=\"$i\">$line</seg>\n"; + } + $i++; + } + close SRC; + close NEWSRC; +} + +sub print_help { + my $executable = basename($0); chomp $executable; + print << "Help"; + +Usage: $executable [options] <ini file> + + $executable --config cdec.ini --weights weights.txt [--jobs N] [--qsub] <testset.in-ref> + +Options: + + --help + Print this message and exit. + + --config <file> + A path to the cdec.ini file. + + --weights <file> + A file specifying feature weights. + + --dir <dir> + Directory for intermediate and output files. + +Job control options: + + --jobs <I> + Number of decoder processes to run in parallel. [default=$default_jobs] + + --qsub + Use qsub to run jobs in parallel (qsub must be configured in + environment/LocalEnvironment.pm) + + --pmem <N> + Amount of physical memory requested for parallel decoding jobs + (used with qsub requests only) + +Help +} + +sub convert { + my ($str) = @_; + my @ps = split /;/, $str; + my %dict = (); + for my $p (@ps) { + my ($k, $v) = split /=/, $p; + $dict{$k} = $v; + } + return %dict; +} + + + +sub cmdline { + return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? +my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} + +sub escaped_shell_args { + return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { + return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { + return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} + +sub split_devset { + my ($infile, $outsrc, $outref) = @_; + open F, "<$infile" or die "Can't read $infile: $!"; + open S, ">$outsrc" or die "Can't write $outsrc: $!"; + open R, ">$outref" or die "Can't write $outref: $!"; + while(<F>) { + chomp; + my ($src, @refs) = split /\s*\|\|\|\s*/; + die "Malformed devset line: $_\n" unless scalar @refs > 0; + print S "$src\n"; + print R join(' ||| ', @refs) . "\n"; + } + close R; + close S; + close F; +} + diff --git a/dpmert/dpmert.pl b/dpmert/dpmert.pl index 2e6a9728..c4f98870 100755 --- a/dpmert/dpmert.pl +++ b/dpmert/dpmert.pl @@ -7,15 +7,14 @@ my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR # Skip local config (used for distributing jobs) if we're running in local-only mode use LocalConfig; use Getopt::Long; -use IPC::Open2; -use POSIX ":sys_wait_h"; -my $QSUB_CMD = qsub_args(mert_memory()); - +use File::Basename qw(basename); require "libcall.pl"; +my $QSUB_CMD = qsub_args(mert_memory()); + # Default settings -my $srcFile; -my $refFiles; +my $srcFile; # deprecated +my $refFiles; # deprecated my $default_jobs = env_default_jobs(); my $bin_dir = $SCRIPT_DIR; die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; @@ -37,7 +36,7 @@ die "Can't find decoder in $cdec" unless -x $cdec; die "Can't find $parallelize" unless -x $parallelize; die "Can't find $libcall" unless -e $libcall; my $decoder = $cdec; -my $lines_per_mapper = 400; +my $lines_per_mapper = 200; my $rand_directions = 15; my $iteration = 1; my $best_weights; @@ -47,53 +46,35 @@ my $jobs = $default_jobs; # number of decode nodes my $pmem = "9g"; my $disable_clean = 0; my %seen_weights; -my $normalize; my $help = 0; my $epsilon = 0.0001; -my $interval = 5; -my $dryrun = 0; my $last_score = -10000000; my $metric = "ibm_bleu"; my $dir; my $iniFile; my $weights; my $initialWeights; -my $decoderOpt; -my $noprimary; -my $maxsim=0; -my $oraclen=0; -my $oracleb=20; my $bleu_weight=1; my $use_make = 1; # use make to parallelize line search my $useqsub; my $pass_suffix = ''; -my $devset = ''; -my $cpbin=1; +my $devset; # Process command-line options -Getopt::Long::Configure("no_auto_abbrev"); if (GetOptions( - "decoder=s" => \$decoderOpt, + "config=s" => \$iniFile, + "weights=s" => \$initialWeights, + "devset=s" => \$devset, "jobs=i" => \$jobs, - "dont-clean" => \$disable_clean, "pass-suffix=s" => \$pass_suffix, - "dry-run" => \$dryrun, - "epsilon=s" => \$epsilon, "help" => \$help, - "interval" => \$interval, "qsub" => \$useqsub, - "max-iterations=i" => \$max_iterations, - "normalize=s" => \$normalize, + "iterations=i" => \$max_iterations, "pmem=s" => \$pmem, - "cpbin!" => \$cpbin, "random-directions=i" => \$rand_directions, - "devset=s" => \$devset, - "ref-files=s" => \$refFiles, "metric=s" => \$metric, "source-file=s" => \$srcFile, - "weights=s" => \$initialWeights, - "workdir=s" => \$dir, - "opt-iterations=i" => \$optimization_iters, -) == 0 || @ARGV!=1 || $help) { + "output-dir=s" => \$dir, +) == 0 || @ARGV!=0 || $help) { print_help(); exit; } @@ -114,22 +95,17 @@ if (defined $srcFile || defined $refFiles) { EOT } +if (!defined $iniFile) { push @missing_args, "--config"; } if (!defined $devset) { push @missing_args, "--devset"; } if (!defined $initialWeights) { push @missing_args, "--weights"; } -die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args); +die "Please specify missing arguments: " . join (', ', @missing_args) . "\nUse --help for more information.\n" if (@missing_args); if ($metric =~ /^(combi|ter)$/i) { $lines_per_mapper = 40; } elsif ($metric =~ /^meteor$/i) { - $lines_per_mapper = 2000; # start up time is really high + $lines_per_mapper = 2000; # start up time is really high for METEOR } -($iniFile) = @ARGV; - - -sub write_config; -sub enseg; -sub print_help; my $nodelist; my $host =check_output("hostname"); chomp $host; @@ -153,8 +129,6 @@ unless ($dir =~ /^\//){ # convert relative path to absolute path $dir = "$basedir/$dir"; } -if ($decoderOpt){ $decoder = $decoderOpt; } - # Initializations and helper functions srand; @@ -169,73 +143,47 @@ sub cleanup { exit 1; }; # Always call cleanup, no matter how we exit -*CORE::GLOBAL::exit = - sub{ cleanup(); }; +*CORE::GLOBAL::exit = sub{ cleanup(); }; $SIG{INT} = "cleanup"; $SIG{TERM} = "cleanup"; $SIG{HUP} = "cleanup"; -my $decoderBase = check_output("basename $decoder"); chomp $decoderBase; +my $decoderBase = basename($decoder); chomp $decoderBase; my $newIniFile = "$dir/$decoderBase.ini"; my $inputFileName = "$dir/input"; my $user = $ENV{"USER"}; - # process ini file -e $iniFile || die "Error: could not open $iniFile for reading\n"; -open(INI, $iniFile); -use File::Basename qw(basename); -#pass bindir, refs to vars holding bin -sub modbin { - local $_; - my $bindir=shift; - check_call("mkdir -p $bindir"); - -d $bindir || die "couldn't make bindir $bindir"; - for (@_) { - my $src=$$_; - $$_="$bindir/".basename($src); - check_call("cp -p $src $$_"); - } -} sub dirsize { opendir ISEMPTY,$_[0]; return scalar(readdir(ISEMPTY))-1; } -if ($dryrun){ - write_config(*STDERR); - exit 0; +if (-e $dir) { + # allow preexisting logfile, binaries, but not dist-dpmert.pl outputs + die "ERROR: output directory $dir already exists (remove or use --output-dir dir)\n\n"; } else { - if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-dpmert.pl outputs - die "ERROR: working dir $dir already exists\n\n"; - } else { - -e $dir || mkdir $dir; - mkdir "$dir/hgs"; - modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin; - mkdir "$dir/scripts"; - my $cmdfile="$dir/rerun-dpmert.sh"; - open CMD,'>',$cmdfile; - print CMD "cd ",&getcwd,"\n"; -# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted. - my $cline=&cmdline."\n"; - print CMD $cline; - close CMD; - print STDERR $cline; - chmod(0755,$cmdfile); - unless (-e $initialWeights) { - print STDERR "Please specify an initial weights file with --initial-weights\n"; - print_help(); - exit; - } - check_call("cp $initialWeights $dir/weights.0"); - die "Can't find weights.0" unless (-e "$dir/weights.0"); - } - write_config(*STDERR); + mkdir "$dir" or die "Can't mkdir $dir: $!"; + mkdir "$dir/hgs" or die; + mkdir "$dir/scripts" or die; + print STDERR <<EOT; + DECODER: $decoder + INI FILE: $iniFile + WORKING DIR: $dir + DEVSET: $devset + EVAL METRIC: $metric + MAX ITERATIONS: $max_iterations + PARALLEL JOBS: $jobs + HEAD NODE: $host + PMEM (DECODING): $pmem + INITIAL WEIGHTS: $initialWeights +EOT } - # Generate initial files and values check_call("cp $iniFile $newIniFile"); +check_call("cp $initialWeights $dir/weights.0"); $iniFile = $newIniFile; split_devset($devset, "$dir/dev.input.raw", "$dir/dev.refs"); @@ -280,9 +228,9 @@ while (1){ my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; my $pcmd; if ($use_make) { - $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --"; + $pcmd = "cat $srcFile | $parallelize --workdir $dir --use-fork -p $pmem -e $logdir -j $jobs --"; } else { - $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --"; + $pcmd = "cat $srcFile | $parallelize --workdir $dir -p $pmem -e $logdir -j $jobs --"; } my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; print STDERR "COMMAND:\n$cmd\n"; @@ -469,29 +417,11 @@ while (1){ print STDERR "\n==========\n"; } -print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n"; - -print STDOUT "$lastWeightsFile\n"; - +check_call("cp $lastWeightsFile $dir/weights.final"); +print STDERR "\nFINAL WEIGHTS: $dir/weights.final\n(Use -w <this file> with the decoder)\n\n"; +print STDOUT "$dir/weights.final\n"; exit 0; -sub normalize_weights { - my ($rfn, $rpts, $feat) = @_; - my @feat_names = @$rfn; - my @pts = @$rpts; - my $z = 1.0; - for (my $i=0; $i < scalar @feat_names; $i++) { - if ($feat_names[$i] eq $feat) { - $z = $pts[$i]; - last; - } - } - for (my $i=0; $i < scalar @feat_names; $i++) { - $pts[$i] /= $z; - } - print STDERR " NORM WEIGHTS: @pts\n"; - return @pts; -} sub get_lines { my $fn = shift @_; @@ -523,27 +453,6 @@ sub read_weights_file { return join ' ', @r; } -# subs -sub write_config { - my $fh = shift; - my $cleanup = "yes"; - if ($disable_clean) {$cleanup = "no";} - - print $fh "\n"; - print $fh "DECODER: $decoder\n"; - print $fh "INI FILE: $iniFile\n"; - print $fh "WORKING DIR: $dir\n"; - print $fh "DEVSET: $devset\n"; - print $fh "EVAL METRIC: $metric\n"; - print $fh "START ITERATION: $iteration\n"; - print $fh "MAX ITERATIONS: $max_iterations\n"; - print $fh "PARALLEL JOBS: $jobs\n"; - print $fh "HEAD NODE: $host\n"; - print $fh "PMEM (DECODING): $pmem\n"; - print $fh "CLEANUP: $cleanup\n"; - print $fh "INITIAL WEIGHTS: $initialWeights\n"; -} - sub update_weights_file { my ($neww, $rfn, $rpts) = @_; my @feats = @$rfn; @@ -585,22 +494,34 @@ sub enseg { sub print_help { - my $executable = check_output("basename $0"); chomp $executable; - print << "Help"; + my $executable = basename($0); chomp $executable; + print << "Help"; Usage: $executable [options] <ini file> - $executable [options] <ini file> - Runs a complete MERT optimization using the decoder configuration - in <ini file>. Required options are --weights, --source-file, and - --ref-files. + $executable [options] + Runs a complete MERT optimization. Required options are --weights, + --devset, and --config. Options: - --help - Print this message and exit. + --config <file> [-c <file>] + The decoder configuration file. + + --devset <file> [-d <file>] + The source *and* references for the development set. + + --weights <file> [-w <file>] + A file specifying initial feature weights. The format is + FeatureName_1 value1 + FeatureName_2 value2 + **All and only the weights listed in <file> will be optimized!** + + --metric <name> + Metric to optimize. + Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi - --max-iterations <M> + --iterations <M> Maximum number of iterations to run. If not specified, defaults to 10. @@ -608,39 +529,15 @@ Options: If the decoder is doing multi-pass decoding, the pass suffix "2", "3", etc., is used to control what iteration of weights is set. - --ref-files <files> - Dev set ref files. This option takes only a single string argument. - To use multiple files (including file globbing), this argument should - be quoted. - - --metric <method> - Metric to optimize. - Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi - - --normalize <feature-name> - After each iteration, rescale all feature weights such that feature- - name has a weight of 1.0. - --rand-directions <num> MERT will attempt to optimize along all of the principle directions, set this parameter to explore other directions. Defaults to 5. - --source-file <file> - Dev set source file. + --output-dir <dir> + Directory for intermediate and output files. - --weights <file> - A file specifying initial feature weights. The format is - FeatureName_1 value1 - FeatureName_2 value2 - **All and only the weights listed in <file> will be optimized!** - - --workdir <dir> - Directory for intermediate and output files. If not specified, the - name is derived from the ini filename. Assuming that the ini - filename begins with the decoder name and ends with ini, the default - name of the working directory is inferred from the middle part of - the filename. E.g. an ini file named decoder.foo.ini would have - a default working directory name foo. + --help + Print this message and exit. Job control options: diff --git a/dpmert/parallelize.pl b/dpmert/parallelize.pl index 7d0365cc..d2ebaeea 100755 --- a/dpmert/parallelize.pl +++ b/dpmert/parallelize.pl @@ -40,7 +40,7 @@ my $stay_alive; # dont let server die when having zero clients my $joblist = ""; my $errordir=""; my $multiline; -my @files_to_stage; +my $workdir = '.'; my $numnodes = 8; my $user = $ENV{"USER"}; my $pmem = "9g"; @@ -128,7 +128,7 @@ unless (GetOptions( "recycle-clients" => \$recycle_clients, "error-dir=s" => \$errordir, "multi-line" => \$multiline, - "file=s" => \@files_to_stage, + "workdir=s" => \$workdir, "use-fork" => \$use_fork, "verbose" => \$verbose, "jobs=i" => \$numnodes, @@ -363,7 +363,7 @@ sub launch_job_fork { } sub get_temp_script { - my ($fh, $filename) = tempfile( "workXXXX", SUFFIX => '.sh'); + my ($fh, $filename) = tempfile( "$workdir/workXXXX", SUFFIX => '.sh'); return ($fh, $filename); } |