summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--mteval/meteor_jar.cc.in3
-rw-r--r--pro/Makefile.am (renamed from pro-train/Makefile.am)0
-rw-r--r--pro/README.shared-mem (renamed from pro-train/README.shared-mem)0
-rwxr-xr-xpro/mr_pro_generate_mapper_input.pl (renamed from pro-train/mr_pro_generate_mapper_input.pl)0
-rw-r--r--pro/mr_pro_map.cc (renamed from pro-train/mr_pro_map.cc)0
-rw-r--r--pro/mr_pro_reduce.cc (renamed from pro-train/mr_pro_reduce.cc)0
-rwxr-xr-xpro/pro.pl (renamed from pro-train/dist-pro.pl)244
7 files changed, 67 insertions, 180 deletions
diff --git a/mteval/meteor_jar.cc.in b/mteval/meteor_jar.cc.in
new file mode 100644
index 00000000..fe45a72a
--- /dev/null
+++ b/mteval/meteor_jar.cc.in
@@ -0,0 +1,3 @@
+
+const char* meteor_jar_path = "@METEOR_JAR@";
+
diff --git a/pro-train/Makefile.am b/pro/Makefile.am
index 1e9d46b0..1e9d46b0 100644
--- a/pro-train/Makefile.am
+++ b/pro/Makefile.am
diff --git a/pro-train/README.shared-mem b/pro/README.shared-mem
index 7728efc0..7728efc0 100644
--- a/pro-train/README.shared-mem
+++ b/pro/README.shared-mem
diff --git a/pro-train/mr_pro_generate_mapper_input.pl b/pro/mr_pro_generate_mapper_input.pl
index b30fc4fd..b30fc4fd 100755
--- a/pro-train/mr_pro_generate_mapper_input.pl
+++ b/pro/mr_pro_generate_mapper_input.pl
diff --git a/pro-train/mr_pro_map.cc b/pro/mr_pro_map.cc
index eef40b8a..eef40b8a 100644
--- a/pro-train/mr_pro_map.cc
+++ b/pro/mr_pro_map.cc
diff --git a/pro-train/mr_pro_reduce.cc b/pro/mr_pro_reduce.cc
index 5ef9b470..5ef9b470 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro/mr_pro_reduce.cc
diff --git a/pro-train/dist-pro.pl b/pro/pro.pl
index 31258fa6..891b7e4c 100755
--- a/pro-train/dist-pro.pl
+++ b/pro/pro.pl
@@ -1,5 +1,6 @@
#!/usr/bin/env perl
use strict;
+use File::Basename qw(basename);
my @ORIG_ARGV=@ARGV;
use Cwd qw(getcwd);
my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
@@ -60,51 +61,38 @@ my $use_make = 1; # use make to parallelize
my $useqsub = 0;
my $initial_weights;
my $pass_suffix = '';
-my $cpbin=1;
+my $devset;
# regularization strength
-my $tune_regularizer = 0;
my $reg = 500;
my $reg_previous = 5000;
# Process command-line options
-Getopt::Long::Configure("no_auto_abbrev");
if (GetOptions(
+ "config=s" => \$iniFile,
+ "weights=s" => \$initial_weights,
+ "devset=s" => \$devset,
"jobs=i" => \$jobs,
- "dont-clean" => \$disable_clean,
+ "metric=s" => \$metric,
"pass-suffix=s" => \$pass_suffix,
"qsub" => \$useqsub,
- "dry-run" => \$dryrun,
- "epsilon=s" => \$epsilon,
- "interpolate-with-weights=f" => \$psi,
"help" => \$help,
- "weights=s" => \$initial_weights,
- "tune-regularizer" => \$tune_regularizer,
"reg=f" => \$reg,
"reg-previous=f" => \$reg_previous,
- "use-make=i" => \$use_make,
- "max-iterations=i" => \$max_iterations,
- "pmem=s" => \$pmem,
- "cpbin!" => \$cpbin,
- "ref-files=s" => \$refFiles,
- "metric=s" => \$metric,
- "source-file=s" => \$srcFile,
- "workdir=s" => \$dir,
-) == 0 || @ARGV!=1 || $help) {
+ "output-dir=s" => \$dir,
+) == 0 || @ARGV!=0 || $help) {
print_help();
exit;
}
-die "--tune-regularizer is no longer supported with --reg-previous and --reg. Please tune manually.\n" if $tune_regularizer;
-
if ($useqsub) {
$use_make = 0;
die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
}
my @missing_args = ();
-if (!defined $srcFile) { push @missing_args, "--source-file"; }
-if (!defined $refFiles) { push @missing_args, "--ref-files"; }
+if (!defined $iniFile) { push @missing_args, "--config"; }
+if (!defined $devset) { push @missing_args, "--devset"; }
if (!defined $initial_weights) { push @missing_args, "--weights"; }
die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args);
@@ -112,14 +100,6 @@ if ($metric =~ /^(combi|ter)$/i) {
$lines_per_mapper = 5;
}
-($iniFile) = @ARGV;
-
-
-sub write_config;
-sub enseg;
-sub print_help;
-
-my $nodelist;
my $host =check_output("hostname"); chomp $host;
my $bleu;
my $interval_count = 0;
@@ -132,10 +112,8 @@ if ($metric =~ /^ter$|^aer$/i) {
$DIR_FLAG = '';
}
-my $refs_comma_sep = get_comma_sep_refs('r',$refFiles);
-
unless ($dir){
- $dir = "protrain";
+ $dir = 'pro';
}
unless ($dir =~ /^\//){ # convert relative path to absolute path
my $basedir = check_output("pwd");
@@ -143,7 +121,6 @@ unless ($dir =~ /^\//){ # convert relative path to absolute path
$dir = "$basedir/$dir";
}
-
# Initializations and helper functions
srand;
@@ -173,57 +150,35 @@ my $user = $ENV{"USER"};
-e $iniFile || die "Error: could not open $iniFile for reading\n";
open(INI, $iniFile);
-use File::Basename qw(basename);
-#pass bindir, refs to vars holding bin
-sub modbin {
- local $_;
- my $bindir=shift;
- check_call("mkdir -p $bindir");
- -d $bindir || die "couldn't make bindir $bindir";
- for (@_) {
- my $src=$$_;
- $$_="$bindir/".basename($src);
- check_call("cp -p $src $$_");
- }
-}
-sub dirsize {
- opendir ISEMPTY,$_[0];
- return scalar(readdir(ISEMPTY))-1;
-}
-my @allweights;
-if ($dryrun){
- write_config(*STDERR);
- exit 0;
+if (-e $dir) {
+ die "ERROR: working dir $dir already exists\n\n";
} else {
- if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-pro.pl outputs
- die "ERROR: working dir $dir already exists\n\n";
- } else {
- -e $dir || mkdir $dir;
- mkdir "$dir/hgs";
- modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin;
- mkdir "$dir/scripts";
- my $cmdfile="$dir/rerun-pro.sh";
- open CMD,'>',$cmdfile;
- print CMD "cd ",&getcwd,"\n";
-# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted.
- my $cline=&cmdline."\n";
- print CMD $cline;
- close CMD;
- print STDERR $cline;
- chmod(0755,$cmdfile);
- check_call("cp $initial_weights $dir/weights.0");
- die "Can't find weights.0" unless (-e "$dir/weights.0");
- }
- write_config(*STDERR);
+ mkdir "$dir" or die "Can't mkdir $dir: $!";
+ mkdir "$dir/hgs" or die;
+ mkdir "$dir/scripts" or die;
+ print STDERR <<EOT;
+ DECODER: $decoder
+ INI FILE: $iniFile
+ WORKING DIR: $dir
+ DEVSET: $devset
+ EVAL METRIC: $metric
+ MAX ITERATIONS: $max_iterations
+ PARALLEL JOBS: $jobs
+ HEAD NODE: $host
+ PMEM (DECODING): $pmem
+ INITIAL WEIGHTS: $initial_weights
+EOT
}
-
# Generate initial files and values
check_call("cp $iniFile $newIniFile");
+check_call("cp $initial_weights $dir/weights.0");
$iniFile = $newIniFile;
+my $refs = "$dir/dev.refs";
+split_devset($devset, "$dir/dev.input.raw", $refs);
my $newsrc = "$dir/dev.input";
-enseg($srcFile, $newsrc);
+enseg("$dir/dev.input.raw", $newsrc);
$srcFile = $newsrc;
my $devSize = 0;
open F, "<$srcFile" or die "Can't read $srcFile: $!";
@@ -238,6 +193,7 @@ my $random_seed = int(time / 1000);
my $lastWeightsFile;
my $lastPScore = 0;
# main optimization loop
+my @allweights;
while (1){
print STDERR "\n\nITERATION $iteration\n==========\n";
@@ -288,7 +244,7 @@ while (1){
$retries++;
}
die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest);
- my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric");
+ my $dec_score = check_output("cat $runFile | $SCORER -r $refs -m $metric");
chomp $dec_score;
print STDERR "DECODER SCORE: $dec_score\n";
@@ -338,7 +294,7 @@ while (1){
$mapoutput =~ s/mapinput/mapoutput/;
push @mapoutputs, "$dir/splag.$im1/$mapoutput";
$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
- my $script = "$MAPPER -s $srcFile -m $metric $refs_comma_sep -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput";
+ my $script = "$MAPPER -s $srcFile -m $metric -r $refs -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput";
if ($use_make) {
my $script_file = "$dir/scripts/map.$shard";
open F, ">$script_file" or die "Can't write $script_file: $!";
@@ -371,20 +327,7 @@ while (1){
}
my @dev_outs = ();
my @devtest_outs = ();
- if ($tune_regularizer) {
- for (my $i = 0; $i < scalar @mapoutputs; $i++) {
- if ($i % 3 == 1) {
- push @devtest_outs, $mapoutputs[$i];
- } else {
- push @dev_outs, $mapoutputs[$i];
- }
- }
- if (scalar @devtest_outs == 0) {
- die "Not enough training instances for regularization tuning! Rerun without --tune-regularizer\n";
- }
- } else {
- @dev_outs = @mapoutputs;
- }
+ @dev_outs = @mapoutputs;
if ($use_make) {
print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n";
close $mkfile;
@@ -405,64 +348,25 @@ while (1){
my $tol = 0;
my $til = 0;
my $dev_test_file = "$dir/splag.$im1/devtest.gz";
- if ($tune_regularizer) {
- my $cmd = "cat @devtest_outs | gzip > $dev_test_file";
- check_bash_call($cmd);
- die "Can't find file $dev_test_file" unless -f $dev_test_file;
- }
- #print STDERR "MO: @mapoutputs\n";
- for my $mo (@mapoutputs) {
- #my $olines = get_lines($mo);
- #my $ilines = get_lines($o2i{$mo});
- #die "$mo: no training instances generated!" if $olines == 0;
- }
print STDERR "\nRUNNING CLASSIFIER (REDUCER)\n";
print STDERR unchecked_output("date");
$cmd="cat @dev_outs | $REDUCER -w $dir/weights.$im1 -C $reg -y $reg_previous --interpolate_with_weights $psi";
- if ($tune_regularizer) {
- $cmd .= " -T -t $dev_test_file";
- }
$cmd .= " > $dir/weights.$iteration";
print STDERR "COMMAND:\n$cmd\n";
check_bash_call($cmd);
$lastWeightsFile = "$dir/weights.$iteration";
- if ($tune_regularizer) {
- open W, "<$lastWeightsFile" or die "Can't read $lastWeightsFile: $!";
- my $line = <W>;
- close W;
- my ($sharp, $label, $nreg) = split /\s|=/, $line;
- print STDERR "REGULARIZATION STRENGTH ($label) IS $nreg\n";
- $reg = $nreg;
- # only tune regularizer on first iteration?
- $tune_regularizer = 0;
- }
$lastPScore = $score;
$iteration++;
print STDERR "\n==========\n";
}
-print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n";
-print STDOUT "$lastWeightsFile\n";
+check_call("cp $lastWeightsFile $dir/weights.final");
+print STDERR "\nFINAL WEIGHTS: $dir/weights.final\n(Use -w <this file> with the decoder)\n\n";
+print STDOUT "$dir/weights.final\n";
exit 0;
-sub get_lines {
- my $fn = shift @_;
- open FL, "<$fn" or die "Couldn't read $fn: $!";
- my $lc = 0;
- while(<FL>) { $lc++; }
- return $lc;
-}
-
-sub get_comma_sep_refs {
- my ($r,$p) = @_;
- my $o = check_output("echo $p");
- chomp $o;
- my @files = split /\s+/, $o;
- return "-$r " . join(" -$r ", @files);
-}
-
sub read_weights_file {
my ($file) = @_;
open F, "<$file" or die "Couldn't read $file: $!";
@@ -485,42 +389,6 @@ sub read_weights_file {
return join ' ', @r;
}
-# subs
-sub write_config {
- my $fh = shift;
- my $cleanup = "yes";
- if ($disable_clean) {$cleanup = "no";}
-
- print $fh "\n";
- print $fh "DECODER: $decoder\n";
- print $fh "INI FILE: $iniFile\n";
- print $fh "WORKING DIR: $dir\n";
- print $fh "SOURCE (DEV): $srcFile\n";
- print $fh "REFS (DEV): $refFiles\n";
- print $fh "EVAL METRIC: $metric\n";
- print $fh "MAX ITERATIONS: $max_iterations\n";
- print $fh "JOBS: $jobs\n";
- print $fh "HEAD NODE: $host\n";
- print $fh "PMEM (DECODING): $pmem\n";
- print $fh "CLEANUP: $cleanup\n";
-}
-
-sub update_weights_file {
- my ($neww, $rfn, $rpts) = @_;
- my @feats = @$rfn;
- my @pts = @$rpts;
- my $num_feats = scalar @feats;
- my $num_pts = scalar @pts;
- die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts;
- open G, ">$neww" or die;
- for (my $i = 0; $i < $num_feats; $i++) {
- my $f = $feats[$i];
- my $lambda = $pts[$i];
- print G "$f $lambda\n";
- }
- close G;
-}
-
sub enseg {
my $src = shift;
my $newsrc = shift;
@@ -547,23 +415,21 @@ sub enseg {
sub print_help {
- my $executable = check_output("basename $0"); chomp $executable;
+ my $executable = basename($0); chomp $executable;
print << "Help";
-Usage: $executable [options] <ini file>
+Usage: $executable [options]
- $executable [options] <ini file>
+ $executable [options]
Runs a complete PRO optimization using the ini file specified.
Required:
- --ref-files <files>
- Dev set ref files. This option takes only a single string argument.
- To use multiple files (including file globbing), this argument should
- be quoted.
+ --config <cdec.ini>
+ Decoder configuration file.
- --source-file <file>
- Dev set source file.
+ --devset <files>
+ Dev set source and reference data.
--weights <file>
Initial weights file (use empty file to start from 0)
@@ -669,3 +535,21 @@ sub escaped_shell_args_str {
sub escaped_cmdline {
return "$0 ".&escaped_shell_args_str(@ORIG_ARGV);
}
+
+sub split_devset {
+ my ($infile, $outsrc, $outref) = @_;
+ open F, "<$infile" or die "Can't read $infile: $!";
+ open S, ">$outsrc" or die "Can't write $outsrc: $!";
+ open R, ">$outref" or die "Can't write $outref: $!";
+ while(<F>) {
+ chomp;
+ my ($src, @refs) = split /\s*\|\|\|\s*/;
+ die "Malformed devset line: $_\n" unless scalar @refs > 0;
+ print S "$src\n";
+ print R join(' ||| ', @refs) . "\n";
+ }
+ close R;
+ close S;
+ close F;
+}
+