summaryrefslogtreecommitdiff
path: root/training/cluster-ptrain.pl
diff options
context:
space:
mode:
authorredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-06-22 05:12:27 +0000
committerredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-06-22 05:12:27 +0000
commit0172721855098ca02b207231a654dffa5e4eb1c9 (patch)
tree8069c3a62e2d72bd64a2cdeee9724b2679c8a56b /training/cluster-ptrain.pl
parent37728b8be4d0b3df9da81fdda2198ff55b4b2d91 (diff)
initial checkin
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@2 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'training/cluster-ptrain.pl')
-rwxr-xr-xtraining/cluster-ptrain.pl206
1 files changed, 206 insertions, 0 deletions
diff --git a/training/cluster-ptrain.pl b/training/cluster-ptrain.pl
new file mode 100755
index 00000000..03122df9
--- /dev/null
+++ b/training/cluster-ptrain.pl
@@ -0,0 +1,206 @@
+#!/usr/bin/perl -w
+
+use strict;
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path getcwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
+use Getopt::Long;
+
+my $MAX_ITER_ATTEMPTS = 5; # number of times to retry a failed function evaluation
+my $CWD=getcwd();
+my $OPTIMIZER = "$SCRIPT_DIR/mr_optimize_reduce";
+my $DECODER = "$SCRIPT_DIR/../decoder/cdec";
+my $COMBINER_CACHE_SIZE = 150;
+# This is a hack to run this on a weird cluster,
+# eventually, I'll provide Hadoop scripts.
+my $PARALLEL = "/chomes/redpony/svn-trunk/sa-utils/parallelize.pl";
+die "Can't find $OPTIMIZER" unless -f $OPTIMIZER;
+die "Can't execute $OPTIMIZER" unless -x $OPTIMIZER;
+my $restart = '';
+if ($ARGV[0] && $ARGV[0] eq '--restart') { shift @ARGV; $restart = 1; }
+
+my $pmem="2500mb";
+my $nodes = 1;
+my $max_iteration = 1000;
+my $PRIOR_FLAG = "";
+my $parallel = 1;
+my $CFLAG = "-C 1";
+my $LOCAL;
+my $DISTRIBUTED;
+my $PRIOR;
+my $OALG = "lbfgs";
+my $sigsq = 1;
+my $means_file;
+my $mem_buffers = 20;
+my $RESTART_IF_NECESSARY;
+GetOptions("cdec=s" => \$DECODER,
+ "distributed" => \$DISTRIBUTED,
+ "sigma_squared=f" => \$sigsq,
+ "lbfgs_memory_buffers=i" => \$mem_buffers,
+ "max_iteration=i" => \$max_iteration,
+ "means=s" => \$means_file,
+ "optimizer=s" => \$OALG,
+ "gaussian_prior" => \$PRIOR,
+ "restart_if_necessary" => \$RESTART_IF_NECESSARY,
+ "jobs=i" => \$nodes,
+ "pmem=s" => \$pmem
+ ) or usage();
+usage() unless scalar @ARGV==3;
+my $config_file = shift @ARGV;
+my $training_corpus = shift @ARGV;
+my $initial_weights = shift @ARGV;
+unless ($DISTRIBUTED) { $LOCAL = 1; }
+die "Can't find $config_file" unless -f $config_file;
+die "Can't find $DECODER" unless -f $DECODER;
+die "Can't execute $DECODER" unless -x $DECODER;
+if ($LOCAL) { print STDERR "Will run LOCALLY.\n"; $parallel = 0; }
+if ($PRIOR) {
+ $PRIOR_FLAG="-p --sigma_squared $sigsq";
+ if ($means_file) { $PRIOR_FLAG .= " -u $means_file"; }
+}
+
+if ($parallel) {
+ die "Can't find $PARALLEL" unless -f $PARALLEL;
+ die "Can't execute $PARALLEL" unless -x $PARALLEL;
+}
+unless ($parallel) { $CFLAG = "-C 500"; }
+unless ($config_file =~ /^\//) { $config_file = $CWD . '/' . $config_file; }
+my $clines = num_lines($training_corpus);
+my $dir = "$CWD/ptrain";
+
+if ($RESTART_IF_NECESSARY && -d $dir) {
+ $restart = 1;
+}
+
+print STDERR <<EOT;
+PTRAIN CONFIGURATION INFORMATION
+
+ Config file: $config_file
+ Training corpus: $training_corpus
+ Corpus size: $clines
+ Initial weights: $initial_weights
+ Decoder memory: $pmem
+ Max iterations: $max_iteration
+ Optimizer: $OALG
+ Jobs requested: $nodes
+ prior?: $PRIOR_FLAG
+ restart?: $restart
+EOT
+
+if ($OALG) { $OALG="-m $OALG"; }
+
+my $nodelist="1";
+for (my $i=1; $i<$nodes; $i++) { $nodelist .= " 1"; }
+my $iter = 1;
+
+if ($restart) {
+ die "$dir doesn't exist, but --restart specified!\n" unless -d $dir;
+ my $o = `ls -t $dir/weights.*`;
+ my ($a, @x) = split /\n/, $o;
+ if ($a =~ /weights.(\d+)\.gz$/) {
+ $iter = $1;
+ } else {
+ die "Unexpected file: $a!\n";
+ }
+ print STDERR "Restarting at iteration $iter\n";
+} else {
+ die "$dir already exists!\n" if -e $dir;
+ mkdir $dir or die "Can't create $dir: $!";
+
+ unless ($initial_weights =~ /\.gz$/) {
+ `cp $initial_weights $dir/weights.1`;
+ `gzip -9 $dir/weights.1`;
+ } else {
+ `cp $initial_weights $dir/weights.1.gz`;
+ }
+ open T, "<$training_corpus" or die "Can't read $training_corpus: $!";
+ open TO, ">$dir/training.in";
+ my $lc = 0;
+ while(<T>) {
+ chomp;
+ s/^\s+//;
+ s/\s+$//;
+ die "Expected A ||| B in input file" unless / \|\|\| /;
+ print TO "<seg id=\"$lc\">$_</seg>\n";
+ $lc++;
+ }
+ close T;
+ close TO;
+}
+$training_corpus = "$dir/training.in";
+
+my $iter_attempts = 1;
+while ($iter < $max_iteration) {
+ my $cur_time = `date`; chomp $cur_time;
+ print STDERR "\nStarting iteration $iter...\n";
+ print STDERR " time: $cur_time\n";
+ my $start = time;
+ my $next_iter = $iter + 1;
+ my $dec_cmd="$DECODER -G $CFLAG -c $config_file -w $dir/weights.$iter.gz < $training_corpus 2> $dir/deco.log.$iter";
+ my $opt_cmd = "$OPTIMIZER $PRIOR_FLAG -M $mem_buffers $OALG -s $dir/opt.state -i $dir/weights.$iter.gz -o $dir/weights.$next_iter.gz";
+ my $pcmd = "$PARALLEL -e $dir/err -p $pmem --nodelist \"$nodelist\" -- ";
+ my $cmd = "";
+ if ($parallel) { $cmd = $pcmd; }
+ $cmd .= "$dec_cmd | $opt_cmd";
+
+ print STDERR "EXECUTING: $cmd\n";
+ my $result = `$cmd`;
+ my $exit_code = $? >> 8;
+ if ($exit_code == 99) {
+ $iter_attempts++;
+ if ($iter_attempts > $MAX_ITER_ATTEMPTS) {
+ die "Received restart request $iter_attempts times from optimizer, giving up\n";
+ }
+ print STDERR "Function evaluation failed, retrying (attempt $iter_attempts)\n";
+ next;
+ }
+ if ($? != 0) {
+ die "Error running iteration $iter: $!";
+ }
+ chomp $result;
+ my $end = time;
+ my $diff = ($end - $start);
+ print STDERR " ITERATION $iter TOOK $diff SECONDS\n";
+ $iter = $next_iter;
+ if ($result =~ /1$/) {
+ print STDERR "Training converged.\n";
+ last;
+ }
+ $iter_attempts = 1;
+}
+
+print "FINAL WEIGHTS: $dir/weights.$iter\n";
+`mv $dir/weights.$iter.gz $dir/weights.final.gz`;
+
+sub usage {
+ die <<EOT;
+
+Usage: $0 [OPTIONS] cdec.ini training.corpus weights.init
+
+ Options:
+
+ --distributed Parallelize function evaluation
+ --jobs N Number of jobs to use
+ --cdec PATH Path to cdec binary
+ --optimize OPT lbfgs, rprop, sgd
+ --gaussian_prior add Gaussian prior
+ --means FILE if you want means other than 0
+ --sigma_squared S variance on prior
+ --pmem MEM Memory required for decoder
+ --lbfgs_memory_buffers Number of buffers to use
+ with LBFGS optimizer
+
+EOT
+}
+
+sub num_lines {
+ my $file = shift;
+ my $fh;
+ if ($file=~ /\.gz$/) {
+ open $fh, "zcat $file|" or die "Couldn't fork zcat $file: $!";
+ } else {
+ open $fh, "<$file" or die "Couldn't read $file: $!";
+ }
+ my $lines = 0;
+ while(<$fh>) { $lines++; }
+ close $fh;
+ return $lines;
+}