diff options
author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 05:12:27 +0000 |
---|---|---|
committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 05:12:27 +0000 |
commit | 0172721855098ca02b207231a654dffa5e4eb1c9 (patch) | |
tree | 8069c3a62e2d72bd64a2cdeee9724b2679c8a56b /training/cluster-ptrain.pl | |
parent | 37728b8be4d0b3df9da81fdda2198ff55b4b2d91 (diff) |
initial checkin
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@2 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'training/cluster-ptrain.pl')
-rwxr-xr-x | training/cluster-ptrain.pl | 206 |
1 files changed, 206 insertions, 0 deletions
diff --git a/training/cluster-ptrain.pl b/training/cluster-ptrain.pl new file mode 100755 index 00000000..03122df9 --- /dev/null +++ b/training/cluster-ptrain.pl @@ -0,0 +1,206 @@ +#!/usr/bin/perl -w + +use strict; +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path getcwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } +use Getopt::Long; + +my $MAX_ITER_ATTEMPTS = 5; # number of times to retry a failed function evaluation +my $CWD=getcwd(); +my $OPTIMIZER = "$SCRIPT_DIR/mr_optimize_reduce"; +my $DECODER = "$SCRIPT_DIR/../decoder/cdec"; +my $COMBINER_CACHE_SIZE = 150; +# This is a hack to run this on a weird cluster, +# eventually, I'll provide Hadoop scripts. +my $PARALLEL = "/chomes/redpony/svn-trunk/sa-utils/parallelize.pl"; +die "Can't find $OPTIMIZER" unless -f $OPTIMIZER; +die "Can't execute $OPTIMIZER" unless -x $OPTIMIZER; +my $restart = ''; +if ($ARGV[0] && $ARGV[0] eq '--restart') { shift @ARGV; $restart = 1; } + +my $pmem="2500mb"; +my $nodes = 1; +my $max_iteration = 1000; +my $PRIOR_FLAG = ""; +my $parallel = 1; +my $CFLAG = "-C 1"; +my $LOCAL; +my $DISTRIBUTED; +my $PRIOR; +my $OALG = "lbfgs"; +my $sigsq = 1; +my $means_file; +my $mem_buffers = 20; +my $RESTART_IF_NECESSARY; +GetOptions("cdec=s" => \$DECODER, + "distributed" => \$DISTRIBUTED, + "sigma_squared=f" => \$sigsq, + "lbfgs_memory_buffers=i" => \$mem_buffers, + "max_iteration=i" => \$max_iteration, + "means=s" => \$means_file, + "optimizer=s" => \$OALG, + "gaussian_prior" => \$PRIOR, + "restart_if_necessary" => \$RESTART_IF_NECESSARY, + "jobs=i" => \$nodes, + "pmem=s" => \$pmem + ) or usage(); +usage() unless scalar @ARGV==3; +my $config_file = shift @ARGV; +my $training_corpus = shift @ARGV; +my $initial_weights = shift @ARGV; +unless ($DISTRIBUTED) { $LOCAL = 1; } +die "Can't find $config_file" unless -f $config_file; +die "Can't find $DECODER" unless -f $DECODER; +die "Can't execute $DECODER" unless -x $DECODER; +if ($LOCAL) { print STDERR "Will run LOCALLY.\n"; $parallel = 0; } +if ($PRIOR) { + $PRIOR_FLAG="-p --sigma_squared $sigsq"; + if ($means_file) { $PRIOR_FLAG .= " -u $means_file"; } +} + +if ($parallel) { + die "Can't find $PARALLEL" unless -f $PARALLEL; + die "Can't execute $PARALLEL" unless -x $PARALLEL; +} +unless ($parallel) { $CFLAG = "-C 500"; } +unless ($config_file =~ /^\//) { $config_file = $CWD . '/' . $config_file; } +my $clines = num_lines($training_corpus); +my $dir = "$CWD/ptrain"; + +if ($RESTART_IF_NECESSARY && -d $dir) { + $restart = 1; +} + +print STDERR <<EOT; +PTRAIN CONFIGURATION INFORMATION + + Config file: $config_file + Training corpus: $training_corpus + Corpus size: $clines + Initial weights: $initial_weights + Decoder memory: $pmem + Max iterations: $max_iteration + Optimizer: $OALG + Jobs requested: $nodes + prior?: $PRIOR_FLAG + restart?: $restart +EOT + +if ($OALG) { $OALG="-m $OALG"; } + +my $nodelist="1"; +for (my $i=1; $i<$nodes; $i++) { $nodelist .= " 1"; } +my $iter = 1; + +if ($restart) { + die "$dir doesn't exist, but --restart specified!\n" unless -d $dir; + my $o = `ls -t $dir/weights.*`; + my ($a, @x) = split /\n/, $o; + if ($a =~ /weights.(\d+)\.gz$/) { + $iter = $1; + } else { + die "Unexpected file: $a!\n"; + } + print STDERR "Restarting at iteration $iter\n"; +} else { + die "$dir already exists!\n" if -e $dir; + mkdir $dir or die "Can't create $dir: $!"; + + unless ($initial_weights =~ /\.gz$/) { + `cp $initial_weights $dir/weights.1`; + `gzip -9 $dir/weights.1`; + } else { + `cp $initial_weights $dir/weights.1.gz`; + } + open T, "<$training_corpus" or die "Can't read $training_corpus: $!"; + open TO, ">$dir/training.in"; + my $lc = 0; + while(<T>) { + chomp; + s/^\s+//; + s/\s+$//; + die "Expected A ||| B in input file" unless / \|\|\| /; + print TO "<seg id=\"$lc\">$_</seg>\n"; + $lc++; + } + close T; + close TO; +} +$training_corpus = "$dir/training.in"; + +my $iter_attempts = 1; +while ($iter < $max_iteration) { + my $cur_time = `date`; chomp $cur_time; + print STDERR "\nStarting iteration $iter...\n"; + print STDERR " time: $cur_time\n"; + my $start = time; + my $next_iter = $iter + 1; + my $dec_cmd="$DECODER -G $CFLAG -c $config_file -w $dir/weights.$iter.gz < $training_corpus 2> $dir/deco.log.$iter"; + my $opt_cmd = "$OPTIMIZER $PRIOR_FLAG -M $mem_buffers $OALG -s $dir/opt.state -i $dir/weights.$iter.gz -o $dir/weights.$next_iter.gz"; + my $pcmd = "$PARALLEL -e $dir/err -p $pmem --nodelist \"$nodelist\" -- "; + my $cmd = ""; + if ($parallel) { $cmd = $pcmd; } + $cmd .= "$dec_cmd | $opt_cmd"; + + print STDERR "EXECUTING: $cmd\n"; + my $result = `$cmd`; + my $exit_code = $? >> 8; + if ($exit_code == 99) { + $iter_attempts++; + if ($iter_attempts > $MAX_ITER_ATTEMPTS) { + die "Received restart request $iter_attempts times from optimizer, giving up\n"; + } + print STDERR "Function evaluation failed, retrying (attempt $iter_attempts)\n"; + next; + } + if ($? != 0) { + die "Error running iteration $iter: $!"; + } + chomp $result; + my $end = time; + my $diff = ($end - $start); + print STDERR " ITERATION $iter TOOK $diff SECONDS\n"; + $iter = $next_iter; + if ($result =~ /1$/) { + print STDERR "Training converged.\n"; + last; + } + $iter_attempts = 1; +} + +print "FINAL WEIGHTS: $dir/weights.$iter\n"; +`mv $dir/weights.$iter.gz $dir/weights.final.gz`; + +sub usage { + die <<EOT; + +Usage: $0 [OPTIONS] cdec.ini training.corpus weights.init + + Options: + + --distributed Parallelize function evaluation + --jobs N Number of jobs to use + --cdec PATH Path to cdec binary + --optimize OPT lbfgs, rprop, sgd + --gaussian_prior add Gaussian prior + --means FILE if you want means other than 0 + --sigma_squared S variance on prior + --pmem MEM Memory required for decoder + --lbfgs_memory_buffers Number of buffers to use + with LBFGS optimizer + +EOT +} + +sub num_lines { + my $file = shift; + my $fh; + if ($file=~ /\.gz$/) { + open $fh, "zcat $file|" or die "Couldn't fork zcat $file: $!"; + } else { + open $fh, "<$file" or die "Couldn't read $file: $!"; + } + my $lines = 0; + while(<$fh>) { $lines++; } + close $fh; + return $lines; +} |