summaryrefslogtreecommitdiff
path: root/training/cluster-ptrain.pl
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2011-10-20 02:31:25 +0200
committerPatrick Simianer <p@simianer.de>2011-10-20 02:31:25 +0200
commit92e48b652530d2d2bb4f2694501f95a60d727cb2 (patch)
treeb484bd0c4216525690de8b14fb654c9581a300c2 /training/cluster-ptrain.pl
parent0e70073cec6cdcafaf60d4fbcbd1adf82ae21c8e (diff)
parent082b6c77e0703ccd1c85947828c33d4b0eef20f0 (diff)
finalized merge
Diffstat (limited to 'training/cluster-ptrain.pl')
-rwxr-xr-xtraining/cluster-ptrain.pl206
1 files changed, 0 insertions, 206 deletions
diff --git a/training/cluster-ptrain.pl b/training/cluster-ptrain.pl
deleted file mode 100755
index 03122df9..00000000
--- a/training/cluster-ptrain.pl
+++ /dev/null
@@ -1,206 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path getcwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
-use Getopt::Long;
-
-my $MAX_ITER_ATTEMPTS = 5; # number of times to retry a failed function evaluation
-my $CWD=getcwd();
-my $OPTIMIZER = "$SCRIPT_DIR/mr_optimize_reduce";
-my $DECODER = "$SCRIPT_DIR/../decoder/cdec";
-my $COMBINER_CACHE_SIZE = 150;
-# This is a hack to run this on a weird cluster,
-# eventually, I'll provide Hadoop scripts.
-my $PARALLEL = "/chomes/redpony/svn-trunk/sa-utils/parallelize.pl";
-die "Can't find $OPTIMIZER" unless -f $OPTIMIZER;
-die "Can't execute $OPTIMIZER" unless -x $OPTIMIZER;
-my $restart = '';
-if ($ARGV[0] && $ARGV[0] eq '--restart') { shift @ARGV; $restart = 1; }
-
-my $pmem="2500mb";
-my $nodes = 1;
-my $max_iteration = 1000;
-my $PRIOR_FLAG = "";
-my $parallel = 1;
-my $CFLAG = "-C 1";
-my $LOCAL;
-my $DISTRIBUTED;
-my $PRIOR;
-my $OALG = "lbfgs";
-my $sigsq = 1;
-my $means_file;
-my $mem_buffers = 20;
-my $RESTART_IF_NECESSARY;
-GetOptions("cdec=s" => \$DECODER,
- "distributed" => \$DISTRIBUTED,
- "sigma_squared=f" => \$sigsq,
- "lbfgs_memory_buffers=i" => \$mem_buffers,
- "max_iteration=i" => \$max_iteration,
- "means=s" => \$means_file,
- "optimizer=s" => \$OALG,
- "gaussian_prior" => \$PRIOR,
- "restart_if_necessary" => \$RESTART_IF_NECESSARY,
- "jobs=i" => \$nodes,
- "pmem=s" => \$pmem
- ) or usage();
-usage() unless scalar @ARGV==3;
-my $config_file = shift @ARGV;
-my $training_corpus = shift @ARGV;
-my $initial_weights = shift @ARGV;
-unless ($DISTRIBUTED) { $LOCAL = 1; }
-die "Can't find $config_file" unless -f $config_file;
-die "Can't find $DECODER" unless -f $DECODER;
-die "Can't execute $DECODER" unless -x $DECODER;
-if ($LOCAL) { print STDERR "Will run LOCALLY.\n"; $parallel = 0; }
-if ($PRIOR) {
- $PRIOR_FLAG="-p --sigma_squared $sigsq";
- if ($means_file) { $PRIOR_FLAG .= " -u $means_file"; }
-}
-
-if ($parallel) {
- die "Can't find $PARALLEL" unless -f $PARALLEL;
- die "Can't execute $PARALLEL" unless -x $PARALLEL;
-}
-unless ($parallel) { $CFLAG = "-C 500"; }
-unless ($config_file =~ /^\//) { $config_file = $CWD . '/' . $config_file; }
-my $clines = num_lines($training_corpus);
-my $dir = "$CWD/ptrain";
-
-if ($RESTART_IF_NECESSARY && -d $dir) {
- $restart = 1;
-}
-
-print STDERR <<EOT;
-PTRAIN CONFIGURATION INFORMATION
-
- Config file: $config_file
- Training corpus: $training_corpus
- Corpus size: $clines
- Initial weights: $initial_weights
- Decoder memory: $pmem
- Max iterations: $max_iteration
- Optimizer: $OALG
- Jobs requested: $nodes
- prior?: $PRIOR_FLAG
- restart?: $restart
-EOT
-
-if ($OALG) { $OALG="-m $OALG"; }
-
-my $nodelist="1";
-for (my $i=1; $i<$nodes; $i++) { $nodelist .= " 1"; }
-my $iter = 1;
-
-if ($restart) {
- die "$dir doesn't exist, but --restart specified!\n" unless -d $dir;
- my $o = `ls -t $dir/weights.*`;
- my ($a, @x) = split /\n/, $o;
- if ($a =~ /weights.(\d+)\.gz$/) {
- $iter = $1;
- } else {
- die "Unexpected file: $a!\n";
- }
- print STDERR "Restarting at iteration $iter\n";
-} else {
- die "$dir already exists!\n" if -e $dir;
- mkdir $dir or die "Can't create $dir: $!";
-
- unless ($initial_weights =~ /\.gz$/) {
- `cp $initial_weights $dir/weights.1`;
- `gzip -9 $dir/weights.1`;
- } else {
- `cp $initial_weights $dir/weights.1.gz`;
- }
- open T, "<$training_corpus" or die "Can't read $training_corpus: $!";
- open TO, ">$dir/training.in";
- my $lc = 0;
- while(<T>) {
- chomp;
- s/^\s+//;
- s/\s+$//;
- die "Expected A ||| B in input file" unless / \|\|\| /;
- print TO "<seg id=\"$lc\">$_</seg>\n";
- $lc++;
- }
- close T;
- close TO;
-}
-$training_corpus = "$dir/training.in";
-
-my $iter_attempts = 1;
-while ($iter < $max_iteration) {
- my $cur_time = `date`; chomp $cur_time;
- print STDERR "\nStarting iteration $iter...\n";
- print STDERR " time: $cur_time\n";
- my $start = time;
- my $next_iter = $iter + 1;
- my $dec_cmd="$DECODER -G $CFLAG -c $config_file -w $dir/weights.$iter.gz < $training_corpus 2> $dir/deco.log.$iter";
- my $opt_cmd = "$OPTIMIZER $PRIOR_FLAG -M $mem_buffers $OALG -s $dir/opt.state -i $dir/weights.$iter.gz -o $dir/weights.$next_iter.gz";
- my $pcmd = "$PARALLEL -e $dir/err -p $pmem --nodelist \"$nodelist\" -- ";
- my $cmd = "";
- if ($parallel) { $cmd = $pcmd; }
- $cmd .= "$dec_cmd | $opt_cmd";
-
- print STDERR "EXECUTING: $cmd\n";
- my $result = `$cmd`;
- my $exit_code = $? >> 8;
- if ($exit_code == 99) {
- $iter_attempts++;
- if ($iter_attempts > $MAX_ITER_ATTEMPTS) {
- die "Received restart request $iter_attempts times from optimizer, giving up\n";
- }
- print STDERR "Function evaluation failed, retrying (attempt $iter_attempts)\n";
- next;
- }
- if ($? != 0) {
- die "Error running iteration $iter: $!";
- }
- chomp $result;
- my $end = time;
- my $diff = ($end - $start);
- print STDERR " ITERATION $iter TOOK $diff SECONDS\n";
- $iter = $next_iter;
- if ($result =~ /1$/) {
- print STDERR "Training converged.\n";
- last;
- }
- $iter_attempts = 1;
-}
-
-print "FINAL WEIGHTS: $dir/weights.$iter\n";
-`mv $dir/weights.$iter.gz $dir/weights.final.gz`;
-
-sub usage {
- die <<EOT;
-
-Usage: $0 [OPTIONS] cdec.ini training.corpus weights.init
-
- Options:
-
- --distributed Parallelize function evaluation
- --jobs N Number of jobs to use
- --cdec PATH Path to cdec binary
- --optimize OPT lbfgs, rprop, sgd
- --gaussian_prior add Gaussian prior
- --means FILE if you want means other than 0
- --sigma_squared S variance on prior
- --pmem MEM Memory required for decoder
- --lbfgs_memory_buffers Number of buffers to use
- with LBFGS optimizer
-
-EOT
-}
-
-sub num_lines {
- my $file = shift;
- my $fh;
- if ($file=~ /\.gz$/) {
- open $fh, "zcat $file|" or die "Couldn't fork zcat $file: $!";
- } else {
- open $fh, "<$file" or die "Couldn't read $file: $!";
- }
- my $lines = 0;
- while(<$fh>) { $lines++; }
- close $fh;
- return $lines;
-}