summaryrefslogtreecommitdiff
path: root/training/cluster-em.pl
diff options
context:
space:
mode:
Diffstat (limited to 'training/cluster-em.pl')
-rwxr-xr-xtraining/cluster-em.pl114
1 files changed, 0 insertions, 114 deletions
diff --git a/training/cluster-em.pl b/training/cluster-em.pl
deleted file mode 100755
index 267ab642..00000000
--- a/training/cluster-em.pl
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
-use Getopt::Long;
-my $parallel = 0;
-
-my $CWD=`pwd`; chomp $CWD;
-my $BIN_DIR = "$CWD/..";
-my $REDUCER = "$BIN_DIR/training/mr_em_adapted_reduce";
-my $REDUCE2WEIGHTS = "$BIN_DIR/training/mr_reduce_to_weights";
-my $ADAPTER = "$BIN_DIR/training/mr_em_map_adapter";
-my $DECODER = "$BIN_DIR/decoder/cdec";
-my $COMBINER_CACHE_SIZE = 10000000;
-my $PARALLEL = "/chomes/redpony/svn-trunk/sa-utils/parallelize.pl";
-die "Can't find $REDUCER" unless -f $REDUCER;
-die "Can't execute $REDUCER" unless -x $REDUCER;
-die "Can't find $REDUCE2WEIGHTS" unless -f $REDUCE2WEIGHTS;
-die "Can't execute $REDUCE2WEIGHTS" unless -x $REDUCE2WEIGHTS;
-die "Can't find $ADAPTER" unless -f $ADAPTER;
-die "Can't execute $ADAPTER" unless -x $ADAPTER;
-die "Can't find $DECODER" unless -f $DECODER;
-die "Can't execute $DECODER" unless -x $DECODER;
-my $restart = '';
-if ($ARGV[0] && $ARGV[0] eq '--restart') { shift @ARGV; $restart = 1; }
-
-die "Usage: $0 [--restart] training.corpus cdec.ini\n" unless (scalar @ARGV == 2);
-
-my $training_corpus = shift @ARGV;
-my $config = shift @ARGV;
-my $pmem="2500mb";
-my $nodes = 40;
-my $max_iteration = 1000;
-my $CFLAG = "-C 1";
-if ($parallel) {
- die "Can't find $PARALLEL" unless -f $PARALLEL;
- die "Can't execute $PARALLEL" unless -x $PARALLEL;
-} else { $CFLAG = "-C 500"; }
-
-my $initial_weights = '';
-
-print STDERR <<EOT;
-EM TRAIN CONFIGURATION INFORMATION
-
- Config file: $config
- Training corpus: $training_corpus
- Initial weights: $initial_weights
- Decoder memory: $pmem
- Nodes requested: $nodes
- Max iterations: $max_iteration
- restart: $restart
-EOT
-
-my $nodelist="1";
-for (my $i=1; $i<$nodes; $i++) { $nodelist .= " 1"; }
-my $iter = 1;
-
-my $dir = "$CWD/emtrain";
-if ($restart) {
- die "$dir doesn't exist, but --restart specified!\n" unless -d $dir;
- my $o = `ls -t $dir/weights.*`;
- my ($a, @x) = split /\n/, $o;
- if ($a =~ /weights.(\d+)\.gz$/) {
- $iter = $1;
- } else {
- die "Unexpected file: $a!\n";
- }
- print STDERR "Restarting at iteration $iter\n";
-} else {
- die "$dir already exists!\n" if -e $dir;
- mkdir $dir or die "Can't create $dir: $!";
-
- if ($initial_weights) {
- unless ($initial_weights =~ /\.gz$/) {
- `cp $initial_weights $dir/weights.1`;
- `gzip -9 $dir/weights.1`;
- } else {
- `cp $initial_weights $dir/weights.1.gz`;
- }
- }
-}
-
-while ($iter < $max_iteration) {
- my $cur_time = `date`; chomp $cur_time;
- print STDERR "\nStarting iteration $iter...\n";
- print STDERR " time: $cur_time\n";
- my $start = time;
- my $next_iter = $iter + 1;
- my $WSTR = "-w $dir/weights.$iter.gz";
- if ($iter == 1) { $WSTR = ''; }
- my $dec_cmd="$DECODER --feature_expectations -c $config $WSTR $CFLAG < $training_corpus 2> $dir/deco.log.$iter";
- my $pcmd = "$PARALLEL -e $dir/err -p $pmem --nodelist \"$nodelist\" -- ";
- my $cmd = "";
- if ($parallel) { $cmd = $pcmd; }
- $cmd .= "$dec_cmd";
- $cmd .= "| $ADAPTER | sort -k1 | $REDUCER | $REDUCE2WEIGHTS -o $dir/weights.$next_iter.gz";
- print STDERR "EXECUTING: $cmd\n";
- my $result = `$cmd`;
- if ($? != 0) {
- die "Error running iteration $iter: $!";
- }
- chomp $result;
- my $end = time;
- my $diff = ($end - $start);
- print STDERR " ITERATION $iter TOOK $diff SECONDS\n";
- $iter = $next_iter;
- if ($result =~ /1$/) {
- print STDERR "Training converged.\n";
- last;
- }
-}
-
-print "FINAL WEIGHTS: $dir/weights.$iter\n";
-