2 files changed, 45 insertions, 33 deletions
diff --git a/training/Makefile.am b/training/Makefile.am
index 6427fcba..490de774 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -1,10 +1,12 @@
 bin_PROGRAMS = \
   model1 \
+  mr_em_map_adapter \
+  mr_em_adapted_reduce \
+  mr_reduce_to_weights \
   mr_optimize_reduce \
   grammar_convert \
   atools \
   plftools \
-  mr_em_train \
   collapse_weights
 
 noinst_PROGRAMS = \
@@ -32,8 +34,14 @@ lbfgs_test_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
 mr_optimize_reduce_SOURCES = mr_optimize_reduce.cc optimize.cc
 mr_optimize_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
 
-mr_em_train_SOURCES = mr_em_train.cc
-mr_em_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+mr_em_map_adapter_SOURCES = mr_em_map_adapter.cc
+mr_em_map_adapter_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+
+mr_reduce_to_weights_SOURCES = mr_reduce_to_weights.cc
+mr_reduce_to_weights_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+
+mr_em_adapted_reduce_SOURCES = mr_em_adapted_reduce.cc
+mr_em_adapted_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
 
 plftools_SOURCES = plftools.cc
 plftools_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
diff --git a/training/cluster-em.pl b/training/cluster-em.pl
index 175870da..267ab642 100755
--- a/training/cluster-em.pl
+++ b/training/cluster-em.pl
@@ -3,44 +3,46 @@
 use strict;
 my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
 use Getopt::Long;
-my $parallel = 1;
+my $parallel = 0;
 
 my $CWD=`pwd`; chomp $CWD;
-my $BIN_DIR = "/chomes/redpony/cdyer-svn-repo/cdec/src";
-my $OPTIMIZER = "$BIN_DIR/mr_em_train";
-my $DECODER = "$BIN_DIR/cdec";
-my $COMBINER_CACHE_SIZE = 150;
+my $BIN_DIR = "$SCRIPT_DIR/..";  # locate binaries relative to this script, not the caller's cwd
+my $REDUCER = "$BIN_DIR/training/mr_em_adapted_reduce";
+my $REDUCE2WEIGHTS = "$BIN_DIR/training/mr_reduce_to_weights";
+my $ADAPTER = "$BIN_DIR/training/mr_em_map_adapter";
+my $DECODER = "$BIN_DIR/decoder/cdec";
+my $COMBINER_CACHE_SIZE = 10000000;
 my $PARALLEL = "/chomes/redpony/svn-trunk/sa-utils/parallelize.pl";
-die "Can't find $OPTIMIZER" unless -f $OPTIMIZER;
-die "Can't execute $OPTIMIZER" unless -x $OPTIMIZER;
+die "Can't find $REDUCER" unless -f $REDUCER;
+die "Can't execute $REDUCER" unless -x $REDUCER;
+die "Can't find $REDUCE2WEIGHTS" unless -f $REDUCE2WEIGHTS;
+die "Can't execute $REDUCE2WEIGHTS" unless -x $REDUCE2WEIGHTS;
+die "Can't find $ADAPTER" unless -f $ADAPTER;
+die "Can't execute $ADAPTER" unless -x $ADAPTER;
 die "Can't find $DECODER" unless -f $DECODER;
 die "Can't execute $DECODER" unless -x $DECODER;
-die "Can't find $PARALLEL" unless -f $PARALLEL;
-die "Can't execute $PARALLEL" unless -x $PARALLEL;
 my $restart = '';
 if ($ARGV[0] && $ARGV[0] eq '--restart') { shift @ARGV; $restart = 1; }
 
-die "Usage: $0 [--restart] training.corpus weights.init grammar.file [grammar2.file] ...\n" unless (scalar @ARGV >= 3);
+die "Usage: $0 [--restart] training.corpus cdec.ini\n" unless (scalar @ARGV == 2);
 
 my $training_corpus = shift @ARGV;
-my $initial_weights = shift @ARGV;
-my @in_grammar_files = @ARGV;
+my $config = shift @ARGV;
 my $pmem="2500mb";
 my $nodes = 40;
 my $max_iteration = 1000;
 my $CFLAG = "-C 1";
-unless ($parallel) { $CFLAG = "-C 500"; }
-my @grammar_files;
-for my $g (@in_grammar_files) {
-  unless ($g =~ /^\//) { $g = $CWD . '/' . $g; }
-  die "Can't find $g" unless -f $g;
-  push @grammar_files, $g;
-}
+if ($parallel) {
+  die "Can't find $PARALLEL" unless -f $PARALLEL;
+  die "Can't execute $PARALLEL" unless -x $PARALLEL;
+} else { $CFLAG = "-C 500"; }
+
+my $initial_weights = '';
 
 print STDERR <<EOT;
 EM TRAIN CONFIGURATION INFORMATION
 
-  Grammar file(s): @grammar_files
+      Config file: $config
   Training corpus: $training_corpus
   Initial weights: $initial_weights
    Decoder memory: $pmem
@@ -68,11 +70,13 @@ if ($restart) {
   die "$dir already exists!\n" if -e $dir;
   mkdir $dir or die "Can't create $dir: $!";
 
-  unless ($initial_weights =~ /\.gz$/) {
-    `cp $initial_weights $dir/weights.1`;
-    `gzip -9 $dir/weights.1`;
-  } else {
-    `cp $initial_weights $dir/weights.1.gz`;
+  if ($initial_weights) {
+    unless ($initial_weights =~ /\.gz$/) {
+      `cp $initial_weights $dir/weights.1`;
+      `gzip -9 $dir/weights.1`;
+    } else {
+      `cp $initial_weights $dir/weights.1.gz`;
+    }
   }
 }
 
@@ -82,14 +86,14 @@ while ($iter < $max_iteration) {
   print STDERR "  time: $cur_time\n";
   my $start = time;
   my $next_iter = $iter + 1;
-  my $gfile = '-g' . (join ' -g ', @grammar_files);
-  my $dec_cmd="$DECODER --feature_expectations -S 999 $CFLAG $gfile -n -w $dir/weights.$iter.gz < $training_corpus 2> $dir/deco.log.$iter";
-  my $opt_cmd = "$OPTIMIZER $gfile -o $dir/weights.$next_iter.gz";
+  my $WSTR = "-w $dir/weights.$iter.gz";
+  if ($iter == 1) { $WSTR = ''; }
+  my $dec_cmd="$DECODER --feature_expectations -c $config $WSTR $CFLAG < $training_corpus 2> $dir/deco.log.$iter";
   my $pcmd = "$PARALLEL -e $dir/err -p $pmem --nodelist \"$nodelist\" -- ";
   my $cmd = "";
   if ($parallel) { $cmd = $pcmd; }
-  $cmd .= "$dec_cmd | $opt_cmd";
-
+  $cmd .= "$dec_cmd";
+  $cmd .= " | $ADAPTER | LC_ALL=C sort -k1 | $REDUCER | $REDUCE2WEIGHTS -o $dir/weights.$next_iter.gz";  # byte-wise sort so ordering does not depend on the caller's locale; presumably the reducer needs equal keys adjacent -- confirm
   print STDERR "EXECUTING: $cmd\n";
   my $result = `$cmd`;
   if ($? != 0) {