summary refs log tree commit diff
path: root/training
diff options
context:
space:
mode:
Diffstat (limited to 'training')
-rw-r--r-- training/Makefile.am 14
-rwxr-xr-x training/cluster-em.pl 64
2 files changed, 45 insertions, 33 deletions
diff --git a/training/Makefile.am b/training/Makefile.am
index 6427fcba..490de774 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -1,10 +1,12 @@
bin_PROGRAMS = \
model1 \
+ mr_em_map_adapter \
+ mr_em_adapted_reduce \
+ mr_reduce_to_weights \
mr_optimize_reduce \
grammar_convert \
atools \
plftools \
- mr_em_train \
collapse_weights
noinst_PROGRAMS = \
@@ -32,8 +34,14 @@ lbfgs_test_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
mr_optimize_reduce_SOURCES = mr_optimize_reduce.cc optimize.cc
mr_optimize_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
-mr_em_train_SOURCES = mr_em_train.cc
-mr_em_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+mr_em_map_adapter_SOURCES = mr_em_map_adapter.cc
+mr_em_map_adapter_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+
+mr_reduce_to_weights_SOURCES = mr_reduce_to_weights.cc
+mr_reduce_to_weights_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+
+mr_em_adapted_reduce_SOURCES = mr_em_adapted_reduce.cc
+mr_em_adapted_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
plftools_SOURCES = plftools.cc
plftools_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
diff --git a/training/cluster-em.pl b/training/cluster-em.pl
index 175870da..267ab642 100755
--- a/training/cluster-em.pl
+++ b/training/cluster-em.pl
@@ -3,44 +3,46 @@
use strict;
my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
use Getopt::Long;
-my $parallel = 1;
+my $parallel = 0;
my $CWD=`pwd`; chomp $CWD;
-my $BIN_DIR = "/chomes/redpony/cdyer-svn-repo/cdec/src";
-my $OPTIMIZER = "$BIN_DIR/mr_em_train";
-my $DECODER = "$BIN_DIR/cdec";
-my $COMBINER_CACHE_SIZE = 150;
+my $BIN_DIR = "$CWD/..";
+my $REDUCER = "$BIN_DIR/training/mr_em_adapted_reduce";
+my $REDUCE2WEIGHTS = "$BIN_DIR/training/mr_reduce_to_weights";
+my $ADAPTER = "$BIN_DIR/training/mr_em_map_adapter";
+my $DECODER = "$BIN_DIR/decoder/cdec";
+my $COMBINER_CACHE_SIZE = 10000000;
my $PARALLEL = "/chomes/redpony/svn-trunk/sa-utils/parallelize.pl";
-die "Can't find $OPTIMIZER" unless -f $OPTIMIZER;
-die "Can't execute $OPTIMIZER" unless -x $OPTIMIZER;
+die "Can't find $REDUCER" unless -f $REDUCER;
+die "Can't execute $REDUCER" unless -x $REDUCER;
+die "Can't find $REDUCE2WEIGHTS" unless -f $REDUCE2WEIGHTS;
+die "Can't execute $REDUCE2WEIGHTS" unless -x $REDUCE2WEIGHTS;
+die "Can't find $ADAPTER" unless -f $ADAPTER;
+die "Can't execute $ADAPTER" unless -x $ADAPTER;
die "Can't find $DECODER" unless -f $DECODER;
die "Can't execute $DECODER" unless -x $DECODER;
-die "Can't find $PARALLEL" unless -f $PARALLEL;
-die "Can't execute $PARALLEL" unless -x $PARALLEL;
my $restart = '';
if ($ARGV[0] && $ARGV[0] eq '--restart') { shift @ARGV; $restart = 1; }
-die "Usage: $0 [--restart] training.corpus weights.init grammar.file [grammar2.file] ...\n" unless (scalar @ARGV >= 3);
+die "Usage: $0 [--restart] training.corpus cdec.ini\n" unless (scalar @ARGV == 2);
my $training_corpus = shift @ARGV;
-my $initial_weights = shift @ARGV;
-my @in_grammar_files = @ARGV;
+my $config = shift @ARGV;
my $pmem="2500mb";
my $nodes = 40;
my $max_iteration = 1000;
my $CFLAG = "-C 1";
-unless ($parallel) { $CFLAG = "-C 500"; }
-my @grammar_files;
-for my $g (@in_grammar_files) {
- unless ($g =~ /^\//) { $g = $CWD . '/' . $g; }
- die "Can't find $g" unless -f $g;
- push @grammar_files, $g;
-}
+if ($parallel) {
+ die "Can't find $PARALLEL" unless -f $PARALLEL;
+ die "Can't execute $PARALLEL" unless -x $PARALLEL;
+} else { $CFLAG = "-C 500"; }
+
+my $initial_weights = '';
print STDERR <<EOT;
EM TRAIN CONFIGURATION INFORMATION
- Grammar file(s): @grammar_files
+ Config file: $config
Training corpus: $training_corpus
Initial weights: $initial_weights
Decoder memory: $pmem
@@ -68,11 +70,13 @@ if ($restart) {
die "$dir already exists!\n" if -e $dir;
mkdir $dir or die "Can't create $dir: $!";
- unless ($initial_weights =~ /\.gz$/) {
- `cp $initial_weights $dir/weights.1`;
- `gzip -9 $dir/weights.1`;
- } else {
- `cp $initial_weights $dir/weights.1.gz`;
+ if ($initial_weights) {
+ unless ($initial_weights =~ /\.gz$/) {
+ `cp $initial_weights $dir/weights.1`;
+ `gzip -9 $dir/weights.1`;
+ } else {
+ `cp $initial_weights $dir/weights.1.gz`;
+ }
}
}
@@ -82,14 +86,14 @@ while ($iter < $max_iteration) {
print STDERR " time: $cur_time\n";
my $start = time;
my $next_iter = $iter + 1;
- my $gfile = '-g' . (join ' -g ', @grammar_files);
- my $dec_cmd="$DECODER --feature_expectations -S 999 $CFLAG $gfile -n -w $dir/weights.$iter.gz < $training_corpus 2> $dir/deco.log.$iter";
- my $opt_cmd = "$OPTIMIZER $gfile -o $dir/weights.$next_iter.gz";
+ my $WSTR = "-w $dir/weights.$iter.gz";
+ if ($iter == 1) { $WSTR = ''; }
+ my $dec_cmd="$DECODER --feature_expectations -c $config $WSTR $CFLAG < $training_corpus 2> $dir/deco.log.$iter";
my $pcmd = "$PARALLEL -e $dir/err -p $pmem --nodelist \"$nodelist\" -- ";
my $cmd = "";
if ($parallel) { $cmd = $pcmd; }
- $cmd .= "$dec_cmd | $opt_cmd";
-
+ $cmd .= "$dec_cmd";
+ $cmd .= "| $ADAPTER | sort -k1 | $REDUCER | $REDUCE2WEIGHTS -o $dir/weights.$next_iter.gz";
print STDERR "EXECUTING: $cmd\n";
my $result = `$cmd`;
if ($? != 0) {