diff options
Diffstat (limited to 'training')
-rw-r--r-- | training/Makefile.am | 14 | ||||
-rwxr-xr-x | training/cluster-em.pl | 64 |
2 files changed, 45 insertions, 33 deletions
diff --git a/training/Makefile.am b/training/Makefile.am index 6427fcba..490de774 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -1,10 +1,12 @@ bin_PROGRAMS = \ model1 \ + mr_em_map_adapter \ + mr_em_adapted_reduce \ + mr_reduce_to_weights \ mr_optimize_reduce \ grammar_convert \ atools \ plftools \ - mr_em_train \ collapse_weights noinst_PROGRAMS = \ @@ -32,8 +34,14 @@ lbfgs_test_LDADD = $(top_srcdir)/decoder/libcdec.a -lz mr_optimize_reduce_SOURCES = mr_optimize_reduce.cc optimize.cc mr_optimize_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz -mr_em_train_SOURCES = mr_em_train.cc -mr_em_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +mr_em_map_adapter_SOURCES = mr_em_map_adapter.cc +mr_em_map_adapter_LDADD = $(top_srcdir)/decoder/libcdec.a -lz + +mr_reduce_to_weights_SOURCES = mr_reduce_to_weights.cc +mr_reduce_to_weights_LDADD = $(top_srcdir)/decoder/libcdec.a -lz + +mr_em_adapted_reduce_SOURCES = mr_em_adapted_reduce.cc +mr_em_adapted_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz plftools_SOURCES = plftools.cc plftools_LDADD = $(top_srcdir)/decoder/libcdec.a -lz diff --git a/training/cluster-em.pl b/training/cluster-em.pl index 175870da..267ab642 100755 --- a/training/cluster-em.pl +++ b/training/cluster-em.pl @@ -3,44 +3,46 @@ use strict; my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } use Getopt::Long; -my $parallel = 1; +my $parallel = 0; my $CWD=`pwd`; chomp $CWD; -my $BIN_DIR = "/chomes/redpony/cdyer-svn-repo/cdec/src"; -my $OPTIMIZER = "$BIN_DIR/mr_em_train"; -my $DECODER = "$BIN_DIR/cdec"; -my $COMBINER_CACHE_SIZE = 150; +my $BIN_DIR = "$CWD/.."; +my $REDUCER = "$BIN_DIR/training/mr_em_adapted_reduce"; +my $REDUCE2WEIGHTS = "$BIN_DIR/training/mr_reduce_to_weights"; +my $ADAPTER = "$BIN_DIR/training/mr_em_map_adapter"; +my $DECODER = "$BIN_DIR/decoder/cdec"; +my $COMBINER_CACHE_SIZE = 10000000; my $PARALLEL = "/chomes/redpony/svn-trunk/sa-utils/parallelize.pl"; -die "Can't find $OPTIMIZER" unless -f $OPTIMIZER; -die "Can't execute $OPTIMIZER" unless -x $OPTIMIZER; +die "Can't find $REDUCER" unless -f $REDUCER; +die "Can't execute $REDUCER" unless -x $REDUCER; +die "Can't find $REDUCE2WEIGHTS" unless -f $REDUCE2WEIGHTS; +die "Can't execute $REDUCE2WEIGHTS" unless -x $REDUCE2WEIGHTS; +die "Can't find $ADAPTER" unless -f $ADAPTER; +die "Can't execute $ADAPTER" unless -x $ADAPTER; die "Can't find $DECODER" unless -f $DECODER; die "Can't execute $DECODER" unless -x $DECODER; -die "Can't find $PARALLEL" unless -f $PARALLEL; -die "Can't execute $PARALLEL" unless -x $PARALLEL; my $restart = ''; if ($ARGV[0] && $ARGV[0] eq '--restart') { shift @ARGV; $restart = 1; } -die "Usage: $0 [--restart] training.corpus weights.init grammar.file [grammar2.file] ...\n" unless (scalar @ARGV >= 3); +die "Usage: $0 [--restart] training.corpus cdec.ini\n" unless (scalar @ARGV == 2); my $training_corpus = shift @ARGV; -my $initial_weights = shift @ARGV; -my @in_grammar_files = @ARGV; +my $config = shift @ARGV; my $pmem="2500mb"; my $nodes = 40; my $max_iteration = 1000; my $CFLAG = "-C 1"; -unless ($parallel) { $CFLAG = "-C 500"; } -my @grammar_files; -for my $g (@in_grammar_files) { - unless ($g =~ /^\//) { $g = $CWD . '/' . $g; } - die "Can't find $g" unless -f $g; - push @grammar_files, $g; -} +if ($parallel) { + die "Can't find $PARALLEL" unless -f $PARALLEL; + die "Can't execute $PARALLEL" unless -x $PARALLEL; +} else { $CFLAG = "-C 500"; } + +my $initial_weights = ''; print STDERR <<EOT; EM TRAIN CONFIGURATION INFORMATION - Grammar file(s): @grammar_files + Config file: $config Training corpus: $training_corpus Initial weights: $initial_weights Decoder memory: $pmem @@ -68,11 +70,13 @@ if ($restart) { die "$dir already exists!\n" if -e $dir; mkdir $dir or die "Can't create $dir: $!"; - unless ($initial_weights =~ /\.gz$/) { - `cp $initial_weights $dir/weights.1`; - `gzip -9 $dir/weights.1`; - } else { - `cp $initial_weights $dir/weights.1.gz`; + if ($initial_weights) { + unless ($initial_weights =~ /\.gz$/) { + `cp $initial_weights $dir/weights.1`; + `gzip -9 $dir/weights.1`; + } else { + `cp $initial_weights $dir/weights.1.gz`; + } } } @@ -82,14 +86,14 @@ while ($iter < $max_iteration) { print STDERR " time: $cur_time\n"; my $start = time; my $next_iter = $iter + 1; - my $gfile = '-g' . (join ' -g ', @grammar_files); - my $dec_cmd="$DECODER --feature_expectations -S 999 $CFLAG $gfile -n -w $dir/weights.$iter.gz < $training_corpus 2> $dir/deco.log.$iter"; - my $opt_cmd = "$OPTIMIZER $gfile -o $dir/weights.$next_iter.gz"; + my $WSTR = "-w $dir/weights.$iter.gz"; + if ($iter == 1) { $WSTR = ''; } + my $dec_cmd="$DECODER --feature_expectations -c $config $WSTR $CFLAG < $training_corpus 2> $dir/deco.log.$iter"; my $pcmd = "$PARALLEL -e $dir/err -p $pmem --nodelist \"$nodelist\" -- "; my $cmd = ""; if ($parallel) { $cmd = $pcmd; } - $cmd .= "$dec_cmd | $opt_cmd"; - + $cmd .= "$dec_cmd"; + $cmd .= "| $ADAPTER | sort -k1 | $REDUCER | $REDUCE2WEIGHTS -o $dir/weights.$next_iter.gz"; print STDERR "EXECUTING: $cmd\n"; my $result = `$cmd`; if ($? != 0) { |