author    Patrick Simianer <p@simianer.de>    2011-10-19 14:02:34 +0200
committer Patrick Simianer <p@simianer.de>    2011-10-19 14:02:34 +0200
commit    9beaeb42b71fa504bfa41a402cb17eb6ac4001af (patch)
tree      0add4afabc526391753e4e6b9443a7bf21e3e2c3 /training
parent    ce3b4db94d40c111ede321ac6de2bb061a81c4af (diff)
parent    09297047e446f49804d3f48bf320cdbd38d6396a (diff)
merge upstream/master
Diffstat (limited to 'training')
-rw-r--r--  training/Makefile.am                                          |  26
-rw-r--r--  training/augment_grammar.cc                                   |   4
-rw-r--r--  training/cllh_filter_grammar.cc                               | 197
-rwxr-xr-x  training/cluster-em.pl                                        | 114
-rwxr-xr-x  training/cluster-ptrain.pl                                    | 206
-rw-r--r--  training/collapse_weights.cc                                  |   6
-rw-r--r--  training/feature_expectations.cc                              | 232
-rw-r--r--  training/grammar_convert.cc                                   |   8
-rwxr-xr-x  training/make-lexcrf-grammar.pl                               | 285
-rw-r--r--  training/mpi_batch_optimize.cc                                | 164
-rw-r--r--  training/mpi_compute_cllh.cc (renamed from training/compute_cllh.cc) |  66
-rw-r--r--  training/mpi_extract_features.cc                              | 151
-rw-r--r--  training/mpi_extract_reachable.cc                             | 163
-rw-r--r--  training/mpi_flex_optimize.cc                                 | 346
-rw-r--r--  training/mpi_online_optimize.cc                               |  75
-rw-r--r--  training/mr_optimize_reduce.cc                                |  19
16 files changed, 1038 insertions(+), 1024 deletions(-)
diff --git a/training/Makefile.am b/training/Makefile.am
index 0d9085e4..2a11ae52 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -9,11 +9,12 @@ bin_PROGRAMS = \
atools \
plftools \
collapse_weights \
- cllh_filter_grammar \
+ mpi_extract_reachable \
+ mpi_extract_features \
mpi_online_optimize \
+ mpi_flex_optimize \
mpi_batch_optimize \
- mpi_em_optimize \
- compute_cllh \
+ mpi_compute_cllh \
augment_grammar
noinst_PROGRAMS = \
@@ -25,17 +26,20 @@ TESTS = lbfgs_test optimize_test
mpi_online_optimize_SOURCES = mpi_online_optimize.cc online_optimizer.cc
mpi_online_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc optimize.cc
-mpi_batch_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+mpi_flex_optimize_SOURCES = mpi_flex_optimize.cc online_optimizer.cc optimize.cc
+mpi_flex_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+
+mpi_extract_reachable_SOURCES = mpi_extract_reachable.cc
+mpi_extract_reachable_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-mpi_em_optimize_SOURCES = mpi_em_optimize.cc optimize.cc
-mpi_em_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+mpi_extract_features_SOURCES = mpi_extract_features.cc
+mpi_extract_features_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-compute_cllh_SOURCES = compute_cllh.cc
-compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc optimize.cc
+mpi_batch_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-cllh_filter_grammar_SOURCES = cllh_filter_grammar.cc
-cllh_filter_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+mpi_compute_cllh_SOURCES = mpi_compute_cllh.cc
+mpi_compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
augment_grammar_SOURCES = augment_grammar.cc
augment_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
diff --git a/training/augment_grammar.cc b/training/augment_grammar.cc
index df8d4ee8..e89a92d5 100644
--- a/training/augment_grammar.cc
+++ b/training/augment_grammar.cc
@@ -134,9 +134,7 @@ int main(int argc, char** argv) {
} else { ngram = NULL; }
extra_feature = conf.count("extra_lex_feature") > 0;
if (conf.count("collapse_weights")) {
- Weights w;
- w.InitFromFile(conf["collapse_weights"].as<string>());
- w.InitVector(&col_weights);
+ Weights::InitFromFile(conf["collapse_weights"].as<string>(), &col_weights);
}
clear_features = conf.count("clear_features_after_collapse") > 0;
gather_rules = false;
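
(Note on the Weights refactor: this hunk, and the matching ones in collapse_weights.cc, grammar_convert.cc, and mpi_batch_optimize.cc below, replace the instance-based Weights object with static helpers that fill a caller-supplied vector directly, alongside the weight_t scalar typedef.) A minimal before/after sketch, assuming cdec's utils/weights.h; the wrapper name LoadWeights and the variable wfile are illustrative only:

    #include <string>
    #include <vector>
    #include "weights.h"  // declares Weights and weight_t (cdec utils)

    // Pre-merge instance API (as in the lines removed above):
    //   Weights w;
    //   w.InitFromFile(wfile);   // read feature/weight pairs from disk
    //   std::vector<double> v;
    //   w.InitVector(&v);        // densify into an FD-indexed vector
    //
    // Post-merge static API (as in the lines added above):
    void LoadWeights(const std::string& wfile, std::vector<weight_t>* v) {
      Weights::InitFromFile(wfile, v);  // one call fills the vector in place
    }
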
diff --git a/training/cllh_filter_grammar.cc b/training/cllh_filter_grammar.cc
deleted file mode 100644
index 6998ec2b..00000000
--- a/training/cllh_filter_grammar.cc
+++ /dev/null
@@ -1,197 +0,0 @@
-#include <iostream>
-#include <vector>
-#include <cassert>
-#include <unistd.h> // fork
-#include <sys/wait.h> // waitpid
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "tdict.h"
-#include "ff_register.h"
-#include "verbose.h"
-#include "hg.h"
-#include "decoder.h"
-#include "filelib.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("training_data,t",po::value<string>(),"Training data corpus")
- ("decoder_config,c",po::value<string>(),"Decoder configuration file")
- ("shards,s",po::value<unsigned>()->default_value(1),"Number of shards")
- ("starting_shard,S",po::value<unsigned>()->default_value(0), "In this invocation only process shards >= S")
- ("work_limit,l",po::value<unsigned>()->default_value(9999), "Process maximially this many shards")
- ("ncpus,C",po::value<unsigned>()->default_value(1),"Number of CPUs to use");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* ids) {
- ReadFile rf(fname);
- istream& in = *rf.stream();
- string line;
- int lc = 0;
- assert(size > 0);
- assert(rank < size);
- while(in) {
- getline(in, line);
- if (!in) break;
- if (lc % size == rank) {
- c->push_back(line);
- ids->push_back(lc);
- }
- ++lc;
- }
-}
-
-struct TrainingObserver : public DecoderObserver {
- TrainingObserver() : s_lhs(-TD::Convert("S")), goal_lhs(-TD::Convert("Goal")) {}
-
- void Reset() {
- total_complete = 0;
- }
-
- virtual void NotifyDecodingStart(const SentenceMetadata& smeta) {
- state = 1;
- used.clear();
- failed = true;
- }
-
- virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
- assert(state == 1);
- for (int i = 0; i < hg->edges_.size(); ++i) {
- const TRule* rule = hg->edges_[i].rule_.get();
- if (rule->lhs_ == s_lhs || rule->lhs_ == goal_lhs) // fragile hack to filter out glue rules
- continue;
- used.insert(rule);
- }
- state = 2;
- }
-
- virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
- assert(state == 2);
- state = 3;
- }
-
- virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) {
- if (state == 3) {
- failed = false;
- } else {
- failed = true;
- }
- }
-
- set<const TRule*> used;
-
- const int s_lhs;
- const int goal_lhs;
- bool failed;
- int total_complete;
- int state;
-};
-
-void work(const string& fname, int rank, int size, Decoder* decoder) {
- cerr << "Worker " << rank << '/' << size << " starting.\n";
- vector<string> corpus;
- vector<int> ids;
- ReadTrainingCorpus(fname, rank, size, &corpus, &ids);
- assert(corpus.size() > 0);
- assert(corpus.size() == ids.size());
- cerr << " " << rank << '/' << size << ": has " << corpus.size() << " sentences to process\n";
- ostringstream oc; oc << "corpus." << rank << "_of_" << size;
- WriteFile foc(oc.str());
- ostringstream og; og << "grammar." << rank << "_of_" << size << ".gz";
- WriteFile fog(og.str());
-
- set<const TRule*> all_used;
- TrainingObserver observer;
- for (int i = 0; i < corpus.size(); ++i) {
- const int sent_id = ids[i];
- const string& input = corpus[i];
- decoder->SetId(sent_id);
- decoder->Decode(input, &observer);
- if (observer.failed) {
- // do nothing
- } else {
- (*foc.stream()) << input << endl;
- for (set<const TRule*>::iterator it = observer.used.begin(); it != observer.used.end(); ++it) {
- if (all_used.insert(*it).second)
- (*fog.stream()) << **it << endl;
- }
- }
- }
-}
-
-int main(int argc, char** argv) {
- register_feature_functions();
-
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
- const string fname = conf["training_data"].as<string>();
- const unsigned ncpus = conf["ncpus"].as<unsigned>();
- const unsigned shards = conf["shards"].as<unsigned>();
- const unsigned start = conf["starting_shard"].as<unsigned>();
- const unsigned work_limit = conf["work_limit"].as<unsigned>();
- const unsigned eff_shards = min(start + work_limit, shards);
- cerr << "Processing shards " << start << "/" << shards << " to " << eff_shards << "/" << shards << endl;
- assert(ncpus > 0);
- ReadFile ini_rf(conf["decoder_config"].as<string>());
- Decoder decoder(ini_rf.stream());
- if (decoder.GetConf()["input"].as<string>() != "-") {
- cerr << "cdec.ini must not set an input file\n";
- abort();
- }
- SetSilent(true); // turn off verbose decoder output
- cerr << "Forking " << ncpus << " time(s)\n";
- vector<pid_t> children;
- for (int i = 0; i < ncpus; ++i) {
- pid_t pid = fork();
- if (pid < 0) {
- cerr << "Fork failed!\n";
- exit(1);
- }
- if (pid > 0) {
- children.push_back(pid);
- } else {
- for (int j = start; j < eff_shards; ++j) {
- if (j % ncpus == i) {
- cerr << " CPU " << i << " processing shard " << j << endl;
- work(fname, j, shards, &decoder);
- cerr << " Shard " << j << "/" << shards << " finished.\n";
- }
- }
- _exit(0);
- }
- }
- for (int i = 0; i < children.size(); ++i) {
- int status;
- int w = waitpid(children[i], &status, 0);
- if (w < 0) { cerr << "Error while waiting for children!"; return 1; }
- if (WIFSIGNALED(status)) {
- cerr << "Child " << i << " received signal " << WTERMSIG(status) << endl;
- if (WTERMSIG(status) == 11) { cerr << " this is a SEGV- you may be trying to print temporarily created rules\n"; }
- }
- }
- return 0;
-}
diff --git a/training/cluster-em.pl b/training/cluster-em.pl
deleted file mode 100755
index 267ab642..00000000
--- a/training/cluster-em.pl
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
-use Getopt::Long;
-my $parallel = 0;
-
-my $CWD=`pwd`; chomp $CWD;
-my $BIN_DIR = "$CWD/..";
-my $REDUCER = "$BIN_DIR/training/mr_em_adapted_reduce";
-my $REDUCE2WEIGHTS = "$BIN_DIR/training/mr_reduce_to_weights";
-my $ADAPTER = "$BIN_DIR/training/mr_em_map_adapter";
-my $DECODER = "$BIN_DIR/decoder/cdec";
-my $COMBINER_CACHE_SIZE = 10000000;
-my $PARALLEL = "/chomes/redpony/svn-trunk/sa-utils/parallelize.pl";
-die "Can't find $REDUCER" unless -f $REDUCER;
-die "Can't execute $REDUCER" unless -x $REDUCER;
-die "Can't find $REDUCE2WEIGHTS" unless -f $REDUCE2WEIGHTS;
-die "Can't execute $REDUCE2WEIGHTS" unless -x $REDUCE2WEIGHTS;
-die "Can't find $ADAPTER" unless -f $ADAPTER;
-die "Can't execute $ADAPTER" unless -x $ADAPTER;
-die "Can't find $DECODER" unless -f $DECODER;
-die "Can't execute $DECODER" unless -x $DECODER;
-my $restart = '';
-if ($ARGV[0] && $ARGV[0] eq '--restart') { shift @ARGV; $restart = 1; }
-
-die "Usage: $0 [--restart] training.corpus cdec.ini\n" unless (scalar @ARGV == 2);
-
-my $training_corpus = shift @ARGV;
-my $config = shift @ARGV;
-my $pmem="2500mb";
-my $nodes = 40;
-my $max_iteration = 1000;
-my $CFLAG = "-C 1";
-if ($parallel) {
- die "Can't find $PARALLEL" unless -f $PARALLEL;
- die "Can't execute $PARALLEL" unless -x $PARALLEL;
-} else { $CFLAG = "-C 500"; }
-
-my $initial_weights = '';
-
-print STDERR <<EOT;
-EM TRAIN CONFIGURATION INFORMATION
-
- Config file: $config
- Training corpus: $training_corpus
- Initial weights: $initial_weights
- Decoder memory: $pmem
- Nodes requested: $nodes
- Max iterations: $max_iteration
- restart: $restart
-EOT
-
-my $nodelist="1";
-for (my $i=1; $i<$nodes; $i++) { $nodelist .= " 1"; }
-my $iter = 1;
-
-my $dir = "$CWD/emtrain";
-if ($restart) {
- die "$dir doesn't exist, but --restart specified!\n" unless -d $dir;
- my $o = `ls -t $dir/weights.*`;
- my ($a, @x) = split /\n/, $o;
- if ($a =~ /weights.(\d+)\.gz$/) {
- $iter = $1;
- } else {
- die "Unexpected file: $a!\n";
- }
- print STDERR "Restarting at iteration $iter\n";
-} else {
- die "$dir already exists!\n" if -e $dir;
- mkdir $dir or die "Can't create $dir: $!";
-
- if ($initial_weights) {
- unless ($initial_weights =~ /\.gz$/) {
- `cp $initial_weights $dir/weights.1`;
- `gzip -9 $dir/weights.1`;
- } else {
- `cp $initial_weights $dir/weights.1.gz`;
- }
- }
-}
-
-while ($iter < $max_iteration) {
- my $cur_time = `date`; chomp $cur_time;
- print STDERR "\nStarting iteration $iter...\n";
- print STDERR " time: $cur_time\n";
- my $start = time;
- my $next_iter = $iter + 1;
- my $WSTR = "-w $dir/weights.$iter.gz";
- if ($iter == 1) { $WSTR = ''; }
- my $dec_cmd="$DECODER --feature_expectations -c $config $WSTR $CFLAG < $training_corpus 2> $dir/deco.log.$iter";
- my $pcmd = "$PARALLEL -e $dir/err -p $pmem --nodelist \"$nodelist\" -- ";
- my $cmd = "";
- if ($parallel) { $cmd = $pcmd; }
- $cmd .= "$dec_cmd";
- $cmd .= "| $ADAPTER | sort -k1 | $REDUCER | $REDUCE2WEIGHTS -o $dir/weights.$next_iter.gz";
- print STDERR "EXECUTING: $cmd\n";
- my $result = `$cmd`;
- if ($? != 0) {
- die "Error running iteration $iter: $!";
- }
- chomp $result;
- my $end = time;
- my $diff = ($end - $start);
- print STDERR " ITERATION $iter TOOK $diff SECONDS\n";
- $iter = $next_iter;
- if ($result =~ /1$/) {
- print STDERR "Training converged.\n";
- last;
- }
-}
-
-print "FINAL WEIGHTS: $dir/weights.$iter\n";
-
diff --git a/training/cluster-ptrain.pl b/training/cluster-ptrain.pl
deleted file mode 100755
index 03122df9..00000000
--- a/training/cluster-ptrain.pl
+++ /dev/null
@@ -1,206 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path getcwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
-use Getopt::Long;
-
-my $MAX_ITER_ATTEMPTS = 5; # number of times to retry a failed function evaluation
-my $CWD=getcwd();
-my $OPTIMIZER = "$SCRIPT_DIR/mr_optimize_reduce";
-my $DECODER = "$SCRIPT_DIR/../decoder/cdec";
-my $COMBINER_CACHE_SIZE = 150;
-# This is a hack to run this on a weird cluster,
-# eventually, I'll provide Hadoop scripts.
-my $PARALLEL = "/chomes/redpony/svn-trunk/sa-utils/parallelize.pl";
-die "Can't find $OPTIMIZER" unless -f $OPTIMIZER;
-die "Can't execute $OPTIMIZER" unless -x $OPTIMIZER;
-my $restart = '';
-if ($ARGV[0] && $ARGV[0] eq '--restart') { shift @ARGV; $restart = 1; }
-
-my $pmem="2500mb";
-my $nodes = 1;
-my $max_iteration = 1000;
-my $PRIOR_FLAG = "";
-my $parallel = 1;
-my $CFLAG = "-C 1";
-my $LOCAL;
-my $DISTRIBUTED;
-my $PRIOR;
-my $OALG = "lbfgs";
-my $sigsq = 1;
-my $means_file;
-my $mem_buffers = 20;
-my $RESTART_IF_NECESSARY;
-GetOptions("cdec=s" => \$DECODER,
- "distributed" => \$DISTRIBUTED,
- "sigma_squared=f" => \$sigsq,
- "lbfgs_memory_buffers=i" => \$mem_buffers,
- "max_iteration=i" => \$max_iteration,
- "means=s" => \$means_file,
- "optimizer=s" => \$OALG,
- "gaussian_prior" => \$PRIOR,
- "restart_if_necessary" => \$RESTART_IF_NECESSARY,
- "jobs=i" => \$nodes,
- "pmem=s" => \$pmem
- ) or usage();
-usage() unless scalar @ARGV==3;
-my $config_file = shift @ARGV;
-my $training_corpus = shift @ARGV;
-my $initial_weights = shift @ARGV;
-unless ($DISTRIBUTED) { $LOCAL = 1; }
-die "Can't find $config_file" unless -f $config_file;
-die "Can't find $DECODER" unless -f $DECODER;
-die "Can't execute $DECODER" unless -x $DECODER;
-if ($LOCAL) { print STDERR "Will run LOCALLY.\n"; $parallel = 0; }
-if ($PRIOR) {
- $PRIOR_FLAG="-p --sigma_squared $sigsq";
- if ($means_file) { $PRIOR_FLAG .= " -u $means_file"; }
-}
-
-if ($parallel) {
- die "Can't find $PARALLEL" unless -f $PARALLEL;
- die "Can't execute $PARALLEL" unless -x $PARALLEL;
-}
-unless ($parallel) { $CFLAG = "-C 500"; }
-unless ($config_file =~ /^\//) { $config_file = $CWD . '/' . $config_file; }
-my $clines = num_lines($training_corpus);
-my $dir = "$CWD/ptrain";
-
-if ($RESTART_IF_NECESSARY && -d $dir) {
- $restart = 1;
-}
-
-print STDERR <<EOT;
-PTRAIN CONFIGURATION INFORMATION
-
- Config file: $config_file
- Training corpus: $training_corpus
- Corpus size: $clines
- Initial weights: $initial_weights
- Decoder memory: $pmem
- Max iterations: $max_iteration
- Optimizer: $OALG
- Jobs requested: $nodes
- prior?: $PRIOR_FLAG
- restart?: $restart
-EOT
-
-if ($OALG) { $OALG="-m $OALG"; }
-
-my $nodelist="1";
-for (my $i=1; $i<$nodes; $i++) { $nodelist .= " 1"; }
-my $iter = 1;
-
-if ($restart) {
- die "$dir doesn't exist, but --restart specified!\n" unless -d $dir;
- my $o = `ls -t $dir/weights.*`;
- my ($a, @x) = split /\n/, $o;
- if ($a =~ /weights.(\d+)\.gz$/) {
- $iter = $1;
- } else {
- die "Unexpected file: $a!\n";
- }
- print STDERR "Restarting at iteration $iter\n";
-} else {
- die "$dir already exists!\n" if -e $dir;
- mkdir $dir or die "Can't create $dir: $!";
-
- unless ($initial_weights =~ /\.gz$/) {
- `cp $initial_weights $dir/weights.1`;
- `gzip -9 $dir/weights.1`;
- } else {
- `cp $initial_weights $dir/weights.1.gz`;
- }
- open T, "<$training_corpus" or die "Can't read $training_corpus: $!";
- open TO, ">$dir/training.in";
- my $lc = 0;
- while(<T>) {
- chomp;
- s/^\s+//;
- s/\s+$//;
- die "Expected A ||| B in input file" unless / \|\|\| /;
- print TO "<seg id=\"$lc\">$_</seg>\n";
- $lc++;
- }
- close T;
- close TO;
-}
-$training_corpus = "$dir/training.in";
-
-my $iter_attempts = 1;
-while ($iter < $max_iteration) {
- my $cur_time = `date`; chomp $cur_time;
- print STDERR "\nStarting iteration $iter...\n";
- print STDERR " time: $cur_time\n";
- my $start = time;
- my $next_iter = $iter + 1;
- my $dec_cmd="$DECODER -G $CFLAG -c $config_file -w $dir/weights.$iter.gz < $training_corpus 2> $dir/deco.log.$iter";
- my $opt_cmd = "$OPTIMIZER $PRIOR_FLAG -M $mem_buffers $OALG -s $dir/opt.state -i $dir/weights.$iter.gz -o $dir/weights.$next_iter.gz";
- my $pcmd = "$PARALLEL -e $dir/err -p $pmem --nodelist \"$nodelist\" -- ";
- my $cmd = "";
- if ($parallel) { $cmd = $pcmd; }
- $cmd .= "$dec_cmd | $opt_cmd";
-
- print STDERR "EXECUTING: $cmd\n";
- my $result = `$cmd`;
- my $exit_code = $? >> 8;
- if ($exit_code == 99) {
- $iter_attempts++;
- if ($iter_attempts > $MAX_ITER_ATTEMPTS) {
- die "Received restart request $iter_attempts times from optimizer, giving up\n";
- }
- print STDERR "Function evaluation failed, retrying (attempt $iter_attempts)\n";
- next;
- }
- if ($? != 0) {
- die "Error running iteration $iter: $!";
- }
- chomp $result;
- my $end = time;
- my $diff = ($end - $start);
- print STDERR " ITERATION $iter TOOK $diff SECONDS\n";
- $iter = $next_iter;
- if ($result =~ /1$/) {
- print STDERR "Training converged.\n";
- last;
- }
- $iter_attempts = 1;
-}
-
-print "FINAL WEIGHTS: $dir/weights.$iter\n";
-`mv $dir/weights.$iter.gz $dir/weights.final.gz`;
-
-sub usage {
- die <<EOT;
-
-Usage: $0 [OPTIONS] cdec.ini training.corpus weights.init
-
- Options:
-
- --distributed Parallelize function evaluation
- --jobs N Number of jobs to use
- --cdec PATH Path to cdec binary
- --optimize OPT lbfgs, rprop, sgd
- --gaussian_prior add Gaussian prior
- --means FILE if you want means other than 0
- --sigma_squared S variance on prior
- --pmem MEM Memory required for decoder
- --lbfgs_memory_buffers Number of buffers to use
- with LBFGS optimizer
-
-EOT
-}
-
-sub num_lines {
- my $file = shift;
- my $fh;
- if ($file=~ /\.gz$/) {
- open $fh, "zcat $file|" or die "Couldn't fork zcat $file: $!";
- } else {
- open $fh, "<$file" or die "Couldn't read $file: $!";
- }
- my $lines = 0;
- while(<$fh>) { $lines++; }
- close $fh;
- return $lines;
-}
diff --git a/training/collapse_weights.cc b/training/collapse_weights.cc
index 4fb742fb..dc480f6c 100644
--- a/training/collapse_weights.cc
+++ b/training/collapse_weights.cc
@@ -59,10 +59,8 @@ int main(int argc, char** argv) {
InitCommandLine(argc, argv, &conf);
const string wfile = conf["weights"].as<string>();
const string gfile = conf["grammar"].as<string>();
- Weights wm;
- wm.InitFromFile(wfile);
- vector<double> w;
- wm.InitVector(&w);
+ vector<weight_t> w;
+ Weights::InitFromFile(wfile, &w);
MarginalMap e_tots;
MarginalMap f_tots;
prob_t tot;
diff --git a/training/feature_expectations.cc b/training/feature_expectations.cc
new file mode 100644
index 00000000..f1a85495
--- /dev/null
+++ b/training/feature_expectations.cc
@@ -0,0 +1,232 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+#include <tr1/memory>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "verbose.h"
+#include "hg.h"
+#include "prob.h"
+#include "inside_outside.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "online_optimizer.h"
+#include "fdict.h"
+#include "weights.h"
+#include "sparse_vector.h"
+#include "sampler.h"
+
+#ifdef HAVE_MPI
+#include <boost/mpi/timer.hpp>
+#include <boost/mpi.hpp>
+namespace mpi = boost::mpi;
+#endif
+
+using namespace std;
+namespace po = boost::program_options;
+
+struct FComp {
+ const vector<double>& w_;
+ FComp(const vector<double>& w) : w_(w) {}
+ bool operator()(int a, int b) const {
+ return fabs(w_[a]) > fabs(w_[b]);
+ }
+};
+
+void ShowFeatures(const vector<double>& w) {
+ vector<int> fnums(w.size());
+ for (int i = 0; i < w.size(); ++i)
+ fnums[i] = i;
+ sort(fnums.begin(), fnums.end(), FComp(w));
+ for (vector<int>::iterator i = fnums.begin(); i != fnums.end(); ++i) {
+ if (w[*i]) cout << FD::Convert(*i) << ' ' << w[*i] << endl;
+ }
+}
+
+void ReadConfig(const string& ini, vector<string>* out) {
+ ReadFile rf(ini);
+ istream& in = *rf.stream();
+ while(in) {
+ string line;
+ getline(in, line);
+ if (!in) continue;
+ out->push_back(line);
+ }
+}
+
+void StoreConfig(const vector<string>& cfg, istringstream* o) {
+ ostringstream os;
+ for (int i = 0; i < cfg.size(); ++i) { os << cfg[i] << endl; }
+ o->str(os.str());
+}
+
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("input,i",po::value<string>(),"Corpus of source language sentences")
+ ("weights,w",po::value<string>(),"Input feature weights file")
+ ("decoder_config,c",po::value<string>(), "cdec.ini file");
+ po::options_description clo("Command line options");
+ clo.add_options()
+ ("config", po::value<string>(), "Configuration file")
+ ("help,h", "Print this help message and exit");
+ po::options_description dconfig_options, dcmdline_options;
+ dconfig_options.add(opts);
+ dcmdline_options.add(opts).add(clo);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("config")) {
+ ifstream config((*conf)["config"].as<string>().c_str());
+ po::store(po::parse_config_file(config, dconfig_options), *conf);
+ }
+ po::notify(*conf);
+
+ if (conf->count("help") || !conf->count("input") || !conf->count("decoder_config")) {
+ cerr << dcmdline_options << endl;
+ return false;
+ }
+ return true;
+}
+
+void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* order) {
+ ReadFile rf(fname);
+ istream& in = *rf.stream();
+ string line;
+ int id = 0;
+ while(in) {
+ getline(in, line);
+ if (!in) break;
+ if (id % size == rank) {
+ c->push_back(line);
+ order->push_back(id);
+ }
+ ++id;
+ }
+}
+
+static const double kMINUS_EPSILON = -1e-6;
+
+struct TrainingObserver : public DecoderObserver {
+ void Reset() {
+ acc_exp.clear();
+ total_complete = 0;
+ }
+
+ virtual void NotifyDecodingStart(const SentenceMetadata& smeta) {
+ cur_model_exp.clear();
+ state = 1;
+ }
+
+ // compute model expectations, denominator of objective
+ virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+ assert(state == 1);
+ state = 2;
+ const prob_t z = InsideOutside<prob_t,
+ EdgeProb,
+ SparseVector<prob_t>,
+ EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp);
+ cur_model_exp /= z;
+ acc_exp += cur_model_exp;
+ }
+
+ virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+ cerr << "IGNORING ALIGNMENT FOREST!\n";
+ }
+
+ virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) {
+ if (state == 2) {
+ ++total_complete;
+ }
+ }
+
+ void GetExpectations(SparseVector<double>* g) const {
+ g->clear();
+ for (SparseVector<prob_t>::const_iterator it = acc_exp.begin(); it != acc_exp.end(); ++it)
+ g->set_value(it->first, it->second);
+ }
+
+ int total_complete;
+ SparseVector<prob_t> cur_model_exp;
+ SparseVector<prob_t> acc_exp;
+ int state;
+};
+
+#ifdef HAVE_MPI
+namespace boost { namespace mpi {
+ template<>
+ struct is_commutative<std::plus<SparseVector<double> >, SparseVector<double> >
+ : mpl::true_ { };
+} } // end namespace boost::mpi
+#endif
+
+int main(int argc, char** argv) {
+#ifdef HAVE_MPI
+ mpi::environment env(argc, argv);
+ mpi::communicator world;
+ const int size = world.size();
+ const int rank = world.rank();
+#else
+ const int size = 1;
+ const int rank = 0;
+#endif
+ if (size > 1) SetSilent(true); // turn off verbose decoder output
+ register_feature_functions();
+
+ po::variables_map conf;
+ if (!InitCommandLine(argc, argv, &conf))
+ return 1;
+
+ // load initial weights
+ Weights weights;
+ if (conf.count("weights"))
+ weights.InitFromFile(conf["weights"].as<string>());
+
+ vector<string> corpus;
+ vector<int> ids;
+ ReadTrainingCorpus(conf["input"].as<string>(), rank, size, &corpus, &ids);
+ assert(corpus.size() > 0);
+
+ vector<string> cdec_ini;
+ ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini);
+ istringstream ini;
+ StoreConfig(cdec_ini, &ini);
+ Decoder decoder(&ini);
+ if (decoder.GetConf()["input"].as<string>() != "-") {
+ cerr << "cdec.ini must not set an input file\n";
+ return 1;
+ }
+
+ SparseVector<double> x;
+ weights.InitSparseVector(&x);
+ TrainingObserver observer;
+
+ weights.InitFromVector(x);
+ vector<double> lambdas;
+ weights.InitVector(&lambdas);
+ decoder.SetWeights(lambdas);
+ observer.Reset();
+ for (unsigned i = 0; i < corpus.size(); ++i) {
+ int id = ids[i];
+ decoder.SetId(id);
+ decoder.Decode(corpus[i], &observer);
+ }
+ SparseVector<double> local_exps, exps;
+ observer.GetExpectations(&local_exps);
+#ifdef HAVE_MPI
+ reduce(world, local_exps, exps, std::plus<SparseVector<double> >(), 0);
+#else
+ exps.swap(local_exps);
+#endif
+
+ weights.InitFromVector(exps);
+ weights.InitVector(&lambdas);
+ ShowFeatures(lambdas);
+
+ return 0;
+}
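
(The heart of the new feature_expectations.cc is NotifyTranslationForest above: one InsideOutside pass over the translation forest accumulates each feature's unnormalized posterior expectation in cur_model_exp, and dividing by the inside score z normalizes it. In equation form, notation mine rather than from the source, for a sentence x with derivation set D(x), edge probabilities p(e), and edge feature values f_k(e):

    \mathbb{E}_{p(d \mid x)}[f_k] = \frac{1}{Z(x)} \sum_{d \in D(x)} \Big( \prod_{e \in d} p(e) \Big) \sum_{e \in d} f_k(e),
    \qquad Z(x) = \sum_{d \in D(x)} \prod_{e \in d} p(e)

acc_exp then sums these per-sentence expectations over the shard before the optional MPI reduce at rank 0.)
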
diff --git a/training/grammar_convert.cc b/training/grammar_convert.cc
index 8d292f8a..bf8abb26 100644
--- a/training/grammar_convert.cc
+++ b/training/grammar_convert.cc
@@ -251,12 +251,10 @@ int main(int argc, char **argv) {
const bool is_split_input = (conf["format"].as<string>() == "split");
const bool is_json_input = is_split_input || (conf["format"].as<string>() == "json");
const bool collapse_weights = conf.count("collapse_weights");
- Weights wts;
vector<double> w;
- if (conf.count("weights")) {
- wts.InitFromFile(conf["weights"].as<string>());
- wts.InitVector(&w);
- }
+ if (conf.count("weights"))
+ Weights::InitFromFile(conf["weights"].as<string>(), &w);
+
if (collapse_weights && !w.size()) {
cerr << "--collapse_weights requires a weights file to be specified!\n";
exit(1);
diff --git a/training/make-lexcrf-grammar.pl b/training/make-lexcrf-grammar.pl
deleted file mode 100755
index 8cdf7718..00000000
--- a/training/make-lexcrf-grammar.pl
+++ /dev/null
@@ -1,285 +0,0 @@
-#!/usr/bin/perl -w
-use utf8;
-use strict;
-my ($effile, $model1) = @ARGV;
-die "Usage: $0 corpus.fr-en corpus.model1\n" unless $effile && -f $effile && $model1 && -f $model1;
-
-open EF, "<$effile" or die;
-open M1, "<$model1" or die;
-binmode(EF,":utf8");
-binmode(M1,":utf8");
-binmode(STDOUT,":utf8");
-my %model1;
-while(<M1>) {
- chomp;
- my ($f, $e, $lp) = split /\s+/;
- $model1{$f}->{$e} = $lp;
-}
-
-my $ADD_MODEL1 = 0; # found that model1 hurts performance
-my $IS_FRENCH_F = 1; # indicates that the f language is french
-my $IS_ARABIC_F = 0; # indicates that the f language is arabic
-my $IS_URDU_F = 0; # indicates that the f language is arabic
-my $ADD_PREFIX_ID = 0;
-my $ADD_LEN = 1;
-my $ADD_SIM = 1;
-my $ADD_DICE = 1;
-my $ADD_111 = 1;
-my $ADD_ID = 1;
-my $ADD_PUNC = 1;
-my $ADD_NUM_MM = 1;
-my $ADD_NULL = 1;
-my $ADD_STEM_ID = 1;
-my $BEAM_RATIO = 50;
-
-my %fdict;
-my %fcounts;
-my %ecounts;
-
-my %sdict;
-
-while(<EF>) {
- chomp;
- my ($f, $e) = split /\s*\|\|\|\s*/;
- my @es = split /\s+/, $e;
- my @fs = split /\s+/, $f;
- for my $ew (@es){ $ecounts{$ew}++; }
- push @fs, '<eps>' if $ADD_NULL;
- for my $fw (@fs){ $fcounts{$fw}++; }
- for my $fw (@fs){
- for my $ew (@es){
- $fdict{$fw}->{$ew}++;
- }
- }
-}
-
-print STDERR "Dice 0\n" if $ADD_DICE;
-print STDERR "OneOneOne 0\nId_OneOneOne 0\n" if $ADD_111;
-print STDERR "Identical 0\n" if $ADD_ID;
-print STDERR "PuncMiss 0\n" if $ADD_PUNC;
-print STDERR "IsNull 0\n" if $ADD_NULL;
-print STDERR "Model1 0\n" if $ADD_MODEL1;
-print STDERR "DLen 0\n" if $ADD_LEN;
-print STDERR "NumMM 0\nNumMatch 0\n" if $ADD_NUM_MM;
-print STDERR "OrthoSim 0\n" if $ADD_SIM;
-print STDERR "PfxIdentical 0\n" if ($ADD_PREFIX_ID);
-my $fc = 1000000;
-my $sids = 1000000;
-for my $f (sort keys %fdict) {
- my $re = $fdict{$f};
- my $max;
- for my $e (sort {$re->{$b} <=> $re->{$a}} keys %$re) {
- my $efcount = $re->{$e};
- unless (defined $max) { $max = $efcount; }
- my $m1 = $model1{$f}->{$e};
- unless (defined $m1) { next; }
- $fc++;
- my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f});
- my $feats = "F$fc=1";
- my $oe = $e;
- my $of = $f; # normalized form
- if ($IS_FRENCH_F) {
- # see http://en.wikipedia.org/wiki/Use_of_the_circumflex_in_French
- $of =~ s/â/as/g;
- $of =~ s/ê/es/g;
- $of =~ s/î/is/g;
- $of =~ s/ô/os/g;
- $of =~ s/û/us/g;
- } elsif ($IS_ARABIC_F) {
- if (length($of) > 1 && !($of =~ /\d/)) {
- $of =~ s/\$/sh/g;
- }
- } elsif ($IS_URDU_F) {
- if (length($of) > 1 && !($of =~ /\d/)) {
- $of =~ s/\$/sh/g;
- }
- $oe =~ s/^-e-//;
- $oe =~ s/^al-/al/;
- $of =~ s/([a-z])\~/$1$1/g;
- $of =~ s/E/'/g;
- $of =~ s/^Aw/o/g;
- $of =~ s/\|/a/g;
- $of =~ s/@/h/g;
- $of =~ s/c/ch/g;
- $of =~ s/x/kh/g;
- $of =~ s/\*/dh/g;
- $of =~ s/w/o/g;
- $of =~ s/Z/dh/g;
- $of =~ s/y/i/g;
- $of =~ s/Y/a/g;
- $of = lc $of;
- }
- my $len_e = length($oe);
- my $len_f = length($of);
- $feats .= " Model1=$m1" if ($ADD_MODEL1);
- $feats .= " Dice=$dice" if $ADD_DICE;
- my $is_null = undef;
- if ($ADD_NULL && $f eq '<eps>') {
- $feats .= " IsNull=1";
- $is_null = 1;
- }
- if ($ADD_LEN) {
- if (!$is_null) {
- my $dlen = abs($len_e - $len_f);
- $feats .= " DLen=$dlen";
- }
- }
- my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3));
- my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3));
- my $both_non_numeric = (!$e_num && !$f_num);
- if ($ADD_NUM_MM && (($f_num && !$e_num) || ($e_num && !$f_num))) {
- $feats .= " NumMM=1";
- }
- if ($ADD_NUM_MM && ($f_num && $e_num) && ($oe eq $of)) {
- $feats .= " NumMatch=1";
- }
- if ($ADD_STEM_ID) {
- my $el = 4;
- my $fl = 4;
- if ($oe =~ /^al|re|co/) { $el++; }
- if ($of =~ /^al|re|co/) { $fl++; }
- if ($oe =~ /^trans|inter/) { $el+=2; }
- if ($of =~ /^trans|inter/) { $fl+=2; }
- if ($fl > length($of)) { $fl = length($of); }
- if ($el > length($oe)) { $el = length($oe); }
- my $sf = substr $of, 0, $fl;
- my $se = substr $oe, 0, $el;
- my $id = $sdict{$sf}->{$se};
- if (!$id) {
- $sids++;
- $sdict{$sf}->{$se} = $sids;
- $id = $sids;
- print STDERR "S$sids 0\n"
- }
- $feats .= " S$id=1";
- }
- if ($ADD_PREFIX_ID) {
- if ($len_e > 3 && $len_f > 3 && $both_non_numeric) {
- my $pe = substr $oe, 0, 3;
- my $pf = substr $of, 0, 3;
- if ($pe eq $pf) { $feats .= " PfxIdentical=1"; }
- }
- }
- if ($ADD_SIM) {
- my $ld = 0;
- my $eff = $len_e;
- if ($eff < $len_f) { $eff = $len_f; }
- if (!$is_null) {
- $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff);
- }
- $feats .= " OrthoSim=$ld";
- }
- my $ident = ($e eq $f);
- if ($ident && $ADD_ID) { $feats .= " Identical=1"; }
- if ($ADD_111 && ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1)) {
- if ($ident && $ADD_ID) {
- $feats .= " Id_OneOneOne=1";
- }
- $feats .= " OneOneOne=1";
- }
- if ($ADD_PUNC) {
- if (($f =~ /^[0-9!\$%,\-\/"':;=+?.()«»]+$/ && $e =~ /[a-z]+/) ||
- ($e =~ /^[0-9!\$%,\-\/"':;=+?.()«»]+$/ && $f =~ /[a-z]+/)) {
- $feats .= " PuncMiss=1";
- }
- }
- my $r = (0.5 - rand)/5;
- print STDERR "F$fc $r\n";
- print "$f ||| $e ||| $feats\n";
- }
-}
-
-sub levenshtein
-{
- # $s1 and $s2 are the two strings
- # $len1 and $len2 are their respective lengths
- #
- my ($s1, $s2) = @_;
- my ($len1, $len2) = (length $s1, length $s2);
-
- # If one of the strings is empty, the distance is the length
- # of the other string
- #
- return $len2 if ($len1 == 0);
- return $len1 if ($len2 == 0);
-
- my %mat;
-
- # Init the distance matrix
- #
- # The first row to 0..$len1
- # The first column to 0..$len2
- # The rest to 0
- #
- # The first row and column are initialized so to denote distance
- # from the empty string
- #
- for (my $i = 0; $i <= $len1; ++$i)
- {
- for (my $j = 0; $j <= $len2; ++$j)
- {
- $mat{$i}{$j} = 0;
- $mat{0}{$j} = $j;
- }
-
- $mat{$i}{0} = $i;
- }
-
- # Some char-by-char processing is ahead, so prepare
- # array of chars from the strings
- #
- my @ar1 = split(//, $s1);
- my @ar2 = split(//, $s2);
-
- for (my $i = 1; $i <= $len1; ++$i)
- {
- for (my $j = 1; $j <= $len2; ++$j)
- {
- # Set the cost to 1 iff the ith char of $s1
- # equals the jth of $s2
- #
- # Denotes a substitution cost. When the char are equal
- # there is no need to substitute, so the cost is 0
- #
- my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;
-
- # Cell $mat{$i}{$j} equals the minimum of:
- #
- # - The cell immediately above plus 1
- # - The cell immediately to the left plus 1
- # - The cell diagonally above and to the left plus the cost
- #
- # We can either insert a new char, delete a char or
- # substitute an existing char (with an associated cost)
- #
- $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1,
- $mat{$i}{$j-1} + 1,
- $mat{$i-1}{$j-1} + $cost]);
- }
- }
-
- # Finally, the Levenshtein distance equals the rightmost bottom cell
- # of the matrix
- #
- # Note that $mat{$x}{$y} denotes the distance between the substrings
- # 1..$x and 1..$y
- #
- return $mat{$len1}{$len2};
-}
-
-
-# minimal element of a list
-#
-sub min
-{
- my @list = @{$_[0]};
- my $min = $list[0];
-
- foreach my $i (@list)
- {
- $min = $i if ($i < $min);
- }
-
- return $min;
-}
-
diff --git a/training/mpi_batch_optimize.cc b/training/mpi_batch_optimize.cc
index 39a8af7d..046e921c 100644
--- a/training/mpi_batch_optimize.cc
+++ b/training/mpi_batch_optimize.cc
@@ -22,6 +22,7 @@ namespace mpi = boost::mpi;
#include "ff_register.h"
#include "decoder.h"
#include "filelib.h"
+#include "stringlib.h"
#include "optimize.h"
#include "fdict.h"
#include "weights.h"
@@ -31,47 +32,18 @@ using namespace std;
using boost::shared_ptr;
namespace po = boost::program_options;
-void SanityCheck(const vector<double>& w) {
- for (int i = 0; i < w.size(); ++i) {
- assert(!isnan(w[i]));
- assert(!isinf(w[i]));
- }
-}
-
-struct FComp {
- const vector<double>& w_;
- FComp(const vector<double>& w) : w_(w) {}
- bool operator()(int a, int b) const {
- return fabs(w_[a]) > fabs(w_[b]);
- }
-};
-
-void ShowLargestFeatures(const vector<double>& w) {
- vector<int> fnums(w.size());
- for (int i = 0; i < w.size(); ++i)
- fnums[i] = i;
- vector<int>::iterator mid = fnums.begin();
- mid += (w.size() > 10 ? 10 : w.size());
- partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
- cerr << "TOP FEATURES:";
- for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {
- cerr << ' ' << FD::Convert(*i) << '=' << w[*i];
- }
- cerr << endl;
-}
-
bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
po::options_description opts("Configuration options");
opts.add_options()
("input_weights,w",po::value<string>(),"Input feature weights file")
("training_data,t",po::value<string>(),"Training data")
("decoder_config,d",po::value<string>(),"Decoder configuration file")
- ("sharded_input,s",po::value<string>(), "Corpus and grammar files are 'sharded' so each processor loads its own input and grammar file. Argument is the directory containing the shards.")
("output_weights,o",po::value<string>()->default_value("-"),"Output feature weights file")
("optimization_method,m", po::value<string>()->default_value("lbfgs"), "Optimization method (sgd, lbfgs, rprop)")
("correction_buffers,M", po::value<int>()->default_value(10), "Number of gradients for LBFGS to maintain in memory")
("gaussian_prior,p","Use a Gaussian prior on the weights")
("means,u", po::value<string>(), "File containing the means for Gaussian prior")
+ ("per_sentence_grammar_scratch,P", po::value<string>(), "(Optional) location of scratch space to copy per-sentence grammars for fast access, useful if a RAM disk is available")
("sigma_squared", po::value<double>()->default_value(1.0), "Sigma squared term for spherical Gaussian prior");
po::options_description clo("Command line options");
clo.add_options()
@@ -88,14 +60,10 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
}
po::notify(*conf);
- if (conf->count("help") || !conf->count("input_weights") || !(conf->count("training_data") | conf->count("sharded_input")) || !conf->count("decoder_config")) {
+ if (conf->count("help") || !conf->count("input_weights") || !(conf->count("training_data")) || !conf->count("decoder_config")) {
cerr << dcmdline_options << endl;
return false;
}
- if (conf->count("training_data") && conf->count("sharded_input")) {
- cerr << "Cannot specify both --training_data and --sharded_input\n";
- return false;
- }
return true;
}
@@ -124,7 +92,7 @@ struct TrainingObserver : public DecoderObserver {
void SetLocalGradientAndObjective(vector<double>* g, double* o) const {
*o = acc_obj;
for (SparseVector<prob_t>::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it)
- (*g)[it->first] = it->second;
+ (*g)[it->first] = it->second.as_float();
}
virtual void NotifyDecodingStart(const SentenceMetadata& smeta) {
@@ -220,6 +188,36 @@ struct VectorPlus : public binary_function<vector<T>, vector<T>, vector<T> > {
}
};
+void MovePerSentenceGrammars(const string& root, int size, int rank, vector<string>* c) {
+ if (!DirectoryExists(root)) {
+ cerr << "Can't find scratch space at " << root << endl;
+ abort();
+ }
+ ostringstream os;
+ os << root << "/psg." << size << "_of_" << rank;
+ const string path = os.str();
+ MkDirP(path);
+ string sent;
+ map<string, string> attr;
+ for (unsigned i = 0; i < c->size(); ++i) {
+ sent = (*c)[i];
+ attr.clear();
+ ProcessAndStripSGML(&sent, &attr);
+ map<string, string>::iterator it = attr.find("grammar");
+ if (it != attr.end()) {
+ string src_file = it->second;
+ bool is_gzipped = (src_file.size() > 3) && (src_file.rfind(".gz") == (src_file.size() - 3));
+ string new_name = path + "/" + md5(sent);
+ if (is_gzipped) new_name += ".gz";
+ CopyFile(src_file, new_name);
+ it->second = new_name;
+ }
+ ostringstream ns;
+ ns << SGMLOpenSegTag(attr) << ' ' << sent << " </seg>";
+ (*c)[i] = ns.str();
+ }
+}
+
int main(int argc, char** argv) {
#ifdef HAVE_MPI
mpi::environment env(argc, argv);
@@ -236,42 +234,9 @@ int main(int argc, char** argv) {
po::variables_map conf;
if (!InitCommandLine(argc, argv, &conf)) return 1;
- string shard_dir;
- if (conf.count("sharded_input")) {
- shard_dir = conf["sharded_input"].as<string>();
- if (!DirectoryExists(shard_dir)) {
- if (rank == 0) cerr << "Can't find shard directory: " << shard_dir << endl;
- return 1;
- }
- if (rank == 0)
- cerr << "Shard directory: " << shard_dir << endl;
- }
-
- // load initial weights
- Weights weights;
- if (rank == 0) { cerr << "Loading weights...\n"; }
- weights.InitFromFile(conf["input_weights"].as<string>());
- if (rank == 0) { cerr << "Done loading weights.\n"; }
-
- // freeze feature set (should be optional?)
- const bool freeze_feature_set = true;
- if (freeze_feature_set) FD::Freeze();
-
// load cdec.ini and set up decoder
vector<string> cdec_ini;
ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini);
- if (shard_dir.size()) {
- if (rank == 0) {
- for (int i = 0; i < cdec_ini.size(); ++i) {
- if (cdec_ini[i].find("grammar=") == 0) {
- cerr << "!!! using sharded input and " << conf["decoder_config"].as<string>() << " contains a grammar specification:\n" << cdec_ini[i] << "\n VERIFY THAT THIS IS CORRECT!\n";
- }
- }
- }
- ostringstream g;
- g << "grammar=" << shard_dir << "/grammar." << rank << "_of_" << size << ".gz";
- cdec_ini.push_back(g.str());
- }
istringstream ini;
StoreConfig(cdec_ini, &ini);
if (rank == 0) cerr << "Loading grammar...\n";
@@ -282,22 +247,28 @@ int main(int argc, char** argv) {
}
if (rank == 0) cerr << "Done loading grammar!\n";
+ // load initial weights
+ if (rank == 0) { cerr << "Loading weights...\n"; }
+ vector<weight_t>& lambdas = decoder->CurrentWeightVector();
+ Weights::InitFromFile(conf["input_weights"].as<string>(), &lambdas);
+ if (rank == 0) { cerr << "Done loading weights.\n"; }
+
+ // freeze feature set (should be optional?)
+ const bool freeze_feature_set = true;
+ if (freeze_feature_set) FD::Freeze();
+
const int num_feats = FD::NumFeats();
if (rank == 0) cerr << "Number of features: " << num_feats << endl;
+ lambdas.resize(num_feats);
+
const bool gaussian_prior = conf.count("gaussian_prior");
- vector<double> means(num_feats, 0);
+ vector<weight_t> means(num_feats, 0);
if (conf.count("means")) {
if (!gaussian_prior) {
cerr << "Don't use --means without --gaussian_prior!\n";
exit(1);
}
- Weights wm;
- wm.InitFromFile(conf["means"].as<string>());
- if (num_feats != FD::NumFeats()) {
- cerr << "[ERROR] Means file had unexpected features!\n";
- exit(1);
- }
- wm.InitVector(&means);
+ Weights::InitFromFile(conf["means"].as<string>(), &means);
}
shared_ptr<BatchOptimizer> o;
if (rank == 0) {
@@ -309,28 +280,18 @@ int main(int argc, char** argv) {
cerr << "Optimizer: " << o->Name() << endl;
}
double objective = 0;
- vector<double> lambdas(num_feats, 0.0);
- weights.InitVector(&lambdas);
- if (lambdas.size() != num_feats) {
- cerr << "Initial weights file did not have all features specified!\n feats="
- << num_feats << "\n weights file=" << lambdas.size() << endl;
- lambdas.resize(num_feats, 0.0);
- }
vector<double> gradient(num_feats, 0.0);
- vector<double> rcv_grad(num_feats, 0.0);
+ vector<double> rcv_grad;
+ rcv_grad.clear();
bool converged = false;
vector<string> corpus;
- if (shard_dir.size()) {
- ostringstream os; os << shard_dir << "/corpus." << rank << "_of_" << size;
- ReadTrainingCorpus(os.str(), 0, 1, &corpus);
- cerr << os.str() << " has " << corpus.size() << " training examples. " << endl;
- if (corpus.size() > 500) { corpus.resize(500); cerr << " TRUNCATING\n"; }
- } else {
- ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
- }
+ ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
assert(corpus.size() > 0);
+ if (conf.count("per_sentence_grammar_scratch"))
+ MovePerSentenceGrammars(conf["per_sentence_grammar_scratch"].as<string>(), rank, size, &corpus);
+
TrainingObserver observer;
while (!converged) {
observer.Reset();
@@ -341,19 +302,20 @@ int main(int argc, char** argv) {
if (rank == 0) {
cerr << "Starting decoding... (~" << corpus.size() << " sentences / proc)\n";
}
- decoder->SetWeights(lambdas);
for (int i = 0; i < corpus.size(); ++i)
decoder->Decode(corpus[i], &observer);
cerr << " process " << rank << '/' << size << " done\n";
fill(gradient.begin(), gradient.end(), 0);
- fill(rcv_grad.begin(), rcv_grad.end(), 0);
observer.SetLocalGradientAndObjective(&gradient, &objective);
double to = 0;
#ifdef HAVE_MPI
+ rcv_grad.resize(num_feats, 0.0);
mpi::reduce(world, &gradient[0], gradient.size(), &rcv_grad[0], plus<double>(), 0);
- mpi::reduce(world, objective, to, plus<double>(), 0);
swap(gradient, rcv_grad);
+ rcv_grad.clear();
+
+ mpi::reduce(world, objective, to, plus<double>(), 0);
objective = to;
#endif
@@ -378,7 +340,7 @@ int main(int argc, char** argv) {
for (int i = 0; i < gradient.size(); ++i)
gnorm += gradient[i] * gradient[i];
cerr << " GNORM=" << sqrt(gnorm) << endl;
- vector<double> old = lambdas;
+ vector<weight_t> old = lambdas;
int c = 0;
while (old == lambdas) {
++c;
@@ -387,9 +349,8 @@ int main(int argc, char** argv) {
assert(c < 5);
}
old.clear();
- SanityCheck(lambdas);
- ShowLargestFeatures(lambdas);
- weights.InitFromVector(lambdas);
+ Weights::SanityCheck(lambdas);
+ Weights::ShowLargestFeatures(lambdas);
converged = o->HasConverged();
if (converged) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; }
@@ -399,7 +360,7 @@ int main(int argc, char** argv) {
ostringstream vv;
vv << "Objective = " << objective << " (eval count=" << o->EvaluationCount() << ")";
const string svv = vv.str();
- weights.WriteToFile(fname, true, &svv);
+ Weights::WriteToFile(fname, lambdas, true, &svv);
} // rank == 0
int cint = converged;
#ifdef HAVE_MPI
@@ -411,3 +372,4 @@ int main(int argc, char** argv) {
}
return 0;
}
+
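
(The new --per_sentence_grammar_scratch / -P option assumes each training line is an SGML seg tag carrying a grammar attribute; MovePerSentenceGrammars above copies each referenced grammar into a per-shard psg.* subdirectory under the scratch root, named by an MD5 of the sentence, and rewrites the attribute to point there. A hypothetical line before and after the rewrite; all paths and the hash are invented for illustration:

    <seg id="7" grammar="/corpus/psg/grammar.7.gz"> la maison bleue </seg>
    <seg id="7" grammar="/fast/scratch/psg.0_of_32/3f8e...c1a9.gz"> la maison bleue </seg>

The point of the copy is fast repeated access across optimizer iterations when the scratch root is a RAM disk, as the option's help text suggests.)
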
diff --git a/training/compute_cllh.cc b/training/mpi_compute_cllh.cc
index 332f6d0c..d5caa745 100644
--- a/training/compute_cllh.cc
+++ b/training/mpi_compute_cllh.cc
@@ -1,6 +1,4 @@
-#include <sstream>
#include <iostream>
-#include <fstream>
#include <vector>
#include <cassert>
#include <cmath>
@@ -12,6 +10,7 @@
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
+#include "sentence_metadata.h"
#include "verbose.h"
#include "hg.h"
#include "prob.h"
@@ -52,7 +51,8 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
return true;
}
-void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* ids) {
+void ReadInstances(const string& fname, int rank, int size, vector<string>* c) {
+ assert(fname != "-");
ReadFile rf(fname);
istream& in = *rf.stream();
string line;
@@ -60,20 +60,16 @@ void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>*
while(in) {
getline(in, line);
if (!in) break;
- if (lc % size == rank) {
- c->push_back(line);
- ids->push_back(lc);
- }
+ if (lc % size == rank) c->push_back(line);
++lc;
}
}
static const double kMINUS_EPSILON = -1e-6;
-struct TrainingObserver : public DecoderObserver {
- void Reset() {
- acc_obj = 0;
- }
+struct ConditionalLikelihoodObserver : public DecoderObserver {
+
+ ConditionalLikelihoodObserver() : trg_words(), acc_obj(), cur_obj() {}
virtual void NotifyDecodingStart(const SentenceMetadata&) {
cur_obj = 0;
@@ -120,8 +116,10 @@ struct TrainingObserver : public DecoderObserver {
}
assert(!isnan(log_ref_z));
acc_obj += (cur_obj - log_ref_z);
+ trg_words += smeta.GetReference().size();
}
+ unsigned trg_words;
double acc_obj;
double cur_obj;
int state;
@@ -148,15 +146,6 @@ int main(int argc, char** argv) {
if (!InitCommandLine(argc, argv, &conf))
return false;
- // load initial weights
- Weights weights;
- if (conf.count("weights"))
- weights.InitFromFile(conf["weights"].as<string>());
-
- // freeze feature set
- //const bool freeze_feature_set = conf.count("freeze_feature_set");
- //if (freeze_feature_set) FD::Freeze();
-
// load cdec.ini and set up decoder
ReadFile ini_rf(conf["decoder_config"].as<string>());
Decoder decoder(ini_rf.stream());
@@ -165,35 +154,38 @@ int main(int argc, char** argv) {
abort();
}
- vector<string> corpus; vector<int> ids;
- ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus, &ids);
- assert(corpus.size() > 0);
- assert(corpus.size() == ids.size());
+ // load weights
+ vector<weight_t>& weights = decoder.CurrentWeightVector();
+ if (conf.count("weights"))
+ Weights::InitFromFile(conf["weights"].as<string>(), &weights);
- vector<double> wv;
- weights.InitVector(&wv);
- decoder.SetWeights(wv);
- TrainingObserver observer;
- double objective = 0;
- bool converged = false;
+ vector<string> corpus;
+ ReadInstances(conf["training_data"].as<string>(), rank, size, &corpus);
+ assert(corpus.size() > 0);
- observer.Reset();
if (rank == 0)
- cerr << "Each processor is decoding " << corpus.size() << " training examples...\n";
+ cerr << "Each processor is decoding ~" << corpus.size() << " training examples...\n";
- for (int i = 0; i < corpus.size(); ++i) {
- decoder.SetId(ids[i]);
+ ConditionalLikelihoodObserver observer;
+ for (int i = 0; i < corpus.size(); ++i)
decoder.Decode(corpus[i], &observer);
- }
+ double objective = 0;
+ unsigned total_words = 0;
#ifdef HAVE_MPI
reduce(world, observer.acc_obj, objective, std::plus<double>(), 0);
+ reduce(world, observer.trg_words, total_words, std::plus<unsigned>(), 0);
#else
objective = observer.acc_obj;
#endif
- if (rank == 0)
- cout << "OBJECTIVE: " << objective << endl;
+ if (rank == 0) {
+ cout << "CONDITIONAL LOG_e LIKELIHOOD: " << objective << endl;
+ cout << "CONDITIONAL LOG_2 LIKELIHOOD: " << (objective/log(2)) << endl;
+ cout << " CONDITIONAL ENTROPY: " << (objective/log(2) / total_words) << endl;
+ cout << " PERPLEXITY: " << pow(2, (objective/log(2) / total_words)) << endl;
+ }
return 0;
}
+
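
(The expanded report at the end of mpi_compute_cllh.cc derives all four printed numbers from the reduced objective and total_words, the reference target-word count. Restating the code's arithmetic, with L = objective and N = total_words:

    \log_2\text{-likelihood} = \frac{L}{\ln 2}, \qquad
    H = \frac{L}{N \ln 2} \ \text{(bits per target word)}, \qquad
    \text{perplexity} = 2^{H}

These are the LOG_2 LIKELIHOOD, CONDITIONAL ENTROPY, and PERPLEXITY lines in the hunk above.)
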
diff --git a/training/mpi_extract_features.cc b/training/mpi_extract_features.cc
new file mode 100644
index 00000000..6750aa15
--- /dev/null
+++ b/training/mpi_extract_features.cc
@@ -0,0 +1,151 @@
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <cassert>
+
+#include "config.h"
+#ifdef HAVE_MPI
+#include <boost/mpi.hpp>
+#endif
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "ff_register.h"
+#include "verbose.h"
+#include "filelib.h"
+#include "fdict.h"
+#include "decoder.h"
+#include "weights.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("training_data,t",po::value<string>(),"Training data corpus")
+ ("decoder_config,c",po::value<string>(),"Decoder configuration file")
+ ("weights,w", po::value<string>(), "(Optional) weights file; weights may affect what features are encountered in pruning configurations")
+ ("output_prefix,o",po::value<string>()->default_value("features"),"Output path prefix");
+ po::options_description clo("Command line options");
+ clo.add_options()
+ ("config", po::value<string>(), "Configuration file")
+ ("help,h", "Print this help message and exit");
+ po::options_description dconfig_options, dcmdline_options;
+ dconfig_options.add(opts);
+ dcmdline_options.add(opts).add(clo);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("config")) {
+ ifstream config((*conf)["config"].as<string>().c_str());
+ po::store(po::parse_config_file(config, dconfig_options), *conf);
+ }
+ po::notify(*conf);
+
+ if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) {
+ cerr << "Decode an input set (optionally in parallel using MPI) and write\nout the feature strings encountered.\n";
+ cerr << dcmdline_options << endl;
+ return false;
+ }
+ return true;
+}
+
+void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c) {
+ ReadFile rf(fname);
+ istream& in = *rf.stream();
+ string line;
+ int lc = 0;
+ while(in) {
+ getline(in, line);
+ if (!in) break;
+ if (lc % size == rank) c->push_back(line);
+ ++lc;
+ }
+}
+
+static const double kMINUS_EPSILON = -1e-6;
+
+struct TrainingObserver : public DecoderObserver {
+
+ virtual void NotifyDecodingStart(const SentenceMetadata&) {
+ }
+
+ // compute model expectations, denominator of objective
+ virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) {
+ }
+
+ // compute "empirical" expectations, numerator of objective
+ virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+ }
+};
+
+#ifdef HAVE_MPI
+namespace mpi = boost::mpi;
+#endif
+
+int main(int argc, char** argv) {
+#ifdef HAVE_MPI
+ mpi::environment env(argc, argv);
+ mpi::communicator world;
+ const int size = world.size();
+ const int rank = world.rank();
+#else
+ const int size = 1;
+ const int rank = 0;
+#endif
+ if (size > 1) SetSilent(true); // turn off verbose decoder output
+ register_feature_functions();
+
+ po::variables_map conf;
+ if (!InitCommandLine(argc, argv, &conf))
+ return false;
+
+ // load cdec.ini and set up decoder
+ ReadFile ini_rf(conf["decoder_config"].as<string>());
+ Decoder decoder(ini_rf.stream());
+ if (decoder.GetConf()["input"].as<string>() != "-") {
+ cerr << "cdec.ini must not set an input file\n";
+ abort();
+ }
+
+ if (FD::UsingPerfectHashFunction()) {
+ cerr << "Your configuration file has enabled a cmph hash function. Please disable.\n";
+ return 1;
+ }
+
+ // load optional weights
+ if (conf.count("weights"))
+ Weights::InitFromFile(conf["weights"].as<string>(), &decoder.CurrentWeightVector());
+
+ vector<string> corpus;
+ ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
+ assert(corpus.size() > 0);
+
+ TrainingObserver observer;
+
+ if (rank == 0)
+ cerr << "Each processor is decoding ~" << corpus.size() << " training examples...\n";
+
+ for (int i = 0; i < corpus.size(); ++i)
+ decoder.Decode(corpus[i], &observer);
+
+ {
+ ostringstream os;
+ os << conf["output_prefix"].as<string>() << '.' << rank << "_of_" << size;
+ WriteFile wf(os.str());
+ ostream& out = *wf.stream();
+ const unsigned num_feats = FD::NumFeats();
+ for (unsigned i = 1; i < num_feats; ++i) {
+ out << FD::Convert(i) << endl;
+ }
+ cerr << "Wrote " << os.str() << endl;
+ }
+
+#ifdef HAVE_MPI
+ world.barrier();
+#else
+#endif
+
+ return 0;
+}
+
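
(mpi_extract_features.cc writes the feature strings interned during decoding, one per line, to <output_prefix>.<rank>_of_<size>; the loop above starts at index 1, skipping FD's index 0. For example, a 4-process run with the default -o features would leave features.0_of_4 through features.3_of_4, each holding lines like the following, with the feature names purely illustrative:

    LanguageModel
    PhraseModel_0
    Glue

One such file per shard makes it straightforward to merge and deduplicate the global feature set afterwards.)
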
diff --git a/training/mpi_extract_reachable.cc b/training/mpi_extract_reachable.cc
new file mode 100644
index 00000000..2a7c2b9d
--- /dev/null
+++ b/training/mpi_extract_reachable.cc
@@ -0,0 +1,163 @@
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <cassert>
+
+#include "config.h"
+#ifdef HAVE_MPI
+#include <boost/mpi.hpp>
+#endif
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "ff_register.h"
+#include "verbose.h"
+#include "filelib.h"
+#include "fdict.h"
+#include "decoder.h"
+#include "weights.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("training_data,t",po::value<string>(),"Training data corpus")
+ ("decoder_config,c",po::value<string>(),"Decoder configuration file")
+ ("weights,w", po::value<string>(), "(Optional) weights file; weights may affect what features are encountered in pruning configurations")
+ ("output_prefix,o",po::value<string>()->default_value("reachable"),"Output path prefix");
+ po::options_description clo("Command line options");
+ clo.add_options()
+ ("config", po::value<string>(), "Configuration file")
+ ("help,h", "Print this help message and exit");
+ po::options_description dconfig_options, dcmdline_options;
+ dconfig_options.add(opts);
+ dcmdline_options.add(opts).add(clo);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("config")) {
+ ifstream config((*conf)["config"].as<string>().c_str());
+ po::store(po::parse_config_file(config, dconfig_options), *conf);
+ }
+ po::notify(*conf);
+
+ if (conf->count("help") || !conf->count("training_data") || !conf->count("decoder_config")) {
+ cerr << "Decode an input set (optionally in parallel using MPI) and write\nout the inputs that produce reachable parallel parses.\n";
+ cerr << dcmdline_options << endl;
+ return false;
+ }
+ return true;
+}
+
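+// Shard the corpus round-robin across processes: line i is kept by the rank
+// with i % size == rank, so e.g. with 4 processes rank 1 reads lines 1, 5, 9, ...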
+void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c) {
+ ReadFile rf(fname);
+ istream& in = *rf.stream();
+ string line;
+ int lc = 0;
+ while(in) {
+ getline(in, line);
+ if (!in) break;
+ if (lc % size == rank) c->push_back(line);
+ ++lc;
+ }
+}
+
+static const double kMINUS_EPSILON = -1e-6;
+
+struct ReachabilityObserver : public DecoderObserver {
+
+ virtual void NotifyDecodingStart(const SentenceMetadata&) {
+ reachable = false;
+ }
+
+  // the unconstrained translation forest says nothing about reachability
+  virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph*) {
+  }
+
+  // the decoder builds an alignment forest only when the reference is
+  // reachable, so its mere existence is the signal we need
+  virtual void NotifyAlignmentForest(const SentenceMetadata&, Hypergraph*) {
+    reachable = true;
+  }
+
+ bool reachable;
+};
+
+#ifdef HAVE_MPI
+namespace mpi = boost::mpi;
+#endif
+
+int main(int argc, char** argv) {
+#ifdef HAVE_MPI
+ mpi::environment env(argc, argv);
+ mpi::communicator world;
+ const int size = world.size();
+ const int rank = world.rank();
+#else
+ const int size = 1;
+ const int rank = 0;
+#endif
+ if (size > 1) SetSilent(true); // turn off verbose decoder output
+ register_feature_functions();
+
+ po::variables_map conf;
+ if (!InitCommandLine(argc, argv, &conf))
+    return 1;
+
+ // load cdec.ini and set up decoder
+ ReadFile ini_rf(conf["decoder_config"].as<string>());
+ Decoder decoder(ini_rf.stream());
+ if (decoder.GetConf()["input"].as<string>() != "-") {
+ cerr << "cdec.ini must not set an input file\n";
+ abort();
+ }
+
+ if (FD::UsingPerfectHashFunction()) {
+ cerr << "Your configuration file has enabled a cmph hash function. Please disable.\n";
+ return 1;
+ }
+
+ // load optional weights
+ if (conf.count("weights"))
+ Weights::InitFromFile(conf["weights"].as<string>(), &decoder.CurrentWeightVector());
+
+ vector<string> corpus;
+ ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
+ assert(corpus.size() > 0);
+
+ if (rank == 0)
+ cerr << "Each processor is decoding ~" << corpus.size() << " training examples...\n";
+
+ size_t num_reached = 0;
+ {
+ ostringstream os;
+ os << conf["output_prefix"].as<string>() << '.' << rank << "_of_" << size;
+ WriteFile wf(os.str());
+ ostream& out = *wf.stream();
+ ReachabilityObserver observer;
+ for (int i = 0; i < corpus.size(); ++i) {
+ decoder.Decode(corpus[i], &observer);
+ if (observer.reachable) {
+ out << corpus[i] << endl;
+ ++num_reached;
+ }
+ corpus[i].clear();
+ }
+ cerr << "Shard " << rank << '/' << size << " finished, wrote "
+ << num_reached << " instances to " << os.str() << endl;
+ }
+
+ size_t total = 0;
+#ifdef HAVE_MPI
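+  // sum the per-shard counts onto rank 0 with a Boost.MPI reduction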
+  reduce(world, num_reached, total, std::plus<size_t>(), 0);
+#else
+ total = num_reached;
+#endif
+ if (rank == 0) {
+ cerr << "-----------------------------------------\n";
+ cerr << "TOTAL = " << total << " instances\n";
+ }
+ return 0;
+}
+
diff --git a/training/mpi_flex_optimize.cc b/training/mpi_flex_optimize.cc
new file mode 100644
index 00000000..87c5f331
--- /dev/null
+++ b/training/mpi_flex_optimize.cc
@@ -0,0 +1,346 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+
+#include <boost/shared_ptr.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "stringlib.h"
+#include "verbose.h"
+#include "hg.h"
+#include "prob.h"
+#include "inside_outside.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "optimize.h"
+#include "fdict.h"
+#include "weights.h"
+#include "sparse_vector.h"
+#include "sampler.h"
+
+#ifdef HAVE_MPI
+#include <boost/mpi/timer.hpp>
+#include <boost/mpi.hpp>
+namespace mpi = boost::mpi;
+#endif
+
+using namespace std;
+namespace po = boost::program_options;
+
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("cdec_config,c",po::value<string>(),"Decoder configuration file")
+ ("weights,w",po::value<string>(),"Initial feature weights")
+ ("training_data,d",po::value<string>(),"Training data")
+ ("minibatch_size_per_proc,s", po::value<unsigned>()->default_value(6), "Number of training instances evaluated per processor in each minibatch")
+ ("optimization_method,m", po::value<string>()->default_value("lbfgs"), "Optimization method (options: lbfgs, sgd, rprop)")
+ ("minibatch_iterations,i", po::value<unsigned>()->default_value(10), "Number of optimization iterations per minibatch (1 = standard SGD)")
+ ("iterations,I", po::value<unsigned>()->default_value(50), "Number of passes through the training data before termination")
+ ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
+ ("lbfgs_memory_buffers,M", po::value<unsigned>()->default_value(10), "Number of memory buffers for LBFGS history")
+ ("eta_0,e", po::value<double>()->default_value(0.1), "Initial learning rate for SGD")
+ ("L1,1","Use L1 regularization")
+ ("L2,2","Use L2 regularization")
+ ("regularization_strength,C", po::value<double>()->default_value(1.0), "Regularization strength (C)");
+ po::options_description clo("Command line options");
+ clo.add_options()
+ ("config", po::value<string>(), "Configuration file")
+ ("help,h", "Print this help message and exit");
+ po::options_description dconfig_options, dcmdline_options;
+ dconfig_options.add(opts);
+ dcmdline_options.add(opts).add(clo);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("config")) {
+ ifstream config((*conf)["config"].as<string>().c_str());
+ po::store(po::parse_config_file(config, dconfig_options), *conf);
+ }
+ po::notify(*conf);
+
+ if (conf->count("help") || !conf->count("training_data") || !conf->count("cdec_config")) {
+ cerr << "General-purpose minibatch online optimizer (MPI support "
+#ifdef HAVE_MPI
+ << "enabled"
+#else
+ << "not enabled"
+#endif
+ << ")\n" << dcmdline_options << endl;
+ return false;
+ }
+ return true;
+}
+
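+// Round-robin sharding as in the other MPI tools; `order` additionally records
+// each kept sentence's original line number so the decoder can be given the
+// global sentence id (see decoder.SetId below).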
+void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* order) {
+ ReadFile rf(fname);
+ istream& in = *rf.stream();
+ string line;
+ int id = 0;
+ while(in) {
+ getline(in, line);
+ if (!in) break;
+ if (id % size == rank) {
+ c->push_back(line);
+ order->push_back(id);
+ }
+ ++id;
+ }
+}
+
+static const double kMINUS_EPSILON = -1e-6;
+
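+// Copies the decoder's two forests out for later reuse: the unconstrained
+// translation forest (denominator of the conditional likelihood) and the
+// reference-constrained "gold" forest (numerator). `state` tracks progress
+// (1 = decoding started, 2 = model forest copied, 3 = gold forest copied);
+// if the gold forest is never produced, both copies are cleared so the
+// sentence is skipped during optimization.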
+struct CopyHGsObserver : public DecoderObserver {
+ Hypergraph* hg_;
+ Hypergraph* gold_hg_;
+
+ // this can free up some memory
+ void RemoveRules(Hypergraph* h) {
+ for (unsigned i = 0; i < h->edges_.size(); ++i)
+ h->edges_[i].rule_.reset();
+ }
+
+ void SetCurrentHypergraphs(Hypergraph* h, Hypergraph* gold_h) {
+ hg_ = h;
+ gold_hg_ = gold_h;
+ }
+
+ virtual void NotifyDecodingStart(const SentenceMetadata&) {
+ state = 1;
+ }
+
+ // compute model expectations, denominator of objective
+ virtual void NotifyTranslationForest(const SentenceMetadata&, Hypergraph* hg) {
+ *hg_ = *hg;
+ RemoveRules(hg_);
+ assert(state == 1);
+ state = 2;
+ }
+
+ // compute "empirical" expectations, numerator of objective
+ virtual void NotifyAlignmentForest(const SentenceMetadata&, Hypergraph* hg) {
+ assert(state == 2);
+ state = 3;
+ *gold_hg_ = *hg;
+ RemoveRules(gold_hg_);
+ }
+
+  virtual void NotifyDecodingComplete(const SentenceMetadata&) {
+    if (state != 3) {  // the gold forest never arrived; drop this sentence
+      hg_->clear();
+      gold_hg_->clear();
+    }
+  }
+
+ int state;
+};
+
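+// Buffer the decoder configuration in memory (ReadFile transparently handles
+// compressed files) so it can be handed to the Decoder constructor as a
+// plain istream.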
+void ReadConfig(const string& ini, istringstream* out) {
+ ReadFile rf(ini);
+ istream& in = *rf.stream();
+ ostringstream os;
+ while(in) {
+ string line;
+ getline(in, line);
+    if (!in) break;
+ os << line << endl;
+ }
+ out->str(os.str());
+}
+
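+// Declaring the reduction operator commutative lets Boost.MPI combine the
+// per-rank gradient vectors in any order, enabling a more efficient
+// reduction tree.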
+#ifdef HAVE_MPI
+namespace boost { namespace mpi {
+ template<>
+ struct is_commutative<std::plus<SparseVector<double> >, SparseVector<double> >
+ : mpl::true_ { };
+} } // end namespace boost::mpi
+#endif
+
+void AddGrad(const SparseVector<prob_t>& x, double s, SparseVector<double>* acc) {
+ for (SparseVector<prob_t>::const_iterator it = x.begin(); it != x.end(); ++it)
+ acc->add_value(it->first, it->second.as_float() * s);
+}
+
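+// The quantity minimized below is the negative conditional log-likelihood
+//   obj = sum_i [ log Z(x_i) - log Z_gold(x_i) ],
+// whose gradient is E_model[f] - E_gold[f]; this is why AddGrad accumulates
+// model expectations with s = +1 and gold expectations with s = -1.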
+int main(int argc, char** argv) {
+#ifdef HAVE_MPI
+ mpi::environment env(argc, argv);
+ mpi::communicator world;
+ const int size = world.size();
+ const int rank = world.rank();
+#else
+ const int size = 1;
+ const int rank = 0;
+#endif
+ if (size > 1) SetSilent(true); // turn off verbose decoder output
+ register_feature_functions();
+ MT19937* rng = NULL;
+
+ po::variables_map conf;
+ if (!InitCommandLine(argc, argv, &conf))
+ return 1;
+
+ boost::shared_ptr<BatchOptimizer> o;
+ const unsigned lbfgs_memory_buffers = conf["lbfgs_memory_buffers"].as<unsigned>();
+
+ istringstream ins;
+ ReadConfig(conf["cdec_config"].as<string>(), &ins);
+ Decoder decoder(&ins);
+
+ // load initial weights
+ vector<weight_t> init_weights;
+ if (conf.count("weights"))
+ Weights::InitFromFile(conf["weights"].as<string>(), &init_weights);
+
+ vector<string> corpus;
+ vector<int> ids;
+ ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus, &ids);
+ assert(corpus.size() > 0);
+
+ const unsigned size_per_proc = conf["minibatch_size_per_proc"].as<unsigned>();
+ if (size_per_proc > corpus.size()) {
+    cerr << "Minibatch size must not exceed the per-process corpus size!\n";
+ return 1;
+ }
+
+ size_t total_corpus_size = 0;
+#ifdef HAVE_MPI
+ reduce(world, corpus.size(), total_corpus_size, std::plus<size_t>(), 0);
+#else
+ total_corpus_size = corpus.size();
+#endif
+
+ if (conf.count("random_seed"))
+ rng = new MT19937(conf["random_seed"].as<uint32_t>());
+ else
+ rng = new MT19937;
+
+  const unsigned minibatch_iterations = conf["minibatch_iterations"].as<unsigned>();
+  const unsigned max_iteration = conf["iterations"].as<unsigned>();
+
+  if (rank == 0) {
+    cerr << "Total corpus size: " << total_corpus_size << endl;
+    cerr << "Minibatch size: " << (size_per_proc * size) << " instances\n";
+  }
+
+ SparseVector<double> x;
+ Weights::InitSparseVector(init_weights, &x);
+ CopyHGsObserver observer;
+
+ int write_weights_every_ith = 100; // TODO configure
+ int titer = -1;
+
+ vector<weight_t>& lambdas = decoder.CurrentWeightVector();
+ lambdas.swap(init_weights);
+ init_weights.clear();
+
+ int iter = -1;
+ bool converged = false;
+ while (!converged) {
+#ifdef HAVE_MPI
+ mpi::timer timer;
+#endif
+ x.init_vector(&lambdas);
+    ++iter; ++titer;
+    converged = (iter >= max_iteration);  // deterministic: iter advances identically on every rank
+#if 0
+ if (rank == 0) {
+ converged = (iter == max_iteration);
+ Weights::SanityCheck(lambdas);
+ Weights::ShowLargestFeatures(lambdas);
+ string fname = "weights.cur.gz";
+ if (iter % write_weights_every_ith == 0) {
+ ostringstream o; o << "weights.epoch_" << (ai+1) << '.' << iter << ".gz";
+ fname = o.str();
+ }
+ if (converged && ((ai+1)==agenda.size())) { fname = "weights.final.gz"; }
+ ostringstream vv;
+ vv << "total iter=" << titer << " (of current config iter=" << iter << ") minibatch=" << size_per_proc << " sentences/proc x " << size << " procs. num_feats=" << x.size() << '/' << FD::NumFeats() << " passes_thru_data=" << (titer * size_per_proc / static_cast<double>(corpus.size())) << " eta=" << lr->eta(titer);
+ const string svv = vv.str();
+ cerr << svv << endl;
+ Weights::WriteToFile(fname, lambdas, true, &svv);
+ }
+#endif
+
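+    // Sample a minibatch uniformly with replacement and cache both forests for
+    // each sentence; the inner loop below only reweights these cached forests
+    // and re-runs inside-outside, so nothing is re-decoded within a minibatch.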
+ vector<Hypergraph> hgs(size_per_proc);
+ vector<Hypergraph> gold_hgs(size_per_proc);
+    for (unsigned i = 0; i < size_per_proc; ++i) {
+ int ei = corpus.size() * rng->next();
+ int id = ids[ei];
+ observer.SetCurrentHypergraphs(&hgs[i], &gold_hgs[i]);
+ decoder.SetId(id);
+ decoder.Decode(corpus[ei], &observer);
+ }
+
+ SparseVector<double> local_grad, g;
+ double local_obj = 0;
+ o.reset();
+ for (unsigned mi = 0; mi < minibatch_iterations; ++mi) {
+ local_grad.clear();
+ g.clear();
+ local_obj = 0;
+
+ for (unsigned i = 0; i < size_per_proc; ++i) {
+ Hypergraph& hg = hgs[i];
+ Hypergraph& hg_gold = gold_hgs[i];
+ if (hg.edges_.size() < 2) continue;
+
+ hg.Reweight(lambdas);
+ hg_gold.Reweight(lambdas);
+ SparseVector<prob_t> model_exp, gold_exp;
+ const prob_t z = InsideOutside<prob_t,
+ EdgeProb,
+ SparseVector<prob_t>,
+ EdgeFeaturesAndProbWeightFunction>(hg, &model_exp);
+ local_obj += log(z);
+ model_exp /= z;
+ AddGrad(model_exp, 1.0, &local_grad);
+ model_exp.clear();
+
+ const prob_t goldz = InsideOutside<prob_t,
+ EdgeProb,
+ SparseVector<prob_t>,
+ EdgeFeaturesAndProbWeightFunction>(hg_gold, &gold_exp);
+ local_obj -= log(goldz);
+
+ if (log(z) - log(goldz) < kMINUS_EPSILON) {
+ cerr << "DIFF. ERR! log_model_z < log_gold_z: " << log(z) << " " << log(goldz) << endl;
+ return 1;
+ }
+
+ gold_exp /= goldz;
+ AddGrad(gold_exp, -1.0, &local_grad);
+ }
+
+ double obj = 0;
+#ifdef HAVE_MPI
+      reduce(world, local_obj, obj, std::plus<double>(), 0);
+      reduce(world, local_grad, g, std::plus<SparseVector<double> >(), 0);
+#else
+ obj = local_obj;
+ g.swap(local_grad);
+#endif
+ local_grad.clear();
+ if (rank == 0) {
+        obj /= (size_per_proc * size);  // keep the objective on the same scale as the gradient
+        g /= (size_per_proc * size);
+ if (!o)
+ o.reset(new LBFGSOptimizer(FD::NumFeats(), lbfgs_memory_buffers));
+ vector<double> gg(FD::NumFeats());
+ if (gg.size() != lambdas.size()) { lambdas.resize(gg.size()); }
+ for (SparseVector<double>::const_iterator it = g.begin(); it != g.end(); ++it)
+ if (it->first) { gg[it->first] = it->second; }
+ cerr << "OBJ: " << obj << endl;
+ o->Optimize(obj, gg, &lambdas);
+ }
+      if (rank == 0) Weights::InitSparseVector(lambdas, &x);  // pack the newly optimized weights
+#ifdef HAVE_MPI
+      broadcast(world, x, 0);
+      broadcast(world, converged, 0);
+      world.barrier();
+      if (rank == 0) { cerr << " ELAPSED TIME THIS ITERATION=" << timer.elapsed() << endl; }
+#endif
+      x.init_vector(&lambdas);  // unpack on every rank so the next reweighting sees the new point
+ }
+ }
+ return 0;
+}
diff --git a/training/mpi_online_optimize.cc b/training/mpi_online_optimize.cc
index 32033c19..993627f0 100644
--- a/training/mpi_online_optimize.cc
+++ b/training/mpi_online_optimize.cc
@@ -9,6 +9,7 @@
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
+#include "stringlib.h"
#include "verbose.h"
#include "hg.h"
#include "prob.h"
@@ -31,35 +32,6 @@ namespace mpi = boost::mpi;
using namespace std;
namespace po = boost::program_options;
-void SanityCheck(const vector<double>& w) {
- for (int i = 0; i < w.size(); ++i) {
- assert(!isnan(w[i]));
- assert(!isinf(w[i]));
- }
-}
-
-struct FComp {
- const vector<double>& w_;
- FComp(const vector<double>& w) : w_(w) {}
- bool operator()(int a, int b) const {
- return fabs(w_[a]) > fabs(w_[b]);
- }
-};
-
-void ShowLargestFeatures(const vector<double>& w) {
- vector<int> fnums(w.size());
- for (int i = 0; i < w.size(); ++i)
- fnums[i] = i;
- vector<int>::iterator mid = fnums.begin();
- mid += (w.size() > 10 ? 10 : w.size());
- partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
- cerr << "TOP FEATURES:";
- for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {
- cerr << ' ' << FD::Convert(*i) << '=' << w[*i];
- }
- cerr << endl;
-}
-
bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
po::options_description opts("Configuration options");
opts.add_options()
@@ -123,7 +95,7 @@ struct TrainingObserver : public DecoderObserver {
void SetLocalGradientAndObjective(vector<double>* g, double* o) const {
*o = acc_obj;
for (SparseVector<prob_t>::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it)
- (*g)[it->first] = it->second;
+ (*g)[it->first] = it->second.as_float();
}
virtual void NotifyDecodingStart(const SentenceMetadata& smeta) {
@@ -187,7 +159,7 @@ struct TrainingObserver : public DecoderObserver {
void GetGradient(SparseVector<double>* g) const {
g->clear();
for (SparseVector<prob_t>::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it)
- g->set_value(it->first, it->second);
+ g->set_value(it->first, it->second.as_float());
}
int total_complete;
@@ -233,6 +205,7 @@ bool LoadAgenda(const string& file, vector<pair<string, int> >* a) {
}
int main(int argc, char** argv) {
+ cerr << "THIS SOFTWARE IS DEPRECATED. YOU SHOULD USE mpi_flex_optimize\n";
#ifdef HAVE_MPI
mpi::environment env(argc, argv);
mpi::communicator world;
@@ -250,10 +223,25 @@ int main(int argc, char** argv) {
if (!InitCommandLine(argc, argv, &conf))
return 1;
+ vector<pair<string, int> > agenda;
+ if (!LoadAgenda(conf["training_agenda"].as<string>(), &agenda))
+ return 1;
+ if (rank == 0)
+ cerr << "Loaded agenda defining " << agenda.size() << " training epochs\n";
+
+ assert(agenda.size() > 0);
+
+ if (1) { // hack to load the feature hash functions -- TODO this should not be in cdec.ini
+ const string& cur_config = agenda[0].first;
+ ReadFile ini_rf(cur_config);
+ Decoder decoder(ini_rf.stream());
+ }
+
// load initial weights
- Weights weights;
+ vector<weight_t> init_weights;
if (conf.count("input_weights"))
- weights.InitFromFile(conf["input_weights"].as<string>());
+ Weights::InitFromFile(conf["input_weights"].as<string>(), &init_weights);
vector<int> frozen_fids;
if (conf.count("frozen_features")) {
@@ -310,19 +298,12 @@ int main(int argc, char** argv) {
rng.reset(new MT19937);
SparseVector<double> x;
- weights.InitSparseVector(&x);
+ Weights::InitSparseVector(init_weights, &x);
TrainingObserver observer;
int write_weights_every_ith = 100; // TODO configure
int titer = -1;
- vector<pair<string, int> > agenda;
- if (!LoadAgenda(conf["training_agenda"].as<string>(), &agenda))
- return 1;
- if (rank == 0)
- cerr << "Loaded agenda defining " << agenda.size() << " training epochs\n";
-
- vector<double> lambdas;
for (int ai = 0; ai < agenda.size(); ++ai) {
const string& cur_config = agenda[ai].first;
const unsigned max_iteration = agenda[ai].second;
@@ -331,6 +312,8 @@ int main(int argc, char** argv) {
// load cdec.ini and set up decoder
ReadFile ini_rf(cur_config);
Decoder decoder(ini_rf.stream());
+ vector<weight_t>& lambdas = decoder.CurrentWeightVector();
+ if (ai == 0) { lambdas.swap(init_weights); init_weights.clear(); }
if (rank == 0)
o->ResetEpoch(); // resets the learning rate-- TODO is this good?
@@ -341,15 +324,13 @@ int main(int argc, char** argv) {
#ifdef HAVE_MPI
mpi::timer timer;
#endif
- weights.InitFromVector(x);
- weights.InitVector(&lambdas);
+ x.init_vector(&lambdas);
++iter; ++titer;
observer.Reset();
- decoder.SetWeights(lambdas);
if (rank == 0) {
converged = (iter == max_iteration);
- SanityCheck(lambdas);
- ShowLargestFeatures(lambdas);
+ Weights::SanityCheck(lambdas);
+ Weights::ShowLargestFeatures(lambdas);
string fname = "weights.cur.gz";
if (iter % write_weights_every_ith == 0) {
ostringstream o; o << "weights.epoch_" << (ai+1) << '.' << iter << ".gz";
@@ -360,7 +341,7 @@ int main(int argc, char** argv) {
vv << "total iter=" << titer << " (of current config iter=" << iter << ") minibatch=" << size_per_proc << " sentences/proc x " << size << " procs. num_feats=" << x.size() << '/' << FD::NumFeats() << " passes_thru_data=" << (titer * size_per_proc / static_cast<double>(corpus.size())) << " eta=" << lr->eta(titer);
const string svv = vv.str();
cerr << svv << endl;
- weights.WriteToFile(fname, true, &svv);
+ Weights::WriteToFile(fname, lambdas, true, &svv);
}
for (int i = 0; i < size_per_proc; ++i) {
diff --git a/training/mr_optimize_reduce.cc b/training/mr_optimize_reduce.cc
index b931991d..15e28fa1 100644
--- a/training/mr_optimize_reduce.cc
+++ b/training/mr_optimize_reduce.cc
@@ -88,25 +88,19 @@ int main(int argc, char** argv) {
const bool use_b64 = conf["input_format"].as<string>() == "b64";
- Weights weights;
- weights.InitFromFile(conf["input_weights"].as<string>());
+ vector<weight_t> lambdas;
+ Weights::InitFromFile(conf["input_weights"].as<string>(), &lambdas);
const string s_obj = "**OBJ**";
int num_feats = FD::NumFeats();
cerr << "Number of features: " << num_feats << endl;
const bool gaussian_prior = conf.count("gaussian_prior");
- vector<double> means(num_feats, 0);
+ vector<weight_t> means(num_feats, 0);
if (conf.count("means")) {
if (!gaussian_prior) {
cerr << "Don't use --means without --gaussian_prior!\n";
exit(1);
}
- Weights wm;
- wm.InitFromFile(conf["means"].as<string>());
- if (num_feats != FD::NumFeats()) {
- cerr << "[ERROR] Means file had unexpected features!\n";
- exit(1);
- }
- wm.InitVector(&means);
+ Weights::InitFromFile(conf["means"].as<string>(), &means);
}
shared_ptr<BatchOptimizer> o;
const string omethod = conf["optimization_method"].as<string>();
@@ -124,8 +118,6 @@ int main(int argc, char** argv) {
cerr << "No state file found, assuming ITERATION 1\n";
}
- vector<double> lambdas(num_feats, 0);
- weights.InitVector(&lambdas);
double objective = 0;
vector<double> gradient(num_feats, 0);
// 0<TAB>**OBJ**=12.2;Feat1=2.3;Feat2=-0.2;
@@ -223,8 +215,7 @@ int main(int argc, char** argv) {
old.clear();
SanityCheck(lambdas);
ShowLargestFeatures(lambdas);
- weights.InitFromVector(lambdas);
- weights.WriteToFile(conf["output_weights"].as<string>(), false);
+ Weights::WriteToFile(conf["output_weights"].as<string>(), lambdas, false);
const bool conv = o->HasConverged();
if (conv) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; }