diff options
author | Michael Denkowski <michael.j.denkowski@gmail.com> | 2012-12-22 16:01:23 -0500 |
---|---|---|
committer | Michael Denkowski <michael.j.denkowski@gmail.com> | 2012-12-22 16:01:23 -0500 |
commit | 597d89c11db53e91bc011eab70fd613bbe6453e8 (patch) | |
tree | 83c87c07d1ff6d3ee4e3b1626f7eddd49c61095b /training | |
parent | 65e958ff2678a41c22be7171456a63f002ef370b (diff) | |
parent | 201af2acd394415a05072fbd53d42584875aa4b4 (diff) |
Merge branch 'master' of git://github.com/redpony/cdec
Diffstat (limited to 'training')
125 files changed, 8483 insertions, 2548 deletions
diff --git a/training/Makefile.am b/training/Makefile.am index f9c25391..e95e045f 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -1,91 +1,11 @@ -bin_PROGRAMS = \ - fast_align \ - lbl_model \ - test_ngram \ - mr_em_map_adapter \ - mr_em_adapted_reduce \ - mr_reduce_to_weights \ - mr_optimize_reduce \ - grammar_convert \ - plftools \ - collapse_weights \ - mpi_extract_reachable \ - mpi_extract_features \ - mpi_online_optimize \ - mpi_flex_optimize \ - mpi_batch_optimize \ - mpi_compute_cllh \ - augment_grammar +SUBDIRS = \ + liblbfgs \ + utils \ + crf \ + minrisk \ + dpmert \ + pro \ + dtrain \ + mira \ + rampion -noinst_PROGRAMS = \ - lbfgs_test \ - optimize_test - -TESTS = lbfgs_test optimize_test - -noinst_LIBRARIES = libtraining.a -libtraining_a_SOURCES = \ - candidate_set.cc \ - entropy.cc \ - optimize.cc \ - online_optimizer.cc \ - risk.cc - -mpi_online_optimize_SOURCES = mpi_online_optimize.cc -mpi_online_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -mpi_flex_optimize_SOURCES = mpi_flex_optimize.cc -mpi_flex_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -mpi_extract_reachable_SOURCES = mpi_extract_reachable.cc -mpi_extract_reachable_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -mpi_extract_features_SOURCES = mpi_extract_features.cc -mpi_extract_features_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - 
-mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc cllh_observer.cc -mpi_batch_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -mpi_compute_cllh_SOURCES = mpi_compute_cllh.cc cllh_observer.cc -mpi_compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -augment_grammar_SOURCES = augment_grammar.cc -augment_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -test_ngram_SOURCES = test_ngram.cc -test_ngram_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -fast_align_SOURCES = fast_align.cc ttables.cc -fast_align_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -lbl_model_SOURCES = lbl_model.cc -lbl_model_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -grammar_convert_SOURCES = grammar_convert.cc -grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -optimize_test_SOURCES = optimize_test.cc -optimize_test_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -collapse_weights_SOURCES = collapse_weights.cc -collapse_weights_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -lbfgs_test_SOURCES = lbfgs_test.cc 
-lbfgs_test_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -mr_optimize_reduce_SOURCES = mr_optimize_reduce.cc -mr_optimize_reduce_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -mr_em_map_adapter_SOURCES = mr_em_map_adapter.cc -mr_em_map_adapter_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -mr_reduce_to_weights_SOURCES = mr_reduce_to_weights.cc -mr_reduce_to_weights_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -mr_em_adapted_reduce_SOURCES = mr_em_adapted_reduce.cc -mr_em_adapted_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -plftools_SOURCES = plftools.cc -plftools_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/utils -I$(top_srcdir)/mteval -I../klm diff --git a/training/add-model1-features-to-scfg.pl b/training/add-model1-features-to-scfg.pl deleted file mode 100755 index a0074317..00000000 --- a/training/add-model1-features-to-scfg.pl +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/perl -w - -# [X] ||| so [X,1] die [X,2] der ||| as [X,1] existing [X,2] the ||| 2.47712135315 2.53182387352 5.07100057602 ||| 0-0 2-2 4-4 -# [X] ||| so [X,1] die [X,2] der ||| this [X,1] the [X,2] of ||| 2.47712135315 3.19828724861 2.38270020485 ||| 0-0 2-2 4-4 -# [X] ||| so [X,1] die [X,2] der ||| as [X,1] the [X,2] the ||| 2.47712135315 2.53182387352 1.48463630676 ||| 0-0 2-2 4-4 -# [X] ||| so [X,1] die [X,2] der ||| is [X,1] the [X,2] of the ||| 2.47712135315 3.45197868347 2.64251494408 ||| 0-0 2-2 4-4 4-5 - -die "Usage: $0 model1.f-e model1.e-f < grammar.scfg\n 
(use trianing/model1 to extract the model files)\n" unless scalar @ARGV == 2; - -my $fm1 = shift @ARGV; -die unless $fm1; -my $frm1 = shift @ARGV; -die unless $frm1; -open M1,"<$fm1" or die; -open RM1,"<$frm1" or die; -print STDERR "Loading Model 1 probs from $fm1...\n"; -my %m1; -while(<M1>) { - chomp; - my ($f, $e, $lp) = split /\s+/; - $m1{$e}->{$f} = exp($lp); -} -close M1; - -print STDERR "Loading Inverse Model 1 probs from $frm1...\n"; -my %rm1; -while(<RM1>) { - chomp; - my ($e, $f, $lp) = split /\s+/; - $rm1{$f}->{$e} = exp($lp); -} -close RM1; - -my @label = qw( EGivenF LexFGivenE LexEGivenF ); -while(<>) { - chomp; - my ($l, $f, $e, $sscores, $al) = split / \|\|\| /; - my @scores = split /\s+/, $sscores; - unless ($sscores =~ /=/) { - for (my $i=0; $i<3; $i++) { $scores[$i] = "$label[$i]=$scores[$i]"; } - } - push @scores, "RuleCount=1"; - my @fs = split /\s+/, $f; - my @es = split /\s+/, $e; - my $flen = scalar @fs; - my $elen = scalar @es; - my $pgen = 0; - my $nongen = 0; - for (my $i =0; $i < $flen; $i++) { - my $ftot = 0; - next if ($fs[$i] =~ /\[X/); - my $cr = $rm1{$fs[$i]}; - for (my $j=0; $j <= $elen; $j++) { - my $ej = '<eps>'; - if ($j < $elen) { $ej = $es[$j]; } - my $p = $cr->{$ej}; - if (defined $p) { $ftot += $p; } - } - if ($ftot == 0) { $nongen = 1; last; } - $pgen += log($ftot) - log($elen); - } - my $bad = 0; - my $good = 0; - unless ($nongen) { push @scores, "RGood=1"; $good++; } else { push @scores, "RBad=1"; $bad++; } - - $nongen = 0; - $pgen = 0; - for (my $i =0; $i < $elen; $i++) { - my $etot = 0; - next if ($es[$i] =~ /\[X/); - my $cr = $m1{$es[$i]}; -# print STDERR "$es[$i]\n"; - for (my $j=0; $j <= $flen; $j++) { - my $fj = '<eps>'; - if ($j < $flen) { $fj = $fs[$j]; } - my $p = $cr->{$fj}; -# print STDERR " $fs[$j] : $p\n"; - if (defined $p) { $etot += $p; } - } - if ($etot == 0) { $nongen = 1; last; } - $pgen += log($etot) - log($flen); - } - unless ($nongen) { - push @scores, "FGood=1"; - if ($good) { push @scores, 
"BothGood=1"; } else { push @scores, "SusDel=1"; } - } else { - push @scores, "FBad=1"; - if ($bad) { push @scores, "BothBad=1"; } else { push @scores, "SusHall=1"; } - } - print "$l ||| $f ||| $e ||| @scores"; - if (defined $al) { print " ||| $al\n"; } else { print "\n"; } -} - diff --git a/training/collapse_weights.cc b/training/collapse_weights.cc deleted file mode 100644 index c03eb031..00000000 --- a/training/collapse_weights.cc +++ /dev/null @@ -1,110 +0,0 @@ -char const* NOTES = - "ZF_and_E means unnormalized scaled features.\n" - "For grammars with one nonterminal: F_and_E is joint,\n" - "F_given_E and E_given_F are conditional.\n" - "TODO: group rules by root nonterminal and then normalize.\n"; - - -#include <iostream> -#include <fstream> -#include <tr1/unordered_map> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> -#include <boost/functional/hash.hpp> - -#include "prob.h" -#include "filelib.h" -#include "trule.h" -#include "weights.h" - -namespace po = boost::program_options; -using namespace std; - -typedef std::tr1::unordered_map<vector<WordID>, prob_t, boost::hash<vector<WordID> > > MarginalMap; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("grammar,g", po::value<string>(), "Grammar file") - ("weights,w", po::value<string>(), "Weights file") - ("unnormalized,u", "Always include ZF_and_E unnormalized score (default: only if sum was >1)") - ; - po::options_description clo("Command line options"); - clo.add_options() - ("config,c", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - const string cfg = (*conf)["config"].as<string>(); - 
cerr << "Configuration file: " << cfg << endl; - ifstream config(cfg.c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || !conf->count("grammar") || !conf->count("weights")) { - cerr << dcmdline_options << endl; - cerr << NOTES << endl; - exit(1); - } -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - const string wfile = conf["weights"].as<string>(); - const string gfile = conf["grammar"].as<string>(); - vector<weight_t> w; - Weights::InitFromFile(wfile, &w); - MarginalMap e_tots; - MarginalMap f_tots; - prob_t tot; - { - ReadFile rf(gfile); - assert(*rf.stream()); - istream& in = *rf.stream(); - cerr << "Computing marginals...\n"; - int lc = 0; - while(in) { - string line; - getline(in, line); - ++lc; - if (line.empty()) continue; - TRule tr(line, true); - if (tr.GetFeatureValues().empty()) - cerr << "Line " << lc << ": empty features - may introduce bias\n"; - prob_t prob; - prob.logeq(tr.GetFeatureValues().dot(w)); - e_tots[tr.e_] += prob; - f_tots[tr.f_] += prob; - tot += prob; - } - } - bool normalized = (fabs(log(tot)) < 0.001); - cerr << "Total: " << tot << (normalized ? 
" [normalized]" : " [scaled]") << endl; - ReadFile rf(gfile); - istream&in = *rf.stream(); - while(in) { - string line; - getline(in, line); - if (line.empty()) continue; - TRule tr(line, true); - const double lp = tr.GetFeatureValues().dot(w); - if (std::isinf(lp)) { continue; } - tr.scores_.clear(); - - cout << tr.AsString() << " ||| F_and_E=" << lp - log(tot); - if (!normalized || conf.count("unnormalized")) { - cout << ";ZF_and_E=" << lp; - } - cout << ";F_given_E=" << lp - log(e_tots[tr.e_]) - << ";E_given_F=" << lp - log(f_tots[tr.f_]) << endl; - } - return 0; -} - diff --git a/training/crf/Makefile.am b/training/crf/Makefile.am new file mode 100644 index 00000000..d203df25 --- /dev/null +++ b/training/crf/Makefile.am @@ -0,0 +1,27 @@ +bin_PROGRAMS = \ + mpi_batch_optimize \ + mpi_compute_cllh \ + mpi_extract_features \ + mpi_extract_reachable \ + mpi_flex_optimize \ + mpi_online_optimize + +mpi_online_optimize_SOURCES = mpi_online_optimize.cc +mpi_online_optimize_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +mpi_flex_optimize_SOURCES = mpi_flex_optimize.cc +mpi_flex_optimize_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +mpi_extract_reachable_SOURCES = mpi_extract_reachable.cc +mpi_extract_reachable_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +mpi_extract_features_SOURCES = mpi_extract_features.cc +mpi_extract_features_LDADD = $(top_srcdir)/decoder/libcdec.a 
$(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc cllh_observer.cc +mpi_batch_optimize_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +mpi_compute_cllh_SOURCES = mpi_compute_cllh.cc cllh_observer.cc +mpi_compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wall -Wno-sign-compare -I$(top_srcdir)/training -I$(top_srcdir)/training/utils -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/training/cllh_observer.cc b/training/crf/cllh_observer.cc index 4ec2fa65..4ec2fa65 100644 --- a/training/cllh_observer.cc +++ b/training/crf/cllh_observer.cc diff --git a/training/cllh_observer.h b/training/crf/cllh_observer.h index 0de47331..0de47331 100644 --- a/training/cllh_observer.h +++ b/training/crf/cllh_observer.h diff --git a/training/mpi_batch_optimize.cc b/training/crf/mpi_batch_optimize.cc index 2eff07e4..2eff07e4 100644 --- a/training/mpi_batch_optimize.cc +++ b/training/crf/mpi_batch_optimize.cc diff --git a/training/mpi_compute_cllh.cc b/training/crf/mpi_compute_cllh.cc index 066389d0..066389d0 100644 --- a/training/mpi_compute_cllh.cc +++ b/training/crf/mpi_compute_cllh.cc diff --git a/training/mpi_extract_features.cc b/training/crf/mpi_extract_features.cc index 6750aa15..6750aa15 100644 --- a/training/mpi_extract_features.cc +++ b/training/crf/mpi_extract_features.cc diff --git a/training/mpi_extract_reachable.cc 
b/training/crf/mpi_extract_reachable.cc index 2a7c2b9d..2a7c2b9d 100644 --- a/training/mpi_extract_reachable.cc +++ b/training/crf/mpi_extract_reachable.cc diff --git a/training/mpi_flex_optimize.cc b/training/crf/mpi_flex_optimize.cc index b52decdc..b52decdc 100644 --- a/training/mpi_flex_optimize.cc +++ b/training/crf/mpi_flex_optimize.cc diff --git a/training/mpi_online_optimize.cc b/training/crf/mpi_online_optimize.cc index d6968848..9e1ae34c 100644 --- a/training/mpi_online_optimize.cc +++ b/training/crf/mpi_online_optimize.cc @@ -5,6 +5,7 @@ #include <cassert> #include <cmath> #include <tr1/memory> +#include <ctime> #include <boost/program_options.hpp> #include <boost/program_options/variables_map.hpp> @@ -41,6 +42,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("training_agenda,a",po::value<string>(), "Text file listing a series of configuration files and the number of iterations to train using each configuration successively") ("minibatch_size_per_proc,s", po::value<unsigned>()->default_value(5), "Number of training instances evaluated per processor in each minibatch") ("optimization_method,m", po::value<string>()->default_value("sgd"), "Optimization method (sgd)") + ("max_walltime", po::value<unsigned>(), "Maximum walltime to run (in minutes)") ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)") ("eta_0,e", po::value<double>()->default_value(0.2), "Initial learning rate for SGD (eta_0)") ("L1,1","Use L1 regularization") @@ -304,6 +306,9 @@ int main(int argc, char** argv) { int write_weights_every_ith = 100; // TODO configure int titer = -1; + unsigned timeout = 0; + if (conf.count("max_walltime")) timeout = 60 * conf["max_walltime"].as<unsigned>(); + const time_t start_time = time(NULL); for (int ai = 0; ai < agenda.size(); ++ai) { const string& cur_config = agenda[ai].first; const unsigned max_iteration = agenda[ai].second; @@ -336,9 +341,14 @@ int main(int argc, char** argv) { 
ostringstream o; o << "weights.epoch_" << (ai+1) << '.' << iter << ".gz"; fname = o.str(); } + const time_t cur_time = time(NULL); + if (timeout) { + if ((cur_time - start_time) > timeout) converged = true; + } if (converged && ((ai+1)==agenda.size())) { fname = "weights.final.gz"; } ostringstream vv; - vv << "total iter=" << titer << " (of current config iter=" << iter << ") minibatch=" << size_per_proc << " sentences/proc x " << size << " procs. num_feats=" << x.size() << '/' << FD::NumFeats() << " passes_thru_data=" << (titer * size_per_proc / static_cast<double>(corpus.size())) << " eta=" << lr->eta(titer); + double minutes = (cur_time - start_time) / 60.0; + vv << "total walltime=" << minutes << "min iter=" << titer << " (of current config iter=" << iter << ") minibatch=" << size_per_proc << " sentences/proc x " << size << " procs. num_feats=" << x.size() << '/' << FD::NumFeats() << " passes_thru_data=" << (titer * size_per_proc / static_cast<double>(corpus.size())) << " eta=" << lr->eta(titer); const string svv = vv.str(); cerr << svv << endl; Weights::WriteToFile(fname, lambdas, true, &svv); diff --git a/training/dep-reorder/conll2reordering-forest.pl b/training/dep-reorder/conll2reordering-forest.pl deleted file mode 100755 index 3cd226be..00000000 --- a/training/dep-reorder/conll2reordering-forest.pl +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my $script_dir; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; } -my $FIRST_CONV = "$script_dir/scripts/conll2simplecfg.pl"; -my $CDEC = "$script_dir/../../decoder/cdec"; - -our $tfile1 = "grammar1.$$"; -our $tfile2 = "text.$$"; - -die "Usage: $0 parses.conll\n" unless scalar @ARGV == 1; -open C, "<$ARGV[0]" or die "Can't read $ARGV[0]: $!"; - -END { unlink $tfile1; unlink "$tfile1.cfg"; unlink $tfile2; } - -my $first = 1; -open T, ">$tfile1" or die "Can't write $tfile1: $!"; -my $lc = 0; -my $flag = 0; -my @words = (); 
-while(<C>) { - print T; - chomp; - if (/^$/) { - if ($first) { $first = undef; } else { if ($flag) { print "\n"; $flag = 0; } } - $first = undef; - close T; - open SO, ">$tfile2" or die "Can't write $tfile2: $!"; - print SO "@words\n"; - close SO; - @words=(); - `$FIRST_CONV < $tfile1 > $tfile1.cfg`; - if ($? != 0) { - die "Error code: $?"; - } - my $cfg = `$CDEC -n -S 10000 -f scfg -g $tfile1.cfg -i $tfile2 --show_cfg_search_space 2>/dev/null`; - if ($? != 0) { - die "Error code: $?"; - } - my @rules = split /\n/, $cfg; - shift @rules; # get rid of output - for my $rule (@rules) { - my ($lhs, $f, $e, $feats) = split / \|\|\| /, $rule; - $f =~ s/,\d\]/\]/g; - $feats = 'TOP=1' unless $feats; - if ($lhs =~ /\[Goal_\d+\]/) { $lhs = '[S]'; } - print "$lhs ||| $f ||| $feats\n"; - if ($e eq '[1] [2]') { - my ($a, $b) = split /\s+/, $f; - $feats =~ s/=1$//; - my ($x, $y) = split /_/, $feats; - print "$lhs ||| $b $a ||| ${y}_$x=1\n"; - } - $flag = 1; - } - open T, ">$tfile1" or die "Can't write $tfile1: $!"; - $lc = -1; - } else { - my ($ind, $word, @dmmy) = split /\s+/; - push @words, $word; - } - $lc++; -} -close T; - diff --git a/training/dep-reorder/george.conll b/training/dep-reorder/george.conll deleted file mode 100644 index 7eebb360..00000000 --- a/training/dep-reorder/george.conll +++ /dev/null @@ -1,4 +0,0 @@ -1 George _ GEORGE _ _ 2 X _ _ -2 hates _ HATES _ _ 0 X _ _ -3 broccoli _ BROC _ _ 2 X _ _ - diff --git a/training/dep-reorder/scripts/conll2simplecfg.pl b/training/dep-reorder/scripts/conll2simplecfg.pl deleted file mode 100755 index b101347a..00000000 --- a/training/dep-reorder/scripts/conll2simplecfg.pl +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -# 1 在 _ 10 _ _ 4 X _ _ -# 2 门厅 _ 3 _ _ 1 X _ _ -# 3 下面 _ 23 _ _ 4 X _ _ -# 4 。 _ 45 _ _ 0 X _ _ - -my @ldeps; -my @rdeps; -@ldeps=(); for (my $i =0; $i <1000; $i++) { push @ldeps, []; } -@rdeps=(); for (my $i =0; $i <1000; $i++) { push @rdeps, []; } -my $rootcat = 0; -my @cats = ('S'); -my 
$len = 0; -my @noposcats = ('S'); -while(<>) { - chomp; - if (/^\s*$/) { - write_cfg($len); - $len = 0; - @cats=('S'); - @noposcats = ('S'); - @ldeps=(); for (my $i =0; $i <1000; $i++) { push @ldeps, []; } - @rdeps=(); for (my $i =0; $i <1000; $i++) { push @rdeps, []; } - next; - } - $len++; - my ($pos, $word, $d1, $xcat, $d2, $d3, $headpos, $deptype) = split /\s+/; - my $cat = "C$xcat"; - my $catpos = $cat . "_$pos"; - push @cats, $catpos; - push @noposcats, $cat; - print "[$catpos] ||| $word ||| $word ||| Word=1\n"; - if ($headpos == 0) { $rootcat = $pos; } - if ($pos < $headpos) { - push @{$ldeps[$headpos]}, $pos; - } else { - push @{$rdeps[$headpos]}, $pos; - } -} - -sub write_cfg { - my $len = shift; - for (my $i = 1; $i <= $len; $i++) { - my @lds = @{$ldeps[$i]}; - for my $ld (@lds) { - print "[$cats[$i]] ||| [$cats[$ld],1] [$cats[$i],2] ||| [1] [2] ||| $noposcats[$ld]_$noposcats[$i]=1\n"; - } - my @rds = @{$rdeps[$i]}; - for my $rd (@rds) { - print "[$cats[$i]] ||| [$cats[$i],1] [$cats[$rd],2] ||| [1] [2] ||| $noposcats[$i]_$noposcats[$rd]=1\n"; - } - } - print "[S] ||| [$cats[$rootcat],1] ||| [1] ||| TOP=1\n"; -} - diff --git a/training/dpmert/Makefile.am b/training/dpmert/Makefile.am new file mode 100644 index 00000000..3dbdfa69 --- /dev/null +++ b/training/dpmert/Makefile.am @@ -0,0 +1,25 @@ +bin_PROGRAMS = \ + mr_dpmert_map \ + mr_dpmert_reduce \ + mr_dpmert_generate_mapper_input + +noinst_PROGRAMS = \ + lo_test +TESTS = lo_test + +mr_dpmert_generate_mapper_input_SOURCES = mr_dpmert_generate_mapper_input.cc line_optimizer.cc +mr_dpmert_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a + +# nbest2hg_SOURCES = nbest2hg.cc +# nbest2hg_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lfst + +mr_dpmert_map_SOURCES = mert_geometry.cc ces.cc error_surface.cc mr_dpmert_map.cc line_optimizer.cc +mr_dpmert_map_LDADD = 
$(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a + +mr_dpmert_reduce_SOURCES = error_surface.cc ces.cc mr_dpmert_reduce.cc line_optimizer.cc mert_geometry.cc +mr_dpmert_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a + +lo_test_SOURCES = lo_test.cc ces.cc mert_geometry.cc error_surface.cc line_optimizer.cc +lo_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a + +AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/training/dpmert/ces.cc b/training/dpmert/ces.cc new file mode 100644 index 00000000..157b2d17 --- /dev/null +++ b/training/dpmert/ces.cc @@ -0,0 +1,90 @@ +#include "ces.h" + +#include <vector> +#include <sstream> +#include <boost/shared_ptr.hpp> + +// TODO, if AER is to be optimized again, we will need this +// #include "aligner.h" +#include "lattice.h" +#include "mert_geometry.h" +#include "error_surface.h" +#include "ns.h" + +using namespace std; + +const bool minimize_segments = true; // if adjacent segments have equal scores, merge them + +void ComputeErrorSurface(const SegmentEvaluator& ss, + const ConvexHull& ve, + ErrorSurface* env, + const EvaluationMetric* metric, + const Hypergraph& hg) { + vector<WordID> prev_trans; + const vector<boost::shared_ptr<MERTPoint> >& ienv = ve.GetSortedSegs(); + env->resize(ienv.size()); + SufficientStats prev_score; // defaults to 0 + int j = 0; + for (unsigned i = 0; i < ienv.size(); ++i) { + const MERTPoint& seg = *ienv[i]; + vector<WordID> trans; +#if 0 + if (type == AER) { + vector<bool> edges(hg.edges_.size(), false); + seg.CollectEdgesUsed(&edges); // get the set of edges in the viterbi + // alignment + ostringstream os; + const string* psrc = ss.GetSource(); + if 
(psrc == NULL) { + cerr << "AER scoring in VEST requires source, but it is missing!\n"; + abort(); + } + size_t pos = psrc->rfind(" ||| "); + if (pos == string::npos) { + cerr << "Malformed source for AER: expected |||\nINPUT: " << *psrc << endl; + abort(); + } + Lattice src; + Lattice ref; + LatticeTools::ConvertTextOrPLF(psrc->substr(0, pos), &src); + LatticeTools::ConvertTextOrPLF(psrc->substr(pos + 5), &ref); + AlignerTools::WriteAlignment(src, ref, hg, &os, true, 0, &edges); + string tstr = os.str(); + TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans); + } else { +#endif + seg.ConstructTranslation(&trans); + //} + //cerr << "Scoring: " << TD::GetString(trans) << endl; + if (trans == prev_trans) { + if (!minimize_segments) { + ErrorSegment& out = (*env)[j]; + out.delta.fields.clear(); + out.x = seg.x; + ++j; + } + //cerr << "Identical translation, skipping scoring\n"; + } else { + SufficientStats score; + ss.Evaluate(trans, &score); + // cerr << "score= " << score->ComputeScore() << "\n"; + //string x1; score.Encode(&x1); cerr << "STATS: " << x1 << endl; + const SufficientStats delta = score - prev_score; + //string x2; delta.Encode(&x2); cerr << "DELTA: " << x2 << endl; + //string xx; delta.Encode(&xx); cerr << xx << endl; + prev_trans.swap(trans); + prev_score = score; + if ((!minimize_segments) || (!delta.IsAdditiveIdentity())) { + ErrorSegment& out = (*env)[j]; + out.delta = delta; + out.x = seg.x; + ++j; + } + } + } + // cerr << " In segments: " << ienv.size() << endl; + // cerr << "Out segments: " << j << endl; + assert(j > 0); + env->resize(j); +} + diff --git a/training/dpmert/ces.h b/training/dpmert/ces.h new file mode 100644 index 00000000..e4fa2080 --- /dev/null +++ b/training/dpmert/ces.h @@ -0,0 +1,16 @@ +#ifndef _CES_H_ +#define _CES_H_ + +class ConvexHull; +class Hypergraph; +class SegmentEvaluator; +class ErrorSurface; +class EvaluationMetric; + +void ComputeErrorSurface(const SegmentEvaluator& ss, + const ConvexHull& 
convex_hull, + ErrorSurface* es, + const EvaluationMetric* metric, + const Hypergraph& hg); + +#endif diff --git a/training/dpmert/divide_refs.py b/training/dpmert/divide_refs.py new file mode 100755 index 00000000..b478f918 --- /dev/null +++ b/training/dpmert/divide_refs.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python +import sys + +(numRefs, outPrefix) = sys.argv[1:] +numRefs = int(numRefs) + +outs = [open(outPrefix+str(i), "w") for i in range(numRefs)] + +i = 0 +for line in sys.stdin: + outs[i].write(line) + i = (i + 1) % numRefs + +for out in outs: + out.close() diff --git a/training/dpmert/dpmert.pl b/training/dpmert/dpmert.pl new file mode 100755 index 00000000..559420f5 --- /dev/null +++ b/training/dpmert/dpmert.pl @@ -0,0 +1,618 @@ +#!/usr/bin/env perl +use strict; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use File::Basename qw(basename); +require "libcall.pl"; + +my $QSUB_CMD = qsub_args(mert_memory()); + +# Default settings +my $srcFile; # deprecated +my $refFiles; # deprecated +my $default_jobs = env_default_jobs(); +my $bin_dir = $SCRIPT_DIR; +my $util_dir = "$SCRIPT_DIR/../utils"; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $MAPINPUT = "$bin_dir/mr_dpmert_generate_mapper_input"; +my $MAPPER = "$bin_dir/mr_dpmert_map"; +my $REDUCER = "$bin_dir/mr_dpmert_reduce"; +my $parallelize = "$util_dir/parallelize.pl"; +my $libcall = "$util_dir/libcall.pl"; +my $sentserver = "$util_dir/sentserver"; +my $sentclient = "$util_dir/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm"; + +my $SCORER = 
$FAST_SCORE; +die "Can't find $MAPPER" unless -x $MAPPER; +my $cdec = "$bin_dir/../../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $lines_per_mapper = 200; +my $rand_directions = 15; +my $iteration = 1; +my $best_weights; +my $max_iterations = 15; +my $optimization_iters = 6; +my $jobs = $default_jobs; # number of decode nodes +my $pmem = "9g"; +my $disable_clean = 0; +my %seen_weights; +my $help = 0; +my $epsilon = 0.0001; +my $last_score = -10000000; +my $metric = "ibm_bleu"; +my $dir; +my $iniFile; +my $weights; +my $initialWeights; +my $bleu_weight=1; +my $use_make = 1; # use make to parallelize line search +my $useqsub; +my $pass_suffix = ''; +my $devset; +# Process command-line options +if (GetOptions( + "config=s" => \$iniFile, + "weights=s" => \$initialWeights, + "devset=s" => \$devset, + "jobs=i" => \$jobs, + "pass-suffix=s" => \$pass_suffix, + "help" => \$help, + "qsub" => \$useqsub, + "iterations=i" => \$max_iterations, + "pmem=s" => \$pmem, + "random-directions=i" => \$rand_directions, + "metric=s" => \$metric, + "source-file=s" => \$srcFile, + "output-dir=s" => \$dir, +) == 0 || @ARGV!=0 || $help) { + print_help(); + exit; +} + +if ($useqsub) { + $use_make = 0; + die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); +} + +my @missing_args = (); +if (defined $srcFile || defined $refFiles) { + die <<EOT; + + The options --ref-files and --source-file are no longer supported. + Please specify the input file and its reference translations with + --devset FILE + +EOT +} + +if (!defined $iniFile) { push @missing_args, "--config"; } +if (!defined $devset) { push @missing_args, "--devset"; } +if (!defined $initialWeights) { push @missing_args, "--weights"; } +die "Please specify missing arguments: " . join (', ', @missing_args) . 
"\nUse --help for more information.\n" if (@missing_args); + +if ($metric =~ /^(combi|ter)$/i) { + $lines_per_mapper = 40; +} elsif ($metric =~ /^meteor$/i) { + $lines_per_mapper = 2000; # start up time is really high for METEOR +} + + +my $nodelist; +my $host =check_output("hostname"); chomp $host; +my $bleu; +my $interval_count = 0; +my $logfile; +my $projected_score; + +# used in sorting scores +my $DIR_FLAG = '-r'; +if ($metric =~ /^ter$|^aer$/i) { + $DIR_FLAG = ''; +} + +unless ($dir){ + $dir = "dpmert"; +} +unless ($dir =~ /^\//){ # convert relative path to absolute path + my $basedir = check_output("pwd"); + chomp $basedir; + $dir = "$basedir/$dir"; +} + + +# Initializations and helper functions +srand; + +my @childpids = (); +my @cleanupcmds = (); + +sub cleanup { + print STDERR "Cleanup...\n"; + for my $pid (@childpids){ unchecked_call("kill $pid"); } + for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } + exit 1; +}; +# Always call cleanup, no matter how we exit +*CORE::GLOBAL::exit = sub{ cleanup(); }; +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + +my $decoderBase = basename($decoder); chomp $decoderBase; +my $newIniFile = "$dir/$decoderBase.ini"; +my $inputFileName = "$dir/input"; +my $user = $ENV{"USER"}; + +# process ini file +-e $iniFile || die "Error: could not open $iniFile for reading\n"; + +sub dirsize { + opendir ISEMPTY,$_[0]; + return scalar(readdir(ISEMPTY))-1; +} +if (-e $dir) { + # allow preexisting logfile, binaries, but not dist-dpmert.pl outputs + die "ERROR: output directory $dir already exists (remove or use --output-dir dir)\n\n"; +} else { + mkdir "$dir" or die "Can't mkdir $dir: $!"; + mkdir "$dir/hgs" or die; + mkdir "$dir/scripts" or die; + print STDERR <<EOT; + DECODER: $decoder + INI FILE: $iniFile + WORKING DIR: $dir + DEVSET: $devset + EVAL METRIC: $metric + MAX ITERATIONS: $max_iterations + PARALLEL JOBS: $jobs + HEAD NODE: $host + PMEM (DECODING): $pmem + INITIAL WEIGHTS: $initialWeights +EOT 
+} + +# Generate initial files and values +check_call("cp $iniFile $newIniFile"); +check_call("cp $initialWeights $dir/weights.0"); +$iniFile = $newIniFile; + +split_devset($devset, "$dir/dev.input.raw", "$dir/dev.refs"); +my $refs = "-r $dir/dev.refs"; +my $newsrc = "$dir/dev.input"; +enseg("$dir/dev.input.raw", $newsrc); +$srcFile = $newsrc; +my $devSize = 0; +open F, "<$srcFile" or die "Can't read $srcFile: $!"; +while(<F>) { $devSize++; } +close F; + +unless($best_weights){ $best_weights = $weights; } +unless($projected_score){ $projected_score = 0.0; } +$seen_weights{$weights} = 1; + +my $random_seed = int(time / 1000); +my $lastWeightsFile; +my $lastPScore = 0; +# main optimization loop +while (1){ + print STDERR "\n\nITERATION $iteration\n==========\n"; + + if ($iteration > $max_iterations){ + print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n"; + last; + } + # iteration-specific files + my $runFile="$dir/run.raw.$iteration"; + my $onebestFile="$dir/1best.$iteration"; + my $logdir="$dir/logs.$iteration"; + my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; + my $scorerLog="$logdir/scorer.log.$iteration"; + check_call("mkdir -p $logdir"); + + + #decode + print STDERR "RUNNING DECODER AT "; + print STDERR unchecked_output("date"); + my $im1 = $iteration - 1; + my $weightsFile="$dir/weights.$im1"; + my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; + my $pcmd; + if ($use_make) { + $pcmd = "cat $srcFile | $parallelize --workdir $dir --use-fork -p $pmem -e $logdir -j $jobs --"; + } else { + $pcmd = "cat $srcFile | $parallelize --workdir $dir -p $pmem -e $logdir -j $jobs --"; + } + my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + my $num_hgs; + my $num_topbest; + my $retries = 0; + while($retries < 5) { + $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l"); + $num_topbest = check_output("wc -l < $runFile"); + print STDERR "NUMBER OF 
HGs: $num_hgs\n"; + print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; + if($devSize == $num_hgs && $devSize == $num_topbest) { + last; + } else { + print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n"; + sleep(3); + } + $retries++; + } + die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); + my $dec_score = check_output("cat $runFile | $SCORER $refs -m $metric"); + chomp $dec_score; + print STDERR "DECODER SCORE: $dec_score\n"; + + # save space + check_call("gzip -f $runFile"); + check_call("gzip -f $decoderLog"); + + # run optimizer + print STDERR "RUNNING OPTIMIZER AT "; + print STDERR unchecked_output("date"); + my $mergeLog="$logdir/prune-merge.log.$iteration"; + + my $score = 0; + my $icc = 0; + my $inweights="$dir/weights.$im1"; + for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) { + print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n"; + print STDERR unchecked_output("date"); + $icc++; + $cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter"; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + check_call("mkdir -p $dir/splag.$im1"); + $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput."; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!"; + my @shards = grep { /^mapinput\./ } readdir(DIR); + closedir DIR; + die "No shards!" 
unless scalar @shards > 0; + my $joblist = ""; + my $nmappers = 0; + my @mapoutputs = (); + @cleanupcmds = (); + my %o2i = (); + my $first_shard = 1; + my $mkfile; # only used with makefiles + my $mkfilename; + if ($use_make) { + $mkfilename = "$dir/splag.$im1/domap.mk"; + open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!"; + print $mkfile "all: $dir/splag.$im1/map.done\n\n"; + } + my @mkouts = (); # only used with makefiles + for my $shard (@shards) { + my $mapoutput = $shard; + my $client_name = $shard; + $client_name =~ s/mapinput.//; + $client_name = "dpmert.$client_name"; + $mapoutput =~ s/mapinput/mapoutput/; + push @mapoutputs, "$dir/splag.$im1/$mapoutput"; + $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; + my $script = "$MAPPER -s $srcFile -m $metric $refs < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; + if ($use_make) { + my $script_file = "$dir/scripts/map.$shard"; + open F, ">$script_file" or die "Can't write $script_file: $!"; + print F "#!/bin/bash\n"; + print F "$script\n"; + close F; + my $output = "$dir/splag.$im1/$mapoutput"; + push @mkouts, $output; + chmod(0755, $script_file) or die "Can't chmod $script_file: $!"; + if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } + print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n"; + } else { + my $script_file = "$dir/scripts/map.$shard"; + open F, ">$script_file" or die "Can't write $script_file: $!"; + print F "$script\n"; + close F; + if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } + + $nmappers++; + my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; + my $jobid = check_output("$qcmd"); + chomp $jobid; + $jobid =~ s/^(\d+)(.*?)$/\1/g; + $jobid =~ s/^Your job (\d+) .*$/\1/; + push(@cleanupcmds, "qdel $jobid 2> /dev/null"); + print STDERR " $jobid"; + if ($joblist == "") { $joblist = $jobid; } + else {$joblist = $joblist . "\|" . 
$jobid; } + } + } + if ($use_make) { + print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n"; + close $mkfile; + my $mcmd = "make -j $jobs -f $mkfilename"; + print STDERR "\nExecuting: $mcmd\n"; + check_call($mcmd); + } else { + print STDERR "\nLaunched $nmappers mappers.\n"; + sleep 8; + print STDERR "Waiting for mappers to complete...\n"; + while ($nmappers > 0) { + sleep 5; + my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '"))); + $nmappers = scalar @livejobs; + } + print STDERR "All mappers complete.\n"; + } + my $tol = 0; + my $til = 0; + for my $mo (@mapoutputs) { + my $olines = get_lines($mo); + my $ilines = get_lines($o2i{$mo}); + $tol += $olines; + $til += $ilines; + die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines; + } + print STDERR "Results for $tol/$til lines\n"; + print STDERR "\nSORTING AND RUNNING VEST REDUCER\n"; + print STDERR unchecked_output("date"); + $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -m $metric > $dir/redoutput.$im1"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + $cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1"; + # sort returns failure even when it doesn't fail for some reason + my $best=unchecked_output("$cmd"); chomp $best; + print STDERR "$best\n"; + my ($oa, $x, $xscore) = split /\|/, $best; + $score = $xscore; + print STDERR "PROJECTED SCORE: $score\n"; + if (abs($x) < $epsilon) { + print STDERR "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n"; + last; + } + my $psd = $score - $last_score; + $last_score = $score; + if (abs($psd) < $epsilon) { + print STDERR "\nOPTIMIZER: no score improvement: abs($psd) < $epsilon\n"; + last; + } + my ($origin, $axis) = split /\s+/, $oa; + + my %ori = convert($origin); + my %axi = convert($axis); + + my $finalFile="$dir/weights.$im1-$opt_iter"; + open W, ">$finalFile" or die "Can't write: $finalFile: $!"; + my $norm = 0; + for my $k (sort 
keys %ori) { + my $dd = $ori{$k} + $axi{$k} * $x; + $norm += $dd * $dd; + } + $norm = sqrt($norm); + $norm = 1; + for my $k (sort keys %ori) { + my $v = ($ori{$k} + $axi{$k} * $x) / $norm; + print W "$k $v\n"; + } + check_call("rm $dir/splag.$im1/*"); + $inweights = $finalFile; + } + $lastWeightsFile = "$dir/weights.$iteration"; + check_call("cp $inweights $lastWeightsFile"); + if ($icc < 2) { + print STDERR "\nREACHED STOPPING CRITERION: score change too little\n"; + last; + } + $lastPScore = $score; + $iteration++; + print STDERR "\n==========\n"; +} + +check_call("cp $lastWeightsFile $dir/weights.final"); +print STDERR "\nFINAL WEIGHTS: $dir/weights.final\n(Use -w <this file> with the decoder)\n\n"; +print STDOUT "$dir/weights.final\n"; +exit 0; + + +sub get_lines { + my $fn = shift @_; + open FL, "<$fn" or die "Couldn't read $fn: $!"; + my $lc = 0; + while(<FL>) { $lc++; } + return $lc; +} + +sub read_weights_file { + my ($file) = @_; + open F, "<$file" or die "Couldn't read $file: $!"; + my @r = (); + my $pm = -1; + while(<F>) { + next if /^#/; + next if /^\s*$/; + chomp; + if (/^(.+)\s+(.+)$/) { + my $m = $1; + my $w = $2; + die "Weights out of order: $m <= $pm" unless $m > $pm; + push @r, $w; + } else { + warn "Unexpected feature name in weight file: $_"; + } + } + close F; + return join ' ', @r; +} + +sub update_weights_file { + my ($neww, $rfn, $rpts) = @_; + my @feats = @$rfn; + my @pts = @$rpts; + my $num_feats = scalar @feats; + my $num_pts = scalar @pts; + die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts; + open G, ">$neww" or die; + for (my $i = 0; $i < $num_feats; $i++) { + my $f = $feats[$i]; + my $lambda = $pts[$i]; + print G "$f $lambda\n"; + } + close G; +} + +sub enseg { + my $src = shift; + my $newsrc = shift; + open(SRC, $src); + open(NEWSRC, ">$newsrc"); + my $i=0; + while (my $line=<SRC>){ + chomp $line; + if ($line =~ /^\s*<seg/i) { + if($line =~ /id="[0-9]+"/) { + print NEWSRC "$line\n"; + } else { + die 
"When using segments with pre-generated <seg> tags, you must include a zero-based id attribute"; + } + } else { + print NEWSRC "<seg id=\"$i\">$line</seg>\n"; + } + $i++; + } + close SRC; + close NEWSRC; +} + +sub print_help { + + my $executable = basename($0); chomp $executable; + print << "Help"; + +Usage: $executable [options] <ini file> + + $executable [options] + Runs a complete MERT optimization. Required options are --weights, + --devset, and --config. + +Options: + + --config <file> [-c <file>] + The decoder configuration file. + + --devset <file> [-d <file>] + The source *and* references for the development set. + + --weights <file> [-w <file>] + A file specifying initial feature weights. The format is + FeatureName_1 value1 + FeatureName_2 value2 + **All and only the weights listed in <file> will be optimized!** + + --metric <name> + Metric to optimize. + Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + + --iterations <M> + Maximum number of iterations to run. If not specified, defaults + to 10. + + --pass-suffix <S> + If the decoder is doing multi-pass decoding, the pass suffix "2", + "3", etc., is used to control what iteration of weights is set. + + --rand-directions <num> + MERT will attempt to optimize along all of the principle directions, + set this parameter to explore other directions. Defaults to 5. + + --output-dir <dir> + Directory for intermediate and output files. + + --help + Print this message and exit. + +Job control options: + + --jobs <I> + Number of decoder processes to run in parallel. 
[default=$default_jobs] + + --qsub + Use qsub to run jobs in parallel (qsub must be configured in + environment/LocalEnvironment.pm) + + --pmem <N> + Amount of physical memory requested for parallel decoding jobs + (used with qsub requests only) + +Help +} + +sub convert { + my ($str) = @_; + my @ps = split /;/, $str; + my %dict = (); + for my $p (@ps) { + my ($k, $v) = split /=/, $p; + $dict{$k} = $v; + } + return %dict; +} + + + +sub cmdline { + return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? +my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} + +sub escaped_shell_args { + return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { + return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { + return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} + +sub split_devset { + my ($infile, $outsrc, $outref) = @_; + open F, "<$infile" or die "Can't read $infile: $!"; + open S, ">$outsrc" or die "Can't write $outsrc: $!"; + open R, ">$outref" or die "Can't write $outref: $!"; + while(<F>) { + chomp; + my ($src, @refs) = split /\s*\|\|\|\s*/; + die "Malformed devset line: $_\n" unless scalar @refs > 0; + print S "$src\n"; + print R join(' ||| ', @refs) . 
"\n"; + } + close R; + close S; + close F; +} + diff --git a/training/dpmert/error_surface.cc b/training/dpmert/error_surface.cc new file mode 100644 index 00000000..515b67f8 --- /dev/null +++ b/training/dpmert/error_surface.cc @@ -0,0 +1,42 @@ +#include "error_surface.h" + +#include <cassert> +#include <sstream> + +using namespace std; + +ErrorSurface::~ErrorSurface() {} + +void ErrorSurface::Serialize(std::string* out) const { + const int segments = this->size(); + ostringstream os(ios::binary); + os.write((const char*)&segments,sizeof(segments)); + for (int i = 0; i < segments; ++i) { + const ErrorSegment& cur = (*this)[i]; + string senc; + cur.delta.Encode(&senc); + assert(senc.size() < 1024); + unsigned char len = senc.size(); + os.write((const char*)&cur.x, sizeof(cur.x)); + os.write((const char*)&len, sizeof(len)); + os.write((const char*)&senc[0], len); + } + *out = os.str(); +} + +void ErrorSurface::Deserialize(const std::string& in) { + istringstream is(in, ios::binary); + int segments; + is.read((char*)&segments, sizeof(segments)); + this->resize(segments); + for (int i = 0; i < segments; ++i) { + ErrorSegment& cur = (*this)[i]; + unsigned char len; + is.read((char*)&cur.x, sizeof(cur.x)); + is.read((char*)&len, sizeof(len)); + string senc(len, '\0'); assert(senc.size() == len); + is.read((char*)&senc[0], len); + cur.delta = SufficientStats(senc); + } +} + diff --git a/training/dpmert/error_surface.h b/training/dpmert/error_surface.h new file mode 100644 index 00000000..bb65847b --- /dev/null +++ b/training/dpmert/error_surface.h @@ -0,0 +1,24 @@ +#ifndef _ERROR_SURFACE_H_ +#define _ERROR_SURFACE_H_ + +#include <vector> +#include <string> + +#include "ns.h" + +class Score; + +struct ErrorSegment { + double x; + SufficientStats delta; + ErrorSegment() : x(0), delta() {} +}; + +class ErrorSurface : public std::vector<ErrorSegment> { + public: + ~ErrorSurface(); + void Serialize(std::string* out) const; + void Deserialize(const std::string& in); +}; + 
+#endif diff --git a/training/dpmert/line_mediator.pl b/training/dpmert/line_mediator.pl new file mode 100755 index 00000000..bc2bb24c --- /dev/null +++ b/training/dpmert/line_mediator.pl @@ -0,0 +1,116 @@ +#!/usr/bin/perl -w +#hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication + +# if you don't know how to fork/exec in a C program, this could be helpful under limited circumstances (would be ok to liaise with sentserver) + +#WARNING: because it waits for the result from command 2 after sending every line, and especially if command 1 does the same, using sentserver as command 2 won't actually buy you any real parallelism. + +use strict; +use IPC::Open2; +use POSIX qw(pipe dup2 STDIN_FILENO STDOUT_FILENO); + +my $quiet=!$ENV{DEBUG}; +$quiet=1 if $ENV{QUIET}; +sub info { + local $,=' '; + print STDERR @_ unless $quiet; +} + +my $mode='CROSS'; +my $ser='DIRECT'; +$mode='PIPE' if $ENV{PIPE}; +$mode='SNAKE' if $ENV{SNAKE}; +$mode='CROSS' if $ENV{CROSS}; +$ser='SERIAL' if $ENV{SERIAL}; +$ser='DIRECT' if $ENV{DIRECT}; +$ser='SERIAL' if $mode eq 'SNAKE'; +info("mode: $mode\n"); +info("connection: $ser\n"); + + +my @c1; +if (scalar @ARGV) { + do { + push @c1,shift + } while scalar @ARGV && $c1[$#c1] ne '--'; +} +pop @c1; +my @c2=@ARGV; +@ARGV=(); +(scalar @c1 && scalar @c2) || die qq{ +usage: $0 cmd1 args -- cmd2 args +all options are environment variables. +DEBUG=1 env var enables debugging output. +CROSS=1 hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication. crosses stdin/stderr of cmd1 and cmd2 line by line (both must flush on newline and output. cmd1 initiates the conversation (sends the first line). default: attempts to cross stdin/stdout of c1 and c2 directly (via two unidirectional posix pipes created before fork). +SERIAL=1: (no parallelism possible) but lines exchanged are logged if DEBUG. 
+if SNAKE then stdin -> c1 -> c2 -> c1 -> stdout. +if PIPE then stdin -> c1 -> c2 -> stdout (same as shell c1|c2, but with SERIAL you can see the intermediate in real time; you could do similar with c1 | tee /dev/fd/2 |c2. +DIRECT=1 (default) will override SERIAL=1. +CROSS=1 (default) will override SNAKE or PIPE. +}; + +info("1 cmd:",@c1,"\n"); +info("2 cmd:",@c2,"\n"); + +sub lineto { + select $_[0]; + $|=1; + shift; + print @_; +} + +if ($ser eq 'SERIAL') { + my ($R1,$W1,$R2,$W2); + my $c1p=open2($R1,$W1,@c1); # Open2 R W backward from Open3. + my $c2p=open2($R2,$W2,@c2); + if ($mode eq 'CROSS') { + while(<$R1>) { + info("1:",$_); + lineto($W2,$_); + last unless defined ($_=<$R2>); + info("1|2:",$_); + lineto($W1,$_); + } + } else { + my $snake=$mode eq 'SNAKE'; + while(<STDIN>) { + info("IN:",$_); + lineto($W1,$_); + last unless defined ($_=<$R1>); + info("IN|1:",$_); + lineto($W2,$_); + last unless defined ($_=<$R2>); + info("IN|1|2:",$_); + if ($snake) { + lineto($W1,$_); + last unless defined ($_=<$R1>); + info("IN|1|2|1:",$_); + } + lineto(*STDOUT,$_); + } + } +} else { + info("DIRECT mode\n"); + my @rw1=POSIX::pipe(); + my @rw2=POSIX::pipe(); + my $pid=undef; + $SIG{CHLD} = sub { wait }; + while (not defined ($pid=fork())) { + sleep 1; + } + my $pipe = $mode eq 'PIPE'; + unless ($pipe) { + POSIX::close(STDOUT_FILENO); + POSIX::close(STDIN_FILENO); + } + if ($pid) { + POSIX::dup2($rw1[1],STDOUT_FILENO); + POSIX::dup2($rw2[0],STDIN_FILENO) unless $pipe; + exec @c1; + } else { + POSIX::dup2($rw2[1],STDOUT_FILENO) unless $pipe; + POSIX::dup2($rw1[0],STDIN_FILENO); + exec @c2; + } + while (wait()!=-1) {} +} diff --git a/training/dpmert/line_optimizer.cc b/training/dpmert/line_optimizer.cc new file mode 100644 index 00000000..9cf33502 --- /dev/null +++ b/training/dpmert/line_optimizer.cc @@ -0,0 +1,114 @@ +#include "line_optimizer.h" + +#include <limits> +#include <algorithm> + +#include "sparse_vector.h" +#include "ns.h" + +using namespace std; + +typedef 
ErrorSurface::const_iterator ErrorIter; + +// sort by increasing x-ints +struct IntervalComp { + bool operator() (const ErrorIter& a, const ErrorIter& b) const { + return a->x < b->x; + } +}; + +double LineOptimizer::LineOptimize( + const EvaluationMetric* metric, + const vector<ErrorSurface>& surfaces, + const LineOptimizer::ScoreType type, + float* best_score, + const double epsilon) { + // cerr << "MIN=" << MINIMIZE_SCORE << " MAX=" << MAXIMIZE_SCORE << " MINE=" << type << endl; + vector<ErrorIter> all_ints; + for (vector<ErrorSurface>::const_iterator i = surfaces.begin(); + i != surfaces.end(); ++i) { + const ErrorSurface& surface = *i; + for (ErrorIter j = surface.begin(); j != surface.end(); ++j) + all_ints.push_back(j); + } + sort(all_ints.begin(), all_ints.end(), IntervalComp()); + double last_boundary = all_ints.front()->x; + SufficientStats acc; + float& cur_best_score = *best_score; + cur_best_score = (type == MAXIMIZE_SCORE ? + -numeric_limits<float>::max() : numeric_limits<float>::max()); + bool left_edge = true; + double pos = numeric_limits<double>::quiet_NaN(); + for (vector<ErrorIter>::iterator i = all_ints.begin(); + i != all_ints.end(); ++i) { + const ErrorSegment& seg = **i; + if (seg.x - last_boundary > epsilon) { + float sco = metric->ComputeScore(acc); + if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || + (type == MINIMIZE_SCORE && sco < cur_best_score) ) { + cur_best_score = sco; + if (left_edge) { + pos = seg.x - 0.1; + left_edge = false; + } else { + pos = last_boundary + (seg.x - last_boundary) / 2; + } + //cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n"; + } + // string xx = metric->DetailedScore(acc); cerr << "---- " << xx; +#undef SHOW_ERROR_SURFACES +#ifdef SHOW_ERROR_SURFACES + cerr << "x=" << seg.x << "\ts=" << sco << "\n"; +#endif + last_boundary = seg.x; + } + // cerr << "x-boundary=" << seg.x << "\n"; + //string x2; acc.Encode(&x2); cerr << " ACC: " << x2 << endl; + //string x1; seg.delta.Encode(&x1); 
cerr << " DELTA: " << x1 << endl; + acc += seg.delta; + } + float sco = metric->ComputeScore(acc); + if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || + (type == MINIMIZE_SCORE && sco < cur_best_score) ) { + cur_best_score = sco; + if (left_edge) { + pos = 0; + } else { + pos = last_boundary + 1000.0; + } + } + return pos; +} + +void LineOptimizer::RandomUnitVector(const vector<int>& features_to_optimize, + SparseVector<double>* axis, + RandomNumberGenerator<boost::mt19937>* rng) { + axis->clear(); + for (int i = 0; i < features_to_optimize.size(); ++i) + axis->set_value(features_to_optimize[i], rng->NextNormal(0.0,1.0)); + (*axis) /= axis->l2norm(); +} + +void LineOptimizer::CreateOptimizationDirections( + const vector<int>& features_to_optimize, + int additional_random_directions, + RandomNumberGenerator<boost::mt19937>* rng, + vector<SparseVector<double> >* dirs + , bool include_orthogonal + ) { + dirs->clear(); + typedef SparseVector<double> Dir; + vector<Dir> &out=*dirs; + int i=0; + if (include_orthogonal) + for (;i<features_to_optimize.size();++i) { + Dir d; + d.set_value(features_to_optimize[i],1.); + out.push_back(d); + } + out.resize(i+additional_random_directions); + for (;i<out.size();++i) + RandomUnitVector(features_to_optimize, &out[i], rng); + cerr << "Generated " << out.size() << " total axes to optimize along.\n"; +} + diff --git a/training/dpmert/line_optimizer.h b/training/dpmert/line_optimizer.h new file mode 100644 index 00000000..83819f41 --- /dev/null +++ b/training/dpmert/line_optimizer.h @@ -0,0 +1,48 @@ +#ifndef LINE_OPTIMIZER_H_ +#define LINE_OPTIMIZER_H_ + +#include <vector> + +#include "sparse_vector.h" +#include "error_surface.h" +#include "sampler.h" + +class EvaluationMetric; +class Weights; + +struct LineOptimizer { + + // use MINIMIZE_SCORE for things like TER, WER + // MAXIMIZE_SCORE for things like BLEU + enum ScoreType { MAXIMIZE_SCORE, MINIMIZE_SCORE }; + + // merge all the error surfaces together into a global + // error 
surface and find (the middle of) the best segment + static double LineOptimize( + const EvaluationMetric* metric, + const std::vector<ErrorSurface>& envs, + const LineOptimizer::ScoreType type, + float* best_score, + const double epsilon = 1.0/65536.0); + + // return a random vector of length 1 where all dimensions + // not listed in dimensions will be 0. + static void RandomUnitVector(const std::vector<int>& dimensions, + SparseVector<double>* axis, + RandomNumberGenerator<boost::mt19937>* rng); + + // generate a list of directions to optimize; the list will + // contain the orthogonal vectors corresponding to the dimensions in + // primary and then additional_random_directions directions in those + // dimensions as well. All vectors will be length 1. + static void CreateOptimizationDirections( + const std::vector<int>& primary, + int additional_random_directions, + RandomNumberGenerator<boost::mt19937>* rng, + std::vector<SparseVector<double> >* dirs + , bool include_primary=true + ); + +}; + +#endif diff --git a/training/dpmert/lo_test.cc b/training/dpmert/lo_test.cc new file mode 100644 index 00000000..95a08d3d --- /dev/null +++ b/training/dpmert/lo_test.cc @@ -0,0 +1,229 @@ +#define BOOST_TEST_MODULE LineOptimizerTest +#include <boost/test/unit_test.hpp> +#include <boost/test/floating_point_comparison.hpp> + +#include <cmath> +#include <iostream> +#include <fstream> + +#include <boost/shared_ptr.hpp> + +#include "ns.h" +#include "ns_docscorer.h" +#include "ces.h" +#include "fdict.h" +#include "hg.h" +#include "kbest.h" +#include "hg_io.h" +#include "filelib.h" +#include "inside_outside.h" +#include "viterbi.h" +#include "mert_geometry.h" +#include "line_optimizer.h" + +using namespace std; + +const char* ref11 = "australia reopens embassy in manila"; +const char* ref12 = "( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist 
attack ."; +const char* ref21 = "australia reopened manila embassy"; +const char* ref22 = "( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack ."; +const char* ref31 = "australia to reopen embassy in manila"; +const char* ref32 = "( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so - called confirmed terrorist attack threats ."; +const char* ref41 = "australia to re - open its embassy to manila"; +const char* ref42 = "( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so - called \" clear \" threat of terrorist attack 7 weeks ago ."; + +BOOST_AUTO_TEST_CASE( TestCheckNaN) { + double x = 0; + double y = 0; + double z = x / y; + BOOST_CHECK_EQUAL(true, std::isnan(z)); +} + +BOOST_AUTO_TEST_CASE(TestConvexHull) { + boost::shared_ptr<MERTPoint> a1(new MERTPoint(-1, 0)); + boost::shared_ptr<MERTPoint> b1(new MERTPoint(1, 0)); + boost::shared_ptr<MERTPoint> a2(new MERTPoint(-1, 1)); + boost::shared_ptr<MERTPoint> b2(new MERTPoint(1, -1)); + vector<boost::shared_ptr<MERTPoint> > sa; sa.push_back(a1); sa.push_back(b1); + vector<boost::shared_ptr<MERTPoint> > sb; sb.push_back(a2); sb.push_back(b2); + ConvexHull a(sa); + cerr << a << endl; + ConvexHull b(sb); + ConvexHull c = a; + c *= b; + cerr << a << " (*) " << b << " = " << c << endl; + BOOST_CHECK_EQUAL(3, c.size()); +} + +BOOST_AUTO_TEST_CASE(TestConvexHullInside) { + const string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z 
[1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}"; + Hypergraph hg; + istringstream instr(json); + HypergraphIO::ReadFromJSON(&instr, &hg); + SparseVector<double> wts; + wts.set_value(FD::Convert("f1"), 0.4); + wts.set_value(FD::Convert("f2"), 1.0); + hg.Reweight(wts); + vector<pair<vector<WordID>, prob_t> > list; + std::vector<SparseVector<double> > features; + KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; + } + SparseVector<double> dir; dir.set_value(FD::Convert("f1"), 1.0); + ConvexHullWeightFunction wf(wts, dir); + ConvexHull env = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); + cerr << env << endl; + const vector<boost::shared_ptr<MERTPoint> >& segs = env.GetSortedSegs(); + dir *= segs[1]->x; + wts += dir; + hg.Reweight(wts); + KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest2(hg, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations<vector<WordID>, 
ESentenceTraversal>::Derivation* d = + kbest2.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; + } + for (unsigned i = 0; i < segs.size(); ++i) { + cerr << "seg=" << i << endl; + vector<WordID> trans; + segs[i]->ConstructTranslation(&trans); + cerr << TD::GetString(trans) << endl; + } +} + +BOOST_AUTO_TEST_CASE( TestS1) { + int fPhraseModel_0 = FD::Convert("PhraseModel_0"); + int fPhraseModel_1 = FD::Convert("PhraseModel_1"); + int fPhraseModel_2 = FD::Convert("PhraseModel_2"); + int fLanguageModel = FD::Convert("LanguageModel"); + int fWordPenalty = FD::Convert("WordPenalty"); + int fPassThrough = FD::Convert("PassThrough"); + SparseVector<double> wts; + wts.set_value(fWordPenalty, 4.25); + wts.set_value(fLanguageModel, -1.1165); + wts.set_value(fPhraseModel_0, -0.96); + wts.set_value(fPhraseModel_1, -0.65); + wts.set_value(fPhraseModel_2, -0.77); + wts.set_value(fPassThrough, -10.0); + + vector<int> to_optimize; + to_optimize.push_back(fWordPenalty); + to_optimize.push_back(fLanguageModel); + to_optimize.push_back(fPhraseModel_0); + to_optimize.push_back(fPhraseModel_1); + to_optimize.push_back(fPhraseModel_2); + + std::string path(boost::unit_test::framework::master_test_suite().argc == 2 ? 
boost::unit_test::framework::master_test_suite().argv[1] : "test_data"); + + Hypergraph hg; + ReadFile rf(path + "/0.json.gz"); + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + hg.Reweight(wts); + + Hypergraph hg2; + ReadFile rf2(path + "/1.json.gz"); + HypergraphIO::ReadFromJSON(rf2.stream(), &hg2); + hg2.Reweight(wts); + + vector<vector<WordID> > refs1(4); + TD::ConvertSentence(ref11, &refs1[0]); + TD::ConvertSentence(ref21, &refs1[1]); + TD::ConvertSentence(ref31, &refs1[2]); + TD::ConvertSentence(ref41, &refs1[3]); + vector<vector<WordID> > refs2(4); + TD::ConvertSentence(ref12, &refs2[0]); + TD::ConvertSentence(ref22, &refs2[1]); + TD::ConvertSentence(ref32, &refs2[2]); + TD::ConvertSentence(ref42, &refs2[3]); + vector<ConvexHull> envs(2); + + RandomNumberGenerator<boost::mt19937> rng; + + vector<SparseVector<double> > axes; // directions to search + LineOptimizer::CreateOptimizationDirections( + to_optimize, + 10, + &rng, + &axes); + assert(axes.size() == 10 + to_optimize.size()); + for (unsigned i = 0; i < axes.size(); ++i) + cerr << axes[i] << endl; + const SparseVector<double>& axis = axes[0]; + + cerr << "Computing Viterbi envelope using inside algorithm...\n"; + cerr << "axis: " << axis << endl; + clock_t t_start=clock(); + ConvexHullWeightFunction wf(wts, axis); // wts = starting point, axis = search direction + envs[0] = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); + envs[1] = Inside<ConvexHull, ConvexHullWeightFunction>(hg2, NULL, wf); + + vector<ErrorSurface> es(2); + EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(refs1); + boost::shared_ptr<SegmentEvaluator> scorer2 = metric->CreateSegmentEvaluator(refs2); + ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); + ComputeErrorSurface(*scorer2, envs[1], &es[1], metric, hg2); + cerr << envs[0].size() << " " << envs[1].size() << endl; + cerr << es[0].size() << " " << es[1].size() << 
endl; + envs.clear(); + clock_t t_env=clock(); + float score; + double m = LineOptimizer::LineOptimize(metric,es, LineOptimizer::MAXIMIZE_SCORE, &score); + clock_t t_opt=clock(); + cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n"; + BOOST_CHECK_CLOSE(0.48719698, score, 1e-5); + SparseVector<double> res = axis; + res *= m; + res += wts; + cerr << "res: " << res << endl; + cerr << "ENVELOPE PROCESSING=" << (static_cast<double>(t_env - t_start) / 1000.0) << endl; + cerr << " LINE OPTIMIZATION=" << (static_cast<double>(t_opt - t_env) / 1000.0) << endl; + hg.Reweight(res); + hg2.Reweight(res); + vector<WordID> t1,t2; + ViterbiESentence(hg, &t1); + ViterbiESentence(hg2, &t2); + cerr << TD::GetString(t1) << endl; + cerr << TD::GetString(t2) << endl; +} + +BOOST_AUTO_TEST_CASE(TestZeroOrigin) { + const string json = "{\"rules\":[1,\"[X7] ||| blA ||| without ||| LHSProb=3.92173 LexE2F=2.90799 LexF2E=1.85003 GenerativeProb=10.5381 RulePenalty=1 XFE=2.77259 XEF=0.441833 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=0.693147\",2,\"[X7] ||| blA ||| except ||| LHSProb=4.92173 LexE2F=3.90799 LexF2E=1.85003 GenerativeProb=11.5381 RulePenalty=1 XFE=2.77259 XEF=1.44183 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=1.69315\",3,\"[S] ||| [X7,1] ||| [1] ||| GlueTop=1\",4,\"[X28] ||| EnwAn ||| title ||| LHSProb=3.96802 LexE2F=2.22462 LexF2E=1.83258 GenerativeProb=10.0863 RulePenalty=1 XFE=0 XEF=1.20397 LabelledEF=1.20397 LabelledFE=-1.98341e-08 LogRuleCount=1.09861\",5,\"[X0] ||| EnwAn ||| funny ||| LHSProb=3.98479 LexE2F=1.79176 LexF2E=3.21888 GenerativeProb=11.1681 RulePenalty=1 XFE=0 XEF=2.30259 LabelledEF=2.30259 LabelledFE=0 LogRuleCount=0 SingletonRule=1\",6,\"[X8] ||| [X7,1] EnwAn ||| entitled [1] ||| LHSProb=3.82533 LexE2F=3.21888 LexF2E=2.52573 GenerativeProb=11.3276 RulePenalty=1 XFE=1.20397 XEF=1.20397 LabelledEF=2.30259 LabelledFE=2.30259 LogRuleCount=0 SingletonRule=1\",7,\"[S] ||| [S,1] [X28,2] ||| [1] [2] ||| Glue=1\",8,\"[S] ||| [S,1] 
[X0,2] ||| [1] [2] ||| Glue=1\",9,\"[S] ||| [X8,1] ||| [1] ||| GlueTop=1\",10,\"[Goal] ||| [S,1] ||| [1]\"],\"features\":[\"PassThrough\",\"Glue\",\"GlueTop\",\"LanguageModel\",\"WordPenalty\",\"LHSProb\",\"LexE2F\",\"LexF2E\",\"GenerativeProb\",\"RulePenalty\",\"XFE\",\"XEF\",\"LabelledEF\",\"LabelledFE\",\"LogRuleCount\",\"SingletonRule\"],\"edges\":[{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,3.92173,6,2.90799,7,1.85003,8,10.5381,9,1,10,2.77259,11,0.441833,12,2.63906,13,4.96981,14,0.693147],\"rule\":1},{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,4.92173,6,3.90799,7,1.85003,8,11.5381,9,1,10,2.77259,11,1.44183,12,2.63906,13,4.96981,14,1.69315],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X7\"},\"edges\":[{\"tail\":[0],\"spans\":[0,1,-1,-1],\"feats\":[2,1],\"rule\":3}],\"node\":{\"in_edges\":[2],\"cat\":\"S\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.96802,6,2.22462,7,1.83258,8,10.0863,9,1,11,1.20397,12,1.20397,13,-1.98341e-08,14,1.09861],\"rule\":4}],\"node\":{\"in_edges\":[3],\"cat\":\"X28\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.98479,6,1.79176,7,3.21888,8,11.1681,9,1,11,2.30259,12,2.30259,15,1],\"rule\":5}],\"node\":{\"in_edges\":[4],\"cat\":\"X0\"},\"edges\":[{\"tail\":[0],\"spans\":[0,2,-1,-1],\"feats\":[5,3.82533,6,3.21888,7,2.52573,8,11.3276,9,1,10,1.20397,11,1.20397,12,2.30259,13,2.30259,15,1],\"rule\":6}],\"node\":{\"in_edges\":[5],\"cat\":\"X8\"},\"edges\":[{\"tail\":[1,2],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":7},{\"tail\":[1,3],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":8},{\"tail\":[4],\"spans\":[0,2,-1,-1],\"feats\":[2,1],\"rule\":9}],\"node\":{\"in_edges\":[6,7,8],\"cat\":\"S\"},\"edges\":[{\"tail\":[5],\"spans\":[0,2,-1,-1],\"feats\":[],\"rule\":10}],\"node\":{\"in_edges\":[9],\"cat\":\"Goal\"}}"; + Hypergraph hg; + istringstream instr(json); + HypergraphIO::ReadFromJSON(&instr, &hg); + SparseVector<double> wts; + wts.set_value(FD::Convert("PassThrough"), -0.929201533002898); + 
hg.Reweight(wts); + + vector<pair<vector<WordID>, prob_t> > list; + std::vector<SparseVector<double> > features; + KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; + } + + SparseVector<double> axis; axis.set_value(FD::Convert("Glue"),1.0); + ConvexHullWeightFunction wf(wts, axis); // wts = starting point, axis = search direction + vector<ConvexHull> envs(1); + envs[0] = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); + + vector<vector<WordID> > mr(4); + TD::ConvertSentence("untitled", &mr[0]); + TD::ConvertSentence("with no title", &mr[1]); + TD::ConvertSentence("without a title", &mr[2]); + TD::ConvertSentence("without title", &mr[3]); + EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(mr); + vector<ErrorSurface> es(1); + ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); +} + diff --git a/training/dpmert/mert_geometry.cc b/training/dpmert/mert_geometry.cc new file mode 100644 index 00000000..d6973658 --- /dev/null +++ b/training/dpmert/mert_geometry.cc @@ -0,0 +1,185 @@ +#include "mert_geometry.h" + +#include <cassert> +#include <limits> + +using namespace std; + +ConvexHull::ConvexHull(int i) : is_sorted(true) { + if (i == 0) { + // do nothing - <> + } else if (i == 1) { + points.push_back(boost::shared_ptr<MERTPoint>(new MERTPoint(0, 0, 0, boost::shared_ptr<MERTPoint>(), boost::shared_ptr<MERTPoint>()))); + assert(this->IsMultiplicativeIdentity()); + } else { + cerr << "Only can create ConvexHull semiring 0 and 1 with this constructor!\n"; + abort(); + } +} + +const ConvexHull ConvexHullWeightFunction::operator()(const Hypergraph::Edge& e) const { + const 
double m = direction.dot(e.feature_values_); + const double b = origin.dot(e.feature_values_); + MERTPoint* point = new MERTPoint(m, b, e); + return ConvexHull(1, point); +} + +ostream& operator<<(ostream& os, const ConvexHull& env) { + os << '<'; + const vector<boost::shared_ptr<MERTPoint> >& points = env.GetSortedSegs(); + for (int i = 0; i < points.size(); ++i) + os << (i==0 ? "" : "|") << "x=" << points[i]->x << ",b=" << points[i]->b << ",m=" << points[i]->m << ",p1=" << points[i]->p1 << ",p2=" << points[i]->p2; + return os << '>'; +} + +#define ORIGINAL_MERT_IMPLEMENTATION 1 +#ifdef ORIGINAL_MERT_IMPLEMENTATION + +struct SlopeCompare { + bool operator() (const boost::shared_ptr<MERTPoint>& a, const boost::shared_ptr<MERTPoint>& b) const { + return a->m < b->m; + } +}; + +const ConvexHull& ConvexHull::operator+=(const ConvexHull& other) { + if (!other.is_sorted) other.Sort(); + if (points.empty()) { + points = other.points; + return *this; + } + is_sorted = false; + int j = points.size(); + points.resize(points.size() + other.points.size()); + for (int i = 0; i < other.points.size(); ++i) + points[j++] = other.points[i]; + assert(j == points.size()); + return *this; +} + +void ConvexHull::Sort() const { + sort(points.begin(), points.end(), SlopeCompare()); + const int k = points.size(); + int j = 0; + for (int i = 0; i < k; ++i) { + MERTPoint l = *points[i]; + l.x = kMinusInfinity; + // cerr << "m=" << l.m << endl; + if (0 < j) { + if (points[j-1]->m == l.m) { // lines are parallel + if (l.b <= points[j-1]->b) continue; + --j; + } + while(0 < j) { + l.x = (l.b - points[j-1]->b) / (points[j-1]->m - l.m); + if (points[j-1]->x < l.x) break; + --j; + } + if (0 == j) l.x = kMinusInfinity; + } + *points[j++] = l; + } + points.resize(j); + is_sorted = true; +} + +const ConvexHull& ConvexHull::operator*=(const ConvexHull& other) { + if (other.IsMultiplicativeIdentity()) { return *this; } + if (this->IsMultiplicativeIdentity()) { (*this) = other; return *this; } + + if 
(!is_sorted) Sort(); + if (!other.is_sorted) other.Sort(); + + if (this->IsEdgeEnvelope()) { +// if (other.size() > 1) +// cerr << *this << " (TIMES) " << other << endl; + boost::shared_ptr<MERTPoint> edge_parent = points[0]; + const double& edge_b = edge_parent->b; + const double& edge_m = edge_parent->m; + points.clear(); + for (int i = 0; i < other.points.size(); ++i) { + const MERTPoint& p = *other.points[i]; + const double m = p.m + edge_m; + const double b = p.b + edge_b; + const double& x = p.x; // x's don't change with * + points.push_back(boost::shared_ptr<MERTPoint>(new MERTPoint(x, m, b, edge_parent, other.points[i]))); + assert(points.back()->p1->edge); + } +// if (other.size() > 1) +// cerr << " = " << *this << endl; + } else { + vector<boost::shared_ptr<MERTPoint> > new_points; + int this_i = 0; + int other_i = 0; + const int this_size = points.size(); + const int other_size = other.points.size(); + double cur_x = kMinusInfinity; // moves from left to right across the + // real numbers, stopping for all inter- + // sections + double this_next_val = (1 < this_size ? points[1]->x : kPlusInfinity); + double other_next_val = (1 < other_size ? other.points[1]->x : kPlusInfinity); + while (this_i < this_size && other_i < other_size) { + const MERTPoint& this_point = *points[this_i]; + const MERTPoint& other_point= *other.points[other_i]; + const double m = this_point.m + other_point.m; + const double b = this_point.b + other_point.b; + + new_points.push_back(boost::shared_ptr<MERTPoint>(new MERTPoint(cur_x, m, b, points[this_i], other.points[other_i]))); + int comp = 0; + if (this_next_val < other_next_val) comp = -1; else + if (this_next_val > other_next_val) comp = 1; + if (0 == comp) { // the next values are equal, advance both indices + ++this_i; + ++other_i; + cur_x = this_next_val; // could be other_next_val (they're equal!) + this_next_val = (this_i+1 < this_size ? points[this_i+1]->x : kPlusInfinity); + other_next_val = (other_i+1 < other_size ? 
other.points[other_i+1]->x : kPlusInfinity); + } else { // advance the i with the lower x, update cur_x + if (-1 == comp) { + ++this_i; + cur_x = this_next_val; + this_next_val = (this_i+1 < this_size ? points[this_i+1]->x : kPlusInfinity); + } else { + ++other_i; + cur_x = other_next_val; + other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity); + } + } + } + points.swap(new_points); + } + //cerr << "Multiply: result=" << (*this) << endl; + return *this; +} + +// recursively construct translation +void MERTPoint::ConstructTranslation(vector<WordID>* trans) const { + const MERTPoint* cur = this; + vector<vector<WordID> > ant_trans; + while(!cur->edge) { + ant_trans.resize(ant_trans.size() + 1); + cur->p2->ConstructTranslation(&ant_trans.back()); + cur = cur->p1.get(); + } + size_t ant_size = ant_trans.size(); + vector<const vector<WordID>*> pants(ant_size); + assert(ant_size == cur->edge->tail_nodes_.size()); + --ant_size; + for (int i = 0; i < pants.size(); ++i) pants[ant_size - i] = &ant_trans[i]; + cur->edge->rule_->ESubstitute(pants, trans); +} + +void MERTPoint::CollectEdgesUsed(std::vector<bool>* edges_used) const { + if (edge) { + assert(edge->id_ < edges_used->size()); + (*edges_used)[edge->id_] = true; + } + if (p1) p1->CollectEdgesUsed(edges_used); + if (p2) p2->CollectEdgesUsed(edges_used); +} + +#else + +// THIS IS THE NEW FASTER IMPLEMENTATION OF THE MERT SEMIRING OPERATIONS + +#endif + diff --git a/training/dpmert/mert_geometry.h b/training/dpmert/mert_geometry.h new file mode 100644 index 00000000..a8b6959e --- /dev/null +++ b/training/dpmert/mert_geometry.h @@ -0,0 +1,81 @@ +#ifndef _MERT_GEOMETRY_H_ +#define _MERT_GEOMETRY_H_ + +#include <vector> +#include <iostream> +#include <boost/shared_ptr.hpp> + +#include "hg.h" +#include "sparse_vector.h" + +static const double kMinusInfinity = -std::numeric_limits<double>::infinity(); +static const double kPlusInfinity = std::numeric_limits<double>::infinity(); + +struct 
MERTPoint { + MERTPoint() : x(), m(), b(), edge() {} + MERTPoint(double _m, double _b) : + x(kMinusInfinity), m(_m), b(_b), edge() {} + MERTPoint(double _x, double _m, double _b, const boost::shared_ptr<MERTPoint>& p1_, const boost::shared_ptr<MERTPoint>& p2_) : + x(_x), m(_m), b(_b), p1(p1_), p2(p2_), edge() {} + MERTPoint(double _m, double _b, const Hypergraph::Edge& edge) : + x(kMinusInfinity), m(_m), b(_b), edge(&edge) {} + + double x; // x intersection with previous segment in env, or -inf if none + double m; // this line's slope + double b; // intercept with y-axis + + // we keep a pointer to the "parents" of this segment so we can reconstruct + // the Viterbi translation corresponding to this segment + boost::shared_ptr<MERTPoint> p1; + boost::shared_ptr<MERTPoint> p2; + + // only MERTPoints created from an edge using the ConvexHullWeightFunction + // have rules + // TRulePtr rule; + const Hypergraph::Edge* edge; + + // recursively recover the Viterbi translation that will result from setting + // the weights to origin + axis * x, where x is any value from this->x up + // until the next largest x in the containing ConvexHull + void ConstructTranslation(std::vector<WordID>* trans) const; + void CollectEdgesUsed(std::vector<bool>* edges_used) const; +}; + +// this is the semiring value type, +// it defines constructors for 0, 1, and the operations + and * +struct ConvexHull { + // create semiring zero + ConvexHull() : is_sorted(true) {} // zero + // for debugging: + ConvexHull(const std::vector<boost::shared_ptr<MERTPoint> >& s) : points(s) { Sort(); } + // create semiring 1 or 0 + explicit ConvexHull(int i); + ConvexHull(int n, MERTPoint* point) : is_sorted(true), points(n, boost::shared_ptr<MERTPoint>(point)) {} + const ConvexHull& operator+=(const ConvexHull& other); + const ConvexHull& operator*=(const ConvexHull& other); + bool IsMultiplicativeIdentity() const { + return size() == 1 && (points[0]->b == 0.0 && points[0]->m == 0.0) && (!points[0]->edge) && 
(!points[0]->p1) && (!points[0]->p2); } + const std::vector<boost::shared_ptr<MERTPoint> >& GetSortedSegs() const { + if (!is_sorted) Sort(); + return points; + } + size_t size() const { return points.size(); } + + private: + bool IsEdgeEnvelope() const { + return points.size() == 1 && points[0]->edge; } + void Sort() const; + mutable bool is_sorted; + mutable std::vector<boost::shared_ptr<MERTPoint> > points; +}; +std::ostream& operator<<(std::ostream& os, const ConvexHull& env); + +struct ConvexHullWeightFunction { + ConvexHullWeightFunction(const SparseVector<double>& ori, + const SparseVector<double>& dir) : origin(ori), direction(dir) {} + const ConvexHull operator()(const Hypergraph::Edge& e) const; + const SparseVector<double> origin; + const SparseVector<double> direction; +}; + +#endif diff --git a/training/dpmert/mr_dpmert_generate_mapper_input.cc b/training/dpmert/mr_dpmert_generate_mapper_input.cc new file mode 100644 index 00000000..199cd23a --- /dev/null +++ b/training/dpmert/mr_dpmert_generate_mapper_input.cc @@ -0,0 +1,81 @@ +#include <iostream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "filelib.h" +#include "weights.h" +#include "line_optimizer.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("dev_set_size,s",po::value<unsigned>(),"[REQD] Development set size (# of parallel sentences)") + ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository") + ("weights,w",po::value<string>(),"[REQD] Current feature weights file") + ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") + ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the 
line optimizer in") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (conf->count("dev_set_size") == 0) { + cerr << "Please specify the size of the development set using -s N\n"; + flag = true; + } + if (conf->count("weights") == 0) { + cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n"; + flag = true; + } + if (conf->count("forest_repository") == 0) { + cerr << "Please specify the forest repository location using -r <DIR>\n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + RandomNumberGenerator<boost::mt19937> rng; + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + vector<string> features; + SparseVector<weight_t> origin; + vector<weight_t> w; + Weights::InitFromFile(conf["weights"].as<string>(), &w, &features); + Weights::InitSparseVector(w, &origin); + const string forest_repository = conf["forest_repository"].as<string>(); + if (!DirectoryExists(forest_repository)) { + cerr << "Forest repository directory " << forest_repository << " not found!\n"; + return 1; + } + if (conf.count("optimize_feature") > 0) + features=conf["optimize_feature"].as<vector<string> >(); + vector<SparseVector<weight_t> > directions; + vector<int> fids(features.size()); + for (unsigned i = 0; i < features.size(); ++i) + fids[i] = FD::Convert(features[i]); + LineOptimizer::CreateOptimizationDirections( + fids, + conf["random_directions"].as<unsigned int>(), + &rng, + &directions); + unsigned dev_set_size = conf["dev_set_size"].as<unsigned>(); + for (unsigned i = 0; i < dev_set_size; ++i) { + for (unsigned j = 0; j < directions.size(); ++j) { + cout << forest_repository << '/' << i << ".json.gz " << i << ' '; + print(cout, origin, "=", ";"); + cout << ' '; + print(cout, directions[j], "=", ";"); + cout << 
endl; + } + } + return 0; +} diff --git a/training/dpmert/mr_dpmert_map.cc b/training/dpmert/mr_dpmert_map.cc new file mode 100644 index 00000000..d1efcf96 --- /dev/null +++ b/training/dpmert/mr_dpmert_map.cc @@ -0,0 +1,112 @@ +#include <sstream> +#include <iostream> +#include <fstream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "ns.h" +#include "ns_docscorer.h" +#include "ces.h" +#include "filelib.h" +#include "stringlib.h" +#include "sparse_vector.h" +#include "mert_geometry.h" +#include "inside_outside.h" +#include "error_surface.h" +#include "b64tools.h" +#include "hg_io.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)") + ("source,s",po::value<string>(), "Source file (ignored, except for AER)") + ("evaluation_metric,m",po::value<string>()->default_value("ibm_bleu"), "Evaluation metric being optimized") + ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (!conf->count("reference")) { + cerr << "Please specify one or more references using -r <REF.TXT>\n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +bool ReadSparseVectorString(const string& s, SparseVector<double>* v) { +#if 0 + // this should work, but untested. 
+ std::istringstream i(s); + i>>*v; +#else + vector<string> fields; + Tokenize(s, ';', &fields); + if (fields.empty()) return false; + for (unsigned i = 0; i < fields.size(); ++i) { + vector<string> pair(2); + Tokenize(fields[i], '=', &pair); + if (pair.size() != 2) { + cerr << "Error parsing vector string: " << fields[i] << endl; + return false; + } + v->set_value(FD::Convert(pair[0]), atof(pair[1].c_str())); + } + return true; +#endif +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string evaluation_metric = conf["evaluation_metric"].as<string>(); + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + DocumentScorer ds(metric, conf["reference"].as<vector<string> >()); + cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; + Hypergraph hg; + string last_file; + ReadFile in_read(conf["input"].as<string>()); + istream &in=*in_read.stream(); + while(in) { + string line; + getline(in, line); + if (line.empty()) continue; + istringstream is(line); + int sent_id; + string file, s_origin, s_direction; + // path-to-file (JSON) sent_id starting-point search-direction + is >> file >> sent_id >> s_origin >> s_direction; + SparseVector<double> origin; + ReadSparseVectorString(s_origin, &origin); + SparseVector<double> direction; + ReadSparseVectorString(s_direction, &direction); + // cerr << "File: " << file << "\nDir: " << direction << "\n X: " << origin << endl; + if (last_file != file) { + last_file = file; + ReadFile rf(file); + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + } + const ConvexHullWeightFunction wf(origin, direction); + const ConvexHull hull = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); + + ErrorSurface es; + ComputeErrorSurface(*ds[sent_id], hull, &es, metric, hg); + //cerr << "Viterbi envelope has " << hull.size() << " segments\n"; + // cerr << "Error surface has " << es.size() << " segments\n"; + string val; + 
es.Serialize(&val); + cout << 'M' << ' ' << s_origin << ' ' << s_direction << '\t'; + B64::b64encode(val.c_str(), val.size(), &cout); + cout << endl << flush; + } + return 0; +} diff --git a/training/dpmert/mr_dpmert_reduce.cc b/training/dpmert/mr_dpmert_reduce.cc new file mode 100644 index 00000000..31512a03 --- /dev/null +++ b/training/dpmert/mr_dpmert_reduce.cc @@ -0,0 +1,77 @@ +#include <sstream> +#include <iostream> +#include <fstream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "sparse_vector.h" +#include "error_surface.h" +#include "line_optimizer.h" +#include "b64tools.h" +#include "stringlib.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("evaluation_metric,m",po::value<string>(), "Evaluation metric (IBM_BLEU, etc.)") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = conf->count("evaluation_metric") == 0; + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string evaluation_metric = conf["evaluation_metric"].as<string>(); + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE; + if (metric->IsErrorMetric()) + opt_type = LineOptimizer::MINIMIZE_SCORE; + + vector<ErrorSurface> esv; + string last_key, line, key, val; + while(getline(cin, line)) { + size_t ks = line.find("\t"); + assert(string::npos != ks); + assert(ks > 2); + key = line.substr(2, ks - 2); + val = line.substr(ks + 1); + if (key != last_key) { + if (!last_key.empty()) { + float score; + 
double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); + cout << last_key << "|" << x << "|" << score << endl; + } + last_key.swap(key); + esv.clear(); + } + if (val.size() % 4 != 0) { + cerr << "B64 encoding error 1! Skipping.\n"; + continue; + } + string encoded(val.size() / 4 * 3, '\0'); + if (!B64::b64decode(reinterpret_cast<const unsigned char*>(&val[0]), val.size(), &encoded[0], encoded.size())) { + cerr << "B64 encoding error 2! Skipping.\n"; + continue; + } + esv.push_back(ErrorSurface()); + esv.back().Deserialize(encoded); + } + if (!esv.empty()) { + float score; + double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); + cout << last_key << "|" << x << "|" << score << endl; + } + return 0; +} diff --git a/training/dpmert/test_aer/README b/training/dpmert/test_aer/README new file mode 100644 index 00000000..819b2e32 --- /dev/null +++ b/training/dpmert/test_aer/README @@ -0,0 +1,8 @@ +To run the test: + +../dist-vest.pl --local --metric aer cdec.ini --source-file corpus.src --ref-files=ref.0 --weights weights + +This will optimize the parameters of the tiny lexical translation model +so as to minimize the AER of the Viterbi alignment on the development +set in corpus.src according to the reference alignments in ref.0. 
+ diff --git a/training/dpmert/test_aer/cdec.ini b/training/dpmert/test_aer/cdec.ini new file mode 100644 index 00000000..08187848 --- /dev/null +++ b/training/dpmert/test_aer/cdec.ini @@ -0,0 +1,3 @@ +formalism=lextrans +grammar=grammar +aligner=true diff --git a/training/dpmert/test_aer/corpus.src b/training/dpmert/test_aer/corpus.src new file mode 100644 index 00000000..31b23971 --- /dev/null +++ b/training/dpmert/test_aer/corpus.src @@ -0,0 +1,3 @@ +el gato negro ||| the black cat +el gato ||| the cat +el libro ||| the book diff --git a/training/dpmert/test_aer/grammar b/training/dpmert/test_aer/grammar new file mode 100644 index 00000000..9d857824 --- /dev/null +++ b/training/dpmert/test_aer/grammar @@ -0,0 +1,12 @@ +el ||| cat ||| F1=1 +el ||| the ||| F2=1 +el ||| black ||| F3=1 +el ||| book ||| F11=1 +gato ||| cat ||| F4=1 NN=1 +gato ||| black ||| F5=1 +gato ||| the ||| F6=1 +negro ||| the ||| F7=1 +negro ||| cat ||| F8=1 +negro ||| black ||| F9=1 +libro ||| the ||| F10=1 +libro ||| book ||| F12=1 NN=1 diff --git a/training/dpmert/test_aer/ref.0 b/training/dpmert/test_aer/ref.0 new file mode 100644 index 00000000..734a9c5b --- /dev/null +++ b/training/dpmert/test_aer/ref.0 @@ -0,0 +1,3 @@ +0-0 1-2 2-1 +0-0 1-1 +0-0 1-1 diff --git a/training/dpmert/test_aer/weights b/training/dpmert/test_aer/weights new file mode 100644 index 00000000..afc9282e --- /dev/null +++ b/training/dpmert/test_aer/weights @@ -0,0 +1,13 @@ +F1 0.1 +F2 -.5980815 +F3 0.24235 +F4 0.625 +F5 0.4514 +F6 0.112316 +F7 -0.123415 +F8 -0.25390285 +F9 -0.23852 +F10 0.646 +F11 0.413141 +F12 0.343216 +NN -0.1215 diff --git a/training/dpmert/test_data/0.json.gz b/training/dpmert/test_data/0.json.gz Binary files differnew file mode 100644 index 00000000..30f8dd77 --- /dev/null +++ b/training/dpmert/test_data/0.json.gz diff --git a/training/dpmert/test_data/1.json.gz b/training/dpmert/test_data/1.json.gz Binary files differnew file mode 100644 index 00000000..c82cc179 --- /dev/null +++ 
b/training/dpmert/test_data/1.json.gz diff --git a/training/dpmert/test_data/c2e.txt.0 b/training/dpmert/test_data/c2e.txt.0 new file mode 100644 index 00000000..12c4abe9 --- /dev/null +++ b/training/dpmert/test_data/c2e.txt.0 @@ -0,0 +1,2 @@ +australia reopens embassy in manila +( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack . diff --git a/training/dpmert/test_data/c2e.txt.1 b/training/dpmert/test_data/c2e.txt.1 new file mode 100644 index 00000000..4ac12df1 --- /dev/null +++ b/training/dpmert/test_data/c2e.txt.1 @@ -0,0 +1,2 @@ +australia reopened manila embassy +( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack . diff --git a/training/dpmert/test_data/c2e.txt.2 b/training/dpmert/test_data/c2e.txt.2 new file mode 100644 index 00000000..2f67b72f --- /dev/null +++ b/training/dpmert/test_data/c2e.txt.2 @@ -0,0 +1,2 @@ +australia to reopen embassy in manila +( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so-called confirmed terrorist attack threats . diff --git a/training/dpmert/test_data/c2e.txt.3 b/training/dpmert/test_data/c2e.txt.3 new file mode 100644 index 00000000..5483cef6 --- /dev/null +++ b/training/dpmert/test_data/c2e.txt.3 @@ -0,0 +1,2 @@ +australia to re - open its embassy to manila +( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so-called " clear " threat of terrorist attack 7 weeks ago . 
diff --git a/training/dpmert/test_data/re.txt.0 b/training/dpmert/test_data/re.txt.0 new file mode 100644 index 00000000..86eff087 --- /dev/null +++ b/training/dpmert/test_data/re.txt.0 @@ -0,0 +1,5 @@ +erdogan states turkey to reject any pressures to urge it to recognize cyprus +ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara will reject any pressure by the european union to urge it to recognize cyprus . this comes two weeks before the summit of european union state and government heads who will decide whether or nor membership negotiations with ankara should be opened . +erdogan told " ntv " television station that " the european union cannot address us by imposing new conditions on us with regard to cyprus . +we will discuss this dossier in the course of membership negotiations . " +he added " let me be clear , i cannot sidestep turkey , this is something we cannot accept . " diff --git a/training/dpmert/test_data/re.txt.1 b/training/dpmert/test_data/re.txt.1 new file mode 100644 index 00000000..2140f198 --- /dev/null +++ b/training/dpmert/test_data/re.txt.1 @@ -0,0 +1,5 @@ +erdogan confirms turkey will resist any pressure to recognize cyprus +ankara 12 - 1 ( afp ) - the turkish head of government , recep tayyip erdogan , announced today ( wednesday ) that ankara would resist any pressure the european union might exercise in order to force it into recognizing cyprus . this comes two weeks before a summit of european union heads of state and government , who will decide whether or not to open membership negotiations with ankara . +erdogan said to the ntv television channel : " the european union cannot engage with us through imposing new conditions on us with regard to cyprus . +we shall discuss this issue in the course of the membership negotiations . " +he added : " let me be clear - i cannot confine turkey . this is something we do not accept . 
" diff --git a/training/dpmert/test_data/re.txt.2 b/training/dpmert/test_data/re.txt.2 new file mode 100644 index 00000000..94e46286 --- /dev/null +++ b/training/dpmert/test_data/re.txt.2 @@ -0,0 +1,5 @@ +erdogan confirms that turkey will reject any pressures to encourage it to recognize cyprus +ankara , 12 / 1 ( afp ) - the turkish prime minister recep tayyip erdogan declared today , wednesday , that ankara will reject any pressures that the european union may apply on it to encourage to recognize cyprus . this comes two weeks before a summit of the heads of countries and governments of the european union , who will decide on whether or not to start negotiations on joining with ankara . +erdogan told the ntv television station that " it is not possible for the european union to talk to us by imposing new conditions on us regarding cyprus . +we shall discuss this dossier during the negotiations on joining . " +and he added , " let me be clear . turkey's arm should not be twisted ; this is something we cannot accept . " diff --git a/training/dpmert/test_data/re.txt.3 b/training/dpmert/test_data/re.txt.3 new file mode 100644 index 00000000..f87c3308 --- /dev/null +++ b/training/dpmert/test_data/re.txt.3 @@ -0,0 +1,5 @@ +erdogan stresses that turkey will reject all pressures to force it to recognize cyprus +ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara would refuse all pressures applied on it by the european union to force it to recognize cyprus . that came two weeks before the summit of the presidents and prime ministers of the european union , who would decide on whether to open negotiations on joining with ankara or not . +erdogan said to " ntv " tv station that the " european union can not communicate with us by imposing on us new conditions related to cyprus . +we will discuss this file during the negotiations on joining . " +he added , " let me be clear . turkey's arm should not be twisted . 
this is unacceptable to us . " diff --git a/training/dtrain/Makefile.am b/training/dtrain/Makefile.am new file mode 100644 index 00000000..4f51b0c8 --- /dev/null +++ b/training/dtrain/Makefile.am @@ -0,0 +1,7 @@ +bin_PROGRAMS = dtrain + +dtrain_SOURCES = dtrain.cc score.cc +dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval + diff --git a/training/dtrain/README.md b/training/dtrain/README.md new file mode 100644 index 00000000..7edabbf1 --- /dev/null +++ b/training/dtrain/README.md @@ -0,0 +1,48 @@ +This is a simple (and parallelizable) tuning method for cdec +which is able to train the weights of very many (sparse) features. +It was used here: + "Joint Feature Selection in Distributed Stochastic + Learning for Large-Scale Discriminative Training in + SMT" +(Simianer, Riezler, Dyer; ACL 2012) + + +Building +-------- +Builds when building cdec, see ../BUILDING . +To build only parts needed for dtrain do +``` + autoreconf -ifv + ./configure [--disable-gtest] + cd dtrain/; make +``` + +Running +------- +To run this on a dev set locally: +``` + #define DTRAIN_LOCAL +``` +otherwise remove that line or undef, then recompile. You need a single +grammar file or input annotated with per-sentence grammars (psg) as you +would use with cdec. Additionally you need to give dtrain a file with +references (--refs) when running locally. + +The input for use with hadoop streaming looks like this: +``` + <sid>\t<source>\t<ref>\t<grammar rules separated by \t> +``` +To convert a psg to this format you need to replace all "\n" +by "\t". Make sure there are no tabs in your data. + +For an example of local usage (with the 'distributed' format) +see test/example/ . 
This expects dtrain to be built without +DTRAIN_LOCAL. + +Legal +----- +Copyright (c) 2012 by Patrick Simianer <p@simianer.de> + +See the file ../LICENSE.txt for the licensing terms that this software is +released under. + diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc new file mode 100644 index 00000000..18286668 --- /dev/null +++ b/training/dtrain/dtrain.cc @@ -0,0 +1,657 @@ +#include "dtrain.h" + + +bool +dtrain_init(int argc, char** argv, po::variables_map* cfg) +{ + po::options_description ini("Configuration File Options"); + ini.add_options() + ("input", po::value<string>()->default_value("-"), "input file") + ("output", po::value<string>()->default_value("-"), "output weights file, '-' for STDOUT") + ("input_weights", po::value<string>(), "input weights file (e.g. from previous iteration)") + ("decoder_config", po::value<string>(), "configuration file for cdec") + ("print_weights", po::value<string>(), "weights to print on each iteration") + ("stop_after", po::value<unsigned>()->default_value(0), "stop after X input sentences") + ("tmp", po::value<string>()->default_value("/tmp"), "temp dir to use") + ("keep", po::value<bool>()->zero_tokens(), "keep weights files for each iteration") + ("hstreaming", po::value<string>(), "run in hadoop streaming mode, arg is a task id") + ("epochs", po::value<unsigned>()->default_value(10), "# of iterations T (per shard)") + ("k", po::value<unsigned>()->default_value(100), "how many translations to sample") + ("sample_from", po::value<string>()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'") + ("filter", po::value<string>()->default_value("uniq"), "filter kbest list: 'not', 'uniq'") + ("pair_sampling", po::value<string>()->default_value("XYX"), "how to sample pairs: 'all', 'XYX' or 'PRO'") + ("hi_lo", po::value<float>()->default_value(0.1), "hi and lo (X) for XYX (default 0.1), <= 0.5") + ("pair_threshold", po::value<score_t>()->default_value(0.), "bleu [0,1] threshold to 
filter pairs") + ("N", po::value<unsigned>()->default_value(4), "N for Ngrams (BLEU)") + ("scorer", po::value<string>()->default_value("stupid_bleu"), "scoring: bleu, stupid_, smooth_, approx_, lc_") + ("learning_rate", po::value<weight_t>()->default_value(1.0), "learning rate") + ("gamma", po::value<weight_t>()->default_value(0.), "gamma for SVM (0 for perceptron)") + ("select_weights", po::value<string>()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)") + ("rescale", po::value<bool>()->zero_tokens(), "rescale weight vector after each input") + ("l1_reg", po::value<string>()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)") + ("l1_reg_strength", po::value<weight_t>(), "l1 regularization strength") + ("fselect", po::value<weight_t>()->default_value(-1), "select top x percent (or by threshold) of features after each epoch NOT IMPLEMENTED") // TODO + ("approx_bleu_d", po::value<score_t>()->default_value(0.9), "discount for approx. BLEU") + ("scale_bleu_diff", po::value<bool>()->zero_tokens(), "learning rate <- bleu diff of a misranked pair") + ("loss_margin", po::value<weight_t>()->default_value(0.), "update if no error in pref pair but model scores this near") + ("max_pairs", po::value<unsigned>()->default_value(std::numeric_limits<unsigned>::max()), "max. 
# of pairs per Sent.") +#ifdef DTRAIN_LOCAL + ("refs,r", po::value<string>(), "references in local mode") +#endif + ("noup", po::value<bool>()->zero_tokens(), "do not update weights"); + po::options_description cl("Command Line Options"); + cl.add_options() + ("config,c", po::value<string>(), "dtrain config file") + ("quiet,q", po::value<bool>()->zero_tokens(), "be quiet") + ("verbose,v", po::value<bool>()->zero_tokens(), "be verbose"); + cl.add(ini); + po::store(parse_command_line(argc, argv, cl), *cfg); + if (cfg->count("config")) { + ifstream ini_f((*cfg)["config"].as<string>().c_str()); + po::store(po::parse_config_file(ini_f, ini), *cfg); + } + po::notify(*cfg); + if (!cfg->count("decoder_config")) { + cerr << cl << endl; + return false; + } + if (cfg->count("hstreaming") && (*cfg)["output"].as<string>() != "-") { + cerr << "When using 'hstreaming' the 'output' param should be '-'." << endl; + return false; + } +#ifdef DTRAIN_LOCAL + if ((*cfg)["input"].as<string>() == "-") { + cerr << "Can't use stdin as input with this binary. Recompile without DTRAIN_LOCAL" << endl; + return false; + } +#endif + if ((*cfg)["sample_from"].as<string>() != "kbest" + && (*cfg)["sample_from"].as<string>() != "forest") { + cerr << "Wrong 'sample_from' param: '" << (*cfg)["sample_from"].as<string>() << "', use 'kbest' or 'forest'." << endl; + return false; + } + if ((*cfg)["sample_from"].as<string>() == "kbest" && (*cfg)["filter"].as<string>() != "uniq" && + (*cfg)["filter"].as<string>() != "not") { + cerr << "Wrong 'filter' param: '" << (*cfg)["filter"].as<string>() << "', use 'uniq' or 'not'." << endl; + return false; + } + if ((*cfg)["pair_sampling"].as<string>() != "all" && (*cfg)["pair_sampling"].as<string>() != "XYX" && + (*cfg)["pair_sampling"].as<string>() != "PRO") { + cerr << "Wrong 'pair_sampling' param: '" << (*cfg)["pair_sampling"].as<string>() << "'." 
<< endl; + return false; + } + if(cfg->count("hi_lo") && (*cfg)["pair_sampling"].as<string>() != "XYX") { + cerr << "Warning: hi_lo only works with pair_sampling XYX." << endl; + } + if((*cfg)["hi_lo"].as<float>() > 0.5 || (*cfg)["hi_lo"].as<float>() < 0.01) { + cerr << "hi_lo must lie in [0.01, 0.5]" << endl; + return false; + } + if ((*cfg)["pair_threshold"].as<score_t>() < 0) { + cerr << "The threshold must be >= 0!" << endl; + return false; + } + if ((*cfg)["select_weights"].as<string>() != "last" && (*cfg)["select_weights"].as<string>() != "best" && + (*cfg)["select_weights"].as<string>() != "avg" && (*cfg)["select_weights"].as<string>() != "VOID") { + cerr << "Wrong 'select_weights' param: '" << (*cfg)["select_weights"].as<string>() << "', use 'last' or 'best'." << endl; + return false; + } + return true; +} + +int +main(int argc, char** argv) +{ + // handle most parameters + po::variables_map cfg; + if (!dtrain_init(argc, argv, &cfg)) exit(1); // something is wrong + bool quiet = false; + if (cfg.count("quiet")) quiet = true; + bool verbose = false; + if (cfg.count("verbose")) verbose = true; + bool noup = false; + if (cfg.count("noup")) noup = true; + bool hstreaming = false; + string task_id; + if (cfg.count("hstreaming")) { + hstreaming = true; + quiet = true; + task_id = cfg["hstreaming"].as<string>(); + cerr.precision(17); + } + bool rescale = false; + if (cfg.count("rescale")) rescale = true; + HSReporter rep(task_id); + bool keep = false; + if (cfg.count("keep")) keep = true; + + const unsigned k = cfg["k"].as<unsigned>(); + const unsigned N = cfg["N"].as<unsigned>(); + const unsigned T = cfg["epochs"].as<unsigned>(); + const unsigned stop_after = cfg["stop_after"].as<unsigned>(); + const string filter_type = cfg["filter"].as<string>(); + const string sample_from = cfg["sample_from"].as<string>(); + const string pair_sampling = cfg["pair_sampling"].as<string>(); + const score_t pair_threshold = cfg["pair_threshold"].as<score_t>(); + const string 
select_weights = cfg["select_weights"].as<string>(); + const float hi_lo = cfg["hi_lo"].as<float>(); + const score_t approx_bleu_d = cfg["approx_bleu_d"].as<score_t>(); + const unsigned max_pairs = cfg["max_pairs"].as<unsigned>(); + weight_t loss_margin = cfg["loss_margin"].as<weight_t>(); + if (loss_margin > 9998.) loss_margin = std::numeric_limits<float>::max(); + bool scale_bleu_diff = false; + if (cfg.count("scale_bleu_diff")) scale_bleu_diff = true; + bool average = false; + if (select_weights == "avg") + average = true; + vector<string> print_weights; + if (cfg.count("print_weights")) + boost::split(print_weights, cfg["print_weights"].as<string>(), boost::is_any_of(" ")); + + // setup decoder + register_feature_functions(); + SetSilent(true); + ReadFile ini_rf(cfg["decoder_config"].as<string>()); + if (!quiet) + cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl; + Decoder decoder(ini_rf.stream()); + + // scoring metric/scorer + string scorer_str = cfg["scorer"].as<string>(); + LocalScorer* scorer; + if (scorer_str == "bleu") { + scorer = dynamic_cast<BleuScorer*>(new BleuScorer); + } else if (scorer_str == "stupid_bleu") { + scorer = dynamic_cast<StupidBleuScorer*>(new StupidBleuScorer); + } else if (scorer_str == "smooth_bleu") { + scorer = dynamic_cast<SmoothBleuScorer*>(new SmoothBleuScorer); + } else if (scorer_str == "sum_bleu") { + scorer = dynamic_cast<SumBleuScorer*>(new SumBleuScorer); + } else if (scorer_str == "sumexp_bleu") { + scorer = dynamic_cast<SumExpBleuScorer*>(new SumExpBleuScorer); + } else if (scorer_str == "sumwhatever_bleu") { + scorer = dynamic_cast<SumWhateverBleuScorer*>(new SumWhateverBleuScorer); + } else if (scorer_str == "approx_bleu") { + scorer = dynamic_cast<ApproxBleuScorer*>(new ApproxBleuScorer(N, approx_bleu_d)); + } else if (scorer_str == "lc_bleu") { + scorer = dynamic_cast<LinearBleuScorer*>(new LinearBleuScorer(N)); + } else { + cerr << "Don't know scoring metric: '" << 
scorer_str << "', exiting." << endl; + exit(1); + } + vector<score_t> bleu_weights; + scorer->Init(N, bleu_weights); + + // setup decoder observer + MT19937 rng; // random number generator, only for forest sampling + HypSampler* observer; + if (sample_from == "kbest") + observer = dynamic_cast<KBestGetter*>(new KBestGetter(k, filter_type)); + else + observer = dynamic_cast<KSampler*>(new KSampler(k, &rng)); + observer->SetScorer(scorer); + + // init weights + vector<weight_t>& dense_weights = decoder.CurrentWeightVector(); + SparseVector<weight_t> lambdas, cumulative_penalties, w_average; + if (cfg.count("input_weights")) Weights::InitFromFile(cfg["input_weights"].as<string>(), &dense_weights); + Weights::InitSparseVector(dense_weights, &lambdas); + + // meta params for perceptron, SVM + weight_t eta = cfg["learning_rate"].as<weight_t>(); + weight_t gamma = cfg["gamma"].as<weight_t>(); + + // l1 regularization + bool l1naive = false; + bool l1clip = false; + bool l1cumul = false; + weight_t l1_reg = 0; + if (cfg["l1_reg"].as<string>() != "none") { + string s = cfg["l1_reg"].as<string>(); + if (s == "naive") l1naive = true; + else if (s == "clip") l1clip = true; + else if (s == "cumul") l1cumul = true; + l1_reg = cfg["l1_reg_strength"].as<weight_t>(); + } + + // output + string output_fn = cfg["output"].as<string>(); + // input + string input_fn = cfg["input"].as<string>(); + ReadFile input(input_fn); + // buffer input for t > 0 + vector<string> src_str_buf; // source strings (decoder takes only strings) + vector<vector<WordID> > ref_ids_buf; // references as WordID vecs + // where temp files go + string tmp_path = cfg["tmp"].as<string>(); +#ifdef DTRAIN_LOCAL + string refs_fn = cfg["refs"].as<string>(); + ReadFile refs(refs_fn); +#else + string grammar_buf_fn = gettmpf(tmp_path, "dtrain-grammars"); + ogzstream grammar_buf_out; + grammar_buf_out.open(grammar_buf_fn.c_str()); +#endif + + unsigned in_sz = std::numeric_limits<unsigned>::max(); // input index, input 
size + vector<pair<score_t, score_t> > all_scores; + score_t max_score = 0.; + unsigned best_it = 0; + float overall_time = 0.; + + // output cfg + if (!quiet) { + cerr << _p5; + cerr << endl << "dtrain" << endl << "Parameters:" << endl; + cerr << setw(25) << "k " << k << endl; + cerr << setw(25) << "N " << N << endl; + cerr << setw(25) << "T " << T << endl; + cerr << setw(25) << "scorer '" << scorer_str << "'" << endl; + if (scorer_str == "approx_bleu") + cerr << setw(25) << "approx. B discount " << approx_bleu_d << endl; + cerr << setw(25) << "sample from " << "'" << sample_from << "'" << endl; + if (sample_from == "kbest") + cerr << setw(25) << "filter " << "'" << filter_type << "'" << endl; + if (!scale_bleu_diff) cerr << setw(25) << "learning rate " << eta << endl; + else cerr << setw(25) << "learning rate " << "bleu diff" << endl; + cerr << setw(25) << "gamma " << gamma << endl; + cerr << setw(25) << "loss margin " << loss_margin << endl; + cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl; + if (pair_sampling == "XYX") + cerr << setw(25) << "hi lo " << hi_lo << endl; + cerr << setw(25) << "pair threshold " << pair_threshold << endl; + cerr << setw(25) << "select weights " << "'" << select_weights << "'" << endl; + if (cfg.count("l1_reg")) + cerr << setw(25) << "l1 reg " << l1_reg << " '" << cfg["l1_reg"].as<string>() << "'" << endl; + if (rescale) + cerr << setw(25) << "rescale " << rescale << endl; + cerr << setw(25) << "max pairs " << max_pairs << endl; + cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl; + cerr << setw(25) << "input " << "'" << input_fn << "'" << endl; +#ifdef DTRAIN_LOCAL + cerr << setw(25) << "refs " << "'" << refs_fn << "'" << endl; +#endif + cerr << setw(25) << "output " << "'" << output_fn << "'" << endl; + if (cfg.count("input_weights")) + cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as<string>() << "'" << endl; + if (stop_after > 0) + cerr << setw(25) 
<< "stop_after " << stop_after << endl; + if (!verbose) cerr << "(a dot represents " << DTRAIN_DOTS << " inputs)" << endl; + } + + + for (unsigned t = 0; t < T; t++) // T epochs + { + + if (hstreaming) cerr << "reporter:status:Iteration #" << t+1 << " of " << T << endl; + + time_t start, end; + time(&start); +#ifndef DTRAIN_LOCAL + igzstream grammar_buf_in; + if (t > 0) grammar_buf_in.open(grammar_buf_fn.c_str()); +#endif + score_t score_sum = 0.; + score_t model_sum(0); + unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0, f_count = 0, list_sz = 0; + if (!quiet) cerr << "Iteration #" << t+1 << " of " << T << "." << endl; + + while(true) + { + + string in; + bool next = false, stop = false; // next iteration or premature stop + if (t == 0) { + if(!getline(*input, in)) next = true; + } else { + if (ii == in_sz) next = true; // stop if we reach the end of our input + } + // stop after X sentences (but still go on for those) + if (stop_after > 0 && stop_after == ii && !next) stop = true; + + // produce some pretty output + if (!quiet && !verbose) { + if (ii == 0) cerr << " "; + if ((ii+1) % (DTRAIN_DOTS) == 0) { + cerr << "."; + cerr.flush(); + } + if ((ii+1) % (20*DTRAIN_DOTS) == 0) { + cerr << " " << ii+1 << endl; + if (!next && !stop) cerr << " "; + } + if (stop) { + if (ii % (20*DTRAIN_DOTS) != 0) cerr << " " << ii << endl; + cerr << "Stopping after " << stop_after << " input sentences." 
<< endl; + } else { + if (next) { + if (ii % (20*DTRAIN_DOTS) != 0) cerr << " " << ii << endl; + } + } + } + + // next iteration + if (next || stop) break; + + // weights + lambdas.init_vector(&dense_weights); + + // getting input + vector<WordID> ref_ids; // reference as vector<WordID> +#ifndef DTRAIN_LOCAL + vector<string> in_split; // input: sid\tsrc\tref\tpsg + if (t == 0) { + // handling input + split_in(in, in_split); + if (hstreaming && ii == 0) cerr << "reporter:counter:" << task_id << ",First ID," << in_split[0] << endl; + // getting reference + vector<string> ref_tok; + boost::split(ref_tok, in_split[2], boost::is_any_of(" ")); + register_and_convert(ref_tok, ref_ids); + ref_ids_buf.push_back(ref_ids); + // process and set grammar + bool broken_grammar = true; // ignore broken grammars + for (string::iterator it = in.begin(); it != in.end(); it++) { + if (!isspace(*it)) { + broken_grammar = false; + break; + } + } + if (broken_grammar) { + cerr << "Broken grammar for " << ii+1 << "! Ignoring this input." 
<< endl; + continue; + } + boost::replace_all(in, "\t", "\n"); + in += "\n"; + grammar_buf_out << in << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl; + decoder.AddSupplementalGrammarFromString(in); + src_str_buf.push_back(in_split[1]); + // decode + observer->SetRef(ref_ids); + decoder.Decode(in_split[1], observer); + } else { + // get buffered grammar + string grammar_str; + while (true) { + string rule; + getline(grammar_buf_in, rule); + if (boost::starts_with(rule, DTRAIN_GRAMMAR_DELIM)) break; + grammar_str += rule + "\n"; + } + decoder.AddSupplementalGrammarFromString(grammar_str); + // decode + observer->SetRef(ref_ids_buf[ii]); + decoder.Decode(src_str_buf[ii], observer); + } +#else + if (t == 0) { + string r_; + getline(*refs, r_); + vector<string> ref_tok; + boost::split(ref_tok, r_, boost::is_any_of(" ")); + register_and_convert(ref_tok, ref_ids); + ref_ids_buf.push_back(ref_ids); + src_str_buf.push_back(in); + } else { + ref_ids = ref_ids_buf[ii]; + } + observer->SetRef(ref_ids); + if (t == 0) + decoder.Decode(in, observer); + else + decoder.Decode(src_str_buf[ii], observer); +#endif + + // get (scored) samples + vector<ScoredHyp>* samples = observer->GetSamples(); + + if (verbose) { + cerr << "--- ref for " << ii << ": "; + if (t > 0) printWordIDVec(ref_ids_buf[ii]); + else printWordIDVec(ref_ids); + cerr << endl; + for (unsigned u = 0; u < samples->size(); u++) { + cerr << _p2 << _np << "[" << u << ". 
'"; + printWordIDVec((*samples)[u].w); + cerr << "'" << endl; + cerr << "SCORE=" << (*samples)[u].score << ",model="<< (*samples)[u].model << endl; + cerr << "F{" << (*samples)[u].f << "} ]" << endl << endl; + } + } + + score_sum += (*samples)[0].score; // stats for 1best + model_sum += (*samples)[0].model; + + f_count += observer->get_f_count(); + list_sz += observer->get_sz(); + + // weight updates + if (!noup) { + // get pairs + vector<pair<ScoredHyp,ScoredHyp> > pairs; + if (pair_sampling == "all") + all_pairs(samples, pairs, pair_threshold, max_pairs); + if (pair_sampling == "XYX") + partXYX(samples, pairs, pair_threshold, max_pairs, hi_lo); + if (pair_sampling == "PRO") + PROsampling(samples, pairs, pair_threshold, max_pairs); + npairs += pairs.size(); + + for (vector<pair<ScoredHyp,ScoredHyp> >::iterator it = pairs.begin(); + it != pairs.end(); it++) { +#ifdef DTRAIN_FASTER_PERCEPTRON + bool rank_error = true; // pair sampling already did this for us + rank_errors++; + score_t margin = std::numeric_limits<float>::max(); +#else + bool rank_error = it->first.model <= it->second.model; + if (rank_error) rank_errors++; + score_t margin = fabs(fabs(it->first.model) - fabs(it->second.model)); + if (!rank_error && margin < loss_margin) margin_violations++; +#endif + if (scale_bleu_diff) eta = it->first.score - it->second.score; + if (rank_error || margin < loss_margin) { + SparseVector<weight_t> diff_vec = it->first.f - it->second.f; + lambdas.plus_eq_v_times_s(diff_vec, eta); + if (gamma) + lambdas.plus_eq_v_times_s(lambdas, -2*gamma*eta*(1./npairs)); + } + } + + // l1 regularization + if (l1naive) { + for (unsigned d = 0; d < lambdas.size(); d++) { + weight_t v = lambdas.get(d); + lambdas.set_value(d, v - sign(v) * l1_reg); + } + } else if (l1clip) { + for (unsigned d = 0; d < lambdas.size(); d++) { + if (lambdas.nonzero(d)) { + weight_t v = lambdas.get(d); + if (v > 0) { + lambdas.set_value(d, max(0., v - l1_reg)); + } else { + lambdas.set_value(d, min(0., v + 
l1_reg)); + } + } + } + } else if (l1cumul) { + weight_t acc_penalty = (ii+1) * l1_reg; // ii is the index of the current input + for (unsigned d = 0; d < lambdas.size(); d++) { + if (lambdas.nonzero(d)) { + weight_t v = lambdas.get(d); + weight_t penalty = 0; + if (v > 0) { + penalty = max(0., v-(acc_penalty + cumulative_penalties.get(d))); + } else { + penalty = min(0., v+(acc_penalty - cumulative_penalties.get(d))); + } + lambdas.set_value(d, penalty); + cumulative_penalties.set_value(d, cumulative_penalties.get(d)+penalty); + } + } + } + + } + + if (rescale) lambdas /= lambdas.l2norm(); + + ++ii; + + if (hstreaming) { + rep.update_counter("Seen #"+boost::lexical_cast<string>(t+1), 1u); + rep.update_counter("Seen", 1u); + } + + } // input loop + + if (average) w_average += lambdas; + + if (scorer_str == "approx_bleu" || scorer_str == "lc_bleu") scorer->Reset(); + + if (t == 0) { + in_sz = ii; // remember size of input (# lines) + if (hstreaming) { + rep.update_counter("|Input|", ii); + rep.update_gcounter("|Input|", ii); + rep.update_gcounter("Shards", 1u); + } + } + +#ifndef DTRAIN_LOCAL + if (t == 0) { + grammar_buf_out.close(); + } else { + grammar_buf_in.close(); + } +#endif + + // print some stats + score_t score_avg = score_sum/(score_t)in_sz; + score_t model_avg = model_sum/(score_t)in_sz; + score_t score_diff, model_diff; + if (t > 0) { + score_diff = score_avg - all_scores[t-1].first; + model_diff = model_avg - all_scores[t-1].second; + } else { + score_diff = score_avg; + model_diff = model_avg; + } + + unsigned nonz = 0; + if (!quiet || hstreaming) nonz = (unsigned)lambdas.num_nonzero(); + + if (!quiet) { + cerr << _p5 << _p << "WEIGHTS" << endl; + for (vector<string>::iterator it = print_weights.begin(); it != print_weights.end(); it++) { + cerr << setw(18) << *it << " = " << lambdas.get(FD::Convert(*it)) << endl; + } + cerr << " ---" << endl; + cerr << _np << " 1best avg score: " << score_avg; + cerr << _p << " (" << score_diff << ")" << endl; + 
cerr << _np << " 1best avg model score: " << model_avg; + cerr << _p << " (" << model_diff << ")" << endl; + cerr << " avg # pairs: "; + cerr << _np << npairs/(float)in_sz << endl; + cerr << " avg # rank err: "; + cerr << rank_errors/(float)in_sz << endl; +#ifndef DTRAIN_FASTER_PERCEPTRON + cerr << " avg # margin viol: "; + cerr << margin_violations/(float)in_sz << endl; +#endif + cerr << " non0 feature count: " << nonz << endl; + cerr << " avg list sz: " << list_sz/(float)in_sz << endl; + cerr << " avg f count: " << f_count/(float)list_sz << endl; + } + + if (hstreaming) { + rep.update_counter("Score 1best avg #"+boost::lexical_cast<string>(t+1), (unsigned)(score_avg*DTRAIN_SCALE)); + rep.update_counter("Model 1best avg #"+boost::lexical_cast<string>(t+1), (unsigned)(model_avg*DTRAIN_SCALE)); + rep.update_counter("Pairs avg #"+boost::lexical_cast<string>(t+1), (unsigned)((npairs/(weight_t)in_sz)*DTRAIN_SCALE)); + rep.update_counter("Rank errors avg #"+boost::lexical_cast<string>(t+1), (unsigned)((rank_errors/(weight_t)in_sz)*DTRAIN_SCALE)); + rep.update_counter("Margin violations avg #"+boost::lexical_cast<string>(t+1), (unsigned)((margin_violations/(weight_t)in_sz)*DTRAIN_SCALE)); + rep.update_counter("Non zero feature count #"+boost::lexical_cast<string>(t+1), nonz); + rep.update_gcounter("Non zero feature count #"+boost::lexical_cast<string>(t+1), nonz); + } + + pair<score_t,score_t> remember; + remember.first = score_avg; + remember.second = model_avg; + all_scores.push_back(remember); + if (score_avg > max_score) { + max_score = score_avg; + best_it = t; + } + time (&end); + float time_diff = difftime(end, start); + overall_time += time_diff; + if (!quiet) { + cerr << _p2 << _np << "(time " << time_diff/60. 
<< " min, "; + cerr << time_diff/in_sz << " s/S)" << endl; + } + if (t+1 != T && !quiet) cerr << endl; + + if (noup) break; + + // write weights to file + if (select_weights == "best" || keep) { + lambdas.init_vector(&dense_weights); + string w_fn = "weights." + boost::lexical_cast<string>(t) + ".gz"; + Weights::WriteToFile(w_fn, dense_weights, true); + } + + } // outer loop + + if (average) w_average /= (weight_t)T; + +#ifndef DTRAIN_LOCAL + unlink(grammar_buf_fn.c_str()); +#endif + + if (!noup) { + if (!quiet) cerr << endl << "Writing weights file to '" << output_fn << "' ..." << endl; + if (select_weights == "last" || average) { // last, average + WriteFile of(output_fn); // works with '-' + ostream& o = *of.stream(); + o.precision(17); + o << _np; + if (average) { + for (SparseVector<weight_t>::iterator it = w_average.begin(); it != w_average.end(); ++it) { + if (it->second == 0) continue; + o << FD::Convert(it->first) << '\t' << it->second << endl; + } + } else { + for (SparseVector<weight_t>::iterator it = lambdas.begin(); it != lambdas.end(); ++it) { + if (it->second == 0) continue; + o << FD::Convert(it->first) << '\t' << it->second << endl; + } + } + } else if (select_weights == "VOID") { // do nothing with the weights + } else { // best + if (output_fn != "-") { + CopyFile("weights."+boost::lexical_cast<string>(best_it)+".gz", output_fn); + } else { + ReadFile bestw("weights."+boost::lexical_cast<string>(best_it)+".gz"); + string o; + cout.precision(17); + cout << _np; + while(getline(*bestw, o)) cout << o << endl; + } + if (!keep) { + for (unsigned i = 0; i < T; i++) { + string s = "weights." + boost::lexical_cast<string>(i) + ".gz"; + unlink(s.c_str()); + } + } + } + if (output_fn == "-" && hstreaming) cout << "__SHARD_COUNT__\t1" << endl; + if (!quiet) cerr << "done" << endl; + } + + if (!quiet) { + cerr << _p5 << _np << endl << "---" << endl << "Best iteration: "; + cerr << best_it+1 << " [SCORE '" << scorer_str << "'=" << max_score << "]." 
<< endl; + cerr << "This took " << overall_time/60. << " min." << endl; + } +} + diff --git a/training/dtrain/dtrain.h b/training/dtrain/dtrain.h new file mode 100644 index 00000000..4b6f415c --- /dev/null +++ b/training/dtrain/dtrain.h @@ -0,0 +1,97 @@ +#ifndef _DTRAIN_H_ +#define _DTRAIN_H_ + +#undef DTRAIN_FASTER_PERCEPTRON // only look at misranked pairs + // DO NOT USE WITH SVM! +//#define DTRAIN_LOCAL +#define DTRAIN_DOTS 10 // after how many inputs to display a '.' +#define DTRAIN_GRAMMAR_DELIM "########EOS########" +#define DTRAIN_SCALE 100000 + + +#include <iomanip> +#include <climits> +#include <string.h> + +#include <boost/algorithm/string.hpp> +#include <boost/program_options.hpp> + +#include "ksampler.h" +#include "pairsampling.h" + +#include "filelib.h" + + +using namespace std; +using namespace dtrain; +namespace po = boost::program_options; + +inline void register_and_convert(const vector<string>& strs, vector<WordID>& ids) +{ + vector<string>::const_iterator it; + for (it = strs.begin(); it < strs.end(); it++) + ids.push_back(TD::Convert(*it)); +} + +inline string gettmpf(const string path, const string infix) +{ + char fn[path.size() + infix.size() + 8]; + strcpy(fn, path.c_str()); + strcat(fn, "/"); + strcat(fn, infix.c_str()); + strcat(fn, "-XXXXXX"); + if (!mkstemp(fn)) { + cerr << "Cannot make temp file in" << path << " , exiting." 
<< endl; + exit(1); + } + return string(fn); +} + +inline void split_in(string& s, vector<string>& parts) +{ + unsigned f = 0; + for(unsigned i = 0; i < 3; i++) { + unsigned e = f; + f = s.find("\t", f+1); + if (e != 0) parts.push_back(s.substr(e+1, f-e-1)); + else parts.push_back(s.substr(0, f)); + } + s.erase(0, f+1); +} + +struct HSReporter +{ + string task_id_; + + HSReporter(string task_id) : task_id_(task_id) {} + + inline void update_counter(string name, unsigned amount) { + cerr << "reporter:counter:" << task_id_ << "," << name << "," << amount << endl; + } + inline void update_gcounter(string name, unsigned amount) { + cerr << "reporter:counter:Global," << name << "," << amount << endl; + } +}; + +inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); } +inline ostream& _p(ostream& out) { return out << setiosflags(ios::showpos); } +inline ostream& _p2(ostream& out) { return out << setprecision(2); } +inline ostream& _p5(ostream& out) { return out << setprecision(5); } + +inline void printWordIDVec(vector<WordID>& v) +{ + for (unsigned i = 0; i < v.size(); i++) { + cerr << TD::Convert(v[i]); + if (i < v.size()-1) cerr << " "; + } +} + +template<typename T> +inline T sign(T z) +{ + if (z == 0) return 0; + return z < 0 ? 
-1 : +1; +} + +#endif + diff --git a/training/dtrain/hstreaming/avg.rb b/training/dtrain/hstreaming/avg.rb new file mode 100755 index 00000000..2599c732 --- /dev/null +++ b/training/dtrain/hstreaming/avg.rb @@ -0,0 +1,32 @@ +#!/usr/bin/env ruby +# first arg may be an int of custom shard count + +shard_count_key = "__SHARD_COUNT__" + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +w = {} +c = {} +w.default = 0 +c.default = 0 +while line = STDIN.gets + key, val = line.split /\s/ + w[key] += val.to_f + c[key] += 1 +end + +if ARGV.size == 0 + shard_count = w["__SHARD_COUNT__"] +else + shard_count = ARGV[0].to_f +end +w.each_key { |k| + if k == shard_count_key + next + else + puts "#{k}\t#{w[k]/shard_count}" + #puts "# #{c[k]}" + end +} + diff --git a/training/dtrain/hstreaming/cdec.ini b/training/dtrain/hstreaming/cdec.ini new file mode 100644 index 00000000..d4f5cecd --- /dev/null +++ b/training/dtrain/hstreaming/cdec.ini @@ -0,0 +1,22 @@ +formalism=scfg +add_pass_through_rules=true +scfg_max_span_limit=15 +intersection_strategy=cube_pruning +cubepruning_pop_limit=30 +feature_function=WordPenalty +feature_function=KLanguageModel nc-wmt11.en.srilm.gz +#feature_function=ArityPenalty +#feature_function=CMR2008ReorderingFeatures +#feature_function=Dwarf +#feature_function=InputIndicator +#feature_function=LexNullJump +#feature_function=NewJump +#feature_function=NgramFeatures +#feature_function=NonLatinCount +#feature_function=OutputIndicator +#feature_function=RuleIdentityFeatures +#feature_function=RuleNgramFeatures +#feature_function=RuleShape +#feature_function=SourceSpanSizeFeatures +#feature_function=SourceWordPenalty +#feature_function=SpanFeatures diff --git a/training/dtrain/hstreaming/dtrain.ini b/training/dtrain/hstreaming/dtrain.ini new file mode 100644 index 00000000..a2c219a1 --- /dev/null +++ b/training/dtrain/hstreaming/dtrain.ini @@ -0,0 +1,15 @@ +input=- +output=- +decoder_config=cdec.ini +tmp=/var/hadoop/mapred/local/ +epochs=1 +k=100 +N=4 
+learning_rate=0.0001 +gamma=0 +scorer=stupid_bleu +sample_from=kbest +filter=uniq +pair_sampling=XYX +pair_threshold=0 +select_weights=last diff --git a/training/dtrain/hstreaming/dtrain.sh b/training/dtrain/hstreaming/dtrain.sh new file mode 100755 index 00000000..877ff94c --- /dev/null +++ b/training/dtrain/hstreaming/dtrain.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# script to run dtrain with a task id + +pushd . &>/dev/null +cd .. +ID=$(basename $(pwd)) # attempt_... +popd &>/dev/null +./dtrain -c dtrain.ini --hstreaming $ID + diff --git a/training/dtrain/hstreaming/hadoop-streaming-job.sh b/training/dtrain/hstreaming/hadoop-streaming-job.sh new file mode 100755 index 00000000..92419956 --- /dev/null +++ b/training/dtrain/hstreaming/hadoop-streaming-job.sh @@ -0,0 +1,30 @@ +#!/bin/sh + +EXP=a_simple_test + +# change these vars to fit your hadoop installation +HADOOP_HOME=/usr/lib/hadoop-0.20 +JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar +HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR" + + IN=input_on_hdfs +OUT=output_weights_on_hdfs + +# you can -reducer to NONE if you want to +# do feature selection/averaging locally (e.g. 
to +# keep weights of all epochs) +$HSTREAMING \ + -mapper "dtrain.sh" \ + -reducer "ruby lplp.rb l2 select_k 100000" \ + -input $IN \ + -output $OUT \ + -file dtrain.sh \ + -file lplp.rb \ + -file ../dtrain \ + -file dtrain.ini \ + -file cdec.ini \ + -file ../test/example/nc-wmt11.en.srilm.gz \ + -jobconf mapred.reduce.tasks=30 \ + -jobconf mapred.max.map.failures.percent=0 \ + -jobconf mapred.job.name="dtrain $EXP" + diff --git a/training/dtrain/hstreaming/lplp.rb b/training/dtrain/hstreaming/lplp.rb new file mode 100755 index 00000000..f0cd58c5 --- /dev/null +++ b/training/dtrain/hstreaming/lplp.rb @@ -0,0 +1,131 @@ +# lplp.rb + +# norms +def l0(feature_column, n) + if feature_column.size >= n then return 1 else return 0 end +end + +def l1(feature_column, n=-1) + return feature_column.map { |i| i.abs }.reduce { |sum,i| sum+i } +end + +def l2(feature_column, n=-1) + return Math.sqrt feature_column.map { |i| i.abs2 }.reduce { |sum,i| sum+i } +end + +def linfty(feature_column, n=-1) + return feature_column.map { |i| i.abs }.max +end + +# stats +def median(feature_column, n) + return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0}).sort[feature_column.size/2] +end + +def mean(feature_column, n) + return feature_column.reduce { |sum, i| sum+i } / n +end + +# selection +def select_k(weights, norm_fun, n, k=10000) + weights.sort{|a,b| norm_fun.call(b[1], n) <=> norm_fun.call(a[1], n)}.each { |p| + puts "#{p[0]}\t#{mean(p[1], n)}" + k -= 1 + if k == 0 then break end + } +end + +def cut(weights, norm_fun, n, epsilon=0.0001) + weights.each { |k,v| + if norm_fun.call(v, n).abs >= epsilon + puts "#{k}\t#{mean(v, n)}" + end + } +end + +# test +def _test() + puts + w = {} + w["a"] = [1, 2, 3] + w["b"] = [1, 2] + w["c"] = [66] + w["d"] = [10, 20, 30] + n = 3 + puts w.to_s + puts + puts "select_k" + puts "l0 expect ad" + select_k(w, method(:l0), n, 2) + puts "l1 expect cd" + select_k(w, method(:l1), n, 2) + puts "l2 expect c" + select_k(w, method(:l2), n, 1) + 
# Print the command line help to standard output, then terminate the script.
def usage()
  [
    "lplp.rb <l0,l1,l2,linfty,mean,median> <cut|select_k> <k|threshold> [n] < <input>",
    " l0...: norms for selection",
    "select_k: only output top k (according to the norm of their column vector) features",
    " cut: output features with weight >= threshold",
    " n: if we do not have a shard count use this number for averaging"
  ].each { |help_line| puts help_line }
  exit
end
key + w[key].push val.to_f + else + w[key] = [val.to_f] + end +end + +if ARGV.size == 4 then shard_count = ARGV[3].to_f end + +if type == 'cut' + cut(w, norm_fun, shard_count, x) +elsif type == 'select_k' + select_k(w, norm_fun, shard_count, x) +else + puts "oh oh" +end + diff --git a/training/dtrain/hstreaming/red-test b/training/dtrain/hstreaming/red-test new file mode 100644 index 00000000..2623d697 --- /dev/null +++ b/training/dtrain/hstreaming/red-test @@ -0,0 +1,9 @@ +a 1 +b 2 +c 3.5 +a 1 +b 2 +c 3.5 +d 1 +e 2 +__SHARD_COUNT__ 2 diff --git a/training/dtrain/kbestget.h b/training/dtrain/kbestget.h new file mode 100644 index 00000000..dd8882e1 --- /dev/null +++ b/training/dtrain/kbestget.h @@ -0,0 +1,152 @@ +#ifndef _DTRAIN_KBESTGET_H_ +#define _DTRAIN_KBESTGET_H_ + +#include "kbest.h" // cdec +#include "sentence_metadata.h" + +#include "verbose.h" +#include "viterbi.h" +#include "ff_register.h" +#include "decoder.h" +#include "weights.h" +#include "logval.h" + +using namespace std; + +namespace dtrain +{ + + +typedef double score_t; + +struct ScoredHyp +{ + vector<WordID> w; + SparseVector<double> f; + score_t model; + score_t score; + unsigned rank; +}; + +struct LocalScorer +{ + unsigned N_; + vector<score_t> w_; + + virtual score_t + Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned src_len)=0; + + void Reset() {} // only for approx bleu + + inline void + Init(unsigned N, vector<score_t> weights) + { + assert(N > 0); + N_ = N; + if (weights.empty()) for (unsigned i = 0; i < N_; i++) w_.push_back(1./N_); + else w_ = weights; + } + + inline score_t + brevity_penalty(const unsigned hyp_len, const unsigned ref_len) + { + if (hyp_len > ref_len) return 1; + return exp(1 - (score_t)ref_len/hyp_len); + } +}; + +struct HypSampler : public DecoderObserver +{ + LocalScorer* scorer_; + vector<WordID>* ref_; + unsigned f_count_, sz_; + virtual vector<ScoredHyp>* GetSamples()=0; + inline void SetScorer(LocalScorer* scorer) { scorer_ = 
scorer; } + inline void SetRef(vector<WordID>& ref) { ref_ = &ref; } + inline unsigned get_f_count() { return f_count_; } + inline unsigned get_sz() { return sz_; } +}; +//////////////////////////////////////////////////////////////////////////////// + + + + +struct KBestGetter : public HypSampler +{ + const unsigned k_; + const string filter_type_; + vector<ScoredHyp> s_; + unsigned src_len_; + + KBestGetter(const unsigned k, const string filter_type) : + k_(k), filter_type_(filter_type) {} + + virtual void + NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) + { + src_len_ = smeta.GetSourceLength(); + KBestScored(*hg); + } + + vector<ScoredHyp>* GetSamples() { return &s_; } + + void + KBestScored(const Hypergraph& forest) + { + if (filter_type_ == "uniq") { + KBestUnique(forest); + } else if (filter_type_ == "not") { + KBestNoFilter(forest); + } + } + + void + KBestUnique(const Hypergraph& forest) + { + s_.clear(); sz_ = f_count_ = 0; + KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, + KBest::FilterUnique, prob_t, EdgeProb> kbest(forest, k_); + for (unsigned i = 0; i < k_; ++i) { + const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique, + prob_t, EdgeProb>::Derivation* d = + kbest.LazyKthBest(forest.nodes_.size() - 1, i); + if (!d) break; + ScoredHyp h; + h.w = d->yield; + h.f = d->feature_values; + h.model = log(d->score); + h.rank = i; + h.score = scorer_->Score(h.w, *ref_, i, src_len_); + s_.push_back(h); + sz_++; + f_count_ += h.f.size(); + } + } + + void + KBestNoFilter(const Hypergraph& forest) + { + s_.clear(); sz_ = f_count_ = 0; + KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, k_); + for (unsigned i = 0; i < k_; ++i) { + const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(forest.nodes_.size() - 1, i); + if (!d) break; + ScoredHyp h; + h.w = d->yield; + h.f = d->feature_values; + h.model = log(d->score); + h.rank = 
i; + h.score = scorer_->Score(h.w, *ref_, i, src_len_); + s_.push_back(h); + sz_++; + f_count_ += h.f.size(); + } + } +}; + + +} // namespace + +#endif + diff --git a/training/dtrain/ksampler.h b/training/dtrain/ksampler.h new file mode 100644 index 00000000..bc2f56cd --- /dev/null +++ b/training/dtrain/ksampler.h @@ -0,0 +1,61 @@ +#ifndef _DTRAIN_KSAMPLER_H_ +#define _DTRAIN_KSAMPLER_H_ + +#include "hg_sampler.h" // cdec +#include "kbestget.h" +#include "score.h" + +namespace dtrain +{ + +bool +cmp_hyp_by_model_d(ScoredHyp a, ScoredHyp b) +{ + return a.model > b.model; +} + +struct KSampler : public HypSampler +{ + const unsigned k_; + vector<ScoredHyp> s_; + MT19937* prng_; + score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>); + unsigned src_len_; + + explicit KSampler(const unsigned k, MT19937* prng) : + k_(k), prng_(prng) {} + + virtual void + NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) + { + src_len_ = smeta.GetSourceLength(); + ScoredSamples(*hg); + } + + vector<ScoredHyp>* GetSamples() { return &s_; } + + void ScoredSamples(const Hypergraph& forest) { + s_.clear(); sz_ = f_count_ = 0; + std::vector<HypergraphSampler::Hypothesis> samples; + HypergraphSampler::sample_hypotheses(forest, k_, prng_, &samples); + for (unsigned i = 0; i < k_; ++i) { + ScoredHyp h; + h.w = samples[i].words; + h.f = samples[i].fmap; + h.model = log(samples[i].model_score); + h.rank = i; + h.score = scorer_->Score(h.w, *ref_, i, src_len_); + s_.push_back(h); + sz_++; + f_count_ += h.f.size(); + } + sort(s_.begin(), s_.end(), cmp_hyp_by_model_d); + for (unsigned i = 0; i < s_.size(); i++) s_[i].rank = i; + } +}; + + +} // namespace + +#endif + diff --git a/training/dtrain/pairsampling.h b/training/dtrain/pairsampling.h new file mode 100644 index 00000000..84be1efb --- /dev/null +++ b/training/dtrain/pairsampling.h @@ -0,0 +1,149 @@ +#ifndef _DTRAIN_PAIRSAMPLING_H_ +#define _DTRAIN_PAIRSAMPLING_H_ + +namespace dtrain +{ 
+ + +bool +accept_pair(score_t a, score_t b, score_t threshold) +{ + if (fabs(a - b) < threshold) return false; + return true; +} + +bool +cmp_hyp_by_score_d(ScoredHyp a, ScoredHyp b) +{ + return a.score > b.score; +} + +inline void +all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, float _unused=1) +{ + sort(s->begin(), s->end(), cmp_hyp_by_score_d); + unsigned sz = s->size(); + bool b = false; + unsigned count = 0; + for (unsigned i = 0; i < sz-1; i++) { + for (unsigned j = i+1; j < sz; j++) { + if (threshold > 0) { + if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) + training.push_back(make_pair((*s)[i], (*s)[j])); + } else { + if ((*s)[i].score != (*s)[j].score) + training.push_back(make_pair((*s)[i], (*s)[j])); + } + if (++count == max) { + b = true; + break; + } + } + if (b) break; + } +} + +/* + * multipartite ranking + * sort (descending) by bleu + * compare top X to middle Y and low X + * cmp middle Y to low X + */ + +inline void +partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, float hi_lo) +{ + unsigned sz = s->size(); + if (sz < 2) return; + sort(s->begin(), s->end(), cmp_hyp_by_score_d); + unsigned sep = round(sz*hi_lo); + unsigned sep_hi = sep; + if (sz > 4) while (sep_hi < sz && (*s)[sep_hi-1].score == (*s)[sep_hi].score) ++sep_hi; + else sep_hi = 1; + bool b = false; + unsigned count = 0; + for (unsigned i = 0; i < sep_hi; i++) { + for (unsigned j = sep_hi; j < sz; j++) { +#ifdef DTRAIN_FASTER_PERCEPTRON + if ((*s)[i].model <= (*s)[j].model) { +#endif + if (threshold > 0) { + if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) + training.push_back(make_pair((*s)[i], (*s)[j])); + } else { + if ((*s)[i].score != (*s)[j].score) + training.push_back(make_pair((*s)[i], (*s)[j])); + } + if (++count == max) { + b = true; + break; + } +#ifdef DTRAIN_FASTER_PERCEPTRON + } +#endif + } + if (b) break; + } + unsigned sep_lo = 
sz-sep; + while (sep_lo > 0 && (*s)[sep_lo-1].score == (*s)[sep_lo].score) --sep_lo; + for (unsigned i = sep_hi; i < sz-sep_lo; i++) { + for (unsigned j = sz-sep_lo; j < sz; j++) { +#ifdef DTRAIN_FASTER_PERCEPTRON + if ((*s)[i].model <= (*s)[j].model) { +#endif + if (threshold > 0) { + if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) + training.push_back(make_pair((*s)[i], (*s)[j])); + } else { + if ((*s)[i].score != (*s)[j].score) + training.push_back(make_pair((*s)[i], (*s)[j])); + } + if (++count == max) return; +#ifdef DTRAIN_FASTER_PERCEPTRON + } +#endif + } + } +} + +/* + * pair sampling as in + * 'Tuning as Ranking' (Hopkins & May, 2011) + * count = 5000 + * threshold = 5% BLEU (0.05 for param 3) + * cut = top 50 + */ +bool +_PRO_cmp_pair_by_diff_d(pair<ScoredHyp,ScoredHyp> a, pair<ScoredHyp,ScoredHyp> b) +{ + return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score)); +} +inline void +PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, float _unused=1) +{ + unsigned max_count = 5000, count = 0, sz = s->size(); + bool b = false; + for (unsigned i = 0; i < sz-1; i++) { + for (unsigned j = i+1; j < sz; j++) { + if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) { + training.push_back(make_pair((*s)[i], (*s)[j])); + if (++count == max_count) { + b = true; + break; + } + } + } + if (b) break; + } + if (training.size() > 50) { + sort(training.begin(), training.end(), _PRO_cmp_pair_by_diff_d); + training.erase(training.begin()+50, training.end()); + } + return; +} + + +} // namespace + +#endif + diff --git a/training/dtrain/parallelize.rb b/training/dtrain/parallelize.rb new file mode 100755 index 00000000..eb4148f5 --- /dev/null +++ b/training/dtrain/parallelize.rb @@ -0,0 +1,80 @@ +#!/usr/bin/env ruby + + +if ARGV.size != 5 + STDERR.write "Usage: " + STDERR.write "ruby parallelize.rb <#shards> <input> <refs> <epochs> <dtrain.ini>\n" + exit +end + +cdec_dir = 
# Split the parallel files input and refs line by line into num_shards
# consecutive shards work/shard.<i>.in and work/shard.<i>.refs.
# Every shard gets (number of input lines / num_shards) line pairs, the last
# shard additionally takes the remainder. The directory work/ has to exist.
# Raises ArgumentError for num_shards < 1 and RuntimeError if refs has fewer
# lines than input.
def make_shards(input, refs, num_shards)
  raise ArgumentError, "need at least one shard" unless num_shards > 0
  # counted in Ruby: no shell involved, so file names with spaces or shell
  # metacharacters are safe, and a last line without newline is counted too
  lc = File.foreach(input).count
  shard_sz, leftover = lc.divmod(num_shards)
  # block forms close every file, also when an error is raised
  File.open(input, 'r') do |in_f|
    File.open(refs, 'r') do |refs_f|
      0.upto(num_shards-1) do |shard|
        pairs = shard_sz
        pairs += leftover if shard == num_shards-1
        File.open("work/shard.#{shard}.in", 'w') do |shard_in|
          File.open("work/shard.#{shard}.refs", 'w') do |shard_refs|
            pairs.times do
              src = in_f.gets
              ref = refs_f.gets
              raise "'#{refs}' has fewer lines than '#{input}'" if src.nil? || ref.nil?
              shard_in.write src
              shard_refs.write ref
            end
          end
        end
      end
    end
  end
end
00000000..34fc86a9 --- /dev/null +++ b/training/dtrain/score.cc @@ -0,0 +1,254 @@ +#include "score.h" + +namespace dtrain +{ + + +/* + * bleu + * + * as in "BLEU: a Method for Automatic Evaluation + * of Machine Translation" + * (Papineni et al. '02) + * + * NOTE: 0 if for one n \in {1..N} count is 0 + */ +score_t +BleuScorer::Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len) +{ + if (hyp_len == 0 || ref_len == 0) return 0.; + unsigned M = N_; + vector<score_t> v = w_; + if (ref_len < N_) { + M = ref_len; + for (unsigned i = 0; i < M; i++) v[i] = 1/((score_t)M); + } + score_t sum = 0; + for (unsigned i = 0; i < M; i++) { + if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) return 0.; + sum += v[i] * log((score_t)counts.clipped_[i]/counts.sum_[i]); + } + return brevity_penalty(hyp_len, ref_len) * exp(sum); +} + +score_t +BleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, + const unsigned /*rank*/, const unsigned /*src_len*/) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (hyp_len == 0 || ref_len == 0) return 0.; + NgramCounts counts = make_ngram_counts(hyp, ref, N_); + return Bleu(counts, hyp_len, ref_len); +} + +/* + * 'stupid' bleu + * + * as in "ORANGE: a Method for Evaluating + * Automatic Evaluation Metrics + * for Machine Translation" + * (Lin & Och '04) + * + * NOTE: 0 iff no 1gram match + */ +score_t +StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, + const unsigned /*rank*/, const unsigned /*src_len*/) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (hyp_len == 0 || ref_len == 0) return 0.; + NgramCounts counts = make_ngram_counts(hyp, ref, N_); + unsigned M = N_; + vector<score_t> v = w_; + if (ref_len < N_) { + M = ref_len; + for (unsigned i = 0; i < M; i++) v[i] = 1/((score_t)M); + } + score_t sum = 0, add = 0; + for (unsigned i = 0; i < M; i++) { + if (i == 0 && (counts.sum_[i] == 0 || counts.clipped_[i] == 0)) return 0.; + if (i == 1) add = 1; + sum += v[i] * 
log(((score_t)counts.clipped_[i] + add)/((counts.sum_[i] + add))); + } + return brevity_penalty(hyp_len, ref_len) * exp(sum); +} + +/* + * smooth bleu + * + * as in "An End-to-End Discriminative Approach + * to Machine Translation" + * (Liang et al. '06) + * + * NOTE: max is 0.9375 (with N=4) + */ +score_t +SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, + const unsigned /*rank*/, const unsigned /*src_len*/) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (hyp_len == 0 || ref_len == 0) return 0.; + NgramCounts counts = make_ngram_counts(hyp, ref, N_); + unsigned M = N_; + if (ref_len < N_) M = ref_len; + score_t sum = 0.; + vector<score_t> i_bleu; + for (unsigned i = 0; i < M; i++) i_bleu.push_back(0.); + for (unsigned i = 0; i < M; i++) { + if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) { + break; + } else { + score_t i_ng = log((score_t)counts.clipped_[i]/counts.sum_[i]); + for (unsigned j = i; j < M; j++) { + i_bleu[j] += (1/((score_t)j+1)) * i_ng; + } + } + sum += exp(i_bleu[i])/pow(2.0, (double)(N_-i)); + } + return brevity_penalty(hyp_len, ref_len) * sum; +} + +/* + * 'sum' bleu + * + * sum up Ngram precisions + */ +score_t +SumBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, + const unsigned /*rank*/, const unsigned /*src_len*/) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (hyp_len == 0 || ref_len == 0) return 0.; + NgramCounts counts = make_ngram_counts(hyp, ref, N_); + unsigned M = N_; + if (ref_len < N_) M = ref_len; + score_t sum = 0.; + unsigned j = 1; + for (unsigned i = 0; i < M; i++) { + if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break; + sum += ((score_t)counts.clipped_[i]/counts.sum_[i])/pow(2.0, (double) (N_-j+1)); + j++; + } + return brevity_penalty(hyp_len, ref_len) * sum; +} + +/* + * 'sum' (exp) bleu + * + * sum up exp(Ngram precisions) + */ +score_t +SumExpBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, + const unsigned /*rank*/, const unsigned 
/*src_len*/) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (hyp_len == 0 || ref_len == 0) return 0.; + NgramCounts counts = make_ngram_counts(hyp, ref, N_); + unsigned M = N_; + if (ref_len < N_) M = ref_len; + score_t sum = 0.; + unsigned j = 1; + for (unsigned i = 0; i < M; i++) { + if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break; + sum += exp(((score_t)counts.clipped_[i]/counts.sum_[i]))/pow(2.0, (double) (N_-j+1)); + j++; + } + return brevity_penalty(hyp_len, ref_len) * sum; +} + +/* + * 'sum' (whatever) bleu + * + * sum up exp(weight * log(Ngram precisions)) + */ +score_t +SumWhateverBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, + const unsigned /*rank*/, const unsigned /*src_len*/) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (hyp_len == 0 || ref_len == 0) return 0.; + NgramCounts counts = make_ngram_counts(hyp, ref, N_); + unsigned M = N_; + vector<score_t> v = w_; + if (ref_len < N_) { + M = ref_len; + for (unsigned i = 0; i < M; i++) v[i] = 1/((score_t)M); + } + score_t sum = 0.; + unsigned j = 1; + for (unsigned i = 0; i < M; i++) { + if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break; + sum += exp(v[i] * log(((score_t)counts.clipped_[i]/counts.sum_[i])))/pow(2.0, (double) (N_-j+1)); + j++; + } + return brevity_penalty(hyp_len, ref_len) * sum; +} + +/* + * approx. bleu + * + * as in "Online Large-Margin Training of Syntactic + * and Structural Translation Features" + * (Chiang et al. '08) + * + * NOTE: Needs some more code in dtrain.cc . + * No scaling by src len. 
+ */ +score_t +ApproxBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, + const unsigned rank, const unsigned src_len) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (ref_len == 0) return 0.; + score_t score = 0.; + NgramCounts counts(N_); + if (hyp_len > 0) { + counts = make_ngram_counts(hyp, ref, N_); + NgramCounts tmp = glob_onebest_counts_ + counts; + score = Bleu(tmp, hyp_len, ref_len); + } + if (rank == 0) { // 'context of 1best translations' + glob_onebest_counts_ += counts; + glob_onebest_counts_ *= discount_; + glob_hyp_len_ = discount_ * (glob_hyp_len_ + hyp_len); + glob_ref_len_ = discount_ * (glob_ref_len_ + ref_len); + glob_src_len_ = discount_ * (glob_src_len_ + src_len); + } + return score; +} + +/* + * Linear (Corpus) Bleu + * + * as in "Lattice Minimum Bayes-Risk Decoding + * for Statistical Machine Translation" + * (Tromble et al. '08) + * + */ +score_t +LinearBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, + const unsigned rank, const unsigned /*src_len*/) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (ref_len == 0) return 0.; + unsigned M = N_; + if (ref_len < N_) M = ref_len; + NgramCounts counts(M); + if (hyp_len > 0) + counts = make_ngram_counts(hyp, ref, M); + score_t ret = 0.; + for (unsigned i = 0; i < M; i++) { + if (counts.sum_[i] == 0 || onebest_counts_.sum_[i] == 0) break; + ret += counts.sum_[i]/onebest_counts_.sum_[i]; + } + ret = -(hyp_len/(score_t)onebest_len_) + (1./M) * ret; + if (rank == 0) { + onebest_len_ += hyp_len; + onebest_counts_ += counts; + } + return ret; +} + + +} // namespace + diff --git a/training/dtrain/score.h b/training/dtrain/score.h new file mode 100644 index 00000000..f317c903 --- /dev/null +++ b/training/dtrain/score.h @@ -0,0 +1,212 @@ +#ifndef _DTRAIN_SCORE_H_ +#define _DTRAIN_SCORE_H_ + +#include "kbestget.h" + +using namespace std; + +namespace dtrain +{ + + +struct NgramCounts +{ + unsigned N_; + map<unsigned, score_t> clipped_; + map<unsigned, 
score_t> sum_; + + NgramCounts(const unsigned N) : N_(N) { Zero(); } + + inline void + operator+=(const NgramCounts& rhs) + { + if (rhs.N_ > N_) Resize(rhs.N_); + for (unsigned i = 0; i < N_; i++) { + this->clipped_[i] += rhs.clipped_.find(i)->second; + this->sum_[i] += rhs.sum_.find(i)->second; + } + } + + inline const NgramCounts + operator+(const NgramCounts &other) const + { + NgramCounts result = *this; + result += other; + return result; + } + + inline void + operator*=(const score_t rhs) + { + for (unsigned i = 0; i < N_; i++) { + this->clipped_[i] *= rhs; + this->sum_[i] *= rhs; + } + } + + inline void + Add(const unsigned count, const unsigned ref_count, const unsigned i) + { + assert(i < N_); + if (count > ref_count) { + clipped_[i] += ref_count; + } else { + clipped_[i] += count; + } + sum_[i] += count; + } + + inline void + Zero() + { + for (unsigned i = 0; i < N_; i++) { + clipped_[i] = 0.; + sum_[i] = 0.; + } + } + + inline void + One() + { + for (unsigned i = 0; i < N_; i++) { + clipped_[i] = 1.; + sum_[i] = 1.; + } + } + + inline void + Print() + { + for (unsigned i = 0; i < N_; i++) { + cout << i+1 << "grams (clipped):\t" << clipped_[i] << endl; + cout << i+1 << "grams:\t\t\t" << sum_[i] << endl; + } + } + + inline void Resize(unsigned N) + { + if (N == N_) return; + else if (N > N_) { + for (unsigned i = N_; i < N; i++) { + clipped_[i] = 0.; + sum_[i] = 0.; + } + } else { // N < N_ + for (unsigned i = N_-1; i > N-1; i--) { + clipped_.erase(i); + sum_.erase(i); + } + } + N_ = N; + } +}; + +typedef map<vector<WordID>, unsigned> Ngrams; + +inline Ngrams +make_ngrams(const vector<WordID>& s, const unsigned N) +{ + Ngrams ngrams; + vector<WordID> ng; + for (size_t i = 0; i < s.size(); i++) { + ng.clear(); + for (unsigned j = i; j < min(i+N, s.size()); j++) { + ng.push_back(s[j]); + ngrams[ng]++; + } + } + return ngrams; +} + +inline NgramCounts +make_ngram_counts(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned N) +{ + Ngrams 
hyp_ngrams = make_ngrams(hyp, N); + Ngrams ref_ngrams = make_ngrams(ref, N); + NgramCounts counts(N); + Ngrams::iterator it; + Ngrams::iterator ti; + for (it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++) { + ti = ref_ngrams.find(it->first); + if (ti != ref_ngrams.end()) { + counts.Add(it->second, ti->second, it->first.size() - 1); + } else { + counts.Add(it->second, 0, it->first.size() - 1); + } + } + return counts; +} + +struct BleuScorer : public LocalScorer +{ + score_t Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len); + score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/); +}; + +struct StupidBleuScorer : public LocalScorer +{ + score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/); +}; + +struct SmoothBleuScorer : public LocalScorer +{ + score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/); +}; + +struct SumBleuScorer : public LocalScorer +{ + score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/); +}; + +struct SumExpBleuScorer : public LocalScorer +{ + score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/); +}; + +struct SumWhateverBleuScorer : public LocalScorer +{ + score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/); +}; + +struct ApproxBleuScorer : public BleuScorer +{ + NgramCounts glob_onebest_counts_; + unsigned glob_hyp_len_, glob_ref_len_, glob_src_len_; + score_t discount_; + + ApproxBleuScorer(unsigned N, score_t d) : glob_onebest_counts_(NgramCounts(N)), discount_(d) + { + glob_hyp_len_ = glob_ref_len_ = glob_src_len_ = 0; + } + + inline void Reset() { + glob_onebest_counts_.Zero(); + glob_hyp_len_ = glob_ref_len_ = glob_src_len_ = 0.; + } + + score_t Score(vector<WordID>& hyp, 
vector<WordID>& ref, const unsigned rank, const unsigned src_len); +}; + +struct LinearBleuScorer : public BleuScorer +{ + unsigned onebest_len_; + NgramCounts onebest_counts_; + + LinearBleuScorer(unsigned N) : onebest_len_(1), onebest_counts_(N) + { + onebest_counts_.One(); + } + + score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned /*src_len*/); + + inline void Reset() { + onebest_len_ = 1; + onebest_counts_.One(); + } +}; + + +} // namespace + +#endif + diff --git a/training/dtrain/test/example/README b/training/dtrain/test/example/README new file mode 100644 index 00000000..6937b11b --- /dev/null +++ b/training/dtrain/test/example/README @@ -0,0 +1,8 @@ +Small example of input format for distributed training. +Call dtrain from cdec/dtrain/ with ./dtrain -c test/example/dtrain.ini . + +For this to work, undef 'DTRAIN_LOCAL' in dtrain.h +and recompile. + +Data is here: http://simianer.de/#dtrain + diff --git a/training/dtrain/test/example/cdec.ini b/training/dtrain/test/example/cdec.ini new file mode 100644 index 00000000..d5955f0e --- /dev/null +++ b/training/dtrain/test/example/cdec.ini @@ -0,0 +1,25 @@ +formalism=scfg +add_pass_through_rules=true +scfg_max_span_limit=15 +intersection_strategy=cube_pruning +cubepruning_pop_limit=30 +feature_function=WordPenalty +feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz +# all currently working feature functions for translation: +# (with those features active that were used in the ACL paper) +#feature_function=ArityPenalty +#feature_function=CMR2008ReorderingFeatures +#feature_function=Dwarf +#feature_function=InputIndicator +#feature_function=LexNullJump +#feature_function=NewJump +#feature_function=NgramFeatures +#feature_function=NonLatinCount +#feature_function=OutputIndicator +feature_function=RuleIdentityFeatures +feature_function=RuleSourceBigramFeatures +feature_function=RuleTargetBigramFeatures +feature_function=RuleShape 
+#feature_function=SourceSpanSizeFeatures +#feature_function=SourceWordPenalty +#feature_function=SpanFeatures diff --git a/training/dtrain/test/example/dtrain.ini b/training/dtrain/test/example/dtrain.ini new file mode 100644 index 00000000..72d50ca1 --- /dev/null +++ b/training/dtrain/test/example/dtrain.ini @@ -0,0 +1,22 @@ +input=test/example/nc-wmt11.1k.gz # use '-' for STDIN +output=- # a weights file (add .gz for gzip compression) or STDOUT '-' +select_weights=VOID # don't output weights +decoder_config=test/example/cdec.ini # config for cdec +# weights for these features will be printed on each iteration +print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough +tmp=/tmp +stop_after=10 # stop epoch after 10 inputs + +# interesting stuff +epochs=2 # run over input 2 times +k=100 # use 100best lists +N=4 # optimize (approx) BLEU4 +scorer=stupid_bleu # use 'stupid' BLEU+1 +learning_rate=1.0 # learning rate, don't care if gamma=0 (perceptron) +gamma=0 # use SVM reg +sample_from=kbest # use kbest lists (as opposed to forest) +filter=uniq # only unique entries in kbest (surface form) +pair_sampling=XYX +hi_lo=0.1 # 10 vs 80 vs 10 and 80 vs 10 here +pair_threshold=0 # minimum distance in BLEU (this will still only use pairs with diff > 0) +loss_margin=0 diff --git a/training/dtrain/test/example/expected-output b/training/dtrain/test/example/expected-output new file mode 100644 index 00000000..05326763 --- /dev/null +++ b/training/dtrain/test/example/expected-output @@ -0,0 +1,89 @@ + cdec cfg 'test/example/cdec.ini' +Loading the LM will be faster if you build a binary file. 
+Reading test/example/nc-wmt11.en.srilm.gz +----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 +**************************************************************************************************** + Example feature: Shape_S00000_T00000 +Seeding random number sequence to 2912000813 + +dtrain +Parameters: + k 100 + N 4 + T 2 + scorer 'stupid_bleu' + sample from 'kbest' + filter 'uniq' + learning rate 1 + gamma 0 + loss margin 0 + pairs 'XYX' + hi lo 0.1 + pair threshold 0 + select weights 'VOID' + l1 reg 0 'none' + max pairs 4294967295 + cdec cfg 'test/example/cdec.ini' + input 'test/example/nc-wmt11.1k.gz' + output '-' + stop_after 10 +(a dot represents 10 inputs) +Iteration #1 of 2. + . 10 +Stopping after 10 input sentences. +WEIGHTS + Glue = -637 + WordPenalty = +1064 + LanguageModel = +1175.3 + LanguageModel_OOV = -1437 + PhraseModel_0 = +1935.6 + PhraseModel_1 = +2499.3 + PhraseModel_2 = +964.96 + PhraseModel_3 = +1410.8 + PhraseModel_4 = -5977.9 + PhraseModel_5 = +522 + PhraseModel_6 = +1089 + PassThrough = -1308 + --- + 1best avg score: 0.16963 (+0.16963) + 1best avg model score: 64485 (+64485) + avg # pairs: 1494.4 + avg # rank err: 702.6 + avg # margin viol: 0 + non0 feature count: 528 + avg list sz: 85.7 + avg f count: 102.75 +(time 0.083 min, 0.5 s/S) + +Iteration #2 of 2. + . 10 +WEIGHTS + Glue = -1196 + WordPenalty = +809.52 + LanguageModel = +3112.1 + LanguageModel_OOV = -1464 + PhraseModel_0 = +3895.5 + PhraseModel_1 = +4683.4 + PhraseModel_2 = +1092.8 + PhraseModel_3 = +1079.6 + PhraseModel_4 = -6827.7 + PhraseModel_5 = -888 + PhraseModel_6 = +142 + PassThrough = -1335 + --- + 1best avg score: 0.277 (+0.10736) + 1best avg model score: -3110.5 (-67595) + avg # pairs: 1144.2 + avg # rank err: 529.1 + avg # margin viol: 0 + non0 feature count: 859 + avg list sz: 74.9 + avg f count: 112.84 +(time 0.067 min, 0.4 s/S) + +Writing weights file to '-' ... 
+done + +--- +Best iteration: 2 [SCORE 'stupid_bleu'=0.277]. +This took 0.15 min. diff --git a/training/dtrain/test/parallelize/cdec.ini b/training/dtrain/test/parallelize/cdec.ini new file mode 100644 index 00000000..72e99dc5 --- /dev/null +++ b/training/dtrain/test/parallelize/cdec.ini @@ -0,0 +1,22 @@ +formalism=scfg +add_pass_through_rules=true +intersection_strategy=cube_pruning +cubepruning_pop_limit=200 +scfg_max_span_limit=15 +feature_function=WordPenalty +feature_function=KLanguageModel /stor/dat/wmt12/en/news_only/m/wmt12.news.en.3.kenv5 +#feature_function=ArityPenalty +#feature_function=CMR2008ReorderingFeatures +#feature_function=Dwarf +#feature_function=InputIndicator +#feature_function=LexNullJump +#feature_function=NewJump +#feature_function=NgramFeatures +#feature_function=NonLatinCount +#feature_function=OutputIndicator +#feature_function=RuleIdentityFeatures +#feature_function=RuleNgramFeatures +#feature_function=RuleShape +#feature_function=SourceSpanSizeFeatures +#feature_function=SourceWordPenalty +#feature_function=SpanFeatures diff --git a/training/dtrain/test/parallelize/dtrain.ini b/training/dtrain/test/parallelize/dtrain.ini new file mode 100644 index 00000000..03f9d240 --- /dev/null +++ b/training/dtrain/test/parallelize/dtrain.ini @@ -0,0 +1,15 @@ +k=100 +N=4 +learning_rate=0.0001 +gamma=0 +loss_margin=0 +epochs=1 +scorer=stupid_bleu +sample_from=kbest +filter=uniq +pair_sampling=XYX +hi_lo=0.1 +select_weights=last +print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough +tmp=/tmp +decoder_config=cdec.ini diff --git a/training/dtrain/test/parallelize/in b/training/dtrain/test/parallelize/in new file mode 100644 index 00000000..a312809f --- /dev/null +++ b/training/dtrain/test/parallelize/in @@ -0,0 +1,10 @@ +<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.0.gz" id="0">barack obama erhält als vierter us @-@ 
präsident den frieden nobelpreis</seg> +<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.1.gz" id="1">der amerikanische präsident barack obama kommt für 26 stunden nach oslo , norwegen , um hier als vierter us @-@ präsident in der geschichte den frieden nobelpreis entgegen zunehmen .</seg> +<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.2.gz" id="2">darüber hinaus erhält er das diplom sowie die medaille und einen scheck über 1,4 mio. dollar für seine außer gewöhnlichen bestrebungen um die intensivierung der welt diplomatie und zusammen arbeit unter den völkern .</seg> +<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.3.gz" id="3">der chef des weißen hauses kommt morgen zusammen mit seiner frau michelle in der nordwegischen metropole an und wird die ganze zeit beschäftigt sein .</seg> +<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.4.gz" id="4">zunächst stattet er dem nobel @-@ institut einen besuch ab , wo er überhaupt zum ersten mal mit den fünf ausschuss mitglieder zusammen trifft , die ihn im oktober aus 172 leuten und 33 organisationen gewählt haben .</seg> +<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.5.gz" id="5">das präsidenten paar hat danach ein treffen mit dem norwegischen könig harald v. 
und königin sonja eingeplant .</seg> +<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.6.gz" id="6">nachmittags erreicht dann der besuch seinen höhepunkt mit der zeremonie , bei der obama den prestige preis übernimmt .</seg> +<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.7.gz" id="7">diesen erhält er als der vierte us @-@ präsident , aber erst als der dritte , der den preis direkt im amt entgegen nimmt .</seg> +<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.8.gz" id="8">das weiße haus avisierte schon , dass obama bei der übernahme des preises über den afghanistan krieg sprechen wird .</seg> +<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.9.gz" id="9">der präsident will diesem thema nicht ausweichen , weil er weiß , dass er den preis als ein präsident übernimmt , der zur zeit krieg in zwei ländern führt .</seg> diff --git a/training/dtrain/test/parallelize/refs b/training/dtrain/test/parallelize/refs new file mode 100644 index 00000000..4d3128cb --- /dev/null +++ b/training/dtrain/test/parallelize/refs @@ -0,0 +1,10 @@ +barack obama becomes the fourth american president to receive the nobel peace prize +the american president barack obama will fly into oslo , norway for 26 hours to receive the nobel peace prize , the fourth american president in history to do so . +he will receive a diploma , medal and cheque for 1.4 million dollars for his exceptional efforts to improve global diplomacy and encourage international cooperation , amongst other things . +the head of the white house will be flying into the norwegian city in the morning with his wife michelle and will have a busy schedule . +first , he will visit the nobel institute , where he will have his first meeting with the five committee members who selected him from 172 people and 33 organisations . +the presidential couple then has a meeting scheduled with king harald v and queen sonja of norway . 
+then , in the afternoon , the visit will culminate in a grand ceremony , at which obama will receive the prestigious award . +he will be the fourth american president to be awarded the prize , and only the third to have received it while actually in office . +the white house has stated that , when he accepts the prize , obama will speak about the war in afghanistan . +the president does not want to skirt around this topic , as he realises that he is accepting the prize as a president whose country is currently at war in two countries . diff --git a/training/dtrain/test/parallelize/test/cdec.ini b/training/dtrain/test/parallelize/test/cdec.ini new file mode 100644 index 00000000..72e99dc5 --- /dev/null +++ b/training/dtrain/test/parallelize/test/cdec.ini @@ -0,0 +1,22 @@ +formalism=scfg +add_pass_through_rules=true +intersection_strategy=cube_pruning +cubepruning_pop_limit=200 +scfg_max_span_limit=15 +feature_function=WordPenalty +feature_function=KLanguageModel /stor/dat/wmt12/en/news_only/m/wmt12.news.en.3.kenv5 +#feature_function=ArityPenalty +#feature_function=CMR2008ReorderingFeatures +#feature_function=Dwarf +#feature_function=InputIndicator +#feature_function=LexNullJump +#feature_function=NewJump +#feature_function=NgramFeatures +#feature_function=NonLatinCount +#feature_function=OutputIndicator +#feature_function=RuleIdentityFeatures +#feature_function=RuleNgramFeatures +#feature_function=RuleShape +#feature_function=SourceSpanSizeFeatures +#feature_function=SourceWordPenalty +#feature_function=SpanFeatures diff --git a/training/dtrain/test/parallelize/test/dtrain.ini b/training/dtrain/test/parallelize/test/dtrain.ini new file mode 100644 index 00000000..03f9d240 --- /dev/null +++ b/training/dtrain/test/parallelize/test/dtrain.ini @@ -0,0 +1,15 @@ +k=100 +N=4 +learning_rate=0.0001 +gamma=0 +loss_margin=0 +epochs=1 +scorer=stupid_bleu +sample_from=kbest +filter=uniq +pair_sampling=XYX +hi_lo=0.1 +select_weights=last +print_weights=Glue WordPenalty 
LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough +tmp=/tmp +decoder_config=cdec.ini diff --git a/training/dtrain/test/parallelize/test/in b/training/dtrain/test/parallelize/test/in new file mode 100644 index 00000000..a312809f --- /dev/null +++ b/training/dtrain/test/parallelize/test/in @@ -0,0 +1,10 @@ +<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.0.gz" id="0">barack obama erhält als vierter us @-@ präsident den frieden nobelpreis</seg> +<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.1.gz" id="1">der amerikanische präsident barack obama kommt für 26 stunden nach oslo , norwegen , um hier als vierter us @-@ präsident in der geschichte den frieden nobelpreis entgegen zunehmen .</seg> +<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.2.gz" id="2">darüber hinaus erhält er das diplom sowie die medaille und einen scheck über 1,4 mio. dollar für seine außer gewöhnlichen bestrebungen um die intensivierung der welt diplomatie und zusammen arbeit unter den völkern .</seg> +<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.3.gz" id="3">der chef des weißen hauses kommt morgen zusammen mit seiner frau michelle in der nordwegischen metropole an und wird die ganze zeit beschäftigt sein .</seg> +<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.4.gz" id="4">zunächst stattet er dem nobel @-@ institut einen besuch ab , wo er überhaupt zum ersten mal mit den fünf ausschuss mitglieder zusammen trifft , die ihn im oktober aus 172 leuten und 33 organisationen gewählt haben .</seg> +<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.5.gz" id="5">das präsidenten paar hat danach ein treffen mit dem norwegischen könig harald v. 
und königin sonja eingeplant .</seg> +<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.6.gz" id="6">nachmittags erreicht dann der besuch seinen höhepunkt mit der zeremonie , bei der obama den prestige preis übernimmt .</seg> +<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.7.gz" id="7">diesen erhält er als der vierte us @-@ präsident , aber erst als der dritte , der den preis direkt im amt entgegen nimmt .</seg> +<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.8.gz" id="8">das weiße haus avisierte schon , dass obama bei der übernahme des preises über den afghanistan krieg sprechen wird .</seg> +<seg grammar="/stor/dat/wmt12/dev/newstest2010/g/grammar.out.9.gz" id="9">der präsident will diesem thema nicht ausweichen , weil er weiß , dass er den preis als ein präsident übernimmt , der zur zeit krieg in zwei ländern führt .</seg> diff --git a/training/dtrain/test/parallelize/test/refs b/training/dtrain/test/parallelize/test/refs new file mode 100644 index 00000000..4d3128cb --- /dev/null +++ b/training/dtrain/test/parallelize/test/refs @@ -0,0 +1,10 @@ +barack obama becomes the fourth american president to receive the nobel peace prize +the american president barack obama will fly into oslo , norway for 26 hours to receive the nobel peace prize , the fourth american president in history to do so . +he will receive a diploma , medal and cheque for 1.4 million dollars for his exceptional efforts to improve global diplomacy and encourage international cooperation , amongst other things . +the head of the white house will be flying into the norwegian city in the morning with his wife michelle and will have a busy schedule . +first , he will visit the nobel institute , where he will have his first meeting with the five committee members who selected him from 172 people and 33 organisations . +the presidential couple then has a meeting scheduled with king harald v and queen sonja of norway . 
+then , in the afternoon , the visit will culminate in a grand ceremony , at which obama will receive the prestigious award . +he will be the fourth american president to be awarded the prize , and only the third to have received it while actually in office . +the white house has stated that , when he accepts the prize , obama will speak about the war in afghanistan . +the president does not want to skirt around this topic , as he realises that he is accepting the prize as a president whose country is currently at war in two countries . diff --git a/training/dtrain/test/toy/cdec.ini b/training/dtrain/test/toy/cdec.ini new file mode 100644 index 00000000..98b02d44 --- /dev/null +++ b/training/dtrain/test/toy/cdec.ini @@ -0,0 +1,2 @@ +formalism=scfg +add_pass_through_rules=true diff --git a/training/dtrain/test/toy/dtrain.ini b/training/dtrain/test/toy/dtrain.ini new file mode 100644 index 00000000..a091732f --- /dev/null +++ b/training/dtrain/test/toy/dtrain.ini @@ -0,0 +1,12 @@ +decoder_config=test/toy/cdec.ini +input=test/toy/input +output=- +print_weights=logp shell_rule house_rule small_rule little_rule PassThrough +k=4 +N=4 +epochs=2 +scorer=bleu +sample_from=kbest +filter=uniq +pair_sampling=all +learning_rate=1 diff --git a/training/dtrain/test/toy/input b/training/dtrain/test/toy/input new file mode 100644 index 00000000..4d10a9ea --- /dev/null +++ b/training/dtrain/test/toy/input @@ -0,0 +1,2 @@ +0 ich sah ein kleines haus i saw a little house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 [NP] ||| ich ||| i ||| logp=0 [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 house_rule=1 [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 shell_rule=1 [JJ] ||| kleines ||| small ||| logp=0 small_rule=1 [JJ] ||| kleines ||| little ||| logp=0 little_rule=1 [JJ] ||| grosses ||| big ||| logp=0 [JJ] ||| grosses ||| large ||| logp=0 [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 [V] ||| sah ||| saw ||| logp=0 [V] ||| fand ||| found ||| logp=0 
+1 ich fand ein kleines haus i found a little house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 [NP] ||| ich ||| i ||| logp=0 [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 house_rule=1 [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 shell_rule=1 [JJ] ||| kleines ||| small ||| logp=0 small_rule=1 [JJ] ||| kleines ||| little ||| logp=0 little_rule=1 [JJ] ||| grosses ||| big ||| logp=0 [JJ] ||| grosses ||| large ||| logp=0 [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 [V] ||| sah ||| saw ||| logp=0 [V] ||| fand ||| found ||| logp=0 diff --git a/training/fast_align.cc b/training/fast_align.cc deleted file mode 100644 index 7492d26f..00000000 --- a/training/fast_align.cc +++ /dev/null @@ -1,281 +0,0 @@ -#include <iostream> -#include <cmath> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "m.h" -#include "corpus_tools.h" -#include "stringlib.h" -#include "filelib.h" -#include "ttables.h" -#include "tdict.h" - -namespace po = boost::program_options; -using namespace std; - -bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("input,i",po::value<string>(),"Parallel corpus input file") - ("reverse,r","Reverse estimation (swap source and target during training)") - ("iterations,I",po::value<unsigned>()->default_value(5),"Number of iterations of EM training") - //("bidir,b", "Run bidirectional alignment") - ("favor_diagonal,d", "Use a static alignment distribution that assigns higher probabilities to alignments near the diagonal") - ("prob_align_null", po::value<double>()->default_value(0.08), "When --favor_diagonal is set, what's the probability of a null alignment?") - ("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (<1 = flat >1 = sharp)") - ("variational_bayes,v","Infer VB estimate of parameters 
under a symmetric Dirichlet prior") - ("alpha,a", po::value<double>()->default_value(0.01), "Hyperparameter for optional Dirichlet prior") - ("no_null_word,N","Do not generate from a null token") - ("output_parameters,p", "Write model parameters instead of alignments") - ("beam_threshold,t",po::value<double>()->default_value(-4),"When writing parameters, log_10 of beam threshold for writing parameter (-10000 to include everything, 0 max parameter only)") - ("hide_training_alignments,H", "Hide training alignments (only useful if you want to use -x option and just compute testset statistics)") - ("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model") - ("no_add_viterbi,V","When writing model parameters, do not add Viterbi alignment points (may generate a grammar where some training sentence pairs are unreachable)"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || conf->count("input") == 0) { - cerr << "Usage " << argv[0] << " [OPTIONS] -i corpus.fr-en\n"; - cerr << dcmdline_options << endl; - return false; - } - return true; -} - -int main(int argc, char** argv) { - po::variables_map conf; - if (!InitCommandLine(argc, argv, &conf)) return 1; - const string fname = conf["input"].as<string>(); - const bool reverse = conf.count("reverse") > 0; - const int ITERATIONS = conf["iterations"].as<unsigned>(); - const double BEAM_THRESHOLD = pow(10.0, 
conf["beam_threshold"].as<double>()); - const bool use_null = (conf.count("no_null_word") == 0); - const WordID kNULL = TD::Convert("<eps>"); - const bool add_viterbi = (conf.count("no_add_viterbi") == 0); - const bool variational_bayes = (conf.count("variational_bayes") > 0); - const bool write_alignments = (conf.count("output_parameters") == 0); - const double diagonal_tension = conf["diagonal_tension"].as<double>(); - const double prob_align_null = conf["prob_align_null"].as<double>(); - const bool hide_training_alignments = (conf.count("hide_training_alignments") > 0); - string testset; - if (conf.count("testset")) testset = conf["testset"].as<string>(); - const double prob_align_not_null = 1.0 - prob_align_null; - const double alpha = conf["alpha"].as<double>(); - const bool favor_diagonal = conf.count("favor_diagonal"); - if (variational_bayes && alpha <= 0.0) { - cerr << "--alpha must be > 0\n"; - return 1; - } - - TTable s2t, t2s; - TTable::Word2Word2Double s2t_viterbi; - double tot_len_ratio = 0; - double mean_srclen_multiplier = 0; - vector<double> unnormed_a_i; - for (int iter = 0; iter < ITERATIONS; ++iter) { - const bool final_iteration = (iter == (ITERATIONS - 1)); - cerr << "ITERATION " << (iter + 1) << (final_iteration ? 
" (FINAL)" : "") << endl; - ReadFile rf(fname); - istream& in = *rf.stream(); - double likelihood = 0; - double denom = 0.0; - int lc = 0; - bool flag = false; - string line; - string ssrc, strg; - vector<WordID> src, trg; - while(true) { - getline(in, line); - if (!in) break; - ++lc; - if (lc % 1000 == 0) { cerr << '.'; flag = true; } - if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; } - src.clear(); trg.clear(); - CorpusTools::ReadLine(line, &src, &trg); - if (reverse) swap(src, trg); - if (src.size() == 0 || trg.size() == 0) { - cerr << "Error: " << lc << "\n" << line << endl; - return 1; - } - if (src.size() > unnormed_a_i.size()) - unnormed_a_i.resize(src.size()); - if (iter == 0) - tot_len_ratio += static_cast<double>(trg.size()) / static_cast<double>(src.size()); - denom += trg.size(); - vector<double> probs(src.size() + 1); - bool first_al = true; // used for write_alignments - for (int j = 0; j < trg.size(); ++j) { - const WordID& f_j = trg[j]; - double sum = 0; - const double j_over_ts = double(j) / trg.size(); - double prob_a_i = 1.0 / (src.size() + use_null); // uniform (model 1) - if (use_null) { - if (favor_diagonal) prob_a_i = prob_align_null; - probs[0] = s2t.prob(kNULL, f_j) * prob_a_i; - sum += probs[0]; - } - double az = 0; - if (favor_diagonal) { - for (int ta = 0; ta < src.size(); ++ta) { - unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension); - az += unnormed_a_i[ta]; - } - az /= prob_align_not_null; - } - for (int i = 1; i <= src.size(); ++i) { - if (favor_diagonal) - prob_a_i = unnormed_a_i[i-1] / az; - probs[i] = s2t.prob(src[i-1], f_j) * prob_a_i; - sum += probs[i]; - } - if (final_iteration) { - if (add_viterbi || write_alignments) { - WordID max_i = 0; - double max_p = -1; - int max_index = -1; - if (use_null) { - max_i = kNULL; - max_index = 0; - max_p = probs[0]; - } - for (int i = 1; i <= src.size(); ++i) { - if (probs[i] > max_p) { - max_index = i; - max_p = probs[i]; - max_i 
= src[i-1]; - } - } - if (!hide_training_alignments && write_alignments) { - if (max_index > 0) { - if (first_al) first_al = false; else cout << ' '; - if (reverse) - cout << j << '-' << (max_index - 1); - else - cout << (max_index - 1) << '-' << j; - } - } - s2t_viterbi[max_i][f_j] = 1.0; - } - } else { - if (use_null) - s2t.Increment(kNULL, f_j, probs[0] / sum); - for (int i = 1; i <= src.size(); ++i) - s2t.Increment(src[i-1], f_j, probs[i] / sum); - } - likelihood += log(sum); - } - if (write_alignments && final_iteration && !hide_training_alignments) cout << endl; - } - - // log(e) = 1.0 - double base2_likelihood = likelihood / log(2); - - if (flag) { cerr << endl; } - if (iter == 0) { - mean_srclen_multiplier = tot_len_ratio / lc; - cerr << "expected target length = source length * " << mean_srclen_multiplier << endl; - } - cerr << " log_e likelihood: " << likelihood << endl; - cerr << " log_2 likelihood: " << base2_likelihood << endl; - cerr << " cross entropy: " << (-base2_likelihood / denom) << endl; - cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl; - if (!final_iteration) { - if (variational_bayes) - s2t.NormalizeVB(alpha); - else - s2t.Normalize(); - } - } - if (testset.size()) { - ReadFile rf(testset); - istream& in = *rf.stream(); - int lc = 0; - double tlp = 0; - string line; - while (getline(in, line)) { - ++lc; - vector<WordID> src, trg; - CorpusTools::ReadLine(line, &src, &trg); - cout << TD::GetString(src) << " ||| " << TD::GetString(trg) << " |||"; - if (reverse) swap(src, trg); - double log_prob = Md::log_poisson(trg.size(), 0.05 + src.size() * mean_srclen_multiplier); - if (src.size() > unnormed_a_i.size()) - unnormed_a_i.resize(src.size()); - - // compute likelihood - for (int j = 0; j < trg.size(); ++j) { - const WordID& f_j = trg[j]; - double sum = 0; - int a_j = 0; - double max_pat = 0; - const double j_over_ts = double(j) / trg.size(); - double prob_a_i = 1.0 / (src.size() + use_null); // uniform (model 1) - if 
(use_null) { - if (favor_diagonal) prob_a_i = prob_align_null; - max_pat = s2t.prob(kNULL, f_j) * prob_a_i; - sum += max_pat; - } - double az = 0; - if (favor_diagonal) { - for (int ta = 0; ta < src.size(); ++ta) { - unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension); - az += unnormed_a_i[ta]; - } - az /= prob_align_not_null; - } - for (int i = 1; i <= src.size(); ++i) { - if (favor_diagonal) - prob_a_i = unnormed_a_i[i-1] / az; - double pat = s2t.prob(src[i-1], f_j) * prob_a_i; - if (pat > max_pat) { max_pat = pat; a_j = i; } - sum += pat; - } - log_prob += log(sum); - if (write_alignments) { - if (a_j > 0) { - cout << ' '; - if (reverse) - cout << j << '-' << (a_j - 1); - else - cout << (a_j - 1) << '-' << j; - } - } - } - tlp += log_prob; - cout << " ||| " << log_prob << endl << flush; - } // loop over test set sentences - cerr << "TOTAL LOG PROB " << tlp << endl; - } - - if (write_alignments) return 0; - - for (TTable::Word2Word2Double::iterator ei = s2t.ttable.begin(); ei != s2t.ttable.end(); ++ei) { - const TTable::Word2Double& cpd = ei->second; - const TTable::Word2Double& vit = s2t_viterbi[ei->first]; - const string& esym = TD::Convert(ei->first); - double max_p = -1; - for (TTable::Word2Double::const_iterator fi = cpd.begin(); fi != cpd.end(); ++fi) - if (fi->second > max_p) max_p = fi->second; - const double threshold = max_p * BEAM_THRESHOLD; - for (TTable::Word2Double::const_iterator fi = cpd.begin(); fi != cpd.end(); ++fi) { - if (fi->second > threshold || (vit.find(fi->first) != vit.end())) { - cout << esym << ' ' << TD::Convert(fi->first) << ' ' << log(fi->second) << endl; - } - } - } - return 0; -} - diff --git a/training/feature_expectations.cc b/training/feature_expectations.cc deleted file mode 100644 index f1a85495..00000000 --- a/training/feature_expectations.cc +++ /dev/null @@ -1,232 +0,0 @@ -#include <sstream> -#include <iostream> -#include <fstream> -#include <vector> -#include <cassert> -#include <cmath> 
-#include <tr1/memory> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "verbose.h" -#include "hg.h" -#include "prob.h" -#include "inside_outside.h" -#include "ff_register.h" -#include "decoder.h" -#include "filelib.h" -#include "online_optimizer.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" -#include "sampler.h" - -#ifdef HAVE_MPI -#include <boost/mpi/timer.hpp> -#include <boost/mpi.hpp> -namespace mpi = boost::mpi; -#endif - -using namespace std; -namespace po = boost::program_options; - -struct FComp { - const vector<double>& w_; - FComp(const vector<double>& w) : w_(w) {} - bool operator()(int a, int b) const { - return fabs(w_[a]) > fabs(w_[b]); - } -}; - -void ShowFeatures(const vector<double>& w) { - vector<int> fnums(w.size()); - for (int i = 0; i < w.size(); ++i) - fnums[i] = i; - sort(fnums.begin(), fnums.end(), FComp(w)); - for (vector<int>::iterator i = fnums.begin(); i != fnums.end(); ++i) { - if (w[*i]) cout << FD::Convert(*i) << ' ' << w[*i] << endl; - } -} - -void ReadConfig(const string& ini, vector<string>* out) { - ReadFile rf(ini); - istream& in = *rf.stream(); - while(in) { - string line; - getline(in, line); - if (!in) continue; - out->push_back(line); - } -} - -void StoreConfig(const vector<string>& cfg, istringstream* o) { - ostringstream os; - for (int i = 0; i < cfg.size(); ++i) { os << cfg[i] << endl; } - o->str(os.str()); -} - -bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("input,i",po::value<string>(),"Corpus of source language sentences") - ("weights,w",po::value<string>(),"Input feature weights file") - ("decoder_config,c",po::value<string>(), "cdec.ini file"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - 
po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || !conf->count("input") || !conf->count("decoder_config")) { - cerr << dcmdline_options << endl; - return false; - } - return true; -} - -void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* order) { - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - int id = 0; - while(in) { - getline(in, line); - if (!in) break; - if (id % size == rank) { - c->push_back(line); - order->push_back(id); - } - ++id; - } -} - -static const double kMINUS_EPSILON = -1e-6; - -struct TrainingObserver : public DecoderObserver { - void Reset() { - acc_exp.clear(); - total_complete = 0; - } - - virtual void NotifyDecodingStart(const SentenceMetadata& smeta) { - cur_model_exp.clear(); - state = 1; - } - - // compute model expectations, denominator of objective - virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { - assert(state == 1); - state = 2; - const prob_t z = InsideOutside<prob_t, - EdgeProb, - SparseVector<prob_t>, - EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp); - cur_model_exp /= z; - acc_exp += cur_model_exp; - } - - virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) { - cerr << "IGNORING ALIGNMENT FOREST!\n"; - } - - virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) { - if (state == 2) { - ++total_complete; - } - } - - void GetExpectations(SparseVector<double>* g) const { - g->clear(); - for (SparseVector<prob_t>::const_iterator it = acc_exp.begin(); it != acc_exp.end(); ++it) - g->set_value(it->first, it->second); - } - - int 
total_complete; - SparseVector<prob_t> cur_model_exp; - SparseVector<prob_t> acc_exp; - int state; -}; - -#ifdef HAVE_MPI -namespace boost { namespace mpi { - template<> - struct is_commutative<std::plus<SparseVector<double> >, SparseVector<double> > - : mpl::true_ { }; -} } // end namespace boost::mpi -#endif - -int main(int argc, char** argv) { -#ifdef HAVE_MPI - mpi::environment env(argc, argv); - mpi::communicator world; - const int size = world.size(); - const int rank = world.rank(); -#else - const int size = 1; - const int rank = 0; -#endif - if (size > 1) SetSilent(true); // turn off verbose decoder output - register_feature_functions(); - - po::variables_map conf; - if (!InitCommandLine(argc, argv, &conf)) - return 1; - - // load initial weights - Weights weights; - if (conf.count("weights")) - weights.InitFromFile(conf["weights"].as<string>()); - - vector<string> corpus; - vector<int> ids; - ReadTrainingCorpus(conf["input"].as<string>(), rank, size, &corpus, &ids); - assert(corpus.size() > 0); - - vector<string> cdec_ini; - ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini); - istringstream ini; - StoreConfig(cdec_ini, &ini); - Decoder decoder(&ini); - if (decoder.GetConf()["input"].as<string>() != "-") { - cerr << "cdec.ini must not set an input file\n"; - return 1; - } - - SparseVector<double> x; - weights.InitSparseVector(&x); - TrainingObserver observer; - - weights.InitFromVector(x); - vector<double> lambdas; - weights.InitVector(&lambdas); - decoder.SetWeights(lambdas); - observer.Reset(); - for (unsigned i = 0; i < corpus.size(); ++i) { - int id = ids[i]; - decoder.SetId(id); - decoder.Decode(corpus[i], &observer); - } - SparseVector<double> local_exps, exps; - observer.GetExpectations(&local_exps); -#ifdef HAVE_MPI - reduce(world, local_exps, exps, std::plus<SparseVector<double> >(), 0); -#else - exps.swap(local_exps); -#endif - - weights.InitFromVector(exps); - weights.InitVector(&lambdas); - ShowFeatures(lambdas); - - return 0; -} diff 
--git a/training/lbl_model.cc b/training/lbl_model.cc deleted file mode 100644 index a46ce33c..00000000 --- a/training/lbl_model.cc +++ /dev/null @@ -1,421 +0,0 @@ -#include <iostream> - -#include "config.h" -#ifndef HAVE_EIGEN - int main() { std::cerr << "Please rebuild with --with-eigen PATH\n"; return 1; } -#else - -#include <cstdlib> -#include <algorithm> -#include <cmath> -#include <set> -#include <cstring> // memset -#include <ctime> - -#ifdef HAVE_MPI -#include <boost/mpi/timer.hpp> -#include <boost/mpi.hpp> -#include <boost/archive/text_oarchive.hpp> -namespace mpi = boost::mpi; -#endif -#include <boost/math/special_functions/fpclassify.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> -#include <Eigen/Dense> - -#include "corpus_tools.h" -#include "optimize.h" -#include "array2d.h" -#include "m.h" -#include "lattice.h" -#include "stringlib.h" -#include "filelib.h" -#include "tdict.h" - -namespace po = boost::program_options; -using namespace std; - -#define kDIMENSIONS 10 -typedef Eigen::Matrix<double, kDIMENSIONS, 1> RVector; -typedef Eigen::Matrix<double, 1, kDIMENSIONS> RTVector; -typedef Eigen::Matrix<double, kDIMENSIONS, kDIMENSIONS> TMatrix; -vector<RVector> r_src, r_trg; - -#if HAVE_MPI -namespace boost { -namespace serialization { - -template<class Archive> -void serialize(Archive & ar, RVector & v, const unsigned int version) { - for (unsigned i = 0; i < kDIMENSIONS; ++i) - ar & v[i]; -} - -} // namespace serialization -} // namespace boost -#endif - -bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("input,i",po::value<string>(),"Input file") - ("iterations,I",po::value<unsigned>()->default_value(1000),"Number of iterations of training") - ("regularization_strength,C",po::value<double>()->default_value(0.1),"L2 regularization strength (0 for no regularization)") - ("eta", 
po::value<double>()->default_value(0.1f), "Eta for SGD") - ("source_embeddings,f", po::value<string>(), "File containing source embeddings (if unset, random vectors will be used)") - ("target_embeddings,e", po::value<string>(), "File containing target embeddings (if unset, random vectors will be used)") - ("random_seed,s", po::value<unsigned>(), "Random seed") - ("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (0 = uniform, >0 sharpens)") - ("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (argc < 2 || conf->count("help")) { - cerr << "Usage " << argv[0] << " [OPTIONS] -i corpus.fr-en\n"; - cerr << dcmdline_options << endl; - return false; - } - return true; -} - -void Normalize(RVector* v) { - double norm = v->norm(); - assert(norm > 0.0f); - *v /= norm; -} - -void Flatten(const TMatrix& m, vector<double>* v) { - unsigned c = 0; - v->resize(kDIMENSIONS * kDIMENSIONS); - for (unsigned i = 0; i < kDIMENSIONS; ++i) - for (unsigned j = 0; j < kDIMENSIONS; ++j) { - assert(boost::math::isfinite(m(i, j))); - (*v)[c++] = m(i,j); - } -} - -void Unflatten(const vector<double>& v, TMatrix* m) { - unsigned c = 0; - for (unsigned i = 0; i < kDIMENSIONS; ++i) - for (unsigned j = 0; j < kDIMENSIONS; ++j) { - 
assert(boost::math::isfinite(v[c])); - (*m)(i, j) = v[c++]; - } -} - -double ApplyRegularization(const double C, - const vector<double>& weights, - vector<double>* g) { - assert(weights.size() == g->size()); - double reg = 0; - for (size_t i = 0; i < weights.size(); ++i) { - const double& w_i = weights[i]; - double& g_i = (*g)[i]; - reg += C * w_i * w_i; - g_i += 2 * C * w_i; - } - return reg; -} - -void LoadEmbeddings(const string& filename, vector<RVector>* pv) { - vector<RVector>& v = *pv; - cerr << "Reading embeddings from " << filename << " ...\n"; - ReadFile rf(filename); - istream& in = *rf.stream(); - string line; - unsigned lc = 0; - while(getline(in, line)) { - ++lc; - size_t cur = line.find(' '); - if (cur == string::npos || cur == 0) { - cerr << "Parse error reading line " << lc << ":\n" << line << endl; - abort(); - } - WordID w = TD::Convert(line.substr(0, cur)); - if (w >= v.size()) continue; - RVector& curv = v[w]; - line[cur] = 0; - size_t start = cur + 1; - cur = start + 1; - size_t c = 0; - while(cur < line.size()) { - if (line[cur] == ' ') { - line[cur] = 0; - curv[c++] = strtod(&line[start], NULL); - start = cur + 1; - cur = start; - if (c == kDIMENSIONS) break; - } - ++cur; - } - if (c < kDIMENSIONS && cur != start) { - if (cur < line.size()) line[cur] = 0; - curv[c++] = strtod(&line[start], NULL); - } - if (c != kDIMENSIONS) { - static bool first = true; - if (first) { - cerr << " read " << c << " dimensions from embedding file, but built with " << kDIMENSIONS << " (filling in with random values)\n"; - first = false; - } - for (; c < kDIMENSIONS; ++c) curv[c] = rand(); - } - if (c == kDIMENSIONS && cur != line.size()) { - static bool first = true; - if (first) { - cerr << " embedding file contains more dimensions than configured with, truncating.\n"; - first = false; - } - } - } -} - -int main(int argc, char** argv) { -#ifdef HAVE_MPI - std::cerr << "**MPI enabled.\n"; - mpi::environment env(argc, argv); - mpi::communicator world; - const int 
size = world.size(); - const int rank = world.rank(); -#else - std::cerr << "**MPI disabled.\n"; - const int rank = 0; - const int size = 1; -#endif - po::variables_map conf; - if (!InitCommandLine(argc, argv, &conf)) return 1; - const string fname = conf["input"].as<string>(); - const double reg_strength = conf["regularization_strength"].as<double>(); - const bool has_l2 = reg_strength; - assert(reg_strength >= 0.0f); - const int ITERATIONS = conf["iterations"].as<unsigned>(); - const double eta = conf["eta"].as<double>(); - const double diagonal_tension = conf["diagonal_tension"].as<double>(); - bool SGD = false; - if (diagonal_tension < 0.0) { - cerr << "Invalid value for diagonal_tension: must be >= 0\n"; - return 1; - } - string testset; - if (conf.count("testset")) testset = conf["testset"].as<string>(); - - unsigned lc = 0; - vector<double> unnormed_a_i; - bool flag = false; - vector<vector<WordID> > srcs, trgs; - vector<WordID> vocab_e; - { - set<WordID> svocab_e, svocab_f; - CorpusTools::ReadFromFile(fname, &srcs, NULL, &trgs, &svocab_e, rank, size); - copy(svocab_e.begin(), svocab_e.end(), back_inserter(vocab_e)); - } - cerr << "Number of target word types: " << vocab_e.size() << endl; - const double num_examples = lc; - - boost::shared_ptr<LBFGSOptimizer> lbfgs; - if (rank == 0) - lbfgs.reset(new LBFGSOptimizer(kDIMENSIONS * kDIMENSIONS, 100)); - r_trg.resize(TD::NumWords() + 1); - r_src.resize(TD::NumWords() + 1); - vector<set<unsigned> > trg_pos(TD::NumWords() + 1); - - if (conf.count("random_seed")) { - srand(conf["random_seed"].as<unsigned>()); - } else { - unsigned seed = time(NULL) + rank * 100; - cerr << "Random seed: " << seed << endl; - srand(seed); - } - - TMatrix t = TMatrix::Zero(); - if (rank == 0) { - t = TMatrix::Random() / 50.0; - for (unsigned i = 1; i < r_trg.size(); ++i) { - r_trg[i] = RVector::Random(); - r_src[i] = RVector::Random(); - } - if (conf.count("source_embeddings")) - LoadEmbeddings(conf["source_embeddings"].as<string>(), 
&r_src); - if (conf.count("target_embeddings")) - LoadEmbeddings(conf["target_embeddings"].as<string>(), &r_trg); - } - - // do optimization - TMatrix g = TMatrix::Zero(); - vector<TMatrix> exp_src; - vector<double> z_src; - vector<double> flat_g, flat_t, rcv_grad; - Flatten(t, &flat_t); - bool converged = false; -#if HAVE_MPI - mpi::broadcast(world, &flat_t[0], flat_t.size(), 0); - mpi::broadcast(world, r_trg, 0); - mpi::broadcast(world, r_src, 0); -#endif - cerr << "rank=" << rank << ": " << r_trg[0][4] << endl; - for (int iter = 0; !converged && iter < ITERATIONS; ++iter) { - if (rank == 0) cerr << "ITERATION " << (iter + 1) << endl; - Unflatten(flat_t, &t); - double likelihood = 0; - double denom = 0.0; - lc = 0; - flag = false; - g *= 0; - for (unsigned i = 0; i < srcs.size(); ++i) { - const vector<WordID>& src = srcs[i]; - const vector<WordID>& trg = trgs[i]; - ++lc; - if (rank == 0 && lc % 1000 == 0) { cerr << '.'; flag = true; } - if (rank == 0 && lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; } - denom += trg.size(); - - exp_src.clear(); exp_src.resize(src.size(), TMatrix::Zero()); - z_src.clear(); z_src.resize(src.size(), 0.0); - Array2D<TMatrix> exp_refs(src.size(), trg.size(), TMatrix::Zero()); - Array2D<double> z_refs(src.size(), trg.size(), 0.0); - for (unsigned j = 0; j < trg.size(); ++j) - trg_pos[trg[j]].insert(j); - - for (unsigned i = 0; i < src.size(); ++i) { - const RVector& r_s = r_src[src[i]]; - const RTVector pred = r_s.transpose() * t; - TMatrix& exp_m = exp_src[i]; - double& z = z_src[i]; - for (unsigned k = 0; k < vocab_e.size(); ++k) { - const WordID v_k = vocab_e[k]; - const RVector& r_t = r_trg[v_k]; - const double dot_prod = pred * r_t; - const double u = exp(dot_prod); - z += u; - const TMatrix v = r_s * r_t.transpose() * u; - exp_m += v; - set<unsigned>& ref_locs = trg_pos[v_k]; - if (!ref_locs.empty()) { - for (set<unsigned>::iterator it = ref_locs.begin(); it != ref_locs.end(); ++it) { - TMatrix& exp_ref_ij 
= exp_refs(i, *it); - double& z_ref_ij = z_refs(i, *it); - z_ref_ij += u; - exp_ref_ij += v; - } - } - } - } - for (unsigned j = 0; j < trg.size(); ++j) - trg_pos[trg[j]].clear(); - - // model expectations for a single target generation with - // uniform alignment prior - // TODO: when using a non-uniform alignment, m_exp will be - // a function of j (below) - double m_z = 0; - TMatrix m_exp = TMatrix::Zero(); - for (unsigned i = 0; i < src.size(); ++i) { - m_exp += exp_src[i]; - m_z += z_src[i]; - } - m_exp /= m_z; - - Array2D<bool> al(src.size(), trg.size(), false); - for (unsigned j = 0; j < trg.size(); ++j) { - double ref_z = 0; - TMatrix ref_exp = TMatrix::Zero(); - int max_i = 0; - double max_s = -9999999; - for (unsigned i = 0; i < src.size(); ++i) { - ref_exp += exp_refs(i, j); - ref_z += z_refs(i, j); - if (log(z_refs(i, j)) > max_s) { - max_s = log(z_refs(i, j)); - max_i = i; - } - // TODO handle alignment prob - } - if (ref_z <= 0) { - cerr << "TRG=" << TD::Convert(trg[j]) << endl; - cerr << " LINE=" << lc << " (RANK=" << rank << "/" << size << ")" << endl; - cerr << " REF_EXP=\n" << ref_exp << endl; - cerr << " M_EXP=\n" << m_exp << endl; - abort(); - } - al(max_i, j) = true; - ref_exp /= ref_z; - g += m_exp - ref_exp; - likelihood += log(ref_z) - log(m_z); - if (SGD) { - t -= g * eta / num_examples; - g *= 0; - } - } - - if (rank == 0 && (iter == (ITERATIONS - 1) || lc < 12)) { cerr << al << endl; } - } - if (flag && rank == 0) { cerr << endl; } - - double obj = 0; - if (!SGD) { - Flatten(g, &flat_g); - obj = -likelihood; -#if HAVE_MPI - rcv_grad.resize(flat_g.size(), 0.0); - mpi::reduce(world, &flat_g[0], flat_g.size(), &rcv_grad[0], plus<double>(), 0); - swap(flat_g, rcv_grad); - rcv_grad.clear(); - - double to = 0; - mpi::reduce(world, obj, to, plus<double>(), 0); - obj = to; - double tlh = 0; - mpi::reduce(world, likelihood, tlh, plus<double>(), 0); - likelihood = tlh; - double td = 0; - mpi::reduce(world, denom, td, plus<double>(), 0); - denom = 
td; -#endif - } - - if (rank == 0) { - double gn = 0; - for (unsigned i = 0; i < flat_g.size(); ++i) - gn += flat_g[i]*flat_g[i]; - const double base2_likelihood = likelihood / log(2); - cerr << " log_e likelihood: " << likelihood << endl; - cerr << " log_2 likelihood: " << base2_likelihood << endl; - cerr << " cross entropy: " << (-base2_likelihood / denom) << endl; - cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl; - cerr << " gradient norm: " << sqrt(gn) << endl; - if (!SGD) { - if (has_l2) { - const double r = ApplyRegularization(reg_strength, - flat_t, - &flat_g); - obj += r; - cerr << " regularization: " << r << endl; - } - lbfgs->Optimize(obj, flat_g, &flat_t); - converged = (lbfgs->HasConverged()); - } - } -#ifdef HAVE_MPI - mpi::broadcast(world, &flat_t[0], flat_t.size(), 0); - mpi::broadcast(world, converged, 0); -#endif - } - if (rank == 0) - cerr << "TRANSLATION MATRIX:" << endl << t << endl; - return 0; -} - -#endif - diff --git a/training/minrisk/Makefile.am b/training/minrisk/Makefile.am new file mode 100644 index 00000000..821730c2 --- /dev/null +++ b/training/minrisk/Makefile.am @@ -0,0 +1,6 @@ +bin_PROGRAMS = minrisk_optimize + +minrisk_optimize_SOURCES = minrisk_optimize.cc +minrisk_optimize_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/training/liblbfgs/liblbfgs.a + +AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training -I$(top_srcdir)/training/utils diff --git a/training/minrisk/minrisk.pl b/training/minrisk/minrisk.pl new file mode 100755 index 00000000..0f8bacd0 --- /dev/null +++ b/training/minrisk/minrisk.pl @@ -0,0 +1,540 @@ +#!/usr/bin/env perl +use strict; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, 
$SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use IPC::Open2; +use POSIX ":sys_wait_h"; +my $QSUB_CMD = qsub_args(mert_memory()); +my $default_jobs = env_default_jobs(); + +my $UTILS_DIR="$SCRIPT_DIR/../utils"; +require "$UTILS_DIR/libcall.pl"; + +# Default settings +my $srcFile; +my $refFiles; +my $bin_dir = $SCRIPT_DIR; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $MAPINPUT = "$bin_dir/minrisk_generate_input.pl"; +my $MAPPER = "$bin_dir/minrisk_optimize"; +my $parallelize = "$UTILS_DIR/parallelize.pl"; +my $libcall = "$UTILS_DIR/libcall.pl"; +my $sentserver = "$UTILS_DIR/sentserver"; +my $sentclient = "$UTILS_DIR/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm"; + +my $SCORER = $FAST_SCORE; +die "Can't find $MAPPER" unless -x $MAPPER; +my $cdec = "$bin_dir/../../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $lines_per_mapper = 30; +my $iteration = 1; +my $best_weights; +my $psi = 1; +my $default_max_iter = 30; +my $max_iterations = $default_max_iter; +my $jobs = $default_jobs; # number of decode nodes +my $pmem = "4g"; +my $disable_clean = 0; +my %seen_weights; +my $help = 0; +my $epsilon = 0.0001; +my $dryrun = 0; +my $last_score = -10000000; +my $metric = "ibm_bleu"; +my $dir; +my $iniFile; +my $weights; +my $use_make = 1; # use make to parallelize +my $useqsub = 0; +my $initial_weights; +my $pass_suffix = ''; +my $cpbin=1; + +# regularization strength +my $tune_regularizer = 0; +my $reg = 500; +my $reg_previous = 5000; +my $dont_accum = 0; + +# Process command-line options 
+Getopt::Long::Configure("no_auto_abbrev"); +if (GetOptions( + "jobs=i" => \$jobs, + "dont-clean" => \$disable_clean, + "dont-accumulate" => \$dont_accum, + "pass-suffix=s" => \$pass_suffix, + "qsub" => \$useqsub, + "dry-run" => \$dryrun, + "epsilon=s" => \$epsilon, + "help" => \$help, + "weights=s" => \$initial_weights, + "reg=f" => \$reg, + "use-make=i" => \$use_make, + "max-iterations=i" => \$max_iterations, + "pmem=s" => \$pmem, + "cpbin!" => \$cpbin, + "ref-files=s" => \$refFiles, + "metric=s" => \$metric, + "source-file=s" => \$srcFile, + "workdir=s" => \$dir, +) == 0 || @ARGV!=1 || $help) { + print_help(); + exit; +} + +die "--tune-regularizer is no longer supported with --reg-previous and --reg. Please tune manually.\n" if $tune_regularizer; + +if ($useqsub) { + $use_make = 0; + die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); +} + +my @missing_args = (); +if (!defined $srcFile) { push @missing_args, "--source-file"; } +if (!defined $refFiles) { push @missing_args, "--ref-files"; } +if (!defined $initial_weights) { push @missing_args, "--weights"; } +die "Please specify missing arguments: " . join (', ', @missing_args) . 
"\n" if (@missing_args); + +if ($metric =~ /^(combi|ter)$/i) { + $lines_per_mapper = 5; +} + +($iniFile) = @ARGV; + + +sub write_config; +sub enseg; +sub print_help; + +my $nodelist; +my $host =check_output("hostname"); chomp $host; +my $bleu; +my $interval_count = 0; +my $logfile; +my $projected_score; + +# used in sorting scores +my $DIR_FLAG = '-r'; +if ($metric =~ /^ter$|^aer$/i) { + $DIR_FLAG = ''; +} + +my $refs_comma_sep = get_comma_sep_refs('r',$refFiles); + +unless ($dir){ + $dir = "minrisk"; +} +unless ($dir =~ /^\//){ # convert relative path to absolute path + my $basedir = check_output("pwd"); + chomp $basedir; + $dir = "$basedir/$dir"; +} + + +# Initializations and helper functions +srand; + +my @childpids = (); +my @cleanupcmds = (); + +sub cleanup { + print STDERR "Cleanup...\n"; + for my $pid (@childpids){ unchecked_call("kill $pid"); } + for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } + exit 1; +}; +# Always call cleanup, no matter how we exit +*CORE::GLOBAL::exit = + sub{ cleanup(); }; +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + +my $decoderBase = check_output("basename $decoder"); chomp $decoderBase; +my $newIniFile = "$dir/$decoderBase.ini"; +my $inputFileName = "$dir/input"; +my $user = $ENV{"USER"}; +# process ini file +-e $iniFile || die "Error: could not open $iniFile for reading\n"; +open(INI, $iniFile); + +use File::Basename qw(basename); +#pass bindir, refs to vars holding bin +sub modbin { + local $_; + my $bindir=shift; + check_call("mkdir -p $bindir"); + -d $bindir || die "couldn't make bindir $bindir"; + for (@_) { + my $src=$$_; + $$_="$bindir/".basename($src); + check_call("cp -p $src $$_"); + } +} +sub dirsize { + opendir ISEMPTY,$_[0]; + return scalar(readdir(ISEMPTY))-1; +} +my @allweights; +if ($dryrun){ + write_config(*STDERR); + exit 0; +} else { + if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-pro.pl outputs + die "ERROR: working dir 
$dir already exists\n\n"; + } else { + -e $dir || mkdir $dir; + mkdir "$dir/hgs"; + modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin; + mkdir "$dir/scripts"; + my $cmdfile="$dir/rerun-pro.sh"; + open CMD,'>',$cmdfile; + print CMD "cd ",&getcwd,"\n"; +# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted. + my $cline=&cmdline."\n"; + print CMD $cline; + close CMD; + print STDERR $cline; + chmod(0755,$cmdfile); + check_call("cp $initial_weights $dir/weights.0"); + die "Can't find weights.0" unless (-e "$dir/weights.0"); + } + write_config(*STDERR); +} + + +# Generate initial files and values +check_call("cp $iniFile $newIniFile"); +$iniFile = $newIniFile; + +my $newsrc = "$dir/dev.input"; +enseg($srcFile, $newsrc); +$srcFile = $newsrc; +my $devSize = 0; +open F, "<$srcFile" or die "Can't read $srcFile: $!"; +while(<F>) { $devSize++; } +close F; + +unless($best_weights){ $best_weights = $weights; } +unless($projected_score){ $projected_score = 0.0; } +$seen_weights{$weights} = 1; +my $kbest = "$dir/kbest"; +if ($dont_accum) { + $kbest = ''; +} else { + check_call("mkdir -p $kbest"); + $kbest = "--kbest_repository $kbest"; +} + +my $random_seed = int(time / 1000); +my $lastWeightsFile; +my $lastPScore = 0; +# main optimization loop +while (1){ + print STDERR "\n\nITERATION $iteration\n==========\n"; + + if ($iteration > $max_iterations){ + print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n"; + last; + } + # iteration-specific files + my $runFile="$dir/run.raw.$iteration"; + my $onebestFile="$dir/1best.$iteration"; + my $logdir="$dir/logs.$iteration"; + my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; + my $scorerLog="$logdir/scorer.log.$iteration"; + check_call("mkdir -p $logdir"); + + + #decode + print STDERR "RUNNING DECODER AT "; + print STDERR unchecked_output("date"); + my $im1 = $iteration - 1; + my $weightsFile="$dir/weights.$im1"; + push 
@allweights, "-w $dir/weights.$im1"; + `rm -f $dir/hgs/*.gz`; + my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; + my $pcmd; + if ($use_make) { + $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --"; + } else { + $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --"; + } + my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + my $num_hgs; + my $num_topbest; + my $retries = 0; + while($retries < 5) { + $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l"); + $num_topbest = check_output("wc -l < $runFile"); + print STDERR "NUMBER OF HGs: $num_hgs\n"; + print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; + if($devSize == $num_hgs && $devSize == $num_topbest) { + last; + } else { + print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n"; + sleep(3); + } + $retries++; + } + die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? 
Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); + my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric"); + chomp $dec_score; + print STDERR "DECODER SCORE: $dec_score\n"; + + # save space + check_call("gzip -f $runFile"); + check_call("gzip -f $decoderLog"); + + # run optimizer + print STDERR "RUNNING OPTIMIZER AT "; + print STDERR unchecked_output("date"); + print STDERR " - GENERATE TRAINING EXEMPLARS\n"; + my $mergeLog="$logdir/prune-merge.log.$iteration"; + + my $score = 0; + my $icc = 0; + my $inweights="$dir/weights.$im1"; + my $outweights="$dir/weights.$iteration"; + $cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1"; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + $cmd="$MAPPER $refs_comma_sep -m $metric -i $dir/agenda.$im1 $kbest -w $inweights > $outweights"; + check_call($cmd); + $lastWeightsFile = $outweights; + $iteration++; + `rm hgs/*.gz`; + print STDERR "\n==========\n"; +} + +print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n"; + +print STDOUT "$lastWeightsFile\n"; + +exit 0; + +sub get_lines { + my $fn = shift @_; + open FL, "<$fn" or die "Couldn't read $fn: $!"; + my $lc = 0; + while(<FL>) { $lc++; } + return $lc; +} + +sub get_comma_sep_refs { + my ($r,$p) = @_; + my $o = check_output("echo $p"); + chomp $o; + my @files = split /\s+/, $o; + return "-$r " . 
join(" -$r ", @files); +} + +sub read_weights_file { + my ($file) = @_; + open F, "<$file" or die "Couldn't read $file: $!"; + my @r = (); + my $pm = -1; + while(<F>) { + next if /^#/; + next if /^\s*$/; + chomp; + if (/^(.+)\s+(.+)$/) { + my $m = $1; + my $w = $2; + die "Weights out of order: $m <= $pm" unless $m > $pm; + push @r, $w; + } else { + warn "Unexpected feature name in weight file: $_"; + } + } + close F; + return join ' ', @r; +} + +# subs +sub write_config { + my $fh = shift; + my $cleanup = "yes"; + if ($disable_clean) {$cleanup = "no";} + + print $fh "\n"; + print $fh "DECODER: $decoder\n"; + print $fh "INI FILE: $iniFile\n"; + print $fh "WORKING DIR: $dir\n"; + print $fh "SOURCE (DEV): $srcFile\n"; + print $fh "REFS (DEV): $refFiles\n"; + print $fh "EVAL METRIC: $metric\n"; + print $fh "MAX ITERATIONS: $max_iterations\n"; + print $fh "JOBS: $jobs\n"; + print $fh "HEAD NODE: $host\n"; + print $fh "PMEM (DECODING): $pmem\n"; + print $fh "CLEANUP: $cleanup\n"; +} + +sub update_weights_file { + my ($neww, $rfn, $rpts) = @_; + my @feats = @$rfn; + my @pts = @$rpts; + my $num_feats = scalar @feats; + my $num_pts = scalar @pts; + die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts; + open G, ">$neww" or die; + for (my $i = 0; $i < $num_feats; $i++) { + my $f = $feats[$i]; + my $lambda = $pts[$i]; + print G "$f $lambda\n"; + } + close G; +} + +sub enseg { + my $src = shift; + my $newsrc = shift; + open(SRC, $src); + open(NEWSRC, ">$newsrc"); + my $i=0; + while (my $line=<SRC>){ + chomp $line; + if ($line =~ /^\s*<seg/i) { + if($line =~ /id="[0-9]+"/) { + print NEWSRC "$line\n"; + } else { + die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute"; + } + } else { + print NEWSRC "<seg id=\"$i\">$line</seg>\n"; + } + $i++; + } + close SRC; + close NEWSRC; + die "Empty dev set!" 
if ($i == 0); +} + +sub print_help { + + my $executable = check_output("basename $0"); chomp $executable; + print << "Help"; + +Usage: $executable [options] <ini file> + + $executable [options] <ini file> + Runs a complete PRO optimization using the ini file specified. + +Required: + + --ref-files <files> + Dev set ref files. This option takes only a single string argument. + To use multiple files (including file globbing), this argument should + be quoted. + + --source-file <file> + Dev set source file. + + --weights <file> + Initial weights file (use empty file to start from 0) + +General options: + + --help + Print this message and exit. + + --dont-accumulate + Don't accumulate k-best lists from multiple iterations. + + --max-iterations <M> + Maximum number of iterations to run. If not specified, defaults + to $default_max_iter. + + --metric <method> + Metric to optimize. + Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + + --pass-suffix <S> + If the decoder is doing multi-pass decoding, the pass suffix "2", + "3", etc., is used to control what iteration of weights is set. + + --workdir <dir> + Directory for intermediate and output files. If not specified, the + name is derived from the ini filename. Assuming that the ini + filename begins with the decoder name and ends with ini, the default + name of the working directory is inferred from the middle part of + the filename. E.g. an ini file named decoder.foo.ini would have + a default working directory name foo. + +Regularization options: + + --reg <F> + l2 regularization strength [default=500]. The greater this value, + the closer to zero the weights will be. + +Job control options: + + --jobs <I> + Number of decoder processes to run in parallel. 
[default=$default_jobs] + + --qsub + Use qsub to run jobs in parallel (qsub must be configured in + environment/LocalEnvironment.pm) + + --pmem <N> + Amount of physical memory requested for parallel decoding jobs + (used with qsub requests only) + +Help +} + +sub convert { + my ($str) = @_; + my @ps = split /;/, $str; + my %dict = (); + for my $p (@ps) { + my ($k, $v) = split /=/, $p; + $dict{$k} = $v; + } + return %dict; +} + + +sub cmdline { + return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? +my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} + +sub escaped_shell_args { + return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { + return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { + return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} diff --git a/training/minrisk/minrisk_generate_input.pl b/training/minrisk/minrisk_generate_input.pl new file mode 100755 index 00000000..b30fc4fd --- /dev/null +++ b/training/minrisk/minrisk_generate_input.pl @@ -0,0 +1,18 @@ +#!/usr/bin/perl -w +use strict; + +die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1; +my $d = shift @ARGV; +die "Can't find directory $d" unless -d $d; + +opendir(DIR, $d) or die "Can't read $d: $!"; +my @hgs = grep { /\.gz$/ } readdir(DIR); +closedir DIR; + +for my $hg (@hgs) { + my $file = $hg; + my $id = $hg; + $id =~ s/(\.json)?\.gz//; + print "$d/$file $id\n"; +} + diff --git a/training/minrisk/minrisk_optimize.cc b/training/minrisk/minrisk_optimize.cc new file mode 100644 index 00000000..da8b5260 --- /dev/null +++ b/training/minrisk/minrisk_optimize.cc @@ -0,0 +1,197 @@ +#include <sstream> +#include <iostream> +#include <vector> +#include <limits> + +#include <boost/program_options.hpp> 
+#include <boost/program_options/variables_map.hpp> + +#include "liblbfgs/lbfgs++.h" +#include "filelib.h" +#include "stringlib.h" +#include "weights.h" +#include "hg_io.h" +#include "kbest.h" +#include "viterbi.h" +#include "ns.h" +#include "ns_docscorer.h" +#include "candidate_set.h" +#include "risk.h" +#include "entropy.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)") + ("weights,w",po::value<string>(), "[REQD] Weights files from current iterations") + ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)") + ("evaluation_metric,m",po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)") + ("temperature,T",po::value<double>()->default_value(0.0), "Temperature parameter for objective (>0 increases the entropy)") + ("l1_strength,C",po::value<double>()->default_value(0.0), "L1 regularization strength") + ("memory_buffers,M",po::value<unsigned>()->default_value(20), "Memory buffers used in LBFGS") + ("kbest_repository,R",po::value<string>(), "Accumulate k-best lists from previous iterations (parameter is path to repository)") + ("kbest_size,k",po::value<unsigned>()->default_value(500u), "Top k-hypotheses to extract") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (!conf->count("reference")) { + cerr << "Please specify one or more references using -r <REF.TXT>\n"; + flag = true; + } + if (!conf->count("weights")) { + cerr << "Please specify weights using -w <WEIGHTS.TXT>\n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + 
exit(1); + } +} + +EvaluationMetric* metric = NULL; + +struct RiskObjective { + explicit RiskObjective(const vector<training::CandidateSet>& tr, const double temp) : training(tr), T(temp) {} + double operator()(const vector<double>& x, double* g) const { + fill(g, g + x.size(), 0.0); + double obj = 0; + double h = 0; + for (unsigned i = 0; i < training.size(); ++i) { + training::CandidateSetRisk risk(training[i], *metric); + training::CandidateSetEntropy entropy(training[i]); + SparseVector<double> tg, hg; + double r = risk(x, &tg); + double hh = entropy(x, &hg); + h += hh; + obj += r; + for (SparseVector<double>::iterator it = tg.begin(); it != tg.end(); ++it) + g[it->first] += it->second; + if (T) { + for (SparseVector<double>::iterator it = hg.begin(); it != hg.end(); ++it) + g[it->first] += T * it->second; + } + } + cerr << (1-(obj / training.size())) << " H=" << h << endl; + return obj - T * h; + } + const vector<training::CandidateSet>& training; + const double T; // temperature for entropy regularization +}; + +double LearnParameters(const vector<training::CandidateSet>& training, + const double temp, // > 0 increases the entropy, < 0 decreases the entropy + const double C1, + const unsigned memory_buffers, + vector<weight_t>* px) { + RiskObjective obj(training, temp); + LBFGS<RiskObjective> lbfgs(px, obj, memory_buffers, C1); + lbfgs.MinimizeFunction(); + return 0; +} + +#if 0 +struct FooLoss { + double operator()(const vector<double>& x, double* g) const { + fill(g, g + x.size(), 0.0); + training::CandidateSet cs; + training::CandidateSetEntropy cse(cs); + cs.cs.resize(3); + cs.cs[0].fmap.set_value(FD::Convert("F1"), -1.0); + cs.cs[1].fmap.set_value(FD::Convert("F2"), 1.0); + cs.cs[2].fmap.set_value(FD::Convert("F1"), 2.0); + cs.cs[2].fmap.set_value(FD::Convert("F2"), 0.5); + SparseVector<double> xx; + double h = cse(x, &xx); + cerr << cse(x, &xx) << endl; cerr << "G: " << xx << endl; + for (SparseVector<double>::iterator i = xx.begin(); i != xx.end(); 
++i) + g[i->first] += i->second; + return -h; + } +}; +#endif + +int main(int argc, char** argv) { +#if 0 + training::CandidateSet cs; + training::CandidateSetEntropy cse(cs); + cs.cs.resize(3); + cs.cs[0].fmap.set_value(FD::Convert("F1"), -1.0); + cs.cs[1].fmap.set_value(FD::Convert("F2"), 1.0); + cs.cs[2].fmap.set_value(FD::Convert("F1"), 2.0); + cs.cs[2].fmap.set_value(FD::Convert("F2"), 0.5); + FooLoss foo; + vector<double> ww(FD::NumFeats()); ww[FD::Convert("F1")] = 1.0; + LBFGS<FooLoss> lbfgs(&ww, foo, 100, 0.0); + lbfgs.MinimizeFunction(); + return 1; +#endif + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string evaluation_metric = conf["evaluation_metric"].as<string>(); + + metric = EvaluationMetric::Instance(evaluation_metric); + DocumentScorer ds(metric, conf["reference"].as<vector<string> >()); + cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; + + Hypergraph hg; + string last_file; + ReadFile in_read(conf["input"].as<string>()); + string kbest_repo; + if (conf.count("kbest_repository")) { + kbest_repo = conf["kbest_repository"].as<string>(); + MkDirP(kbest_repo); + } + istream &in=*in_read.stream(); + const unsigned kbest_size = conf["kbest_size"].as<unsigned>(); + vector<weight_t> weights; + const string weightsf = conf["weights"].as<string>(); + Weights::InitFromFile(weightsf, &weights); + double t = 0; + for (unsigned i = 0; i < weights.size(); ++i) + t += weights[i] * weights[i]; + if (t > 0) { + for (unsigned i = 0; i < weights.size(); ++i) + weights[i] /= sqrt(t); + } + string line, file; + vector<training::CandidateSet> kis; + cerr << "Loading hypergraphs...\n"; + while(getline(in, line)) { + istringstream is(line); + int sent_id; + kis.resize(kis.size() + 1); + training::CandidateSet& curkbest = kis.back(); + string kbest_file; + if (kbest_repo.size()) { + ostringstream os; + os << kbest_repo << "/kbest." 
<< sent_id << ".txt.gz"; + kbest_file = os.str(); + if (FileExists(kbest_file)) + curkbest.ReadFromFile(kbest_file); + } + is >> file >> sent_id; + ReadFile rf(file); + if (kis.size() % 5 == 0) { cerr << '.'; } + if (kis.size() % 200 == 0) { cerr << " [" << kis.size() << "]\n"; } + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + hg.Reweight(weights); + curkbest.AddKBestCandidates(hg, kbest_size, ds[sent_id]); + if (kbest_file.size()) + curkbest.WriteToFile(kbest_file); + } + cerr << "\nHypergraphs loaded.\n"; + weights.resize(FD::NumFeats()); + + double c1 = conf["l1_strength"].as<double>(); + double temp = conf["temperature"].as<double>(); + unsigned m = conf["memory_buffers"].as<unsigned>(); + LearnParameters(kis, temp, c1, m, &weights); + Weights::WriteToFile("-", weights); + return 0; +} + diff --git a/training/mira/Makefile.am b/training/mira/Makefile.am new file mode 100644 index 00000000..c8f404fb --- /dev/null +++ b/training/mira/Makefile.am @@ -0,0 +1,6 @@ +bin_PROGRAMS = kbest_mira + +kbest_mira_SOURCES = kbest_mira.cc +kbest_mira_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/training/mira/kbest_mira.cc b/training/mira/kbest_mira.cc new file mode 100644 index 00000000..8b7993dd --- /dev/null +++ b/training/mira/kbest_mira.cc @@ -0,0 +1,309 @@ +#include <sstream> +#include <iostream> +#include <vector> +#include <cassert> +#include <cmath> +#include <tr1/memory> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "hg_sampler.h" +#include "sentence_metadata.h" +#include "scorer.h" +#include "verbose.h" +#include "viterbi.h" +#include "hg.h" +#include "prob.h" +#include "kbest.h" +#include "ff_register.h" +#include 
"decoder.h" +#include "filelib.h" +#include "fdict.h" +#include "weights.h" +#include "sparse_vector.h" +#include "sampler.h" + +using namespace std; +namespace po = boost::program_options; + +bool invert_score; +std::tr1::shared_ptr<MT19937> rng; + +void RandomPermutation(int len, vector<int>* p_ids) { + vector<int>& ids = *p_ids; + ids.resize(len); + for (int i = 0; i < len; ++i) ids[i] = i; + for (int i = len; i > 0; --i) { + int j = rng->next() * i; + if (j == i) i--; + swap(ids[i-1], ids[j]); + } +} + +bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("input_weights,w",po::value<string>(),"Input feature weights file") + ("source,i",po::value<string>(),"Source file for development set") + ("passes,p", po::value<int>()->default_value(15), "Number of passes through the training data") + ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation(s) (tokenized text file)") + ("mt_metric,m",po::value<string>()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)") + ("max_step_size,C", po::value<double>()->default_value(0.01), "regularization strength (C)") + ("mt_metric_scale,s", po::value<double>()->default_value(1.0), "Amount to scale MT loss function by") + ("k_best_size,k", po::value<int>()->default_value(250), "Size of hypothesis list to search for oracles") + ("sample_forest,f", "Instead of a k-best list, sample k hypotheses from the decoder's forest") + ("sample_forest_unit_weight_vector,x", "Before sampling (must use -f option), rescale the weight vector used so it has unit length; this may improve the quality of the samples") + ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)") + ("decoder_config,c",po::value<string>(),"Decoder configuration file"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", 
po::value<string>(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as<string>().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || !conf->count("input_weights") || !conf->count("source") || !conf->count("decoder_config") || !conf->count("reference")) { + cerr << dcmdline_options << endl; + return false; + } + return true; +} + +static const double kMINUS_EPSILON = -1e-6; + +struct HypothesisInfo { + SparseVector<double> features; + double mt_metric; +}; + +struct GoodBadOracle { + std::tr1::shared_ptr<HypothesisInfo> good; + std::tr1::shared_ptr<HypothesisInfo> bad; +}; + +struct TrainingObserver : public DecoderObserver { + TrainingObserver(const int k, const DocScorer& d, bool sf, vector<GoodBadOracle>* o) : ds(d), oracles(*o), kbest_size(k), sample_forest(sf) {} + const DocScorer& ds; + vector<GoodBadOracle>& oracles; + std::tr1::shared_ptr<HypothesisInfo> cur_best; + const int kbest_size; + const bool sample_forest; + + const HypothesisInfo& GetCurrentBestHypothesis() const { + return *cur_best; + } + + virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { + UpdateOracles(smeta.GetSentenceID(), *hg); + } + + std::tr1::shared_ptr<HypothesisInfo> MakeHypothesisInfo(const SparseVector<double>& feats, const double score) { + std::tr1::shared_ptr<HypothesisInfo> h(new HypothesisInfo); + h->features = feats; + h->mt_metric = score; + return h; + } + + void UpdateOracles(int sent_id, const Hypergraph& forest) { + std::tr1::shared_ptr<HypothesisInfo>& cur_good = oracles[sent_id].good; + std::tr1::shared_ptr<HypothesisInfo>& cur_bad = oracles[sent_id].bad; + 
cur_bad.reset(); // TODO get rid of?? + + if (sample_forest) { + vector<WordID> cur_prediction; + ViterbiESentence(forest, &cur_prediction); + float sentscore = ds[sent_id]->ScoreCandidate(cur_prediction)->ComputeScore(); + cur_best = MakeHypothesisInfo(ViterbiFeatures(forest), sentscore); + + vector<HypergraphSampler::Hypothesis> samples; + HypergraphSampler::sample_hypotheses(forest, kbest_size, &*rng, &samples); + for (unsigned i = 0; i < samples.size(); ++i) { + sentscore = ds[sent_id]->ScoreCandidate(samples[i].words)->ComputeScore(); + if (invert_score) sentscore *= -1.0; + if (!cur_good || sentscore > cur_good->mt_metric) + cur_good = MakeHypothesisInfo(samples[i].fmap, sentscore); + if (!cur_bad || sentscore < cur_bad->mt_metric) + cur_bad = MakeHypothesisInfo(samples[i].fmap, sentscore); + } + } else { + KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, kbest_size); + for (int i = 0; i < kbest_size; ++i) { + const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(forest.nodes_.size() - 1, i); + if (!d) break; + float sentscore = ds[sent_id]->ScoreCandidate(d->yield)->ComputeScore(); + if (invert_score) sentscore *= -1.0; + // cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl; + if (i == 0) + cur_best = MakeHypothesisInfo(d->feature_values, sentscore); + if (!cur_good || sentscore > cur_good->mt_metric) + cur_good = MakeHypothesisInfo(d->feature_values, sentscore); + if (!cur_bad || sentscore < cur_bad->mt_metric) + cur_bad = MakeHypothesisInfo(d->feature_values, sentscore); + } + //cerr << "GOOD: " << cur_good->mt_metric << endl; + //cerr << " CUR: " << cur_best->mt_metric << endl; + //cerr << " BAD: " << cur_bad->mt_metric << endl; + } + } +}; + +void ReadTrainingCorpus(const string& fname, vector<string>* c) { + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + while(in) { + getline(in, line); + if (!in) break; + 
c->push_back(line); + } +} + +bool ApproxEqual(double a, double b) { + if (a == b) return true; + return (fabs(a-b)/fabs(b)) < 0.000001; +} + +int main(int argc, char** argv) { + register_feature_functions(); + SetSilent(true); // turn off verbose decoder output + + po::variables_map conf; + if (!InitCommandLine(argc, argv, &conf)) return 1; + + if (conf.count("random_seed")) + rng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); + else + rng.reset(new MT19937); + const bool sample_forest = conf.count("sample_forest") > 0; + const bool sample_forest_unit_weight_vector = conf.count("sample_forest_unit_weight_vector") > 0; + if (sample_forest_unit_weight_vector && !sample_forest) { + cerr << "Cannot --sample_forest_unit_weight_vector without --sample_forest" << endl; + return 1; + } + vector<string> corpus; + ReadTrainingCorpus(conf["source"].as<string>(), &corpus); + const string metric_name = conf["mt_metric"].as<string>(); + ScoreType type = ScoreTypeFromString(metric_name); + if (type == TER) { + invert_score = true; + } else { + invert_score = false; + } + DocScorer ds(type, conf["reference"].as<vector<string> >(), ""); + cerr << "Loaded " << ds.size() << " references for scoring with " << metric_name << endl; + if (ds.size() != corpus.size()) { + cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n"; + return 1; + } + + ReadFile ini_rf(conf["decoder_config"].as<string>()); + Decoder decoder(ini_rf.stream()); + + // load initial weights + vector<weight_t>& dense_weights = decoder.CurrentWeightVector(); + SparseVector<weight_t> lambdas; + Weights::InitFromFile(conf["input_weights"].as<string>(), &dense_weights); + Weights::InitSparseVector(dense_weights, &lambdas); + + const double max_step_size = conf["max_step_size"].as<double>(); + const double mt_metric_scale = conf["mt_metric_scale"].as<double>(); + + assert(corpus.size() > 0); + vector<GoodBadOracle> oracles(corpus.size()); + + TrainingObserver 
observer(conf["k_best_size"].as<int>(), ds, sample_forest, &oracles); + int cur_sent = 0; + int lcount = 0; + int normalizer = 0; + double tot_loss = 0; + int dots = 0; + int cur_pass = 0; + SparseVector<double> tot; + tot += lambdas; // initial weights + normalizer++; // count for initial weights + int max_iteration = conf["passes"].as<int>() * corpus.size(); + string msg = "# MIRA tuned weights"; + string msga = "# MIRA tuned weights AVERAGED"; + vector<int> order; + RandomPermutation(corpus.size(), &order); + while (lcount <= max_iteration) { + lambdas.init_vector(&dense_weights); + if ((cur_sent * 40 / corpus.size()) > dots) { ++dots; cerr << '.'; } + if (corpus.size() == cur_sent) { + cerr << " [AVG METRIC LAST PASS=" << (tot_loss / corpus.size()) << "]\n"; + Weights::ShowLargestFeatures(dense_weights); + cur_sent = 0; + tot_loss = 0; + dots = 0; + ostringstream os; + os << "weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << ".gz"; + SparseVector<double> x = tot; + x /= normalizer; + ostringstream sa; + sa << "weights.mira-pass" << (cur_pass < 10 ? 
"0" : "") << cur_pass << "-avg.gz"; + x.init_vector(&dense_weights); + Weights::WriteToFile(os.str(), dense_weights, true, &msg); + ++cur_pass; + RandomPermutation(corpus.size(), &order); + } + if (cur_sent == 0) { + cerr << "PASS " << (lcount / corpus.size() + 1) << endl; + } + decoder.SetId(order[cur_sent]); + double sc = 1.0; + if (sample_forest_unit_weight_vector) { + sc = lambdas.l2norm(); + if (sc > 0) { + for (unsigned i = 0; i < dense_weights.size(); ++i) + dense_weights[i] /= sc; + } + } + decoder.Decode(corpus[order[cur_sent]], &observer); // update oracles + if (sc && sc != 1.0) { + for (unsigned i = 0; i < dense_weights.size(); ++i) + dense_weights[i] *= sc; + } + const HypothesisInfo& cur_hyp = observer.GetCurrentBestHypothesis(); + const HypothesisInfo& cur_good = *oracles[order[cur_sent]].good; + const HypothesisInfo& cur_bad = *oracles[order[cur_sent]].bad; + tot_loss += cur_hyp.mt_metric; + if (!ApproxEqual(cur_hyp.mt_metric, cur_good.mt_metric)) { + const double loss = cur_bad.features.dot(dense_weights) - cur_good.features.dot(dense_weights) + + mt_metric_scale * (cur_good.mt_metric - cur_bad.mt_metric); + //cerr << "LOSS: " << loss << endl; + if (loss > 0.0) { + SparseVector<double> diff = cur_good.features; + diff -= cur_bad.features; + double step_size = loss / diff.l2norm_sq(); + //cerr << loss << " " << step_size << " " << diff << endl; + if (step_size > max_step_size) step_size = max_step_size; + lambdas += (cur_good.features * step_size); + lambdas -= (cur_bad.features * step_size); + //cerr << "L: " << lambdas << endl; + } + } + tot += lambdas; + ++normalizer; + ++lcount; + ++cur_sent; + } + cerr << endl; + Weights::WriteToFile("weights.mira-final.gz", dense_weights, true, &msg); + tot /= normalizer; + tot.init_vector(dense_weights); + msg = "# MIRA tuned weights (averaged vector)"; + Weights::WriteToFile("weights.mira-final-avg.gz", dense_weights, true, &msg); + cerr << "Optimization complete.\nAVERAGED WEIGHTS: 
weights.mira-final-avg.gz\n"; + return 0; +} + diff --git a/training/mpi_em_optimize.cc b/training/mpi_em_optimize.cc deleted file mode 100644 index 48683b15..00000000 --- a/training/mpi_em_optimize.cc +++ /dev/null @@ -1,389 +0,0 @@ -#include <sstream> -#include <iostream> -#include <vector> -#include <cassert> -#include <cmath> - -#ifdef HAVE_MPI -#include <mpi.h> -#endif - -#include <boost/shared_ptr.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "verbose.h" -#include "hg.h" -#include "prob.h" -#include "inside_outside.h" -#include "ff_register.h" -#include "decoder.h" -#include "filelib.h" -#include "optimize.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" - -using namespace std; -using boost::shared_ptr; -namespace po = boost::program_options; - -void SanityCheck(const vector<double>& w) { - for (int i = 0; i < w.size(); ++i) { - assert(!isnan(w[i])); - assert(!isinf(w[i])); - } -} - -struct FComp { - const vector<double>& w_; - FComp(const vector<double>& w) : w_(w) {} - bool operator()(int a, int b) const { - return fabs(w_[a]) > fabs(w_[b]); - } -}; - -void ShowLargestFeatures(const vector<double>& w) { - vector<int> fnums(w.size()); - for (int i = 0; i < w.size(); ++i) - fnums[i] = i; - vector<int>::iterator mid = fnums.begin(); - mid += (w.size() > 10 ? 
10 : w.size()); - partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); - cerr << "TOP FEATURES:"; - for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) { - cerr << ' ' << FD::Convert(*i) << '=' << w[*i]; - } - cerr << endl; -} - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("input_weights,w",po::value<string>(),"Input feature weights file") - ("training_data,t",po::value<string>(),"Training data") - ("decoder_config,c",po::value<string>(),"Decoder configuration file") - ("output_weights,o",po::value<string>()->default_value("-"),"Output feature weights file"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || !(conf->count("training_data")) || !conf->count("decoder_config")) { - cerr << dcmdline_options << endl; -#ifdef HAVE_MPI - MPI::Finalize(); -#endif - exit(1); - } -} - -void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c) { - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - int lc = 0; - while(in) { - getline(in, line); - if (!in) break; - if (lc % size == rank) c->push_back(line); - ++lc; - } -} - -static const double kMINUS_EPSILON = -1e-6; - -struct TrainingObserver : public DecoderObserver { - void Reset() { - total_complete = 0; - cur_obj = 0; - tot_obj = 0; - tot.clear(); - } - - void SetLocalGradientAndObjective(SparseVector<double>* g, double* o) 
const { - *o = tot_obj; - *g = tot; - } - - virtual void NotifyDecodingStart(const SentenceMetadata& smeta) { - cur_obj = 0; - state = 1; - } - - void ExtractExpectedCounts(Hypergraph* hg) { - vector<prob_t> posts; - cur.clear(); - const prob_t z = hg->ComputeEdgePosteriors(1.0, &posts); - cur_obj = log(z); - for (int i = 0; i < posts.size(); ++i) { - const SparseVector<double>& efeats = hg->edges_[i].feature_values_; - const double post = static_cast<double>(posts[i] / z); - for (SparseVector<double>::const_iterator j = efeats.begin(); j != efeats.end(); ++j) - cur.add_value(j->first, post); - } - } - - // compute model expectations, denominator of objective - virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { - assert(state == 1); - state = 2; - ExtractExpectedCounts(hg); - } - - // replace translation forest, since we're doing EM training (we don't know which) - virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) { - assert(state == 2); - state = 3; - ExtractExpectedCounts(hg); - } - - virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) { - ++total_complete; - tot_obj += cur_obj; - tot += cur; - } - - int total_complete; - double cur_obj; - double tot_obj; - SparseVector<double> cur, tot; - int state; -}; - -void ReadConfig(const string& ini, vector<string>* out) { - ReadFile rf(ini); - istream& in = *rf.stream(); - while(in) { - string line; - getline(in, line); - if (!in) continue; - out->push_back(line); - } -} - -void StoreConfig(const vector<string>& cfg, istringstream* o) { - ostringstream os; - for (int i = 0; i < cfg.size(); ++i) { os << cfg[i] << endl; } - o->str(os.str()); -} - -struct OptimizableMultinomialFamily { - struct CPD { - CPD() : z() {} - double z; - map<WordID, double> c2counts; - }; - map<WordID, CPD> counts; - double Value(WordID conditioning, WordID generated) const { - map<WordID, CPD>::const_iterator it = counts.find(conditioning); - assert(it != 
counts.end()); - map<WordID,double>::const_iterator r = it->second.c2counts.find(generated); - if (r == it->second.c2counts.end()) return 0; - return r->second; - } - void Increment(WordID conditioning, WordID generated, double count) { - CPD& cc = counts[conditioning]; - cc.z += count; - cc.c2counts[generated] += count; - } - void Optimize() { - for (map<WordID, CPD>::iterator i = counts.begin(); i != counts.end(); ++i) { - CPD& cpd = i->second; - for (map<WordID, double>::iterator j = cpd.c2counts.begin(); j != cpd.c2counts.end(); ++j) { - j->second /= cpd.z; - // cerr << "P(" << TD::Convert(j->first) << " | " << TD::Convert(i->first) << " ) = " << j->second << endl; - } - } - } - void Clear() { - counts.clear(); - } -}; - -struct CountManager { - CountManager(size_t num_types) : oms_(num_types) {} - virtual ~CountManager(); - virtual void AddCounts(const SparseVector<double>& c) = 0; - void Optimize(SparseVector<double>* weights) { - for (int i = 0; i < oms_.size(); ++i) { - oms_[i].Optimize(); - } - GetOptimalValues(weights); - for (int i = 0; i < oms_.size(); ++i) { - oms_[i].Clear(); - } - } - virtual void GetOptimalValues(SparseVector<double>* wv) const = 0; - vector<OptimizableMultinomialFamily> oms_; -}; -CountManager::~CountManager() {} - -struct TaggerCountManager : public CountManager { - // 0 = transitions, 2 = emissions - TaggerCountManager() : CountManager(2) {} - void AddCounts(const SparseVector<double>& c); - void GetOptimalValues(SparseVector<double>* wv) const { - for (set<int>::const_iterator it = fids_.begin(); it != fids_.end(); ++it) { - int ftype; - WordID cond, gen; - bool is_optimized = TaggerCountManager::GetFeature(*it, &ftype, &cond, &gen); - assert(is_optimized); - wv->set_value(*it, log(oms_[ftype].Value(cond, gen))); - } - } - // Id:0:a=1 Bi:a_b=1 Bi:b_c=1 Bi:c_d=1 Uni:a=1 Uni:b=1 Uni:c=1 Uni:d=1 Id:1:b=1 Bi:BOS_a=1 Id:2:c=1 - static bool GetFeature(const int fid, int* feature_type, WordID* cond, WordID* gen) { - const string& feat 
= FD::Convert(fid); - if (feat.size() > 5 && feat[0] == 'I' && feat[1] == 'd' && feat[2] == ':') { - // emission - const size_t p = feat.rfind(':'); - assert(p != string::npos); - *cond = TD::Convert(feat.substr(p+1)); - *gen = TD::Convert(feat.substr(3, p - 3)); - *feature_type = 1; - return true; - } else if (feat[0] == 'B' && feat.size() > 5 && feat[2] == ':' && feat[1] == 'i') { - // transition - const size_t p = feat.rfind('_'); - assert(p != string::npos); - *gen = TD::Convert(feat.substr(p+1)); - *cond = TD::Convert(feat.substr(3, p - 3)); - *feature_type = 0; - return true; - } else if (feat[0] == 'U' && feat.size() > 4 && feat[1] == 'n' && feat[2] == 'i' && feat[3] == ':') { - // ignore - return false; - } else { - cerr << "Don't know how to deal with feature of type: " << feat << endl; - abort(); - } - } - set<int> fids_; -}; - -void TaggerCountManager::AddCounts(const SparseVector<double>& c) { - for (SparseVector<double>::const_iterator it = c.begin(); it != c.end(); ++it) { - const double& val = it->second; - int ftype; - WordID cond, gen; - if (GetFeature(it->first, &ftype, &cond, &gen)) { - oms_[ftype].Increment(cond, gen, val); - fids_.insert(it->first); - } - } -} - -int main(int argc, char** argv) { -#ifdef HAVE_MPI - MPI::Init(argc, argv); - const int size = MPI::COMM_WORLD.Get_size(); - const int rank = MPI::COMM_WORLD.Get_rank(); -#else - const int size = 1; - const int rank = 0; -#endif - SetSilent(true); // turn off verbose decoder output - register_feature_functions(); - - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - - TaggerCountManager tcm; - - // load cdec.ini and set up decoder - vector<string> cdec_ini; - ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini); - istringstream ini; - StoreConfig(cdec_ini, &ini); - if (rank == 0) cerr << "Loading grammar...\n"; - Decoder* decoder = new Decoder(&ini); - if (decoder->GetConf()["input"].as<string>() != "-") { - cerr << "cdec.ini must not set an input file\n"; -#ifdef 
HAVE_MPI - MPI::COMM_WORLD.Abort(1); -#endif - } - if (rank == 0) cerr << "Done loading grammar!\n"; - Weights w; - if (conf.count("input_weights")) - w.InitFromFile(conf["input_weights"].as<string>()); - - double objective = 0; - bool converged = false; - - vector<double> lambdas; - w.InitVector(&lambdas); - vector<string> corpus; - ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus); - assert(corpus.size() > 0); - - int iteration = 0; - TrainingObserver observer; - while (!converged) { - ++iteration; - observer.Reset(); - if (rank == 0) { - cerr << "Starting decoding... (~" << corpus.size() << " sentences / proc)\n"; - } - decoder->SetWeights(lambdas); - for (int i = 0; i < corpus.size(); ++i) - decoder->Decode(corpus[i], &observer); - - SparseVector<double> x; - observer.SetLocalGradientAndObjective(&x, &objective); - cerr << "COUNTS = " << x << endl; - cerr << " OBJ = " << objective << endl; - tcm.AddCounts(x); - -#if 0 -#ifdef HAVE_MPI - MPI::COMM_WORLD.Reduce(const_cast<double*>(&gradient.data()[0]), &rcv_grad[0], num_feats, MPI::DOUBLE, MPI::SUM, 0); - MPI::COMM_WORLD.Reduce(&objective, &to, 1, MPI::DOUBLE, MPI::SUM, 0); - swap(gradient, rcv_grad); - objective = to; -#endif -#endif - - if (rank == 0) { - SparseVector<double> wsv; - tcm.Optimize(&wsv); - - w.InitFromVector(wsv); - w.InitVector(&lambdas); - - ShowLargestFeatures(lambdas); - - converged = iteration > 100; - if (converged) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; } - - string fname = "weights.cur.gz"; - if (converged) { fname = "weights.final.gz"; } - ostringstream vv; - vv << "Objective = " << objective << " (ITERATION=" << iteration << ")"; - const string svv = vv.str(); - w.WriteToFile(fname, true, &svv); - } // rank == 0 - int cint = converged; -#ifdef HAVE_MPI - MPI::COMM_WORLD.Bcast(const_cast<double*>(&lambdas.data()[0]), num_feats, MPI::DOUBLE, 0); - MPI::COMM_WORLD.Bcast(&cint, 1, MPI::INT, 0); - MPI::COMM_WORLD.Barrier(); -#endif - converged = cint; - } 
-#ifdef HAVE_MPI - MPI::Finalize(); -#endif - return 0; -} diff --git a/training/mr_em_adapted_reduce.cc b/training/mr_em_adapted_reduce.cc deleted file mode 100644 index f65b5440..00000000 --- a/training/mr_em_adapted_reduce.cc +++ /dev/null @@ -1,173 +0,0 @@ -#include <iostream> -#include <vector> -#include <cassert> -#include <cmath> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "filelib.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" -#include "m.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("optimization_method,m", po::value<string>()->default_value("em"), "Optimization method (em, vb)") - ("input_format,f",po::value<string>()->default_value("b64"),"Encoding of the input (b64 or text)"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -double NoZero(const double& x) { - if (x) return x; - return 1e-35; -} - -void Maximize(const bool use_vb, - const double& alpha, - const int total_event_types, - SparseVector<double>* pc) { - const SparseVector<double>& counts = *pc; - - if (use_vb) - assert(total_event_types >= counts.size()); - - double tot = 0; - for (SparseVector<double>::const_iterator it = counts.begin(); - 
it != counts.end(); ++it) - tot += it->second; -// cerr << " = " << tot << endl; - assert(tot > 0.0); - double ltot = log(tot); - if (use_vb) - ltot = Md::digamma(tot + total_event_types * alpha); - for (SparseVector<double>::const_iterator it = counts.begin(); - it != counts.end(); ++it) { - if (use_vb) { - pc->set_value(it->first, NoZero(Md::digamma(it->second + alpha) - ltot)); - } else { - pc->set_value(it->first, NoZero(log(it->second) - ltot)); - } - } -#if 0 - if (counts.size() < 50) { - for (SparseVector<double>::const_iterator it = counts.begin(); - it != counts.end(); ++it) { - cerr << " p(" << FD::Convert(it->first) << ")=" << exp(it->second); - } - cerr << endl; - } -#endif -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - - const bool use_b64 = conf["input_format"].as<string>() == "b64"; - const bool use_vb = conf["optimization_method"].as<string>() == "vb"; - const double alpha = 1e-09; - if (use_vb) - cerr << "Using variational Bayes, make sure alphas are set\n"; - - const string s_obj = "**OBJ**"; - // E-step - string cur_key = ""; - SparseVector<double> acc; - double logprob = 0; - while(cin) { - string line; - getline(cin, line); - if (line.empty()) continue; - int feat; - double val; - size_t i = line.find("\t"); - const string key = line.substr(0, i); - assert(i != string::npos); - ++i; - if (key != cur_key) { - if (cur_key.size() > 0) { - // TODO shouldn't be num_active, should be total number - // of events - Maximize(use_vb, alpha, acc.size(), &acc); - cout << cur_key << '\t'; - if (use_b64) - B64::Encode(0.0, acc, &cout); - else - cout << acc; - cout << endl; - acc.clear(); - } - cur_key = key; - } - if (use_b64) { - SparseVector<double> g; - double obj; - if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) { - cerr << "B64 decoder returned error, skipping!\n"; - continue; - } - logprob += obj; - acc += g; - } else { // text encoding - your counts will not be accurate! 
- while (i < line.size()) { - size_t start = i; - while (line[i] != '=' && i < line.size()) ++i; - if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; } - string fname = line.substr(start, i - start); - if (fname == s_obj) { - feat = -1; - } else { - feat = FD::Convert(line.substr(start, i - start)); - } - ++i; - start = i; - while (line[i] != ';' && i < line.size()) ++i; - if (i - start == 0) continue; - val = atof(line.substr(start, i - start).c_str()); - ++i; - if (feat == -1) { - logprob += val; - } else { - acc.add_value(feat, val); - } - } - } - } - // TODO shouldn't be num_active, should be total number - // of events - Maximize(use_vb, alpha, acc.size(), &acc); - cout << cur_key << '\t'; - if (use_b64) - B64::Encode(0.0, acc, &cout); - else - cout << acc; - cout << endl << flush; - - cerr << "LOGPROB: " << logprob << endl; - - return 0; -} diff --git a/training/mr_em_map_adapter.cc b/training/mr_em_map_adapter.cc deleted file mode 100644 index ead4598d..00000000 --- a/training/mr_em_map_adapter.cc +++ /dev/null @@ -1,160 +0,0 @@ -#include <iostream> -#include <fstream> -#include <cassert> -#include <cmath> - -#include <boost/utility.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> -#include "boost/tuple/tuple.hpp" - -#include "fdict.h" -#include "sparse_vector.h" - -using namespace std; -namespace po = boost::program_options; - -// useful for EM models parameterized by a bunch of multinomials -// this converts event counts (returned from cdec as feature expectations) -// into different keys and values (which are lists of all the events, -// conditioned on the key) for summing and normalization by a reducer - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("buffer_size,b", po::value<int>()->default_value(1), "Buffer size (in # of counts) before emitting counts") - 
("format,f",po::value<string>()->default_value("b64"), "Encoding of the input (b64 or text)"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -struct EventMapper { - int Map(int fid) { - int& cv = map_[fid]; - if (!cv) { - cv = GetConditioningVariable(fid); - } - return cv; - } - void Clear() { map_.clear(); } - protected: - virtual int GetConditioningVariable(int fid) const = 0; - private: - map<int, int> map_; -}; - -struct LexAlignEventMapper : public EventMapper { - protected: - virtual int GetConditioningVariable(int fid) const { - const string& str = FD::Convert(fid); - size_t pos = str.rfind("_"); - if (pos == string::npos || pos == 0 || pos >= str.size() - 1) { - cerr << "Bad feature for EM adapter: " << str << endl; - abort(); - } - return FD::Convert(str.substr(0, pos)); - } -}; - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - - const bool use_b64 = conf["format"].as<string>() == "b64"; - const int buffer_size = conf["buffer_size"].as<int>(); - - const string s_obj = "**OBJ**"; - // 0<TAB>**OBJ**=12.2;Feat1=2.3;Feat2=-0.2; - // 0<TAB>**OBJ**=1.1;Feat1=1.0; - - EventMapper* event_mapper = new LexAlignEventMapper; - map<int, SparseVector<double> > counts; - size_t total = 0; - while(cin) { - string line; - getline(cin, line); - if (line.empty()) continue; - int feat; - double val; - size_t i = line.find("\t"); 
- assert(i != string::npos); - ++i; - SparseVector<double> g; - double obj = 0; - if (use_b64) { - if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) { - cerr << "B64 decoder returned error, skipping!\n"; - continue; - } - } else { // text encoding - your counts will not be accurate! - while (i < line.size()) { - size_t start = i; - while (line[i] != '=' && i < line.size()) ++i; - if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; } - string fname = line.substr(start, i - start); - if (fname == s_obj) { - feat = -1; - } else { - feat = FD::Convert(line.substr(start, i - start)); - } - ++i; - start = i; - while (line[i] != ';' && i < line.size()) ++i; - if (i - start == 0) continue; - val = atof(line.substr(start, i - start).c_str()); - ++i; - if (feat == -1) { - obj = val; - } else { - g.set_value(feat, val); - } - } - } - //cerr << "OBJ: " << obj << endl; - const SparseVector<double>& cg = g; - for (SparseVector<double>::const_iterator it = cg.begin(); it != cg.end(); ++it) { - const int cond_var = event_mapper->Map(it->first); - SparseVector<double>& cond_counts = counts[cond_var]; - int delta = cond_counts.size(); - cond_counts.add_value(it->first, it->second); - delta = cond_counts.size() - delta; - total += delta; - } - if (total > buffer_size) { - for (map<int, SparseVector<double> >::iterator it = counts.begin(); - it != counts.end(); ++it) { - const SparseVector<double>& cc = it->second; - cout << FD::Convert(it->first) << '\t'; - if (use_b64) { - B64::Encode(0.0, cc, &cout); - } else { - abort(); - } - cout << endl; - } - cout << flush; - total = 0; - counts.clear(); - } - } - - return 0; -} - diff --git a/training/mr_optimize_reduce.cc b/training/mr_optimize_reduce.cc deleted file mode 100644 index d490192f..00000000 --- a/training/mr_optimize_reduce.cc +++ /dev/null @@ -1,231 +0,0 @@ -#include <sstream> -#include <iostream> -#include <fstream> -#include <vector> -#include <cassert> -#include <cmath> - -#include <boost/shared_ptr.hpp> -#include 
// Verifies that every weight is a finite number.
// When assertions are enabled, the first NaN or infinite entry aborts the
// program; otherwise the function has no effect.
void SanityCheck(const vector<double>& w) {
  for (vector<double>::const_iterator wi = w.begin(); wi != w.end(); ++wi) {
    const double weight = *wi;
    assert(!std::isnan(weight));
    assert(!std::isinf(weight));
  }
}
In the first iteration, the file should not exist.") - ("input_format,f",po::value<string>()->default_value("b64"),"Encoding of the input (b64 or text)") - ("output_state,S", po::value<string>(), "Output state file (optional override)") - ("correction_buffers,M", po::value<int>()->default_value(10), "Number of gradients for LBFGS to maintain in memory") - ("eta,e", po::value<double>()->default_value(0.1), "Learning rate for SGD (eta)") - ("gaussian_prior,p","Use a Gaussian prior on the weights") - ("means,u", po::value<string>(), "File containing the means for Gaussian prior") - ("sigma_squared", po::value<double>()->default_value(1.0), "Sigma squared term for spherical Gaussian prior"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || !conf->count("input_weights") || !conf->count("state")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - - const bool use_b64 = conf["input_format"].as<string>() == "b64"; - - vector<weight_t> lambdas; - Weights::InitFromFile(conf["input_weights"].as<string>(), &lambdas); - const string s_obj = "**OBJ**"; - int num_feats = FD::NumFeats(); - cerr << "Number of features: " << num_feats << endl; - const bool gaussian_prior = conf.count("gaussian_prior"); - vector<weight_t> means(num_feats, 0); - if (conf.count("means")) { - if (!gaussian_prior) { - cerr << "Don't use --means without 
--gaussian_prior!\n"; - exit(1); - } - Weights::InitFromFile(conf["means"].as<string>(), &means); - } - boost::shared_ptr<BatchOptimizer> o; - const string omethod = conf["optimization_method"].as<string>(); - if (omethod == "rprop") - o.reset(new RPropOptimizer(num_feats)); // TODO add configuration - else - o.reset(new LBFGSOptimizer(num_feats, conf["correction_buffers"].as<int>())); - cerr << "Optimizer: " << o->Name() << endl; - string state_file = conf["state"].as<string>(); - { - ifstream in(state_file.c_str(), ios::binary); - if (in) - o->Load(&in); - else - cerr << "No state file found, assuming ITERATION 1\n"; - } - - double objective = 0; - vector<double> gradient(num_feats, 0); - // 0<TAB>**OBJ**=12.2;Feat1=2.3;Feat2=-0.2; - // 0<TAB>**OBJ**=1.1;Feat1=1.0; - int total_lines = 0; // TODO - this should be a count of the - // training instances!! - while(cin) { - string line; - getline(cin, line); - if (line.empty()) continue; - ++total_lines; - int feat; - double val; - size_t i = line.find("\t"); - assert(i != string::npos); - ++i; - if (use_b64) { - SparseVector<double> g; - double obj; - if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) { - cerr << "B64 decoder returned error, skipping gradient!\n"; - cerr << " START: " << line.substr(0,line.size() > 200 ? 200 : line.size()) << endl; - if (line.size() > 200) - cerr << " END: " << line.substr(line.size() - 200, 200) << endl; - cout << "-1\tRESTART\n"; - exit(99); - } - objective += obj; - const SparseVector<double>& cg = g; - for (SparseVector<double>::const_iterator it = cg.begin(); it != cg.end(); ++it) { - if (it->first >= num_feats) { - cerr << "Unexpected feature in gradient: " << FD::Convert(it->first) << endl; - abort(); - } - gradient[it->first] -= it->second; - } - } else { // text encoding - your gradients will not be accurate! 
- while (i < line.size()) { - size_t start = i; - while (line[i] != '=' && i < line.size()) ++i; - if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; } - string fname = line.substr(start, i - start); - if (fname == s_obj) { - feat = -1; - } else { - feat = FD::Convert(line.substr(start, i - start)); - if (feat >= num_feats) { - cerr << "Unexpected feature in gradient: " << line.substr(start, i - start) << endl; - abort(); - } - } - ++i; - start = i; - while (line[i] != ';' && i < line.size()) ++i; - if (i - start == 0) continue; - val = atof(line.substr(start, i - start).c_str()); - ++i; - if (feat == -1) { - objective += val; - } else { - gradient[feat] -= val; - } - } - } - } - - if (gaussian_prior) { - const double sigsq = conf["sigma_squared"].as<double>(); - double norm = 0; - for (int k = 1; k < lambdas.size(); ++k) { - const double& lambda_k = lambdas[k]; - if (lambda_k) { - const double param = (lambda_k - means[k]); - norm += param * param; - gradient[k] += param / sigsq; - } - } - const double reg = norm / (2.0 * sigsq); - cerr << "REGULARIZATION TERM: " << reg << endl; - objective += reg; - } - cerr << "EVALUATION #" << o->EvaluationCount() << " OBJECTIVE: " << objective << endl; - double gnorm = 0; - for (int i = 0; i < gradient.size(); ++i) - gnorm += gradient[i] * gradient[i]; - cerr << " GNORM=" << sqrt(gnorm) << endl; - vector<double> old = lambdas; - int c = 0; - while (old == lambdas) { - ++c; - if (c > 1) { cerr << "Same lambdas, repeating optimization\n"; } - o->Optimize(objective, gradient, &lambdas); - assert(c < 5); - } - old.clear(); - SanityCheck(lambdas); - ShowLargestFeatures(lambdas); - Weights::WriteToFile(conf["output_weights"].as<string>(), lambdas, false); - - const bool conv = o->HasConverged(); - if (conv) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; } - - if (conf.count("output_state")) - state_file = conf["output_state"].as<string>(); - ofstream out(state_file.c_str(), ios::binary); - cerr << "Writing state to: " << 
state_file << endl; - o->Save(&out); - out.close(); - - cout << o->EvaluationCount() << "\t" << conv << endl; - return 0; -} diff --git a/training/mr_reduce_to_weights.cc b/training/mr_reduce_to_weights.cc deleted file mode 100644 index 16b47720..00000000 --- a/training/mr_reduce_to_weights.cc +++ /dev/null @@ -1,109 +0,0 @@ -#include <iostream> -#include <fstream> -#include <vector> -#include <cassert> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "filelib.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("input_format,f",po::value<string>()->default_value("b64"),"Encoding of the input (b64 or text)") - ("input,i",po::value<string>()->default_value("-"),"Read file from") - ("output,o",po::value<string>()->default_value("-"),"Write weights to"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -void WriteWeights(const SparseVector<double>& weights, ostream* out) { - for (SparseVector<double>::const_iterator it = weights.begin(); - it != weights.end(); ++it) { - (*out) << FD::Convert(it->first) << " " << it->second << endl; - } -} - -int main(int argc, char** argv) { - 
# PRO tuning tools: mr_pro_map samples training pairs, mr_pro_reduce fits the
# ranking classifier on them.
bin_PROGRAMS = \
  mr_pro_map \
  mr_pro_reduce

# The static libraries below are build products, so they are located through
# $(top_builddir). $(top_srcdir) only happens to point at the same place for
# in-tree builds and breaks out-of-tree (VPATH) builds.
mr_pro_map_SOURCES = mr_pro_map.cc
mr_pro_map_LDADD = $(top_builddir)/training/utils/libtraining_utils.a $(top_builddir)/decoder/libcdec.a $(top_builddir)/mteval/libmteval.a $(top_builddir)/utils/libutils.a

mr_pro_reduce_SOURCES = mr_pro_reduce.cc
mr_pro_reduce_LDADD = $(top_builddir)/training/liblbfgs/liblbfgs.a $(top_builddir)/utils/libutils.a
#!/usr/bin/perl -w
# Prints one "<path> <id>" line for every *.gz file found in HG_DIR, where
# <id> is the file name with its trailing ".gz" or ".json.gz" removed.
use strict;

die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1;
my $d = shift @ARGV;
die "Can't find directory $d" unless -d $d;

# Lexical directory handle: scoped to this script, no global bareword DIR.
opendir(my $dh, $d) or die "Can't read $d: $!";
my @hgs = grep { /\.gz$/ } readdir($dh);
closedir $dh;

for my $hg (@hgs) {
  # Anchor at the end of the name so only the real extension is stripped;
  # unanchored, the pattern removed the first ".gz" occurring anywhere.
  (my $id = $hg) =~ s/(?:\.json)?\.gz$//;
  print "$d/$hg $id\n";
}
// Step function over a magnitude: yields 0.0 for values strictly below
// `threshold` and 1.0 otherwise. The threshold defaults to 0.05.
struct ThresholdAlpha {
  const double threshold;

  explicit ThresholdAlpha(double t = 0.05) : threshold(t) {}

  double operator()(double mag) const {
    return (mag < threshold) ? 0.0 : 1.0;
  }
};
TrainingInstance& d) { + return os << d.gdiff << " y=" << d.y << "\tA:" << TD::GetString(d.a) << "\n\tB: " << TD::GetString(d.b) << "\n\tX: " << d.x; +} +#endif + +struct DiffOrder { + bool operator()(const TrainingInstance& a, const TrainingInstance& b) const { + return a.gdiff > b.gdiff; + } +}; + +void Sample(const unsigned gamma, + const unsigned xi, + const training::CandidateSet& J_i, + const EvaluationMetric* metric, + vector<TrainingInstance>* pv) { + const bool invert_score = metric->IsErrorMetric(); + vector<TrainingInstance> v1, v2; + float avg_diff = 0; + for (unsigned i = 0; i < gamma; ++i) { + const size_t a = rng->inclusive(0, J_i.size() - 1)(); + const size_t b = rng->inclusive(0, J_i.size() - 1)(); + if (a == b) continue; + float ga = metric->ComputeScore(J_i[a].eval_feats); + float gb = metric->ComputeScore(J_i[b].eval_feats); + bool positive = gb < ga; + if (invert_score) positive = !positive; + const float gdiff = fabs(ga - gb); + if (!gdiff) continue; + avg_diff += gdiff; + SparseVector<weight_t> xdiff = (J_i[a].fmap - J_i[b].fmap).erase_zeros(); + if (xdiff.empty()) { + cerr << "Empty diff:\n " << TD::GetString(J_i[a].ewords) << endl << "x=" << J_i[a].fmap << endl; + cerr << " " << TD::GetString(J_i[b].ewords) << endl << "x=" << J_i[b].fmap << endl; + continue; + } + v1.push_back(TrainingInstance(xdiff, positive, gdiff)); +#ifdef DEBUGGING_PRO + v1.back().a = J_i[a].hyp; + v1.back().b = J_i[b].hyp; + cerr << "N: " << v1.back() << endl; +#endif + } + avg_diff /= v1.size(); + + for (unsigned i = 0; i < v1.size(); ++i) { + double p = 1.0 / (1.0 + exp(-avg_diff - v1[i].gdiff)); + // cerr << "avg_diff=" << avg_diff << " gdiff=" << v1[i].gdiff << " p=" << p << endl; + if (rng->next() < p) v2.push_back(v1[i]); + } + vector<TrainingInstance>::iterator mid = v2.begin() + xi; + if (xi > v2.size()) mid = v2.end(); + partial_sort(v2.begin(), mid, v2.end(), DiffOrder()); + copy(v2.begin(), mid, back_inserter(*pv)); +#ifdef DEBUGGING_PRO + if (v2.size() >= 
5) { + for (int i =0; i < (mid - v2.begin()); ++i) { + cerr << v2[i] << endl; + } + cerr << pv->back() << endl; + } +#endif +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + if (conf.count("random_seed")) + rng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); + else + rng.reset(new MT19937); + const string evaluation_metric = conf["evaluation_metric"].as<string>(); + + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + DocumentScorer ds(metric, conf["reference"].as<vector<string> >()); + cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; + + Hypergraph hg; + string last_file; + ReadFile in_read(conf["input"].as<string>()); + istream &in=*in_read.stream(); + const unsigned kbest_size = conf["kbest_size"].as<unsigned>(); + const unsigned gamma = conf["candidate_pairs"].as<unsigned>(); + const unsigned xi = conf["best_pairs"].as<unsigned>(); + string weightsf = conf["weights"].as<string>(); + vector<weight_t> weights; + Weights::InitFromFile(weightsf, &weights); + string kbest_repo = conf["kbest_repository"].as<string>(); + MkDirP(kbest_repo); + while(in) { + vector<TrainingInstance> v; + string line; + getline(in, line); + if (line.empty()) continue; + istringstream is(line); + int sent_id; + string file; + // path-to-file (JSON) sent_id + is >> file >> sent_id; + ReadFile rf(file); + ostringstream os; + training::CandidateSet J_i; + os << kbest_repo << "/kbest." 
<< sent_id << ".txt.gz"; + const string kbest_file = os.str(); + if (FileExists(kbest_file)) + J_i.ReadFromFile(kbest_file); + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + hg.Reweight(weights); + J_i.AddKBestCandidates(hg, kbest_size, ds[sent_id]); + J_i.WriteToFile(kbest_file); + + Sample(gamma, xi, J_i, metric, &v); + for (unsigned i = 0; i < v.size(); ++i) { + const TrainingInstance& vi = v[i]; + cout << vi.y << "\t" << vi.x << endl; + cout << (!vi.y) << "\t" << (vi.x * -1.0) << endl; + } + } + return 0; +} + diff --git a/training/pro/mr_pro_reduce.cc b/training/pro/mr_pro_reduce.cc new file mode 100644 index 00000000..5ef9b470 --- /dev/null +++ b/training/pro/mr_pro_reduce.cc @@ -0,0 +1,286 @@ +#include <cstdlib> +#include <sstream> +#include <iostream> +#include <fstream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "filelib.h" +#include "weights.h" +#include "sparse_vector.h" +#include "optimize.h" +#include "liblbfgs/lbfgs++.h" + +using namespace std; +namespace po = boost::program_options; + +// since this is a ranking model, there should be equal numbers of +// positive and negative examples, so the bias should be 0 +static const double MAX_BIAS = 1e-10; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation") + ("regularization_strength,C",po::value<double>()->default_value(500.0), "l2 regularization strength") + ("l1",po::value<double>()->default_value(0.0), "l1 regularization strength") + ("regularize_to_weights,y",po::value<double>()->default_value(5000.0), "Differences in learned weights to previous weights are penalized with an l2 penalty with this strength; 0.0 = no effect") + ("memory_buffers,m",po::value<unsigned>()->default_value(100), "Number of memory buffers 
(LBFGS)") + ("min_reg,r",po::value<double>()->default_value(0.01), "When tuning (-T) regularization strength, minimum regularization strenght") + ("max_reg,R",po::value<double>()->default_value(1e6), "When tuning (-T) regularization strength, maximum regularization strenght") + ("testset,t",po::value<string>(), "Optional held-out test set") + ("tune_regularizer,T", "Use the held out test set (-t) to tune the regularization strength") + ("interpolate_with_weights,p",po::value<double>()->default_value(1.0), "[deprecated] Output weights are p*w + (1-p)*w_prev; 1.0 = no effect") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +void ParseSparseVector(string& line, size_t cur, SparseVector<weight_t>* out) { + SparseVector<weight_t>& x = *out; + size_t last_start = cur; + size_t last_comma = string::npos; + while(cur <= line.size()) { + if (line[cur] == ' ' || cur == line.size()) { + if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) { + cerr << "[ERROR] " << line << endl << " position = " << cur << endl; + exit(1); + } + const int fid = FD::Convert(line.substr(last_start, last_comma - last_start)); + if (cur < line.size()) line[cur] = 0; + const weight_t val = strtod(&line[last_comma + 1], NULL); + x.set_value(fid, val); + + last_comma = string::npos; + last_start = cur+1; + } else { + if (line[cur] == '=') + last_comma = cur; + } + ++cur; + } +} + +void ReadCorpus(istream* pin, vector<pair<bool, SparseVector<weight_t> > >* corpus) { + istream& in = *pin; + corpus->clear(); + bool flag = false; + int lc = 0; + string line; + SparseVector<weight_t> x; + while(getline(in, line)) { + ++lc; + if (lc % 1000 == 0) { cerr << '.'; flag = true; } + if (lc % 40000 == 0) { cerr << " [" << lc << "]\n"; flag = false; } + if (line.empty()) continue; + const 
size_t ks = line.find("\t"); + assert(string::npos != ks); + assert(ks == 1); + const bool y = line[0] == '1'; + x.clear(); + ParseSparseVector(line, ks + 1, &x); + corpus->push_back(make_pair(y, x)); + } + if (flag) cerr << endl; +} + +void GradAdd(const SparseVector<weight_t>& v, const double scale, weight_t* acc) { + for (SparseVector<weight_t>::const_iterator it = v.begin(); + it != v.end(); ++it) { + acc[it->first] += it->second * scale; + } +} + +double ApplyRegularizationTerms(const double C, + const double T, + const vector<weight_t>& weights, + const vector<weight_t>& prev_weights, + weight_t* g) { + double reg = 0; + for (size_t i = 0; i < weights.size(); ++i) { + const double prev_w_i = (i < prev_weights.size() ? prev_weights[i] : 0.0); + const double& w_i = weights[i]; + reg += C * w_i * w_i; + g[i] += 2 * C * w_i; + + const double diff_i = w_i - prev_w_i; + reg += T * diff_i * diff_i; + g[i] += 2 * T * diff_i; + } + return reg; +} + +double TrainingInference(const vector<weight_t>& x, + const vector<pair<bool, SparseVector<weight_t> > >& corpus, + weight_t* g = NULL) { + double cll = 0; + for (int i = 0; i < corpus.size(); ++i) { + const double dotprod = corpus[i].second.dot(x) + (x.size() ? 
x[0] : weight_t()); // x[0] is bias + double lp_false = dotprod; + double lp_true = -dotprod; + if (0 < lp_true) { + lp_true += log1p(exp(-lp_true)); + lp_false = log1p(exp(lp_false)); + } else { + lp_true = log1p(exp(lp_true)); + lp_false += log1p(exp(-lp_false)); + } + lp_true*=-1; + lp_false*=-1; + if (corpus[i].first) { // true label + cll -= lp_true; + if (g) { + // g -= corpus[i].second * exp(lp_false); + GradAdd(corpus[i].second, -exp(lp_false), g); + g[0] -= exp(lp_false); // bias + } + } else { // false label + cll -= lp_false; + if (g) { + // g += corpus[i].second * exp(lp_true); + GradAdd(corpus[i].second, exp(lp_true), g); + g[0] += exp(lp_true); // bias + } + } + } + return cll; +} + +struct ProLoss { + ProLoss(const vector<pair<bool, SparseVector<weight_t> > >& tr, + const vector<pair<bool, SparseVector<weight_t> > >& te, + const double c, + const double t, + const vector<weight_t>& px) : training(tr), testing(te), C(c), T(t), prev_x(px){} + double operator()(const vector<double>& x, double* g) const { + fill(g, g + x.size(), 0.0); + double cll = TrainingInference(x, training, g); + tppl = 0; + if (testing.size()) + tppl = pow(2.0, TrainingInference(x, testing, g) / (log(2) * testing.size())); + double ppl = cll / log(2); + ppl /= training.size(); + ppl = pow(2.0, ppl); + double reg = ApplyRegularizationTerms(C, T, x, prev_x, g); + return cll + reg; + } + const vector<pair<bool, SparseVector<weight_t> > >& training, testing; + const double C, T; + const vector<double>& prev_x; + mutable double tppl; +}; + +// return held-out log likelihood +double LearnParameters(const vector<pair<bool, SparseVector<weight_t> > >& training, + const vector<pair<bool, SparseVector<weight_t> > >& testing, + const double C, + const double C1, + const double T, + const unsigned memory_buffers, + const vector<weight_t>& prev_x, + vector<weight_t>* px) { + assert(px->size() == prev_x.size()); + ProLoss loss(training, testing, C, T, prev_x); + LBFGS<ProLoss> lbfgs(px, loss, 
memory_buffers, C1); + lbfgs.MinimizeFunction(); + return loss.tppl; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + string line; + vector<pair<bool, SparseVector<weight_t> > > training, testing; + const bool tune_regularizer = conf.count("tune_regularizer"); + if (tune_regularizer && !conf.count("testset")) { + cerr << "--tune_regularizer requires --testset to be set\n"; + return 1; + } + const double min_reg = conf["min_reg"].as<double>(); + const double max_reg = conf["max_reg"].as<double>(); + double C = conf["regularization_strength"].as<double>(); // will be overridden if parameter is tuned + double C1 = conf["l1"].as<double>(); // will be overridden if parameter is tuned + const double T = conf["regularize_to_weights"].as<double>(); + assert(C >= 0.0); + assert(min_reg >= 0.0); + assert(max_reg >= 0.0); + assert(max_reg > min_reg); + const double psi = conf["interpolate_with_weights"].as<double>(); + if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; return 1; } + ReadCorpus(&cin, &training); + if (conf.count("testset")) { + ReadFile rf(conf["testset"].as<string>()); + ReadCorpus(rf.stream(), &testing); + } + cerr << "Number of features: " << FD::NumFeats() << endl; + + vector<weight_t> x, prev_x; // x[0] is bias + if (conf.count("weights")) { + Weights::InitFromFile(conf["weights"].as<string>(), &x); + x.resize(FD::NumFeats()); + prev_x = x; + } else { + x.resize(FD::NumFeats()); + prev_x = x; + } + cerr << " Number of features: " << x.size() << endl; + cerr << "Number of training examples: " << training.size() << endl; + cerr << "Number of testing examples: " << testing.size() << endl; + double tppl = 0.0; + vector<pair<double,double> > sp; + vector<double> smoothed; + if (tune_regularizer) { + C = min_reg; + const double steps = 18; + double sweep_factor = exp((log(max_reg) - log(min_reg)) / steps); + cerr << "SWEEP FACTOR: " << sweep_factor << endl; + while(C < 
max_reg) { + cerr << "C=" << C << "\tT=" <<T << endl; + tppl = LearnParameters(training, testing, C, C1, T, conf["memory_buffers"].as<unsigned>(), prev_x, &x); + sp.push_back(make_pair(C, tppl)); + C *= sweep_factor; + } + smoothed.resize(sp.size(), 0); + smoothed[0] = sp[0].second; + smoothed.back() = sp.back().second; + for (int i = 1; i < sp.size()-1; ++i) { + double prev = sp[i-1].second; + double next = sp[i+1].second; + double cur = sp[i].second; + smoothed[i] = (prev*0.2) + cur * 0.6 + (0.2*next); + } + double best_ppl = 9999999; + unsigned best_i = 0; + for (unsigned i = 0; i < sp.size(); ++i) { + if (smoothed[i] < best_ppl) { + best_ppl = smoothed[i]; + best_i = i; + } + } + C = sp[best_i].first; + } // tune regularizer + tppl = LearnParameters(training, testing, C, C1, T, conf["memory_buffers"].as<unsigned>(), prev_x, &x); + if (conf.count("weights")) { + for (int i = 1; i < x.size(); ++i) { + x[i] = (x[i] * psi) + prev_x[i] * (1.0 - psi); + } + } + cout.precision(15); + cout << "# C=" << C << "\theld out perplexity="; + if (tppl) { cout << tppl << endl; } else { cout << "N/A\n"; } + if (sp.size()) { + cout << "# Parameter sweep:\n"; + for (int i = 0; i < sp.size(); ++i) { + cout << "# " << sp[i].first << "\t" << sp[i].second << "\t" << smoothed[i] << endl; + } + } + Weights::WriteToFile("-", x); + return 0; +} diff --git a/training/pro/pro.pl b/training/pro/pro.pl new file mode 100755 index 00000000..3b30c379 --- /dev/null +++ b/training/pro/pro.pl @@ -0,0 +1,555 @@ +#!/usr/bin/env perl +use strict; +use File::Basename qw(basename); +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use IPC::Open2; +use POSIX ":sys_wait_h"; +my $QSUB_CMD = 
qsub_args(mert_memory()); +my $default_jobs = env_default_jobs(); + +my $UTILS_DIR="$SCRIPT_DIR/../utils"; +require "$UTILS_DIR/libcall.pl"; + +# Default settings +my $srcFile; +my $refFiles; +my $bin_dir = $SCRIPT_DIR; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $MAPINPUT = "$bin_dir/mr_pro_generate_mapper_input.pl"; +my $MAPPER = "$bin_dir/mr_pro_map"; +my $REDUCER = "$bin_dir/mr_pro_reduce"; +my $parallelize = "$UTILS_DIR/parallelize.pl"; +my $libcall = "$UTILS_DIR/libcall.pl"; +my $sentserver = "$UTILS_DIR/sentserver"; +my $sentclient = "$UTILS_DIR/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm"; + +my $SCORER = $FAST_SCORE; +die "Can't find $MAPPER" unless -x $MAPPER; +my $cdec = "$bin_dir/../../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $lines_per_mapper = 30; +my $iteration = 1; +my $best_weights; +my $psi = 1; +my $default_max_iter = 30; +my $max_iterations = $default_max_iter; +my $jobs = $default_jobs; # number of decode nodes +my $pmem = "4g"; +my $disable_clean = 0; +my %seen_weights; +my $help = 0; +my $epsilon = 0.0001; +my $dryrun = 0; +my $last_score = -10000000; +my $metric = "ibm_bleu"; +my $dir; +my $iniFile; +my $weights; +my $use_make = 1; # use make to parallelize +my $useqsub = 0; +my $initial_weights; +my $pass_suffix = ''; +my $devset; + +# regularization strength +my $reg = 500; +my $reg_previous = 5000; + +# Process command-line options +if (GetOptions( + "config=s" => \$iniFile, + "weights=s" => \$initial_weights, + "devset=s" => \$devset, + "jobs=i" => \$jobs, + "metric=s" => \$metric, + "pass-suffix=s" => \$pass_suffix, + "qsub" => \$useqsub, + "help" => \$help, + "reg=f" => \$reg, + "reg-previous=f" => \$reg_previous, 
+ "output-dir=s" => \$dir, +) == 0 || @ARGV!=0 || $help) { + print_help(); + exit; +} + +if ($useqsub) { + $use_make = 0; + die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); +} + +my @missing_args = (); +if (!defined $iniFile) { push @missing_args, "--config"; } +if (!defined $devset) { push @missing_args, "--devset"; } +if (!defined $initial_weights) { push @missing_args, "--weights"; } +die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args); + +if ($metric =~ /^(combi|ter)$/i) { + $lines_per_mapper = 5; +} + +my $host =check_output("hostname"); chomp $host; +my $bleu; +my $interval_count = 0; +my $logfile; +my $projected_score; + +# used in sorting scores +my $DIR_FLAG = '-r'; +if ($metric =~ /^ter$|^aer$/i) { + $DIR_FLAG = ''; +} + +unless ($dir){ + $dir = 'pro'; +} +unless ($dir =~ /^\//){ # convert relative path to absolute path + my $basedir = check_output("pwd"); + chomp $basedir; + $dir = "$basedir/$dir"; +} + +# Initializations and helper functions +srand; + +my @childpids = (); +my @cleanupcmds = (); + +sub cleanup { + print STDERR "Cleanup...\n"; + for my $pid (@childpids){ unchecked_call("kill $pid"); } + for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } + exit 1; +}; +# Always call cleanup, no matter how we exit +*CORE::GLOBAL::exit = + sub{ cleanup(); }; +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + +my $decoderBase = check_output("basename $decoder"); chomp $decoderBase; +my $newIniFile = "$dir/$decoderBase.ini"; +my $inputFileName = "$dir/input"; +my $user = $ENV{"USER"}; + + +# process ini file +-e $iniFile || die "Error: could not open $iniFile for reading\n"; +open(INI, $iniFile); + +if (-e $dir) { + die "ERROR: working dir $dir already exists\n\n"; +} else { + mkdir "$dir" or die "Can't mkdir $dir: $!"; + mkdir "$dir/hgs" or die; + mkdir "$dir/scripts" or die; + print STDERR <<EOT; + DECODER: $decoder + 
INI FILE: $iniFile + WORKING DIR: $dir + DEVSET: $devset + EVAL METRIC: $metric + MAX ITERATIONS: $max_iterations + PARALLEL JOBS: $jobs + HEAD NODE: $host + PMEM (DECODING): $pmem + INITIAL WEIGHTS: $initial_weights +EOT +} + +# Generate initial files and values +check_call("cp $iniFile $newIniFile"); +check_call("cp $initial_weights $dir/weights.0"); +$iniFile = $newIniFile; + +my $refs = "$dir/dev.refs"; +split_devset($devset, "$dir/dev.input.raw", $refs); +my $newsrc = "$dir/dev.input"; +enseg("$dir/dev.input.raw", $newsrc); +$srcFile = $newsrc; +my $devSize = 0; +open F, "<$srcFile" or die "Can't read $srcFile: $!"; +while(<F>) { $devSize++; } +close F; + +unless($best_weights){ $best_weights = $weights; } +unless($projected_score){ $projected_score = 0.0; } +$seen_weights{$weights} = 1; + +my $random_seed = int(time / 1000); +my $lastWeightsFile; +my $lastPScore = 0; +# main optimization loop +my @allweights; +while (1){ + print STDERR "\n\nITERATION $iteration\n==========\n"; + + if ($iteration > $max_iterations){ + print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n"; + last; + } + # iteration-specific files + my $runFile="$dir/run.raw.$iteration"; + my $onebestFile="$dir/1best.$iteration"; + my $logdir="$dir/logs.$iteration"; + my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; + my $scorerLog="$logdir/scorer.log.$iteration"; + check_call("mkdir -p $logdir"); + + + #decode + print STDERR "RUNNING DECODER AT "; + print STDERR unchecked_output("date"); + my $im1 = $iteration - 1; + my $weightsFile="$dir/weights.$im1"; + push @allweights, "-w $dir/weights.$im1"; + `rm -f $dir/hgs/*.gz`; + my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; + my $pcmd; + if ($use_make) { + $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --"; + } else { + $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --"; + } + my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; + 
print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + my $num_hgs; + my $num_topbest; + my $retries = 0; + while($retries < 5) { + $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l"); + $num_topbest = check_output("wc -l < $runFile"); + print STDERR "NUMBER OF HGs: $num_hgs\n"; + print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; + if($devSize == $num_hgs && $devSize == $num_topbest) { + last; + } else { + print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n"; + sleep(3); + } + $retries++; + } + die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); + my $dec_score = check_output("cat $runFile | $SCORER -r $refs -m $metric"); + chomp $dec_score; + print STDERR "DECODER SCORE: $dec_score\n"; + + # save space + check_call("gzip -f $runFile"); + check_call("gzip -f $decoderLog"); + + # run optimizer + print STDERR "RUNNING OPTIMIZER AT "; + print STDERR unchecked_output("date"); + print STDERR " - GENERATE TRAINING EXEMPLARS\n"; + my $mergeLog="$logdir/prune-merge.log.$iteration"; + + my $score = 0; + my $icc = 0; + my $inweights="$dir/weights.$im1"; + $cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1"; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + check_call("mkdir -p $dir/splag.$im1"); + $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1 $dir/splag.$im1/mapinput."; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!"; + my @shards = grep { /^mapinput\./ } readdir(DIR); + closedir DIR; + die "No shards!" 
unless scalar @shards > 0; + my $joblist = ""; + my $nmappers = 0; + @cleanupcmds = (); + my %o2i = (); + my $first_shard = 1; + my $mkfile; # only used with makefiles + my $mkfilename; + if ($use_make) { + $mkfilename = "$dir/splag.$im1/domap.mk"; + open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!"; + print $mkfile "all: $dir/splag.$im1/map.done\n\n"; + } + my @mkouts = (); # only used with makefiles + my @mapoutputs = (); + for my $shard (@shards) { + my $mapoutput = $shard; + my $client_name = $shard; + $client_name =~ s/mapinput.//; + $client_name = "pro.$client_name"; + $mapoutput =~ s/mapinput/mapoutput/; + push @mapoutputs, "$dir/splag.$im1/$mapoutput"; + $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; + my $script = "$MAPPER -s $srcFile -m $metric -r $refs -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput"; + if ($use_make) { + my $script_file = "$dir/scripts/map.$shard"; + open F, ">$script_file" or die "Can't write $script_file: $!"; + print F "#!/bin/bash\n"; + print F "$script\n"; + close F; + my $output = "$dir/splag.$im1/$mapoutput"; + push @mkouts, $output; + chmod(0755, $script_file) or die "Can't chmod $script_file: $!"; + if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } + print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n"; + } else { + my $script_file = "$dir/scripts/map.$shard"; + open F, ">$script_file" or die "Can't write $script_file: $!"; + print F "$script\n"; + close F; + if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } + + $nmappers++; + my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; + my $jobid = check_output("$qcmd"); + chomp $jobid; + $jobid =~ s/^(\d+)(.*?)$/\1/g; + $jobid =~ s/^Your job (\d+) .*$/\1/; + push(@cleanupcmds, "qdel $jobid 2> /dev/null"); + print STDERR " $jobid"; + if ($joblist == "") { $joblist = $jobid; } + else {$joblist = $joblist . "\|" . 
$jobid; } + } + } + my @dev_outs = (); + my @devtest_outs = (); + @dev_outs = @mapoutputs; + if ($use_make) { + print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n"; + close $mkfile; + my $mcmd = "make -j $jobs -f $mkfilename"; + print STDERR "\nExecuting: $mcmd\n"; + check_call($mcmd); + } else { + print STDERR "\nLaunched $nmappers mappers.\n"; + sleep 8; + print STDERR "Waiting for mappers to complete...\n"; + while ($nmappers > 0) { + sleep 5; + my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '"))); + $nmappers = scalar @livejobs; + } + print STDERR "All mappers complete.\n"; + } + my $tol = 0; + my $til = 0; + my $dev_test_file = "$dir/splag.$im1/devtest.gz"; + print STDERR "\nRUNNING CLASSIFIER (REDUCER)\n"; + print STDERR unchecked_output("date"); + $cmd="cat @dev_outs | $REDUCER -w $dir/weights.$im1 -C $reg -y $reg_previous --interpolate_with_weights $psi"; + $cmd .= " > $dir/weights.$iteration"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + $lastWeightsFile = "$dir/weights.$iteration"; + $lastPScore = $score; + $iteration++; + print STDERR "\n==========\n"; +} + + +check_call("cp $lastWeightsFile $dir/weights.final"); +print STDERR "\nFINAL WEIGHTS: $dir/weights.final\n(Use -w <this file> with the decoder)\n\n"; +print STDOUT "$dir/weights.final\n"; + +exit 0; + +sub read_weights_file { + my ($file) = @_; + open F, "<$file" or die "Couldn't read $file: $!"; + my @r = (); + my $pm = -1; + while(<F>) { + next if /^#/; + next if /^\s*$/; + chomp; + if (/^(.+)\s+(.+)$/) { + my $m = $1; + my $w = $2; + die "Weights out of order: $m <= $pm" unless $m > $pm; + push @r, $w; + } else { + warn "Unexpected feature name in weight file: $_"; + } + } + close F; + return join ' ', @r; +} + +sub enseg { + my $src = shift; + my $newsrc = shift; + open(SRC, $src); + open(NEWSRC, ">$newsrc"); + my $i=0; + while (my $line=<SRC>){ + chomp $line; + if ($line =~ /^\s*<seg/i) { + if($line =~ 
/id="[0-9]+"/) { + print NEWSRC "$line\n"; + } else { + die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute"; + } + } else { + print NEWSRC "<seg id=\"$i\">$line</seg>\n"; + } + $i++; + } + close SRC; + close NEWSRC; + die "Empty dev set!" if ($i == 0); +} + +sub print_help { + + my $executable = basename($0); chomp $executable; + print << "Help"; + +Usage: $executable [options] + + $executable [options] + Runs a complete PRO optimization using the ini file specified. + +Required: + + --config <cdec.ini> + Decoder configuration file. + + --devset <files> + Dev set source and reference data. + + --weights <file> + Initial weights file (use empty file to start from 0) + +General options: + + --help + Print this message and exit. + + --max-iterations <M> + Maximum number of iterations to run. If not specified, defaults + to $default_max_iter. + + --metric <method> + Metric to optimize. + Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + + --pass-suffix <S> + If the decoder is doing multi-pass decoding, the pass suffix "2", + "3", etc., is used to control what iteration of weights is set. + + --workdir <dir> + Directory for intermediate and output files. If not specified, the + name is derived from the ini filename. Assuming that the ini + filename begins with the decoder name and ends with ini, the default + name of the working directory is inferred from the middle part of + the filename. E.g. an ini file named decoder.foo.ini would have + a default working directory name foo. + +Regularization options: + + --reg <F> + l2 regularization strength [default=500]. The greater this value, + the closer to zero the weights will be. + + --reg-previous <F> + l2 penalty for moving away from the weights from the previous + iteration. [default=5000]. The greater this value, the closer + to the previous iteration's weights the next iteration's weights + will be. 
+ +Job control options: + + --jobs <I> + Number of decoder processes to run in parallel. [default=$default_jobs] + + --qsub + Use qsub to run jobs in parallel (qsub must be configured in + environment/LocalEnvironment.pm) + + --pmem <N> + Amount of physical memory requested for parallel decoding jobs + (used with qsub requests only) + +Deprecated options: + + --interpolate-with-weights <F> + [deprecated] At each iteration the resulting weights are + interpolated with the weights from the previous iteration, with + this factor. [default=1.0, i.e., no effect] + +Help +} + +sub convert { + my ($str) = @_; + my @ps = split /;/, $str; + my %dict = (); + for my $p (@ps) { + my ($k, $v) = split /=/, $p; + $dict{$k} = $v; + } + return %dict; +} + + +sub cmdline { + return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? +my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} + +sub escaped_shell_args { + return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { + return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { + return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} + +sub split_devset { + my ($infile, $outsrc, $outref) = @_; + open F, "<$infile" or die "Can't read $infile: $!"; + open S, ">$outsrc" or die "Can't write $outsrc: $!"; + open R, ">$outref" or die "Can't write $outref: $!"; + while(<F>) { + chomp; + my ($src, @refs) = split /\s*\|\|\|\s*/; + die "Malformed devset line: $_\n" unless scalar @refs > 0; + print S "$src\n"; + print R join(' ||| ', @refs) . 
"\n"; + } + close R; + close S; + close F; +} + diff --git a/training/rampion/Makefile.am b/training/rampion/Makefile.am new file mode 100644 index 00000000..ef0ca147 --- /dev/null +++ b/training/rampion/Makefile.am @@ -0,0 +1,6 @@ +bin_PROGRAMS = rampion_cccp + +rampion_cccp_SOURCES = rampion_cccp.cc +rampion_cccp_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a + +AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training/utils diff --git a/training/rampion/rampion.pl b/training/rampion/rampion.pl new file mode 100755 index 00000000..ae084db6 --- /dev/null +++ b/training/rampion/rampion.pl @@ -0,0 +1,540 @@ +#!/usr/bin/env perl +use strict; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use IPC::Open2; +use POSIX ":sys_wait_h"; +my $QSUB_CMD = qsub_args(mert_memory()); +my $default_jobs = env_default_jobs(); + +my $UTILS_DIR="$SCRIPT_DIR/../utils"; +require "$UTILS_DIR/libcall.pl"; + +# Default settings +my $srcFile; +my $refFiles; +my $bin_dir = $SCRIPT_DIR; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $MAPINPUT = "$bin_dir/rampion_generate_input.pl"; +my $MAPPER = "$bin_dir/rampion_cccp"; +my $parallelize = "$UTILS_DIR/parallelize.pl"; +my $libcall = "$UTILS_DIR/libcall.pl"; +my $sentserver = "$UTILS_DIR/sentserver"; +my $sentclient = "$UTILS_DIR/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm"; + +my $SCORER = 
$FAST_SCORE; +die "Can't find $MAPPER" unless -x $MAPPER; +my $cdec = "$bin_dir/../../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $lines_per_mapper = 30; +my $iteration = 1; +my $best_weights; +my $psi = 1; +my $default_max_iter = 30; +my $max_iterations = $default_max_iter; +my $jobs = $default_jobs; # number of decode nodes +my $pmem = "4g"; +my $disable_clean = 0; +my %seen_weights; +my $help = 0; +my $epsilon = 0.0001; +my $dryrun = 0; +my $last_score = -10000000; +my $metric = "ibm_bleu"; +my $dir; +my $iniFile; +my $weights; +my $use_make = 1; # use make to parallelize +my $useqsub = 0; +my $initial_weights; +my $pass_suffix = ''; +my $cpbin=1; + +# regularization strength +my $tune_regularizer = 0; +my $reg = 500; +my $reg_previous = 5000; +my $dont_accum = 0; + +# Process command-line options +Getopt::Long::Configure("no_auto_abbrev"); +if (GetOptions( + "jobs=i" => \$jobs, + "dont-clean" => \$disable_clean, + "dont-accumulate" => \$dont_accum, + "pass-suffix=s" => \$pass_suffix, + "qsub" => \$useqsub, + "dry-run" => \$dryrun, + "epsilon=s" => \$epsilon, + "help" => \$help, + "weights=s" => \$initial_weights, + "reg=f" => \$reg, + "use-make=i" => \$use_make, + "max-iterations=i" => \$max_iterations, + "pmem=s" => \$pmem, + "cpbin!" => \$cpbin, + "ref-files=s" => \$refFiles, + "metric=s" => \$metric, + "source-file=s" => \$srcFile, + "workdir=s" => \$dir, +) == 0 || @ARGV!=1 || $help) { + print_help(); + exit; +} + +die "--tune-regularizer is no longer supported with --reg-previous and --reg. Please tune manually.\n" if $tune_regularizer; + +if ($useqsub) { + $use_make = 0; + die "LocalEnvironment.pm does not have qsub configuration for this host. 
Cannot run with --qsub!\n" unless has_qsub(); +} + +my @missing_args = (); +if (!defined $srcFile) { push @missing_args, "--source-file"; } +if (!defined $refFiles) { push @missing_args, "--ref-files"; } +if (!defined $initial_weights) { push @missing_args, "--weights"; } +die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args); + +if ($metric =~ /^(combi|ter)$/i) { + $lines_per_mapper = 5; +} + +($iniFile) = @ARGV; + + +sub write_config; +sub enseg; +sub print_help; + +my $nodelist; +my $host =check_output("hostname"); chomp $host; +my $bleu; +my $interval_count = 0; +my $logfile; +my $projected_score; + +# used in sorting scores +my $DIR_FLAG = '-r'; +if ($metric =~ /^ter$|^aer$/i) { + $DIR_FLAG = ''; +} + +my $refs_comma_sep = get_comma_sep_refs('r',$refFiles); + +unless ($dir){ + $dir = "rampion"; +} +unless ($dir =~ /^\//){ # convert relative path to absolute path + my $basedir = check_output("pwd"); + chomp $basedir; + $dir = "$basedir/$dir"; +} + + +# Initializations and helper functions +srand; + +my @childpids = (); +my @cleanupcmds = (); + +sub cleanup { + print STDERR "Cleanup...\n"; + for my $pid (@childpids){ unchecked_call("kill $pid"); } + for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } + exit 1; +}; +# Always call cleanup, no matter how we exit +*CORE::GLOBAL::exit = + sub{ cleanup(); }; +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + +my $decoderBase = check_output("basename $decoder"); chomp $decoderBase; +my $newIniFile = "$dir/$decoderBase.ini"; +my $inputFileName = "$dir/input"; +my $user = $ENV{"USER"}; +# process ini file +-e $iniFile || die "Error: could not open $iniFile for reading\n"; +open(INI, $iniFile); + +use File::Basename qw(basename); +#pass bindir, refs to vars holding bin +sub modbin { + local $_; + my $bindir=shift; + check_call("mkdir -p $bindir"); + -d $bindir || die "couldn't make bindir $bindir"; + for (@_) { + my $src=$$_; + 
$$_="$bindir/".basename($src); + check_call("cp -p $src $$_"); + } +} +sub dirsize { + opendir ISEMPTY,$_[0]; + return scalar(readdir(ISEMPTY))-1; +} +my @allweights; +if ($dryrun){ + write_config(*STDERR); + exit 0; +} else { + if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-pro.pl outputs + die "ERROR: working dir $dir already exists\n\n"; + } else { + -e $dir || mkdir $dir; + mkdir "$dir/hgs"; + modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin; + mkdir "$dir/scripts"; + my $cmdfile="$dir/rerun-pro.sh"; + open CMD,'>',$cmdfile; + print CMD "cd ",&getcwd,"\n"; +# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted. + my $cline=&cmdline."\n"; + print CMD $cline; + close CMD; + print STDERR $cline; + chmod(0755,$cmdfile); + check_call("cp $initial_weights $dir/weights.0"); + die "Can't find weights.0" unless (-e "$dir/weights.0"); + } + write_config(*STDERR); +} + + +# Generate initial files and values +check_call("cp $iniFile $newIniFile"); +$iniFile = $newIniFile; + +my $newsrc = "$dir/dev.input"; +enseg($srcFile, $newsrc); +$srcFile = $newsrc; +my $devSize = 0; +open F, "<$srcFile" or die "Can't read $srcFile: $!"; +while(<F>) { $devSize++; } +close F; + +unless($best_weights){ $best_weights = $weights; } +unless($projected_score){ $projected_score = 0.0; } +$seen_weights{$weights} = 1; +my $kbest = "$dir/kbest"; +if ($dont_accum) { + $kbest = ''; +} else { + check_call("mkdir -p $kbest"); + $kbest = "--kbest_repository $kbest"; +} + +my $random_seed = int(time / 1000); +my $lastWeightsFile; +my $lastPScore = 0; +# main optimization loop +while (1){ + print STDERR "\n\nITERATION $iteration\n==========\n"; + + if ($iteration > $max_iterations){ + print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n"; + last; + } + # iteration-specific files + my $runFile="$dir/run.raw.$iteration"; + my 
$onebestFile="$dir/1best.$iteration"; + my $logdir="$dir/logs.$iteration"; + my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; + my $scorerLog="$logdir/scorer.log.$iteration"; + check_call("mkdir -p $logdir"); + + + #decode + print STDERR "RUNNING DECODER AT "; + print STDERR unchecked_output("date"); + my $im1 = $iteration - 1; + my $weightsFile="$dir/weights.$im1"; + push @allweights, "-w $dir/weights.$im1"; + `rm -f $dir/hgs/*.gz`; + my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; + my $pcmd; + if ($use_make) { + $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --"; + } else { + $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --"; + } + my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + my $num_hgs; + my $num_topbest; + my $retries = 0; + while($retries < 5) { + $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l"); + $num_topbest = check_output("wc -l < $runFile"); + print STDERR "NUMBER OF HGs: $num_hgs\n"; + print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; + if($devSize == $num_hgs && $devSize == $num_topbest) { + last; + } else { + print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n"; + sleep(3); + } + $retries++; + } + die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? 
Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); + my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric"); + chomp $dec_score; + print STDERR "DECODER SCORE: $dec_score\n"; + + # save space + check_call("gzip -f $runFile"); + check_call("gzip -f $decoderLog"); + + # run optimizer + print STDERR "RUNNING OPTIMIZER AT "; + print STDERR unchecked_output("date"); + print STDERR " - GENERATE TRAINING EXEMPLARS\n"; + my $mergeLog="$logdir/prune-merge.log.$iteration"; + + my $score = 0; + my $icc = 0; + my $inweights="$dir/weights.$im1"; + my $outweights="$dir/weights.$iteration"; + $cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1"; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + $cmd="$MAPPER $refs_comma_sep -m $metric -i $dir/agenda.$im1 $kbest -w $inweights > $outweights"; + check_call($cmd); + $lastWeightsFile = $outweights; + $iteration++; + `rm hgs/*.gz`; + print STDERR "\n==========\n"; +} + +print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n"; + +print STDOUT "$lastWeightsFile\n"; + +exit 0; + +sub get_lines { + my $fn = shift @_; + open FL, "<$fn" or die "Couldn't read $fn: $!"; + my $lc = 0; + while(<FL>) { $lc++; } + return $lc; +} + +sub get_comma_sep_refs { + my ($r,$p) = @_; + my $o = check_output("echo $p"); + chomp $o; + my @files = split /\s+/, $o; + return "-$r " . 
join(" -$r ", @files); +} + +sub read_weights_file { + my ($file) = @_; + open F, "<$file" or die "Couldn't read $file: $!"; + my @r = (); + my $pm = -1; + while(<F>) { + next if /^#/; + next if /^\s*$/; + chomp; + if (/^(.+)\s+(.+)$/) { + my $m = $1; + my $w = $2; + die "Weights out of order: $m <= $pm" unless $m > $pm; + push @r, $w; + } else { + warn "Unexpected feature name in weight file: $_"; + } + } + close F; + return join ' ', @r; +} + +# subs +sub write_config { + my $fh = shift; + my $cleanup = "yes"; + if ($disable_clean) {$cleanup = "no";} + + print $fh "\n"; + print $fh "DECODER: $decoder\n"; + print $fh "INI FILE: $iniFile\n"; + print $fh "WORKING DIR: $dir\n"; + print $fh "SOURCE (DEV): $srcFile\n"; + print $fh "REFS (DEV): $refFiles\n"; + print $fh "EVAL METRIC: $metric\n"; + print $fh "MAX ITERATIONS: $max_iterations\n"; + print $fh "JOBS: $jobs\n"; + print $fh "HEAD NODE: $host\n"; + print $fh "PMEM (DECODING): $pmem\n"; + print $fh "CLEANUP: $cleanup\n"; +} + +sub update_weights_file { + my ($neww, $rfn, $rpts) = @_; + my @feats = @$rfn; + my @pts = @$rpts; + my $num_feats = scalar @feats; + my $num_pts = scalar @pts; + die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts; + open G, ">$neww" or die; + for (my $i = 0; $i < $num_feats; $i++) { + my $f = $feats[$i]; + my $lambda = $pts[$i]; + print G "$f $lambda\n"; + } + close G; +} + +sub enseg { + my $src = shift; + my $newsrc = shift; + open(SRC, $src); + open(NEWSRC, ">$newsrc"); + my $i=0; + while (my $line=<SRC>){ + chomp $line; + if ($line =~ /^\s*<seg/i) { + if($line =~ /id="[0-9]+"/) { + print NEWSRC "$line\n"; + } else { + die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute"; + } + } else { + print NEWSRC "<seg id=\"$i\">$line</seg>\n"; + } + $i++; + } + close SRC; + close NEWSRC; + die "Empty dev set!" 
if ($i == 0); +} + +sub print_help { + + my $executable = check_output("basename $0"); chomp $executable; + print << "Help"; + +Usage: $executable [options] <ini file> + + $executable [options] <ini file> + Runs a complete PRO optimization using the ini file specified. + +Required: + + --ref-files <files> + Dev set ref files. This option takes only a single string argument. + To use multiple files (including file globbing), this argument should + be quoted. + + --source-file <file> + Dev set source file. + + --weights <file> + Initial weights file (use empty file to start from 0) + +General options: + + --help + Print this message and exit. + + --dont-accumulate + Don't accumulate k-best lists from multiple iterations. + + --max-iterations <M> + Maximum number of iterations to run. If not specified, defaults + to $default_max_iter. + + --metric <method> + Metric to optimize. + Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + + --pass-suffix <S> + If the decoder is doing multi-pass decoding, the pass suffix "2", + "3", etc., is used to control what iteration of weights is set. + + --workdir <dir> + Directory for intermediate and output files. If not specified, the + name is derived from the ini filename. Assuming that the ini + filename begins with the decoder name and ends with ini, the default + name of the working directory is inferred from the middle part of + the filename. E.g. an ini file named decoder.foo.ini would have + a default working directory name foo. + +Regularization options: + + --reg <F> + l2 regularization strength [default=500]. The greater this value, + the closer to zero the weights will be. + +Job control options: + + --jobs <I> + Number of decoder processes to run in parallel. 
[default=$default_jobs] + + --qsub + Use qsub to run jobs in parallel (qsub must be configured in + environment/LocalEnvironment.pm) + + --pmem <N> + Amount of physical memory requested for parallel decoding jobs + (used with qsub requests only) + +Help +} + +sub convert { + my ($str) = @_; + my @ps = split /;/, $str; + my %dict = (); + for my $p (@ps) { + my ($k, $v) = split /=/, $p; + $dict{$k} = $v; + } + return %dict; +} + + +sub cmdline { + return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? +my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} + +sub escaped_shell_args { + return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { + return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { + return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} diff --git a/training/rampion/rampion_cccp.cc b/training/rampion/rampion_cccp.cc new file mode 100644 index 00000000..1e36dc51 --- /dev/null +++ b/training/rampion/rampion_cccp.cc @@ -0,0 +1,168 @@ +#include <sstream> +#include <iostream> +#include <vector> +#include <limits> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "filelib.h" +#include "stringlib.h" +#include "weights.h" +#include "hg_io.h" +#include "kbest.h" +#include "viterbi.h" +#include "ns.h" +#include "ns_docscorer.h" +#include "candidate_set.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)") + ("weights,w",po::value<string>(), "[REQD] 
Weights files from current iterations") + ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)") + ("evaluation_metric,m",po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)") + ("kbest_repository,R",po::value<string>(), "Accumulate k-best lists from previous iterations (parameter is path to repository)") + ("kbest_size,k",po::value<unsigned>()->default_value(500u), "Top k-hypotheses to extract") + ("cccp_iterations,I", po::value<unsigned>()->default_value(10u), "CCCP iterations (T')") + ("ssd_iterations,J", po::value<unsigned>()->default_value(5u), "Stochastic subgradient iterations (T'')") + ("eta", po::value<double>()->default_value(1e-4), "Step size") + ("regularization_strength,C", po::value<double>()->default_value(1.0), "L2 regularization strength") + ("alpha,a", po::value<double>()->default_value(10.0), "Cost scale (alpha); alpha * [1-metric(y,y')]") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (!conf->count("reference")) { + cerr << "Please specify one or more references using -r <REF.TXT>\n"; + flag = true; + } + if (!conf->count("weights")) { + cerr << "Please specify weights using -w <WEIGHTS.TXT>\n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +struct GainFunction { + explicit GainFunction(const EvaluationMetric* m) : metric(m) {} + float operator()(const SufficientStats& eval_feats) const { + float g = metric->ComputeScore(eval_feats); + if (!metric->IsErrorMetric()) g = 1 - g; + return g; + } + const EvaluationMetric* metric; +}; + +template <typename GainFunc> +void CostAugmentedSearch(const GainFunc& gain, + const training::CandidateSet& cs, + const SparseVector<double>& w, + double alpha, + SparseVector<double>* fmap) { + unsigned best_i = 
0; + double best = -numeric_limits<double>::infinity(); + for (unsigned i = 0; i < cs.size(); ++i) { + double s = cs[i].fmap.dot(w) + alpha * gain(cs[i].eval_feats); + if (s > best) { + best = s; + best_i = i; + } + } + *fmap = cs[best_i].fmap; +} + + + +// runs lines 4--15 of rampion algorithm +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string evaluation_metric = conf["evaluation_metric"].as<string>(); + + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + DocumentScorer ds(metric, conf["reference"].as<vector<string> >()); + cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; + double goodsign = -1; + double badsign = -goodsign; + + Hypergraph hg; + string last_file; + ReadFile in_read(conf["input"].as<string>()); + string kbest_repo; + if (conf.count("kbest_repository")) { + kbest_repo = conf["kbest_repository"].as<string>(); + MkDirP(kbest_repo); + } + istream &in=*in_read.stream(); + const unsigned kbest_size = conf["kbest_size"].as<unsigned>(); + const unsigned tp = conf["cccp_iterations"].as<unsigned>(); + const unsigned tpp = conf["ssd_iterations"].as<unsigned>(); + const double eta = conf["eta"].as<double>(); + const double reg = conf["regularization_strength"].as<double>(); + const double alpha = conf["alpha"].as<double>(); + SparseVector<weight_t> weights; + { + vector<weight_t> vweights; + const string weightsf = conf["weights"].as<string>(); + Weights::InitFromFile(weightsf, &vweights); + Weights::InitSparseVector(vweights, &weights); + } + string line, file; + vector<training::CandidateSet> kis; + cerr << "Loading hypergraphs...\n"; + while(getline(in, line)) { + istringstream is(line); + int sent_id; + kis.resize(kis.size() + 1); + training::CandidateSet& curkbest = kis.back(); + string kbest_file; + if (kbest_repo.size()) { + ostringstream os; + os << kbest_repo << "/kbest." 
<< sent_id << ".txt.gz"; + kbest_file = os.str(); + if (FileExists(kbest_file)) + curkbest.ReadFromFile(kbest_file); + } + is >> file >> sent_id; + ReadFile rf(file); + if (kis.size() % 5 == 0) { cerr << '.'; } + if (kis.size() % 200 == 0) { cerr << " [" << kis.size() << "]\n"; } + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + hg.Reweight(weights); + curkbest.AddKBestCandidates(hg, kbest_size, ds[sent_id]); + if (kbest_file.size()) + curkbest.WriteToFile(kbest_file); + } + cerr << "\nHypergraphs loaded.\n"; + + vector<SparseVector<weight_t> > goals(kis.size()); // f(x_i,y+,h+) + SparseVector<weight_t> fear; // f(x,y-,h-) + const GainFunction gain(metric); + for (unsigned iterp = 1; iterp <= tp; ++iterp) { + cerr << "CCCP Iteration " << iterp << endl; + for (unsigned i = 0; i < goals.size(); ++i) + CostAugmentedSearch(gain, kis[i], weights, goodsign * alpha, &goals[i]); + for (unsigned iterpp = 1; iterpp <= tpp; ++iterpp) { + cerr << " SSD Iteration " << iterpp << endl; + for (unsigned i = 0; i < goals.size(); ++i) { + CostAugmentedSearch(gain, kis[i], weights, badsign * alpha, &fear); + weights -= weights * (eta * reg / goals.size()); + weights += (goals[i] - fear) * eta; + } + } + } + vector<weight_t> w; + weights.init_vector(&w); + Weights::WriteToFile("-", w); + return 0; +} + diff --git a/training/rampion/rampion_generate_input.pl b/training/rampion/rampion_generate_input.pl new file mode 100755 index 00000000..b30fc4fd --- /dev/null +++ b/training/rampion/rampion_generate_input.pl @@ -0,0 +1,18 @@ +#!/usr/bin/perl -w +use strict; + +die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1; +my $d = shift @ARGV; +die "Can't find directory $d" unless -d $d; + +opendir(DIR, $d) or die "Can't read $d: $!"; +my @hgs = grep { /\.gz$/ } readdir(DIR); +closedir DIR; + +for my $hg (@hgs) { + my $file = $hg; + my $id = $hg; + $id =~ s/(\.json)?\.gz//; + print "$d/$file $id\n"; +} + diff --git a/training/ttables.cc b/training/ttables.cc deleted file mode 100644 index 
45bf14c5..00000000 --- a/training/ttables.cc +++ /dev/null @@ -1,31 +0,0 @@ -#include "ttables.h" - -#include <cassert> - -#include "dict.h" - -using namespace std; -using namespace std::tr1; - -void TTable::DeserializeProbsFromText(std::istream* in) { - int c = 0; - while(*in) { - string e; - string f; - double p; - (*in) >> e >> f >> p; - if (e.empty()) break; - ++c; - ttable[TD::Convert(e)][TD::Convert(f)] = p; - } - cerr << "Loaded " << c << " translation parameters.\n"; -} - -void TTable::SerializeHelper(string* out, const Word2Word2Double& o) { - assert(!"not implemented"); -} - -void TTable::DeserializeHelper(const string& in, Word2Word2Double* o) { - assert(!"not implemented"); -} - diff --git a/training/ttables.h b/training/ttables.h deleted file mode 100644 index 9baa13ca..00000000 --- a/training/ttables.h +++ /dev/null @@ -1,101 +0,0 @@ -#ifndef _TTABLES_H_ -#define _TTABLES_H_ - -#include <iostream> -#include <tr1/unordered_map> - -#include "sparse_vector.h" -#include "m.h" -#include "wordid.h" -#include "tdict.h" - -class TTable { - public: - TTable() {} - typedef std::tr1::unordered_map<WordID, double> Word2Double; - typedef std::tr1::unordered_map<WordID, Word2Double> Word2Word2Double; - inline double prob(const int& e, const int& f) const { - const Word2Word2Double::const_iterator cit = ttable.find(e); - if (cit != ttable.end()) { - const Word2Double& cpd = cit->second; - const Word2Double::const_iterator it = cpd.find(f); - if (it == cpd.end()) return 1e-9; - return it->second; - } else { - return 1e-9; - } - } - inline void Increment(const int& e, const int& f) { - counts[e][f] += 1.0; - } - inline void Increment(const int& e, const int& f, double x) { - counts[e][f] += x; - } - void NormalizeVB(const double alpha) { - ttable.swap(counts); - for (Word2Word2Double::iterator cit = ttable.begin(); - cit != ttable.end(); ++cit) { - double tot = 0; - Word2Double& cpd = cit->second; - for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it) - 
tot += it->second + alpha; - for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it) - it->second = exp(Md::digamma(it->second + alpha) - Md::digamma(tot)); - } - counts.clear(); - } - void Normalize() { - ttable.swap(counts); - for (Word2Word2Double::iterator cit = ttable.begin(); - cit != ttable.end(); ++cit) { - double tot = 0; - Word2Double& cpd = cit->second; - for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it) - tot += it->second; - for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it) - it->second /= tot; - } - counts.clear(); - } - // adds counts from another TTable - probabilities remain unchanged - TTable& operator+=(const TTable& rhs) { - for (Word2Word2Double::const_iterator it = rhs.counts.begin(); - it != rhs.counts.end(); ++it) { - const Word2Double& cpd = it->second; - Word2Double& tgt = counts[it->first]; - for (Word2Double::const_iterator j = cpd.begin(); j != cpd.end(); ++j) { - tgt[j->first] += j->second; - } - } - return *this; - } - void ShowTTable() const { - for (Word2Word2Double::const_iterator it = ttable.begin(); it != ttable.end(); ++it) { - const Word2Double& cpd = it->second; - for (Word2Double::const_iterator j = cpd.begin(); j != cpd.end(); ++j) { - std::cerr << "P(" << TD::Convert(j->first) << '|' << TD::Convert(it->first) << ") = " << j->second << std::endl; - } - } - } - void ShowCounts() const { - for (Word2Word2Double::const_iterator it = counts.begin(); it != counts.end(); ++it) { - const Word2Double& cpd = it->second; - for (Word2Double::const_iterator j = cpd.begin(); j != cpd.end(); ++j) { - std::cerr << "c(" << TD::Convert(j->first) << '|' << TD::Convert(it->first) << ") = " << j->second << std::endl; - } - } - } - void DeserializeProbsFromText(std::istream* in); - void SerializeCounts(std::string* out) const { SerializeHelper(out, counts); } - void DeserializeCounts(const std::string& in) { DeserializeHelper(in, &counts); } - void SerializeProbs(std::string* out) const { 
SerializeHelper(out, ttable); } - void DeserializeProbs(const std::string& in) { DeserializeHelper(in, &ttable); } - private: - static void SerializeHelper(std::string*, const Word2Word2Double& o); - static void DeserializeHelper(const std::string&, Word2Word2Double* o); - public: - Word2Word2Double ttable; - Word2Word2Double counts; -}; - -#endif diff --git a/training/utils/Makefile.am b/training/utils/Makefile.am new file mode 100644 index 00000000..d708a9f5 --- /dev/null +++ b/training/utils/Makefile.am @@ -0,0 +1,37 @@ +noinst_LIBRARIES = libtraining_utils.a + +bin_PROGRAMS = \ + sentserver \ + sentclient \ + grammar_convert + +noinst_PROGRAMS = \ + lbfgs_test \ + optimize_test + +sentserver_SOURCES = sentserver.c +sentserver_LDFLAGS = -pthread + +sentclient_SOURCES = sentclient.c +sentclient_LDFLAGS = -pthread + +TESTS = lbfgs_test optimize_test + +libtraining_utils_a_SOURCES = \ + candidate_set.cc \ + entropy.cc \ + optimize.cc \ + online_optimizer.cc \ + risk.cc + +optimize_test_SOURCES = optimize_test.cc +optimize_test_LDADD = libtraining_utils.a $(top_srcdir)/utils/libutils.a + +grammar_convert_SOURCES = grammar_convert.cc +grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a + +lbfgs_test_SOURCES = lbfgs_test.cc +lbfgs_test_LDADD = $(top_srcdir)/utils/libutils.a + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/decoder -I$(top_srcdir)/utils -I$(top_srcdir)/mteval -I$(top_srcdir)/klm + diff --git a/training/candidate_set.cc b/training/utils/candidate_set.cc index 087efec3..087efec3 100644 --- a/training/candidate_set.cc +++ b/training/utils/candidate_set.cc diff --git a/training/candidate_set.h b/training/utils/candidate_set.h index 9d326ed0..9d326ed0 100644 --- a/training/candidate_set.h +++ b/training/utils/candidate_set.h diff --git a/training/utils/decode-and-evaluate.pl b/training/utils/decode-and-evaluate.pl new file mode 100755 index 00000000..1a332c08 --- /dev/null +++ 
b/training/utils/decode-and-evaluate.pl @@ -0,0 +1,246 @@ +#!/usr/bin/env perl +use strict; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use File::Basename qw(basename); +my $QSUB_CMD = qsub_args(mert_memory()); + +require "libcall.pl"; + +# Default settings +my $default_jobs = env_default_jobs(); +my $bin_dir = $SCRIPT_DIR; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $parallelize = "$bin_dir/parallelize.pl"; +my $libcall = "$bin_dir/libcall.pl"; +my $sentserver = "$bin_dir/sentserver"; +my $sentclient = "$bin_dir/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm"; + +my $SCORER = $FAST_SCORE; +my $cdec = "$bin_dir/../../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $jobs = $default_jobs; # number of decode nodes +my $pmem = "9g"; +my $help = 0; +my $config; +my $test_set; +my $weights; +my $use_make = 1; +my $useqsub; +my $cpbin=1; +# Process command-line options +if (GetOptions( + "jobs=i" => \$jobs, + "help" => \$help, + "qsub" => \$useqsub, + "input=s" => \$test_set, + "config=s" => \$config, + "weights=s" => \$weights, +) == 0 || @ARGV!=0 || $help) { + print_help(); + exit; +} + +if ($useqsub) { + $use_make = 0; + die "LocalEnvironment.pm does not have qsub configuration for this host. 
Cannot run with --qsub!\n" unless has_qsub(); +} + +my @missing_args = (); + +if (!defined $test_set) { push @missing_args, "--input"; } +if (!defined $config) { push @missing_args, "--config"; } +if (!defined $weights) { push @missing_args, "--weights"; } +die "Please specify missing arguments: " . join (', ', @missing_args) . "\nUse --help for more information.\n" if (@missing_args); + +my @tf = localtime(time); +my $tname = basename($test_set); +$tname =~ s/\.(sgm|sgml|xml)$//i; +my $dir = "eval.$tname." . sprintf('%d%02d%02d-%02d%02d%02d', 1900+$tf[5], $tf[4], $tf[3], $tf[2], $tf[1], $tf[0]); + +my $time = unchecked_output("date"); + +check_call("mkdir -p $dir"); + +split_devset($test_set, "$dir/test.input.raw", "$dir/test.refs"); +my $refs = "-r $dir/test.refs"; +my $newsrc = "$dir/test.input"; +enseg("$dir/test.input.raw", $newsrc); +my $src_file = $newsrc; +open F, "<$src_file" or die "Can't read $src_file: $!"; close F; + +my $test_trans="$dir/test.trans"; +my $logdir="$dir/logs"; +my $decoderLog="$logdir/decoder.sentserver.log"; +check_call("mkdir -p $logdir"); + +#decode +print STDERR "RUNNING DECODER AT "; +print STDERR unchecked_output("date"); +my $decoder_cmd = "$decoder -c $config --weights $weights"; +my $pcmd; +if ($use_make) { + $pcmd = "cat $src_file | $parallelize --workdir $dir --use-fork -p $pmem -e $logdir -j $jobs --"; +} else { + $pcmd = "cat $src_file | $parallelize --workdir $dir -p $pmem -e $logdir -j $jobs --"; +} +my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $test_trans"; +check_bash_call($cmd); +print STDERR "DECODER COMPLETED AT "; +print STDERR unchecked_output("date"); +print STDERR "\nOUTPUT: $test_trans\n\n"; +my $bleu = check_output("cat $test_trans | $SCORER $refs -m ibm_bleu"); +chomp $bleu; +print STDERR "BLEU: $bleu\n"; +my $ter = check_output("cat $test_trans | $SCORER $refs -m ter"); +chomp $ter; +print STDERR " TER: $ter\n"; +open TR, ">$dir/test.scores" or die "Can't write $dir/test.scores: $!"; +print TR <<EOT; +### 
SCORE REPORT ############################################################# + OUTPUT=$test_trans + SCRIPT INPUT=$test_set + DECODER INPUT=$src_file + REFERENCES=$dir/test.refs +------------------------------------------------------------------------------ + BLEU=$bleu + TER=$ter +############################################################################## +EOT +close TR; +my $sr = unchecked_output("cat $dir/test.scores"); +print STDERR "\n\n$sr\n(A copy of this report can be found in $dir/test.scores)\n\n"; +exit 0; + +sub enseg { + my $src = shift; + my $newsrc = shift; + open(SRC, $src); + open(NEWSRC, ">$newsrc"); + my $i=0; + while (my $line=<SRC>){ + chomp $line; + if ($line =~ /^\s*<seg/i) { + if($line =~ /id="[0-9]+"/) { + print NEWSRC "$line\n"; + } else { + die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute"; + } + } else { + print NEWSRC "<seg id=\"$i\">$line</seg>\n"; + } + $i++; + } + close SRC; + close NEWSRC; +} + +sub print_help { + my $executable = basename($0); chomp $executable; + print << "Help"; + +Usage: $executable [options] <ini file> + + $executable --config cdec.ini --weights weights.txt [--jobs N] [--qsub] <testset.in-ref> + +Options: + + --help + Print this message and exit. + + --config <file> + A path to the cdec.ini file. + + --weights <file> + A file specifying feature weights. + + --dir <dir> + Directory for intermediate and output files. + +Job control options: + + --jobs <I> + Number of decoder processes to run in parallel. 
[default=$default_jobs] + + --qsub + Use qsub to run jobs in parallel (qsub must be configured in + environment/LocalEnvironment.pm) + + --pmem <N> + Amount of physical memory requested for parallel decoding jobs + (used with qsub requests only) + +Help +} + +sub convert { + my ($str) = @_; + my @ps = split /;/, $str; + my %dict = (); + for my $p (@ps) { + my ($k, $v) = split /=/, $p; + $dict{$k} = $v; + } + return %dict; +} + + + +sub cmdline { + return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? +my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} + +sub escaped_shell_args { + return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { + return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { + return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} + +sub split_devset { + my ($infile, $outsrc, $outref) = @_; + open F, "<$infile" or die "Can't read $infile: $!"; + open S, ">$outsrc" or die "Can't write $outsrc: $!"; + open R, ">$outref" or die "Can't write $outref: $!"; + while(<F>) { + chomp; + my ($src, @refs) = split /\s*\|\|\|\s*/; + die "Malformed devset line: $_\n" unless scalar @refs > 0; + print S "$src\n"; + print R join(' ||| ', @refs) . 
"\n"; + } + close R; + close S; + close F; +} + diff --git a/training/entropy.cc b/training/utils/entropy.cc index 4fdbe2be..4fdbe2be 100644 --- a/training/entropy.cc +++ b/training/utils/entropy.cc diff --git a/training/entropy.h b/training/utils/entropy.h index 796589ca..796589ca 100644 --- a/training/entropy.h +++ b/training/utils/entropy.h diff --git a/training/grammar_convert.cc b/training/utils/grammar_convert.cc index 607a7cb9..607a7cb9 100644 --- a/training/grammar_convert.cc +++ b/training/utils/grammar_convert.cc diff --git a/training/lbfgs.h b/training/utils/lbfgs.h index e8baecab..e8baecab 100644 --- a/training/lbfgs.h +++ b/training/utils/lbfgs.h diff --git a/training/lbfgs_test.cc b/training/utils/lbfgs_test.cc index 9678e788..9678e788 100644 --- a/training/lbfgs_test.cc +++ b/training/utils/lbfgs_test.cc diff --git a/training/utils/libcall.pl b/training/utils/libcall.pl new file mode 100644 index 00000000..c7d0f128 --- /dev/null +++ b/training/utils/libcall.pl @@ -0,0 +1,71 @@ +use IPC::Open3; +use Symbol qw(gensym); + +$DUMMY_STDERR = gensym(); +$DUMMY_STDIN = gensym(); + +# Run the command and ignore failures +sub unchecked_call { + system("@_") +} + +# Run the command and return its output, if any ignoring failures +sub unchecked_output { + return `@_` +} + +# WARNING: Do not use this for commands that will return large amounts +# of stdout or stderr -- they might block indefinitely +sub check_output { + print STDERR "Executing and gathering output: @_\n"; + + my $pid = open3($DUMMY_STDIN, \*PH, $DUMMY_STDERR, @_); + my $proc_output = ""; + while( <PH> ) { + $proc_output .= $_; + } + waitpid($pid, 0); + # TODO: Grab signal that the process died from + my $child_exit_status = $? >> 8; + if($child_exit_status == 0) { + return $proc_output; + } else { + print STDERR "ERROR: Execution of @_ failed.\n"; + exit(1); + } +} + +# Based on Moses' safesystem sub +sub check_call { + print STDERR "Executing: @_\n"; + system(@_); + my $exitcode = $? 
>> 8; + if($exitcode == 0) { + return 0; + } elsif ($? == -1) { + print STDERR "ERROR: Failed to execute: @_\n $!\n"; + exit(1); + + } elsif ($? & 127) { + printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n", + ($? & 127), ($? & 128) ? 'with' : 'without'; + exit(1); + + } else { + print STDERR "Failed with exit code: $exitcode\n" if $exitcode; + exit($exitcode); + } +} + +sub check_bash_call { + my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_"); + check_call(@args); +} + +sub check_bash_output { + my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_"); + return check_output(@args); +} + +# perl module weirdness... +return 1; diff --git a/training/online_optimizer.cc b/training/utils/online_optimizer.cc index 3ed95452..3ed95452 100644 --- a/training/online_optimizer.cc +++ b/training/utils/online_optimizer.cc diff --git a/training/online_optimizer.h b/training/utils/online_optimizer.h index 28d89344..28d89344 100644 --- a/training/online_optimizer.h +++ b/training/utils/online_optimizer.h diff --git a/training/optimize.cc b/training/utils/optimize.cc index 41ac90d8..41ac90d8 100644 --- a/training/optimize.cc +++ b/training/utils/optimize.cc diff --git a/training/optimize.h b/training/utils/optimize.h index 07943b44..07943b44 100644 --- a/training/optimize.h +++ b/training/utils/optimize.h diff --git a/training/optimize_test.cc b/training/utils/optimize_test.cc index bff2ca03..bff2ca03 100644 --- a/training/optimize_test.cc +++ b/training/utils/optimize_test.cc diff --git a/training/utils/parallelize.pl b/training/utils/parallelize.pl new file mode 100755 index 00000000..4197e0e5 --- /dev/null +++ b/training/utils/parallelize.pl @@ -0,0 +1,423 @@ +#!/usr/bin/env perl + +# Author: Adam Lopez +# +# This script takes a command that processes input +# from stdin one-line-at-time, and parallelizes it +# on the cluster using David Chiang's sentserver/ +# sentclient architecture. 
+# +# Prerequisites: the command *must* read each line +# without waiting for subsequent lines of input +# (for instance, a command which must read all lines +# of input before processing will not work) and +# return it to the output *without* buffering +# multiple lines. + +#TODO: if -j 1, run immediately, not via sentserver? possible differences in environment might make debugging harder + +#ANNOYANCE: if input is shorter than -j n lines, or at the very last few lines, repeatedly sleeps. time cut down to 15s from 60s + +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; } +use LocalConfig; + +use Cwd qw/ abs_path cwd getcwd /; +use File::Temp qw/ tempfile /; +use Getopt::Long; +use IPC::Open2; +use strict; +use POSIX ":sys_wait_h"; + +use File::Basename; +my $myDir = dirname(__FILE__); +print STDERR __FILE__." -> $myDir\n"; +push(@INC, $myDir); +require "libcall.pl"; + +my $tailn=5; # +0 = concatenate all the client logs. 
5 = last 5 lines +my $recycle_clients; # spawn new clients when previous ones terminate +my $stay_alive; # dont let server die when having zero clients +my $joblist = ""; +my $errordir=""; +my $multiline; +my $workdir = '.'; +my $numnodes = 8; +my $user = $ENV{"USER"}; +my $pmem = "9g"; +my $basep=50300; +my $randp=300; +my $tryp=50; +my $no_which; +my $no_cd; + +my $DEBUG=$ENV{DEBUG}; +print STDERR "DEBUG=$DEBUG output enabled.\n" if $DEBUG; +my $verbose = 1; +sub verbose { + if ($verbose) { + print STDERR @_,"\n"; + } +} +sub debug { + if ($DEBUG) { + my ($package, $filename, $line) = caller; + print STDERR "DEBUG: $filename($line): ",join(' ',@_),"\n"; + } +} +my $is_shell_special=qr.[ \t\n\\><|&;"'`~*?{}$!()].; +my $shell_escape_in_quote=qr.[\\"\$`!].; +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + return '""' unless $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} +sub preview_files { + my ($l,$skipempty,$footer,$n)=@_; + $n=$tailn unless defined $n; + my @f=grep { ! ($skipempty && -z $_) } @$l; + my $fn=join(' ',map {escape_shell($_)} @f); + my $cmd="tail -n $n $fn"; + unchecked_output("$cmd").($footer?"\nNONEMPTY FILES:\n$fn\n":""); +} +sub prefix_dirname($) { + #like `dirname but if ends in / then return the whole thing + local ($_)=@_; + if (/\/$/) { + $_; + } else { + s#/[^/]$##; + $_ ? $_ : ''; + } +} +sub ensure_final_slash($) { + local ($_)=@_; + m#/$# ? 
$_ : ($_."/"); +} +sub extend_path($$;$$) { + my ($base,$ext,$mkdir,$baseisdir)=@_; + if (-d $base) { + $base.="/"; + } else { + my $dir; + if ($baseisdir) { + $dir=$base; + $base.='/' unless $base =~ /\/$/; + } else { + $dir=prefix_dirname($base); + } + my @cmd=("/bin/mkdir","-p",$dir); + check_call(@cmd) if $mkdir; + } + return $base.$ext; +} + +my $abscwd=abs_path(&getcwd); +sub print_help; + +my $use_fork; +my @pids; + +# Process command-line options +unless (GetOptions( + "stay-alive" => \$stay_alive, + "recycle-clients" => \$recycle_clients, + "error-dir=s" => \$errordir, + "multi-line" => \$multiline, + "workdir=s" => \$workdir, + "use-fork" => \$use_fork, + "verbose" => \$verbose, + "jobs=i" => \$numnodes, + "pmem=s" => \$pmem, + "baseport=i" => \$basep, +# "iport=i" => \$randp, #for short name -i + "no-which!" => \$no_which, + "no-cd!" => \$no_cd, + "tailn=s" => \$tailn, +) && scalar @ARGV){ + print_help(); + die "bad options."; +} + +my $cmd = ""; +my $prog=shift; +if ($no_which) { + $cmd=$prog; +} else { + $cmd=check_output("which $prog"); + chomp $cmd; + die "$prog not found - $cmd" unless $cmd; +} +#$cmd=abs_path($cmd); +for my $arg (@ARGV) { + $cmd .= " ".escape_shell($arg); +} +die "Please specify a command to parallelize\n" if $cmd eq ''; + +my $cdcmd=$no_cd ? 
'' : ("cd ".escape_shell($abscwd)."\n"); + +my $executable = $cmd; +$executable =~ s/^\s*(\S+)($|\s.*)/$1/; +$executable=check_output("basename $executable"); +chomp $executable; + + +print STDERR "Parallelizing ($numnodes ways): $cmd\n\n"; + +# create -e dir and save .sh +use File::Temp qw/tempdir/; +unless ($errordir) { + $errordir=tempdir("$executable.XXXXXX",CLEANUP=>1); +} +if ($errordir) { + my $scriptfile=extend_path("$errordir/","$executable.sh",1,1); + -d $errordir || die "should have created -e dir $errordir"; + open SF,">",$scriptfile || die; + print SF "$cdcmd$cmd\n"; + close SF; + chmod 0755,$scriptfile; + $errordir=abs_path($errordir); + &verbose("-e dir: $errordir"); +} + +# set cleanup handler +my @cleanup_cmds; +sub cleanup; +sub cleanup_and_die; +$SIG{INT} = "cleanup_and_die"; +$SIG{TERM} = "cleanup_and_die"; +$SIG{HUP} = "cleanup_and_die"; + +# other subs: +sub numof_live_jobs; +sub launch_job_on_node; + + +# vars +my $mydir = check_output("dirname $0"); chomp $mydir; +my $sentserver = "$mydir/sentserver"; +my $sentclient = "$mydir/sentclient"; +my $host = check_output("hostname"); +chomp $host; + + +# find open port +srand; +my $port = 50300+int(rand($randp)); +my $endp=$port+$tryp; +sub listening_port_lines { + my $quiet=$verbose?'':'2>/dev/null'; + return unchecked_output("netstat -a -n $quiet | grep LISTENING | grep -i tcp"); +} +my $netstat=&listening_port_lines; + +if ($verbose){ print STDERR "Testing port $port...";} + +while ($netstat=~/$port/ || &listening_port_lines=~/$port/){ + if ($verbose){ print STDERR "port is busy\n";} + $port++; + if ($port > $endp){ + die "Unable to find open port\n"; + } + if ($verbose){ print STDERR "Testing port $port... 
"; } +} +if ($verbose){ + print STDERR "port $port is available\n"; +} + +my $key = int(rand()*1000000); + +my $multiflag = ""; +if ($multiline){ $multiflag = "-m"; print STDERR "expecting multiline output.\n"; } +my $stay_alive_flag = ""; +if ($stay_alive){ $stay_alive_flag = "--stay-alive"; print STDERR "staying alive while no clients are connected.\n"; } + +my $node_count = 0; +my $script = ""; +# fork == one thread runs the sentserver, while the +# other spawns the sentclient commands. +my $pid = fork; +if ($pid == 0) { # child + sleep 8; # give other thread time to start sentserver + $script = "$cdcmd$sentclient $host:$port:$key $cmd"; + + if ($verbose){ + print STDERR "Client script:\n====\n"; + print STDERR $script; + print STDERR "====\n"; + } + for (my $jobn=0; $jobn<$numnodes; $jobn++){ + launch_job(); + } + if ($recycle_clients) { + my $ret; + my $livejobs; + while (1) { + $ret = waitpid($pid, WNOHANG); + #print STDERR "waitpid $pid ret = $ret \n"; + last if ($ret != 0); + $livejobs = numof_live_jobs(); + if ($numnodes >= $livejobs ) { # a client terminated, OR # lines of input was less than -j + print STDERR "num of requested nodes = $numnodes; num of currently live jobs = $livejobs; Client terminated - launching another.\n"; + launch_job(); + } else { + sleep 15; + } + } + } + print STDERR "CHILD PROCESSES SPAWNED ... 
WAITING\n"; + for my $p (@pids) { + waitpid($p, 0); + } +} else { +# my $todo = "$sentserver -k $key $multiflag $port "; + my $todo = "$sentserver -k $key $multiflag $port $stay_alive_flag "; + if ($verbose){ print STDERR "Running: $todo\n"; } + check_call($todo); + print STDERR "Call to $sentserver returned.\n"; + cleanup(); + exit(0); +} + +sub numof_live_jobs { + if ($use_fork) { + die "not implemented"; + } else { + # We can probably continue decoding if the qstat error is only temporary + my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat"))); + return ($#livejobs + 1); + } +} +my (@errors,@outs,@cmds); + +sub launch_job { + if ($use_fork) { return launch_job_fork(); } + my $errorfile = "/dev/null"; + my $outfile = "/dev/null"; + $node_count++; + my $clientname = $executable; + $clientname =~ s/^(.{4}).*$/$1/; + $clientname = "$clientname.$node_count"; + if ($errordir){ + $errorfile = "$errordir/$clientname.ER"; + $outfile = "$errordir/$clientname.OU"; + push @errors,$errorfile; + push @outs,$outfile; + } + my $todo = qsub_args($pmem) . " -N $clientname -o $outfile -e $errorfile"; + push @cmds,$todo; + + print STDERR "Running: $todo\n"; + local(*QOUT, *QIN); + open2(\*QOUT, \*QIN, $todo) or die "Failed to open2: $!"; + print QIN $script; + close QIN; + while (my $jobid=<QOUT>){ + chomp $jobid; + if ($verbose){ print STDERR "Launched client job: $jobid"; } + $jobid =~ s/^(\d+)(.*?)$/\1/g; + $jobid =~ s/^Your job (\d+) .*$/\1/; + print STDERR " short job id $jobid\n"; + if ($verbose){ + print STDERR "cd: $abscwd\n"; + print STDERR "cmd: $cmd\n"; + } + if ($joblist == "") { $joblist = $jobid; } + else {$joblist = $joblist . "\|" . 
$jobid; } + my $cleanfn="qdel $jobid 2> /dev/null"; + push(@cleanup_cmds, $cleanfn); + } + close QOUT; +} + +sub launch_job_fork { + my $errorfile = "/dev/null"; + my $outfile = "/dev/null"; + $node_count++; + my $clientname = $executable; + $clientname =~ s/^(.{4}).*$/$1/; + $clientname = "$clientname.$node_count"; + if ($errordir){ + $errorfile = "$errordir/$clientname.ER"; + $outfile = "$errordir/$clientname.OU"; + push @errors,$errorfile; + push @outs,$outfile; + } + my $pid = fork; + if ($pid == 0) { + my ($fh, $scr_name) = get_temp_script(); + print $fh $script; + close $fh; + my $todo = "/bin/bash -xeo pipefail $scr_name 1> $outfile 2> $errorfile"; + print STDERR "EXEC: $todo\n"; + my $out = check_output("$todo"); + unlink $scr_name or warn "Failed to remove $scr_name"; + exit 0; + } else { + push @pids, $pid; + } +} + +sub get_temp_script { + my ($fh, $filename) = tempfile( "$workdir/workXXXX", SUFFIX => '.sh'); + return ($fh, $filename); +} + +sub cleanup_and_die { + cleanup(); + die "\n"; +} + +sub cleanup { + print STDERR "Cleaning up...\n"; + for $cmd (@cleanup_cmds){ + print STDERR " Cleanup command: $cmd\n"; + eval $cmd; + } + print STDERR "outputs:\n",preview_files(\@outs,1),"\n"; + print STDERR "errors:\n",preview_files(\@errors,1),"\n"; + print STDERR "cmd:\n",$cmd,"\n"; + print STDERR " cat $errordir/*.ER\nfor logs.\n"; + print STDERR "Cleanup finished.\n"; +} + +sub print_help +{ + my $name = check_output("basename $0"); chomp $name; + print << "Help"; + +usage: $name [options] + + Automatic black-box parallelization of commands. + +options: + + --use-fork + Instead of using qsub, use fork. + + -e, --error-dir <dir> + Retain output files from jobs in <dir>, rather + than silently deleting them. + + -m, --multi-line + Expect that command may produce multiple output + lines for a single input line. $name makes a + reasonable attempt to obtain all output before + processing additional inputs. However, use of this + option is inherently unsafe. 
+ + -v, --verbose + Print diagnostic informatoin on stderr. + + -j, --jobs + Number of jobs to use. + + -p, --pmem + pmem setting for each job. + +Help +} diff --git a/training/risk.cc b/training/utils/risk.cc index d5a12cfd..d5a12cfd 100644 --- a/training/risk.cc +++ b/training/utils/risk.cc diff --git a/training/risk.h b/training/utils/risk.h index 2e8db0fb..2e8db0fb 100644 --- a/training/risk.h +++ b/training/utils/risk.h diff --git a/training/utils/sentclient.c b/training/utils/sentclient.c new file mode 100644 index 00000000..91d994ab --- /dev/null +++ b/training/utils/sentclient.c @@ -0,0 +1,76 @@ +/* Copyright (c) 2001 by David Chiang. All rights reserved.*/ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <netinet/in.h> +#include <netdb.h> +#include <string.h> + +#include "sentserver.h" + +int main (int argc, char *argv[]) { + int sock, port; + char *s, *key; + struct hostent *hp; + struct sockaddr_in server; + int errors = 0; + + if (argc < 3) { + fprintf(stderr, "Usage: sentclient host[:port[:key]] command [args ...]\n"); + exit(1); + } + + s = strchr(argv[1], ':'); + key = NULL; + + if (s == NULL) { + port = DEFAULT_PORT; + } else { + *s = '\0'; + s+=1; + /* dumb hack */ + key = strchr(s, ':'); + if (key != NULL){ + *key = '\0'; + key += 1; + } + port = atoi(s); + } + + sock = socket(AF_INET, SOCK_STREAM, 0); + + hp = gethostbyname(argv[1]); + if (hp == NULL) { + fprintf(stderr, "unknown host %s\n", argv[1]); + exit(1); + } + + bzero((char *)&server, sizeof(server)); + bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length); + server.sin_family = hp->h_addrtype; + server.sin_port = htons(port); + + while (connect(sock, (struct sockaddr *)&server, sizeof(server)) < 0) { + perror("connect()"); + sleep(1); + errors++; + if (errors > 5) + exit(1); + } + + close(0); + close(1); + dup2(sock, 0); + dup2(sock, 1); + + if (key != NULL){ + write(1, key, strlen(key)); + write(1, "\n", 1); + 
} + + execvp(argv[2], argv+2); + return 0; +} diff --git a/training/utils/sentserver.c b/training/utils/sentserver.c new file mode 100644 index 00000000..c20b4fa6 --- /dev/null +++ b/training/utils/sentserver.c @@ -0,0 +1,515 @@ +/* Copyright (c) 2001 by David Chiang. All rights reserved.*/ + +#include <string.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <stdio.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <sys/time.h> +#include <netinet/in.h> +#include <sched.h> +#include <pthread.h> +#include <errno.h> + +#include "sentserver.h" + +#define MAX_CLIENTS 64 + +struct clientinfo { + int s; + struct sockaddr_in sin; +}; + +struct line { + int id; + char *s; + int status; + struct line *next; +} *head, **ptail; + +int n_sent = 0, n_received=0, n_flushed=0; + +#define STATUS_RUNNING 0 +#define STATUS_ABORTED 1 +#define STATUS_FINISHED 2 + +pthread_mutex_t queue_mutex = PTHREAD_MUTEX_INITIALIZER; +pthread_mutex_t clients_mutex = PTHREAD_MUTEX_INITIALIZER; +pthread_mutex_t input_mutex = PTHREAD_MUTEX_INITIALIZER; + +int n_clients = 0; +int s; +int expect_multiline_output = 0; +int log_mutex = 0; +int stay_alive = 0; /* dont panic and die with zero clients */ + +void queue_finish(struct line *node, char *s, int fid); +char * read_line(int fd, int multiline); +void done (int code); + +struct line * queue_get(int fid) { + struct line *cur; + char *s, *synch; + + if (log_mutex) fprintf(stderr, "Getting for data for fid %d\n", fid); + if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); + pthread_mutex_lock(&queue_mutex); + + /* First, check for aborted sentences. 
*/ + + if (log_mutex) fprintf(stderr, " Checking queue for aborted jobs (fid %d)\n", fid); + for (cur = head; cur != NULL; cur = cur->next) { + if (cur->status == STATUS_ABORTED) { + cur->status = STATUS_RUNNING; + + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + + return cur; + } + } + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + + /* Otherwise, read a new one. */ + if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid); + if (log_mutex) fprintf(stderr, " Reading input for new data (fid %d)\n", fid); + pthread_mutex_lock(&input_mutex); + s = read_line(0,0); + + while (s) { + if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); + pthread_mutex_lock(&queue_mutex); + if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid); + pthread_mutex_unlock(&input_mutex); + + cur = malloc(sizeof (struct line)); + cur->id = n_sent; + cur->s = s; + cur->next = NULL; + + *ptail = cur; + ptail = &cur->next; + + n_sent++; + + if (strcmp(s,"===SYNCH===\n")==0){ + fprintf(stderr, "Received ===SYNCH=== signal (fid %d)\n", fid); + // Note: queue_finish calls free(cur->s). + // Therefore we need to create a new string here. 
+ synch = malloc((strlen("===SYNCH===\n")+2) * sizeof (char)); + synch = strcpy(synch, s); + + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + queue_finish(cur, synch, fid); /* handles its own lock */ + + if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid); + if (log_mutex) fprintf(stderr, " Reading input for new data (fid %d)\n", fid); + pthread_mutex_lock(&input_mutex); + + s = read_line(0,0); + } else { + if (log_mutex) fprintf(stderr, " Received new data %d (fid %d)\n", cur->id, fid); + cur->status = STATUS_RUNNING; + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + return cur; + } + } + + if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid); + pthread_mutex_unlock(&input_mutex); + /* Only way to reach this point: no more output */ + + if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); + pthread_mutex_lock(&queue_mutex); + if (head == NULL) { + fprintf(stderr, "Reached end of file. Exiting.\n"); + done(0); + } else + ptail = NULL; /* This serves as a signal that there is no more input */ + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + + return NULL; +} + +void queue_panic() { + struct line *next; + while (head && head->status == STATUS_FINISHED) { + /* Write out finished sentences */ + if (head->status == STATUS_FINISHED) { + fputs(head->s, stdout); + fflush(stdout); + } + /* Write out blank line for unfinished sentences */ + if (head->status == STATUS_ABORTED) { + fputs("\n", stdout); + fflush(stdout); + } + /* By defition, there cannot be any RUNNING sentences, since + function is only called when n_clients == 0 */ + free(head->s); + next = head->next; + free(head); + head = next; + n_flushed++; + } + fclose(stdout); + fprintf(stderr, "All clients died. 
Panicking, flushing completed sentences and exiting.\n"); + done(1); +} + +void queue_abort(struct line *node, int fid) { + if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); + pthread_mutex_lock(&queue_mutex); + node->status = STATUS_ABORTED; + if (n_clients == 0) { + if (stay_alive) { + fprintf(stderr, "Warning! No live clients detected! Staying alive, will retry soon.\n"); + } else { + queue_panic(); + } + } + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); +} + + +void queue_print() { + struct line *cur; + + fprintf(stderr, " Queue\n"); + + for (cur = head; cur != NULL; cur = cur->next) { + switch(cur->status) { + case STATUS_RUNNING: + fprintf(stderr, " %d running ", cur->id); break; + case STATUS_ABORTED: + fprintf(stderr, " %d aborted ", cur->id); break; + case STATUS_FINISHED: + fprintf(stderr, " %d finished ", cur->id); break; + + } + fprintf(stderr, "\n"); + //fprintf(stderr, cur->s); + } +} + +void queue_finish(struct line *node, char *s, int fid) { + struct line *next; + if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); + pthread_mutex_lock(&queue_mutex); + + free(node->s); + node->s = s; + node->status = STATUS_FINISHED; + n_received++; + + /* Flush out finished nodes */ + while (head && head->status == STATUS_FINISHED) { + + if (log_mutex) fprintf(stderr, " Flushing finished node %d\n", head->id); + + fputs(head->s, stdout); + fflush(stdout); + if (log_mutex) fprintf(stderr, " Flushed node %d\n", head->id); + free(head->s); + + next = head->next; + free(head); + + head = next; + + n_flushed++; + + if (head == NULL) { /* empty queue */ + if (ptail == NULL) { /* This can only happen if set in queue_get as signal that there is no more input. */ + fprintf(stderr, "All sentences finished. 
Exiting.\n"); + done(0); + } else /* ptail pointed at something which was just popped off the stack -- reset to head*/ + ptail = &head; + } + } + + if (log_mutex) fprintf(stderr, " Flushing output %d\n", head->id); + fflush(stdout); + fprintf(stderr, "%d sentences sent, %d sentences finished, %d sentences flushed\n", n_sent, n_received, n_flushed); + + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + +} + +char * read_line(int fd, int multiline) { + int size = 80; + char errorbuf[100]; + char *s = malloc(size+2); + int result, errors=0; + int i = 0; + + result = read(fd, s+i, 1); + + while (1) { + if (result < 0) { + perror("read()"); + sprintf(errorbuf, "Error code: %d\n", errno); + fprintf(stderr, errorbuf); + errors++; + if (errors > 5) { + free(s); + return NULL; + } else { + sleep(1); /* retry after delay */ + } + } else if (result == 0) { + break; + } else if (multiline==0 && s[i] == '\n') { + break; + } else { + if (s[i] == '\n'){ + /* if we've reached this point, + then multiline must be 1, and we're + going to poll the fd for an additional + line of data. The basic design is to + run a select on the filedescriptor fd. + Select will return under two conditions: + if there is data on the fd, or if a + timeout is reached. We'll select on this + fd. If select returns because there's data + ready, keep going; else assume there's no + more and return the data we already have. 
+ */ + + fd_set set; + FD_ZERO(&set); + FD_SET(fd, &set); + + struct timeval timeout; + timeout.tv_sec = 3; // number of seconds for timeout + timeout.tv_usec = 0; + + int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout); + if (ready<1){ + break; // no more data, stop looping + } + } + i++; + + if (i == size) { + size = size*2; + s = realloc(s, size+2); + } + } + + result = read(fd, s+i, 1); + } + + if (result == 0 && i == 0) { /* end of file */ + free(s); + return NULL; + } + + s[i] = '\n'; + s[i+1] = '\0'; + + return s; +} + +void * new_client(void *arg) { + struct clientinfo *client = (struct clientinfo *)arg; + struct line *cur; + int result; + char *s; + char errorbuf[100]; + + pthread_mutex_lock(&clients_mutex); + n_clients++; + pthread_mutex_unlock(&clients_mutex); + + fprintf(stderr, "Client connected (%d connected)\n", n_clients); + + for (;;) { + + cur = queue_get(client->s); + + if (cur) { + /* fprintf(stderr, "Sending to client: %s", cur->s); */ + fprintf(stderr, "Sending data %d to client (fid %d)\n", cur->id, client->s); + result = write(client->s, cur->s, strlen(cur->s)); + if (result < strlen(cur->s)){ + perror("write()"); + sprintf(errorbuf, "Error code: %d\n", errno); + fprintf(stderr, errorbuf); + + pthread_mutex_lock(&clients_mutex); + n_clients--; + pthread_mutex_unlock(&clients_mutex); + + fprintf(stderr, "Client died (%d connected)\n", n_clients); + queue_abort(cur, client->s); + + close(client->s); + free(client); + + pthread_exit(NULL); + } + } else { + close(client->s); + pthread_mutex_lock(&clients_mutex); + n_clients--; + pthread_mutex_unlock(&clients_mutex); + fprintf(stderr, "Client dismissed (%d connected)\n", n_clients); + pthread_exit(NULL); + } + + s = read_line(client->s,expect_multiline_output); + if (s) { + /* fprintf(stderr, "Client (fid %d) returned: %s", client->s, s); */ + fprintf(stderr, "Client (fid %d) returned data %d\n", client->s, cur->id); +// queue_print(); + queue_finish(cur, s, client->s); + } else { + 
pthread_mutex_lock(&clients_mutex); + n_clients--; + pthread_mutex_unlock(&clients_mutex); + + fprintf(stderr, "Client died (%d connected)\n", n_clients); + queue_abort(cur, client->s); + + close(client->s); + free(client); + + pthread_exit(NULL); + } + + } + return 0; +} + +void done (int code) { + close(s); + exit(code); +} + + + +int main (int argc, char *argv[]) { + struct sockaddr_in sin, from; + int g; + socklen_t len; + struct clientinfo *client; + int port; + int opt; + int errors = 0; + int argi; + char *key = NULL, *client_key; + int use_key = 0; + /* the key stuff here doesn't provide any + real measure of security, it's mainly to keep + jobs from bumping into each other. */ + + pthread_t tid; + port = DEFAULT_PORT; + + for (argi=1; argi < argc; argi++){ + if (strcmp(argv[argi], "-m")==0){ + expect_multiline_output = 1; + } else if (strcmp(argv[argi], "-k")==0){ + argi++; + if (argi == argc){ + fprintf(stderr, "Key must be specified after -k\n"); + exit(1); + } + key = argv[argi]; + use_key = 1; + } else if (strcmp(argv[argi], "--stay-alive")==0){ + stay_alive = 1; /* dont panic and die with zero clients */ + } else { + port = atoi(argv[argi]); + } + } + + /* Initialize data structures */ + head = NULL; + ptail = &head; + + /* Set up listener */ + s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + opt = 1; + setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(port); + while (bind(s, (struct sockaddr *) &sin, sizeof(sin)) < 0) { + perror("bind()"); + sleep(1); + errors++; + if (errors > 100) + exit(1); + } + + len = sizeof(sin); + getsockname(s, (struct sockaddr *) &sin, &len); + + fprintf(stderr, "Listening on port %hu\n", ntohs(sin.sin_port)); + + while (listen(s, MAX_CLIENTS) < 0) { + perror("listen()"); + sleep(1); + errors++; + if (errors > 100) + exit(1); + } + + for (;;) { + len = sizeof(from); + g = accept(s, (struct sockaddr *)&from, &len); + if 
(g < 0) { + perror("accept()"); + sleep(1); + continue; + } + client = malloc(sizeof(struct clientinfo)); + client->s = g; + bcopy(&from, &client->sin, len); + + if (use_key){ + fd_set set; + FD_ZERO(&set); + FD_SET(client->s, &set); + + struct timeval timeout; + timeout.tv_sec = 3; // number of seconds for timeout + timeout.tv_usec = 0; + + int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout); + if (ready<1){ + fprintf(stderr, "Prospective client failed to respond with correct key.\n"); + close(client->s); + free(client); + } else { + client_key = read_line(client->s,0); + client_key[strlen(client_key)-1]='\0'; /* chop trailing newline */ + if (strcmp(key, client_key)==0){ + pthread_create(&tid, NULL, new_client, client); + } else { + fprintf(stderr, "Prospective client failed to respond with correct key.\n"); + close(client->s); + free(client); + } + free(client_key); + } + } else { + pthread_create(&tid, NULL, new_client, client); + } + } + +} + + + diff --git a/training/utils/sentserver.h b/training/utils/sentserver.h new file mode 100644 index 00000000..cd17a546 --- /dev/null +++ b/training/utils/sentserver.h @@ -0,0 +1,6 @@ +#ifndef SENTSERVER_H +#define SENTSERVER_H + +#define DEFAULT_PORT 50000 + +#endif |