Diffstat (limited to 'training')
156 files changed, 11101 insertions, 2574 deletions
diff --git a/training/Jamfile b/training/Jamfile
deleted file mode 100644
index 073451fa..00000000
--- a/training/Jamfile
+++ /dev/null
@@ -1,25 +0,0 @@
-import testing ;
-import option ;
-
-lib training :
-  ..//utils
-  ..//mteval
-  ..//decoder
-  ../klm/lm//kenlm
-  ..//boost_program_options
-  ttables.cc
-  : <include>.
-  : :
-  <library>..//decoder
-  <library>../klm/lm//kenlm
-  <library>..//utils
-  <library>..//mteval
-  <library>..//boost_program_options
-  ;
-
-exe model1 : model1.cc : <include>../decoder ;
-
-# // all_tests [ glob *_test.cc ] : ..//decoder : <testing.arg>$(TOP)/decoder/test_data ;
-
-alias programs : model1 ;
-
diff --git a/training/Makefile.am b/training/Makefile.am
index 5254333a..e95e045f 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -1,91 +1,11 @@
-bin_PROGRAMS = \
-  fast_align \
-  lbl_model \
-  test_ngram \
-  mr_em_map_adapter \
-  mr_em_adapted_reduce \
-  mr_reduce_to_weights \
-  mr_optimize_reduce \
-  grammar_convert \
-  plftools \
-  collapse_weights \
-  mpi_extract_reachable \
-  mpi_extract_features \
-  mpi_online_optimize \
-  mpi_flex_optimize \
-  mpi_batch_optimize \
-  mpi_compute_cllh \
-  augment_grammar
+SUBDIRS = \
+  liblbfgs \
+  utils \
+  crf \
+  minrisk \
+  dpmert \
+  pro \
+  dtrain \
+  mira \
+  rampion
-noinst_PROGRAMS = \
-  lbfgs_test \
-  optimize_test
-
-TESTS = lbfgs_test optimize_test
-
-noinst_LIBRARIES = libtraining.a
-libtraining_a_SOURCES = \
-  candidate_set.cc \
-  entropy.cc \
-  optimize.cc \
-  online_optimizer.cc \
-  risk.cc
-
-mpi_online_optimize_SOURCES = mpi_online_optimize.cc
-mpi_online_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-mpi_flex_optimize_SOURCES = mpi_flex_optimize.cc
-mpi_flex_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-mpi_extract_reachable_SOURCES = mpi_extract_reachable.cc
-mpi_extract_reachable_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-mpi_extract_features_SOURCES = mpi_extract_features.cc
-mpi_extract_features_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc cllh_observer.cc
-mpi_batch_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-mpi_compute_cllh_SOURCES = mpi_compute_cllh.cc cllh_observer.cc
-mpi_compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-augment_grammar_SOURCES = augment_grammar.cc
-augment_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-test_ngram_SOURCES = test_ngram.cc
-test_ngram_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-fast_align_SOURCES = fast_align.cc ttables.cc
-fast_align_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
-lbl_model_SOURCES = lbl_model.cc
-lbl_model_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
-grammar_convert_SOURCES = grammar_convert.cc
-grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
-optimize_test_SOURCES = optimize_test.cc
-optimize_test_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
-collapse_weights_SOURCES = collapse_weights.cc
-collapse_weights_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
-lbfgs_test_SOURCES = lbfgs_test.cc
-lbfgs_test_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
-mr_optimize_reduce_SOURCES = mr_optimize_reduce.cc
-mr_optimize_reduce_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
-mr_em_map_adapter_SOURCES = mr_em_map_adapter.cc
-mr_em_map_adapter_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
-mr_reduce_to_weights_SOURCES = mr_reduce_to_weights.cc
-mr_reduce_to_weights_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
-mr_em_adapted_reduce_SOURCES = mr_em_adapted_reduce.cc
-mr_em_adapted_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
-plftools_SOURCES = plftools.cc
-plftools_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/utils -I$(top_srcdir)/mteval -I../klm
diff --git a/training/add-model1-features-to-scfg.pl b/training/add-model1-features-to-scfg.pl
deleted file mode 100755
index a0074317..00000000
--- a/training/add-model1-features-to-scfg.pl
+++ /dev/null
@@ -1,93 +0,0 @@
-#!/usr/bin/perl -w
-
-# [X] ||| so [X,1] die [X,2] der ||| as [X,1] existing [X,2] the ||| 2.47712135315 2.53182387352 5.07100057602 ||| 0-0 2-2 4-4
-# [X] ||| so [X,1] die [X,2] der ||| this [X,1] the [X,2] of ||| 2.47712135315 3.19828724861 2.38270020485 ||| 0-0 2-2 4-4
-# [X] ||| so [X,1] die [X,2] der ||| as [X,1] the [X,2] the ||| 2.47712135315 2.53182387352 1.48463630676 ||| 0-0 2-2 4-4
-# [X] ||| so [X,1] die [X,2] der ||| is [X,1] the [X,2] of the ||| 2.47712135315 3.45197868347 2.64251494408 ||| 0-0 2-2 4-4 4-5
-
-die "Usage: $0 model1.f-e model1.e-f < grammar.scfg\n  (use training/model1 to extract the model files)\n" unless scalar @ARGV == 2;
-
-my $fm1 = shift @ARGV;
-die unless $fm1;
-my $frm1 = shift @ARGV;
-die unless $frm1;
-open M1,"<$fm1" or die;
-open RM1,"<$frm1" or die;
-print STDERR "Loading Model 1 probs from $fm1...\n";
-my %m1;
-while(<M1>) {
-  chomp;
-  my ($f, $e, $lp) = split /\s+/;
-  $m1{$e}->{$f} = exp($lp);
-}
-close M1;
-
-print STDERR "Loading Inverse Model 1 probs from $frm1...\n";
-my %rm1;
-while(<RM1>) {
-  chomp;
-  my ($e, $f, $lp) = split /\s+/;
-  $rm1{$f}->{$e} = exp($lp);
-}
-close RM1;
-
-my @label = qw( EGivenF LexFGivenE LexEGivenF );
-while(<>) {
-  chomp;
-  my ($l, $f, $e, $sscores, $al) = split / \|\|\| /;
-  my @scores = split /\s+/, $sscores;
-  unless ($sscores =~ /=/) {
-    for (my $i=0; $i<3; $i++) { $scores[$i] = "$label[$i]=$scores[$i]"; }
-  }
-  push @scores, "RuleCount=1";
-  my @fs = split /\s+/, $f;
-  my @es = split /\s+/, $e;
-  my $flen = scalar @fs;
-  my $elen = scalar @es;
-  my $pgen = 0;
-  my $nongen = 0;
-  for (my $i =0; $i < $flen; $i++) {
-    my $ftot = 0;
-    next if ($fs[$i] =~ /\[X/);
-    my $cr = $rm1{$fs[$i]};
-    for (my $j=0; $j <= $elen; $j++) {
-      my $ej = '<eps>';
-      if ($j < $elen) { $ej = $es[$j]; }
-      my $p = $cr->{$ej};
-      if (defined $p) { $ftot += $p; }
-    }
-    if ($ftot == 0) { $nongen = 1; last; }
-    $pgen += log($ftot) - log($elen);
-  }
-  my $bad = 0;
-  my $good = 0;
-  unless ($nongen) { push @scores, "RGood=1"; $good++; } else { push @scores, "RBad=1"; $bad++; }
-
-  $nongen = 0;
-  $pgen = 0;
-  for (my $i =0; $i < $elen; $i++) {
-    my $etot = 0;
-    next if ($es[$i] =~ /\[X/);
-    my $cr = $m1{$es[$i]};
-#    print STDERR "$es[$i]\n";
-    for (my $j=0; $j <= $flen; $j++) {
-      my $fj = '<eps>';
-      if ($j < $flen) { $fj = $fs[$j]; }
-      my $p = $cr->{$fj};
-#      print STDERR "  $fs[$j] : $p\n";
-      if (defined $p) { $etot += $p; }
-    }
-    if ($etot == 0) { $nongen = 1; last; }
-    $pgen += log($etot) - log($flen);
-  }
-  unless ($nongen) {
-    push @scores, "FGood=1";
-    if ($good) { push @scores, "BothGood=1"; } else { push @scores, "SusDel=1"; }
-  } else {
-    push @scores, "FBad=1";
-    if ($bad) { push @scores, "BothBad=1"; } else { push @scores, "SusHall=1"; }
-  }
-  print "$l ||| $f ||| $e ||| @scores";
-  if (defined $al) { print " ||| $al\n"; } else { print "\n"; }
-}
-
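The deleted script's generability test is easier to read as a formula. A hedged restatement of what each loop accumulates, in standard IBM Model 1 notation with $e_0 = \langle\mathrm{eps}\rangle$ the empty word (this is a reading of the code above, not part of the patch):

    \[
      \log p_{\mathrm{gen}}(\mathbf{f}\mid\mathbf{e}) \;=\;
      \sum_{i\,:\,f_i\ \mathrm{terminal}}
      \Bigl( \log \sum_{j=0}^{|\mathbf{e}|} p_{\mathrm{M1}}(f_i \mid e_j)
             \;-\; \log |\mathbf{e}| \Bigr)
    \]

If any inner sum is zero the rule is flagged ungeneratable (RBad=1, resp. FBad=1 for the reverse direction); otherwise RGood=1 / FGood=1. Note that $pgen itself is computed but never emitted as a feature; only its zero test is used.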
diff --git a/training/collapse_weights.cc b/training/collapse_weights.cc
deleted file mode 100644
index dc480f6c..00000000
--- a/training/collapse_weights.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-char const* NOTES =
-  "ZF_and_E means unnormalized scaled features.\n"
-  "For grammars with one nonterminal: F_and_E is joint,\n"
-  "F_given_E and E_given_F are conditional.\n"
-  "TODO: group rules by root nonterminal and then normalize.\n";
-
-
-#include <iostream>
-#include <fstream>
-#include <tr1/unordered_map>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-#include <boost/functional/hash.hpp>
-
-#include "prob.h"
-#include "filelib.h"
-#include "trule.h"
-#include "weights.h"
-
-namespace po = boost::program_options;
-using namespace std;
-
-typedef std::tr1::unordered_map<vector<WordID>, prob_t, boost::hash<vector<WordID> > > MarginalMap;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("grammar,g", po::value<string>(), "Grammar file")
-        ("weights,w", po::value<string>(), "Weights file")
-        ("unnormalized,u", "Always include ZF_and_E unnormalized score (default: only if sum was >1)")
-        ;
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config,c", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    const string cfg = (*conf)["config"].as<string>();
-    cerr << "Configuration file: " << cfg << endl;
-    ifstream config(cfg.c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("help") || !conf->count("grammar") || !conf->count("weights")) {
-    cerr << dcmdline_options << endl;
-    cerr << NOTES << endl;
-    exit(1);
-  }
-}
-
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-  const string wfile = conf["weights"].as<string>();
-  const string gfile = conf["grammar"].as<string>();
-  vector<weight_t> w;
-  Weights::InitFromFile(wfile, &w);
-  MarginalMap e_tots;
-  MarginalMap f_tots;
-  prob_t tot;
-  {
-    ReadFile rf(gfile);
-    assert(*rf.stream());
-    istream& in = *rf.stream();
-    cerr << "Computing marginals...\n";
-    int lc = 0;
-    while(in) {
-      string line;
-      getline(in, line);
-      ++lc;
-      if (line.empty()) continue;
-      TRule tr(line, true);
-      if (tr.GetFeatureValues().empty())
-        cerr << "Line " << lc << ": empty features - may introduce bias\n";
-      prob_t prob;
-      prob.logeq(tr.GetFeatureValues().dot(w));
-      e_tots[tr.e_] += prob;
-      f_tots[tr.f_] += prob;
-      tot += prob;
-    }
-  }
-  bool normalized = (fabs(log(tot)) < 0.001);
-  cerr << "Total: " << tot << (normalized ? " [normalized]" : " [scaled]") << endl;
-  ReadFile rf(gfile);
-  istream&in = *rf.stream();
-  while(in) {
-    string line;
-    getline(in, line);
-    if (line.empty()) continue;
-    TRule tr(line, true);
-    const double lp = tr.GetFeatureValues().dot(w);
-    if (isinf(lp)) { continue; }
-    tr.scores_.clear();
-
-    cout << tr.AsString() << " ||| F_and_E=" << lp - log(tot);
-    if (!normalized || conf.count("unnormalized")) {
-      cout << ";ZF_and_E=" << lp;
-    }
-    cout << ";F_given_E=" << lp - log(e_tots[tr.e_])
-         << ";E_given_F=" << lp - log(f_tots[tr.f_]) << endl;
-  }
-  return 0;
-}
-
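In terms of the NOTES string above, with $s(r)=\mathbf{w}\cdot\boldsymbol\phi(r)$ the unnormalized log-score of rule $r$, the deleted program writes (again a restatement of the code, not part of the patch):

    \[
      \mathrm{F\_and\_E}(r) = s(r) - \log Z, \qquad Z = \sum_{r'} e^{s(r')},
    \]
    \[
      \mathrm{F\_given\_E}(r) = s(r) - \log\!\!\sum_{r':\,e(r')=e(r)}\!\! e^{s(r')}, \qquad
      \mathrm{E\_given\_F}(r) = s(r) - \log\!\!\sum_{r':\,f(r')=f(r)}\!\! e^{s(r')}.
    \]

So F_and_E is the joint log-probability, the other two are the conditionals keyed on the rule's target and source sides (the e_tots / f_tots marginals), and ZF_and_E is $s(r)$ itself.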
diff --git a/training/crf/Makefile.am b/training/crf/Makefile.am
new file mode 100644
index 00000000..4a8c30fd
--- /dev/null
+++ b/training/crf/Makefile.am
@@ -0,0 +1,31 @@
+bin_PROGRAMS = \
+  mpi_batch_optimize \
+  mpi_compute_cllh \
+  mpi_extract_features \
+  mpi_extract_reachable \
+  mpi_flex_optimize \
+  mpi_online_optimize \
+  mpi_baum_welch
+
+mpi_baum_welch_SOURCES = mpi_baum_welch.cc
+mpi_baum_welch_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a -lz
+
+mpi_online_optimize_SOURCES = mpi_online_optimize.cc
+mpi_online_optimize_LDADD = ../../training/utils/libtraining_utils.a ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a -lz
+
+mpi_flex_optimize_SOURCES = mpi_flex_optimize.cc
+mpi_flex_optimize_LDADD = ../../training/utils/libtraining_utils.a ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a -lz
+
+mpi_extract_reachable_SOURCES = mpi_extract_reachable.cc
+mpi_extract_reachable_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a -lz
+
+mpi_extract_features_SOURCES = mpi_extract_features.cc
+mpi_extract_features_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a -lz
+
+mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc cllh_observer.cc cllh_observer.h
+mpi_batch_optimize_LDADD = ../../training/utils/libtraining_utils.a ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a -lz
+
+mpi_compute_cllh_SOURCES = mpi_compute_cllh.cc cllh_observer.cc cllh_observer.h
+mpi_compute_cllh_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a -lz
+
+AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wall -Wno-sign-compare -I$(top_srcdir)/training -I$(top_srcdir)/training/utils -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/training/crf/baum_welch_example/README.md b/training/crf/baum_welch_example/README.md
new file mode 100644
index 00000000..97525da5
--- /dev/null
+++ b/training/crf/baum_welch_example/README.md
@@ -0,0 +1,32 @@
+Here's how to do Baum-Welch training with `cdec`.
+
+## Set the tags you want.
+
+First, set the number of tags you want in tagset.txt (these
+can be any symbols, listed one after another, separated
+by whitespace), e.g.:
+
+    C1 C2 C3 C4
+
+## Extract the parameter feature names
+
+    ../mpi_extract_features -c cdec.ini -t train.txt
+
+If you have compiled with MPI, you can use `mpirun`:
+
+    mpirun -np 8 ../mpi_extract_features -c cdec.ini -t train.txt
+
+## Randomly initialize the weights file
+
+    sort -u features.* | ./random_init.pl > weights.init
+
+## Run training
+
+    ../mpi_baum_welch -c cdec.ini -t train.txt -w weights.init -n 50
+
+Again, if you have compiled with MPI, you can use `mpirun`:
+
+    mpirun -np 8 ../mpi_baum_welch -c cdec.ini -t train.txt -w weights.init -n 50
+
+The `-n` flag indicates how many iterations to run for.
+
diff --git a/training/crf/baum_welch_example/cdec.ini b/training/crf/baum_welch_example/cdec.ini
new file mode 100644
index 00000000..61203da7
--- /dev/null
+++ b/training/crf/baum_welch_example/cdec.ini
@@ -0,0 +1,5 @@
+feature_function=Tagger_BigramIndicator
+feature_function=LexicalPairIndicator
+formalism=tagger
+tagger_tagset=tagset.txt
+intersection_strategy=full
diff --git a/training/crf/baum_welch_example/random_init.pl b/training/crf/baum_welch_example/random_init.pl
new file mode 100755
index 00000000..98467ed1
--- /dev/null
+++ b/training/crf/baum_welch_example/random_init.pl
@@ -0,0 +1,9 @@
+#!/usr/bin/perl -w
+while(<>) {
+  chomp;
+  my ($a,$b,@d) =split /\s+/;
+  die "Bad input" if scalar @d > 0;
+  $r = -rand() * rand() - 0.5;
+  $r = 0 if $a =~ /^Uni:/;
+  print "$a $r\n";
+}
diff --git a/training/crf/baum_welch_example/tagset.txt b/training/crf/baum_welch_example/tagset.txt
new file mode 100644
index 00000000..93a48451
--- /dev/null
+++ b/training/crf/baum_welch_example/tagset.txt
@@ -0,0 +1 @@
+1 2 3 4
diff --git a/training/crf/baum_welch_example/train.txt b/training/crf/baum_welch_example/train.txt
new file mode 100644
index 00000000..e9c3455e
--- /dev/null
+++ b/training/crf/baum_welch_example/train.txt
@@ -0,0 +1,2000 @@
+t h e +t o +o f +i n +a n d +a +s a i d +f o r +o n +t h a t +w i t h +w a s +i s +b y +a t +h e +a s +f r o m +i t +h a s +b e +h i s +h a v e +w i l l +a n +a r e +w e r e +b u t +n o t +w h o +a f t e r +h a d +y e a r +i t s +t w o +t h i s +w h i c h +t h e y +t h e i r +g o v e r n m e n t +b e e n +w e +p e r c e n t +w o u l d +n e w +i +a l s o +u p +m o r e +o n e +p e o p l e +f i r s t +l a s t +a b o u t +c h i n a +p r e s i d e n t +o v e r +m i l l i o n +o r +o u t +w o r l d +w h e n +a l l +o t h e r +m i n i s t e r +t h r e e +t h a n +u n i t e d +t h e r e +a g a i n s t +i n t o +c o u n t r y +s o m e +p o l i c e +n o +t i m e +y e a r s +s t a t e +w e d n e s d a y +t u e s d a y +t h u r s d a y +s t a t e s +m o n d a y +u s +c o u l d +i f +f r i d a y +s i n c e +b i l l i o n +s h e +f o r e i g n +o f f i c i a l s +d a y +i n t e r n a t i o n a l +h e r +b e t w e e n +o n l y +b e f o r e +s o u t h +w h i l e +d u r i n g
+n a t i o n a l +t o l d +s e c o n d +g r o u p +f o u r +d o w n +c i t y +p a r t y +t h e m +s e c u r i t y +d o +m a d e +d o l l a r s +p o i n t s +u n d e r +m i l i t a r y +b e c a u s e +w e e k +c o u n t r i e s +c a n +c h i n e s e +o f f +s u n d a y +m o s t +s o +h i m +e c o n o m i c +f o r m e r +i r a q +f i v e +s a t u r d a y +a c c o r d i n g +d i d +n o w +o f f i c i a l +m a y +n e w s +w a r +a n y +w h e r e +t e a m +m e e t i n g +k i l l e d +b a n k +s h o u l d +j u s t +r e p o r t e d +m a n y +n e x t +w h a t +c o m p a n y +i n c l u d i n g +b a c k +m o n t h +r e p o r t +o u r +p r i m e +m a r k e t +s t i l l +b e i n g +c o u r t +t r a d e +h e r e +p e a c e +h i g h +o l d +s e t +t h r o u g h +y o u +i s r a e l +t a l k s +e n d +t a k e +e x p e c t e d +p o l i t i c a l +s i x +s u c h +b o t h +m a k e +h o m e +l o c a l +j a p a n +r u s s i a +s a y i n g +g e n e r a l +t o p +a n o t h e r +e u r o p e a n +n o r t h +h e l d +t h i r d +m a j o r +s t a t e m e n t +w e l l +a m e r i c a n +i s r a e l i +t a i w a n +l e a d e r +c a p i t a l +l o n g +o i l +t h o s e +c a l l e d +p a r t +s p o k e s m a n +w o r k +d e v e l o p m e n t +a d d e d +s a y s +w o n +m e m b e r s +l e f t +c h i e f +g a m e +l i k e +t h e n +h e l p +s a y +p a l e s t i n i a n +v e r y +c u p +p u b l i c +f r a n c e +c e n t r a l +l e a d e r s +w i n +b u s h +m i n i s t r y +m o n t h s +g e t +w a y +d a y s +r e g i o n +s u p p o r t +t r o o p s +a g e n c y +f o r c e s +e a r l i e r +e v e n +n a t i o n s +v i s i t +g a m e s +e u +f i n a l +a m o n g +h o u s e +s e v e r a l +e a r l y +l e d +d l r s +l a t e r +w o m e n +k o n g +h o n g +p r e s s +p o w e r +t o d a y +o p e n +i n d e x +o f f i c e +f o l l o w i n g +a r o u n d +b a s e d +c o n f e r e n c e +b r i t i s h +c o u n c i l +u n i o n +t o o k +c a m e +w e s t +r u n +h o w e v e r +e a s t +l a t e +s e a s o n +g o o d +c l o s e +g e r m a n y +l e a d +p a s t +d e f e n s e +p l a c e +n u m b e r +a r m y +r u s s i a n +l a w +i n d i a +m e n +f i n a n c i a l +e c o n o m y +l e a s t +s e c r e t a r y +s a m e +y o r k +f o u n d +g o i n g +r i g h t +g o +m y +o p p o s i t i o n +f o r c e +a g r e e m e n t +e l e c t i o n +h o w +b u s i n e s s +f r e n c h +a u t h o r i t i e s +p l a y +m u c h +r i g h t s +t i m e s +c o m m i t t e e +r o u n d +p r o v i n c e +k o r e a +h a l f +a t t a c k +p r i c e s +s t o c k +h i t +p l a n +a r e a +c o o p e r a t i o n +s e v e n +n e a r +e x c h a n g e +u s e d +n u c l e a r +p a k i s t a n +b e i j i n g +a n n o u n c e d +a i r +a f r i c a +c e n t e r +a g o +t h e s e +d e c i s i o n +a t t a c k s +w i t h o u t +m a t c h +m a r c h +n a t i o n +h e a d +t o t a l +c o m p a n i e s +m a n +d e a l +w a s h i n g t o n +r e c e n t +c a s e +f i r e +n i g h t +a u s t r a l i a +a f r i c a n +u n t i l +i r a n +e l e c t i o n s +s o u t h e r n +l e a g u e +p u t +e a c h +m e m b e r +c h i l d r e n +h e a l t h +p c +p a r l i a m e n t +l o s t +t h i n k +d e a t h +m u s t +e i g h t +w o r k e r s +u s e +b r i t a i n +w a n t +s y s t e m +r e c o r d +d e p a r t m e n t +p r o g r a m +e f f o r t s +g r o w t h +r e s u l t s +i r a q i +i s s u e +b e s t +w h e t h e r +h u m a n +n o r t h e r n +c o n t r o l +f a r +f u r t h e r +a l r e a d y +s h a r e s +r e l a t i o n s +m e e t +s o l d i e r s +s e e +f r e e +c o m e +j a p 
a n e s e +m o n e y +d o l l a r +r e p o r t s +d i r e c t o r +s h a r e +g i v e +j u n e +c o m m i s s i o n +l a r g e s t +i n d u s t r y +c o n t i n u e +s t a r t +c a m p a i g n +l e a d i n g +q u a r t e r +i n f o r m a t i o n +v i c t o r y +r o s e +o t h e r s +a n t i +r e t u r n +f a m i l y +i s s u e s +s h o t +p o l i c y +e u r o p e +m e d i a +p l a n s +b o r d e r +n e e d +p e r +a r e a s +j u l y +v i o l e n c e +d e s p i t e +d o e s +s t r o n g +c h a i r m a n +s e r v i c e +v o t e +a s k e d +f o o d +f e d e r a l +w e n t +t a k e n +d u e +m o v e +o w n +b e g a n +i t a l y +g r o u p s +p o s s i b l e +l i f e +c l e a r +r a t e +f e l l +c r i s i s +r e b e l s +p o i n t +d e m o c r a t i c +w a t e r +a h e a d +i n v e s t m e n t +i n c r e a s e +s h o w +p l a y e r s +r e l e a s e d +g e r m a n +t o w n +n e a r l y +p r o c e s s +m i l e s +f e w +d i e d +a d m i n i s t r a t i o n +a p r i l +w e e k s +l e v e l +k e y +a s i a +s a l e s +w e a p o n s +c l o s e d +b e h i n d +m i n u t e s +a g r e e d +p r e s i d e n t i a l +g r e a t +m a k i n g +t o o +p r i c e +j o h n +g o t +f o u r t h +a g a i n +k i l o m e t e r s +s i t u a t i o n +m a i n +w h i t e +r e p o r t e r s +h o u r s +s e n i o r +a s i a n +r e p u b l i c +a w a y +g l o b a l +f i g h t i n g +s e r i e s +b e t t e r +n e w s p a p e r +m e +c l i n t o n +a r r e s t e d +h i g h e r +k n o w +f u t u r e +s c o r e d +g o l d +n a t o +m o r n i n g +n e v e r +b e a t +w i t h i n +r n +c u r r e n t +p e r i o d +b e c o m e +w e s t e r n +i m p o r t a n t +l o n d o n +a u s t r a l i a n +s p a i n +e n e r g y +a i d +a c c u s e d +o l y m p i c +n i n e +a c r o s s +f a c e +o r g a n i z a t i o n +g o a l +t a k i n g +i n j u r e d +s p e c i a l +r a c e +d a i l y +s u m m i t +s i d e +l i n e +o r d e r +u n i v e r s i t y +a f g h a n i s t a n +p l a y e d +b i g +c a r +t r y i n g +e n g l a n d +q u o t e d +d e +a l o n g +i s l a m i c +o u t s i d e +t r a d i n g +e d s +c u t +a c t i o n +p r o b l e m s +v i c e +w o r k i n g +y e n +b u i l d i n g +s i g n e d +k n o w n +c h a n g e +c h a r g e s +s m a l l +l o w e r +a l t h o u g h +s e n t +c o n g r e s s +h o s p i t a l +h o l d +m i g h t +u n +e v e r y +g i v e n +d e p u t y +i n t e r e s t +i s l a n d +s c h o o l +d r u g +k i l l i n g +r u l i n g +t o u r +o p e n i n g +t e r m +f u l l +c l r +l i t t l e +m a r k e t s +c o a c h +j a n u a r y +s c h e d u l e d +k e e p +t u r k e y +p r e v i o u s +e x e c u t i v e +g a s +m e t +j o i n t +t r i a l +b o a r d +p r o d u c t i o n +i n d o n e s i a +s e r v i c e s +l i k e l y +t h o u s a n d s +i n d i a n +p o s t +a r a b +c e n t s +h o p e +s i n g a p o r e +p a l e s t i n i a n s +p a r t i e s +g a v e +b i l l +d e a d +r o l e +s e p t e m b e r +t e l e v i s i o n +c o m m u n i t y +r e g i o n a l +a d d i n g +a m e r i c a +o n c e +y u a n +t e s t +s t o c k s +w h o s e +p a y +p r i v a t e +l a t e s t +i n v e s t o r s +f r o n t +c a n a d a +r e l e a s e +r e c e i v e d +m e a n w h i l e +l e s s +t h a i l a n d +l a n d +c h a m p i o n +r e a c h e d +u r g e d +d e c e m b e r +a s s o c i a t i o n +f i g h t +s i d e s +s t a r t e d +l a r g e +y e t +m i d d l e +c a l l +p r e s s u r e +e n d e d +s o c i a l +p r o j e c t +l o w +h a r d +c l u b +p r e m i e r +t e c h n o l o g y +f a i l e d +t o u r n a m e n t +r e a l +p 
r o v i d e +g a z a +m i n u t e +a f f a i r s +m i n i s t e r s +p r o d u c t s +r e s e a r c h +s e e n +g e o r g e +e v e n t +s t o p +i n v e s t i g a t i o n +a i r p o r t +m e x i c o +t i t l e +t o k y o +e a s t e r n +b i g g e s t +y o u n g +d e m a n d +t h o u g h +a r m e d +s a n +o p e n e d +m e a s u r e s +n o v e m b e r +a v e r a g e +m a r k +o c t o b e r +k o r e a n +r a d i o +b o d y +s e c t o r +c a b i n e t +g m t +a s s o c i a t e d +a p +c i v i l +t e r r o r i s m +s h o w e d +p r i s o n +s i t e +p r o b l e m +s e s s i o n +b r a z i l +m u s l i m +c o a l i t i o n +b a g h d a d +b i d +s t r e e t +c o m i n g +b e l i e v e +m a l a y s i a +s t u d e n t s +d e c i d e d +f i e l d +r e d +n e g o t i a t i o n s +w i n n i n g +o p e r a t i o n +c r o s s +s o o n +p l a n n e d +a b l e +t i e s +t a x +j u s t i c e +d o m e s t i c +d a v i d +i n c l u d e +n a m e +b o m b +t r a i n i n g +j u d g e +v i c t i m s +m e d i c a l +c o n d i t i o n +f i n d +r e m a i n +i s s u e d +f i n a n c e +l o t +l a b o r +b t +e n o u g h +i m m e d i a t e l y +s h o r t +l o s s +a n n u a l +m o v e d +r e b e l +s t r i k e +r o a d +r e c e n t l y +i t a l i a n +c o n s t r u c t i o n +t r y +a u g u s t +e x p r e s s e d +m i l i t a n t s +t o g e t h e r +w a n t e d +r a t e s +f u n d +f o r w a r d +m i s s i o n +d i s c u s s +r e s u l t +c a l l s +k o s o v o +o p e r a t i o n s +c a s e s +z e a l a n d +s o u r c e s +i n c r e a s e d +l e g a l +b a n k s +i n v o l v e d +o f f i c e r s +l e a v e +m e t e r s +w a r n e d +h a v i n g +r e a c h +b r i n g +h i s t o r y +d i s t r i c t +j o b +a l l o w e d +a r r i v e d +t o w a r d +c l a i m e d +e g y p t +t e a m s +a l l o w +a l m o s t +f e b r u a r y +s e r i o u s +p o o r +c o n t i n u e d +s t e p +i n t e r v i e w +e d u c a t i o n +n o n +r e a l l y +s t a r +l e e +r e s i d e n t s +b a n +s o c c e r +n e e d e d +p a r i s +i n d u s t r i a l +p l a y e r +m o s c o w +s t a t i o n +o f f e r +h u n d r e d s +t a l i b a n +w o m a n +m a n a g e m e n t +l e b a n o n +n o t e d +c h e n +p o s i t i o n +f i n i s h e d +c o s t +e x p e r t s +e v e r +m o v e m e n t +t e r r o r i s t +p l a n e +b l a c k +d i f f e r e n t +b e l i e v e d +p l a y i n g +c a u s e d +h o p e s +c o n d i t i o n s +b r o u g h t +f o r c e d +l a u n c h e d +w e e k e n d +m i c h a e l +s e a +r i s e +d e t a i l s +s p o r t s +e t h n i c +s t a f f +c h a n c e +g o a l s +b u d g e t +h a n d +b a s e +s e c o n d s +s r i +s p e a k i n g +o f f i c e r +m a j o r i t y +w a n t s +c h a r g e d +s h a n g h a i +v i e t n a m +x i n h u a +c o m m e n t +d r o p p e d +t u r n e d +p r o t e s t +r e f o r m +s u s p e c t e d +a m i d +t r i e d +c i t i e s +g r o u n d +t u r k i s h +s t a g e +e f f o r t +s +c o m m u n i s t +a n a l y s t s +h a m a s +p r o j e c t s +c o n t r a c t +i n d e p e n d e n c e +l o o k i n g +a m +s i g n +f o l l o w e d +r e m a i n s +c o m p a r e d +u s i n g +h e a v y +a f t e r n o o n +s t r a i g h t +l o o k +f a l l +r e a d y +e u r o +c h a r g e +w o u n d e d +p r o g r e s s +p a c i f i c +d e n i e d +h o u r +c a r e e r +c o n f i r m e d +t h a i +r u l e +c o u r s e +w i f e +e x p o r t s +b e c a m e +a m e r i c a n s +e m e r g e n c y +a r a f a t +r e f u s e d +l i s t +a l l e g e d +c h a m p i o n s h i p +p o p u l a t i o n +n e e d s +c o m p e t i 
t i o n +o r d e r e d +s a f e t y +a u t h o r i t y +i l l e g a l +t v +d o n e +e v i d e n c e +s t a y +f i f t h +s e e k i n g +s t u d y +l i v e +r u n s +c o a s t +s a u d i +h e l p e d +a c t i v i t i e s +m a n a g e r +w o r t h +k i n g +g r o w i n g +r u n n i n g +f i r e d +i n c l u d e d +p a u l +w a l l +r e t u r n e d +c o n f l i c t +m y a n m a r +d e m o c r a c y +p r o +f o r m +a l w a y s +a m b a s s a d o r +m a t c h e s +t h i n g s +m a i n l a n d +s a w +d i s e a s e +r e l a t e d +f u n d s +i n d e p e n d e n t +t o n s +a p p r o v e d +e m b a s s y +c u r r e n c y +b r e a k +s e n a t e +c o n c e r n s +f i g u r e s +j o i n +r e s o l u t i o n +o f t e n +c o n f i d e n c e +e s p e c i a l l y +w i n n e r +c a r r i e d +i m p r o v e +s w e d e n +z i m b a b w e +t h r e a t +c u r r e n t l y +s i n g l e +h i m s e l f +l i v i n g +r e f u g e e s +a i m e d +c o u n t y +c a n n o t +a r m s +b u i l d +g e t t i n g +a p p e a r e d +d i f f i c u l t +s p a n i s h +r i v e r +m i s s i n g +e s t i m a t e d +s o m e t h i n g +p r o p o s e d +c e r e m o n y +i n s t e a d +b r o k e +c h u r c h +o l y m p i c s +s p a c e +p r o f i t +v i l l a g e +l i g h t +p e r f o r m a n c e +d e l e g a t i o n +t r i p +o v e r a l l +p a r t s +a c t +c o r r u p t i o n +d i v i s i o n +s i m i l a r +p o s i t i v e +c a m p +g r a n d +p o r t +s u p p o r t e r s +r e p u b l i c a n +b e g i n +j o n e s +p a r k +b i l a t e r a l +c l o u d y +d i p l o m a t i c +p r e s e n t +l o s +a r g e n t i n a +t r a v e l +s p e e c h +a t t e n t i o n +n e t +j o b s +a r r e s t +p r o s e c u t o r s +i n f l a t i o n +n a m e d +j o r d a n +s o n +g o v e r n m e n t s +r u l e s +p r o t e c t i o n +k e n y a +h o m e s +l i v e s +s e r b +s a n c t i o n s +a t t e m p t +e x p o r t +m e a n s +n i g e r i a +r e m a i n e d +t u r n +c r i m e s +c o n c e r n +e n v i r o n m e n t +p l a n t +l e t t e r +v a l u e +r e s p o n s e +a s s e m b l y +p r o p o s a l +h o l d i n g +b o m b i n g +e n s u r e +a f g h a n +r e s o u r c e s +f a m i l i e s +r e s t +i n s i d e +t h r o u g h o u t +m a t t e r +c a u s e +l a w m a k e r s +i i +f u e l +c a l i f o r n i a +e g y p t i a n +o w n e d +s u i c i d e +c z e c h +c a r e +a t t o r n e y +c l a i m s +v o t e r s +n e t w o r k +b a l l +p h i l i p p i n e +f o o t b a l l +s p o k e s w o m a n +i n c i d e n t +p r e v e n t +w h y +d e v e l o p i n g +c i v i l i a n s +e n g l i s h +o b a m a +i n t e r n e t +r i c e +s a d d a m +y o u r +u p d a t e s +l e t +d o i n g +a i r c r a f t +f l i g h t +a n g e l e s +i n t e l l i g e n c e +p h i l i p p i n e s +f a t h e r +c r e d i t +a l l i a n c e +t e r m s +r a i s e d +i r a n i a n +c h a n g e s +s y r i a +v a r i o u s +i n d o n e s i a n +l i +i r e l a n d +l e a v i n g +d e c l i n e d +c o m m o n +i n j u r y +t r e a t m e n t +a v a i l a b l e +c h a m p i o n s +e l e c t e d +s u m m e r +d a t a +o v e r s e a s +p a i d +c e n t u r y +n o t h i n g +f i r m +r e l i g i o u s +s w i t z e r l a n d +o f f e r e d +c h a m p i o n s h i p s +t h o u g h t +c a n d i d a t e +c o n s i d e r e d +r i s k +c r i m e +g o v e r n o r +f i l m +r a l l y +f l o r i d a +t e r r o r +d o u b l e +e q u i p m e n t +j e r u s a l e m +c a r r y i n g +p e r s o n +f e e l +t e r r i t o r y +a l +c o m m e r c i a l +u k r a i n e +b o d i e s +p r o t e s t s 
+n e t h e r l a n d s +f i n i s h +a c c e s s +t a r g e t +a u s t r i a +s o u r c e +r e p r e s e n t a t i v e s +s p e n t +j e w i s h +p o t e n t i a l +r i s i n g +t r e a t y +c a n a d i a n +a g e +c a +s p e n d i n g +n e c e s s a r y +r a i n +z o n e +c a r s +p r o m o t e +n a t u r a l +d a m a g e +f o c u s +w e a t h e r +p o l i c i e s +p r o t e c t +a i d s +c o +g i v i n g +b c +b a c k e d +l a n k a +a p p e a l +r e j e c t e d +f a n s +b a d +s o u t h e a s t +r i v a l +p l a n n i n g +b o s n i a +c o m e s +b u y +s o v i e t +h o t e l +d u t c h +q u e s t i o n +t a i p e i +b o o s t +c o s t s +i n s t i t u t e +s o c i e t y +s h o o t i n g +t h e m s e l v e s +e v e n t s +k i n d +p a p e r +w o r k e d +c o n s t i t u t i o n +u r g e n t +s e t t l e m e n t +e a r n i n g s +j o s e +m o t h e r +a c c i d e n t +f a c t +d r o p +r a n g e +h a n d s +s e e k +h u g e +l a w y e r +s t a r t i n g +h e a r t +c o m m a n d e r +t o u r i s m +p a s s e n g e r s +s u s p e c t s +h i g h e s t +p o p u l a r +s t a b i l i t y +s u p r e m e +b u s +r o b e r t +b a t t l e +p r o g r a m s +c u b a +w i n s +d r u g s +s u r v e y +h o s t +m u r d e r +d a t e +g u l f +w i l l i a m s +s e n d +s u f f e r e d +p e n a l t y +k e p t +s t a d i u m +c i t i z e n s +f i g u r e +h e a d q u a r t e r s +g u a r d +p u b l i s h e d +s t a n d +t e n n i s +c r e a t e +b e g i n n i n g +e v e n i n g +p h o n e +f o o t +r u l e d +c a s h +s o l d +c h i c a g o +p o l a n d +d e m o c r a t s +r e f o r m s +b o s n i a n +s u r e +c h i l d +m a y o r +a t t e n d +l e a d e r s h i p +e m p l o y e e s +t e l e p h o n e +l o s s e s +b o r n +a s s i s t a n c e +t h i n g +t r a i n +s u p p l y +e i t h e r +b u i l t +l a u n c h +c r u d e +m o v i n g +g r e e c e +t r a c k +r a i s e +d r i v e +r e s p o n s i b i l i t y +f e d e r a t i o n +c o l o m b i a +g r e e n +c o n c e r n e d +c a n d i d a t e s +n e w s p a p e r s +r e v i e w +i n t e r i o r +d e b t +w h o l e +t e x a s +m o s t l y +r e l i e f +f a r m e r s +g o o d s +p a k i s t a n i +d e g r e e s +s e l l +d e t a i n e d +s w i s s +c r i m i n a l +d e c a d e s +m i s s i l e +a b o v e +d r a w +p a s s e d +e x p l o s i o n +m a k e s +l a w s +b a n g l a d e s h +t a l k +m a d r i d +m a s s +c o n v i c t e d +i t e m s +m e d a l +s u c c e s s +s e a t s +q u i c k l y +c a l l i n g +k i m +t r a f f i c +d i r e c t +o r g a n i z a t i o n s +l e v e l s +s e r v e +a d d r e s s +s t r e s s e d +s t a n d i n g +w a n g +d e c l a r e d +j a m e s +c a p t a i n +t h r e a t e n e d +p r o m i s e d +s u d a n +v a n +p a s s +e n v i r o n m e n t a l +r a t h e r +w o r s t +p o u n d s +b l u e +s i x t h +m e t e r +i n c l u d e s +m u s i c +r e d u c e +t a k e s +v o t e s +r e s c u e +c o m p l e t e d +s e a r c h +i n n i n g s +v e h i c l e s +c l a i m +t r a n s p o r t +a v o i d +i n c o m e +p o l l +a f f e c t e d +g e o r g i a +g a i n e d +w o +r e +v i s i t i n g +r e s p o n s i b l e +e f f e c t +p o l l s +h e a r i n g +l o s i n g +e s t a b l i s h e d +f a i r +g i a n t +c h a l l e n g e +f e e t +p r o p e r t y +t e s t s +l e g +a g r i c u l t u r e +l o n g e r +d e a t h s +s q u a r e +p a r t i c u l a r l y +d i s p u t e +b +e n t e r p r i s e s +v o l u m e +c a r r y +m i d +s e p a r a t e +i d e n t i f i e d +i t s e l f +h e a d e d +a n o n y m i t y +p a r l i a m e 
n t a r y +c r a s h +r e m a i n i n g +j o u r n a l i s t s +i n c r e a s i n g +s t a t i s t i c s +d e s c r i b e d +b u r e a u +i n j u r i e s +p r o v i d e d +j o i n e d +i m m e d i a t e +d e b a t e +i m p a c t +m e s s a g e +m e e t i n g s +r e q u e s t +s c h o o l s +o c c u r r e d +r e m a r k s +c o m m i t t e d +p r o t e s t e r s +t o u g h +s p o k e +s t r i p +f a c e s +c r o w d +s h o w s +w a r n i n g +s t o r y +q u a l i t y +p e t e r +f r e e d o m +d e v e l o p +m a r t i n +p e r s o n a l +s e r b i a +a n y t h i n g +b l a m e d +i n t e r e s t s +n e i g h b o r i n g +d o c t o r s +f l i g h t s +s h i p +r e g i m e +b l a i r +u n i t +a g e n c i e s +a f p +s u g g e s t e d +l a c k +s e l l i n g +a n n a n +y u g o s l a v i a +l a +c o n s u m e r +s u s p e n d e d +s t o p p e d +c o m m e n t s +c o m p u t e r +c o n s i d e r +a i r l i n e s +l e b a n e s e +p r e p a r e d +d i a l o g u e +e x p e c t +t w i c e +p u t i n +a l l e g a t i o n s +b r o w n +a c c e p t +a p p r o v a l +w i d e +n e a r b y +s y s t e m s +v i e w +p u s h +p r o b a b l y +e v e r y t h i n g +d r a f t +t r a d i t i o n a l +s t a t u s +s t r u c k +s e i z e d +p a r t l y +s t a n d a r d +h u s s e i n +p o v e r t y +d o z e n s +r e g i o n s +c r i c k e t +l o a n s +e +b o o k +b a s i s +a n n o u n c e m e n t +r u r a l +s e r b s +a d d i t i o n +g r e e k +c o m p l e t e +r o o m +g r e a t e r +a l l e g e d l y +f i n a l s +f a c i n g +l i m i t e d +c u t s +r i c h a r d +b u s i n e s s e s +l i n k e d +p e a c e f u l +c r e w +t o u r i s t s +m a i n l y +p r i s o n e r s +p o w e r f u l +c r o a t i a +f i l e d +k u w a i t +f o r u m +r e s e r v e +m i l a n +b l a s t +a n n i v e r s a r y +a t t e n d e d +e n d i n g +d e v e l o p e d +c e r t a i n +b e l o w +f e l t +p r o v i n c i a l +c y p r u s +c r i t i c i z e d +o p p o r t u n i t y +s m i t h +p o l i t i c s +s e l f +h u m a n i t a r i a n +r e a s o n +l a w y e r s +r e v e n u e +d o c u m e n t s +w r o t e +q u e s t i o n s +n o r w a y +d o w +p a n e l +f e a r +s e n t e n c e d +b a n n e d +c i v i l i a n +c u l t u r a l +p e r s o n n e l +b e l g i u m +a b u +c a p a c i t y +a m o u n t +s e c u r i t i e s +b l o o d +s i g n i f i c a n t +e x p e r i e n c e +a s e a n +h o u s i n g +j o h n s o n +p h o t o s +r o y a l +i m p o r t s +a d d i t i o n a l +y e l t s i n +c d y +h e a r d +t h o m a s +b a n k i n g +l e a d s +v i s i t e d +f e a r s +u g a n d a +d r i v e r +c o n t r o l l e d +d e m a n d s +i n s t i t u t i o n s +a l i +c h r i s t i a n +s t o r m +f o r e c a s t +g r a f +f i g h t e r s +s t r e e t s +r e s p e c t +s p o t +w e b +m i s s e d +s c i e n c e +h e a d s +h i t s +m a s s i v e +c u l t u r e +c o u p l e +v e n e z u e l a +r e p o r t e d l y +i n s u r a n c e +s p r e a d +s o l u t i o n +p l a c e d +s e r v e d +f a c i l i t i e s +s t r a t e g y +t e c h n i c a l +s t e p s +d e e p +h o p e d +d e c i d e +s a l e +j a i l +d i s c u s s e d +s a v e +n e p a l +a r a b i a +e n v o y +a t t a c k e d +w a y s +r e c e i v e +h a p p y +h a l l +g u i l t y +p r a c t i c e +l o v e +e u r o s +o p e r a t i n g +c h a n g e d +b o s t o n +d e c a d e +d e f i c i t +p r o d u c t +l i n e s +p a t i e n t s +f r i e n d s +s y d n e y +a c c o r d +t i e d +s p e e d +w o r d s +t i e +s c o r e +c o n d u c t e d +c r i t i c i s m +m u s l i m s +b 
r o t h e r +c l a s s +r o m a n i a +h e l p i n g +f a s t +h a p p e n e d +d e f e n d i n g +n a v y +w i t n e s s e s +f u l l y +s u s p e c t +i s l a n d s +m a i n t a i n +p r e s e n c e +j a k a r t a +p a c k a g e +y a r d s +g a i n +a c c o u n t +s q u a d +s h a r o n +w i n g +a c t i o n s +a t h e n s +s t r a t e g i c +s t r e n g t h e n +f r i e n d l y +d e s t r o y e d +a p p a r e n t l y +c o n s e r v a t i v e +g a i n s +f a i l u r e +f u t u r e s +s h o t s +r e l a t i o n s h i p +c o m m i s s i o n e r +m a l a y s i a n +r e q u i r e d +a t l a n t a +a g r e e +d e f e a t +s t r i k e r +a d v a n c e d +b r a z i l i a n +a s s e t s +h o u s e s +s u p p l i e s +s a f e +m i l l i o n s +s o u g h t +f r e s h +v i d e o +p r o s e c u t o r +p u l l e d +v e h i c l e +t o l l +p a r e n t s +c e a s e +a c t i v i s t s +o r g a n i z e d +e n t e r e d +s h i i t e +l a n g u a g e +a b b a s +b i n +p r e v i o u s l y +c l o s i n g +w o r k s +t e r r o r i s t s +t o n y +c o v e r +f o l l o w +l e g i s l a t i v e +r i c h +c l a s h e s +i m p o s e d +r a n +m c c a i n +s u c c e s s f u l +s e v e n t h +s c o r i n g +c a u g h t +a p p o i n t e d +a l l i e s +a d m i t t e d +w o r l d w i d e +o r d e r s +d e m a n d e d +c r e a t e d +r a n k e d +m i l i t a n t +i n v e s t i g a t o r s +s h o w i n g +p o s s i b i l i t y +s e a t +d a u g h t e r +s i t e s +s h o r t l y +c o m m e r c e +n e t a n y a h u +a d v a n c e +a i r l i n e +f i r m s +a b r o a d +f o u n d a t i o n +c o m m i t m e n t +p l e d g e d +k i l l +r e p r e s e n t a t i v e +n o r t h w e s t +s c e n e +b e a t i n g +i m p r o v e d +r e s u m e +w h o m +s l i g h t l y +v o t i n g +b o m b i n g s +s e r i o u s l y +s e t t i n g +c a r l o s +e f f e c t i v e +h k +r e g u l a r +j i a n g +p r i n c e +d e c l i n e +b a y +n o r t h e a s t +s o l d i e r +r e a c h i n g +a g r e e m e n t s +m i k e +h u r t +c r i t i c a l +i d e a +m i l o s e v i c +f i s c a l +t a r g e t s +a g r i c u l t u r a l +m u s h a r r a f +d e s i g n e d +o v e r n i g h t +b o y +d o z e n +p r o d u c e +c a l m +s t a n d a r d s +l e g i s l a t i o n +s e n t e n c e +w i t h d r a w a l +s e e d e d +c o m p o s i t e +t r a d e d +w i n t e r +d a v i s +t r u s t +c l i m a t e +i n d u s t r i e s +p r o f i t s +v o t e d +c a m b o d i a +s y r i a n +s i g n s +l o a n +s t e e l +e l e c t r i c i t y +t e h r a n +c i t i n g +h u s b a n d +b i t +c o m b a t +h a n d e d +f e s t i v a l +i m f +p r e s i d e n c y +c a p t u r e d +s t u d e n t +f i n e +s t a t i o n s +s i l v e r +c h a v e z +i n t e r +m o m e n t +t a b l e +c o u p +p o p e +p r o v i n c e s +a h m e d +b u i l d i n g s +o u t p u t +l i b e r a t i o n +m o n e t a r y +c l o s e r +c o l l e g e +f l u +a d v a n t a g e +a s s i s t a n t +g o n e +s e c r e t +x +c a t h o l i c +n a m e s +l i s t e d +f i n a l l y +c a n c e r +p r o d u c e d +m e a s u r e +f l e d +l a r g e l y +d e f e a t e d +c o n g o +b a s i c +j e a n +l o s e +p r i z e +b a n g k o k +a s k +f r a n c i s c o +r e g i s t e r e d +d i s a s t e r +g o l f +i n d i v i d u a l +c o n t i n u e s +w t o +i n i t i a l +a n y o n e +q u a k e +f a c e d +s c i e n t i s t s +m o b i l e +p o s i t i o n s +f i e l d s +r e c o v e r y +m u s e u m +n u m b e r s +d e n m a r k +m a n i l a +h o l d s +c e n t +e x +e s t a b l i s h +w i d e l y +o f f i c e s 
+i n s i s t e d +u n i t s +k a s h m i r +r e f e r e n d u m +l o c a t e d +u p o n +a l l o w i n g +s c a l e +o p p o s e d +w a t c h +i n d i c a t e d +p a r t n e r +e a r t h q u a k e +s c a n d a l +e v e r y o n e +a p p r o a c h +t r u c k +i m p o r t a n c e +t h r e a t s +p o r t u g a l +s e x +r e c o r d s +s u p e r +s t o o d +c o n t a c t +m a t e r i a l s +v i o l e n t +p l a c e s +a n a l y s t +a d d s +a l o n e +g o e s +m o v i e +e x p e c t s +a r t +s e o u l +m e x i c a n +y e s t e r d a y +p l a n e s +n i n t h +o n l i n e +h e l i c o p t e r +i m m i g r a t i o n +p a r t n e r s +i n f r a s t r u c t u r e +b o a t +v i s i t s +n o r m a l +s t a k e +g u e r r i l l a s +m a c a o +w i l l i n g +s u n +a w a r d +t e l l +s o u t h w e s t +s p o r t +e n t e r +r e s o l v e +c h a n c e s +m i a m i +e l +e n t i r e
diff --git a/training/cllh_observer.cc b/training/crf/cllh_observer.cc
index 58232769..4ec2fa65 100644
--- a/training/cllh_observer.cc
+++ b/training/crf/cllh_observer.cc
@@ -45,7 +45,7 @@ void ConditionalLikelihoodObserver::NotifyAlignmentForest(const SentenceMetadata
     cerr << "DIFF. ERR! log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl;
     exit(1);
   }
-  assert(!isnan(log_ref_z));
+  assert(!std::isnan(log_ref_z));
   acc_obj += (cur_obj - log_ref_z);
   trg_words += smeta.GetReference().size();
 }
diff --git a/training/cllh_observer.h b/training/crf/cllh_observer.h
index 0de47331..0de47331 100644
--- a/training/cllh_observer.h
+++ b/training/crf/cllh_observer.h
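The isnan → std::isnan change above (repeated in the two optimizer files that follow) is a portability fix: in C++11, <cmath> guarantees only std::isnan, and the C99 isnan macro may be removed by the header, so the unqualified name can fail to compile on some standard libraries. A minimal illustration of the portable form, not part of the patch:

    #include <cassert>
    #include <cmath>  // C++11: declares std::isnan; the C macro may be gone

    int main() {
      double log_ref_z = 0.0;          // stand-in for the inside score checked above
      assert(!std::isnan(log_ref_z));  // qualified call compiles everywhere
      return 0;
    }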
diff --git a/training/mpi_batch_optimize.cc b/training/crf/mpi_batch_optimize.cc
index 6432f4a2..2eff07e4 100644
--- a/training/mpi_batch_optimize.cc
+++ b/training/crf/mpi_batch_optimize.cc
@@ -142,7 +142,7 @@ struct TrainingObserver : public DecoderObserver {
       cerr << "DIFF. ERR! log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl;
       exit(1);
     }
-    assert(!isnan(log_ref_z));
+    assert(!std::isnan(log_ref_z));
     ref_exp -= cur_model_exp;
     acc_grad -= ref_exp;
     acc_obj += (cur_obj - log_ref_z);
diff --git a/training/crf/mpi_baum_welch.cc b/training/crf/mpi_baum_welch.cc
new file mode 100644
index 00000000..d69b1769
--- /dev/null
+++ b/training/crf/mpi_baum_welch.cc
@@ -0,0 +1,316 @@
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+
+#include "config.h"
+#ifdef HAVE_MPI
+#include <boost/mpi/timer.hpp>
+#include <boost/mpi.hpp>
+namespace mpi = boost::mpi;
+#endif
+
+#include <boost/unordered_map.hpp>
+#include <boost/functional/hash.hpp>
+#include <boost/shared_ptr.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "sentence_metadata.h"
+#include "verbose.h"
+#include "hg.h"
+#include "prob.h"
+#include "inside_outside.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "stringlib.h"
+#include "fdict.h"
+#include "weights.h"
+#include "sparse_vector.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+    ("input_weights,w",po::value<string>(),"Input feature weights file")
+    ("iterations,n",po::value<unsigned>()->default_value(50), "Number of training iterations")
+    ("training_data,t",po::value<string>(),"Training data")
+    ("decoder_config,c",po::value<string>(),"Decoder configuration file");
+  po::options_description clo("Command line options");
+  clo.add_options()
+    ("config", po::value<string>(), "Configuration file")
+    ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || !conf->count("input_weights") || !(conf->count("training_data")) || !conf->count("decoder_config")) {
+    cerr << dcmdline_options << endl;
+    return false;
+  }
+  return true;
+}
+
+void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c) {
+  ReadFile rf(fname);
+  istream& in = *rf.stream();
+  string line;
+  int lc = 0;
+  while(in) {
+    getline(in, line);
+    if (!in) break;
+    if (lc % size == rank) c->push_back(line);
+    ++lc;
+  }
+}
+
+static const double kMINUS_EPSILON = -1e-6;
+
+struct TrainingObserver : public DecoderObserver {
+  void Reset() {
+    acc_grad.clear();
+    acc_obj = 0;
+    total_complete = 0;
+    trg_words = 0;
+  }
+
+  void SetLocalGradientAndObjective(vector<double>* g, double* o) const {
+    *o = acc_obj;
+    for (SparseVector<double>::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it)
+      (*g)[it->first] = it->second;
+  }
+
+  virtual void NotifyDecodingStart(const SentenceMetadata& smeta) {
+    state = 1;
+  }
+
+  // compute model expectations, denominator of objective
+  virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+    assert(state == 1);
+    trg_words += smeta.GetSourceLength();
+    state = 2;
+    SparseVector<prob_t> exps;
+    const prob_t z = InsideOutside<prob_t,
+                                   EdgeProb,
+                                   SparseVector<prob_t>,
+                                   EdgeFeaturesAndProbWeightFunction>(*hg, &exps);
+    exps /= z;
+    for (SparseVector<prob_t>::iterator it = exps.begin(); it != exps.end(); ++it)
+      acc_grad.add_value(it->first, it->second.as_float());
+
+    acc_obj += log(z);
+  }
+
+  // compute "empirical" expectations, numerator of objective
+  virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+    cerr << "Shouldn't get an alignment forest!\n";
+    abort();
+  }
+
+  virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) {
+    ++total_complete;
+  }
+
+  int total_complete;
+  SparseVector<double> acc_grad;
+  double acc_obj;
+  unsigned trg_words;
+  int state;
+};
+
+void ReadConfig(const string& ini, vector<string>* out) {
+  ReadFile rf(ini);
+  istream& in = *rf.stream();
+  while(in) {
+    string line;
+    getline(in, line);
+    if (!in) continue;
+    out->push_back(line);
+  }
+}
+
+void StoreConfig(const vector<string>& cfg, istringstream* o) {
+  ostringstream os;
+  for (int i = 0; i < cfg.size(); ++i) { os << cfg[i] << endl; }
+  o->str(os.str());
+}
+
+#if 0
+template <typename T>
+struct VectorPlus : public binary_function<vector<T>, vector<T>, vector<T> > {
+  vector<T> operator()(const vector<int>& a, const vector<int>& b) const {
+    assert(a.size() == b.size());
+    vector<T> v(a.size());
+    transform(a.begin(), a.end(), b.begin(), v.begin(), plus<T>());
+    return v;
+  }
+};
+#endif
+
+int main(int argc, char** argv) {
+#ifdef HAVE_MPI
+  mpi::environment env(argc, argv);
+  mpi::communicator world;
+  const int size = world.size();
+  const int rank = world.rank();
+#else
+  const int size = 1;
+  const int rank = 0;
+#endif
+  SetSilent(true);  // turn off verbose decoder output
+  register_feature_functions();
+
+  po::variables_map conf;
+  if (!InitCommandLine(argc, argv, &conf)) return 1;
+  const unsigned iterations = conf["iterations"].as<unsigned>();
+
+  // load cdec.ini and set up decoder
+  vector<string> cdec_ini;
+  ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini);
+  istringstream ini;
+  StoreConfig(cdec_ini, &ini);
+  Decoder* decoder = new Decoder(&ini);
+  if (decoder->GetConf()["input"].as<string>() != "-") {
+    cerr << "cdec.ini must not set an input file\n";
+    return 1;
+  }
+
+  // load initial weights
+  if (rank == 0) { cerr << "Loading weights...\n"; }
+  vector<weight_t>& lambdas = decoder->CurrentWeightVector();
+  Weights::InitFromFile(conf["input_weights"].as<string>(), &lambdas);
+  if (rank == 0) { cerr << "Done loading weights.\n"; }
+
+  // freeze feature set (should be optional?)
+  const bool freeze_feature_set = true;
+  if (freeze_feature_set) FD::Freeze();
+
+  const int num_feats = FD::NumFeats();
+  if (rank == 0) cerr << "Number of features: " << num_feats << endl;
+  lambdas.resize(num_feats);
+
+  vector<double> gradient(num_feats, 0.0);
+  vector<double> rcv_grad;
+  rcv_grad.clear();
+  bool converged = false;
+
+  vector<string> corpus, test_corpus;
+  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
+  assert(corpus.size() > 0);
+  if (conf.count("test_data"))
+    ReadTrainingCorpus(conf["test_data"].as<string>(), rank, size, &test_corpus);
+
+  // build map from feature id to the accumulator that should normalize
+  boost::unordered_map<std::string, boost::unordered_map<int, double>, boost::hash<std::string> > ccs;
+  vector<boost::unordered_map<int, double>* > cpd_to_acc;
+  if (rank == 0) {
+    cpd_to_acc.resize(num_feats);
+    for (unsigned f = 1; f < num_feats; ++f) {
+      string normalizer;
+      //0 ||| 7 9 ||| Bi:BOS_7=1 Bi:7_9=1 Bi:9_EOS=1 Id:a:7=1 Uni:7=1 Id:b:9=1 Uni:9=1 ||| 0
+      const string& fstr = FD::Convert(f);
+      if (fstr.find("Bi:") == 0) {
+        size_t pos = fstr.rfind('_');
+        if (pos < fstr.size())
+          normalizer = fstr.substr(0, pos);
+      } else if (fstr.find("Id:") == 0) {
+        size_t pos = fstr.rfind(':');
+        if (pos < fstr.size()) {
+          normalizer = "Emit:";
+          normalizer += fstr.substr(pos);
+        }
+      }
+      if (normalizer.size() > 0) {
+        boost::unordered_map<int, double>& acc = ccs[normalizer];
+        cpd_to_acc[f] = &acc;
+      }
+    }
+  }
+
+  TrainingObserver observer;
+  int iteration = 0;
+  while (!converged) {
+    ++iteration;
+    observer.Reset();
+#ifdef HAVE_MPI
+    mpi::timer timer;
+    world.barrier();
+#endif
+    if (rank == 0) {
+      cerr << "Starting decoding... (~" << corpus.size() << " sentences / proc)\n";
+      cerr << "  Testset size: " << test_corpus.size() << " sentences / proc)\n";
+      for(boost::unordered_map<string, boost::unordered_map<int,double>, boost::hash<string> >::iterator it = ccs.begin(); it != ccs.end(); ++it)
+        it->second.clear();
+    }
+    for (int i = 0; i < corpus.size(); ++i)
+      decoder->Decode(corpus[i], &observer);
+    cerr << " process " << rank << '/' << size << " done\n";
+    fill(gradient.begin(), gradient.end(), 0);
+    double objective = 0;
+    observer.SetLocalGradientAndObjective(&gradient, &objective);
+
+    unsigned total_words = 0;
+#ifdef HAVE_MPI
+    double to = 0;
+    rcv_grad.resize(num_feats, 0.0);
+    mpi::reduce(world, &gradient[0], gradient.size(), &rcv_grad[0], plus<double>(), 0);
+    swap(gradient, rcv_grad);
+    rcv_grad.clear();
+
+    reduce(world, observer.trg_words, total_words, std::plus<unsigned>(), 0);
+    mpi::reduce(world, objective, to, plus<double>(), 0);
+    objective = to;
+#else
+    total_words = observer.trg_words;
+#endif
+    if (rank == 0) {  // run optimizer only on rank=0 node
+      cerr << "TRAINING CORPUS: ln p(x)=" << objective << "\t log_2 p(x) = " << (objective/log(2)) << "\t cross entropy = " << (objective/log(2) / total_words) << "\t ppl = " << pow(2, (-objective/log(2) / total_words)) << endl;
+      for (unsigned f = 1; f < num_feats; ++f) {
+        boost::unordered_map<int, double>* m = cpd_to_acc[f];
+        if (m && gradient[f]) {
+          (*m)[f] += gradient[f];
+        }
+        for(boost::unordered_map<string, boost::unordered_map<int,double>, boost::hash<string> >::iterator it = ccs.begin(); it != ccs.end(); ++it) {
+          const boost::unordered_map<int,double>& ccs = it->second;
+          double z = 0;
+          for (boost::unordered_map<int,double>::const_iterator ci = ccs.begin(); ci != ccs.end(); ++ci)
+            z += ci->second + 1e-09;
+          double lz = log(z);
+          for (boost::unordered_map<int,double>::const_iterator ci = ccs.begin(); ci != ccs.end(); ++ci)
+            lambdas[ci->first] = log(ci->second + 1e-09) - lz;
+        }
+      }
+      Weights::SanityCheck(lambdas);
+      Weights::ShowLargestFeatures(lambdas);
+
+      converged = (iteration == iterations);
+
+      string fname = "weights.cur.gz";
+      if (converged) { fname = "weights.final.gz"; }
+      ostringstream vv;
+      vv << "Objective = " << objective << " (eval count=" << iteration << ")";
+      const string svv = vv.str();
+      Weights::WriteToFile(fname, lambdas, true, &svv);
+    }  // rank == 0
+    int cint = converged;
+#ifdef HAVE_MPI
+    mpi::broadcast(world, &lambdas[0], lambdas.size(), 0);
+    mpi::broadcast(world, cint, 0);
+    if (rank == 0) { cerr << "  ELAPSED TIME THIS ITERATION=" << timer.elapsed() << endl; }
+#endif
+    converged = cint;
+  }
+  return 0;
+}
+
diff --git a/training/mpi_compute_cllh.cc b/training/crf/mpi_compute_cllh.cc
index 066389d0..066389d0 100644
--- a/training/mpi_compute_cllh.cc
+++ b/training/crf/mpi_compute_cllh.cc
diff --git a/training/mpi_extract_features.cc b/training/crf/mpi_extract_features.cc
index 6750aa15..6750aa15 100644
--- a/training/mpi_extract_features.cc
+++ b/training/crf/mpi_extract_features.cc
diff --git a/training/mpi_extract_reachable.cc b/training/crf/mpi_extract_reachable.cc
index 2a7c2b9d..2a7c2b9d 100644
--- a/training/mpi_extract_reachable.cc
+++ b/training/crf/mpi_extract_reachable.cc
diff --git a/training/mpi_flex_optimize.cc b/training/crf/mpi_flex_optimize.cc
index b52decdc..b52decdc 100644
--- a/training/mpi_flex_optimize.cc
+++ b/training/crf/mpi_flex_optimize.cc
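The M-step in mpi_baum_welch.cc above renormalizes the accumulated expected counts within each normalizer class: features Bi:x_y share the class "Bi:x" (transitions out of tag x) and Id:w:t features share "Emit::t" (emissions from tag t), and each weight is set to λ_f = log(c_f + 1e-9) − log Σ c over its class. A self-contained sketch of that update on a hypothetical class, with toy counts rather than the program's own data structures:

    #include <cmath>
    #include <iostream>
    #include <map>
    #include <string>

    int main() {
      // Expected counts for one hypothetical normalizer class: transitions
      // out of tag 7, i.e. the features sharing the "Bi:7" accumulator.
      std::map<std::string, double> counts = {
          {"Bi:7_7", 2.5}, {"Bi:7_9", 7.0}, {"Bi:7_EOS", 0.5}};
      double z = 0;
      for (const auto& kv : counts) z += kv.second + 1e-9;  // smoothed class total
      const double lz = std::log(z);
      for (const auto& kv : counts)  // new weight = log conditional probability
        std::cout << kv.first << ' ' << (std::log(kv.second + 1e-9) - lz) << '\n';
      return 0;
    }

Exponentiated, the weights within a class sum to one, which is why the training-log line can report ln p(x) together with a per-word cross-entropy and perplexity: ppl = 2^(−ln p(x) / (N ln 2)) for N words, exactly the pow(2, -objective/log(2)/total_words) expression in the code.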
log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl; exit(1); } - assert(!isnan(log_ref_z)); + assert(!std::isnan(log_ref_z)); ref_exp -= cur_model_exp; acc_grad += ref_exp; acc_obj += (cur_obj - log_ref_z); @@ -304,6 +306,9 @@ int main(int argc, char** argv) { int write_weights_every_ith = 100; // TODO configure int titer = -1; + unsigned timeout = 0; + if (conf.count("max_walltime")) timeout = 60 * conf["max_walltime"].as<unsigned>(); + const time_t start_time = time(NULL); for (int ai = 0; ai < agenda.size(); ++ai) { const string& cur_config = agenda[ai].first; const unsigned max_iteration = agenda[ai].second; @@ -330,15 +335,20 @@ int main(int argc, char** argv) { if (rank == 0) { converged = (iter == max_iteration); Weights::SanityCheck(lambdas); - Weights::ShowLargestFeatures(lambdas); + static int cc = 0; ++cc; if (cc > 1) { Weights::ShowLargestFeatures(lambdas); } string fname = "weights.cur.gz"; if (iter % write_weights_every_ith == 0) { ostringstream o; o << "weights.epoch_" << (ai+1) << '.' << iter << ".gz"; fname = o.str(); } + const time_t cur_time = time(NULL); + if (timeout) { + if ((cur_time - start_time) > timeout) converged = true; + } if (converged && ((ai+1)==agenda.size())) { fname = "weights.final.gz"; } ostringstream vv; - vv << "total iter=" << titer << " (of current config iter=" << iter << ") minibatch=" << size_per_proc << " sentences/proc x " << size << " procs. num_feats=" << x.size() << '/' << FD::NumFeats() << " passes_thru_data=" << (titer * size_per_proc / static_cast<double>(corpus.size())) << " eta=" << lr->eta(titer); + double minutes = (cur_time - start_time) / 60.0; + vv << "total walltime=" << minutes << "min iter=" << titer << " (of current config iter=" << iter << ") minibatch=" << size_per_proc << " sentences/proc x " << size << " procs. num_feats=" << x.size() << '/' << FD::NumFeats() << " passes_thru_data=" << (titer * size_per_proc / static_cast<double>(corpus.size())) << " eta=" << lr->eta(titer); const string svv = vv.str(); cerr << svv << endl; Weights::WriteToFile(fname, lambdas, true, &svv); diff --git a/training/dep-reorder/conll2reordering-forest.pl b/training/dep-reorder/conll2reordering-forest.pl deleted file mode 100755 index 3cd226be..00000000 --- a/training/dep-reorder/conll2reordering-forest.pl +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my $script_dir; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; } -my $FIRST_CONV = "$script_dir/scripts/conll2simplecfg.pl"; -my $CDEC = "$script_dir/../../decoder/cdec"; - -our $tfile1 = "grammar1.$$"; -our $tfile2 = "text.$$"; - -die "Usage: $0 parses.conll\n" unless scalar @ARGV == 1; -open C, "<$ARGV[0]" or die "Can't read $ARGV[0]: $!"; - -END { unlink $tfile1; unlink "$tfile1.cfg"; unlink $tfile2; } - -my $first = 1; -open T, ">$tfile1" or die "Can't write $tfile1: $!"; -my $lc = 0; -my $flag = 0; -my @words = (); -while(<C>) { - print T; - chomp; - if (/^$/) { - if ($first) { $first = undef; } else { if ($flag) { print "\n"; $flag = 0; } } - $first = undef; - close T; - open SO, ">$tfile2" or die "Can't write $tfile2: $!"; - print SO "@words\n"; - close SO; - @words=(); - `$FIRST_CONV < $tfile1 > $tfile1.cfg`; - if ($? != 0) { - die "Error code: $?"; - } - my $cfg = `$CDEC -n -S 10000 -f scfg -g $tfile1.cfg -i $tfile2 --show_cfg_search_space 2>/dev/null`; - if ($? 
!= 0) { - die "Error code: $?"; - } - my @rules = split /\n/, $cfg; - shift @rules; # get rid of output - for my $rule (@rules) { - my ($lhs, $f, $e, $feats) = split / \|\|\| /, $rule; - $f =~ s/,\d\]/\]/g; - $feats = 'TOP=1' unless $feats; - if ($lhs =~ /\[Goal_\d+\]/) { $lhs = '[S]'; } - print "$lhs ||| $f ||| $feats\n"; - if ($e eq '[1] [2]') { - my ($a, $b) = split /\s+/, $f; - $feats =~ s/=1$//; - my ($x, $y) = split /_/, $feats; - print "$lhs ||| $b $a ||| ${y}_$x=1\n"; - } - $flag = 1; - } - open T, ">$tfile1" or die "Can't write $tfile1: $!"; - $lc = -1; - } else { - my ($ind, $word, @dmmy) = split /\s+/; - push @words, $word; - } - $lc++; -} -close T; - diff --git a/training/dep-reorder/george.conll b/training/dep-reorder/george.conll deleted file mode 100644 index 7eebb360..00000000 --- a/training/dep-reorder/george.conll +++ /dev/null @@ -1,4 +0,0 @@ -1 George _ GEORGE _ _ 2 X _ _ -2 hates _ HATES _ _ 0 X _ _ -3 broccoli _ BROC _ _ 2 X _ _ - diff --git a/training/dep-reorder/scripts/conll2simplecfg.pl b/training/dep-reorder/scripts/conll2simplecfg.pl deleted file mode 100755 index b101347a..00000000 --- a/training/dep-reorder/scripts/conll2simplecfg.pl +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -# 1 在 _ 10 _ _ 4 X _ _ -# 2 门厅 _ 3 _ _ 1 X _ _ -# 3 下面 _ 23 _ _ 4 X _ _ -# 4 。 _ 45 _ _ 0 X _ _ - -my @ldeps; -my @rdeps; -@ldeps=(); for (my $i =0; $i <1000; $i++) { push @ldeps, []; } -@rdeps=(); for (my $i =0; $i <1000; $i++) { push @rdeps, []; } -my $rootcat = 0; -my @cats = ('S'); -my $len = 0; -my @noposcats = ('S'); -while(<>) { - chomp; - if (/^\s*$/) { - write_cfg($len); - $len = 0; - @cats=('S'); - @noposcats = ('S'); - @ldeps=(); for (my $i =0; $i <1000; $i++) { push @ldeps, []; } - @rdeps=(); for (my $i =0; $i <1000; $i++) { push @rdeps, []; } - next; - } - $len++; - my ($pos, $word, $d1, $xcat, $d2, $d3, $headpos, $deptype) = split /\s+/; - my $cat = "C$xcat"; - my $catpos = $cat . 
"_$pos"; - push @cats, $catpos; - push @noposcats, $cat; - print "[$catpos] ||| $word ||| $word ||| Word=1\n"; - if ($headpos == 0) { $rootcat = $pos; } - if ($pos < $headpos) { - push @{$ldeps[$headpos]}, $pos; - } else { - push @{$rdeps[$headpos]}, $pos; - } -} - -sub write_cfg { - my $len = shift; - for (my $i = 1; $i <= $len; $i++) { - my @lds = @{$ldeps[$i]}; - for my $ld (@lds) { - print "[$cats[$i]] ||| [$cats[$ld],1] [$cats[$i],2] ||| [1] [2] ||| $noposcats[$ld]_$noposcats[$i]=1\n"; - } - my @rds = @{$rdeps[$i]}; - for my $rd (@rds) { - print "[$cats[$i]] ||| [$cats[$i],1] [$cats[$rd],2] ||| [1] [2] ||| $noposcats[$i]_$noposcats[$rd]=1\n"; - } - } - print "[S] ||| [$cats[$rootcat],1] ||| [1] ||| TOP=1\n"; -} - diff --git a/training/dpmert/Makefile.am b/training/dpmert/Makefile.am new file mode 100644 index 00000000..b85bb275 --- /dev/null +++ b/training/dpmert/Makefile.am @@ -0,0 +1,27 @@ +bin_PROGRAMS = \ + mr_dpmert_map \ + mr_dpmert_reduce \ + mr_dpmert_generate_mapper_input + +noinst_PROGRAMS = \ + lo_test +TESTS = lo_test + +mr_dpmert_generate_mapper_input_SOURCES = mr_dpmert_generate_mapper_input.cc line_optimizer.cc +mr_dpmert_generate_mapper_input_LDADD = ../../decoder/libcdec.a ../../mteval/libmteval.a ../../utils/libutils.a + +# nbest2hg_SOURCES = nbest2hg.cc +# nbest2hg_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lfst + +mr_dpmert_map_SOURCES = mert_geometry.cc ces.cc error_surface.cc mr_dpmert_map.cc line_optimizer.cc ces.h error_surface.h line_optimizer.h mert_geometry.h +mr_dpmert_map_LDADD = ../../decoder/libcdec.a ../../mteval/libmteval.a ../../utils/libutils.a + +mr_dpmert_reduce_SOURCES = error_surface.cc ces.cc mr_dpmert_reduce.cc line_optimizer.cc mert_geometry.cc ces.h error_surface.h line_optimizer.h mert_geometry.h +mr_dpmert_reduce_LDADD = ../../decoder/libcdec.a ../../mteval/libmteval.a ../../utils/libutils.a + +lo_test_SOURCES = lo_test.cc ces.cc mert_geometry.cc error_surface.cc line_optimizer.cc ces.h error_surface.h line_optimizer.h mert_geometry.h +lo_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) ../../decoder/libcdec.a ../../mteval/libmteval.a ../../utils/libutils.a + +EXTRA_DIST = test_data dpmert.pl + +AM_CPPFLAGS = -DTEST_DATA=\"$(top_srcdir)/training/dpmert/test_data\" -DBOOST_TEST_DYN_LINK -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/training/dpmert/ces.cc b/training/dpmert/ces.cc new file mode 100644 index 00000000..157b2d17 --- /dev/null +++ b/training/dpmert/ces.cc @@ -0,0 +1,90 @@ +#include "ces.h" + +#include <vector> +#include <sstream> +#include <boost/shared_ptr.hpp> + +// TODO, if AER is to be optimized again, we will need this +// #include "aligner.h" +#include "lattice.h" +#include "mert_geometry.h" +#include "error_surface.h" +#include "ns.h" + +using namespace std; + +const bool minimize_segments = true; // if adjacent segments have equal scores, merge them + +void ComputeErrorSurface(const SegmentEvaluator& ss, + const ConvexHull& ve, + ErrorSurface* env, + const EvaluationMetric* metric, + const Hypergraph& hg) { + vector<WordID> prev_trans; + const vector<boost::shared_ptr<MERTPoint> >& ienv = ve.GetSortedSegs(); + env->resize(ienv.size()); + SufficientStats prev_score; // defaults to 0 + int j = 0; + for (unsigned i = 0; i < ienv.size(); ++i) { + const MERTPoint& seg = *ienv[i]; + vector<WordID> trans; +#if 0 + if (type == AER) { + vector<bool> edges(hg.edges_.size(), 
false); + seg.CollectEdgesUsed(&edges); // get the set of edges in the viterbi + // alignment + ostringstream os; + const string* psrc = ss.GetSource(); + if (psrc == NULL) { + cerr << "AER scoring in VEST requires source, but it is missing!\n"; + abort(); + } + size_t pos = psrc->rfind(" ||| "); + if (pos == string::npos) { + cerr << "Malformed source for AER: expected |||\nINPUT: " << *psrc << endl; + abort(); + } + Lattice src; + Lattice ref; + LatticeTools::ConvertTextOrPLF(psrc->substr(0, pos), &src); + LatticeTools::ConvertTextOrPLF(psrc->substr(pos + 5), &ref); + AlignerTools::WriteAlignment(src, ref, hg, &os, true, 0, &edges); + string tstr = os.str(); + TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans); + } else { +#endif + seg.ConstructTranslation(&trans); + //} + //cerr << "Scoring: " << TD::GetString(trans) << endl; + if (trans == prev_trans) { + if (!minimize_segments) { + ErrorSegment& out = (*env)[j]; + out.delta.fields.clear(); + out.x = seg.x; + ++j; + } + //cerr << "Identical translation, skipping scoring\n"; + } else { + SufficientStats score; + ss.Evaluate(trans, &score); + // cerr << "score= " << score->ComputeScore() << "\n"; + //string x1; score.Encode(&x1); cerr << "STATS: " << x1 << endl; + const SufficientStats delta = score - prev_score; + //string x2; delta.Encode(&x2); cerr << "DELTA: " << x2 << endl; + //string xx; delta.Encode(&xx); cerr << xx << endl; + prev_trans.swap(trans); + prev_score = score; + if ((!minimize_segments) || (!delta.IsAdditiveIdentity())) { + ErrorSegment& out = (*env)[j]; + out.delta = delta; + out.x = seg.x; + ++j; + } + } + } + // cerr << " In segments: " << ienv.size() << endl; + // cerr << "Out segments: " << j << endl; + assert(j > 0); + env->resize(j); +} + diff --git a/training/dpmert/ces.h b/training/dpmert/ces.h new file mode 100644 index 00000000..e4fa2080 --- /dev/null +++ b/training/dpmert/ces.h @@ -0,0 +1,16 @@ +#ifndef _CES_H_ +#define _CES_H_ + +class ConvexHull; +class Hypergraph; +class SegmentEvaluator; +class ErrorSurface; +class EvaluationMetric; + +void ComputeErrorSurface(const SegmentEvaluator& ss, + const ConvexHull& convex_hull, + ErrorSurface* es, + const EvaluationMetric* metric, + const Hypergraph& hg); + +#endif diff --git a/training/dpmert/divide_refs.py b/training/dpmert/divide_refs.py new file mode 100755 index 00000000..b478f918 --- /dev/null +++ b/training/dpmert/divide_refs.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python +import sys + +(numRefs, outPrefix) = sys.argv[1:] +numRefs = int(numRefs) + +outs = [open(outPrefix+str(i), "w") for i in range(numRefs)] + +i = 0 +for line in sys.stdin: + outs[i].write(line) + i = (i + 1) % numRefs + +for out in outs: + out.close() diff --git a/training/dpmert/dpmert.pl b/training/dpmert/dpmert.pl new file mode 100755 index 00000000..559420f5 --- /dev/null +++ b/training/dpmert/dpmert.pl @@ -0,0 +1,618 @@ +#!/usr/bin/env perl +use strict; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use File::Basename qw(basename); +require "libcall.pl"; + +my $QSUB_CMD = qsub_args(mert_memory()); + +# Default settings +my $srcFile; # deprecated +my $refFiles; # deprecated +my $default_jobs = env_default_jobs(); +my $bin_dir = $SCRIPT_DIR; +my $util_dir = 
"$SCRIPT_DIR/../utils"; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $MAPINPUT = "$bin_dir/mr_dpmert_generate_mapper_input"; +my $MAPPER = "$bin_dir/mr_dpmert_map"; +my $REDUCER = "$bin_dir/mr_dpmert_reduce"; +my $parallelize = "$util_dir/parallelize.pl"; +my $libcall = "$util_dir/libcall.pl"; +my $sentserver = "$util_dir/sentserver"; +my $sentclient = "$util_dir/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm"; + +my $SCORER = $FAST_SCORE; +die "Can't find $MAPPER" unless -x $MAPPER; +my $cdec = "$bin_dir/../../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $lines_per_mapper = 200; +my $rand_directions = 15; +my $iteration = 1; +my $best_weights; +my $max_iterations = 15; +my $optimization_iters = 6; +my $jobs = $default_jobs; # number of decode nodes +my $pmem = "9g"; +my $disable_clean = 0; +my %seen_weights; +my $help = 0; +my $epsilon = 0.0001; +my $last_score = -10000000; +my $metric = "ibm_bleu"; +my $dir; +my $iniFile; +my $weights; +my $initialWeights; +my $bleu_weight=1; +my $use_make = 1; # use make to parallelize line search +my $useqsub; +my $pass_suffix = ''; +my $devset; +# Process command-line options +if (GetOptions( + "config=s" => \$iniFile, + "weights=s" => \$initialWeights, + "devset=s" => \$devset, + "jobs=i" => \$jobs, + "pass-suffix=s" => \$pass_suffix, + "help" => \$help, + "qsub" => \$useqsub, + "iterations=i" => \$max_iterations, + "pmem=s" => \$pmem, + "random-directions=i" => \$rand_directions, + "metric=s" => \$metric, + "source-file=s" => \$srcFile, + "output-dir=s" => \$dir, +) == 0 || @ARGV!=0 || $help) { + print_help(); + exit; +} + +if ($useqsub) { + $use_make = 0; + die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); +} + +my @missing_args = (); +if (defined $srcFile || defined $refFiles) { + die <<EOT; + + The options --ref-files and --source-file are no longer supported. + Please specify the input file and its reference translations with + --devset FILE + +EOT +} + +if (!defined $iniFile) { push @missing_args, "--config"; } +if (!defined $devset) { push @missing_args, "--devset"; } +if (!defined $initialWeights) { push @missing_args, "--weights"; } +die "Please specify missing arguments: " . join (', ', @missing_args) . 
"\nUse --help for more information.\n" if (@missing_args); + +if ($metric =~ /^(combi|ter)$/i) { + $lines_per_mapper = 40; +} elsif ($metric =~ /^meteor$/i) { + $lines_per_mapper = 2000; # start up time is really high for METEOR +} + + +my $nodelist; +my $host =check_output("hostname"); chomp $host; +my $bleu; +my $interval_count = 0; +my $logfile; +my $projected_score; + +# used in sorting scores +my $DIR_FLAG = '-r'; +if ($metric =~ /^ter$|^aer$/i) { + $DIR_FLAG = ''; +} + +unless ($dir){ + $dir = "dpmert"; +} +unless ($dir =~ /^\//){ # convert relative path to absolute path + my $basedir = check_output("pwd"); + chomp $basedir; + $dir = "$basedir/$dir"; +} + + +# Initializations and helper functions +srand; + +my @childpids = (); +my @cleanupcmds = (); + +sub cleanup { + print STDERR "Cleanup...\n"; + for my $pid (@childpids){ unchecked_call("kill $pid"); } + for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } + exit 1; +}; +# Always call cleanup, no matter how we exit +*CORE::GLOBAL::exit = sub{ cleanup(); }; +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + +my $decoderBase = basename($decoder); chomp $decoderBase; +my $newIniFile = "$dir/$decoderBase.ini"; +my $inputFileName = "$dir/input"; +my $user = $ENV{"USER"}; + +# process ini file +-e $iniFile || die "Error: could not open $iniFile for reading\n"; + +sub dirsize { + opendir ISEMPTY,$_[0]; + return scalar(readdir(ISEMPTY))-1; +} +if (-e $dir) { + # allow preexisting logfile, binaries, but not dist-dpmert.pl outputs + die "ERROR: output directory $dir already exists (remove or use --output-dir dir)\n\n"; +} else { + mkdir "$dir" or die "Can't mkdir $dir: $!"; + mkdir "$dir/hgs" or die; + mkdir "$dir/scripts" or die; + print STDERR <<EOT; + DECODER: $decoder + INI FILE: $iniFile + WORKING DIR: $dir + DEVSET: $devset + EVAL METRIC: $metric + MAX ITERATIONS: $max_iterations + PARALLEL JOBS: $jobs + HEAD NODE: $host + PMEM (DECODING): $pmem + INITIAL WEIGHTS: $initialWeights +EOT +} + +# Generate initial files and values +check_call("cp $iniFile $newIniFile"); +check_call("cp $initialWeights $dir/weights.0"); +$iniFile = $newIniFile; + +split_devset($devset, "$dir/dev.input.raw", "$dir/dev.refs"); +my $refs = "-r $dir/dev.refs"; +my $newsrc = "$dir/dev.input"; +enseg("$dir/dev.input.raw", $newsrc); +$srcFile = $newsrc; +my $devSize = 0; +open F, "<$srcFile" or die "Can't read $srcFile: $!"; +while(<F>) { $devSize++; } +close F; + +unless($best_weights){ $best_weights = $weights; } +unless($projected_score){ $projected_score = 0.0; } +$seen_weights{$weights} = 1; + +my $random_seed = int(time / 1000); +my $lastWeightsFile; +my $lastPScore = 0; +# main optimization loop +while (1){ + print STDERR "\n\nITERATION $iteration\n==========\n"; + + if ($iteration > $max_iterations){ + print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n"; + last; + } + # iteration-specific files + my $runFile="$dir/run.raw.$iteration"; + my $onebestFile="$dir/1best.$iteration"; + my $logdir="$dir/logs.$iteration"; + my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; + my $scorerLog="$logdir/scorer.log.$iteration"; + check_call("mkdir -p $logdir"); + + + #decode + print STDERR "RUNNING DECODER AT "; + print STDERR unchecked_output("date"); + my $im1 = $iteration - 1; + my $weightsFile="$dir/weights.$im1"; + my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; + my $pcmd; + if ($use_make) { + $pcmd = "cat $srcFile | $parallelize --workdir $dir --use-fork -p $pmem -e $logdir -j 
$jobs --"; + } else { + $pcmd = "cat $srcFile | $parallelize --workdir $dir -p $pmem -e $logdir -j $jobs --"; + } + my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + my $num_hgs; + my $num_topbest; + my $retries = 0; + while($retries < 5) { + $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l"); + $num_topbest = check_output("wc -l < $runFile"); + print STDERR "NUMBER OF HGs: $num_hgs\n"; + print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; + if($devSize == $num_hgs && $devSize == $num_topbest) { + last; + } else { + print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n"; + sleep(3); + } + $retries++; + } + die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); + my $dec_score = check_output("cat $runFile | $SCORER $refs -m $metric"); + chomp $dec_score; + print STDERR "DECODER SCORE: $dec_score\n"; + + # save space + check_call("gzip -f $runFile"); + check_call("gzip -f $decoderLog"); + + # run optimizer + print STDERR "RUNNING OPTIMIZER AT "; + print STDERR unchecked_output("date"); + my $mergeLog="$logdir/prune-merge.log.$iteration"; + + my $score = 0; + my $icc = 0; + my $inweights="$dir/weights.$im1"; + for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) { + print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n"; + print STDERR unchecked_output("date"); + $icc++; + $cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter"; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + check_call("mkdir -p $dir/splag.$im1"); + $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput."; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!"; + my @shards = grep { /^mapinput\./ } readdir(DIR); + closedir DIR; + die "No shards!" 
unless scalar @shards > 0;
+  my $joblist = "";
+  my $nmappers = 0;
+  my @mapoutputs = ();
+  @cleanupcmds = ();
+  my %o2i = ();
+  my $first_shard = 1;
+  my $mkfile; # only used with makefiles
+  my $mkfilename;
+  if ($use_make) {
+    $mkfilename = "$dir/splag.$im1/domap.mk";
+    open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!";
+    print $mkfile "all: $dir/splag.$im1/map.done\n\n";
+  }
+  my @mkouts = (); # only used with makefiles
+  for my $shard (@shards) {
+    my $mapoutput = $shard;
+    my $client_name = $shard;
+    $client_name =~ s/mapinput\.//;
+    $client_name = "dpmert.$client_name";
+    $mapoutput =~ s/mapinput/mapoutput/;
+    push @mapoutputs, "$dir/splag.$im1/$mapoutput";
+    $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
+    my $script = "$MAPPER -s $srcFile -m $metric $refs < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput";
+    if ($use_make) {
+      my $script_file = "$dir/scripts/map.$shard";
+      open F, ">$script_file" or die "Can't write $script_file: $!";
+      print F "#!/bin/bash\n";
+      print F "$script\n";
+      close F;
+      my $output = "$dir/splag.$im1/$mapoutput";
+      push @mkouts, $output;
+      chmod(0755, $script_file) or die "Can't chmod $script_file: $!";
+      if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
+      print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n";
+    } else {
+      my $script_file = "$dir/scripts/map.$shard";
+      open F, ">$script_file" or die "Can't write $script_file: $!";
+      print F "$script\n";
+      close F;
+      if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
+
+      $nmappers++;
+      my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
+      my $jobid = check_output("$qcmd");
+      chomp $jobid;
+      $jobid =~ s/^(\d+)(.*?)$/$1/g;
+      $jobid =~ s/^Your job (\d+) .*$/$1/;
+      push(@cleanupcmds, "qdel $jobid 2> /dev/null");
+      print STDERR " $jobid";
+      if ($joblist eq "") { $joblist = $jobid; }
+      else { $joblist = $joblist . "\|" . 
$jobid; } + } + } + if ($use_make) { + print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n"; + close $mkfile; + my $mcmd = "make -j $jobs -f $mkfilename"; + print STDERR "\nExecuting: $mcmd\n"; + check_call($mcmd); + } else { + print STDERR "\nLaunched $nmappers mappers.\n"; + sleep 8; + print STDERR "Waiting for mappers to complete...\n"; + while ($nmappers > 0) { + sleep 5; + my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '"))); + $nmappers = scalar @livejobs; + } + print STDERR "All mappers complete.\n"; + } + my $tol = 0; + my $til = 0; + for my $mo (@mapoutputs) { + my $olines = get_lines($mo); + my $ilines = get_lines($o2i{$mo}); + $tol += $olines; + $til += $ilines; + die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines; + } + print STDERR "Results for $tol/$til lines\n"; + print STDERR "\nSORTING AND RUNNING VEST REDUCER\n"; + print STDERR unchecked_output("date"); + $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -m $metric > $dir/redoutput.$im1"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + $cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1"; + # sort returns failure even when it doesn't fail for some reason + my $best=unchecked_output("$cmd"); chomp $best; + print STDERR "$best\n"; + my ($oa, $x, $xscore) = split /\|/, $best; + $score = $xscore; + print STDERR "PROJECTED SCORE: $score\n"; + if (abs($x) < $epsilon) { + print STDERR "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n"; + last; + } + my $psd = $score - $last_score; + $last_score = $score; + if (abs($psd) < $epsilon) { + print STDERR "\nOPTIMIZER: no score improvement: abs($psd) < $epsilon\n"; + last; + } + my ($origin, $axis) = split /\s+/, $oa; + + my %ori = convert($origin); + my %axi = convert($axis); + + my $finalFile="$dir/weights.$im1-$opt_iter"; + open W, ">$finalFile" or die "Can't write: $finalFile: $!"; + my $norm = 0; + for my $k (sort keys %ori) { + my $dd = $ori{$k} + $axi{$k} * $x; + $norm += $dd * $dd; + } + $norm = sqrt($norm); + $norm = 1; + for my $k (sort keys %ori) { + my $v = ($ori{$k} + $axi{$k} * $x) / $norm; + print W "$k $v\n"; + } + check_call("rm $dir/splag.$im1/*"); + $inweights = $finalFile; + } + $lastWeightsFile = "$dir/weights.$iteration"; + check_call("cp $inweights $lastWeightsFile"); + if ($icc < 2) { + print STDERR "\nREACHED STOPPING CRITERION: score change too little\n"; + last; + } + $lastPScore = $score; + $iteration++; + print STDERR "\n==========\n"; +} + +check_call("cp $lastWeightsFile $dir/weights.final"); +print STDERR "\nFINAL WEIGHTS: $dir/weights.final\n(Use -w <this file> with the decoder)\n\n"; +print STDOUT "$dir/weights.final\n"; +exit 0; + + +sub get_lines { + my $fn = shift @_; + open FL, "<$fn" or die "Couldn't read $fn: $!"; + my $lc = 0; + while(<FL>) { $lc++; } + return $lc; +} + +sub read_weights_file { + my ($file) = @_; + open F, "<$file" or die "Couldn't read $file: $!"; + my @r = (); + my $pm = -1; + while(<F>) { + next if /^#/; + next if /^\s*$/; + chomp; + if (/^(.+)\s+(.+)$/) { + my $m = $1; + my $w = $2; + die "Weights out of order: $m <= $pm" unless $m > $pm; + push @r, $w; + } else { + warn "Unexpected feature name in weight file: $_"; + } + } + close F; + return join ' ', @r; +} + +sub update_weights_file { + my ($neww, $rfn, $rpts) = @_; + my @feats = @$rfn; + my @pts = @$rpts; + my $num_feats = scalar @feats; + my $num_pts = scalar @pts; + die "$num_feats (num_feats) != $num_pts (num_pts)" unless 
$num_feats == $num_pts;
+  open G, ">$neww" or die;
+  for (my $i = 0; $i < $num_feats; $i++) {
+    my $f = $feats[$i];
+    my $lambda = $pts[$i];
+    print G "$f $lambda\n";
+  }
+  close G;
+}
+
+sub enseg {
+  my $src = shift;
+  my $newsrc = shift;
+  open(SRC, $src);
+  open(NEWSRC, ">$newsrc");
+  my $i=0;
+  while (my $line=<SRC>){
+    chomp $line;
+    if ($line =~ /^\s*<seg/i) {
+      if($line =~ /id="[0-9]+"/) {
+        print NEWSRC "$line\n";
+      } else {
+        die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+      }
+    } else {
+      print NEWSRC "<seg id=\"$i\">$line</seg>\n";
+    }
+    $i++;
+  }
+  close SRC;
+  close NEWSRC;
+}
+
+sub print_help {
+
+  my $executable = basename($0); chomp $executable;
+  print << "Help";
+
+Usage: $executable [options] <ini file>
+
+  $executable [options]
+    Runs a complete MERT optimization. Required options are --weights,
+    --devset, and --config.
+
+Options:
+
+  --config <file>  [-c <file>]
+    The decoder configuration file.
+
+  --devset <file>  [-d <file>]
+    The source *and* references for the development set.
+
+  --weights <file> [-w <file>]
+    A file specifying initial feature weights. The format is
+    FeatureName_1 value1
+    FeatureName_2 value2
+    **All and only the weights listed in <file> will be optimized!**
+
+  --metric <name>
+    Metric to optimize.
+    Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
+
+  --iterations <M>
+    Maximum number of iterations to run. If not specified, defaults
+    to 15.
+
+  --pass-suffix <S>
+    If the decoder is doing multi-pass decoding, the pass suffix "2",
+    "3", etc., is used to control what iteration of weights is set.
+
+  --rand-directions <num>
+    MERT will attempt to optimize along all of the principal directions;
+    set this parameter to explore additional random directions. Defaults
+    to 15.
+
+  --output-dir <dir>
+    Directory for intermediate and output files.
+
+  --help
+    Print this message and exit.
+
+Job control options:
+
+  --jobs <I>
+    Number of decoder processes to run in parallel. [default=$default_jobs]
+
+  --qsub
+    Use qsub to run jobs in parallel (qsub must be configured in
+    environment/LocalEnvironment.pm)
+
+  --pmem <N>
+    Amount of physical memory requested for parallel decoding jobs
+    (used with qsub requests only)
+
+Help
+}
+
+sub convert {
+  my ($str) = @_;
+  my @ps = split /;/, $str;
+  my %dict = ();
+  for my $p (@ps) {
+    my ($k, $v) = split /=/, $p;
+    $dict{$k} = $v;
+  }
+  return %dict;
+}
+
+
+
+sub cmdline {
+  return join ' ',($0,@ORIG_ARGV);
+}
+
+#buggy: last arg gets quoted sometimes?
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]};
+my $shell_escape_in_quote=qr{[\\"\$`!]};
+
+sub escape_shell {
+  my ($arg)=@_;
+  return undef unless defined $arg;
+  if ($arg =~ /$is_shell_special/) {
+    $arg =~ s/($shell_escape_in_quote)/\\$1/g;
+    return "\"$arg\"";
+  }
+  return $arg;
+}
+
+sub escaped_shell_args {
+  return map {local $_=$_;chomp;escape_shell($_)} @_;
+}
+
+sub escaped_shell_args_str {
+  return join ' ',&escaped_shell_args(@_);
+}
+
+sub escaped_cmdline {
+  return "$0 ".&escaped_shell_args_str(@ORIG_ARGV);
+}
+
+sub split_devset {
+  my ($infile, $outsrc, $outref) = @_;
+  open F, "<$infile" or die "Can't read $infile: $!";
+  open S, ">$outsrc" or die "Can't write $outsrc: $!";
+  open R, ">$outref" or die "Can't write $outref: $!";
+  while(<F>) {
+    chomp;
+    my ($src, @refs) = split /\s*\|\|\|\s*/;
+    die "Malformed devset line: $_\n" unless scalar @refs > 0;
+    print S "$src\n";
+    print R join(' ||| ', @refs) . "\n";
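+    # (editor's note) each devset line is "SOURCE ||| REF1 ||| REF2 ...";
+    # the source side goes to $outsrc and the " ||| "-joined references
+    # to $outref, one sentence per line.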
"\n"; + } + close R; + close S; + close F; +} + diff --git a/training/dpmert/error_surface.cc b/training/dpmert/error_surface.cc new file mode 100644 index 00000000..515b67f8 --- /dev/null +++ b/training/dpmert/error_surface.cc @@ -0,0 +1,42 @@ +#include "error_surface.h" + +#include <cassert> +#include <sstream> + +using namespace std; + +ErrorSurface::~ErrorSurface() {} + +void ErrorSurface::Serialize(std::string* out) const { + const int segments = this->size(); + ostringstream os(ios::binary); + os.write((const char*)&segments,sizeof(segments)); + for (int i = 0; i < segments; ++i) { + const ErrorSegment& cur = (*this)[i]; + string senc; + cur.delta.Encode(&senc); + assert(senc.size() < 1024); + unsigned char len = senc.size(); + os.write((const char*)&cur.x, sizeof(cur.x)); + os.write((const char*)&len, sizeof(len)); + os.write((const char*)&senc[0], len); + } + *out = os.str(); +} + +void ErrorSurface::Deserialize(const std::string& in) { + istringstream is(in, ios::binary); + int segments; + is.read((char*)&segments, sizeof(segments)); + this->resize(segments); + for (int i = 0; i < segments; ++i) { + ErrorSegment& cur = (*this)[i]; + unsigned char len; + is.read((char*)&cur.x, sizeof(cur.x)); + is.read((char*)&len, sizeof(len)); + string senc(len, '\0'); assert(senc.size() == len); + is.read((char*)&senc[0], len); + cur.delta = SufficientStats(senc); + } +} + diff --git a/training/dpmert/error_surface.h b/training/dpmert/error_surface.h new file mode 100644 index 00000000..bb65847b --- /dev/null +++ b/training/dpmert/error_surface.h @@ -0,0 +1,24 @@ +#ifndef _ERROR_SURFACE_H_ +#define _ERROR_SURFACE_H_ + +#include <vector> +#include <string> + +#include "ns.h" + +class Score; + +struct ErrorSegment { + double x; + SufficientStats delta; + ErrorSegment() : x(0), delta() {} +}; + +class ErrorSurface : public std::vector<ErrorSegment> { + public: + ~ErrorSurface(); + void Serialize(std::string* out) const; + void Deserialize(const std::string& in); +}; + +#endif diff --git a/training/dpmert/line_mediator.pl b/training/dpmert/line_mediator.pl new file mode 100755 index 00000000..bc2bb24c --- /dev/null +++ b/training/dpmert/line_mediator.pl @@ -0,0 +1,116 @@ +#!/usr/bin/perl -w +#hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication + +# if you don't know how to fork/exec in a C program, this could be helpful under limited cirmustances (would be ok to liaise with sentserver) + +#WARNING: because it waits for the result from command 2 after sending every line, and especially if command 1 does the same, using sentserver as command 2 won't actually buy you any real parallelism. + +use strict; +use IPC::Open2; +use POSIX qw(pipe dup2 STDIN_FILENO STDOUT_FILENO); + +my $quiet=!$ENV{DEBUG}; +$quiet=1 if $ENV{QUIET}; +sub info { + local $,=' '; + print STDERR @_ unless $quiet; +} + +my $mode='CROSS'; +my $ser='DIRECT'; +$mode='PIPE' if $ENV{PIPE}; +$mode='SNAKE' if $ENV{SNAKE}; +$mode='CROSS' if $ENV{CROSS}; +$ser='SERIAL' if $ENV{SERIAL}; +$ser='DIRECT' if $ENV{DIRECT}; +$ser='SERIAL' if $mode eq 'SNAKE'; +info("mode: $mode\n"); +info("connection: $ser\n"); + + +my @c1; +if (scalar @ARGV) { + do { + push @c1,shift + } while scalar @ARGV && $c1[$#c1] ne '--'; +} +pop @c1; +my @c2=@ARGV; +@ARGV=(); +(scalar @c1 && scalar @c2) || die qq{ +usage: $0 cmd1 args -- cmd2 args +all options are environment variables. +DEBUG=1 env var enables debugging output. 
+info("1 cmd:",@c1,"\n");
+info("2 cmd:",@c2,"\n");
+
+sub lineto {
+  select $_[0];
+  $|=1;
+  shift;
+  print @_;
+}
+
+if ($ser eq 'SERIAL') {
+  my ($R1,$W1,$R2,$W2);
+  my $c1p=open2($R1,$W1,@c1); # Open2 R W backward from Open3.
+  my $c2p=open2($R2,$W2,@c2);
+  if ($mode eq 'CROSS') {
+    while(<$R1>) {
+      info("1:",$_);
+      lineto($W2,$_);
+      last unless defined ($_=<$R2>);
+      info("1|2:",$_);
+      lineto($W1,$_);
+    }
+  } else {
+    my $snake=$mode eq 'SNAKE';
+    while(<STDIN>) {
+      info("IN:",$_);
+      lineto($W1,$_);
+      last unless defined ($_=<$R1>);
+      info("IN|1:",$_);
+      lineto($W2,$_);
+      last unless defined ($_=<$R2>);
+      info("IN|1|2:",$_);
+      if ($snake) {
+        lineto($W1,$_);
+        last unless defined ($_=<$R1>);
+        info("IN|1|2|1:",$_);
+      }
+      lineto(*STDOUT,$_);
+    }
+  }
+} else {
+  info("DIRECT mode\n");
+  my @rw1=POSIX::pipe();
+  my @rw2=POSIX::pipe();
+  my $pid=undef;
+  $SIG{CHLD} = sub { wait };
+  while (not defined ($pid=fork())) {
+    sleep 1;
+  }
+  my $pipe = $mode eq 'PIPE';
+  unless ($pipe) {
+    POSIX::close(STDOUT_FILENO);
+    POSIX::close(STDIN_FILENO);
+  }
+  if ($pid) {
+    POSIX::dup2($rw1[1],STDOUT_FILENO);
+    POSIX::dup2($rw2[0],STDIN_FILENO) unless $pipe;
+    exec @c1;
+  } else {
+    POSIX::dup2($rw2[1],STDOUT_FILENO) unless $pipe;
+    POSIX::dup2($rw1[0],STDIN_FILENO);
+    exec @c2;
+  }
+  while (wait()!=-1) {}
+}
diff --git a/training/dpmert/line_optimizer.cc b/training/dpmert/line_optimizer.cc
new file mode 100644
index 00000000..9cf33502
--- /dev/null
+++ b/training/dpmert/line_optimizer.cc
@@ -0,0 +1,114 @@
+#include "line_optimizer.h"
+
+#include <limits>
+#include <algorithm>
+
+#include "sparse_vector.h"
+#include "ns.h"
+
+using namespace std;
+
+typedef ErrorSurface::const_iterator ErrorIter;
+
+// sort by increasing x-intercepts
+struct IntervalComp {
+  bool operator() (const ErrorIter& a, const ErrorIter& b) const {
+    return a->x < b->x;
+  }
+};
+
+double LineOptimizer::LineOptimize(
+    const EvaluationMetric* metric,
+    const vector<ErrorSurface>& surfaces,
+    const LineOptimizer::ScoreType type,
+    float* best_score,
+    const double epsilon) {
+  // cerr << "MIN=" << MINIMIZE_SCORE << " MAX=" << MAXIMIZE_SCORE << " MINE=" << type << endl;
+  vector<ErrorIter> all_ints;
+  for (vector<ErrorSurface>::const_iterator i = surfaces.begin();
+       i != surfaces.end(); ++i) {
+    const ErrorSurface& surface = *i;
+    for (ErrorIter j = surface.begin(); j != surface.end(); ++j)
+      all_ints.push_back(j);
+  }
+  sort(all_ints.begin(), all_ints.end(), IntervalComp());
+  double last_boundary = all_ints.front()->x;
+  SufficientStats acc;
+  float& cur_best_score = *best_score;
+  cur_best_score = (type == MAXIMIZE_SCORE ? 
+ -numeric_limits<float>::max() : numeric_limits<float>::max()); + bool left_edge = true; + double pos = numeric_limits<double>::quiet_NaN(); + for (vector<ErrorIter>::iterator i = all_ints.begin(); + i != all_ints.end(); ++i) { + const ErrorSegment& seg = **i; + if (seg.x - last_boundary > epsilon) { + float sco = metric->ComputeScore(acc); + if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || + (type == MINIMIZE_SCORE && sco < cur_best_score) ) { + cur_best_score = sco; + if (left_edge) { + pos = seg.x - 0.1; + left_edge = false; + } else { + pos = last_boundary + (seg.x - last_boundary) / 2; + } + //cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n"; + } + // string xx = metric->DetailedScore(acc); cerr << "---- " << xx; +#undef SHOW_ERROR_SURFACES +#ifdef SHOW_ERROR_SURFACES + cerr << "x=" << seg.x << "\ts=" << sco << "\n"; +#endif + last_boundary = seg.x; + } + // cerr << "x-boundary=" << seg.x << "\n"; + //string x2; acc.Encode(&x2); cerr << " ACC: " << x2 << endl; + //string x1; seg.delta.Encode(&x1); cerr << " DELTA: " << x1 << endl; + acc += seg.delta; + } + float sco = metric->ComputeScore(acc); + if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || + (type == MINIMIZE_SCORE && sco < cur_best_score) ) { + cur_best_score = sco; + if (left_edge) { + pos = 0; + } else { + pos = last_boundary + 1000.0; + } + } + return pos; +} + +void LineOptimizer::RandomUnitVector(const vector<int>& features_to_optimize, + SparseVector<double>* axis, + RandomNumberGenerator<boost::mt19937>* rng) { + axis->clear(); + for (int i = 0; i < features_to_optimize.size(); ++i) + axis->set_value(features_to_optimize[i], rng->NextNormal(0.0,1.0)); + (*axis) /= axis->l2norm(); +} + +void LineOptimizer::CreateOptimizationDirections( + const vector<int>& features_to_optimize, + int additional_random_directions, + RandomNumberGenerator<boost::mt19937>* rng, + vector<SparseVector<double> >* dirs + , bool include_orthogonal + ) { + dirs->clear(); + typedef SparseVector<double> Dir; + vector<Dir> &out=*dirs; + int i=0; + if (include_orthogonal) + for (;i<features_to_optimize.size();++i) { + Dir d; + d.set_value(features_to_optimize[i],1.); + out.push_back(d); + } + out.resize(i+additional_random_directions); + for (;i<out.size();++i) + RandomUnitVector(features_to_optimize, &out[i], rng); + cerr << "Generated " << out.size() << " total axes to optimize along.\n"; +} + diff --git a/training/dpmert/line_optimizer.h b/training/dpmert/line_optimizer.h new file mode 100644 index 00000000..83819f41 --- /dev/null +++ b/training/dpmert/line_optimizer.h @@ -0,0 +1,48 @@ +#ifndef LINE_OPTIMIZER_H_ +#define LINE_OPTIMIZER_H_ + +#include <vector> + +#include "sparse_vector.h" +#include "error_surface.h" +#include "sampler.h" + +class EvaluationMetric; +class Weights; + +struct LineOptimizer { + + // use MINIMIZE_SCORE for things like TER, WER + // MAXIMIZE_SCORE for things like BLEU + enum ScoreType { MAXIMIZE_SCORE, MINIMIZE_SCORE }; + + // merge all the error surfaces together into a global + // error surface and find (the middle of) the best segment + static double LineOptimize( + const EvaluationMetric* metric, + const std::vector<ErrorSurface>& envs, + const LineOptimizer::ScoreType type, + float* best_score, + const double epsilon = 1.0/65536.0); + + // return a random vector of length 1 where all dimensions + // not listed in dimensions will be 0. 
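+  // (editor's note: the implementation in line_optimizer.cc draws each
+  // listed dimension from N(0,1) and divides by the L2 norm, so the result
+  // is distributed uniformly on the unit sphere of those dimensions.)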
+ static void RandomUnitVector(const std::vector<int>& dimensions, + SparseVector<double>* axis, + RandomNumberGenerator<boost::mt19937>* rng); + + // generate a list of directions to optimize; the list will + // contain the orthogonal vectors corresponding to the dimensions in + // primary and then additional_random_directions directions in those + // dimensions as well. All vectors will be length 1. + static void CreateOptimizationDirections( + const std::vector<int>& primary, + int additional_random_directions, + RandomNumberGenerator<boost::mt19937>* rng, + std::vector<SparseVector<double> >* dirs + , bool include_primary=true + ); + +}; + +#endif diff --git a/training/dpmert/lo_test.cc b/training/dpmert/lo_test.cc new file mode 100644 index 00000000..d89bcd99 --- /dev/null +++ b/training/dpmert/lo_test.cc @@ -0,0 +1,229 @@ +#define BOOST_TEST_MODULE LineOptimizerTest +#include <boost/test/unit_test.hpp> +#include <boost/test/floating_point_comparison.hpp> + +#include <cmath> +#include <iostream> +#include <fstream> + +#include <boost/shared_ptr.hpp> + +#include "ns.h" +#include "ns_docscorer.h" +#include "ces.h" +#include "fdict.h" +#include "hg.h" +#include "kbest.h" +#include "hg_io.h" +#include "filelib.h" +#include "inside_outside.h" +#include "viterbi.h" +#include "mert_geometry.h" +#include "line_optimizer.h" + +using namespace std; + +const char* ref11 = "australia reopens embassy in manila"; +const char* ref12 = "( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack ."; +const char* ref21 = "australia reopened manila embassy"; +const char* ref22 = "( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack ."; +const char* ref31 = "australia to reopen embassy in manila"; +const char* ref32 = "( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . 
seven weeks ago , the embassy was shut down due to so - called confirmed terrorist attack threats ."; +const char* ref41 = "australia to re - open its embassy to manila"; +const char* ref42 = "( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so - called \" clear \" threat of terrorist attack 7 weeks ago ."; + +BOOST_AUTO_TEST_CASE( TestCheckNaN) { + double x = 0; + double y = 0; + double z = x / y; + BOOST_CHECK_EQUAL(true, std::isnan(z)); +} + +BOOST_AUTO_TEST_CASE(TestConvexHull) { + boost::shared_ptr<MERTPoint> a1(new MERTPoint(-1, 0)); + boost::shared_ptr<MERTPoint> b1(new MERTPoint(1, 0)); + boost::shared_ptr<MERTPoint> a2(new MERTPoint(-1, 1)); + boost::shared_ptr<MERTPoint> b2(new MERTPoint(1, -1)); + vector<boost::shared_ptr<MERTPoint> > sa; sa.push_back(a1); sa.push_back(b1); + vector<boost::shared_ptr<MERTPoint> > sb; sb.push_back(a2); sb.push_back(b2); + ConvexHull a(sa); + cerr << a << endl; + ConvexHull b(sb); + ConvexHull c = a; + c *= b; + cerr << a << " (*) " << b << " = " << c << endl; + BOOST_CHECK_EQUAL(3, c.size()); +} + +BOOST_AUTO_TEST_CASE(TestConvexHullInside) { + const string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}"; + Hypergraph hg; + istringstream instr(json); + HypergraphIO::ReadFromJSON(&instr, &hg); + SparseVector<double> wts; + wts.set_value(FD::Convert("f1"), 0.4); + wts.set_value(FD::Convert("f2"), 1.0); + hg.Reweight(wts); + vector<pair<vector<WordID>, prob_t> > list; + std::vector<SparseVector<double> > features; + KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; + } + SparseVector<double> dir; dir.set_value(FD::Convert("f1"), 1.0); + ConvexHullWeightFunction wf(wts, dir); + ConvexHull env = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); + cerr << env << endl; + const vector<boost::shared_ptr<MERTPoint> >& segs = env.GetSortedSegs(); + dir *= segs[1]->x; + wts += dir; + hg.Reweight(wts); + KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest2(hg, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = + kbest2.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; + } + for (unsigned i = 0; i < 
segs.size(); ++i) { + cerr << "seg=" << i << endl; + vector<WordID> trans; + segs[i]->ConstructTranslation(&trans); + cerr << TD::GetString(trans) << endl; + } +} + +BOOST_AUTO_TEST_CASE( TestS1) { + int fPhraseModel_0 = FD::Convert("PhraseModel_0"); + int fPhraseModel_1 = FD::Convert("PhraseModel_1"); + int fPhraseModel_2 = FD::Convert("PhraseModel_2"); + int fLanguageModel = FD::Convert("LanguageModel"); + int fWordPenalty = FD::Convert("WordPenalty"); + int fPassThrough = FD::Convert("PassThrough"); + SparseVector<double> wts; + wts.set_value(fWordPenalty, 4.25); + wts.set_value(fLanguageModel, -1.1165); + wts.set_value(fPhraseModel_0, -0.96); + wts.set_value(fPhraseModel_1, -0.65); + wts.set_value(fPhraseModel_2, -0.77); + wts.set_value(fPassThrough, -10.0); + + vector<int> to_optimize; + to_optimize.push_back(fWordPenalty); + to_optimize.push_back(fLanguageModel); + to_optimize.push_back(fPhraseModel_0); + to_optimize.push_back(fPhraseModel_1); + to_optimize.push_back(fPhraseModel_2); + + std::string path(boost::unit_test::framework::master_test_suite().argc == 2 ? boost::unit_test::framework::master_test_suite().argv[1] : TEST_DATA); + + Hypergraph hg; + ReadFile rf(path + "/0.json.gz"); + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + hg.Reweight(wts); + + Hypergraph hg2; + ReadFile rf2(path + "/1.json.gz"); + HypergraphIO::ReadFromJSON(rf2.stream(), &hg2); + hg2.Reweight(wts); + + vector<vector<WordID> > refs1(4); + TD::ConvertSentence(ref11, &refs1[0]); + TD::ConvertSentence(ref21, &refs1[1]); + TD::ConvertSentence(ref31, &refs1[2]); + TD::ConvertSentence(ref41, &refs1[3]); + vector<vector<WordID> > refs2(4); + TD::ConvertSentence(ref12, &refs2[0]); + TD::ConvertSentence(ref22, &refs2[1]); + TD::ConvertSentence(ref32, &refs2[2]); + TD::ConvertSentence(ref42, &refs2[3]); + vector<ConvexHull> envs(2); + + RandomNumberGenerator<boost::mt19937> rng; + + vector<SparseVector<double> > axes; // directions to search + LineOptimizer::CreateOptimizationDirections( + to_optimize, + 10, + &rng, + &axes); + assert(axes.size() == 10 + to_optimize.size()); + for (unsigned i = 0; i < axes.size(); ++i) + cerr << axes[i] << endl; + const SparseVector<double>& axis = axes[0]; + + cerr << "Computing Viterbi envelope using inside algorithm...\n"; + cerr << "axis: " << axis << endl; + clock_t t_start=clock(); + ConvexHullWeightFunction wf(wts, axis); // wts = starting point, axis = search direction + envs[0] = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); + envs[1] = Inside<ConvexHull, ConvexHullWeightFunction>(hg2, NULL, wf); + + vector<ErrorSurface> es(2); + EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(refs1); + boost::shared_ptr<SegmentEvaluator> scorer2 = metric->CreateSegmentEvaluator(refs2); + ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); + ComputeErrorSurface(*scorer2, envs[1], &es[1], metric, hg2); + cerr << envs[0].size() << " " << envs[1].size() << endl; + cerr << es[0].size() << " " << es[1].size() << endl; + envs.clear(); + clock_t t_env=clock(); + float score; + double m = LineOptimizer::LineOptimize(metric,es, LineOptimizer::MAXIMIZE_SCORE, &score); + clock_t t_opt=clock(); + cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n"; + BOOST_CHECK_CLOSE(0.48719698, score, 1e-5); + SparseVector<double> res = axis; + res *= m; + res += wts; + cerr << "res: " << res << endl; + cerr << "ENVELOPE PROCESSING=" << (static_cast<double>(t_env - t_start) / 
1000.0) << endl; + cerr << " LINE OPTIMIZATION=" << (static_cast<double>(t_opt - t_env) / 1000.0) << endl; + hg.Reweight(res); + hg2.Reweight(res); + vector<WordID> t1,t2; + ViterbiESentence(hg, &t1); + ViterbiESentence(hg2, &t2); + cerr << TD::GetString(t1) << endl; + cerr << TD::GetString(t2) << endl; +} + +BOOST_AUTO_TEST_CASE(TestZeroOrigin) { + const string json = "{\"rules\":[1,\"[X7] ||| blA ||| without ||| LHSProb=3.92173 LexE2F=2.90799 LexF2E=1.85003 GenerativeProb=10.5381 RulePenalty=1 XFE=2.77259 XEF=0.441833 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=0.693147\",2,\"[X7] ||| blA ||| except ||| LHSProb=4.92173 LexE2F=3.90799 LexF2E=1.85003 GenerativeProb=11.5381 RulePenalty=1 XFE=2.77259 XEF=1.44183 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=1.69315\",3,\"[S] ||| [X7,1] ||| [1] ||| GlueTop=1\",4,\"[X28] ||| EnwAn ||| title ||| LHSProb=3.96802 LexE2F=2.22462 LexF2E=1.83258 GenerativeProb=10.0863 RulePenalty=1 XFE=0 XEF=1.20397 LabelledEF=1.20397 LabelledFE=-1.98341e-08 LogRuleCount=1.09861\",5,\"[X0] ||| EnwAn ||| funny ||| LHSProb=3.98479 LexE2F=1.79176 LexF2E=3.21888 GenerativeProb=11.1681 RulePenalty=1 XFE=0 XEF=2.30259 LabelledEF=2.30259 LabelledFE=0 LogRuleCount=0 SingletonRule=1\",6,\"[X8] ||| [X7,1] EnwAn ||| entitled [1] ||| LHSProb=3.82533 LexE2F=3.21888 LexF2E=2.52573 GenerativeProb=11.3276 RulePenalty=1 XFE=1.20397 XEF=1.20397 LabelledEF=2.30259 LabelledFE=2.30259 LogRuleCount=0 SingletonRule=1\",7,\"[S] ||| [S,1] [X28,2] ||| [1] [2] ||| Glue=1\",8,\"[S] ||| [S,1] [X0,2] ||| [1] [2] ||| Glue=1\",9,\"[S] ||| [X8,1] ||| [1] ||| GlueTop=1\",10,\"[Goal] ||| [S,1] ||| [1]\"],\"features\":[\"PassThrough\",\"Glue\",\"GlueTop\",\"LanguageModel\",\"WordPenalty\",\"LHSProb\",\"LexE2F\",\"LexF2E\",\"GenerativeProb\",\"RulePenalty\",\"XFE\",\"XEF\",\"LabelledEF\",\"LabelledFE\",\"LogRuleCount\",\"SingletonRule\"],\"edges\":[{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,3.92173,6,2.90799,7,1.85003,8,10.5381,9,1,10,2.77259,11,0.441833,12,2.63906,13,4.96981,14,0.693147],\"rule\":1},{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,4.92173,6,3.90799,7,1.85003,8,11.5381,9,1,10,2.77259,11,1.44183,12,2.63906,13,4.96981,14,1.69315],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X7\"},\"edges\":[{\"tail\":[0],\"spans\":[0,1,-1,-1],\"feats\":[2,1],\"rule\":3}],\"node\":{\"in_edges\":[2],\"cat\":\"S\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.96802,6,2.22462,7,1.83258,8,10.0863,9,1,11,1.20397,12,1.20397,13,-1.98341e-08,14,1.09861],\"rule\":4}],\"node\":{\"in_edges\":[3],\"cat\":\"X28\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.98479,6,1.79176,7,3.21888,8,11.1681,9,1,11,2.30259,12,2.30259,15,1],\"rule\":5}],\"node\":{\"in_edges\":[4],\"cat\":\"X0\"},\"edges\":[{\"tail\":[0],\"spans\":[0,2,-1,-1],\"feats\":[5,3.82533,6,3.21888,7,2.52573,8,11.3276,9,1,10,1.20397,11,1.20397,12,2.30259,13,2.30259,15,1],\"rule\":6}],\"node\":{\"in_edges\":[5],\"cat\":\"X8\"},\"edges\":[{\"tail\":[1,2],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":7},{\"tail\":[1,3],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":8},{\"tail\":[4],\"spans\":[0,2,-1,-1],\"feats\":[2,1],\"rule\":9}],\"node\":{\"in_edges\":[6,7,8],\"cat\":\"S\"},\"edges\":[{\"tail\":[5],\"spans\":[0,2,-1,-1],\"feats\":[],\"rule\":10}],\"node\":{\"in_edges\":[9],\"cat\":\"Goal\"}}"; + Hypergraph hg; + istringstream instr(json); + HypergraphIO::ReadFromJSON(&instr, &hg); + SparseVector<double> wts; + wts.set_value(FD::Convert("PassThrough"), -0.929201533002898); + hg.Reweight(wts); + + 
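+  // (editor's note) enumerate the 10 best derivations under the current
+  // weights; LazyKthBest returns NULL when fewer than k derivations exist,
+  // which ends the loop early.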
vector<pair<vector<WordID>, prob_t> > list; + std::vector<SparseVector<double> > features; + KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; + } + + SparseVector<double> axis; axis.set_value(FD::Convert("Glue"),1.0); + ConvexHullWeightFunction wf(wts, axis); // wts = starting point, axis = search direction + vector<ConvexHull> envs(1); + envs[0] = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); + + vector<vector<WordID> > mr(4); + TD::ConvertSentence("untitled", &mr[0]); + TD::ConvertSentence("with no title", &mr[1]); + TD::ConvertSentence("without a title", &mr[2]); + TD::ConvertSentence("without title", &mr[3]); + EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(mr); + vector<ErrorSurface> es(1); + ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); +} + diff --git a/training/dpmert/mert_geometry.cc b/training/dpmert/mert_geometry.cc new file mode 100644 index 00000000..d6973658 --- /dev/null +++ b/training/dpmert/mert_geometry.cc @@ -0,0 +1,185 @@ +#include "mert_geometry.h" + +#include <cassert> +#include <limits> + +using namespace std; + +ConvexHull::ConvexHull(int i) { + if (i == 0) { + // do nothing - <> + } else if (i == 1) { + points.push_back(boost::shared_ptr<MERTPoint>(new MERTPoint(0, 0, 0, boost::shared_ptr<MERTPoint>(), boost::shared_ptr<MERTPoint>()))); + assert(this->IsMultiplicativeIdentity()); + } else { + cerr << "Only can create ConvexHull semiring 0 and 1 with this constructor!\n"; + abort(); + } +} + +const ConvexHull ConvexHullWeightFunction::operator()(const Hypergraph::Edge& e) const { + const double m = direction.dot(e.feature_values_); + const double b = origin.dot(e.feature_values_); + MERTPoint* point = new MERTPoint(m, b, e); + return ConvexHull(1, point); +} + +ostream& operator<<(ostream& os, const ConvexHull& env) { + os << '<'; + const vector<boost::shared_ptr<MERTPoint> >& points = env.GetSortedSegs(); + for (int i = 0; i < points.size(); ++i) + os << (i==0 ? 
"" : "|") << "x=" << points[i]->x << ",b=" << points[i]->b << ",m=" << points[i]->m << ",p1=" << points[i]->p1 << ",p2=" << points[i]->p2; + return os << '>'; +} + +#define ORIGINAL_MERT_IMPLEMENTATION 1 +#ifdef ORIGINAL_MERT_IMPLEMENTATION + +struct SlopeCompare { + bool operator() (const boost::shared_ptr<MERTPoint>& a, const boost::shared_ptr<MERTPoint>& b) const { + return a->m < b->m; + } +}; + +const ConvexHull& ConvexHull::operator+=(const ConvexHull& other) { + if (!other.is_sorted) other.Sort(); + if (points.empty()) { + points = other.points; + return *this; + } + is_sorted = false; + int j = points.size(); + points.resize(points.size() + other.points.size()); + for (int i = 0; i < other.points.size(); ++i) + points[j++] = other.points[i]; + assert(j == points.size()); + return *this; +} + +void ConvexHull::Sort() const { + sort(points.begin(), points.end(), SlopeCompare()); + const int k = points.size(); + int j = 0; + for (int i = 0; i < k; ++i) { + MERTPoint l = *points[i]; + l.x = kMinusInfinity; + // cerr << "m=" << l.m << endl; + if (0 < j) { + if (points[j-1]->m == l.m) { // lines are parallel + if (l.b <= points[j-1]->b) continue; + --j; + } + while(0 < j) { + l.x = (l.b - points[j-1]->b) / (points[j-1]->m - l.m); + if (points[j-1]->x < l.x) break; + --j; + } + if (0 == j) l.x = kMinusInfinity; + } + *points[j++] = l; + } + points.resize(j); + is_sorted = true; +} + +const ConvexHull& ConvexHull::operator*=(const ConvexHull& other) { + if (other.IsMultiplicativeIdentity()) { return *this; } + if (this->IsMultiplicativeIdentity()) { (*this) = other; return *this; } + + if (!is_sorted) Sort(); + if (!other.is_sorted) other.Sort(); + + if (this->IsEdgeEnvelope()) { +// if (other.size() > 1) +// cerr << *this << " (TIMES) " << other << endl; + boost::shared_ptr<MERTPoint> edge_parent = points[0]; + const double& edge_b = edge_parent->b; + const double& edge_m = edge_parent->m; + points.clear(); + for (int i = 0; i < other.points.size(); ++i) { + const MERTPoint& p = *other.points[i]; + const double m = p.m + edge_m; + const double b = p.b + edge_b; + const double& x = p.x; // x's don't change with * + points.push_back(boost::shared_ptr<MERTPoint>(new MERTPoint(x, m, b, edge_parent, other.points[i]))); + assert(points.back()->p1->edge); + } +// if (other.size() > 1) +// cerr << " = " << *this << endl; + } else { + vector<boost::shared_ptr<MERTPoint> > new_points; + int this_i = 0; + int other_i = 0; + const int this_size = points.size(); + const int other_size = other.points.size(); + double cur_x = kMinusInfinity; // moves from left to right across the + // real numbers, stopping for all inter- + // sections + double this_next_val = (1 < this_size ? points[1]->x : kPlusInfinity); + double other_next_val = (1 < other_size ? other.points[1]->x : kPlusInfinity); + while (this_i < this_size && other_i < other_size) { + const MERTPoint& this_point = *points[this_i]; + const MERTPoint& other_point= *other.points[other_i]; + const double m = this_point.m + other_point.m; + const double b = this_point.b + other_point.b; + + new_points.push_back(boost::shared_ptr<MERTPoint>(new MERTPoint(cur_x, m, b, points[this_i], other.points[other_i]))); + int comp = 0; + if (this_next_val < other_next_val) comp = -1; else + if (this_next_val > other_next_val) comp = 1; + if (0 == comp) { // the next values are equal, advance both indices + ++this_i; + ++other_i; + cur_x = this_next_val; // could be other_next_val (they're equal!) + this_next_val = (this_i+1 < this_size ? 
points[this_i+1]->x : kPlusInfinity); + other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity); + } else { // advance the i with the lower x, update cur_x + if (-1 == comp) { + ++this_i; + cur_x = this_next_val; + this_next_val = (this_i+1 < this_size ? points[this_i+1]->x : kPlusInfinity); + } else { + ++other_i; + cur_x = other_next_val; + other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity); + } + } + } + points.swap(new_points); + } + //cerr << "Multiply: result=" << (*this) << endl; + return *this; +} + +// recursively construct translation +void MERTPoint::ConstructTranslation(vector<WordID>* trans) const { + const MERTPoint* cur = this; + vector<vector<WordID> > ant_trans; + while(!cur->edge) { + ant_trans.resize(ant_trans.size() + 1); + cur->p2->ConstructTranslation(&ant_trans.back()); + cur = cur->p1.get(); + } + size_t ant_size = ant_trans.size(); + vector<const vector<WordID>*> pants(ant_size); + assert(ant_size == cur->edge->tail_nodes_.size()); + --ant_size; + for (int i = 0; i < pants.size(); ++i) pants[ant_size - i] = &ant_trans[i]; + cur->edge->rule_->ESubstitute(pants, trans); +} + +void MERTPoint::CollectEdgesUsed(std::vector<bool>* edges_used) const { + if (edge) { + assert(edge->id_ < edges_used->size()); + (*edges_used)[edge->id_] = true; + } + if (p1) p1->CollectEdgesUsed(edges_used); + if (p2) p2->CollectEdgesUsed(edges_used); +} + +#else + +// THIS IS THE NEW FASTER IMPLEMENTATION OF THE MERT SEMIRING OPERATIONS + +#endif + diff --git a/training/dpmert/mert_geometry.h b/training/dpmert/mert_geometry.h new file mode 100644 index 00000000..a8b6959e --- /dev/null +++ b/training/dpmert/mert_geometry.h @@ -0,0 +1,81 @@ +#ifndef _MERT_GEOMETRY_H_ +#define _MERT_GEOMETRY_H_ + +#include <vector> +#include <iostream> +#include <boost/shared_ptr.hpp> + +#include "hg.h" +#include "sparse_vector.h" + +static const double kMinusInfinity = -std::numeric_limits<double>::infinity(); +static const double kPlusInfinity = std::numeric_limits<double>::infinity(); + +struct MERTPoint { + MERTPoint() : x(), m(), b(), edge() {} + MERTPoint(double _m, double _b) : + x(kMinusInfinity), m(_m), b(_b), edge() {} + MERTPoint(double _x, double _m, double _b, const boost::shared_ptr<MERTPoint>& p1_, const boost::shared_ptr<MERTPoint>& p2_) : + x(_x), m(_m), b(_b), p1(p1_), p2(p2_), edge() {} + MERTPoint(double _m, double _b, const Hypergraph::Edge& edge) : + x(kMinusInfinity), m(_m), b(_b), edge(&edge) {} + + double x; // x intersection with previous segment in env, or -inf if none + double m; // this line's slope + double b; // intercept with y-axis + + // we keep a pointer to the "parents" of this segment so we can reconstruct + // the Viterbi translation corresponding to this segment + boost::shared_ptr<MERTPoint> p1; + boost::shared_ptr<MERTPoint> p2; + + // only MERTPoints created from an edge using the ConvexHullWeightFunction + // have rules + // TRulePtr rule; + const Hypergraph::Edge* edge; + + // recursively recover the Viterbi translation that will result from setting + // the weights to origin + axis * x, where x is any value from this->x up + // until the next largest x in the containing ConvexHull + void ConstructTranslation(std::vector<WordID>* trans) const; + void CollectEdgesUsed(std::vector<bool>* edges_used) const; +}; + +// this is the semiring value type, +// it defines constructors for 0, 1, and the operations + and * +struct ConvexHull { + // create semiring zero + ConvexHull() : is_sorted(true) {} // zero + 
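+ // Reading the semiring (a sketch, matching the constructors in
+ // mert_geometry.cc): a ConvexHull is the upper envelope of lines
+ // y = m*x + b, one line per derivation, where x is the step taken along
+ // the search direction. The additive identity is the empty envelope <>
+ // (no derivations); the multiplicative identity is the single line with
+ // m == 0, b == 0 and no parents, so composing with it adds nothing to any
+ // line's slope or intercept -- exactly what IsMultiplicativeIdentity()
+ // below checks.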
// for debugging: + ConvexHull(const std::vector<boost::shared_ptr<MERTPoint> >& s) : points(s) { Sort(); } + // create semiring 1 or 0 + explicit ConvexHull(int i); + ConvexHull(int n, MERTPoint* point) : is_sorted(true), points(n, boost::shared_ptr<MERTPoint>(point)) {} + const ConvexHull& operator+=(const ConvexHull& other); + const ConvexHull& operator*=(const ConvexHull& other); + bool IsMultiplicativeIdentity() const { + return size() == 1 && (points[0]->b == 0.0 && points[0]->m == 0.0) && (!points[0]->edge) && (!points[0]->p1) && (!points[0]->p2); } + const std::vector<boost::shared_ptr<MERTPoint> >& GetSortedSegs() const { + if (!is_sorted) Sort(); + return points; + } + size_t size() const { return points.size(); } + + private: + bool IsEdgeEnvelope() const { + return points.size() == 1 && points[0]->edge; } + void Sort() const; + mutable bool is_sorted; + mutable std::vector<boost::shared_ptr<MERTPoint> > points; +}; +std::ostream& operator<<(std::ostream& os, const ConvexHull& env); + +struct ConvexHullWeightFunction { + ConvexHullWeightFunction(const SparseVector<double>& ori, + const SparseVector<double>& dir) : origin(ori), direction(dir) {} + const ConvexHull operator()(const Hypergraph::Edge& e) const; + const SparseVector<double> origin; + const SparseVector<double> direction; +}; + +#endif diff --git a/training/dpmert/mr_dpmert_generate_mapper_input.cc b/training/dpmert/mr_dpmert_generate_mapper_input.cc new file mode 100644 index 00000000..199cd23a --- /dev/null +++ b/training/dpmert/mr_dpmert_generate_mapper_input.cc @@ -0,0 +1,81 @@ +#include <iostream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "filelib.h" +#include "weights.h" +#include "line_optimizer.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("dev_set_size,s",po::value<unsigned>(),"[REQD] Development set size (# of parallel sentences)") + ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository") + ("weights,w",po::value<string>(),"[REQD] Current feature weights file") + ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") + ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (conf->count("dev_set_size") == 0) { + cerr << "Please specify the size of the development set using -s N\n"; + flag = true; + } + if (conf->count("weights") == 0) { + cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n"; + flag = true; + } + if (conf->count("forest_repository") == 0) { + cerr << "Please specify the forest repository location using -r <DIR>\n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + RandomNumberGenerator<boost::mt19937> rng; + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + vector<string> features; + SparseVector<weight_t> origin; + vector<weight_t> w; + Weights::InitFromFile(conf["weights"].as<string>(), &w, &features); + 
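+ // Each line written by this tool (see the nested loop at the end of main)
+ // pairs one dev-set forest with one line search:
+ //   <forest_repository>/<sent_id>.json.gz <sent_id> <origin> <direction>
+ // where the sparse vectors are serialized as Feature=Value pairs joined
+ // by ';'. A sketch with hypothetical feature names:
+ //   forests/3.json.gz 3 Glue=-0.2;LanguageModel=1.1 Glue=0.5;PhraseModel_0=-0.3
+ // This is the work-item format that mr_dpmert_map.cc parses back in.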
Weights::InitSparseVector(w, &origin); + const string forest_repository = conf["forest_repository"].as<string>(); + if (!DirectoryExists(forest_repository)) { + cerr << "Forest repository directory " << forest_repository << " not found!\n"; + return 1; + } + if (conf.count("optimize_feature") > 0) + features=conf["optimize_feature"].as<vector<string> >(); + vector<SparseVector<weight_t> > directions; + vector<int> fids(features.size()); + for (unsigned i = 0; i < features.size(); ++i) + fids[i] = FD::Convert(features[i]); + LineOptimizer::CreateOptimizationDirections( + fids, + conf["random_directions"].as<unsigned int>(), + &rng, + &directions); + unsigned dev_set_size = conf["dev_set_size"].as<unsigned>(); + for (unsigned i = 0; i < dev_set_size; ++i) { + for (unsigned j = 0; j < directions.size(); ++j) { + cout << forest_repository << '/' << i << ".json.gz " << i << ' '; + print(cout, origin, "=", ";"); + cout << ' '; + print(cout, directions[j], "=", ";"); + cout << endl; + } + } + return 0; +} diff --git a/training/dpmert/mr_dpmert_map.cc b/training/dpmert/mr_dpmert_map.cc new file mode 100644 index 00000000..d1efcf96 --- /dev/null +++ b/training/dpmert/mr_dpmert_map.cc @@ -0,0 +1,112 @@ +#include <sstream> +#include <iostream> +#include <fstream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "ns.h" +#include "ns_docscorer.h" +#include "ces.h" +#include "filelib.h" +#include "stringlib.h" +#include "sparse_vector.h" +#include "mert_geometry.h" +#include "inside_outside.h" +#include "error_surface.h" +#include "b64tools.h" +#include "hg_io.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)") + ("source,s",po::value<string>(), "Source file (ignored, except for AER)") + ("evaluation_metric,m",po::value<string>()->default_value("ibm_bleu"), "Evaluation metric being optimized") + ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (!conf->count("reference")) { + cerr << "Please specify one or more references using -r <REF.TXT>\n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +bool ReadSparseVectorString(const string& s, SparseVector<double>* v) { +#if 0 + // this should work, but untested. 
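+ // Sanity check for both branches (hypothetical feature names): parsing
+ // "Glue=1.5;PhraseModel_0=-0.2" should set feature Glue to 1.5 and
+ // PhraseModel_0 to -0.2, whether via the istream branch right below or
+ // via the #else branch, which tokenizes on ';' and then on '='.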
+ std::istringstream i(s); + i>>*v; +#else + vector<string> fields; + Tokenize(s, ';', &fields); + if (fields.empty()) return false; + for (unsigned i = 0; i < fields.size(); ++i) { + vector<string> pair(2); + Tokenize(fields[i], '=', &pair); + if (pair.size() != 2) { + cerr << "Error parsing vector string: " << fields[i] << endl; + return false; + } + v->set_value(FD::Convert(pair[0]), atof(pair[1].c_str())); + } + return true; +#endif +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string evaluation_metric = conf["evaluation_metric"].as<string>(); + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + DocumentScorer ds(metric, conf["reference"].as<vector<string> >()); + cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; + Hypergraph hg; + string last_file; + ReadFile in_read(conf["input"].as<string>()); + istream &in=*in_read.stream(); + while(in) { + string line; + getline(in, line); + if (line.empty()) continue; + istringstream is(line); + int sent_id; + string file, s_origin, s_direction; + // path-to-file (JSON) sent_id starting-point search-direction + is >> file >> sent_id >> s_origin >> s_direction; + SparseVector<double> origin; + ReadSparseVectorString(s_origin, &origin); + SparseVector<double> direction; + ReadSparseVectorString(s_direction, &direction); + // cerr << "File: " << file << "\nDir: " << direction << "\n X: " << origin << endl; + if (last_file != file) { + last_file = file; + ReadFile rf(file); + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + } + const ConvexHullWeightFunction wf(origin, direction); + const ConvexHull hull = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf); + + ErrorSurface es; + ComputeErrorSurface(*ds[sent_id], hull, &es, metric, hg); + //cerr << "Viterbi envelope has " << ve.size() << " segments\n"; + // cerr << "Error surface has " << es.size() << " segments\n"; + string val; + es.Serialize(&val); + cout << 'M' << ' ' << s_origin << ' ' << s_direction << '\t'; + B64::b64encode(val.c_str(), val.size(), &cout); + cout << endl << flush; + } + return 0; +} diff --git a/training/dpmert/mr_dpmert_reduce.cc b/training/dpmert/mr_dpmert_reduce.cc new file mode 100644 index 00000000..31512a03 --- /dev/null +++ b/training/dpmert/mr_dpmert_reduce.cc @@ -0,0 +1,77 @@ +#include <sstream> +#include <iostream> +#include <fstream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "sparse_vector.h" +#include "error_surface.h" +#include "line_optimizer.h" +#include "b64tools.h" +#include "stringlib.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("evaluation_metric,m",po::value<string>(), "Evaluation metric (IBM_BLEU, etc.)") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = conf->count("evaluation_metric") == 0; + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string evaluation_metric = conf["evaluation_metric"].as<string>(); + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + 
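+ // Input framing (produced by mr_dpmert_map.cc): each stdin line is
+ // "M <origin> <direction>\t<b64-encoded ErrorSurface>". The key --
+ // everything after the leading "M " up to the tab -- identifies one
+ // search line; the payload is one sentence's error surface. All surfaces
+ // sharing a key are collected in esv and passed to
+ // LineOptimizer::LineOptimize below, which returns the best step x along
+ // that line together with its corpus-level score.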
LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE; + if (metric->IsErrorMetric()) + opt_type = LineOptimizer::MINIMIZE_SCORE; + + vector<ErrorSurface> esv; + string last_key, line, key, val; + while(getline(cin, line)) { + size_t ks = line.find("\t"); + assert(string::npos != ks); + assert(ks > 2); + key = line.substr(2, ks - 2); + val = line.substr(ks + 1); + if (key != last_key) { + if (!last_key.empty()) { + float score; + double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); + cout << last_key << "|" << x << "|" << score << endl; + } + last_key.swap(key); + esv.clear(); + } + if (val.size() % 4 != 0) { + cerr << "B64 encoding error 1! Skipping.\n"; + continue; + } + string encoded(val.size() / 4 * 3, '\0'); + if (!B64::b64decode(reinterpret_cast<const unsigned char*>(&val[0]), val.size(), &encoded[0], encoded.size())) { + cerr << "B64 encoding error 2! Skipping.\n"; + continue; + } + esv.push_back(ErrorSurface()); + esv.back().Deserialize(encoded); + } + if (!esv.empty()) { + float score; + double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); + cout << last_key << "|" << x << "|" << score << endl; + } + return 0; +} diff --git a/training/dpmert/test_aer/README b/training/dpmert/test_aer/README new file mode 100644 index 00000000..819b2e32 --- /dev/null +++ b/training/dpmert/test_aer/README @@ -0,0 +1,8 @@ +To run the test: + +../dist-vest.pl --local --metric aer cdec.ini --source-file corpus.src --ref-files=ref.0 --weights weights + +This will optimize the parameters of the tiny lexical translation model +so as to minimize the AER of the Viterbi alignment on the development +set in corpus.src according to the reference alignments in ref.0. + diff --git a/training/dpmert/test_aer/cdec.ini b/training/dpmert/test_aer/cdec.ini new file mode 100644 index 00000000..08187848 --- /dev/null +++ b/training/dpmert/test_aer/cdec.ini @@ -0,0 +1,3 @@ +formalism=lextrans +grammar=grammar +aligner=true diff --git a/training/dpmert/test_aer/corpus.src b/training/dpmert/test_aer/corpus.src new file mode 100644 index 00000000..31b23971 --- /dev/null +++ b/training/dpmert/test_aer/corpus.src @@ -0,0 +1,3 @@ +el gato negro ||| the black cat +el gato ||| the cat +el libro ||| the book diff --git a/training/dpmert/test_aer/grammar b/training/dpmert/test_aer/grammar new file mode 100644 index 00000000..9d857824 --- /dev/null +++ b/training/dpmert/test_aer/grammar @@ -0,0 +1,12 @@ +el ||| cat ||| F1=1 +el ||| the ||| F2=1 +el ||| black ||| F3=1 +el ||| book ||| F11=1 +gato ||| cat ||| F4=1 NN=1 +gato ||| black ||| F5=1 +gato ||| the ||| F6=1 +negro ||| the ||| F7=1 +negro ||| cat ||| F8=1 +negro ||| black ||| F9=1 +libro ||| the ||| F10=1 +libro ||| book ||| F12=1 NN=1 diff --git a/training/dpmert/test_aer/ref.0 b/training/dpmert/test_aer/ref.0 new file mode 100644 index 00000000..734a9c5b --- /dev/null +++ b/training/dpmert/test_aer/ref.0 @@ -0,0 +1,3 @@ +0-0 1-2 2-1 +0-0 1-1 +0-0 1-1 diff --git a/training/dpmert/test_aer/weights b/training/dpmert/test_aer/weights new file mode 100644 index 00000000..afc9282e --- /dev/null +++ b/training/dpmert/test_aer/weights @@ -0,0 +1,13 @@ +F1 0.1 +F2 -.5980815 +F3 0.24235 +F4 0.625 +F5 0.4514 +F6 0.112316 +F7 -0.123415 +F8 -0.25390285 +F9 -0.23852 +F10 0.646 +F11 0.413141 +F12 0.343216 +NN -0.1215 diff --git a/training/dpmert/test_data/0.json.gz b/training/dpmert/test_data/0.json.gz Binary files differnew file mode 100644 index 00000000..30f8dd77 --- /dev/null +++ b/training/dpmert/test_data/0.json.gz diff --git 
a/training/dpmert/test_data/1.json.gz b/training/dpmert/test_data/1.json.gz Binary files differnew file mode 100644 index 00000000..c82cc179 --- /dev/null +++ b/training/dpmert/test_data/1.json.gz diff --git a/training/dpmert/test_data/c2e.txt.0 b/training/dpmert/test_data/c2e.txt.0 new file mode 100644 index 00000000..12c4abe9 --- /dev/null +++ b/training/dpmert/test_data/c2e.txt.0 @@ -0,0 +1,2 @@ +australia reopens embassy in manila +( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack . diff --git a/training/dpmert/test_data/c2e.txt.1 b/training/dpmert/test_data/c2e.txt.1 new file mode 100644 index 00000000..4ac12df1 --- /dev/null +++ b/training/dpmert/test_data/c2e.txt.1 @@ -0,0 +1,2 @@ +australia reopened manila embassy +( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack . diff --git a/training/dpmert/test_data/c2e.txt.2 b/training/dpmert/test_data/c2e.txt.2 new file mode 100644 index 00000000..2f67b72f --- /dev/null +++ b/training/dpmert/test_data/c2e.txt.2 @@ -0,0 +1,2 @@ +australia to reopen embassy in manila +( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so-called confirmed terrorist attack threats . diff --git a/training/dpmert/test_data/c2e.txt.3 b/training/dpmert/test_data/c2e.txt.3 new file mode 100644 index 00000000..5483cef6 --- /dev/null +++ b/training/dpmert/test_data/c2e.txt.3 @@ -0,0 +1,2 @@ +australia to re - open its embassy to manila +( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so-called " clear " threat of terrorist attack 7 weeks ago . diff --git a/training/dpmert/test_data/re.txt.0 b/training/dpmert/test_data/re.txt.0 new file mode 100644 index 00000000..86eff087 --- /dev/null +++ b/training/dpmert/test_data/re.txt.0 @@ -0,0 +1,5 @@ +erdogan states turkey to reject any pressures to urge it to recognize cyprus +ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara will reject any pressure by the european union to urge it to recognize cyprus . this comes two weeks before the summit of european union state and government heads who will decide whether or nor membership negotiations with ankara should be opened . +erdogan told " ntv " television station that " the european union cannot address us by imposing new conditions on us with regard to cyprus . +we will discuss this dossier in the course of membership negotiations . " +he added " let me be clear , i cannot sidestep turkey , this is something we cannot accept . " diff --git a/training/dpmert/test_data/re.txt.1 b/training/dpmert/test_data/re.txt.1 new file mode 100644 index 00000000..2140f198 --- /dev/null +++ b/training/dpmert/test_data/re.txt.1 @@ -0,0 +1,5 @@ +erdogan confirms turkey will resist any pressure to recognize cyprus +ankara 12 - 1 ( afp ) - the turkish head of government , recep tayyip erdogan , announced today ( wednesday ) that ankara would resist any pressure the european union might exercise in order to force it into recognizing cyprus . 
this comes two weeks before a summit of european union heads of state and government , who will decide whether or not to open membership negotiations with ankara . +erdogan said to the ntv television channel : " the european union cannot engage with us through imposing new conditions on us with regard to cyprus . +we shall discuss this issue in the course of the membership negotiations . " +he added : " let me be clear - i cannot confine turkey . this is something we do not accept . " diff --git a/training/dpmert/test_data/re.txt.2 b/training/dpmert/test_data/re.txt.2 new file mode 100644 index 00000000..94e46286 --- /dev/null +++ b/training/dpmert/test_data/re.txt.2 @@ -0,0 +1,5 @@ +erdogan confirms that turkey will reject any pressures to encourage it to recognize cyprus +ankara , 12 / 1 ( afp ) - the turkish prime minister recep tayyip erdogan declared today , wednesday , that ankara will reject any pressures that the european union may apply on it to encourage to recognize cyprus . this comes two weeks before a summit of the heads of countries and governments of the european union , who will decide on whether or not to start negotiations on joining with ankara . +erdogan told the ntv television station that " it is not possible for the european union to talk to us by imposing new conditions on us regarding cyprus . +we shall discuss this dossier during the negotiations on joining . " +and he added , " let me be clear . turkey's arm should not be twisted ; this is something we cannot accept . " diff --git a/training/dpmert/test_data/re.txt.3 b/training/dpmert/test_data/re.txt.3 new file mode 100644 index 00000000..f87c3308 --- /dev/null +++ b/training/dpmert/test_data/re.txt.3 @@ -0,0 +1,5 @@ +erdogan stresses that turkey will reject all pressures to force it to recognize cyprus +ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara would refuse all pressures applied on it by the european union to force it to recognize cyprus . that came two weeks before the summit of the presidents and prime ministers of the european union , who would decide on whether to open negotiations on joining with ankara or not . +erdogan said to " ntv " tv station that the " european union can not communicate with us by imposing on us new conditions related to cyprus . +we will discuss this file during the negotiations on joining . " +he added , " let me be clear . turkey's arm should not be twisted . this is unacceptable to us . " diff --git a/training/dtrain/Makefile.am b/training/dtrain/Makefile.am new file mode 100644 index 00000000..844c790d --- /dev/null +++ b/training/dtrain/Makefile.am @@ -0,0 +1,7 @@ +bin_PROGRAMS = dtrain + +dtrain_SOURCES = dtrain.cc score.cc dtrain.h kbestget.h ksampler.h pairsampling.h score.h +dtrain_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval + diff --git a/training/dtrain/README.md b/training/dtrain/README.md new file mode 100644 index 00000000..2ab2f232 --- /dev/null +++ b/training/dtrain/README.md @@ -0,0 +1,30 @@ +This is a simple (and parallelizable) tuning method for cdec +which is able to train the weights of very many (sparse) features. 
+It was used here: + "Joint Feature Selection in Distributed Stochastic + Learning for Large-Scale Discriminative Training in + SMT" +(Simianer, Riezler, Dyer; ACL 2012) + + +Building +-------- +dtrain is built as part of the regular cdec build; see ../BUILDING . +To build only the parts needed for dtrain, do: +``` + autoreconf -ifv + ./configure + cd training/dtrain/; make +``` + +Running +------- +See directories under test/ . + +Legal +----- +Copyright (c) 2012-2013 by Patrick Simianer <p@simianer.de> + +See the file LICENSE.txt in the root folder for the licensing terms that this software is +released under. + diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc new file mode 100644 index 00000000..149f87d4 --- /dev/null +++ b/training/dtrain/dtrain.cc @@ -0,0 +1,553 @@ +#include "dtrain.h" + + +bool +dtrain_init(int argc, char** argv, po::variables_map* cfg) +{ + po::options_description ini("Configuration File Options"); + ini.add_options() + ("input", po::value<string>()->default_value("-"), "input file (src)") + ("refs,r", po::value<string>(), "references") + ("output", po::value<string>()->default_value("-"), "output weights file, '-' for STDOUT") + ("input_weights", po::value<string>(), "input weights file (e.g. from previous iteration)") + ("decoder_config", po::value<string>(), "configuration file for cdec") + ("print_weights", po::value<string>(), "weights to print on each iteration") + ("stop_after", po::value<unsigned>()->default_value(0), "stop after X input sentences") + ("keep", po::value<bool>()->zero_tokens(), "keep weights files for each iteration") + ("epochs", po::value<unsigned>()->default_value(10), "# of iterations T (per shard)") + ("k", po::value<unsigned>()->default_value(100), "how many translations to sample") + ("sample_from", po::value<string>()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'") + ("filter", po::value<string>()->default_value("uniq"), "filter kbest list: 'not', 'uniq'") + ("pair_sampling", po::value<string>()->default_value("XYX"), "how to sample pairs: 'all', 'XYX' or 'PRO'") + ("hi_lo", po::value<float>()->default_value(0.1), "hi and lo (X) for XYX (default 0.1), 0.01 <= X <= 0.5") + ("pair_threshold", po::value<score_t>()->default_value(0.), "bleu [0,1] threshold to filter pairs") + ("N", po::value<unsigned>()->default_value(4), "N for Ngrams (BLEU)") + ("scorer", po::value<string>()->default_value("stupid_bleu"), "scoring: bleu, stupid_, smooth_, approx_, lc_") + ("learning_rate", po::value<weight_t>()->default_value(1.0), "learning rate") + ("gamma", po::value<weight_t>()->default_value(0.), "gamma for SVM (0 for perceptron)") + ("select_weights", po::value<string>()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)") + ("rescale", po::value<bool>()->zero_tokens(), "rescale weight vector after each input") + ("l1_reg", po::value<string>()->default_value("none"), "apply l1 regularization as in 'Tsuruoka et al.' (2009) UNTESTED") + ("l1_reg_strength", po::value<weight_t>(), "l1 regularization strength") + ("fselect", po::value<weight_t>()->default_value(-1), "select top x percent (or by threshold) of features after each epoch NOT IMPLEMENTED") // TODO + ("approx_bleu_d", po::value<score_t>()->default_value(0.9), "discount for approx. 
BLEU") + ("scale_bleu_diff", po::value<bool>()->zero_tokens(), "learning rate <- bleu diff of a misranked pair") + ("loss_margin", po::value<weight_t>()->default_value(0.), "update if no error in pref pair but model scores this near") + ("max_pairs", po::value<unsigned>()->default_value(std::numeric_limits<unsigned>::max()), "max. # of pairs per Sent.") + ("noup", po::value<bool>()->zero_tokens(), "do not update weights"); + po::options_description cl("Command Line Options"); + cl.add_options() + ("config,c", po::value<string>(), "dtrain config file") + ("quiet,q", po::value<bool>()->zero_tokens(), "be quiet") + ("verbose,v", po::value<bool>()->zero_tokens(), "be verbose"); + cl.add(ini); + po::store(parse_command_line(argc, argv, cl), *cfg); + if (cfg->count("config")) { + ifstream ini_f((*cfg)["config"].as<string>().c_str()); + po::store(po::parse_config_file(ini_f, ini), *cfg); + } + po::notify(*cfg); + if (!cfg->count("decoder_config")) { + cerr << cl << endl; + return false; + } + if ((*cfg)["sample_from"].as<string>() != "kbest" + && (*cfg)["sample_from"].as<string>() != "forest") { + cerr << "Wrong 'sample_from' param: '" << (*cfg)["sample_from"].as<string>() << "', use 'kbest' or 'forest'." << endl; + return false; + } + if ((*cfg)["sample_from"].as<string>() == "kbest" && (*cfg)["filter"].as<string>() != "uniq" && + (*cfg)["filter"].as<string>() != "not") { + cerr << "Wrong 'filter' param: '" << (*cfg)["filter"].as<string>() << "', use 'uniq' or 'not'." << endl; + return false; + } + if ((*cfg)["pair_sampling"].as<string>() != "all" && (*cfg)["pair_sampling"].as<string>() != "XYX" && + (*cfg)["pair_sampling"].as<string>() != "PRO") { + cerr << "Wrong 'pair_sampling' param: '" << (*cfg)["pair_sampling"].as<string>() << "'." << endl; + return false; + } + if(cfg->count("hi_lo") && (*cfg)["pair_sampling"].as<string>() != "XYX") { + cerr << "Warning: hi_lo only works with pair_sampling XYX." << endl; + } + if((*cfg)["hi_lo"].as<float>() > 0.5 || (*cfg)["hi_lo"].as<float>() < 0.01) { + cerr << "hi_lo must lie in [0.01, 0.5]" << endl; + return false; + } + if ((*cfg)["pair_threshold"].as<score_t>() < 0) { + cerr << "The threshold must be >= 0!" << endl; + return false; + } + if ((*cfg)["select_weights"].as<string>() != "last" && (*cfg)["select_weights"].as<string>() != "best" && + (*cfg)["select_weights"].as<string>() != "avg" && (*cfg)["select_weights"].as<string>() != "VOID") { + cerr << "Wrong 'select_weights' param: '" << (*cfg)["select_weights"].as<string>() << "', use 'last' or 'best'." 
<< endl; + return false; + } + return true; +} + +int +main(int argc, char** argv) +{ + // handle most parameters + po::variables_map cfg; + if (!dtrain_init(argc, argv, &cfg)) exit(1); // something is wrong + bool quiet = false; + if (cfg.count("quiet")) quiet = true; + bool verbose = false; + if (cfg.count("verbose")) verbose = true; + bool noup = false; + if (cfg.count("noup")) noup = true; + bool rescale = false; + if (cfg.count("rescale")) rescale = true; + bool keep = false; + if (cfg.count("keep")) keep = true; + + const unsigned k = cfg["k"].as<unsigned>(); + const unsigned N = cfg["N"].as<unsigned>(); + const unsigned T = cfg["epochs"].as<unsigned>(); + const unsigned stop_after = cfg["stop_after"].as<unsigned>(); + const string filter_type = cfg["filter"].as<string>(); + const string sample_from = cfg["sample_from"].as<string>(); + const string pair_sampling = cfg["pair_sampling"].as<string>(); + const score_t pair_threshold = cfg["pair_threshold"].as<score_t>(); + const string select_weights = cfg["select_weights"].as<string>(); + const float hi_lo = cfg["hi_lo"].as<float>(); + const score_t approx_bleu_d = cfg["approx_bleu_d"].as<score_t>(); + const unsigned max_pairs = cfg["max_pairs"].as<unsigned>(); + weight_t loss_margin = cfg["loss_margin"].as<weight_t>(); + if (loss_margin > 9998.) loss_margin = std::numeric_limits<float>::max(); + bool scale_bleu_diff = false; + if (cfg.count("scale_bleu_diff")) scale_bleu_diff = true; + bool average = false; + if (select_weights == "avg") + average = true; + vector<string> print_weights; + if (cfg.count("print_weights")) + boost::split(print_weights, cfg["print_weights"].as<string>(), boost::is_any_of(" ")); + + + // setup decoder + register_feature_functions(); + SetSilent(true); + ReadFile ini_rf(cfg["decoder_config"].as<string>()); + if (!quiet) + cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl; + Decoder decoder(ini_rf.stream()); + + // scoring metric/scorer + string scorer_str = cfg["scorer"].as<string>(); + LocalScorer* scorer; + if (scorer_str == "bleu") { + scorer = dynamic_cast<BleuScorer*>(new BleuScorer); + } else if (scorer_str == "stupid_bleu") { + scorer = dynamic_cast<StupidBleuScorer*>(new StupidBleuScorer); + } else if (scorer_str == "fixed_stupid_bleu") { + scorer = dynamic_cast<FixedStupidBleuScorer*>(new FixedStupidBleuScorer); + } else if (scorer_str == "smooth_bleu") { + scorer = dynamic_cast<SmoothBleuScorer*>(new SmoothBleuScorer); + } else if (scorer_str == "sum_bleu") { + scorer = dynamic_cast<SumBleuScorer*>(new SumBleuScorer); + } else if (scorer_str == "sumexp_bleu") { + scorer = dynamic_cast<SumExpBleuScorer*>(new SumExpBleuScorer); + } else if (scorer_str == "sumwhatever_bleu") { + scorer = dynamic_cast<SumWhateverBleuScorer*>(new SumWhateverBleuScorer); + } else if (scorer_str == "approx_bleu") { + scorer = dynamic_cast<ApproxBleuScorer*>(new ApproxBleuScorer(N, approx_bleu_d)); + } else if (scorer_str == "lc_bleu") { + scorer = dynamic_cast<LinearBleuScorer*>(new LinearBleuScorer(N)); + } else { + cerr << "Don't know scoring metric: '" << scorer_str << "', exiting." 
<< endl; + exit(1); + } + vector<score_t> bleu_weights; + scorer->Init(N, bleu_weights); + + // setup decoder observer + MT19937 rng; // random number generator, only for forest sampling + HypSampler* observer; + if (sample_from == "kbest") + observer = dynamic_cast<KBestGetter*>(new KBestGetter(k, filter_type)); + else + observer = dynamic_cast<KSampler*>(new KSampler(k, &rng)); + observer->SetScorer(scorer); + + // init weights + vector<weight_t>& dense_weights = decoder.CurrentWeightVector(); + SparseVector<weight_t> lambdas, cumulative_penalties, w_average; + if (cfg.count("input_weights")) Weights::InitFromFile(cfg["input_weights"].as<string>(), &dense_weights); + Weights::InitSparseVector(dense_weights, &lambdas); + + // meta params for perceptron, SVM + weight_t eta = cfg["learning_rate"].as<weight_t>(); + weight_t gamma = cfg["gamma"].as<weight_t>(); + + // faster perceptron: consider only misranked pairs, see + // DO NOT ENABLE WITH SVM (gamma > 0) OR loss_margin! + bool faster_perceptron = false; + if (gamma==0 && loss_margin==0) faster_perceptron = true; + + // l1 regularization + bool l1naive = false; + bool l1clip = false; + bool l1cumul = false; + weight_t l1_reg = 0; + if (cfg["l1_reg"].as<string>() != "none") { + string s = cfg["l1_reg"].as<string>(); + if (s == "naive") l1naive = true; + else if (s == "clip") l1clip = true; + else if (s == "cumul") l1cumul = true; + l1_reg = cfg["l1_reg_strength"].as<weight_t>(); + } + + // output + string output_fn = cfg["output"].as<string>(); + // input + string input_fn = cfg["input"].as<string>(); + ReadFile input(input_fn); + // buffer input for t > 0 + vector<string> src_str_buf; // source strings (decoder takes only strings) + vector<vector<WordID> > ref_ids_buf; // references as WordID vecs + string refs_fn = cfg["refs"].as<string>(); + ReadFile refs(refs_fn); + + unsigned in_sz = std::numeric_limits<unsigned>::max(); // input index, input size + vector<pair<score_t, score_t> > all_scores; + score_t max_score = 0.; + unsigned best_it = 0; + float overall_time = 0.; + + // output cfg + if (!quiet) { + cerr << _p5; + cerr << endl << "dtrain" << endl << "Parameters:" << endl; + cerr << setw(25) << "k " << k << endl; + cerr << setw(25) << "N " << N << endl; + cerr << setw(25) << "T " << T << endl; + cerr << setw(26) << "scorer '" << scorer_str << "'" << endl; + if (scorer_str == "approx_bleu") + cerr << setw(25) << "approx. 
B discount " << approx_bleu_d << endl; + cerr << setw(25) << "sample from " << "'" << sample_from << "'" << endl; + if (sample_from == "kbest") + cerr << setw(25) << "filter " << "'" << filter_type << "'" << endl; + if (!scale_bleu_diff) cerr << setw(25) << "learning rate " << eta << endl; + else cerr << setw(25) << "learning rate " << "bleu diff" << endl; + cerr << setw(25) << "gamma " << gamma << endl; + cerr << setw(25) << "loss margin " << loss_margin << endl; + cerr << setw(25) << "faster perceptron " << faster_perceptron << endl; + cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl; + if (pair_sampling == "XYX") + cerr << setw(25) << "hi lo " << hi_lo << endl; + cerr << setw(25) << "pair threshold " << pair_threshold << endl; + cerr << setw(25) << "select weights " << "'" << select_weights << "'" << endl; + if (cfg.count("l1_reg")) + cerr << setw(25) << "l1 reg " << l1_reg << " '" << cfg["l1_reg"].as<string>() << "'" << endl; + if (rescale) + cerr << setw(25) << "rescale " << rescale << endl; + cerr << setw(25) << "max pairs " << max_pairs << endl; + cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl; + cerr << setw(25) << "input " << "'" << input_fn << "'" << endl; + cerr << setw(25) << "refs " << "'" << refs_fn << "'" << endl; + cerr << setw(25) << "output " << "'" << output_fn << "'" << endl; + if (cfg.count("input_weights")) + cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as<string>() << "'" << endl; + if (stop_after > 0) + cerr << setw(25) << "stop_after " << stop_after << endl; + if (!verbose) cerr << "(a dot represents " << DTRAIN_DOTS << " inputs)" << endl; + } + + + for (unsigned t = 0; t < T; t++) // T epochs + { + + time_t start, end; + time(&start); + score_t score_sum = 0.; + score_t model_sum(0); + unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0, f_count = 0, list_sz = 0; + if (!quiet) cerr << "Iteration #" << t+1 << " of " << T << "." << endl; + + while(true) + { + + string in; + bool next = false, stop = false; // next iteration or premature stop + if (t == 0) { + if(!getline(*input, in)) next = true; + } else { + if (ii == in_sz) next = true; // stop if we reach the end of our input + } + // stop after X sentences (but still go on for those) + if (stop_after > 0 && stop_after == ii && !next) stop = true; + + // produce some pretty output + if (!quiet && !verbose) { + if (ii == 0) cerr << " "; + if ((ii+1) % (DTRAIN_DOTS) == 0) { + cerr << "."; + cerr.flush(); + } + if ((ii+1) % (20*DTRAIN_DOTS) == 0) { + cerr << " " << ii+1 << endl; + if (!next && !stop) cerr << " "; + } + if (stop) { + if (ii % (20*DTRAIN_DOTS) != 0) cerr << " " << ii << endl; + cerr << "Stopping after " << stop_after << " input sentences." 
<< endl; + } else { + if (next) { + if (ii % (20*DTRAIN_DOTS) != 0) cerr << " " << ii << endl; + } + } + } + + // next iteration + if (next || stop) break; + + // weights + lambdas.init_vector(&dense_weights); + + // getting input + vector<WordID> ref_ids; // reference as vector<WordID> + if (t == 0) { + string r_; + getline(*refs, r_); + vector<string> ref_tok; + boost::split(ref_tok, r_, boost::is_any_of(" ")); + register_and_convert(ref_tok, ref_ids); + ref_ids_buf.push_back(ref_ids); + src_str_buf.push_back(in); + } else { + ref_ids = ref_ids_buf[ii]; + } + observer->SetRef(ref_ids); + if (t == 0) + decoder.Decode(in, observer); + else + decoder.Decode(src_str_buf[ii], observer); + + // get (scored) samples + vector<ScoredHyp>* samples = observer->GetSamples(); + + if (verbose) { + cerr << "--- ref for " << ii << ": "; + if (t > 0) printWordIDVec(ref_ids_buf[ii]); + else printWordIDVec(ref_ids); + cerr << endl; + for (unsigned u = 0; u < samples->size(); u++) { + cerr << _p2 << _np << "[" << u << ". '"; + printWordIDVec((*samples)[u].w); + cerr << "'" << endl; + cerr << "SCORE=" << (*samples)[u].score << ",model="<< (*samples)[u].model << endl; + cerr << "F{" << (*samples)[u].f << "} ]" << endl << endl; + } + } + + score_sum += (*samples)[0].score; // stats for 1best + model_sum += (*samples)[0].model; + + f_count += observer->get_f_count(); + list_sz += observer->get_sz(); + + // weight updates + if (!noup) { + // get pairs + vector<pair<ScoredHyp,ScoredHyp> > pairs; + if (pair_sampling == "all") + all_pairs(samples, pairs, pair_threshold, max_pairs, faster_perceptron); + if (pair_sampling == "XYX") + partXYX(samples, pairs, pair_threshold, max_pairs, faster_perceptron, hi_lo); + if (pair_sampling == "PRO") + PROsampling(samples, pairs, pair_threshold, max_pairs); + npairs += pairs.size(); + + for (vector<pair<ScoredHyp,ScoredHyp> >::iterator it = pairs.begin(); + it != pairs.end(); it++) { + bool rank_error; + score_t margin; + if (faster_perceptron) { // we only have considering misranked pairs + rank_error = true; // pair sampling already did this for us + margin = std::numeric_limits<float>::max(); + } else { + rank_error = it->first.model <= it->second.model; + margin = fabs(fabs(it->first.model) - fabs(it->second.model)); + if (!rank_error && margin < loss_margin) margin_violations++; + } + if (rank_error) rank_errors++; + if (scale_bleu_diff) eta = it->first.score - it->second.score; + if (rank_error || margin < loss_margin) { + SparseVector<weight_t> diff_vec = it->first.f - it->second.f; + lambdas.plus_eq_v_times_s(diff_vec, eta); + if (gamma) + lambdas.plus_eq_v_times_s(lambdas, -2*gamma*eta*(1./npairs)); + } + } + + // l1 regularization + // please note that this penalizes _all_ weights + // (contrary to only the ones changed by the last update) + // after a _sentence_ (not after each example/pair) + if (l1naive) { + FastSparseVector<weight_t>::iterator it = lambdas.begin(); + for (; it != lambdas.end(); ++it) { + it->second -= sign(it->second) * l1_reg; + } + } else if (l1clip) { + FastSparseVector<weight_t>::iterator it = lambdas.begin(); + for (; it != lambdas.end(); ++it) { + if (it->second != 0) { + weight_t v = it->second; + if (v > 0) { + it->second = max(0., v - l1_reg); + } else { + it->second = min(0., v + l1_reg); + } + } + } + } else if (l1cumul) { + weight_t acc_penalty = (ii+1) * l1_reg; // ii is the index of the current input + FastSparseVector<weight_t>::iterator it = lambdas.begin(); + for (; it != lambdas.end(); ++it) { + if (it->second != 0) { + weight_t 
v = it->second; + weight_t penalized = 0.; + if (v > 0) { + penalized = max(0., v-(acc_penalty + cumulative_penalties.get(it->first))); + } else { + penalized = min(0., v+(acc_penalty - cumulative_penalties.get(it->first))); + } + it->second = penalized; + cumulative_penalties.set_value(it->first, cumulative_penalties.get(it->first)+penalized); + } + } + } + + } + + if (rescale) lambdas /= lambdas.l2norm(); + + ++ii; + + } // input loop + + if (average) w_average += lambdas; + + if (scorer_str == "approx_bleu" || scorer_str == "lc_bleu") scorer->Reset(); + + if (t == 0) { + in_sz = ii; // remember size of input (# lines) + } + + // print some stats + score_t score_avg = score_sum/(score_t)in_sz; + score_t model_avg = model_sum/(score_t)in_sz; + score_t score_diff, model_diff; + if (t > 0) { + score_diff = score_avg - all_scores[t-1].first; + model_diff = model_avg - all_scores[t-1].second; + } else { + score_diff = score_avg; + model_diff = model_avg; + } + + unsigned nonz = 0; + if (!quiet) nonz = (unsigned)lambdas.num_nonzero(); + + if (!quiet) { + cerr << _p5 << _p << "WEIGHTS" << endl; + for (vector<string>::iterator it = print_weights.begin(); it != print_weights.end(); it++) { + cerr << setw(18) << *it << " = " << lambdas.get(FD::Convert(*it)) << endl; + } + cerr << " ---" << endl; + cerr << _np << " 1best avg score: " << score_avg; + cerr << _p << " (" << score_diff << ")" << endl; + cerr << _np << " 1best avg model score: " << model_avg; + cerr << _p << " (" << model_diff << ")" << endl; + cerr << " avg # pairs: "; + cerr << _np << npairs/(float)in_sz; + if (faster_perceptron) cerr << " (meaningless)"; + cerr << endl; + cerr << " avg # rank err: "; + cerr << rank_errors/(float)in_sz << endl; + cerr << " avg # margin viol: "; + cerr << margin_violations/(float)in_sz << endl; + cerr << " non0 feature count: " << nonz << endl; + cerr << " avg list sz: " << list_sz/(float)in_sz << endl; + cerr << " avg f count: " << f_count/(float)list_sz << endl; + } + + pair<score_t,score_t> remember; + remember.first = score_avg; + remember.second = model_avg; + all_scores.push_back(remember); + if (score_avg > max_score) { + max_score = score_avg; + best_it = t; + } + time (&end); + float time_diff = difftime(end, start); + overall_time += time_diff; + if (!quiet) { + cerr << _p2 << _np << "(time " << time_diff/60. << " min, "; + cerr << time_diff/in_sz << " s/S)" << endl; + } + if (t+1 != T && !quiet) cerr << endl; + + if (noup) break; + + // write weights to file + if (select_weights == "best" || keep) { + lambdas.init_vector(&dense_weights); + string w_fn = "weights." + boost::lexical_cast<string>(t) + ".gz"; + Weights::WriteToFile(w_fn, dense_weights, true); + } + + } // outer loop + + if (average) w_average /= (weight_t)T; + + if (!noup) { + if (!quiet) cerr << endl << "Writing weights file to '" << output_fn << "' ..." 
<< endl; + if (select_weights == "last" || average) { // last, average + WriteFile of(output_fn); // works with '-' + ostream& o = *of.stream(); + o.precision(17); + o << _np; + if (average) { + for (SparseVector<weight_t>::iterator it = w_average.begin(); it != w_average.end(); ++it) { + if (it->second == 0) continue; + o << FD::Convert(it->first) << '\t' << it->second << endl; + } + } else { + for (SparseVector<weight_t>::iterator it = lambdas.begin(); it != lambdas.end(); ++it) { + if (it->second == 0) continue; + o << FD::Convert(it->first) << '\t' << it->second << endl; + } + } + } else if (select_weights == "VOID") { // do nothing with the weights + } else { // best + if (output_fn != "-") { + CopyFile("weights."+boost::lexical_cast<string>(best_it)+".gz", output_fn); + } else { + ReadFile bestw("weights."+boost::lexical_cast<string>(best_it)+".gz"); + string o; + cout.precision(17); + cout << _np; + while(getline(*bestw, o)) cout << o << endl; + } + if (!keep) { + for (unsigned i = 0; i < T; i++) { + string s = "weights." + boost::lexical_cast<string>(i) + ".gz"; + unlink(s.c_str()); + } + } + } + if (!quiet) cerr << "done" << endl; + } + + if (!quiet) { + cerr << _p5 << _np << endl << "---" << endl << "Best iteration: "; + cerr << best_it+1 << " [SCORE '" << scorer_str << "'=" << max_score << "]." << endl; + cerr << "This took " << overall_time/60. << " min." << endl; + } +} + diff --git a/training/dtrain/dtrain.h b/training/dtrain/dtrain.h new file mode 100644 index 00000000..eb0b9f17 --- /dev/null +++ b/training/dtrain/dtrain.h @@ -0,0 +1,92 @@ +#ifndef _DTRAIN_H_ +#define _DTRAIN_H_ + +#define DTRAIN_DOTS 10 // after how many inputs to display a '.' +#define DTRAIN_SCALE 100000 + +#include <iomanip> +#include <climits> +#include <string.h> +#include <stdlib.h> // mkstemp +#include <unistd.h> // unlink + +#include <boost/algorithm/string.hpp> +#include <boost/program_options.hpp> + +#include "ksampler.h" +#include "pairsampling.h" + +#include "filelib.h" + + +using namespace std; +using namespace dtrain; +namespace po = boost::program_options; + +inline void register_and_convert(const vector<string>& strs, vector<WordID>& ids) +{ + vector<string>::const_iterator it; + for (it = strs.begin(); it < strs.end(); it++) + ids.push_back(TD::Convert(*it)); +} + +inline string gettmpf(const string path, const string infix) +{ + char fn[path.size() + infix.size() + 9]; // path + "/" + infix + "-XXXXXX" + '\0' + strcpy(fn, path.c_str()); + strcat(fn, "/"); + strcat(fn, infix.c_str()); + strcat(fn, "-XXXXXX"); + if (mkstemp(fn) == -1) { // mkstemp returns -1 on failure, not 0 + cerr << "Cannot make temp file in " << path << ", exiting." 
<< endl; + exit(1); + } + return string(fn); +} + +inline void split_in(string& s, vector<string>& parts) +{ + unsigned f = 0; + for(unsigned i = 0; i < 3; i++) { + unsigned e = f; + f = s.find("\t", f+1); + if (e != 0) parts.push_back(s.substr(e+1, f-e-1)); + else parts.push_back(s.substr(0, f)); + } + s.erase(0, f+1); +} + +struct HSReporter +{ + string task_id_; + + HSReporter(string task_id) : task_id_(task_id) {} + + inline void update_counter(string name, unsigned amount) { + cerr << "reporter:counter:" << task_id_ << "," << name << "," << amount << endl; + } + inline void update_gcounter(string name, unsigned amount) { + cerr << "reporter:counter:Global," << name << "," << amount << endl; + } +}; + +inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); } +inline ostream& _p(ostream& out) { return out << setiosflags(ios::showpos); } +inline ostream& _p2(ostream& out) { return out << setprecision(2); } +inline ostream& _p5(ostream& out) { return out << setprecision(5); } + +inline void printWordIDVec(vector<WordID>& v) +{ + for (unsigned i = 0; i < v.size(); i++) { + cerr << TD::Convert(v[i]); + if (i < v.size()-1) cerr << " "; + } +} + +template<typename T> +inline T sign(T z) +{ + if (z == 0) return 0; + return z < 0 ? -1 : +1; +} + +#endif + diff --git a/training/dtrain/examples/parallelized/README b/training/dtrain/examples/parallelized/README new file mode 100644 index 00000000..89715105 --- /dev/null +++ b/training/dtrain/examples/parallelized/README @@ -0,0 +1,5 @@ +run for example + ../../parallelize.rb ./dtrain.ini 4 false 2 2 ./in ./refs + +final weights will be in the file work/weights.3 + diff --git a/training/dtrain/examples/parallelized/cdec.ini b/training/dtrain/examples/parallelized/cdec.ini new file mode 100644 index 00000000..e43ba1c4 --- /dev/null +++ b/training/dtrain/examples/parallelized/cdec.ini @@ -0,0 +1,22 @@ +formalism=scfg +add_pass_through_rules=true +intersection_strategy=cube_pruning +cubepruning_pop_limit=200 +scfg_max_span_limit=15 +feature_function=WordPenalty +feature_function=KLanguageModel ../example/nc-wmt11.en.srilm.gz +#feature_function=ArityPenalty +#feature_function=CMR2008ReorderingFeatures +#feature_function=Dwarf +#feature_function=InputIndicator +#feature_function=LexNullJump +#feature_function=NewJump +#feature_function=NgramFeatures +#feature_function=NonLatinCount +#feature_function=OutputIndicator +#feature_function=RuleIdentityFeatures +#feature_function=RuleNgramFeatures +#feature_function=RuleShape +#feature_function=SourceSpanSizeFeatures +#feature_function=SourceWordPenalty +#feature_function=SpanFeatures diff --git a/training/dtrain/examples/parallelized/dtrain.ini b/training/dtrain/examples/parallelized/dtrain.ini new file mode 100644 index 00000000..f19ef891 --- /dev/null +++ b/training/dtrain/examples/parallelized/dtrain.ini @@ -0,0 +1,16 @@ +k=100 +N=4 +learning_rate=0.0001 +gamma=0 +loss_margin=1.0 +epochs=1 +scorer=stupid_bleu +sample_from=kbest +filter=uniq +pair_sampling=XYX +hi_lo=0.1 +select_weights=last +print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough +# newer version of the grammar extractor use different feature names: +#print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough +decoder_config=cdec.ini diff --git 
a/training/dtrain/examples/parallelized/grammar/grammar.out.0.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.0.gz Binary files differnew file mode 100644 index 00000000..1e28a24b --- /dev/null +++ b/training/dtrain/examples/parallelized/grammar/grammar.out.0.gz diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.1.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.1.gz Binary files differnew file mode 100644 index 00000000..372f5675 --- /dev/null +++ b/training/dtrain/examples/parallelized/grammar/grammar.out.1.gz diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.2.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.2.gz Binary files differnew file mode 100644 index 00000000..145d0dc0 --- /dev/null +++ b/training/dtrain/examples/parallelized/grammar/grammar.out.2.gz diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.3.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.3.gz Binary files differnew file mode 100644 index 00000000..105593ff --- /dev/null +++ b/training/dtrain/examples/parallelized/grammar/grammar.out.3.gz diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.4.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.4.gz Binary files differnew file mode 100644 index 00000000..30781f48 --- /dev/null +++ b/training/dtrain/examples/parallelized/grammar/grammar.out.4.gz diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.5.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.5.gz Binary files differnew file mode 100644 index 00000000..834ee759 --- /dev/null +++ b/training/dtrain/examples/parallelized/grammar/grammar.out.5.gz diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.6.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.6.gz Binary files differnew file mode 100644 index 00000000..2e76f348 --- /dev/null +++ b/training/dtrain/examples/parallelized/grammar/grammar.out.6.gz diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.7.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.7.gz Binary files differnew file mode 100644 index 00000000..3741a887 --- /dev/null +++ b/training/dtrain/examples/parallelized/grammar/grammar.out.7.gz diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.8.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.8.gz Binary files differnew file mode 100644 index 00000000..ebf6bd0c --- /dev/null +++ b/training/dtrain/examples/parallelized/grammar/grammar.out.8.gz diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.9.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.9.gz Binary files differnew file mode 100644 index 00000000..c1791059 --- /dev/null +++ b/training/dtrain/examples/parallelized/grammar/grammar.out.9.gz diff --git a/training/dtrain/examples/parallelized/in b/training/dtrain/examples/parallelized/in new file mode 100644 index 00000000..51d01fe7 --- /dev/null +++ b/training/dtrain/examples/parallelized/in @@ -0,0 +1,10 @@ +<seg grammar="grammar/grammar.out.0.gz" id="0">europas nach rassen geteiltes haus</seg> +<seg grammar="grammar/grammar.out.1.gz" id="1">ein gemeinsames merkmal aller extremen rechten in europa ist ihr rassismus und die tatsache , daß sie das einwanderungsproblem als politischen hebel benutzen .</seg> +<seg grammar="grammar/grammar.out.2.gz" id="2">der lega nord in italien , der vlaams block in den niederlanden , die anhänger von le pens nationaler 
front in frankreich , sind beispiele für parteien oder bewegungen , die sich um das gemeinsame thema : ablehnung der zuwanderung gebildet haben und um forderung nach einer vereinfachten politik , um sie zu regeln .</seg> +<seg grammar="grammar/grammar.out.3.gz" id="3">während individuen wie jörg haidar und jean @-@ marie le pen kommen und ( leider nicht zu bald ) wieder gehen mögen , wird die rassenfrage aus der europäischer politik nicht so bald verschwinden .</seg> +<seg grammar="grammar/grammar.out.4.gz" id="4">eine alternde einheimische bevölkerung und immer offenere grenzen vermehren die rassistische zersplitterung in den europäischen ländern .</seg> +<seg grammar="grammar/grammar.out.5.gz" id="5">die großen parteien der rechten und der linken mitte haben sich dem problem gestellt , in dem sie den kopf in den sand gesteckt und allen aussichten zuwider gehofft haben , es möge bald verschwinden .</seg> +<seg grammar="grammar/grammar.out.6.gz" id="6">das aber wird es nicht , wie die geschichte des rassismus in amerika deutlich zeigt .</seg> +<seg grammar="grammar/grammar.out.7.gz" id="7">die beziehungen zwischen den rassen standen in den usa über jahrzehnte - und tun das noch heute - im zentrum der politischen debatte . das ging so weit , daß rassentrennung genauso wichtig wie das einkommen wurde , - wenn nicht sogar noch wichtiger - um politische zuneigungen und einstellungen zu bestimmen .</seg> +<seg grammar="grammar/grammar.out.8.gz" id="8">der erste schritt , um mit der rassenfrage umzugehen ist , ursache und folgen rassistischer feindseligkeiten zu verstehen , auch dann , wenn das bedeutet , unangenehme tatsachen aufzudecken .</seg> +<seg grammar="grammar/grammar.out.9.gz" id="9">genau das haben in den usa eine große anzahl an forschungsvorhaben in wirtschaft , soziologie , psychologie und politikwissenschaft geleistet . diese forschungen zeigten , daß menschen unterschiedlicher rasse einander deutlich weniger vertrauen .</seg> diff --git a/training/dtrain/examples/parallelized/refs b/training/dtrain/examples/parallelized/refs new file mode 100644 index 00000000..632e27b0 --- /dev/null +++ b/training/dtrain/examples/parallelized/refs @@ -0,0 +1,10 @@ +europe 's divided racial house +a common feature of europe 's extreme right is its racism and use of the immigration issue as a political wedge . +the lega nord in italy , the vlaams blok in the netherlands , the supporters of le pen 's national front in france , are all examples of parties or movements formed on the common theme of aversion to immigrants and promotion of simplistic policies to control them . +while individuals like jorg haidar and jean @-@ marie le pen may come and ( never to soon ) go , the race question will not disappear from european politics anytime soon . +an aging population at home and ever more open borders imply increasing racial fragmentation in european countries . +mainstream parties of the center left and center right have confronted this prospect by hiding their heads in the ground , hoping against hope that the problem will disappear . +it will not , as america 's racial history clearly shows . +race relations in the us have been for decades - and remain - at the center of political debate , to the point that racial cleavages are as important as income , if not more , as determinants of political preferences and attitudes . +the first step to address racial politics is to understand the origin and consequences of racial animosity , even if it means uncovering unpleasant truths . 
+this is precisely what a large amount of research in economics , sociology , psychology and political science has done for the us . diff --git a/training/dtrain/examples/parallelized/work/out.0.0 b/training/dtrain/examples/parallelized/work/out.0.0 new file mode 100644 index 00000000..7a00ed0f --- /dev/null +++ b/training/dtrain/examples/parallelized/work/out.0.0 @@ -0,0 +1,61 @@ + cdec cfg 'cdec.ini' +Loading the LM will be faster if you build a binary file. +Reading ../example/nc-wmt11.en.srilm.gz +----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 +**************************************************************************************************** +Seeding random number sequence to 3121929377 + +dtrain +Parameters: + k 100 + N 4 + T 1 + scorer 'stupid_bleu' + sample from 'kbest' + filter 'uniq' + learning rate 0.0001 + gamma 0 + loss margin 1 + pairs 'XYX' + hi lo 0.1 + pair threshold 0 + select weights 'last' + l1 reg 0 'none' + max pairs 4294967295 + cdec cfg 'cdec.ini' + input 'work/shard.0.0.in' + refs 'work/shard.0.0.refs' + output 'work/weights.0.0' +(a dot represents 10 inputs) +Iteration #1 of 1. + 5 +WEIGHTS + Glue = +0.2663 + WordPenalty = -0.0079042 + LanguageModel = +0.44782 + LanguageModel_OOV = -0.0401 + PhraseModel_0 = -0.193 + PhraseModel_1 = +0.71321 + PhraseModel_2 = +0.85196 + PhraseModel_3 = -0.43986 + PhraseModel_4 = -0.44803 + PhraseModel_5 = -0.0538 + PhraseModel_6 = -0.1788 + PassThrough = -0.1477 + --- + 1best avg score: 0.17521 (+0.17521) + 1best avg model score: 21.556 (+21.556) + avg # pairs: 1671.2 + avg # rank err: 1118.6 + avg # margin viol: 552.6 + non0 feature count: 12 + avg list sz: 100 + avg f count: 11.32 +(time 0.37 min, 4.4 s/S) + +Writing weights file to 'work/weights.0.0' ... +done + +--- +Best iteration: 1 [SCORE 'stupid_bleu'=0.17521]. +This took 0.36667 min. diff --git a/training/dtrain/examples/parallelized/work/out.0.1 b/training/dtrain/examples/parallelized/work/out.0.1 new file mode 100644 index 00000000..e2bd6649 --- /dev/null +++ b/training/dtrain/examples/parallelized/work/out.0.1 @@ -0,0 +1,62 @@ + cdec cfg 'cdec.ini' +Loading the LM will be faster if you build a binary file. +Reading ../example/nc-wmt11.en.srilm.gz +----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 +**************************************************************************************************** +Seeding random number sequence to 2767202922 + +dtrain +Parameters: + k 100 + N 4 + T 1 + scorer 'stupid_bleu' + sample from 'kbest' + filter 'uniq' + learning rate 0.0001 + gamma 0 + loss margin 1 + pairs 'XYX' + hi lo 0.1 + pair threshold 0 + select weights 'last' + l1 reg 0 'none' + max pairs 4294967295 + cdec cfg 'cdec.ini' + input 'work/shard.0.0.in' + refs 'work/shard.0.0.refs' + output 'work/weights.0.1' + weights in 'work/weights.0' +(a dot represents 10 inputs) +Iteration #1 of 1. 
+ 5 +WEIGHTS + Glue = -0.2699 + WordPenalty = +0.080605 + LanguageModel = -0.026572 + LanguageModel_OOV = -0.30025 + PhraseModel_0 = -0.32076 + PhraseModel_1 = +0.67451 + PhraseModel_2 = +0.92 + PhraseModel_3 = -0.36402 + PhraseModel_4 = -0.592 + PhraseModel_5 = -0.0269 + PhraseModel_6 = -0.28755 + PassThrough = -0.33285 + --- + 1best avg score: 0.26638 (+0.26638) + 1best avg model score: 53.197 (+53.197) + avg # pairs: 2028.6 + avg # rank err: 998.2 + avg # margin viol: 918.8 + non0 feature count: 12 + avg list sz: 100 + avg f count: 10.496 +(time 0.32 min, 3.8 s/S) + +Writing weights file to 'work/weights.0.1' ... +done + +--- +Best iteration: 1 [SCORE 'stupid_bleu'=0.26638]. +This took 0.31667 min. diff --git a/training/dtrain/examples/parallelized/work/out.1.0 b/training/dtrain/examples/parallelized/work/out.1.0 new file mode 100644 index 00000000..6e790e38 --- /dev/null +++ b/training/dtrain/examples/parallelized/work/out.1.0 @@ -0,0 +1,61 @@ + cdec cfg 'cdec.ini' +Loading the LM will be faster if you build a binary file. +Reading ../example/nc-wmt11.en.srilm.gz +----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 +**************************************************************************************************** +Seeding random number sequence to 1432415010 + +dtrain +Parameters: + k 100 + N 4 + T 1 + scorer 'stupid_bleu' + sample from 'kbest' + filter 'uniq' + learning rate 0.0001 + gamma 0 + loss margin 1 + pairs 'XYX' + hi lo 0.1 + pair threshold 0 + select weights 'last' + l1 reg 0 'none' + max pairs 4294967295 + cdec cfg 'cdec.ini' + input 'work/shard.1.0.in' + refs 'work/shard.1.0.refs' + output 'work/weights.1.0' +(a dot represents 10 inputs) +Iteration #1 of 1. + 5 +WEIGHTS + Glue = -0.3815 + WordPenalty = +0.20064 + LanguageModel = +0.95304 + LanguageModel_OOV = -0.264 + PhraseModel_0 = -0.22362 + PhraseModel_1 = +0.12254 + PhraseModel_2 = +0.26328 + PhraseModel_3 = +0.38018 + PhraseModel_4 = -0.48654 + PhraseModel_5 = +0 + PhraseModel_6 = -0.3645 + PassThrough = -0.2216 + --- + 1best avg score: 0.10863 (+0.10863) + 1best avg model score: -4.9841 (-4.9841) + avg # pairs: 1345.4 + avg # rank err: 822.4 + avg # margin viol: 501 + non0 feature count: 11 + avg list sz: 100 + avg f count: 11.814 +(time 0.45 min, 5.4 s/S) + +Writing weights file to 'work/weights.1.0' ... +done + +--- +Best iteration: 1 [SCORE 'stupid_bleu'=0.10863]. +This took 0.45 min. diff --git a/training/dtrain/examples/parallelized/work/out.1.1 b/training/dtrain/examples/parallelized/work/out.1.1 new file mode 100644 index 00000000..0b984761 --- /dev/null +++ b/training/dtrain/examples/parallelized/work/out.1.1 @@ -0,0 +1,62 @@ + cdec cfg 'cdec.ini' +Loading the LM will be faster if you build a binary file. +Reading ../example/nc-wmt11.en.srilm.gz +----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 +**************************************************************************************************** +Seeding random number sequence to 1771918374 + +dtrain +Parameters: + k 100 + N 4 + T 1 + scorer 'stupid_bleu' + sample from 'kbest' + filter 'uniq' + learning rate 0.0001 + gamma 0 + loss margin 1 + pairs 'XYX' + hi lo 0.1 + pair threshold 0 + select weights 'last' + l1 reg 0 'none' + max pairs 4294967295 + cdec cfg 'cdec.ini' + input 'work/shard.1.0.in' + refs 'work/shard.1.0.refs' + output 'work/weights.1.1' + weights in 'work/weights.0' +(a dot represents 10 inputs) +Iteration #1 of 1. 
+ 5 +WEIGHTS + Glue = -0.3178 + WordPenalty = +0.11092 + LanguageModel = +0.17269 + LanguageModel_OOV = -0.13485 + PhraseModel_0 = -0.45371 + PhraseModel_1 = +0.38789 + PhraseModel_2 = +0.75311 + PhraseModel_3 = -0.38163 + PhraseModel_4 = -0.58817 + PhraseModel_5 = -0.0269 + PhraseModel_6 = -0.27315 + PassThrough = -0.16745 + --- + 1best avg score: 0.13169 (+0.13169) + 1best avg model score: 24.226 (+24.226) + avg # pairs: 1951.2 + avg # rank err: 985.4 + avg # margin viol: 951 + non0 feature count: 12 + avg list sz: 100 + avg f count: 11.224 +(time 0.42 min, 5 s/S) + +Writing weights file to 'work/weights.1.1' ... +done + +--- +Best iteration: 1 [SCORE 'stupid_bleu'=0.13169]. +This took 0.41667 min. diff --git a/training/dtrain/examples/parallelized/work/shard.0.0.in b/training/dtrain/examples/parallelized/work/shard.0.0.in new file mode 100644 index 00000000..92f9c78e --- /dev/null +++ b/training/dtrain/examples/parallelized/work/shard.0.0.in @@ -0,0 +1,5 @@ +<seg grammar="grammar/grammar.out.0.gz" id="0">europas nach rassen geteiltes haus</seg> +<seg grammar="grammar/grammar.out.1.gz" id="1">ein gemeinsames merkmal aller extremen rechten in europa ist ihr rassismus und die tatsache , daß sie das einwanderungsproblem als politischen hebel benutzen .</seg> +<seg grammar="grammar/grammar.out.2.gz" id="2">der lega nord in italien , der vlaams block in den niederlanden , die anhänger von le pens nationaler front in frankreich , sind beispiele für parteien oder bewegungen , die sich um das gemeinsame thema : ablehnung der zuwanderung gebildet haben und um forderung nach einer vereinfachten politik , um sie zu regeln .</seg> +<seg grammar="grammar/grammar.out.3.gz" id="3">während individuen wie jörg haidar und jean @-@ marie le pen kommen und ( leider nicht zu bald ) wieder gehen mögen , wird die rassenfrage aus der europäischer politik nicht so bald verschwinden .</seg> +<seg grammar="grammar/grammar.out.4.gz" id="4">eine alternde einheimische bevölkerung und immer offenere grenzen vermehren die rassistische zersplitterung in den europäischen ländern .</seg> diff --git a/training/dtrain/examples/parallelized/work/shard.0.0.refs b/training/dtrain/examples/parallelized/work/shard.0.0.refs new file mode 100644 index 00000000..bef68fee --- /dev/null +++ b/training/dtrain/examples/parallelized/work/shard.0.0.refs @@ -0,0 +1,5 @@ +europe 's divided racial house +a common feature of europe 's extreme right is its racism and use of the immigration issue as a political wedge . +the lega nord in italy , the vlaams blok in the netherlands , the supporters of le pen 's national front in france , are all examples of parties or movements formed on the common theme of aversion to immigrants and promotion of simplistic policies to control them . +while individuals like jorg haidar and jean @-@ marie le pen may come and ( never to soon ) go , the race question will not disappear from european politics anytime soon . +an aging population at home and ever more open borders imply increasing racial fragmentation in european countries . 
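The work/shard.0.0.{in,refs} files above and the work/shard.1.0.{in,refs} files below are simply a 2-way split of the 10-sentence in/refs corpus from this example. The split is produced by make_shards in parallelize.rb, added further down in this diff; the following Ruby sketch (the function name split_in_order is ours, and the optional shuffling and leftover handling of make_shards are omitted) shows the idea:

    # In-order sharding sketch: write num_shards (input, refs) file pairs.
    # make_shards in parallelize.rb additionally shuffles line indices and
    # appends leftover lines to the last shard.
    def split_in_order(input, refs, num_shards, epoch)
      in_lines   = File.readlines input
      refs_lines = File.readlines refs
      shard_sz   = in_lines.size / num_shards   # 10 / 2 = 5 here
      0.upto(num_shards-1) { |shard|
        File.write "work/shard.#{shard}.#{epoch}.in",   in_lines[shard*shard_sz, shard_sz].join
        File.write "work/shard.#{shard}.#{epoch}.refs", refs_lines[shard*shard_sz, shard_sz].join
      }
    end

    split_in_order 'in', 'refs', 2, 0   # yields the four shard files shown here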
diff --git a/training/dtrain/examples/parallelized/work/shard.1.0.in b/training/dtrain/examples/parallelized/work/shard.1.0.in new file mode 100644 index 00000000..b7695ce7 --- /dev/null +++ b/training/dtrain/examples/parallelized/work/shard.1.0.in @@ -0,0 +1,5 @@ +<seg grammar="grammar/grammar.out.5.gz" id="5">die großen parteien der rechten und der linken mitte haben sich dem problem gestellt , in dem sie den kopf in den sand gesteckt und allen aussichten zuwider gehofft haben , es möge bald verschwinden .</seg> +<seg grammar="grammar/grammar.out.6.gz" id="6">das aber wird es nicht , wie die geschichte des rassismus in amerika deutlich zeigt .</seg> +<seg grammar="grammar/grammar.out.7.gz" id="7">die beziehungen zwischen den rassen standen in den usa über jahrzehnte - und tun das noch heute - im zentrum der politischen debatte . das ging so weit , daß rassentrennung genauso wichtig wie das einkommen wurde , - wenn nicht sogar noch wichtiger - um politische zuneigungen und einstellungen zu bestimmen .</seg> +<seg grammar="grammar/grammar.out.8.gz" id="8">der erste schritt , um mit der rassenfrage umzugehen ist , ursache und folgen rassistischer feindseligkeiten zu verstehen , auch dann , wenn das bedeutet , unangenehme tatsachen aufzudecken .</seg> +<seg grammar="grammar/grammar.out.9.gz" id="9">genau das haben in den usa eine große anzahl an forschungsvorhaben in wirtschaft , soziologie , psychologie und politikwissenschaft geleistet . diese forschungen zeigten , daß menschen unterschiedlicher rasse einander deutlich weniger vertrauen .</seg> diff --git a/training/dtrain/examples/parallelized/work/shard.1.0.refs b/training/dtrain/examples/parallelized/work/shard.1.0.refs new file mode 100644 index 00000000..6076f6d5 --- /dev/null +++ b/training/dtrain/examples/parallelized/work/shard.1.0.refs @@ -0,0 +1,5 @@ +mainstream parties of the center left and center right have confronted this prospect by hiding their heads in the ground , hoping against hope that the problem will disappear . +it will not , as america 's racial history clearly shows . +race relations in the us have been for decades - and remain - at the center of political debate , to the point that racial cleavages are as important as income , if not more , as determinants of political preferences and attitudes . +the first step to address racial politics is to understand the origin and consequences of racial animosity , even if it means uncovering unpleasant truths . +this is precisely what a large amount of research in economics , sociology , psychology and political science has done for the us . 
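The work/weights.0 file below is the epoch-0 reduction of the two per-shard weight files: parallelize.rb (further down in this diff) concatenates work/weights.*.0 and pipes the result through lplp.rb, whose select_k/cut modes print the per-feature mean over shards. A quick spot check in Ruby, with the two LanguageModel values copied from work/weights.0.0 and work/weights.1.0 below:

    # Each value in work/weights.0 is the mean over the 2 shards; a feature
    # missing from a shard (e.g. PhraseModel_5 in weights.1.0) counts as 0.
    lm_shard0 = 0.44781580828279532    # LanguageModel in work/weights.0.0
    lm_shard1 = 0.9530439901597807     # LanguageModel in work/weights.1.0
    puts (lm_shard0 + lm_shard1) / 2   # ~0.70043, cf. work/weights.0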
diff --git a/training/dtrain/examples/parallelized/work/weights.0 b/training/dtrain/examples/parallelized/work/weights.0 new file mode 100644 index 00000000..ddd595a8 --- /dev/null +++ b/training/dtrain/examples/parallelized/work/weights.0 @@ -0,0 +1,12 @@ +LanguageModel 0.7004298992212881 +PhraseModel_2 0.5576194336478857 +PhraseModel_1 0.41787318415343155 +PhraseModel_4 -0.46728502545635164 +PhraseModel_3 -0.029839521598455515 +Glue -0.05760000000000068 +PhraseModel_6 -0.2716499999999978 +PhraseModel_0 -0.20831031065605327 +LanguageModel_OOV -0.15205000000000077 +PassThrough -0.1846500000000006 +WordPenalty 0.09636994553433414 +PhraseModel_5 -0.026900000000000257 diff --git a/training/dtrain/examples/parallelized/work/weights.0.0 b/training/dtrain/examples/parallelized/work/weights.0.0 new file mode 100644 index 00000000..c9370b18 --- /dev/null +++ b/training/dtrain/examples/parallelized/work/weights.0.0 @@ -0,0 +1,12 @@ +WordPenalty -0.0079041595706392243 +LanguageModel 0.44781580828279532 +LanguageModel_OOV -0.04010000000000042 +Glue 0.26629999999999948 +PhraseModel_0 -0.19299677809125185 +PhraseModel_1 0.71321026861732773 +PhraseModel_2 0.85195540993310537 +PhraseModel_3 -0.43986310822842656 +PhraseModel_4 -0.44802855630415955 +PhraseModel_5 -0.053800000000000514 +PhraseModel_6 -0.17879999999999835 +PassThrough -0.14770000000000036 diff --git a/training/dtrain/examples/parallelized/work/weights.0.1 b/training/dtrain/examples/parallelized/work/weights.0.1 new file mode 100644 index 00000000..8fad3de8 --- /dev/null +++ b/training/dtrain/examples/parallelized/work/weights.0.1 @@ -0,0 +1,12 @@ +WordPenalty 0.080605055841244472 +LanguageModel -0.026571720531022844 +LanguageModel_OOV -0.30024999999999141 +Glue -0.26989999999999842 +PhraseModel_2 0.92000295209089566 +PhraseModel_1 0.67450748692470841 +PhraseModel_4 -0.5920000014976784 +PhraseModel_3 -0.36402437203127397 +PhraseModel_6 -0.28754999999999603 +PhraseModel_0 -0.32076244202907672 +PassThrough -0.33284999999999004 +PhraseModel_5 -0.026900000000000257 diff --git a/training/dtrain/examples/parallelized/work/weights.1 b/training/dtrain/examples/parallelized/work/weights.1 new file mode 100644 index 00000000..03058a16 --- /dev/null +++ b/training/dtrain/examples/parallelized/work/weights.1 @@ -0,0 +1,12 @@ +PhraseModel_2 0.8365578543552836 +PhraseModel_4 -0.5900840266009169 +PhraseModel_1 0.5312000609786991 +PhraseModel_0 -0.3872342271319619 +PhraseModel_3 -0.3728279676912084 +Glue -0.2938500000000036 +PhraseModel_6 -0.2803499999999967 +PassThrough -0.25014999999999626 +LanguageModel_OOV -0.21754999999999702 +LanguageModel 0.07306061161169894 +WordPenalty 0.09576193325966899 +PhraseModel_5 -0.026900000000000257 diff --git a/training/dtrain/examples/parallelized/work/weights.1.0 b/training/dtrain/examples/parallelized/work/weights.1.0 new file mode 100644 index 00000000..6a6a65c1 --- /dev/null +++ b/training/dtrain/examples/parallelized/work/weights.1.0 @@ -0,0 +1,11 @@ +WordPenalty 0.20064405063930751 +LanguageModel 0.9530439901597807 +LanguageModel_OOV -0.26400000000000112 +Glue -0.38150000000000084 +PhraseModel_0 -0.22362384322085468 +PhraseModel_1 0.12253609968953538 +PhraseModel_2 0.26328345736266612 +PhraseModel_3 0.38018406503151553 +PhraseModel_4 -0.48654149460854373 +PhraseModel_6 -0.36449999999999722 +PassThrough -0.22160000000000085 diff --git a/training/dtrain/examples/parallelized/work/weights.1.1 b/training/dtrain/examples/parallelized/work/weights.1.1 new file mode 100644 index 00000000..f56ea4a2 --- /dev/null +++ 
b/training/dtrain/examples/parallelized/work/weights.1.1 @@ -0,0 +1,12 @@ +WordPenalty 0.1109188106780935 +LanguageModel 0.17269294375442074 +LanguageModel_OOV -0.13485000000000266 +Glue -0.3178000000000088 +PhraseModel_2 0.75311275661967159 +PhraseModel_1 0.38789263503268989 +PhraseModel_4 -0.58816805170415531 +PhraseModel_3 -0.38163156335114284 +PhraseModel_6 -0.27314999999999739 +PhraseModel_0 -0.45370601223484697 +PassThrough -0.16745000000000249 +PhraseModel_5 -0.026900000000000257
diff --git a/training/dtrain/examples/standard/README b/training/dtrain/examples/standard/README new file mode 100644 index 00000000..ce37d31a --- /dev/null +++ b/training/dtrain/examples/standard/README @@ -0,0 +1,2 @@ +Call `dtrain` from this folder with ../../dtrain -c dtrain.ini . +
diff --git a/training/dtrain/examples/standard/cdec.ini b/training/dtrain/examples/standard/cdec.ini new file mode 100644 index 00000000..e1edc68d --- /dev/null +++ b/training/dtrain/examples/standard/cdec.ini @@ -0,0 +1,26 @@ +formalism=scfg +add_pass_through_rules=true +scfg_max_span_limit=15 +intersection_strategy=cube_pruning +cubepruning_pop_limit=200 +grammar=nc-wmt11.grammar.gz +feature_function=WordPenalty +feature_function=KLanguageModel ./nc-wmt11.en.srilm.gz +# all currently working feature functions for translation: +# (with those features active that were used in the ACL paper) +#feature_function=ArityPenalty +#feature_function=CMR2008ReorderingFeatures +#feature_function=Dwarf +#feature_function=InputIndicator +#feature_function=LexNullJump +#feature_function=NewJump +#feature_function=NgramFeatures +#feature_function=NonLatinCount +#feature_function=OutputIndicator +feature_function=RuleIdentityFeatures +feature_function=RuleSourceBigramFeatures +feature_function=RuleTargetBigramFeatures +feature_function=RuleShape +#feature_function=SourceSpanSizeFeatures +#feature_function=SourceWordPenalty +#feature_function=SpanFeatures
diff --git a/training/dtrain/examples/standard/dtrain.ini b/training/dtrain/examples/standard/dtrain.ini new file mode 100644 index 00000000..e1072d30 --- /dev/null +++ b/training/dtrain/examples/standard/dtrain.ini @@ -0,0 +1,24 @@ +input=./nc-wmt11.de.gz +refs=./nc-wmt11.en.gz +output=- # a weights file (add .gz for gzip compression) or STDOUT '-' +select_weights=VOID # output average (over epochs) weight vector +decoder_config=./cdec.ini # config for cdec +# weights for these features will be printed on each iteration +print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough +# newer versions of the grammar extractor use different feature names: +#print_weights= EgivenFCoherent SampleCountF CountEF MaxLexFgivenE MaxLexEgivenF IsSingletonF IsSingletonFE Glue WordPenalty PassThrough LanguageModel LanguageModel_OOV +stop_after=10 # stop epoch after 10 inputs + +# interesting stuff +epochs=2 # run over input 2 times +k=100 # use 100best lists +N=4 # optimize (approx) BLEU4 +scorer=stupid_bleu # use 'stupid' BLEU+1 +learning_rate=1.0 # learning rate, don't care if gamma=0 (perceptron) +gamma=0 # use SVM reg +sample_from=kbest # use kbest lists (as opposed to forest) +filter=uniq # only unique entries in kbest (surface form) +pair_sampling=XYX # +hi_lo=0.1 # compare top 10% vs middle 80% and bottom 10%, and middle 80% vs bottom 10% +pair_threshold=0 # minimum distance in BLEU (here: > 0) +loss_margin=0 # update if correctly ranked, but within this margin
diff --git a/training/dtrain/examples/standard/expected-output
b/training/dtrain/examples/standard/expected-output new file mode 100644 index 00000000..7cd09dbf --- /dev/null +++ b/training/dtrain/examples/standard/expected-output @@ -0,0 +1,91 @@ + cdec cfg './cdec.ini' +Loading the LM will be faster if you build a binary file. +Reading ./nc-wmt11.en.srilm.gz +----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 +**************************************************************************************************** + Example feature: Shape_S00000_T00000 +Seeding random number sequence to 2679584485 + +dtrain +Parameters: + k 100 + N 4 + T 2 + scorer 'stupid_bleu' + sample from 'kbest' + filter 'uniq' + learning rate 1 + gamma 0 + loss margin 0 + faster perceptron 1 + pairs 'XYX' + hi lo 0.1 + pair threshold 0 + select weights 'VOID' + l1 reg 0 'none' + max pairs 4294967295 + cdec cfg './cdec.ini' + input './nc-wmt11.de.gz' + refs './nc-wmt11.en.gz' + output '-' + stop_after 10 +(a dot represents 10 inputs) +Iteration #1 of 2. + . 10 +Stopping after 10 input sentences. +WEIGHTS + Glue = -576 + WordPenalty = +417.79 + LanguageModel = +5117.5 + LanguageModel_OOV = -1307 + PhraseModel_0 = -1612 + PhraseModel_1 = -2159.6 + PhraseModel_2 = -677.36 + PhraseModel_3 = +2663.8 + PhraseModel_4 = -1025.9 + PhraseModel_5 = -8 + PhraseModel_6 = +70 + PassThrough = -1455 + --- + 1best avg score: 0.27697 (+0.27697) + 1best avg model score: -47918 (-47918) + avg # pairs: 581.9 (meaningless) + avg # rank err: 581.9 + avg # margin viol: 0 + non0 feature count: 703 + avg list sz: 90.9 + avg f count: 100.09 +(time 0.25 min, 1.5 s/S) + +Iteration #2 of 2. + . 10 +WEIGHTS + Glue = -622 + WordPenalty = +898.56 + LanguageModel = +8066.2 + LanguageModel_OOV = -2590 + PhraseModel_0 = -4335.8 + PhraseModel_1 = -5864.4 + PhraseModel_2 = -1729.8 + PhraseModel_3 = +2831.9 + PhraseModel_4 = -5384.8 + PhraseModel_5 = +1449 + PhraseModel_6 = +480 + PassThrough = -2578 + --- + 1best avg score: 0.37119 (+0.094226) + 1best avg model score: -1.3174e+05 (-83822) + avg # pairs: 584.1 (meaningless) + avg # rank err: 584.1 + avg # margin viol: 0 + non0 feature count: 1115 + avg list sz: 91.3 + avg f count: 90.755 +(time 0.3 min, 1.8 s/S) + +Writing weights file to '-' ... +done + +--- +Best iteration: 2 [SCORE 'stupid_bleu'=0.37119]. +This took 0.55 min. 
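The '1best avg score' lines in the expected output above are per-sentence 'stupid' BLEU+1 scores (scorer=stupid_bleu in dtrain.ini above), averaged over the 10 inputs. Reading StupidBleuScorer::Score in score.cc further down in this diff, the per-sentence score is, with c_n the clipped and t_n the total hypothesis n-gram counts, h the hypothesis, r the reference, and M = min(N, |r|):

    \mathrm{SB}(h,r) = \mathrm{BP}(|h|,|r|) \cdot \exp\Big(\sum_{n=1}^{M} v_n \log\frac{c_n+\delta_n}{t_n+\delta_n}\Big),
    \qquad \delta_1 = 0,\; \delta_{n>1} = 1,
    \qquad \mathrm{BP}(|h|,|r|) = \min\big(1,\, e^{\,1-|r|/|h|}\big)

with uniform weights v_n = 1/M under the defaults, and a hard 0 whenever the hypothesis has no unigram match.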
diff --git a/training/dtrain/examples/standard/nc-wmt11.de.gz b/training/dtrain/examples/standard/nc-wmt11.de.gz Binary files differ new file mode 100644 index 00000000..0741fd92 --- /dev/null +++ b/training/dtrain/examples/standard/nc-wmt11.de.gz
diff --git a/training/dtrain/examples/standard/nc-wmt11.en.gz b/training/dtrain/examples/standard/nc-wmt11.en.gz Binary files differ new file mode 100644 index 00000000..1c0bd401 --- /dev/null +++ b/training/dtrain/examples/standard/nc-wmt11.en.gz
diff --git a/training/dtrain/examples/standard/nc-wmt11.en.srilm.gz b/training/dtrain/examples/standard/nc-wmt11.en.srilm.gz Binary files differ new file mode 100644 index 00000000..7ce81057 --- /dev/null +++ b/training/dtrain/examples/standard/nc-wmt11.en.srilm.gz
diff --git a/training/dtrain/examples/standard/nc-wmt11.grammar.gz b/training/dtrain/examples/standard/nc-wmt11.grammar.gz Binary files differ new file mode 100644 index 00000000..ce4024a1 --- /dev/null +++ b/training/dtrain/examples/standard/nc-wmt11.grammar.gz
diff --git a/training/dtrain/examples/toy/cdec.ini b/training/dtrain/examples/toy/cdec.ini new file mode 100644 index 00000000..b14f4819 --- /dev/null +++ b/training/dtrain/examples/toy/cdec.ini @@ -0,0 +1,3 @@ +formalism=scfg +add_pass_through_rules=true +grammar=grammar.gz
diff --git a/training/dtrain/examples/toy/dtrain.ini b/training/dtrain/examples/toy/dtrain.ini new file mode 100644 index 00000000..cd715f26 --- /dev/null +++ b/training/dtrain/examples/toy/dtrain.ini @@ -0,0 +1,13 @@ +decoder_config=cdec.ini +input=src +refs=tgt +output=- +print_weights=logp shell_rule house_rule small_rule little_rule PassThrough +k=4 +N=4 +epochs=2 +scorer=bleu +sample_from=kbest +filter=uniq +pair_sampling=all +learning_rate=1
diff --git a/training/dtrain/examples/toy/expected-output b/training/dtrain/examples/toy/expected-output new file mode 100644 index 00000000..1da2aadd --- /dev/null +++ b/training/dtrain/examples/toy/expected-output @@ -0,0 +1,77 @@ +Warning: hi_lo only works with pair_sampling XYX. + cdec cfg 'cdec.ini' +Seeding random number sequence to 1664825829 + +dtrain +Parameters: + k 4 + N 4 + T 2 + scorer 'bleu' + sample from 'kbest' + filter 'uniq' + learning rate 1 + gamma 0 + loss margin 0 + pairs 'all' + pair threshold 0 + select weights 'last' + l1 reg 0 'none' + max pairs 4294967295 + cdec cfg 'cdec.ini' + input 'src' + refs 'tgt' + output '-' +(a dot represents 10 inputs) +Iteration #1 of 2. + 2 +WEIGHTS + logp = +0 + shell_rule = -1 + house_rule = +2 + small_rule = -2 + little_rule = +3 + PassThrough = -5 + --- + 1best avg score: 0.5 (+0.5) + 1best avg model score: 2.5 (+2.5) + avg # pairs: 4 + avg # rank err: 1.5 + avg # margin viol: 0 + non0 feature count: 6 + avg list sz: 4 + avg f count: 2.875 +(time 0 min, 0 s/S) + +Iteration #2 of 2. + 2 +WEIGHTS + logp = +0 + shell_rule = -1 + house_rule = +2 + small_rule = -2 + little_rule = +3 + PassThrough = -5 + --- + 1best avg score: 1 (+0.5) + 1best avg model score: 5 (+2.5) + avg # pairs: 5 + avg # rank err: 0 + avg # margin viol: 0 + non0 feature count: 6 + avg list sz: 4 + avg f count: 3 +(time 0 min, 0 s/S) + +Writing weights file to '-' ... +house_rule 2 +little_rule 3 +Glue -4 +PassThrough -5 +small_rule -2 +shell_rule -1 +done + +--- +Best iteration: 2 [SCORE 'bleu'=1]. +This took 0 min.
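With learning rate 1, gamma 0 and loss margin 0, the toy run above amounts to a plain pairwise perceptron over k-best pairs (pair_sampling=all, i.e. all_pairs in pairsampling.h further down in this diff). dtrain.cc itself is elsewhere in this commit, so the following Ruby sketch of the per-pair update is an illustration only (perceptron_update and the feature values are ours, not the actual implementation); feature vectors are plain Hashes:

    # Perceptron update on one (better, worse) pair: if the higher-BLEU
    # hypothesis does not beat the other by more than the margin under the
    # current model, move the weights toward its features.
    def perceptron_update(w, better_f, worse_f, eta = 1.0, margin = 0.0)
      keys = better_f.keys | worse_f.keys
      diff = keys.map { |k| [k, (better_f[k] || 0.0) - (worse_f[k] || 0.0)] }.to_h
      return w if diff.sum { |k, d| (w[k] || 0.0) * d } > margin
      diff.each { |k, d| w[k] = (w[k] || 0.0) + eta * d }
      w
    end

    w = {}
    better = { 'house_rule' => 1.0, 'logp' => -2.3 }   # made-up feature values
    worse  = { 'small_rule' => 1.0, 'logp' => -1.7 }
    perceptron_update(w, better, worse)   # w now favors 'house_rule'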
diff --git a/training/dtrain/examples/toy/grammar.gz b/training/dtrain/examples/toy/grammar.gz Binary files differ new file mode 100644 index 00000000..8eb0d29e --- /dev/null +++ b/training/dtrain/examples/toy/grammar.gz
diff --git a/training/dtrain/examples/toy/src b/training/dtrain/examples/toy/src new file mode 100644 index 00000000..87e39ef2 --- /dev/null +++ b/training/dtrain/examples/toy/src @@ -0,0 +1,2 @@ +ich sah ein kleines haus +ich fand ein kleines haus
diff --git a/training/dtrain/examples/toy/tgt b/training/dtrain/examples/toy/tgt new file mode 100644 index 00000000..174926b3 --- /dev/null +++ b/training/dtrain/examples/toy/tgt @@ -0,0 +1,2 @@ +i saw a little house +i found a little house
diff --git a/training/dtrain/kbestget.h b/training/dtrain/kbestget.h new file mode 100644 index 00000000..dd8882e1 --- /dev/null +++ b/training/dtrain/kbestget.h @@ -0,0 +1,152 @@ +#ifndef _DTRAIN_KBESTGET_H_ +#define _DTRAIN_KBESTGET_H_ + +#include "kbest.h" // cdec +#include "sentence_metadata.h" + +#include "verbose.h" +#include "viterbi.h" +#include "ff_register.h" +#include "decoder.h" +#include "weights.h" +#include "logval.h" + +using namespace std; + +namespace dtrain +{ + + +typedef double score_t; + +struct ScoredHyp +{ + vector<WordID> w; + SparseVector<double> f; + score_t model; + score_t score; + unsigned rank; +}; + +struct LocalScorer +{ + unsigned N_; + vector<score_t> w_; + + virtual score_t + Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned src_len)=0; + + void Reset() {} // only for approx bleu + + inline void + Init(unsigned N, vector<score_t> weights) + { + assert(N > 0); + N_ = N; + if (weights.empty()) for (unsigned i = 0; i < N_; i++) w_.push_back(1./N_); + else w_ = weights; + } + + inline score_t + brevity_penalty(const unsigned hyp_len, const unsigned ref_len) + { + if (hyp_len > ref_len) return 1; + return exp(1 - (score_t)ref_len/hyp_len); + } +}; + +struct HypSampler : public DecoderObserver +{ + LocalScorer* scorer_; + vector<WordID>* ref_; + unsigned f_count_, sz_; + virtual vector<ScoredHyp>* GetSamples()=0; + inline void SetScorer(LocalScorer* scorer) { scorer_ = scorer; } + inline void SetRef(vector<WordID>& ref) { ref_ = &ref; } + inline unsigned get_f_count() { return f_count_; } + inline unsigned get_sz() { return sz_; } +}; +//////////////////////////////////////////////////////////////////////////////// + + + + +struct KBestGetter : public HypSampler +{ + const unsigned k_; + const string filter_type_; + vector<ScoredHyp> s_; + unsigned src_len_; + + KBestGetter(const unsigned k, const string filter_type) : + k_(k), filter_type_(filter_type) {} + + virtual void + NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) + { + src_len_ = smeta.GetSourceLength(); + KBestScored(*hg); + } + + vector<ScoredHyp>* GetSamples() { return &s_; } + + void + KBestScored(const Hypergraph& forest) + { + if (filter_type_ == "uniq") { + KBestUnique(forest); + } else if (filter_type_ == "not") { + KBestNoFilter(forest); + } + } + + void + KBestUnique(const Hypergraph& forest) + { + s_.clear(); sz_ = f_count_ = 0; + KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, + KBest::FilterUnique, prob_t, EdgeProb> kbest(forest, k_); + for (unsigned i = 0; i < k_; ++i) { + const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique, + prob_t, EdgeProb>::Derivation* d = + kbest.LazyKthBest(forest.nodes_.size() - 1, i); + if (!d) break; + ScoredHyp h; + h.w = d->yield; + h.f = d->feature_values; +
h.model = log(d->score); + h.rank = i; + h.score = scorer_->Score(h.w, *ref_, i, src_len_); + s_.push_back(h); + sz_++; + f_count_ += h.f.size(); + } + } + + void + KBestNoFilter(const Hypergraph& forest) + { + s_.clear(); sz_ = f_count_ = 0; + KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, k_); + for (unsigned i = 0; i < k_; ++i) { + const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(forest.nodes_.size() - 1, i); + if (!d) break; + ScoredHyp h; + h.w = d->yield; + h.f = d->feature_values; + h.model = log(d->score); + h.rank = i; + h.score = scorer_->Score(h.w, *ref_, i, src_len_); + s_.push_back(h); + sz_++; + f_count_ += h.f.size(); + } + } +}; + + +} // namespace + +#endif + diff --git a/training/dtrain/ksampler.h b/training/dtrain/ksampler.h new file mode 100644 index 00000000..bc2f56cd --- /dev/null +++ b/training/dtrain/ksampler.h @@ -0,0 +1,61 @@ +#ifndef _DTRAIN_KSAMPLER_H_ +#define _DTRAIN_KSAMPLER_H_ + +#include "hg_sampler.h" // cdec +#include "kbestget.h" +#include "score.h" + +namespace dtrain +{ + +bool +cmp_hyp_by_model_d(ScoredHyp a, ScoredHyp b) +{ + return a.model > b.model; +} + +struct KSampler : public HypSampler +{ + const unsigned k_; + vector<ScoredHyp> s_; + MT19937* prng_; + score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>); + unsigned src_len_; + + explicit KSampler(const unsigned k, MT19937* prng) : + k_(k), prng_(prng) {} + + virtual void + NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) + { + src_len_ = smeta.GetSourceLength(); + ScoredSamples(*hg); + } + + vector<ScoredHyp>* GetSamples() { return &s_; } + + void ScoredSamples(const Hypergraph& forest) { + s_.clear(); sz_ = f_count_ = 0; + std::vector<HypergraphSampler::Hypothesis> samples; + HypergraphSampler::sample_hypotheses(forest, k_, prng_, &samples); + for (unsigned i = 0; i < k_; ++i) { + ScoredHyp h; + h.w = samples[i].words; + h.f = samples[i].fmap; + h.model = log(samples[i].model_score); + h.rank = i; + h.score = scorer_->Score(h.w, *ref_, i, src_len_); + s_.push_back(h); + sz_++; + f_count_ += h.f.size(); + } + sort(s_.begin(), s_.end(), cmp_hyp_by_model_d); + for (unsigned i = 0; i < s_.size(); i++) s_[i].rank = i; + } +}; + + +} // namespace + +#endif + diff --git a/training/dtrain/lplp.rb b/training/dtrain/lplp.rb new file mode 100755 index 00000000..86e835e8 --- /dev/null +++ b/training/dtrain/lplp.rb @@ -0,0 +1,123 @@ +# lplp.rb + +# norms +def l0(feature_column, n) + if feature_column.size >= n then return 1 else return 0 end +end + +def l1(feature_column, n=-1) + return feature_column.map { |i| i.abs }.reduce { |sum,i| sum+i } +end + +def l2(feature_column, n=-1) + return Math.sqrt feature_column.map { |i| i.abs2 }.reduce { |sum,i| sum+i } +end + +def linfty(feature_column, n=-1) + return feature_column.map { |i| i.abs }.max +end + +# stats +def median(feature_column, n) + return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0}).sort[feature_column.size/2] +end + +def mean(feature_column, n) + return feature_column.reduce { |sum, i| sum+i } / n +end + +# selection +def select_k(weights, norm_fun, n, k=10000) + weights.sort{|a,b| norm_fun.call(b[1], n) <=> norm_fun.call(a[1], n)}.each { |p| + puts "#{p[0]}\t#{mean(p[1], n)}" + k -= 1 + if k == 0 then break end + } +end + +def cut(weights, norm_fun, n, epsilon=0.0001) + weights.each { |k,v| + if norm_fun.call(v, n).abs >= epsilon + puts "#{k}\t#{mean(v, n)}" + end + } +end + +# test 
+def _test() + puts + w = {} + w["a"] = [1, 2, 3] + w["b"] = [1, 2] + w["c"] = [66] + w["d"] = [10, 20, 30] + n = 3 + puts w.to_s + puts + puts "select_k" + puts "l0 expect ad" + select_k(w, method(:l0), n, 2) + puts "l1 expect cd" + select_k(w, method(:l1), n, 2) + puts "l2 expect c" + select_k(w, method(:l2), n, 1) + puts + puts "cut" + puts "l1 expect cd" + cut(w, method(:l1), n, 7) + puts + puts "median" + a = [1,2,3,4,5] + puts a.to_s + puts median(a, 5) + puts + puts "#{median(a, 7)} <- that's because we add missing 0s:" + puts a.concat(0.step(7-a.size-1).map{|i|0}).to_s + puts + puts "mean expect bc" + w.clear + w["a"] = [2] + w["b"] = [2.1] + w["c"] = [2.2] + cut(w, method(:mean), 1, 2.05) + exit +end +#_test() + + +def usage() + puts "lplp.rb <l0,l1,l2,linfty,mean,median> <cut|select_k> <k|threshold> <#shards> < <input>" + puts " l0...: norms for selection" + puts "select_k: only output top k (according to the norm of their column vector) features" + puts " cut: output features with weight >= threshold" + puts " n: if we do not have a shard count use this number for averaging" + exit 1 +end + +if ARGV.size < 4 then usage end +norm_fun = method(ARGV[0].to_sym) +type = ARGV[1] +x = ARGV[2].to_f +shard_count = ARGV[3].to_f + +STDIN.set_encoding 'utf-8' +STDOUT.set_encoding 'utf-8' + +w = {} +while line = STDIN.gets + key, val = line.split /\s+/ + if w.has_key? key + w[key].push val.to_f + else + w[key] = [val.to_f] + end +end + +if type == 'cut' + cut(w, norm_fun, shard_count, x) +elsif type == 'select_k' + select_k(w, norm_fun, shard_count, x) +else + puts "oh oh" +end + diff --git a/training/dtrain/pairsampling.h b/training/dtrain/pairsampling.h new file mode 100644 index 00000000..3f67e209 --- /dev/null +++ b/training/dtrain/pairsampling.h @@ -0,0 +1,140 @@ +#ifndef _DTRAIN_PAIRSAMPLING_H_ +#define _DTRAIN_PAIRSAMPLING_H_ + +namespace dtrain +{ + + +bool +accept_pair(score_t a, score_t b, score_t threshold) +{ + if (fabs(a - b) < threshold) return false; + return true; +} + +bool +cmp_hyp_by_score_d(ScoredHyp a, ScoredHyp b) +{ + return a.score > b.score; +} + +inline void +all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, bool misranked_only, float _unused=1) +{ + sort(s->begin(), s->end(), cmp_hyp_by_score_d); + unsigned sz = s->size(); + bool b = false; + unsigned count = 0; + for (unsigned i = 0; i < sz-1; i++) { + for (unsigned j = i+1; j < sz; j++) { + if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue; + if (threshold > 0) { + if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) + training.push_back(make_pair((*s)[i], (*s)[j])); + } else { + if ((*s)[i].score != (*s)[j].score) + training.push_back(make_pair((*s)[i], (*s)[j])); + } + if (++count == max) { + b = true; + break; + } + } + if (b) break; + } +} + +/* + * multipartite ranking + * sort (descending) by bleu + * compare top X to middle Y and low X + * cmp middle Y to low X + */ + +inline void +partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, bool misranked_only, float hi_lo) +{ + unsigned sz = s->size(); + if (sz < 2) return; + sort(s->begin(), s->end(), cmp_hyp_by_score_d); + unsigned sep = round(sz*hi_lo); + unsigned sep_hi = sep; + if (sz > 4) while (sep_hi < sz && (*s)[sep_hi-1].score == (*s)[sep_hi].score) ++sep_hi; + else sep_hi = 1; + bool b = false; + unsigned count = 0; + for (unsigned i = 0; i < sep_hi; i++) { + for (unsigned j = sep_hi; j < sz; j++) { + if 
(misranked_only && !((*s)[i].model <= (*s)[j].model)) continue; + if (threshold > 0) { + if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) + training.push_back(make_pair((*s)[i], (*s)[j])); + } else { + if ((*s)[i].score != (*s)[j].score) + training.push_back(make_pair((*s)[i], (*s)[j])); + } + if (++count == max) { + b = true; + break; + } + } + if (b) break; + } + unsigned sep_lo = sz-sep; + while (sep_lo > 0 && (*s)[sep_lo-1].score == (*s)[sep_lo].score) --sep_lo; + for (unsigned i = sep_hi; i < sz-sep_lo; i++) { + for (unsigned j = sz-sep_lo; j < sz; j++) { + if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue; + if (threshold > 0) { + if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) + training.push_back(make_pair((*s)[i], (*s)[j])); + } else { + if ((*s)[i].score != (*s)[j].score) + training.push_back(make_pair((*s)[i], (*s)[j])); + } + if (++count == max) return; + } + } +} + +/* + * pair sampling as in + * 'Tuning as Ranking' (Hopkins & May, 2011) + * count = 5000 + * threshold = 5% BLEU (0.05 for param 3) + * cut = top 50 + */ +bool +_PRO_cmp_pair_by_diff_d(pair<ScoredHyp,ScoredHyp> a, pair<ScoredHyp,ScoredHyp> b) +{ + return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score)); +} +inline void +PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, bool _unused=false, float _also_unused=0) +{ + unsigned max_count = 5000, count = 0, sz = s->size(); + bool b = false; + for (unsigned i = 0; i < sz-1; i++) { + for (unsigned j = i+1; j < sz; j++) { + if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) { + training.push_back(make_pair((*s)[i], (*s)[j])); + if (++count == max_count) { + b = true; + break; + } + } + } + if (b) break; + } + if (training.size() > 50) { + sort(training.begin(), training.end(), _PRO_cmp_pair_by_diff_d); + training.erase(training.begin()+50, training.end()); + } + return; +} + + +} // namespace + +#endif + diff --git a/training/dtrain/parallelize.rb b/training/dtrain/parallelize.rb new file mode 100755 index 00000000..e661416e --- /dev/null +++ b/training/dtrain/parallelize.rb @@ -0,0 +1,149 @@ +#!/usr/bin/env ruby + +require 'trollop' + +def usage + STDERR.write "Usage: " + STDERR.write "ruby parallelize.rb -c <dtrain.ini> [-e <epochs=10>] [--randomize/-z] [--reshard/-y] -s <#shards|0> [-p <at once=9999>] -i <input> -r <refs> [--qsub/-q] [--dtrain_binary <path to dtrain binary>] [-l \"l2 select_k 100000\"]\n" + exit 1 +end + +opts = Trollop::options do + opt :config, "dtrain config file", :type => :string + opt :epochs, "number of epochs", :type => :int, :default => 10 + opt :lplp_args, "arguments for lplp.rb", :type => :string, :default => "l2 select_k 100000" + opt :randomize, "randomize shards before each epoch", :type => :bool, :short => '-z', :default => false + opt :reshard, "reshard after each epoch", :type => :bool, :short => '-y', :default => false + opt :shards, "number of shards", :type => :int + opt :processes_at_once, "have this number (max) running at the same time", :type => :int, :default => 9999 + opt :input, "input", :type => :string + opt :references, "references", :type => :string + opt :qsub, "use qsub", :type => :bool, :default => false + opt :dtrain_binary, "path to dtrain binary", :type => :string +end +usage if not opts[:config]&&opts[:shards]&&opts[:input]&&opts[:references] + + +dtrain_dir = File.expand_path File.dirname(__FILE__) +if not opts[:dtrain_binary] + dtrain_bin = "#{dtrain_dir}/dtrain" +else + 
dtrain_bin = opts[:dtrain_binary] +end +ruby = '/usr/bin/ruby' +lplp_rb = "#{dtrain_dir}/lplp.rb" +lplp_args = opts[:lplp_args] +cat = '/bin/cat' + +ini = opts[:config] +epochs = opts[:epochs] +rand = opts[:randomize] +reshard = opts[:reshard] +predefined_shards = false +if opts[:shards] == 0 + predefined_shards = true + num_shards = 0 +else + num_shards = opts[:shards] +end +input = opts[:input] +refs = opts[:references] +use_qsub = opts[:qsub] +shards_at_once = opts[:processes_at_once] + +`mkdir work` + +def make_shards(input, refs, num_shards, epoch, rand) + lc = `wc -l #{input}`.split.first.to_i + index = (0..lc-1).to_a + index.reverse! + index.shuffle! if rand + shard_sz = lc / num_shards + leftover = lc % num_shards + in_f = File.new input, 'r' + in_lines = in_f.readlines + refs_f = File.new refs, 'r' + refs_lines = refs_f.readlines + shard_in_files = [] + shard_refs_files = [] + in_fns = [] + refs_fns = [] + 0.upto(num_shards-1) { |shard| + in_fn = "work/shard.#{shard}.#{epoch}.in" + shard_in = File.new in_fn, 'w+' + in_fns << in_fn + refs_fn = "work/shard.#{shard}.#{epoch}.refs" + shard_refs = File.new refs_fn, 'w+' + refs_fns << refs_fn + 0.upto(shard_sz-1) { |i| + j = index.pop + shard_in.write in_lines[j] + shard_refs.write refs_lines[j] + } + shard_in_files << shard_in + shard_refs_files << shard_refs + } + while leftover > 0 + j = index.pop + shard_in_files[-1].write in_lines[j] + shard_refs_files[-1].write refs_lines[j] + leftover -= 1 + end + (shard_in_files + shard_refs_files).each do |f| f.close end + in_f.close + refs_f.close + return [in_fns, refs_fns] +end + +input_files = [] +refs_files = [] +if predefined_shards + input_files = File.new(input).readlines.map {|i| i.strip } + refs_files = File.new(refs).readlines.map {|i| i.strip } + num_shards = input_files.size +else + input_files, refs_files = make_shards input, refs, num_shards, 0, rand +end + +0.upto(epochs-1) { |epoch| + puts "epoch #{epoch+1}" + pids = [] + input_weights = '' + if epoch > 0 then input_weights = "--input_weights work/weights.#{epoch-1}" end + weights_files = [] + shard = 0 + remaining_shards = num_shards + while remaining_shards > 0 + shards_at_once.times { + break if remaining_shards==0 + qsub_str_start = qsub_str_end = '' + local_end = '' + if use_qsub + qsub_str_start = "qsub -cwd -sync y -b y -j y -o work/out.#{shard}.#{epoch} -N dtrain.#{shard}.#{epoch} \"" + qsub_str_end = "\"" + local_end = '' + else + local_end = "&>work/out.#{shard}.#{epoch}" + end + pids << Kernel.fork { + `#{qsub_str_start}#{dtrain_bin} -c #{ini}\ + --input #{input_files[shard]}\ + --refs #{refs_files[shard]} #{input_weights}\ + --output work/weights.#{shard}.#{epoch}#{qsub_str_end} #{local_end}` + } + weights_files << "work/weights.#{shard}.#{epoch}" + shard += 1 + remaining_shards -= 1 + } + pids.each { |pid| Process.wait(pid) } + pids.clear + end + `#{cat} work/weights.*.#{epoch} > work/weights_cat` + `#{ruby} #{lplp_rb} #{lplp_args} #{num_shards} < work/weights_cat > work/weights.#{epoch}` + if rand and reshard and epoch+1!=epochs + input_files, refs_files = make_shards input, refs, num_shards, epoch+1, rand + end +} + +`rm work/weights_cat` + diff --git a/training/dtrain/score.cc b/training/dtrain/score.cc new file mode 100644 index 00000000..96d6e10a --- /dev/null +++ b/training/dtrain/score.cc @@ -0,0 +1,283 @@ +#include "score.h" + +namespace dtrain +{ + + +/* + * bleu + * + * as in "BLEU: a Method for Automatic Evaluation + * of Machine Translation" + * (Papineni et al. 
'02) + * + * NOTE: 0 if for one n \in {1..N} count is 0 + */ +score_t +BleuScorer::Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len) +{ + if (hyp_len == 0 || ref_len == 0) return 0.; + unsigned M = N_; + vector<score_t> v = w_; + if (ref_len < N_) { + M = ref_len; + for (unsigned i = 0; i < M; i++) v[i] = 1/((score_t)M); + } + score_t sum = 0; + for (unsigned i = 0; i < M; i++) { + if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) return 0.; + sum += v[i] * log((score_t)counts.clipped_[i]/counts.sum_[i]); + } + return brevity_penalty(hyp_len, ref_len) * exp(sum); +} + +score_t +BleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, + const unsigned /*rank*/, const unsigned /*src_len*/) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (hyp_len == 0 || ref_len == 0) return 0.; + NgramCounts counts = make_ngram_counts(hyp, ref, N_); + return Bleu(counts, hyp_len, ref_len); +} + +/* + * 'stupid' bleu + * + * as in "ORANGE: a Method for Evaluating + * Automatic Evaluation Metrics + * for Machine Translation" + * (Lin & Och '04) + * + * NOTE: 0 iff no 1gram match ('grounded') + */ +score_t +StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, + const unsigned /*rank*/, const unsigned /*src_len*/) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (hyp_len == 0 || ref_len == 0) return 0.; + NgramCounts counts = make_ngram_counts(hyp, ref, N_); + unsigned M = N_; + vector<score_t> v = w_; + if (ref_len < N_) { + M = ref_len; + for (unsigned i = 0; i < M; i++) v[i] = 1/((score_t)M); + } + score_t sum = 0, add = 0; + for (unsigned i = 0; i < M; i++) { + if (i == 0 && (counts.sum_[i] == 0 || counts.clipped_[i] == 0)) return 0.; + if (i == 1) add = 1; + sum += v[i] * log(((score_t)counts.clipped_[i] + add)/((counts.sum_[i] + add))); + } + return brevity_penalty(hyp_len, ref_len) * exp(sum); +} + +/* + * fixed 'stupid' bleu + * + * as in "Optimizing for Sentence-Level BLEU+1 + * Yields Short Translations" + * (Nakov et al. '12) + */ +score_t +FixedStupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, + const unsigned /*rank*/, const unsigned /*src_len*/) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (hyp_len == 0 || ref_len == 0) return 0.; + NgramCounts counts = make_ngram_counts(hyp, ref, N_); + unsigned M = N_; + vector<score_t> v = w_; + if (ref_len < N_) { + M = ref_len; + for (unsigned i = 0; i < M; i++) v[i] = 1/((score_t)M); + } + score_t sum = 0, add = 0; + for (unsigned i = 0; i < M; i++) { + if (i == 0 && (counts.sum_[i] == 0 || counts.clipped_[i] == 0)) return 0.; + if (i == 1) add = 1; + sum += v[i] * log(((score_t)counts.clipped_[i] + add)/((counts.sum_[i] + add))); + } + return brevity_penalty(hyp_len, ref_len+1) * exp(sum); // <- fix +} + +/* + * smooth bleu + * + * as in "An End-to-End Discriminative Approach + * to Machine Translation" + * (Liang et al. 
'06) + * + * NOTE: max is 0.9375 (with N=4) + */ +score_t +SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, + const unsigned /*rank*/, const unsigned /*src_len*/) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (hyp_len == 0 || ref_len == 0) return 0.; + NgramCounts counts = make_ngram_counts(hyp, ref, N_); + unsigned M = N_; + if (ref_len < N_) M = ref_len; + score_t sum = 0.; + vector<score_t> i_bleu; + for (unsigned i = 0; i < M; i++) i_bleu.push_back(0.); + for (unsigned i = 0; i < M; i++) { + if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) { + break; + } else { + score_t i_ng = log((score_t)counts.clipped_[i]/counts.sum_[i]); + for (unsigned j = i; j < M; j++) { + i_bleu[j] += (1/((score_t)j+1)) * i_ng; + } + } + sum += exp(i_bleu[i])/pow(2.0, (double)(N_-i)); + } + return brevity_penalty(hyp_len, ref_len) * sum; +} + +/* + * 'sum' bleu + * + * sum up Ngram precisions + */ +score_t +SumBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, + const unsigned /*rank*/, const unsigned /*src_len*/) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (hyp_len == 0 || ref_len == 0) return 0.; + NgramCounts counts = make_ngram_counts(hyp, ref, N_); + unsigned M = N_; + if (ref_len < N_) M = ref_len; + score_t sum = 0.; + unsigned j = 1; + for (unsigned i = 0; i < M; i++) { + if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break; + sum += ((score_t)counts.clipped_[i]/counts.sum_[i])/pow(2.0, (double) (N_-j+1)); + j++; + } + return brevity_penalty(hyp_len, ref_len) * sum; +} + +/* + * 'sum' (exp) bleu + * + * sum up exp(Ngram precisions) + */ +score_t +SumExpBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, + const unsigned /*rank*/, const unsigned /*src_len*/) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (hyp_len == 0 || ref_len == 0) return 0.; + NgramCounts counts = make_ngram_counts(hyp, ref, N_); + unsigned M = N_; + if (ref_len < N_) M = ref_len; + score_t sum = 0.; + unsigned j = 1; + for (unsigned i = 0; i < M; i++) { + if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break; + sum += exp(((score_t)counts.clipped_[i]/counts.sum_[i]))/pow(2.0, (double) (N_-j+1)); + j++; + } + return brevity_penalty(hyp_len, ref_len) * sum; +} + +/* + * 'sum' (whatever) bleu + * + * sum up exp(weight * log(Ngram precisions)) + */ +score_t +SumWhateverBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, + const unsigned /*rank*/, const unsigned /*src_len*/) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (hyp_len == 0 || ref_len == 0) return 0.; + NgramCounts counts = make_ngram_counts(hyp, ref, N_); + unsigned M = N_; + vector<score_t> v = w_; + if (ref_len < N_) { + M = ref_len; + for (unsigned i = 0; i < M; i++) v[i] = 1/((score_t)M); + } + score_t sum = 0.; + unsigned j = 1; + for (unsigned i = 0; i < M; i++) { + if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break; + sum += exp(v[i] * log(((score_t)counts.clipped_[i]/counts.sum_[i])))/pow(2.0, (double) (N_-j+1)); + j++; + } + return brevity_penalty(hyp_len, ref_len) * sum; +} + +/* + * approx. bleu + * + * as in "Online Large-Margin Training of Syntactic + * and Structural Translation Features" + * (Chiang et al. '08) + * + * NOTE: Needs some more code in dtrain.cc . + * No scaling by src len. 
+ */ +score_t +ApproxBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, + const unsigned rank, const unsigned src_len) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (ref_len == 0) return 0.; + score_t score = 0.; + NgramCounts counts(N_); + if (hyp_len > 0) { + counts = make_ngram_counts(hyp, ref, N_); + NgramCounts tmp = glob_onebest_counts_ + counts; + score = Bleu(tmp, hyp_len, ref_len); + } + if (rank == 0) { // 'context of 1best translations' + glob_onebest_counts_ += counts; + glob_onebest_counts_ *= discount_; + glob_hyp_len_ = discount_ * (glob_hyp_len_ + hyp_len); + glob_ref_len_ = discount_ * (glob_ref_len_ + ref_len); + glob_src_len_ = discount_ * (glob_src_len_ + src_len); + } + return score; +} + +/* + * Linear (Corpus) Bleu + * + * as in "Lattice Minimum Bayes-Risk Decoding + * for Statistical Machine Translation" + * (Tromble et al. '08) + * + */ +score_t +LinearBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref, + const unsigned rank, const unsigned /*src_len*/) +{ + unsigned hyp_len = hyp.size(), ref_len = ref.size(); + if (ref_len == 0) return 0.; + unsigned M = N_; + if (ref_len < N_) M = ref_len; + NgramCounts counts(M); + if (hyp_len > 0) + counts = make_ngram_counts(hyp, ref, M); + score_t ret = 0.; + for (unsigned i = 0; i < M; i++) { + if (counts.sum_[i] == 0 || onebest_counts_.sum_[i] == 0) break; + ret += counts.sum_[i]/onebest_counts_.sum_[i]; + } + ret = -(hyp_len/(score_t)onebest_len_) + (1./M) * ret; + if (rank == 0) { + onebest_len_ += hyp_len; + onebest_counts_ += counts; + } + return ret; +} + + +} // namespace + diff --git a/training/dtrain/score.h b/training/dtrain/score.h new file mode 100644 index 00000000..bddaa071 --- /dev/null +++ b/training/dtrain/score.h @@ -0,0 +1,217 @@ +#ifndef _DTRAIN_SCORE_H_ +#define _DTRAIN_SCORE_H_ + +#include "kbestget.h" + +using namespace std; + +namespace dtrain +{ + + +struct NgramCounts +{ + unsigned N_; + map<unsigned, score_t> clipped_; + map<unsigned, score_t> sum_; + + NgramCounts(const unsigned N) : N_(N) { Zero(); } + + inline void + operator+=(const NgramCounts& rhs) + { + if (rhs.N_ > N_) Resize(rhs.N_); + for (unsigned i = 0; i < N_; i++) { + this->clipped_[i] += rhs.clipped_.find(i)->second; + this->sum_[i] += rhs.sum_.find(i)->second; + } + } + + inline const NgramCounts + operator+(const NgramCounts &other) const + { + NgramCounts result = *this; + result += other; + return result; + } + + inline void + operator*=(const score_t rhs) + { + for (unsigned i = 0; i < N_; i++) { + this->clipped_[i] *= rhs; + this->sum_[i] *= rhs; + } + } + + inline void + Add(const unsigned count, const unsigned ref_count, const unsigned i) + { + assert(i < N_); + if (count > ref_count) { + clipped_[i] += ref_count; + } else { + clipped_[i] += count; + } + sum_[i] += count; + } + + inline void + Zero() + { + for (unsigned i = 0; i < N_; i++) { + clipped_[i] = 0.; + sum_[i] = 0.; + } + } + + inline void + One() + { + for (unsigned i = 0; i < N_; i++) { + clipped_[i] = 1.; + sum_[i] = 1.; + } + } + + inline void + Print() + { + for (unsigned i = 0; i < N_; i++) { + cout << i+1 << "grams (clipped):\t" << clipped_[i] << endl; + cout << i+1 << "grams:\t\t\t" << sum_[i] << endl; + } + } + + inline void Resize(unsigned N) + { + if (N == N_) return; + else if (N > N_) { + for (unsigned i = N_; i < N; i++) { + clipped_[i] = 0.; + sum_[i] = 0.; + } + } else { // N < N_ + for (unsigned i = N_-1; i > N-1; i--) { + clipped_.erase(i); + sum_.erase(i); + } + } + N_ = N; + } +}; + +typedef map<vector<WordID>, 
unsigned> Ngrams; + +inline Ngrams +make_ngrams(const vector<WordID>& s, const unsigned N) +{ + Ngrams ngrams; + vector<WordID> ng; + for (size_t i = 0; i < s.size(); i++) { + ng.clear(); + for (unsigned j = i; j < min(i+N, s.size()); j++) { + ng.push_back(s[j]); + ngrams[ng]++; + } + } + return ngrams; +} + +inline NgramCounts +make_ngram_counts(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned N) +{ + Ngrams hyp_ngrams = make_ngrams(hyp, N); + Ngrams ref_ngrams = make_ngrams(ref, N); + NgramCounts counts(N); + Ngrams::iterator it; + Ngrams::iterator ti; + for (it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++) { + ti = ref_ngrams.find(it->first); + if (ti != ref_ngrams.end()) { + counts.Add(it->second, ti->second, it->first.size() - 1); + } else { + counts.Add(it->second, 0, it->first.size() - 1); + } + } + return counts; +} + +struct BleuScorer : public LocalScorer +{ + score_t Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len); + score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/); +}; + +struct StupidBleuScorer : public LocalScorer +{ + score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/); +}; + +struct FixedStupidBleuScorer : public LocalScorer +{ + score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/); +}; + +struct SmoothBleuScorer : public LocalScorer +{ + score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/); +}; + +struct SumBleuScorer : public LocalScorer +{ + score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/); +}; + +struct SumExpBleuScorer : public LocalScorer +{ + score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/); +}; + +struct SumWhateverBleuScorer : public LocalScorer +{ + score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/); +}; + +struct ApproxBleuScorer : public BleuScorer +{ + NgramCounts glob_onebest_counts_; + unsigned glob_hyp_len_, glob_ref_len_, glob_src_len_; + score_t discount_; + + ApproxBleuScorer(unsigned N, score_t d) : glob_onebest_counts_(NgramCounts(N)), discount_(d) + { + glob_hyp_len_ = glob_ref_len_ = glob_src_len_ = 0; + } + + inline void Reset() { + glob_onebest_counts_.Zero(); + glob_hyp_len_ = glob_ref_len_ = glob_src_len_ = 0.; + } + + score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned src_len); +}; + +struct LinearBleuScorer : public BleuScorer +{ + unsigned onebest_len_; + NgramCounts onebest_counts_; + + LinearBleuScorer(unsigned N) : onebest_len_(1), onebest_counts_(N) + { + onebest_counts_.One(); + } + + score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned /*src_len*/); + + inline void Reset() { + onebest_len_ = 1; + onebest_counts_.One(); + } +}; + + +} // namespace + +#endif + diff --git a/training/fast_align.cc b/training/fast_align.cc deleted file mode 100644 index 0d7b0202..00000000 --- a/training/fast_align.cc +++ /dev/null @@ -1,271 +0,0 @@ -#include <iostream> -#include <cmath> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "m.h" -#include "corpus_tools.h" -#include "stringlib.h" -#include "filelib.h" -#include "ttables.h" -#include "tdict.h" - -namespace po = 
boost::program_options; -using namespace std; - -bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("input,i",po::value<string>(),"Parallel corpus input file") - ("reverse,r","Reverse estimation (swap source and target during training)") - ("iterations,I",po::value<unsigned>()->default_value(5),"Number of iterations of EM training") - //("bidir,b", "Run bidirectional alignment") - ("favor_diagonal,d", "Use a static alignment distribution that assigns higher probabilities to alignments near the diagonal") - ("prob_align_null", po::value<double>()->default_value(0.08), "When --favor_diagonal is set, what's the probability of a null alignment?") - ("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (<1 = flat >1 = sharp)") - ("variational_bayes,v","Infer VB estimate of parameters under a symmetric Dirichlet prior") - ("alpha,a", po::value<double>()->default_value(0.01), "Hyperparameter for optional Dirichlet prior") - ("no_null_word,N","Do not generate from a null token") - ("output_parameters,p", "Write model parameters instead of alignments") - ("beam_threshold,t",po::value<double>()->default_value(-4),"When writing parameters, log_10 of beam threshold for writing parameter (-10000 to include everything, 0 max parameter only)") - ("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model") - ("no_add_viterbi,V","When writing model parameters, do not add Viterbi alignment points (may generate a grammar where some training sentence pairs are unreachable)"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || conf->count("input") == 0) { - cerr << "Usage " << argv[0] << " [OPTIONS] -i corpus.fr-en\n"; - cerr << dcmdline_options << endl; - return false; - } - return true; -} - -double PosteriorInference(const vector<WordID>& src, const vector<WordID>& trg) { - double llh = 0; - static vector<double> unnormed_a_i; - if (src.size() > unnormed_a_i.size()) - unnormed_a_i.resize(src.size()); - return llh; -} - -int main(int argc, char** argv) { - po::variables_map conf; - if (!InitCommandLine(argc, argv, &conf)) return 1; - const string fname = conf["input"].as<string>(); - const bool reverse = conf.count("reverse") > 0; - const int ITERATIONS = conf["iterations"].as<unsigned>(); - const double BEAM_THRESHOLD = pow(10.0, conf["beam_threshold"].as<double>()); - const bool use_null = (conf.count("no_null_word") == 0); - const WordID kNULL = TD::Convert("<eps>"); - const bool add_viterbi = (conf.count("no_add_viterbi") == 0); - const bool variational_bayes = (conf.count("variational_bayes") > 0); - const bool write_alignments = (conf.count("output_parameters") == 0); - const double diagonal_tension = conf["diagonal_tension"].as<double>(); - const double prob_align_null = conf["prob_align_null"].as<double>(); 
- string testset; - if (conf.count("testset")) testset = conf["testset"].as<string>(); - const double prob_align_not_null = 1.0 - prob_align_null; - const double alpha = conf["alpha"].as<double>(); - const bool favor_diagonal = conf.count("favor_diagonal"); - if (variational_bayes && alpha <= 0.0) { - cerr << "--alpha must be > 0\n"; - return 1; - } - - TTable s2t, t2s; - TTable::Word2Word2Double s2t_viterbi; - double tot_len_ratio = 0; - double mean_srclen_multiplier = 0; - vector<double> unnormed_a_i; - for (int iter = 0; iter < ITERATIONS; ++iter) { - const bool final_iteration = (iter == (ITERATIONS - 1)); - cerr << "ITERATION " << (iter + 1) << (final_iteration ? " (FINAL)" : "") << endl; - ReadFile rf(fname); - istream& in = *rf.stream(); - double likelihood = 0; - double denom = 0.0; - int lc = 0; - bool flag = false; - string line; - string ssrc, strg; - vector<WordID> src, trg; - while(true) { - getline(in, line); - if (!in) break; - ++lc; - if (lc % 1000 == 0) { cerr << '.'; flag = true; } - if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; } - src.clear(); trg.clear(); - CorpusTools::ReadLine(line, &src, &trg); - if (reverse) swap(src, trg); - if (src.size() == 0 || trg.size() == 0) { - cerr << "Error: " << lc << "\n" << line << endl; - return 1; - } - if (src.size() > unnormed_a_i.size()) - unnormed_a_i.resize(src.size()); - if (iter == 0) - tot_len_ratio += static_cast<double>(trg.size()) / static_cast<double>(src.size()); - denom += trg.size(); - vector<double> probs(src.size() + 1); - bool first_al = true; // used for write_alignments - for (int j = 0; j < trg.size(); ++j) { - const WordID& f_j = trg[j]; - double sum = 0; - const double j_over_ts = double(j) / trg.size(); - double prob_a_i = 1.0 / (src.size() + use_null); // uniform (model 1) - if (use_null) { - if (favor_diagonal) prob_a_i = prob_align_null; - probs[0] = s2t.prob(kNULL, f_j) * prob_a_i; - sum += probs[0]; - } - double az = 0; - if (favor_diagonal) { - for (int ta = 0; ta < src.size(); ++ta) { - unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension); - az += unnormed_a_i[ta]; - } - az /= prob_align_not_null; - } - for (int i = 1; i <= src.size(); ++i) { - if (favor_diagonal) - prob_a_i = unnormed_a_i[i-1] / az; - probs[i] = s2t.prob(src[i-1], f_j) * prob_a_i; - sum += probs[i]; - } - if (final_iteration) { - if (add_viterbi || write_alignments) { - WordID max_i = 0; - double max_p = -1; - int max_index = -1; - if (use_null) { - max_i = kNULL; - max_index = 0; - max_p = probs[0]; - } - for (int i = 1; i <= src.size(); ++i) { - if (probs[i] > max_p) { - max_index = i; - max_p = probs[i]; - max_i = src[i-1]; - } - } - if (write_alignments) { - if (max_index > 0) { - if (first_al) first_al = false; else cout << ' '; - if (reverse) - cout << j << '-' << (max_index - 1); - else - cout << (max_index - 1) << '-' << j; - } - } - s2t_viterbi[max_i][f_j] = 1.0; - } - } else { - if (use_null) - s2t.Increment(kNULL, f_j, probs[0] / sum); - for (int i = 1; i <= src.size(); ++i) - s2t.Increment(src[i-1], f_j, probs[i] / sum); - } - likelihood += log(sum); - } - if (write_alignments && final_iteration) cout << endl; - } - - // log(e) = 1.0 - double base2_likelihood = likelihood / log(2); - - if (flag) { cerr << endl; } - if (iter == 0) { - mean_srclen_multiplier = tot_len_ratio / lc; - cerr << "expected target length = source length * " << mean_srclen_multiplier << endl; - } - cerr << " log_e likelihood: " << likelihood << endl; - cerr << " log_2 likelihood: " << 
base2_likelihood << endl; - cerr << " cross entropy: " << (-base2_likelihood / denom) << endl; - cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl; - if (!final_iteration) { - if (variational_bayes) - s2t.NormalizeVB(alpha); - else - s2t.Normalize(); - } - } - if (testset.size()) { - ReadFile rf(testset); - istream& in = *rf.stream(); - int lc = 0; - double tlp = 0; - string ssrc, strg, line; - while (getline(in, line)) { - ++lc; - vector<WordID> src, trg; - CorpusTools::ReadLine(line, &src, &trg); - double log_prob = Md::log_poisson(trg.size(), 0.05 + src.size() * mean_srclen_multiplier); - if (src.size() > unnormed_a_i.size()) - unnormed_a_i.resize(src.size()); - - // compute likelihood - for (int j = 0; j < trg.size(); ++j) { - const WordID& f_j = trg[j]; - double sum = 0; - const double j_over_ts = double(j) / trg.size(); - double prob_a_i = 1.0 / (src.size() + use_null); // uniform (model 1) - if (use_null) { - if (favor_diagonal) prob_a_i = prob_align_null; - sum += s2t.prob(kNULL, f_j) * prob_a_i; - } - double az = 0; - if (favor_diagonal) { - for (int ta = 0; ta < src.size(); ++ta) { - unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension); - az += unnormed_a_i[ta]; - } - az /= prob_align_not_null; - } - for (int i = 1; i <= src.size(); ++i) { - if (favor_diagonal) - prob_a_i = unnormed_a_i[i-1] / az; - sum += s2t.prob(src[i-1], f_j) * prob_a_i; - } - log_prob += log(sum); - } - tlp += log_prob; - cerr << ssrc << " ||| " << strg << " ||| " << log_prob << endl; - } - cerr << "TOTAL LOG PROB " << tlp << endl; - } - - if (write_alignments) return 0; - - for (TTable::Word2Word2Double::iterator ei = s2t.ttable.begin(); ei != s2t.ttable.end(); ++ei) { - const TTable::Word2Double& cpd = ei->second; - const TTable::Word2Double& vit = s2t_viterbi[ei->first]; - const string& esym = TD::Convert(ei->first); - double max_p = -1; - for (TTable::Word2Double::const_iterator fi = cpd.begin(); fi != cpd.end(); ++fi) - if (fi->second > max_p) max_p = fi->second; - const double threshold = max_p * BEAM_THRESHOLD; - for (TTable::Word2Double::const_iterator fi = cpd.begin(); fi != cpd.end(); ++fi) { - if (fi->second > threshold || (vit.find(fi->first) != vit.end())) { - cout << esym << ' ' << TD::Convert(fi->first) << ' ' << log(fi->second) << endl; - } - } - } - return 0; -} - diff --git a/training/feature_expectations.cc b/training/feature_expectations.cc deleted file mode 100644 index f1a85495..00000000 --- a/training/feature_expectations.cc +++ /dev/null @@ -1,232 +0,0 @@ -#include <sstream> -#include <iostream> -#include <fstream> -#include <vector> -#include <cassert> -#include <cmath> -#include <tr1/memory> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "verbose.h" -#include "hg.h" -#include "prob.h" -#include "inside_outside.h" -#include "ff_register.h" -#include "decoder.h" -#include "filelib.h" -#include "online_optimizer.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" -#include "sampler.h" - -#ifdef HAVE_MPI -#include <boost/mpi/timer.hpp> -#include <boost/mpi.hpp> -namespace mpi = boost::mpi; -#endif - -using namespace std; -namespace po = boost::program_options; - -struct FComp { - const vector<double>& w_; - FComp(const vector<double>& w) : w_(w) {} - bool operator()(int a, int b) const { - return fabs(w_[a]) > fabs(w_[b]); - } -}; - -void ShowFeatures(const vector<double>& w) { - vector<int> fnums(w.size()); - for (int i = 0; i < w.size(); ++i) - fnums[i] = 
i; - sort(fnums.begin(), fnums.end(), FComp(w)); - for (vector<int>::iterator i = fnums.begin(); i != fnums.end(); ++i) { - if (w[*i]) cout << FD::Convert(*i) << ' ' << w[*i] << endl; - } -} - -void ReadConfig(const string& ini, vector<string>* out) { - ReadFile rf(ini); - istream& in = *rf.stream(); - while(in) { - string line; - getline(in, line); - if (!in) continue; - out->push_back(line); - } -} - -void StoreConfig(const vector<string>& cfg, istringstream* o) { - ostringstream os; - for (int i = 0; i < cfg.size(); ++i) { os << cfg[i] << endl; } - o->str(os.str()); -} - -bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("input,i",po::value<string>(),"Corpus of source language sentences") - ("weights,w",po::value<string>(),"Input feature weights file") - ("decoder_config,c",po::value<string>(), "cdec.ini file"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || !conf->count("input") || !conf->count("decoder_config")) { - cerr << dcmdline_options << endl; - return false; - } - return true; -} - -void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* order) { - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - int id = 0; - while(in) { - getline(in, line); - if (!in) break; - if (id % size == rank) { - c->push_back(line); - order->push_back(id); - } - ++id; - } -} - -static const double kMINUS_EPSILON = -1e-6; - -struct TrainingObserver : public DecoderObserver { - void Reset() { - acc_exp.clear(); - total_complete = 0; - } - - virtual void NotifyDecodingStart(const SentenceMetadata& smeta) { - cur_model_exp.clear(); - state = 1; - } - - // compute model expectations, denominator of objective - virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { - assert(state == 1); - state = 2; - const prob_t z = InsideOutside<prob_t, - EdgeProb, - SparseVector<prob_t>, - EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp); - cur_model_exp /= z; - acc_exp += cur_model_exp; - } - - virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) { - cerr << "IGNORING ALIGNMENT FOREST!\n"; - } - - virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) { - if (state == 2) { - ++total_complete; - } - } - - void GetExpectations(SparseVector<double>* g) const { - g->clear(); - for (SparseVector<prob_t>::const_iterator it = acc_exp.begin(); it != acc_exp.end(); ++it) - g->set_value(it->first, it->second); - } - - int total_complete; - SparseVector<prob_t> cur_model_exp; - SparseVector<prob_t> acc_exp; - int state; -}; - -#ifdef HAVE_MPI -namespace boost { namespace mpi { - template<> - struct is_commutative<std::plus<SparseVector<double> >, SparseVector<double> > - : mpl::true_ { }; -} } // end namespace boost::mpi -#endif - -int main(int argc, char** argv) { -#ifdef HAVE_MPI - mpi::environment env(argc, argv); - mpi::communicator world; - const int 
size = world.size(); - const int rank = world.rank(); -#else - const int size = 1; - const int rank = 0; -#endif - if (size > 1) SetSilent(true); // turn off verbose decoder output - register_feature_functions(); - - po::variables_map conf; - if (!InitCommandLine(argc, argv, &conf)) - return 1; - - // load initial weights - Weights weights; - if (conf.count("weights")) - weights.InitFromFile(conf["weights"].as<string>()); - - vector<string> corpus; - vector<int> ids; - ReadTrainingCorpus(conf["input"].as<string>(), rank, size, &corpus, &ids); - assert(corpus.size() > 0); - - vector<string> cdec_ini; - ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini); - istringstream ini; - StoreConfig(cdec_ini, &ini); - Decoder decoder(&ini); - if (decoder.GetConf()["input"].as<string>() != "-") { - cerr << "cdec.ini must not set an input file\n"; - return 1; - } - - SparseVector<double> x; - weights.InitSparseVector(&x); - TrainingObserver observer; - - weights.InitFromVector(x); - vector<double> lambdas; - weights.InitVector(&lambdas); - decoder.SetWeights(lambdas); - observer.Reset(); - for (unsigned i = 0; i < corpus.size(); ++i) { - int id = ids[i]; - decoder.SetId(id); - decoder.Decode(corpus[i], &observer); - } - SparseVector<double> local_exps, exps; - observer.GetExpectations(&local_exps); -#ifdef HAVE_MPI - reduce(world, local_exps, exps, std::plus<SparseVector<double> >(), 0); -#else - exps.swap(local_exps); -#endif - - weights.InitFromVector(exps); - weights.InitVector(&lambdas); - ShowFeatures(lambdas); - - return 0; -} diff --git a/training/lbl_model.cc b/training/lbl_model.cc deleted file mode 100644 index a46ce33c..00000000 --- a/training/lbl_model.cc +++ /dev/null @@ -1,421 +0,0 @@ -#include <iostream> - -#include "config.h" -#ifndef HAVE_EIGEN - int main() { std::cerr << "Please rebuild with --with-eigen PATH\n"; return 1; } -#else - -#include <cstdlib> -#include <algorithm> -#include <cmath> -#include <set> -#include <cstring> // memset -#include <ctime> - -#ifdef HAVE_MPI -#include <boost/mpi/timer.hpp> -#include <boost/mpi.hpp> -#include <boost/archive/text_oarchive.hpp> -namespace mpi = boost::mpi; -#endif -#include <boost/math/special_functions/fpclassify.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> -#include <Eigen/Dense> - -#include "corpus_tools.h" -#include "optimize.h" -#include "array2d.h" -#include "m.h" -#include "lattice.h" -#include "stringlib.h" -#include "filelib.h" -#include "tdict.h" - -namespace po = boost::program_options; -using namespace std; - -#define kDIMENSIONS 10 -typedef Eigen::Matrix<double, kDIMENSIONS, 1> RVector; -typedef Eigen::Matrix<double, 1, kDIMENSIONS> RTVector; -typedef Eigen::Matrix<double, kDIMENSIONS, kDIMENSIONS> TMatrix; -vector<RVector> r_src, r_trg; - -#if HAVE_MPI -namespace boost { -namespace serialization { - -template<class Archive> -void serialize(Archive & ar, RVector & v, const unsigned int version) { - for (unsigned i = 0; i < kDIMENSIONS; ++i) - ar & v[i]; -} - -} // namespace serialization -} // namespace boost -#endif - -bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("input,i",po::value<string>(),"Input file") - ("iterations,I",po::value<unsigned>()->default_value(1000),"Number of iterations of training") - ("regularization_strength,C",po::value<double>()->default_value(0.1),"L2 regularization strength (0 for no regularization)") - ("eta", 
po::value<double>()->default_value(0.1f), "Eta for SGD") - ("source_embeddings,f", po::value<string>(), "File containing source embeddings (if unset, random vectors will be used)") - ("target_embeddings,e", po::value<string>(), "File containing target embeddings (if unset, random vectors will be used)") - ("random_seed,s", po::value<unsigned>(), "Random seed") - ("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (0 = uniform, >0 sharpens)") - ("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (argc < 2 || conf->count("help")) { - cerr << "Usage " << argv[0] << " [OPTIONS] -i corpus.fr-en\n"; - cerr << dcmdline_options << endl; - return false; - } - return true; -} - -void Normalize(RVector* v) { - double norm = v->norm(); - assert(norm > 0.0f); - *v /= norm; -} - -void Flatten(const TMatrix& m, vector<double>* v) { - unsigned c = 0; - v->resize(kDIMENSIONS * kDIMENSIONS); - for (unsigned i = 0; i < kDIMENSIONS; ++i) - for (unsigned j = 0; j < kDIMENSIONS; ++j) { - assert(boost::math::isfinite(m(i, j))); - (*v)[c++] = m(i,j); - } -} - -void Unflatten(const vector<double>& v, TMatrix* m) { - unsigned c = 0; - for (unsigned i = 0; i < kDIMENSIONS; ++i) - for (unsigned j = 0; j < kDIMENSIONS; ++j) { - assert(boost::math::isfinite(v[c])); - (*m)(i, j) = v[c++]; - } -} - -double ApplyRegularization(const double C, - const vector<double>& weights, - vector<double>* g) { - assert(weights.size() == g->size()); - double reg = 0; - for (size_t i = 0; i < weights.size(); ++i) { - const double& w_i = weights[i]; - double& g_i = (*g)[i]; - reg += C * w_i * w_i; - g_i += 2 * C * w_i; - } - return reg; -} - -void LoadEmbeddings(const string& filename, vector<RVector>* pv) { - vector<RVector>& v = *pv; - cerr << "Reading embeddings from " << filename << " ...\n"; - ReadFile rf(filename); - istream& in = *rf.stream(); - string line; - unsigned lc = 0; - while(getline(in, line)) { - ++lc; - size_t cur = line.find(' '); - if (cur == string::npos || cur == 0) { - cerr << "Parse error reading line " << lc << ":\n" << line << endl; - abort(); - } - WordID w = TD::Convert(line.substr(0, cur)); - if (w >= v.size()) continue; - RVector& curv = v[w]; - line[cur] = 0; - size_t start = cur + 1; - cur = start + 1; - size_t c = 0; - while(cur < line.size()) { - if (line[cur] == ' ') { - line[cur] = 0; - curv[c++] = strtod(&line[start], NULL); - start = cur + 1; - cur = start; - if (c == kDIMENSIONS) break; - } - ++cur; - } - if (c < kDIMENSIONS && cur != start) { - if (cur < line.size()) line[cur] = 0; - curv[c++] = strtod(&line[start], NULL); - } - if (c != kDIMENSIONS) { - static bool first = true; - if (first) { - cerr << " read " << c << " dimensions from embedding file, but built with " << kDIMENSIONS << " (filling in with random values)\n"; - first = false; - } - for (; c < 
kDIMENSIONS; ++c) curv[c] = rand(); - } - if (c == kDIMENSIONS && cur != line.size()) { - static bool first = true; - if (first) { - cerr << " embedding file contains more dimensions than configured with, truncating.\n"; - first = false; - } - } - } -} - -int main(int argc, char** argv) { -#ifdef HAVE_MPI - std::cerr << "**MPI enabled.\n"; - mpi::environment env(argc, argv); - mpi::communicator world; - const int size = world.size(); - const int rank = world.rank(); -#else - std::cerr << "**MPI disabled.\n"; - const int rank = 0; - const int size = 1; -#endif - po::variables_map conf; - if (!InitCommandLine(argc, argv, &conf)) return 1; - const string fname = conf["input"].as<string>(); - const double reg_strength = conf["regularization_strength"].as<double>(); - const bool has_l2 = reg_strength; - assert(reg_strength >= 0.0f); - const int ITERATIONS = conf["iterations"].as<unsigned>(); - const double eta = conf["eta"].as<double>(); - const double diagonal_tension = conf["diagonal_tension"].as<double>(); - bool SGD = false; - if (diagonal_tension < 0.0) { - cerr << "Invalid value for diagonal_tension: must be >= 0\n"; - return 1; - } - string testset; - if (conf.count("testset")) testset = conf["testset"].as<string>(); - - unsigned lc = 0; - vector<double> unnormed_a_i; - bool flag = false; - vector<vector<WordID> > srcs, trgs; - vector<WordID> vocab_e; - { - set<WordID> svocab_e, svocab_f; - CorpusTools::ReadFromFile(fname, &srcs, NULL, &trgs, &svocab_e, rank, size); - copy(svocab_e.begin(), svocab_e.end(), back_inserter(vocab_e)); - } - cerr << "Number of target word types: " << vocab_e.size() << endl; - const double num_examples = lc; - - boost::shared_ptr<LBFGSOptimizer> lbfgs; - if (rank == 0) - lbfgs.reset(new LBFGSOptimizer(kDIMENSIONS * kDIMENSIONS, 100)); - r_trg.resize(TD::NumWords() + 1); - r_src.resize(TD::NumWords() + 1); - vector<set<unsigned> > trg_pos(TD::NumWords() + 1); - - if (conf.count("random_seed")) { - srand(conf["random_seed"].as<unsigned>()); - } else { - unsigned seed = time(NULL) + rank * 100; - cerr << "Random seed: " << seed << endl; - srand(seed); - } - - TMatrix t = TMatrix::Zero(); - if (rank == 0) { - t = TMatrix::Random() / 50.0; - for (unsigned i = 1; i < r_trg.size(); ++i) { - r_trg[i] = RVector::Random(); - r_src[i] = RVector::Random(); - } - if (conf.count("source_embeddings")) - LoadEmbeddings(conf["source_embeddings"].as<string>(), &r_src); - if (conf.count("target_embeddings")) - LoadEmbeddings(conf["target_embeddings"].as<string>(), &r_trg); - } - - // do optimization - TMatrix g = TMatrix::Zero(); - vector<TMatrix> exp_src; - vector<double> z_src; - vector<double> flat_g, flat_t, rcv_grad; - Flatten(t, &flat_t); - bool converged = false; -#if HAVE_MPI - mpi::broadcast(world, &flat_t[0], flat_t.size(), 0); - mpi::broadcast(world, r_trg, 0); - mpi::broadcast(world, r_src, 0); -#endif - cerr << "rank=" << rank << ": " << r_trg[0][4] << endl; - for (int iter = 0; !converged && iter < ITERATIONS; ++iter) { - if (rank == 0) cerr << "ITERATION " << (iter + 1) << endl; - Unflatten(flat_t, &t); - double likelihood = 0; - double denom = 0.0; - lc = 0; - flag = false; - g *= 0; - for (unsigned i = 0; i < srcs.size(); ++i) { - const vector<WordID>& src = srcs[i]; - const vector<WordID>& trg = trgs[i]; - ++lc; - if (rank == 0 && lc % 1000 == 0) { cerr << '.'; flag = true; } - if (rank == 0 && lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; } - denom += trg.size(); - - exp_src.clear(); exp_src.resize(src.size(), TMatrix::Zero()); - 
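- // Sketch of the model trained below: each word carries an embedding
- // (r_src[s], r_trg[t]) and translation is scored bilinearly,
- // u = exp(r_s^T T r_t), so p(t | s) = u / Z(s), where Z(s) sums u over the
- // target vocabulary. The gradient of the log-likelihood w.r.t. T is the
- // difference between E[r_s r_t^T] under the references (exp_refs / z_refs)
- // and under the model (exp_src / z_src); both tables are accumulated below.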
z_src.clear(); z_src.resize(src.size(), 0.0); - Array2D<TMatrix> exp_refs(src.size(), trg.size(), TMatrix::Zero()); - Array2D<double> z_refs(src.size(), trg.size(), 0.0); - for (unsigned j = 0; j < trg.size(); ++j) - trg_pos[trg[j]].insert(j); - - for (unsigned i = 0; i < src.size(); ++i) { - const RVector& r_s = r_src[src[i]]; - const RTVector pred = r_s.transpose() * t; - TMatrix& exp_m = exp_src[i]; - double& z = z_src[i]; - for (unsigned k = 0; k < vocab_e.size(); ++k) { - const WordID v_k = vocab_e[k]; - const RVector& r_t = r_trg[v_k]; - const double dot_prod = pred * r_t; - const double u = exp(dot_prod); - z += u; - const TMatrix v = r_s * r_t.transpose() * u; - exp_m += v; - set<unsigned>& ref_locs = trg_pos[v_k]; - if (!ref_locs.empty()) { - for (set<unsigned>::iterator it = ref_locs.begin(); it != ref_locs.end(); ++it) { - TMatrix& exp_ref_ij = exp_refs(i, *it); - double& z_ref_ij = z_refs(i, *it); - z_ref_ij += u; - exp_ref_ij += v; - } - } - } - } - for (unsigned j = 0; j < trg.size(); ++j) - trg_pos[trg[j]].clear(); - - // model expectations for a single target generation with - // uniform alignment prior - // TODO: when using a non-uniform alignment, m_exp will be - // a function of j (below) - double m_z = 0; - TMatrix m_exp = TMatrix::Zero(); - for (unsigned i = 0; i < src.size(); ++i) { - m_exp += exp_src[i]; - m_z += z_src[i]; - } - m_exp /= m_z; - - Array2D<bool> al(src.size(), trg.size(), false); - for (unsigned j = 0; j < trg.size(); ++j) { - double ref_z = 0; - TMatrix ref_exp = TMatrix::Zero(); - int max_i = 0; - double max_s = -9999999; - for (unsigned i = 0; i < src.size(); ++i) { - ref_exp += exp_refs(i, j); - ref_z += z_refs(i, j); - if (log(z_refs(i, j)) > max_s) { - max_s = log(z_refs(i, j)); - max_i = i; - } - // TODO handle alignment prob - } - if (ref_z <= 0) { - cerr << "TRG=" << TD::Convert(trg[j]) << endl; - cerr << " LINE=" << lc << " (RANK=" << rank << "/" << size << ")" << endl; - cerr << " REF_EXP=\n" << ref_exp << endl; - cerr << " M_EXP=\n" << m_exp << endl; - abort(); - } - al(max_i, j) = true; - ref_exp /= ref_z; - g += m_exp - ref_exp; - likelihood += log(ref_z) - log(m_z); - if (SGD) { - t -= g * eta / num_examples; - g *= 0; - } - } - - if (rank == 0 && (iter == (ITERATIONS - 1) || lc < 12)) { cerr << al << endl; } - } - if (flag && rank == 0) { cerr << endl; } - - double obj = 0; - if (!SGD) { - Flatten(g, &flat_g); - obj = -likelihood; -#if HAVE_MPI - rcv_grad.resize(flat_g.size(), 0.0); - mpi::reduce(world, &flat_g[0], flat_g.size(), &rcv_grad[0], plus<double>(), 0); - swap(flat_g, rcv_grad); - rcv_grad.clear(); - - double to = 0; - mpi::reduce(world, obj, to, plus<double>(), 0); - obj = to; - double tlh = 0; - mpi::reduce(world, likelihood, tlh, plus<double>(), 0); - likelihood = tlh; - double td = 0; - mpi::reduce(world, denom, td, plus<double>(), 0); - denom = td; -#endif - } - - if (rank == 0) { - double gn = 0; - for (unsigned i = 0; i < flat_g.size(); ++i) - gn += flat_g[i]*flat_g[i]; - const double base2_likelihood = likelihood / log(2); - cerr << " log_e likelihood: " << likelihood << endl; - cerr << " log_2 likelihood: " << base2_likelihood << endl; - cerr << " cross entropy: " << (-base2_likelihood / denom) << endl; - cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl; - cerr << " gradient norm: " << sqrt(gn) << endl; - if (!SGD) { - if (has_l2) { - const double r = ApplyRegularization(reg_strength, - flat_t, - &flat_g); - obj += r; - cerr << " regularization: " << r << endl; - } - lbfgs->Optimize(obj, flat_g, 
&flat_t); - converged = (lbfgs->HasConverged()); - } - } -#ifdef HAVE_MPI - mpi::broadcast(world, &flat_t[0], flat_t.size(), 0); - mpi::broadcast(world, converged, 0); -#endif - } - if (rank == 0) - cerr << "TRANSLATION MATRIX:" << endl << t << endl; - return 0; -} - -#endif - diff --git a/training/liblbfgs/Jamfile b/training/liblbfgs/Jamfile deleted file mode 100644 index 49c82748..00000000 --- a/training/liblbfgs/Jamfile +++ /dev/null @@ -1,5 +0,0 @@ -import testing ; - -lib liblbfgs : lbfgs.c : <include>.. ; - -unit-test ll_test : ll_test.cc liblbfgs : <include>.. ; diff --git a/training/liblbfgs/Makefile.am b/training/liblbfgs/Makefile.am index 64a3794d..272d6f56 100644 --- a/training/liblbfgs/Makefile.am +++ b/training/liblbfgs/Makefile.am @@ -6,10 +6,17 @@ ll_test_LDADD = liblbfgs.a -lz noinst_LIBRARIES = liblbfgs.a -liblbfgs_a_SOURCES = lbfgs.c +liblbfgs_a_SOURCES = \ + lbfgs.c \ + arithmetic_ansi.h \ + arithmetic_sse_double.h \ + arithmetic_sse_float.h \ + lbfgs++.h \ + lbfgs.h ################################################################ # do NOT NOT NOT add any other -I includes NO NO NO NO NO ###### AM_LDFLAGS = liblbfgs.a -lz -AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wall -I. -I.. +AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wall -I$(top_srcdir)/training -I$(top_srcdir)/training/liblbfgs ################################################################ + diff --git a/training/minrisk/Makefile.am b/training/minrisk/Makefile.am new file mode 100644 index 00000000..ebf6fa91 --- /dev/null +++ b/training/minrisk/Makefile.am @@ -0,0 +1,8 @@ +bin_PROGRAMS = minrisk_optimize + +minrisk_optimize_SOURCES = minrisk_optimize.cc +minrisk_optimize_LDADD = ../../training/utils/libtraining_utils.a ../../decoder/libcdec.a ../../mteval/libmteval.a ../../utils/libutils.a ../../training/liblbfgs/liblbfgs.a + +EXTRA_DIST = minrisk.pl minrisk_generate_input.pl + +AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training -I$(top_srcdir)/training/utils diff --git a/training/minrisk/minrisk.pl b/training/minrisk/minrisk.pl new file mode 100755 index 00000000..0f8bacd0 --- /dev/null +++ b/training/minrisk/minrisk.pl @@ -0,0 +1,540 @@ +#!/usr/bin/env perl +use strict; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use IPC::Open2; +use POSIX ":sys_wait_h"; +my $QSUB_CMD = qsub_args(mert_memory()); +my $default_jobs = env_default_jobs(); + +my $UTILS_DIR="$SCRIPT_DIR/../utils"; +require "$UTILS_DIR/libcall.pl"; + +# Default settings +my $srcFile; +my $refFiles; +my $bin_dir = $SCRIPT_DIR; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $MAPINPUT = "$bin_dir/minrisk_generate_input.pl"; +my $MAPPER = "$bin_dir/minrisk_optimize"; +my $parallelize = "$UTILS_DIR/parallelize.pl"; +my $libcall = "$UTILS_DIR/libcall.pl"; +my $sentserver = "$UTILS_DIR/sentserver"; +my $sentclient = "$UTILS_DIR/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm"; + +my $SCORER = $FAST_SCORE; +die "Can't find $MAPPER" unless -x $MAPPER; +my $cdec = "$bin_dir/../../decoder/cdec"; +die "Can't find decoder in $cdec" 
unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $lines_per_mapper = 30; +my $iteration = 1; +my $best_weights; +my $psi = 1; +my $default_max_iter = 30; +my $max_iterations = $default_max_iter; +my $jobs = $default_jobs; # number of decode nodes +my $pmem = "4g"; +my $disable_clean = 0; +my %seen_weights; +my $help = 0; +my $epsilon = 0.0001; +my $dryrun = 0; +my $last_score = -10000000; +my $metric = "ibm_bleu"; +my $dir; +my $iniFile; +my $weights; +my $use_make = 1; # use make to parallelize +my $useqsub = 0; +my $initial_weights; +my $pass_suffix = ''; +my $cpbin=1; + +# regularization strength +my $tune_regularizer = 0; +my $reg = 500; +my $reg_previous = 5000; +my $dont_accum = 0; + +# Process command-line options +Getopt::Long::Configure("no_auto_abbrev"); +if (GetOptions( + "jobs=i" => \$jobs, + "dont-clean" => \$disable_clean, + "dont-accumulate" => \$dont_accum, + "pass-suffix=s" => \$pass_suffix, + "qsub" => \$useqsub, + "dry-run" => \$dryrun, + "epsilon=s" => \$epsilon, + "help" => \$help, + "weights=s" => \$initial_weights, + "reg=f" => \$reg, + "use-make=i" => \$use_make, + "max-iterations=i" => \$max_iterations, + "pmem=s" => \$pmem, + "cpbin!" => \$cpbin, + "ref-files=s" => \$refFiles, + "metric=s" => \$metric, + "source-file=s" => \$srcFile, + "workdir=s" => \$dir, +) == 0 || @ARGV!=1 || $help) { + print_help(); + exit; +} + +die "--tune-regularizer is no longer supported with --reg-previous and --reg. Please tune manually.\n" if $tune_regularizer; + +if ($useqsub) { + $use_make = 0; + die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); +} + +my @missing_args = (); +if (!defined $srcFile) { push @missing_args, "--source-file"; } +if (!defined $refFiles) { push @missing_args, "--ref-files"; } +if (!defined $initial_weights) { push @missing_args, "--weights"; } +die "Please specify missing arguments: " . join (', ', @missing_args) . 
"\n" if (@missing_args); + +if ($metric =~ /^(combi|ter)$/i) { + $lines_per_mapper = 5; +} + +($iniFile) = @ARGV; + + +sub write_config; +sub enseg; +sub print_help; + +my $nodelist; +my $host =check_output("hostname"); chomp $host; +my $bleu; +my $interval_count = 0; +my $logfile; +my $projected_score; + +# used in sorting scores +my $DIR_FLAG = '-r'; +if ($metric =~ /^ter$|^aer$/i) { + $DIR_FLAG = ''; +} + +my $refs_comma_sep = get_comma_sep_refs('r',$refFiles); + +unless ($dir){ + $dir = "minrisk"; +} +unless ($dir =~ /^\//){ # convert relative path to absolute path + my $basedir = check_output("pwd"); + chomp $basedir; + $dir = "$basedir/$dir"; +} + + +# Initializations and helper functions +srand; + +my @childpids = (); +my @cleanupcmds = (); + +sub cleanup { + print STDERR "Cleanup...\n"; + for my $pid (@childpids){ unchecked_call("kill $pid"); } + for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } + exit 1; +}; +# Always call cleanup, no matter how we exit +*CORE::GLOBAL::exit = + sub{ cleanup(); }; +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + +my $decoderBase = check_output("basename $decoder"); chomp $decoderBase; +my $newIniFile = "$dir/$decoderBase.ini"; +my $inputFileName = "$dir/input"; +my $user = $ENV{"USER"}; +# process ini file +-e $iniFile || die "Error: could not open $iniFile for reading\n"; +open(INI, $iniFile); + +use File::Basename qw(basename); +#pass bindir, refs to vars holding bin +sub modbin { + local $_; + my $bindir=shift; + check_call("mkdir -p $bindir"); + -d $bindir || die "couldn't make bindir $bindir"; + for (@_) { + my $src=$$_; + $$_="$bindir/".basename($src); + check_call("cp -p $src $$_"); + } +} +sub dirsize { + opendir ISEMPTY,$_[0]; + return scalar(readdir(ISEMPTY))-1; +} +my @allweights; +if ($dryrun){ + write_config(*STDERR); + exit 0; +} else { + if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-pro.pl outputs + die "ERROR: working dir $dir already exists\n\n"; + } else { + -e $dir || mkdir $dir; + mkdir "$dir/hgs"; + modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin; + mkdir "$dir/scripts"; + my $cmdfile="$dir/rerun-pro.sh"; + open CMD,'>',$cmdfile; + print CMD "cd ",&getcwd,"\n"; +# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted. 
+ my $cline=&cmdline."\n"; + print CMD $cline; + close CMD; + print STDERR $cline; + chmod(0755,$cmdfile); + check_call("cp $initial_weights $dir/weights.0"); + die "Can't find weights.0" unless (-e "$dir/weights.0"); + } + write_config(*STDERR); +} + + +# Generate initial files and values +check_call("cp $iniFile $newIniFile"); +$iniFile = $newIniFile; + +my $newsrc = "$dir/dev.input"; +enseg($srcFile, $newsrc); +$srcFile = $newsrc; +my $devSize = 0; +open F, "<$srcFile" or die "Can't read $srcFile: $!"; +while(<F>) { $devSize++; } +close F; + +unless($best_weights){ $best_weights = $weights; } +unless($projected_score){ $projected_score = 0.0; } +$seen_weights{$weights} = 1; +my $kbest = "$dir/kbest"; +if ($dont_accum) { + $kbest = ''; +} else { + check_call("mkdir -p $kbest"); + $kbest = "--kbest_repository $kbest"; +} + +my $random_seed = int(time / 1000); +my $lastWeightsFile; +my $lastPScore = 0; +# main optimization loop +while (1){ + print STDERR "\n\nITERATION $iteration\n==========\n"; + + if ($iteration > $max_iterations){ + print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n"; + last; + } + # iteration-specific files + my $runFile="$dir/run.raw.$iteration"; + my $onebestFile="$dir/1best.$iteration"; + my $logdir="$dir/logs.$iteration"; + my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; + my $scorerLog="$logdir/scorer.log.$iteration"; + check_call("mkdir -p $logdir"); + + + #decode + print STDERR "RUNNING DECODER AT "; + print STDERR unchecked_output("date"); + my $im1 = $iteration - 1; + my $weightsFile="$dir/weights.$im1"; + push @allweights, "-w $dir/weights.$im1"; + `rm -f $dir/hgs/*.gz`; + my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; + my $pcmd; + if ($use_make) { + $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --"; + } else { + $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --"; + } + my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + my $num_hgs; + my $num_topbest; + my $retries = 0; + while($retries < 5) { + $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l"); + $num_topbest = check_output("wc -l < $runFile"); + print STDERR "NUMBER OF HGs: $num_hgs\n"; + print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; + if($devSize == $num_hgs && $devSize == $num_topbest) { + last; + } else { + print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n"; + sleep(3); + } + $retries++; + } + die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? 
Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest);
+  my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric");
+  chomp $dec_score;
+  print STDERR "DECODER SCORE: $dec_score\n";
+
+  # save space
+  check_call("gzip -f $runFile");
+  check_call("gzip -f $decoderLog");
+
+  # run optimizer
+  print STDERR "RUNNING OPTIMIZER AT ";
+  print STDERR unchecked_output("date");
+  print STDERR " - GENERATE TRAINING EXEMPLARS\n";
+  my $mergeLog="$logdir/prune-merge.log.$iteration";
+
+  my $score = 0;
+  my $icc = 0;
+  my $inweights="$dir/weights.$im1";
+  my $outweights="$dir/weights.$iteration";
+  $cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1";
+  print STDERR "COMMAND:\n$cmd\n";
+  check_call($cmd);
+  $cmd="$MAPPER $refs_comma_sep -m $metric -i $dir/agenda.$im1 $kbest -w $inweights > $outweights";
+  check_call($cmd);
+  $lastWeightsFile = $outweights;
+  $iteration++;
+  `rm -f $dir/hgs/*.gz`; # remove this pass's forests from the working dir
+  print STDERR "\n==========\n";
+}
+
+print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n";
+
+print STDOUT "$lastWeightsFile\n";
+
+exit 0;
+
+sub get_lines {
+  my $fn = shift @_;
+  open FL, "<$fn" or die "Couldn't read $fn: $!";
+  my $lc = 0;
+  while(<FL>) { $lc++; }
+  return $lc;
+}
+
+sub get_comma_sep_refs {
+  my ($r,$p) = @_;
+  my $o = check_output("echo $p");
+  chomp $o;
+  my @files = split /\s+/, $o;
+  return "-$r " . join(" -$r ", @files);
+}
+
+sub read_weights_file {
+  my ($file) = @_;
+  open F, "<$file" or die "Couldn't read $file: $!";
+  my @r = ();
+  my $pm = -1;
+  while(<F>) {
+    next if /^#/;
+    next if /^\s*$/;
+    chomp;
+    if (/^(.+)\s+(.+)$/) {
+      my $m = $1;
+      my $w = $2;
+      die "Weights out of order: $m <= $pm" unless $m > $pm;
+      push @r, $w;
+    } else {
+      warn "Unexpected feature name in weight file: $_";
+    }
+  }
+  close F;
+  return join ' ', @r;
+}
+
+# subs
+sub write_config {
+  my $fh = shift;
+  my $cleanup = "yes";
+  if ($disable_clean) {$cleanup = "no";}
+
+  print $fh "\n";
+  print $fh "DECODER: $decoder\n";
+  print $fh "INI FILE: $iniFile\n";
+  print $fh "WORKING DIR: $dir\n";
+  print $fh "SOURCE (DEV): $srcFile\n";
+  print $fh "REFS (DEV): $refFiles\n";
+  print $fh "EVAL METRIC: $metric\n";
+  print $fh "MAX ITERATIONS: $max_iterations\n";
+  print $fh "JOBS: $jobs\n";
+  print $fh "HEAD NODE: $host\n";
+  print $fh "PMEM (DECODING): $pmem\n";
+  print $fh "CLEANUP: $cleanup\n";
+}
+
+sub update_weights_file {
+  my ($neww, $rfn, $rpts) = @_;
+  my @feats = @$rfn;
+  my @pts = @$rpts;
+  my $num_feats = scalar @feats;
+  my $num_pts = scalar @pts;
+  die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts;
+  open G, ">$neww" or die;
+  for (my $i = 0; $i < $num_feats; $i++) {
+    my $f = $feats[$i];
+    my $lambda = $pts[$i];
+    print G "$f $lambda\n";
+  }
+  close G;
+}
+
+sub enseg {
+  my $src = shift;
+  my $newsrc = shift;
+  open(SRC, $src);
+  open(NEWSRC, ">$newsrc");
+  my $i=0;
+  while (my $line=<SRC>){
+    chomp $line;
+    if ($line =~ /^\s*<seg/i) {
+      if($line =~ /id="[0-9]+"/) {
+        print NEWSRC "$line\n";
+      } else {
+        die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+      }
+    } else {
+      print NEWSRC "<seg id=\"$i\">$line</seg>\n";
+    }
+    $i++;
+  }
+  close SRC;
+  close NEWSRC;
+  die "Empty dev set!" if ($i == 0);
+}
+
+sub print_help {
+
+  my $executable = check_output("basename $0"); chomp $executable;
+  print << "Help";
+
+Usage: $executable [options] <ini file>
+
+  $executable [options] <ini file>
+    Runs a complete minimum risk optimization using the ini file specified.
+ +Required: + + --ref-files <files> + Dev set ref files. This option takes only a single string argument. + To use multiple files (including file globbing), this argument should + be quoted. + + --source-file <file> + Dev set source file. + + --weights <file> + Initial weights file (use empty file to start from 0) + +General options: + + --help + Print this message and exit. + + --dont-accumulate + Don't accumulate k-best lists from multiple iterations. + + --max-iterations <M> + Maximum number of iterations to run. If not specified, defaults + to $default_max_iter. + + --metric <method> + Metric to optimize. + Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + + --pass-suffix <S> + If the decoder is doing multi-pass decoding, the pass suffix "2", + "3", etc., is used to control what iteration of weights is set. + + --workdir <dir> + Directory for intermediate and output files. If not specified, the + name is derived from the ini filename. Assuming that the ini + filename begins with the decoder name and ends with ini, the default + name of the working directory is inferred from the middle part of + the filename. E.g. an ini file named decoder.foo.ini would have + a default working directory name foo. + +Regularization options: + + --reg <F> + l2 regularization strength [default=500]. The greater this value, + the closer to zero the weights will be. + +Job control options: + + --jobs <I> + Number of decoder processes to run in parallel. [default=$default_jobs] + + --qsub + Use qsub to run jobs in parallel (qsub must be configured in + environment/LocalEnvironment.pm) + + --pmem <N> + Amount of physical memory requested for parallel decoding jobs + (used with qsub requests only) + +Help +} + +sub convert { + my ($str) = @_; + my @ps = split /;/, $str; + my %dict = (); + for my $p (@ps) { + my ($k, $v) = split /=/, $p; + $dict{$k} = $v; + } + return %dict; +} + + +sub cmdline { + return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? 
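+# Sketch of the quoting protocol implemented below: an argument containing
+# any shell metacharacter is wrapped in double quotes, with embedded
+# \ " $ ` ! backslash-escaped first, e.g.
+#   escape_shell(q{say "hi" $USER})  returns  "say \"hi\" \$USER"
+# while plain arguments are passed through unchanged.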
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} + +sub escaped_shell_args { + return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { + return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { + return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} diff --git a/training/minrisk/minrisk_generate_input.pl b/training/minrisk/minrisk_generate_input.pl new file mode 100755 index 00000000..b30fc4fd --- /dev/null +++ b/training/minrisk/minrisk_generate_input.pl @@ -0,0 +1,18 @@ +#!/usr/bin/perl -w +use strict; + +die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1; +my $d = shift @ARGV; +die "Can't find directory $d" unless -d $d; + +opendir(DIR, $d) or die "Can't read $d: $!"; +my @hgs = grep { /\.gz$/ } readdir(DIR); +closedir DIR; + +for my $hg (@hgs) { + my $file = $hg; + my $id = $hg; + $id =~ s/(\.json)?\.gz//; + print "$d/$file $id\n"; +} + diff --git a/training/minrisk/minrisk_optimize.cc b/training/minrisk/minrisk_optimize.cc new file mode 100644 index 00000000..da8b5260 --- /dev/null +++ b/training/minrisk/minrisk_optimize.cc @@ -0,0 +1,197 @@ +#include <sstream> +#include <iostream> +#include <vector> +#include <limits> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "liblbfgs/lbfgs++.h" +#include "filelib.h" +#include "stringlib.h" +#include "weights.h" +#include "hg_io.h" +#include "kbest.h" +#include "viterbi.h" +#include "ns.h" +#include "ns_docscorer.h" +#include "candidate_set.h" +#include "risk.h" +#include "entropy.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)") + ("weights,w",po::value<string>(), "[REQD] Weights files from current iterations") + ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)") + ("evaluation_metric,m",po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)") + ("temperature,T",po::value<double>()->default_value(0.0), "Temperature parameter for objective (>0 increases the entropy)") + ("l1_strength,C",po::value<double>()->default_value(0.0), "L1 regularization strength") + ("memory_buffers,M",po::value<unsigned>()->default_value(20), "Memory buffers used in LBFGS") + ("kbest_repository,R",po::value<string>(), "Accumulate k-best lists from previous iterations (parameter is path to repository)") + ("kbest_size,k",po::value<unsigned>()->default_value(500u), "Top k-hypotheses to extract") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (!conf->count("reference")) { + cerr << "Please specify one or more references using -r <REF.TXT>\n"; + flag = true; + } + if (!conf->count("weights")) { + cerr << "Please specify weights using -w <WEIGHTS.TXT>\n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +EvaluationMetric* metric = NULL; + +struct RiskObjective { + explicit 
RiskObjective(const vector<training::CandidateSet>& tr, const double temp) : training(tr), T(temp) {} + double operator()(const vector<double>& x, double* g) const { + fill(g, g + x.size(), 0.0); + double obj = 0; + double h = 0; + for (unsigned i = 0; i < training.size(); ++i) { + training::CandidateSetRisk risk(training[i], *metric); + training::CandidateSetEntropy entropy(training[i]); + SparseVector<double> tg, hg; + double r = risk(x, &tg); + double hh = entropy(x, &hg); + h += hh; + obj += r; + for (SparseVector<double>::iterator it = tg.begin(); it != tg.end(); ++it) + g[it->first] += it->second; + if (T) { + for (SparseVector<double>::iterator it = hg.begin(); it != hg.end(); ++it) + g[it->first] += T * it->second; + } + } + cerr << (1-(obj / training.size())) << " H=" << h << endl; + return obj - T * h; + } + const vector<training::CandidateSet>& training; + const double T; // temperature for entropy regularization +}; + +double LearnParameters(const vector<training::CandidateSet>& training, + const double temp, // > 0 increases the entropy, < 0 decreases the entropy + const double C1, + const unsigned memory_buffers, + vector<weight_t>* px) { + RiskObjective obj(training, temp); + LBFGS<RiskObjective> lbfgs(px, obj, memory_buffers, C1); + lbfgs.MinimizeFunction(); + return 0; +} + +#if 0 +struct FooLoss { + double operator()(const vector<double>& x, double* g) const { + fill(g, g + x.size(), 0.0); + training::CandidateSet cs; + training::CandidateSetEntropy cse(cs); + cs.cs.resize(3); + cs.cs[0].fmap.set_value(FD::Convert("F1"), -1.0); + cs.cs[1].fmap.set_value(FD::Convert("F2"), 1.0); + cs.cs[2].fmap.set_value(FD::Convert("F1"), 2.0); + cs.cs[2].fmap.set_value(FD::Convert("F2"), 0.5); + SparseVector<double> xx; + double h = cse(x, &xx); + cerr << cse(x, &xx) << endl; cerr << "G: " << xx << endl; + for (SparseVector<double>::iterator i = xx.begin(); i != xx.end(); ++i) + g[i->first] += i->second; + return -h; + } +}; +#endif + +int main(int argc, char** argv) { +#if 0 + training::CandidateSet cs; + training::CandidateSetEntropy cse(cs); + cs.cs.resize(3); + cs.cs[0].fmap.set_value(FD::Convert("F1"), -1.0); + cs.cs[1].fmap.set_value(FD::Convert("F2"), 1.0); + cs.cs[2].fmap.set_value(FD::Convert("F1"), 2.0); + cs.cs[2].fmap.set_value(FD::Convert("F2"), 0.5); + FooLoss foo; + vector<double> ww(FD::NumFeats()); ww[FD::Convert("F1")] = 1.0; + LBFGS<FooLoss> lbfgs(&ww, foo, 100, 0.0); + lbfgs.MinimizeFunction(); + return 1; +#endif + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string evaluation_metric = conf["evaluation_metric"].as<string>(); + + metric = EvaluationMetric::Instance(evaluation_metric); + DocumentScorer ds(metric, conf["reference"].as<vector<string> >()); + cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; + + Hypergraph hg; + string last_file; + ReadFile in_read(conf["input"].as<string>()); + string kbest_repo; + if (conf.count("kbest_repository")) { + kbest_repo = conf["kbest_repository"].as<string>(); + MkDirP(kbest_repo); + } + istream &in=*in_read.stream(); + const unsigned kbest_size = conf["kbest_size"].as<unsigned>(); + vector<weight_t> weights; + const string weightsf = conf["weights"].as<string>(); + Weights::InitFromFile(weightsf, &weights); + double t = 0; + for (unsigned i = 0; i < weights.size(); ++i) + t += weights[i] * weights[i]; + if (t > 0) { + for (unsigned i = 0; i < weights.size(); ++i) + weights[i] /= sqrt(t); + } + string line, file; + vector<training::CandidateSet> kis; + 
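+  // Each input line (as produced by minrisk_generate_input.pl) is expected
+  // to name a forest file and its sentence id, e.g. "hgs/0.json.gz 0". For
+  // every line the loop below reads the hypergraph, reweights it with the
+  // L2-normalized current weights, adds the top kbest_size derivations to
+  // that sentence's CandidateSet, and, when --kbest_repository is given,
+  // merges with and rewrites the k-best list saved by earlier iterations.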
cerr << "Loading hypergraphs...\n"; + while(getline(in, line)) { + istringstream is(line); + int sent_id; + kis.resize(kis.size() + 1); + training::CandidateSet& curkbest = kis.back(); + string kbest_file; + if (kbest_repo.size()) { + ostringstream os; + os << kbest_repo << "/kbest." << sent_id << ".txt.gz"; + kbest_file = os.str(); + if (FileExists(kbest_file)) + curkbest.ReadFromFile(kbest_file); + } + is >> file >> sent_id; + ReadFile rf(file); + if (kis.size() % 5 == 0) { cerr << '.'; } + if (kis.size() % 200 == 0) { cerr << " [" << kis.size() << "]\n"; } + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + hg.Reweight(weights); + curkbest.AddKBestCandidates(hg, kbest_size, ds[sent_id]); + if (kbest_file.size()) + curkbest.WriteToFile(kbest_file); + } + cerr << "\nHypergraphs loaded.\n"; + weights.resize(FD::NumFeats()); + + double c1 = conf["l1_strength"].as<double>(); + double temp = conf["temperature"].as<double>(); + unsigned m = conf["memory_buffers"].as<unsigned>(); + LearnParameters(kis, temp, c1, m, &weights); + Weights::WriteToFile("-", weights); + return 0; +} + diff --git a/training/mira/Makefile.am b/training/mira/Makefile.am new file mode 100644 index 00000000..fa4fb22d --- /dev/null +++ b/training/mira/Makefile.am @@ -0,0 +1,6 @@ +bin_PROGRAMS = kbest_mira + +kbest_mira_SOURCES = kbest_mira.cc +kbest_mira_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/training/mira/kbest_mira.cc b/training/mira/kbest_mira.cc new file mode 100644 index 00000000..d59b4224 --- /dev/null +++ b/training/mira/kbest_mira.cc @@ -0,0 +1,322 @@ +#include <sstream> +#include <iostream> +#include <vector> +#include <cassert> +#include <cmath> +#include <tr1/memory> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "stringlib.h" +#include "hg_sampler.h" +#include "sentence_metadata.h" +#include "ns.h" +#include "ns_docscorer.h" +#include "verbose.h" +#include "viterbi.h" +#include "hg.h" +#include "prob.h" +#include "kbest.h" +#include "ff_register.h" +#include "decoder.h" +#include "filelib.h" +#include "fdict.h" +#include "weights.h" +#include "sparse_vector.h" +#include "sampler.h" + +using namespace std; +namespace po = boost::program_options; + +bool invert_score; +std::tr1::shared_ptr<MT19937> rng; + +void RandomPermutation(int len, vector<int>* p_ids) { + vector<int>& ids = *p_ids; + ids.resize(len); + for (int i = 0; i < len; ++i) ids[i] = i; + for (int i = len; i > 0; --i) { + int j = rng->next() * i; + if (j == i) i--; + swap(ids[i-1], ids[j]); + } +} + +bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("input_weights,w",po::value<string>(),"Input feature weights file") + ("source,i",po::value<string>(),"Source file for development set") + ("passes,p", po::value<int>()->default_value(15), "Number of passes through the training data") + ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation(s) (tokenized text file)") + ("mt_metric,m",po::value<string>()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)") + ("max_step_size,C", po::value<double>()->default_value(0.01), "regularization strength 
(C)") + ("mt_metric_scale,s", po::value<double>()->default_value(1.0), "Amount to scale MT loss function by") + ("k_best_size,k", po::value<int>()->default_value(250), "Size of hypothesis list to search for oracles") + ("sample_forest,f", "Instead of a k-best list, sample k hypotheses from the decoder's forest") + ("sample_forest_unit_weight_vector,x", "Before sampling (must use -f option), rescale the weight vector used so it has unit length; this may improve the quality of the samples") + ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)") + ("decoder_config,c",po::value<string>(),"Decoder configuration file"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value<string>(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as<string>().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || !conf->count("input_weights") || !conf->count("source") || !conf->count("decoder_config") || !conf->count("reference")) { + cerr << dcmdline_options << endl; + return false; + } + return true; +} + +static const double kMINUS_EPSILON = -1e-6; + +struct HypothesisInfo { + SparseVector<double> features; + double mt_metric; +}; + +struct GoodBadOracle { + std::tr1::shared_ptr<HypothesisInfo> good; + std::tr1::shared_ptr<HypothesisInfo> bad; +}; + +struct TrainingObserver : public DecoderObserver { + TrainingObserver(const int k, const DocumentScorer& d, const EvaluationMetric& m, bool sf, vector<GoodBadOracle>* o) : ds(d), metric(m), oracles(*o), kbest_size(k), sample_forest(sf) {} + const DocumentScorer& ds; + const EvaluationMetric& metric; + vector<GoodBadOracle>& oracles; + std::tr1::shared_ptr<HypothesisInfo> cur_best; + const int kbest_size; + const bool sample_forest; + + const HypothesisInfo& GetCurrentBestHypothesis() const { + return *cur_best; + } + + virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { + UpdateOracles(smeta.GetSentenceID(), *hg); + } + + std::tr1::shared_ptr<HypothesisInfo> MakeHypothesisInfo(const SparseVector<double>& feats, const double score) { + std::tr1::shared_ptr<HypothesisInfo> h(new HypothesisInfo); + h->features = feats; + h->mt_metric = score; + return h; + } + + void UpdateOracles(int sent_id, const Hypergraph& forest) { + std::tr1::shared_ptr<HypothesisInfo>& cur_good = oracles[sent_id].good; + std::tr1::shared_ptr<HypothesisInfo>& cur_bad = oracles[sent_id].bad; + cur_bad.reset(); // TODO get rid of?? 
+ + if (sample_forest) { + vector<WordID> cur_prediction; + ViterbiESentence(forest, &cur_prediction); + SufficientStats sstats; + ds[sent_id]->Evaluate(cur_prediction, &sstats); + float sentscore = metric.ComputeScore(sstats); + cur_best = MakeHypothesisInfo(ViterbiFeatures(forest), sentscore); + + vector<HypergraphSampler::Hypothesis> samples; + HypergraphSampler::sample_hypotheses(forest, kbest_size, &*rng, &samples); + for (unsigned i = 0; i < samples.size(); ++i) { + ds[sent_id]->Evaluate(samples[i].words, &sstats); + float sentscore = metric.ComputeScore(sstats); + if (invert_score) sentscore *= -1.0; + if (!cur_good || sentscore > cur_good->mt_metric) + cur_good = MakeHypothesisInfo(samples[i].fmap, sentscore); + if (!cur_bad || sentscore < cur_bad->mt_metric) + cur_bad = MakeHypothesisInfo(samples[i].fmap, sentscore); + } + } else { + KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, kbest_size); + SufficientStats sstats; + for (int i = 0; i < kbest_size; ++i) { + const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(forest.nodes_.size() - 1, i); + if (!d) break; + ds[sent_id]->Evaluate(d->yield, &sstats); + float sentscore = metric.ComputeScore(sstats); + if (invert_score) sentscore *= -1.0; + // cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl; + if (i == 0) + cur_best = MakeHypothesisInfo(d->feature_values, sentscore); + if (!cur_good || sentscore > cur_good->mt_metric) + cur_good = MakeHypothesisInfo(d->feature_values, sentscore); + if (!cur_bad || sentscore < cur_bad->mt_metric) + cur_bad = MakeHypothesisInfo(d->feature_values, sentscore); + } + //cerr << "GOOD: " << cur_good->mt_metric << endl; + //cerr << " CUR: " << cur_best->mt_metric << endl; + //cerr << " BAD: " << cur_bad->mt_metric << endl; + } + } +}; + +void ReadTrainingCorpus(const string& fname, vector<string>* c) { + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + while(in) { + getline(in, line); + if (!in) break; + c->push_back(line); + } +} + +bool ApproxEqual(double a, double b) { + if (a == b) return true; + return (fabs(a-b)/fabs(b)) < 0.000001; +} + +int main(int argc, char** argv) { + register_feature_functions(); + SetSilent(true); // turn off verbose decoder output + + po::variables_map conf; + if (!InitCommandLine(argc, argv, &conf)) return 1; + + if (conf.count("random_seed")) + rng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); + else + rng.reset(new MT19937); + const bool sample_forest = conf.count("sample_forest") > 0; + const bool sample_forest_unit_weight_vector = conf.count("sample_forest_unit_weight_vector") > 0; + if (sample_forest_unit_weight_vector && !sample_forest) { + cerr << "Cannot --sample_forest_unit_weight_vector without --sample_forest" << endl; + return 1; + } + vector<string> corpus; + ReadTrainingCorpus(conf["source"].as<string>(), &corpus); + + string metric_name = UppercaseString(conf["mt_metric"].as<string>()); + if (metric_name == "COMBI") { + cerr << "WARNING: 'combi' metric is no longer supported, switching to 'COMB:TER=-0.5;IBM_BLEU=0.5'\n"; + metric_name = "COMB:TER=-0.5;IBM_BLEU=0.5"; + } else if (metric_name == "BLEU") { + cerr << "WARNING: 'BLEU' is ambiguous, assuming 'IBM_BLEU'\n"; + metric_name = "IBM_BLEU"; + } + EvaluationMetric* metric = EvaluationMetric::Instance(metric_name); + DocumentScorer ds(metric, conf["reference"].as<vector<string> >()); + cerr << "Loaded " << ds.size() << " references for scoring with " << metric_name 
<< endl; + invert_score = metric->IsErrorMetric(); + + if (ds.size() != corpus.size()) { + cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n"; + return 1; + } + + ReadFile ini_rf(conf["decoder_config"].as<string>()); + Decoder decoder(ini_rf.stream()); + + // load initial weights + vector<weight_t>& dense_weights = decoder.CurrentWeightVector(); + SparseVector<weight_t> lambdas; + Weights::InitFromFile(conf["input_weights"].as<string>(), &dense_weights); + Weights::InitSparseVector(dense_weights, &lambdas); + + const double max_step_size = conf["max_step_size"].as<double>(); + const double mt_metric_scale = conf["mt_metric_scale"].as<double>(); + + assert(corpus.size() > 0); + vector<GoodBadOracle> oracles(corpus.size()); + + TrainingObserver observer(conf["k_best_size"].as<int>(), ds, *metric, sample_forest, &oracles); + int cur_sent = 0; + int lcount = 0; + int normalizer = 0; + double tot_loss = 0; + int dots = 0; + int cur_pass = 0; + SparseVector<double> tot; + tot += lambdas; // initial weights + normalizer++; // count for initial weights + int max_iteration = conf["passes"].as<int>() * corpus.size(); + string msg = "# MIRA tuned weights"; + string msga = "# MIRA tuned weights AVERAGED"; + vector<int> order; + RandomPermutation(corpus.size(), &order); + while (lcount <= max_iteration) { + lambdas.init_vector(&dense_weights); + if ((cur_sent * 40 / corpus.size()) > dots) { ++dots; cerr << '.'; } + if (corpus.size() == cur_sent) { + cerr << " [AVG METRIC LAST PASS=" << (tot_loss / corpus.size()) << "]\n"; + Weights::ShowLargestFeatures(dense_weights); + cur_sent = 0; + tot_loss = 0; + dots = 0; + ostringstream os; + os << "weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << ".gz"; + SparseVector<double> x = tot; + x /= normalizer; + ostringstream sa; + sa << "weights.mira-pass" << (cur_pass < 10 ? 
"0" : "") << cur_pass << "-avg.gz"; + x.init_vector(&dense_weights); + Weights::WriteToFile(os.str(), dense_weights, true, &msg); + ++cur_pass; + RandomPermutation(corpus.size(), &order); + } + if (cur_sent == 0) { + cerr << "PASS " << (lcount / corpus.size() + 1) << endl; + } + decoder.SetId(order[cur_sent]); + double sc = 1.0; + if (sample_forest_unit_weight_vector) { + sc = lambdas.l2norm(); + if (sc > 0) { + for (unsigned i = 0; i < dense_weights.size(); ++i) + dense_weights[i] /= sc; + } + } + decoder.Decode(corpus[order[cur_sent]], &observer); // update oracles + if (sc && sc != 1.0) { + for (unsigned i = 0; i < dense_weights.size(); ++i) + dense_weights[i] *= sc; + } + const HypothesisInfo& cur_hyp = observer.GetCurrentBestHypothesis(); + const HypothesisInfo& cur_good = *oracles[order[cur_sent]].good; + const HypothesisInfo& cur_bad = *oracles[order[cur_sent]].bad; + tot_loss += cur_hyp.mt_metric; + if (!ApproxEqual(cur_hyp.mt_metric, cur_good.mt_metric)) { + const double loss = cur_bad.features.dot(dense_weights) - cur_good.features.dot(dense_weights) + + mt_metric_scale * (cur_good.mt_metric - cur_bad.mt_metric); + //cerr << "LOSS: " << loss << endl; + if (loss > 0.0) { + SparseVector<double> diff = cur_good.features; + diff -= cur_bad.features; + double step_size = loss / diff.l2norm_sq(); + //cerr << loss << " " << step_size << " " << diff << endl; + if (step_size > max_step_size) step_size = max_step_size; + lambdas += (cur_good.features * step_size); + lambdas -= (cur_bad.features * step_size); + //cerr << "L: " << lambdas << endl; + } + } + tot += lambdas; + ++normalizer; + ++lcount; + ++cur_sent; + } + cerr << endl; + Weights::WriteToFile("weights.mira-final.gz", dense_weights, true, &msg); + tot /= normalizer; + tot.init_vector(dense_weights); + msg = "# MIRA tuned weights (averaged vector)"; + Weights::WriteToFile("weights.mira-final-avg.gz", dense_weights, true, &msg); + cerr << "Optimization complete.\nAVERAGED WEIGHTS: weights.mira-final-avg.gz\n"; + return 0; +} + diff --git a/training/mpi_em_optimize.cc b/training/mpi_em_optimize.cc deleted file mode 100644 index 48683b15..00000000 --- a/training/mpi_em_optimize.cc +++ /dev/null @@ -1,389 +0,0 @@ -#include <sstream> -#include <iostream> -#include <vector> -#include <cassert> -#include <cmath> - -#ifdef HAVE_MPI -#include <mpi.h> -#endif - -#include <boost/shared_ptr.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "verbose.h" -#include "hg.h" -#include "prob.h" -#include "inside_outside.h" -#include "ff_register.h" -#include "decoder.h" -#include "filelib.h" -#include "optimize.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" - -using namespace std; -using boost::shared_ptr; -namespace po = boost::program_options; - -void SanityCheck(const vector<double>& w) { - for (int i = 0; i < w.size(); ++i) { - assert(!isnan(w[i])); - assert(!isinf(w[i])); - } -} - -struct FComp { - const vector<double>& w_; - FComp(const vector<double>& w) : w_(w) {} - bool operator()(int a, int b) const { - return fabs(w_[a]) > fabs(w_[b]); - } -}; - -void ShowLargestFeatures(const vector<double>& w) { - vector<int> fnums(w.size()); - for (int i = 0; i < w.size(); ++i) - fnums[i] = i; - vector<int>::iterator mid = fnums.begin(); - mid += (w.size() > 10 ? 
10 : w.size()); - partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); - cerr << "TOP FEATURES:"; - for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) { - cerr << ' ' << FD::Convert(*i) << '=' << w[*i]; - } - cerr << endl; -} - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("input_weights,w",po::value<string>(),"Input feature weights file") - ("training_data,t",po::value<string>(),"Training data") - ("decoder_config,c",po::value<string>(),"Decoder configuration file") - ("output_weights,o",po::value<string>()->default_value("-"),"Output feature weights file"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || !(conf->count("training_data")) || !conf->count("decoder_config")) { - cerr << dcmdline_options << endl; -#ifdef HAVE_MPI - MPI::Finalize(); -#endif - exit(1); - } -} - -void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c) { - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - int lc = 0; - while(in) { - getline(in, line); - if (!in) break; - if (lc % size == rank) c->push_back(line); - ++lc; - } -} - -static const double kMINUS_EPSILON = -1e-6; - -struct TrainingObserver : public DecoderObserver { - void Reset() { - total_complete = 0; - cur_obj = 0; - tot_obj = 0; - tot.clear(); - } - - void SetLocalGradientAndObjective(SparseVector<double>* g, double* o) const { - *o = tot_obj; - *g = tot; - } - - virtual void NotifyDecodingStart(const SentenceMetadata& smeta) { - cur_obj = 0; - state = 1; - } - - void ExtractExpectedCounts(Hypergraph* hg) { - vector<prob_t> posts; - cur.clear(); - const prob_t z = hg->ComputeEdgePosteriors(1.0, &posts); - cur_obj = log(z); - for (int i = 0; i < posts.size(); ++i) { - const SparseVector<double>& efeats = hg->edges_[i].feature_values_; - const double post = static_cast<double>(posts[i] / z); - for (SparseVector<double>::const_iterator j = efeats.begin(); j != efeats.end(); ++j) - cur.add_value(j->first, post); - } - } - - // compute model expectations, denominator of objective - virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { - assert(state == 1); - state = 2; - ExtractExpectedCounts(hg); - } - - // replace translation forest, since we're doing EM training (we don't know which) - virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) { - assert(state == 2); - state = 3; - ExtractExpectedCounts(hg); - } - - virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) { - ++total_complete; - tot_obj += cur_obj; - tot += cur; - } - - int total_complete; - double cur_obj; - double tot_obj; - SparseVector<double> cur, tot; - int state; -}; - -void ReadConfig(const string& ini, vector<string>* out) { - ReadFile rf(ini); - istream& in = *rf.stream(); - while(in) { - string line; - getline(in, line); - if (!in) continue; - out->push_back(line); - } -} - -void StoreConfig(const 
vector<string>& cfg, istringstream* o) { - ostringstream os; - for (int i = 0; i < cfg.size(); ++i) { os << cfg[i] << endl; } - o->str(os.str()); -} - -struct OptimizableMultinomialFamily { - struct CPD { - CPD() : z() {} - double z; - map<WordID, double> c2counts; - }; - map<WordID, CPD> counts; - double Value(WordID conditioning, WordID generated) const { - map<WordID, CPD>::const_iterator it = counts.find(conditioning); - assert(it != counts.end()); - map<WordID,double>::const_iterator r = it->second.c2counts.find(generated); - if (r == it->second.c2counts.end()) return 0; - return r->second; - } - void Increment(WordID conditioning, WordID generated, double count) { - CPD& cc = counts[conditioning]; - cc.z += count; - cc.c2counts[generated] += count; - } - void Optimize() { - for (map<WordID, CPD>::iterator i = counts.begin(); i != counts.end(); ++i) { - CPD& cpd = i->second; - for (map<WordID, double>::iterator j = cpd.c2counts.begin(); j != cpd.c2counts.end(); ++j) { - j->second /= cpd.z; - // cerr << "P(" << TD::Convert(j->first) << " | " << TD::Convert(i->first) << " ) = " << j->second << endl; - } - } - } - void Clear() { - counts.clear(); - } -}; - -struct CountManager { - CountManager(size_t num_types) : oms_(num_types) {} - virtual ~CountManager(); - virtual void AddCounts(const SparseVector<double>& c) = 0; - void Optimize(SparseVector<double>* weights) { - for (int i = 0; i < oms_.size(); ++i) { - oms_[i].Optimize(); - } - GetOptimalValues(weights); - for (int i = 0; i < oms_.size(); ++i) { - oms_[i].Clear(); - } - } - virtual void GetOptimalValues(SparseVector<double>* wv) const = 0; - vector<OptimizableMultinomialFamily> oms_; -}; -CountManager::~CountManager() {} - -struct TaggerCountManager : public CountManager { - // 0 = transitions, 2 = emissions - TaggerCountManager() : CountManager(2) {} - void AddCounts(const SparseVector<double>& c); - void GetOptimalValues(SparseVector<double>* wv) const { - for (set<int>::const_iterator it = fids_.begin(); it != fids_.end(); ++it) { - int ftype; - WordID cond, gen; - bool is_optimized = TaggerCountManager::GetFeature(*it, &ftype, &cond, &gen); - assert(is_optimized); - wv->set_value(*it, log(oms_[ftype].Value(cond, gen))); - } - } - // Id:0:a=1 Bi:a_b=1 Bi:b_c=1 Bi:c_d=1 Uni:a=1 Uni:b=1 Uni:c=1 Uni:d=1 Id:1:b=1 Bi:BOS_a=1 Id:2:c=1 - static bool GetFeature(const int fid, int* feature_type, WordID* cond, WordID* gen) { - const string& feat = FD::Convert(fid); - if (feat.size() > 5 && feat[0] == 'I' && feat[1] == 'd' && feat[2] == ':') { - // emission - const size_t p = feat.rfind(':'); - assert(p != string::npos); - *cond = TD::Convert(feat.substr(p+1)); - *gen = TD::Convert(feat.substr(3, p - 3)); - *feature_type = 1; - return true; - } else if (feat[0] == 'B' && feat.size() > 5 && feat[2] == ':' && feat[1] == 'i') { - // transition - const size_t p = feat.rfind('_'); - assert(p != string::npos); - *gen = TD::Convert(feat.substr(p+1)); - *cond = TD::Convert(feat.substr(3, p - 3)); - *feature_type = 0; - return true; - } else if (feat[0] == 'U' && feat.size() > 4 && feat[1] == 'n' && feat[2] == 'i' && feat[3] == ':') { - // ignore - return false; - } else { - cerr << "Don't know how to deal with feature of type: " << feat << endl; - abort(); - } - } - set<int> fids_; -}; - -void TaggerCountManager::AddCounts(const SparseVector<double>& c) { - for (SparseVector<double>::const_iterator it = c.begin(); it != c.end(); ++it) { - const double& val = it->second; - int ftype; - WordID cond, gen; - if (GetFeature(it->first, &ftype, &cond, 
&gen)) { - oms_[ftype].Increment(cond, gen, val); - fids_.insert(it->first); - } - } -} - -int main(int argc, char** argv) { -#ifdef HAVE_MPI - MPI::Init(argc, argv); - const int size = MPI::COMM_WORLD.Get_size(); - const int rank = MPI::COMM_WORLD.Get_rank(); -#else - const int size = 1; - const int rank = 0; -#endif - SetSilent(true); // turn off verbose decoder output - register_feature_functions(); - - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - - TaggerCountManager tcm; - - // load cdec.ini and set up decoder - vector<string> cdec_ini; - ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini); - istringstream ini; - StoreConfig(cdec_ini, &ini); - if (rank == 0) cerr << "Loading grammar...\n"; - Decoder* decoder = new Decoder(&ini); - if (decoder->GetConf()["input"].as<string>() != "-") { - cerr << "cdec.ini must not set an input file\n"; -#ifdef HAVE_MPI - MPI::COMM_WORLD.Abort(1); -#endif - } - if (rank == 0) cerr << "Done loading grammar!\n"; - Weights w; - if (conf.count("input_weights")) - w.InitFromFile(conf["input_weights"].as<string>()); - - double objective = 0; - bool converged = false; - - vector<double> lambdas; - w.InitVector(&lambdas); - vector<string> corpus; - ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus); - assert(corpus.size() > 0); - - int iteration = 0; - TrainingObserver observer; - while (!converged) { - ++iteration; - observer.Reset(); - if (rank == 0) { - cerr << "Starting decoding... (~" << corpus.size() << " sentences / proc)\n"; - } - decoder->SetWeights(lambdas); - for (int i = 0; i < corpus.size(); ++i) - decoder->Decode(corpus[i], &observer); - - SparseVector<double> x; - observer.SetLocalGradientAndObjective(&x, &objective); - cerr << "COUNTS = " << x << endl; - cerr << " OBJ = " << objective << endl; - tcm.AddCounts(x); - -#if 0 -#ifdef HAVE_MPI - MPI::COMM_WORLD.Reduce(const_cast<double*>(&gradient.data()[0]), &rcv_grad[0], num_feats, MPI::DOUBLE, MPI::SUM, 0); - MPI::COMM_WORLD.Reduce(&objective, &to, 1, MPI::DOUBLE, MPI::SUM, 0); - swap(gradient, rcv_grad); - objective = to; -#endif -#endif - - if (rank == 0) { - SparseVector<double> wsv; - tcm.Optimize(&wsv); - - w.InitFromVector(wsv); - w.InitVector(&lambdas); - - ShowLargestFeatures(lambdas); - - converged = iteration > 100; - if (converged) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; } - - string fname = "weights.cur.gz"; - if (converged) { fname = "weights.final.gz"; } - ostringstream vv; - vv << "Objective = " << objective << " (ITERATION=" << iteration << ")"; - const string svv = vv.str(); - w.WriteToFile(fname, true, &svv); - } // rank == 0 - int cint = converged; -#ifdef HAVE_MPI - MPI::COMM_WORLD.Bcast(const_cast<double*>(&lambdas.data()[0]), num_feats, MPI::DOUBLE, 0); - MPI::COMM_WORLD.Bcast(&cint, 1, MPI::INT, 0); - MPI::COMM_WORLD.Barrier(); -#endif - converged = cint; - } -#ifdef HAVE_MPI - MPI::Finalize(); -#endif - return 0; -} diff --git a/training/mr_em_adapted_reduce.cc b/training/mr_em_adapted_reduce.cc deleted file mode 100644 index f65b5440..00000000 --- a/training/mr_em_adapted_reduce.cc +++ /dev/null @@ -1,173 +0,0 @@ -#include <iostream> -#include <vector> -#include <cassert> -#include <cmath> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "filelib.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" -#include "m.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* 
conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("optimization_method,m", po::value<string>()->default_value("em"), "Optimization method (em, vb)") - ("input_format,f",po::value<string>()->default_value("b64"),"Encoding of the input (b64 or text)"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -double NoZero(const double& x) { - if (x) return x; - return 1e-35; -} - -void Maximize(const bool use_vb, - const double& alpha, - const int total_event_types, - SparseVector<double>* pc) { - const SparseVector<double>& counts = *pc; - - if (use_vb) - assert(total_event_types >= counts.size()); - - double tot = 0; - for (SparseVector<double>::const_iterator it = counts.begin(); - it != counts.end(); ++it) - tot += it->second; -// cerr << " = " << tot << endl; - assert(tot > 0.0); - double ltot = log(tot); - if (use_vb) - ltot = Md::digamma(tot + total_event_types * alpha); - for (SparseVector<double>::const_iterator it = counts.begin(); - it != counts.end(); ++it) { - if (use_vb) { - pc->set_value(it->first, NoZero(Md::digamma(it->second + alpha) - ltot)); - } else { - pc->set_value(it->first, NoZero(log(it->second) - ltot)); - } - } -#if 0 - if (counts.size() < 50) { - for (SparseVector<double>::const_iterator it = counts.begin(); - it != counts.end(); ++it) { - cerr << " p(" << FD::Convert(it->first) << ")=" << exp(it->second); - } - cerr << endl; - } -#endif -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - - const bool use_b64 = conf["input_format"].as<string>() == "b64"; - const bool use_vb = conf["optimization_method"].as<string>() == "vb"; - const double alpha = 1e-09; - if (use_vb) - cerr << "Using variational Bayes, make sure alphas are set\n"; - - const string s_obj = "**OBJ**"; - // E-step - string cur_key = ""; - SparseVector<double> acc; - double logprob = 0; - while(cin) { - string line; - getline(cin, line); - if (line.empty()) continue; - int feat; - double val; - size_t i = line.find("\t"); - const string key = line.substr(0, i); - assert(i != string::npos); - ++i; - if (key != cur_key) { - if (cur_key.size() > 0) { - // TODO shouldn't be num_active, should be total number - // of events - Maximize(use_vb, alpha, acc.size(), &acc); - cout << cur_key << '\t'; - if (use_b64) - B64::Encode(0.0, acc, &cout); - else - cout << acc; - cout << endl; - acc.clear(); - } - cur_key = key; - } - if (use_b64) { - SparseVector<double> g; - double obj; - if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) { - cerr << "B64 decoder returned error, skipping!\n"; - continue; - } - logprob += obj; - acc += g; - } else { // text encoding - your counts will not be accurate! 
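
[Editor's note] The Maximize() routine in the file deleted above implements the M-step for both plain EM and its variational-Bayes variant: EM normalizes counts in log space (log c_k - log tot), while VB with a symmetric Dirichlet(alpha) prior uses the mean-field update E[log theta_k] = digamma(c_k + alpha) - digamma(tot + K*alpha), where K is the total number of event types. A minimal self-contained sketch of that update, assuming boost::math::digamma in place of the project's Md::digamma and a std::map in place of SparseVector<double>, and assuming strictly positive counts:

    #include <boost/math/special_functions/digamma.hpp>
    #include <cmath>
    #include <map>

    // M-step sketch: turns event counts into log-probabilities.
    // use_vb=false: maximum likelihood, log(c_k) - log(tot).
    // use_vb=true:  VB mean-field, digamma(c_k + alpha) - digamma(tot + K*alpha),
    //               where K counts *all* event types, not just those observed.
    std::map<int, double> MStep(const std::map<int, double>& counts,
                                bool use_vb, double alpha, int total_event_types) {
      double tot = 0;
      for (std::map<int, double>::const_iterator it = counts.begin(); it != counts.end(); ++it)
        tot += it->second;
      const double ltot = use_vb ? boost::math::digamma(tot + total_event_types * alpha)
                                 : std::log(tot);
      std::map<int, double> logprobs;
      for (std::map<int, double>::const_iterator it = counts.begin(); it != counts.end(); ++it)
        logprobs[it->first] = use_vb ? boost::math::digamma(it->second + alpha) - ltot
                                     : std::log(it->second) - ltot;
      return logprobs;
    }

The explicit total_event_types parameter also makes the TODO in the surrounding code concrete: the deleted caller passes acc.size() (the number of events actually observed), where the total number of event types in the model is what the VB update calls for.
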
- while (i < line.size()) { - size_t start = i; - while (line[i] != '=' && i < line.size()) ++i; - if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; } - string fname = line.substr(start, i - start); - if (fname == s_obj) { - feat = -1; - } else { - feat = FD::Convert(line.substr(start, i - start)); - } - ++i; - start = i; - while (line[i] != ';' && i < line.size()) ++i; - if (i - start == 0) continue; - val = atof(line.substr(start, i - start).c_str()); - ++i; - if (feat == -1) { - logprob += val; - } else { - acc.add_value(feat, val); - } - } - } - } - // TODO shouldn't be num_active, should be total number - // of events - Maximize(use_vb, alpha, acc.size(), &acc); - cout << cur_key << '\t'; - if (use_b64) - B64::Encode(0.0, acc, &cout); - else - cout << acc; - cout << endl << flush; - - cerr << "LOGPROB: " << logprob << endl; - - return 0; -} diff --git a/training/mr_em_map_adapter.cc b/training/mr_em_map_adapter.cc deleted file mode 100644 index ead4598d..00000000 --- a/training/mr_em_map_adapter.cc +++ /dev/null @@ -1,160 +0,0 @@ -#include <iostream> -#include <fstream> -#include <cassert> -#include <cmath> - -#include <boost/utility.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> -#include "boost/tuple/tuple.hpp" - -#include "fdict.h" -#include "sparse_vector.h" - -using namespace std; -namespace po = boost::program_options; - -// useful for EM models parameterized by a bunch of multinomials -// this converts event counts (returned from cdec as feature expectations) -// into different keys and values (which are lists of all the events, -// conditioned on the key) for summing and normalization by a reducer - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("buffer_size,b", po::value<int>()->default_value(1), "Buffer size (in # of counts) before emitting counts") - ("format,f",po::value<string>()->default_value("b64"), "Encoding of the input (b64 or text)"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -struct EventMapper { - int Map(int fid) { - int& cv = map_[fid]; - if (!cv) { - cv = GetConditioningVariable(fid); - } - return cv; - } - void Clear() { map_.clear(); } - protected: - virtual int GetConditioningVariable(int fid) const = 0; - private: - map<int, int> map_; -}; - -struct LexAlignEventMapper : public EventMapper { - protected: - virtual int GetConditioningVariable(int fid) const { - const string& str = FD::Convert(fid); - size_t pos = str.rfind("_"); - if (pos == string::npos || pos == 0 || pos >= str.size() - 1) { - cerr << "Bad feature for EM adapter: " << str << endl; - abort(); - } - return FD::Convert(str.substr(0, pos)); - } -}; - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - - const bool use_b64 = conf["format"].as<string>() == "b64"; - const int buffer_size = 
conf["buffer_size"].as<int>(); - - const string s_obj = "**OBJ**"; - // 0<TAB>**OBJ**=12.2;Feat1=2.3;Feat2=-0.2; - // 0<TAB>**OBJ**=1.1;Feat1=1.0; - - EventMapper* event_mapper = new LexAlignEventMapper; - map<int, SparseVector<double> > counts; - size_t total = 0; - while(cin) { - string line; - getline(cin, line); - if (line.empty()) continue; - int feat; - double val; - size_t i = line.find("\t"); - assert(i != string::npos); - ++i; - SparseVector<double> g; - double obj = 0; - if (use_b64) { - if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) { - cerr << "B64 decoder returned error, skipping!\n"; - continue; - } - } else { // text encoding - your counts will not be accurate! - while (i < line.size()) { - size_t start = i; - while (line[i] != '=' && i < line.size()) ++i; - if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; } - string fname = line.substr(start, i - start); - if (fname == s_obj) { - feat = -1; - } else { - feat = FD::Convert(line.substr(start, i - start)); - } - ++i; - start = i; - while (line[i] != ';' && i < line.size()) ++i; - if (i - start == 0) continue; - val = atof(line.substr(start, i - start).c_str()); - ++i; - if (feat == -1) { - obj = val; - } else { - g.set_value(feat, val); - } - } - } - //cerr << "OBJ: " << obj << endl; - const SparseVector<double>& cg = g; - for (SparseVector<double>::const_iterator it = cg.begin(); it != cg.end(); ++it) { - const int cond_var = event_mapper->Map(it->first); - SparseVector<double>& cond_counts = counts[cond_var]; - int delta = cond_counts.size(); - cond_counts.add_value(it->first, it->second); - delta = cond_counts.size() - delta; - total += delta; - } - if (total > buffer_size) { - for (map<int, SparseVector<double> >::iterator it = counts.begin(); - it != counts.end(); ++it) { - const SparseVector<double>& cc = it->second; - cout << FD::Convert(it->first) << '\t'; - if (use_b64) { - B64::Encode(0.0, cc, &cout); - } else { - abort(); - } - cout << endl; - } - cout << flush; - total = 0; - counts.clear(); - } - } - - return 0; -} - diff --git a/training/mr_optimize_reduce.cc b/training/mr_optimize_reduce.cc deleted file mode 100644 index 461e6b5f..00000000 --- a/training/mr_optimize_reduce.cc +++ /dev/null @@ -1,231 +0,0 @@ -#include <sstream> -#include <iostream> -#include <fstream> -#include <vector> -#include <cassert> -#include <cmath> - -#include <boost/shared_ptr.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "optimize.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" - -using namespace std; -namespace po = boost::program_options; - -void SanityCheck(const vector<double>& w) { - for (int i = 0; i < w.size(); ++i) { - assert(!isnan(w[i])); - assert(!isinf(w[i])); - } -} - -struct FComp { - const vector<double>& w_; - FComp(const vector<double>& w) : w_(w) {} - bool operator()(int a, int b) const { - return fabs(w_[a]) > fabs(w_[b]); - } -}; - -void ShowLargestFeatures(const vector<double>& w) { - vector<int> fnums(w.size()); - for (int i = 0; i < w.size(); ++i) - fnums[i] = i; - vector<int>::iterator mid = fnums.begin(); - mid += (w.size() > 10 ? 
10 : w.size()); - partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); - cerr << "TOP FEATURES:"; - for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) { - cerr << ' ' << FD::Convert(*i) << '=' << w[*i]; - } - cerr << endl; -} - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("input_weights,i",po::value<string>(),"Input feature weights file") - ("output_weights,o",po::value<string>()->default_value("-"),"Output feature weights file") - ("optimization_method,m", po::value<string>()->default_value("lbfgs"), "Optimization method (sgd, lbfgs, rprop)") - ("state,s",po::value<string>(),"Read (and write if output_state is not set) optimizer state from this state file. In the first iteration, the file should not exist.") - ("input_format,f",po::value<string>()->default_value("b64"),"Encoding of the input (b64 or text)") - ("output_state,S", po::value<string>(), "Output state file (optional override)") - ("correction_buffers,M", po::value<int>()->default_value(10), "Number of gradients for LBFGS to maintain in memory") - ("eta,e", po::value<double>()->default_value(0.1), "Learning rate for SGD (eta)") - ("gaussian_prior,p","Use a Gaussian prior on the weights") - ("means,u", po::value<string>(), "File containing the means for Gaussian prior") - ("sigma_squared", po::value<double>()->default_value(1.0), "Sigma squared term for spherical Gaussian prior"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || !conf->count("input_weights") || !conf->count("state")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - - const bool use_b64 = conf["input_format"].as<string>() == "b64"; - - vector<weight_t> lambdas; - Weights::InitFromFile(conf["input_weights"].as<string>(), &lambdas); - const string s_obj = "**OBJ**"; - int num_feats = FD::NumFeats(); - cerr << "Number of features: " << num_feats << endl; - const bool gaussian_prior = conf.count("gaussian_prior"); - vector<weight_t> means(num_feats, 0); - if (conf.count("means")) { - if (!gaussian_prior) { - cerr << "Don't use --means without --gaussian_prior!\n"; - exit(1); - } - Weights::InitFromFile(conf["means"].as<string>(), &means); - } - boost::shared_ptr<BatchOptimizer> o; - const string omethod = conf["optimization_method"].as<string>(); - if (omethod == "rprop") - o.reset(new RPropOptimizer(num_feats)); // TODO add configuration - else - o.reset(new LBFGSOptimizer(num_feats, conf["correction_buffers"].as<int>())); - cerr << "Optimizer: " << o->Name() << endl; - string state_file = conf["state"].as<string>(); - { - ifstream in(state_file.c_str(), ios::binary); - if (in) - o->Load(&in); - else - cerr << "No state file found, assuming ITERATION 1\n"; - } - - double objective = 0; - vector<double> gradient(num_feats, 0); - // 0<TAB>**OBJ**=12.2;Feat1=2.3;Feat2=-0.2; - // 
0<TAB>**OBJ**=1.1;Feat1=1.0; - int total_lines = 0; // TODO - this should be a count of the - // training instances!! - while(cin) { - string line; - getline(cin, line); - if (line.empty()) continue; - ++total_lines; - int feat; - double val; - size_t i = line.find("\t"); - assert(i != string::npos); - ++i; - if (use_b64) { - SparseVector<double> g; - double obj; - if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) { - cerr << "B64 decoder returned error, skipping gradient!\n"; - cerr << " START: " << line.substr(0,line.size() > 200 ? 200 : line.size()) << endl; - if (line.size() > 200) - cerr << " END: " << line.substr(line.size() - 200, 200) << endl; - cout << "-1\tRESTART\n"; - exit(99); - } - objective += obj; - const SparseVector<double>& cg = g; - for (SparseVector<double>::const_iterator it = cg.begin(); it != cg.end(); ++it) { - if (it->first >= num_feats) { - cerr << "Unexpected feature in gradient: " << FD::Convert(it->first) << endl; - abort(); - } - gradient[it->first] -= it->second; - } - } else { // text encoding - your gradients will not be accurate! - while (i < line.size()) { - size_t start = i; - while (line[i] != '=' && i < line.size()) ++i; - if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; } - string fname = line.substr(start, i - start); - if (fname == s_obj) { - feat = -1; - } else { - feat = FD::Convert(line.substr(start, i - start)); - if (feat >= num_feats) { - cerr << "Unexpected feature in gradient: " << line.substr(start, i - start) << endl; - abort(); - } - } - ++i; - start = i; - while (line[i] != ';' && i < line.size()) ++i; - if (i - start == 0) continue; - val = atof(line.substr(start, i - start).c_str()); - ++i; - if (feat == -1) { - objective += val; - } else { - gradient[feat] -= val; - } - } - } - } - - if (gaussian_prior) { - const double sigsq = conf["sigma_squared"].as<double>(); - double norm = 0; - for (int k = 1; k < lambdas.size(); ++k) { - const double& lambda_k = lambdas[k]; - if (lambda_k) { - const double param = (lambda_k - means[k]); - norm += param * param; - gradient[k] += param / sigsq; - } - } - const double reg = norm / (2.0 * sigsq); - cerr << "REGULARIZATION TERM: " << reg << endl; - objective += reg; - } - cerr << "EVALUATION #" << o->EvaluationCount() << " OBJECTIVE: " << objective << endl; - double gnorm = 0; - for (int i = 0; i < gradient.size(); ++i) - gnorm += gradient[i] * gradient[i]; - cerr << " GNORM=" << sqrt(gnorm) << endl; - vector<double> old = lambdas; - int c = 0; - while (old == lambdas) { - ++c; - if (c > 1) { cerr << "Same lambdas, repeating optimization\n"; } - o->Optimize(objective, gradient, &lambdas); - assert(c < 5); - } - old.clear(); - SanityCheck(lambdas); - ShowLargestFeatures(lambdas); - Weights::WriteToFile(conf["output_weights"].as<string>(), lambdas, false); - - const bool conv = o->HasConverged(); - if (conv) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; } - - if (conf.count("output_state")) - state_file = conf["output_state"].as<string>(); - ofstream out(state_file.c_str(), ios::binary); - cerr << "Writing state to: " << state_file << endl; - o->Save(&out); - out.close(); - - cout << o->EvaluationCount() << "\t" << conv << endl; - return 0; -} diff --git a/training/mr_reduce_to_weights.cc b/training/mr_reduce_to_weights.cc deleted file mode 100644 index 16b47720..00000000 --- a/training/mr_reduce_to_weights.cc +++ /dev/null @@ -1,109 +0,0 @@ -#include <iostream> -#include <fstream> -#include <vector> -#include <cassert> - -#include <boost/program_options.hpp> -#include 
<boost/program_options/variables_map.hpp> - -#include "filelib.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("input_format,f",po::value<string>()->default_value("b64"),"Encoding of the input (b64 or text)") - ("input,i",po::value<string>()->default_value("-"),"Read file from") - ("output,o",po::value<string>()->default_value("-"),"Write weights to"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -void WriteWeights(const SparseVector<double>& weights, ostream* out) { - for (SparseVector<double>::const_iterator it = weights.begin(); - it != weights.end(); ++it) { - (*out) << FD::Convert(it->first) << " " << it->second << endl; - } -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - - const bool use_b64 = conf["input_format"].as<string>() == "b64"; - - const string s_obj = "**OBJ**"; - // E-step - ReadFile rf(conf["input"].as<string>()); - istream* in = rf.stream(); - assert(*in); - WriteFile wf(conf["output"].as<string>()); - ostream* out = wf.stream(); - out->precision(17); - while(*in) { - string line; - getline(*in, line); - if (line.empty()) continue; - int feat; - double val; - size_t i = line.find("\t"); - assert(i != string::npos); - ++i; - if (use_b64) { - SparseVector<double> g; - double obj; - if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) { - cerr << "B64 decoder returned error, skipping!\n"; - continue; - } - WriteWeights(g, out); - } else { // text encoding - your counts will not be accurate! 
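
[Editor's note] Several of the deleted map/reduce tools share the same text-format fallback for lines of the form key<TAB>**OBJ**=12.2;Feat1=2.3;Feat2=-0.2;. Note that the original scanning loops test line[i] != '=' before i < line.size(), so they read one character past the end of the string when a terminator is missing; the bounds check should come first. A sketch of the same parse with the checks reordered, using std::map in place of SparseVector<double> (and storing the **OBJ** pseudo-feature like any other key rather than splitting it out):

    #include <cstdlib>
    #include <iostream>
    #include <map>
    #include <string>

    // Parses "Feat1=2.3;Feat2=-0.2;..." starting at position i (just past the tab).
    // The bounds test precedes every dereference, unlike the original loops.
    std::map<std::string, double> ParseTextCounts(const std::string& line, size_t i) {
      std::map<std::string, double> out;
      while (i < line.size()) {
        const size_t eq = line.find('=', i);
        if (eq == std::string::npos) { std::cerr << "FORMAT ERROR\n"; break; }
        size_t semi = line.find(';', eq + 1);
        if (semi == std::string::npos) semi = line.size();
        const std::string fname = line.substr(i, eq - i);
        out[fname] += std::atof(line.substr(eq + 1, semi - eq - 1).c_str());
        i = semi + 1;
      }
      return out;
    }
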
- SparseVector<double> weights; - while (i < line.size()) { - size_t start = i; - while (line[i] != '=' && i < line.size()) ++i; - if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; } - string fname = line.substr(start, i - start); - if (fname == s_obj) { - feat = -1; - } else { - feat = FD::Convert(line.substr(start, i - start)); - } - ++i; - start = i; - while (line[i] != ';' && i < line.size()) ++i; - if (i - start == 0) continue; - val = atof(line.substr(start, i - start).c_str()); - ++i; - if (feat != -1) { - weights.set_value(feat, val); - } - } - WriteWeights(weights, out); - } - } - - return 0; -} diff --git a/training/pro/Makefile.am b/training/pro/Makefile.am new file mode 100644 index 00000000..09364804 --- /dev/null +++ b/training/pro/Makefile.am @@ -0,0 +1,13 @@ +bin_PROGRAMS = \ + mr_pro_map \ + mr_pro_reduce + +mr_pro_map_SOURCES = mr_pro_map.cc +mr_pro_map_LDADD = ../../training/utils/libtraining_utils.a ../../decoder/libcdec.a ../../mteval/libmteval.a ../../utils/libutils.a + +mr_pro_reduce_SOURCES = mr_pro_reduce.cc +mr_pro_reduce_LDADD = ../../training/liblbfgs/liblbfgs.a ../../utils/libutils.a + +EXTRA_DIST = mr_pro_generate_mapper_input.pl pro.pl + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training/utils -I$(top_srcdir)/training diff --git a/training/pro/mr_pro_generate_mapper_input.pl b/training/pro/mr_pro_generate_mapper_input.pl new file mode 100755 index 00000000..b30fc4fd --- /dev/null +++ b/training/pro/mr_pro_generate_mapper_input.pl @@ -0,0 +1,18 @@ +#!/usr/bin/perl -w +use strict; + +die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1; +my $d = shift @ARGV; +die "Can't find directory $d" unless -d $d; + +opendir(DIR, $d) or die "Can't read $d: $!"; +my @hgs = grep { /\.gz$/ } readdir(DIR); +closedir DIR; + +for my $hg (@hgs) { + my $file = $hg; + my $id = $hg; + $id =~ s/(\.json)?\.gz//; + print "$d/$file $id\n"; +} + diff --git a/training/pro/mr_pro_map.cc b/training/pro/mr_pro_map.cc new file mode 100644 index 00000000..eef40b8a --- /dev/null +++ b/training/pro/mr_pro_map.cc @@ -0,0 +1,201 @@ +#include <sstream> +#include <iostream> +#include <fstream> +#include <vector> +#include <tr1/unordered_map> + +#include <boost/functional/hash.hpp> +#include <boost/shared_ptr.hpp> +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "candidate_set.h" +#include "sampler.h" +#include "filelib.h" +#include "stringlib.h" +#include "weights.h" +#include "inside_outside.h" +#include "hg_io.h" +#include "ns.h" +#include "ns_docscorer.h" + +// This is Figure 4 (Algorithm Sampler) from Hopkins&May (2011) + +using namespace std; +namespace po = boost::program_options; + +boost::shared_ptr<MT19937> rng; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)") + ("weights,w",po::value<string>(), "[REQD] Weights files from current iterations") + ("kbest_repository,K",po::value<string>()->default_value("./kbest"),"K-best list repository (directory)") + ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)") + ("source,s",po::value<string>()->default_value(""), "Source file (ignored, except for AER)") + ("evaluation_metric,m",po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, 
meteor, etc.)") + ("kbest_size,k",po::value<unsigned>()->default_value(1500u), "Top k-hypotheses to extract") + ("candidate_pairs,G", po::value<unsigned>()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)") + ("best_pairs,X", po::value<unsigned>()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)") + ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (!conf->count("reference")) { + cerr << "Please specify one or more references using -r <REF.TXT>\n"; + flag = true; + } + if (!conf->count("weights")) { + cerr << "Please specify weights using -w <WEIGHTS.TXT>\n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +struct ThresholdAlpha { + explicit ThresholdAlpha(double t = 0.05) : threshold(t) {} + double operator()(double mag) const { + if (mag < threshold) return 0.0; else return 1.0; + } + const double threshold; +}; + +struct TrainingInstance { + TrainingInstance(const SparseVector<weight_t>& feats, bool positive, float diff) : x(feats), y(positive), gdiff(diff) {} + SparseVector<weight_t> x; +#undef DEBUGGING_PRO +#ifdef DEBUGGING_PRO + vector<WordID> a; + vector<WordID> b; +#endif + bool y; + float gdiff; +}; +#ifdef DEBUGGING_PRO +ostream& operator<<(ostream& os, const TrainingInstance& d) { + return os << d.gdiff << " y=" << d.y << "\tA:" << TD::GetString(d.a) << "\n\tB: " << TD::GetString(d.b) << "\n\tX: " << d.x; +} +#endif + +struct DiffOrder { + bool operator()(const TrainingInstance& a, const TrainingInstance& b) const { + return a.gdiff > b.gdiff; + } +}; + +void Sample(const unsigned gamma, + const unsigned xi, + const training::CandidateSet& J_i, + const EvaluationMetric* metric, + vector<TrainingInstance>* pv) { + const bool invert_score = metric->IsErrorMetric(); + vector<TrainingInstance> v1, v2; + float avg_diff = 0; + for (unsigned i = 0; i < gamma; ++i) { + const size_t a = rng->inclusive(0, J_i.size() - 1)(); + const size_t b = rng->inclusive(0, J_i.size() - 1)(); + if (a == b) continue; + float ga = metric->ComputeScore(J_i[a].eval_feats); + float gb = metric->ComputeScore(J_i[b].eval_feats); + bool positive = gb < ga; + if (invert_score) positive = !positive; + const float gdiff = fabs(ga - gb); + if (!gdiff) continue; + avg_diff += gdiff; + SparseVector<weight_t> xdiff = (J_i[a].fmap - J_i[b].fmap).erase_zeros(); + if (xdiff.empty()) { + cerr << "Empty diff:\n " << TD::GetString(J_i[a].ewords) << endl << "x=" << J_i[a].fmap << endl; + cerr << " " << TD::GetString(J_i[b].ewords) << endl << "x=" << J_i[b].fmap << endl; + continue; + } + v1.push_back(TrainingInstance(xdiff, positive, gdiff)); +#ifdef DEBUGGING_PRO + v1.back().a = J_i[a].hyp; + v1.back().b = J_i[b].hyp; + cerr << "N: " << v1.back() << endl; +#endif + } + avg_diff /= v1.size(); + + for (unsigned i = 0; i < v1.size(); ++i) { + double p = 1.0 / (1.0 + exp(-avg_diff - v1[i].gdiff)); + // cerr << "avg_diff=" << avg_diff << " gdiff=" << v1[i].gdiff << " p=" << p << endl; + if (rng->next() < p) v2.push_back(v1[i]); + } + vector<TrainingInstance>::iterator mid = v2.begin() + xi; + if (xi > v2.size()) mid = v2.end(); + partial_sort(v2.begin(), mid, v2.end(), DiffOrder()); + copy(v2.begin(), mid, back_inserter(*pv)); +#ifdef DEBUGGING_PRO + if 
(v2.size() >= 5) { + for (int i =0; i < (mid - v2.begin()); ++i) { + cerr << v2[i] << endl; + } + cerr << pv->back() << endl; + } +#endif +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + if (conf.count("random_seed")) + rng.reset(new MT19937(conf["random_seed"].as<uint32_t>())); + else + rng.reset(new MT19937); + const string evaluation_metric = conf["evaluation_metric"].as<string>(); + + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + DocumentScorer ds(metric, conf["reference"].as<vector<string> >()); + cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; + + Hypergraph hg; + string last_file; + ReadFile in_read(conf["input"].as<string>()); + istream &in=*in_read.stream(); + const unsigned kbest_size = conf["kbest_size"].as<unsigned>(); + const unsigned gamma = conf["candidate_pairs"].as<unsigned>(); + const unsigned xi = conf["best_pairs"].as<unsigned>(); + string weightsf = conf["weights"].as<string>(); + vector<weight_t> weights; + Weights::InitFromFile(weightsf, &weights); + string kbest_repo = conf["kbest_repository"].as<string>(); + MkDirP(kbest_repo); + while(in) { + vector<TrainingInstance> v; + string line; + getline(in, line); + if (line.empty()) continue; + istringstream is(line); + int sent_id; + string file; + // path-to-file (JSON) sent_id + is >> file >> sent_id; + ReadFile rf(file); + ostringstream os; + training::CandidateSet J_i; + os << kbest_repo << "/kbest." << sent_id << ".txt.gz"; + const string kbest_file = os.str(); + if (FileExists(kbest_file)) + J_i.ReadFromFile(kbest_file); + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + hg.Reweight(weights); + J_i.AddKBestCandidates(hg, kbest_size, ds[sent_id]); + J_i.WriteToFile(kbest_file); + + Sample(gamma, xi, J_i, metric, &v); + for (unsigned i = 0; i < v.size(); ++i) { + const TrainingInstance& vi = v[i]; + cout << vi.y << "\t" << vi.x << endl; + cout << (!vi.y) << "\t" << (vi.x * -1.0) << endl; + } + } + return 0; +} + diff --git a/training/pro/mr_pro_reduce.cc b/training/pro/mr_pro_reduce.cc new file mode 100644 index 00000000..5ef9b470 --- /dev/null +++ b/training/pro/mr_pro_reduce.cc @@ -0,0 +1,286 @@ +#include <cstdlib> +#include <sstream> +#include <iostream> +#include <fstream> +#include <vector> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "filelib.h" +#include "weights.h" +#include "sparse_vector.h" +#include "optimize.h" +#include "liblbfgs/lbfgs++.h" + +using namespace std; +namespace po = boost::program_options; + +// since this is a ranking model, there should be equal numbers of +// positive and negative examples, so the bias should be 0 +static const double MAX_BIAS = 1e-10; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation)") + ("regularization_strength,C",po::value<double>()->default_value(500.0), "l2 regularization strength") + ("l1",po::value<double>()->default_value(0.0), "l1 regularization strength") + ("regularize_to_weights,y",po::value<double>()->default_value(5000.0), "Differences in learned weights to previous weights are penalized with an l2 penalty with this strength; 0.0 = no effect") + ("memory_buffers,m",po::value<unsigned>()->default_value(100), "Number of memory buffers (LBFGS)") + 
("min_reg,r",po::value<double>()->default_value(0.01), "When tuning (-T) regularization strength, minimum regularization strenght") + ("max_reg,R",po::value<double>()->default_value(1e6), "When tuning (-T) regularization strength, maximum regularization strenght") + ("testset,t",po::value<string>(), "Optional held-out test set") + ("tune_regularizer,T", "Use the held out test set (-t) to tune the regularization strength") + ("interpolate_with_weights,p",po::value<double>()->default_value(1.0), "[deprecated] Output weights are p*w + (1-p)*w_prev; 1.0 = no effect") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +void ParseSparseVector(string& line, size_t cur, SparseVector<weight_t>* out) { + SparseVector<weight_t>& x = *out; + size_t last_start = cur; + size_t last_comma = string::npos; + while(cur <= line.size()) { + if (line[cur] == ' ' || cur == line.size()) { + if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) { + cerr << "[ERROR] " << line << endl << " position = " << cur << endl; + exit(1); + } + const int fid = FD::Convert(line.substr(last_start, last_comma - last_start)); + if (cur < line.size()) line[cur] = 0; + const weight_t val = strtod(&line[last_comma + 1], NULL); + x.set_value(fid, val); + + last_comma = string::npos; + last_start = cur+1; + } else { + if (line[cur] == '=') + last_comma = cur; + } + ++cur; + } +} + +void ReadCorpus(istream* pin, vector<pair<bool, SparseVector<weight_t> > >* corpus) { + istream& in = *pin; + corpus->clear(); + bool flag = false; + int lc = 0; + string line; + SparseVector<weight_t> x; + while(getline(in, line)) { + ++lc; + if (lc % 1000 == 0) { cerr << '.'; flag = true; } + if (lc % 40000 == 0) { cerr << " [" << lc << "]\n"; flag = false; } + if (line.empty()) continue; + const size_t ks = line.find("\t"); + assert(string::npos != ks); + assert(ks == 1); + const bool y = line[0] == '1'; + x.clear(); + ParseSparseVector(line, ks + 1, &x); + corpus->push_back(make_pair(y, x)); + } + if (flag) cerr << endl; +} + +void GradAdd(const SparseVector<weight_t>& v, const double scale, weight_t* acc) { + for (SparseVector<weight_t>::const_iterator it = v.begin(); + it != v.end(); ++it) { + acc[it->first] += it->second * scale; + } +} + +double ApplyRegularizationTerms(const double C, + const double T, + const vector<weight_t>& weights, + const vector<weight_t>& prev_weights, + weight_t* g) { + double reg = 0; + for (size_t i = 0; i < weights.size(); ++i) { + const double prev_w_i = (i < prev_weights.size() ? prev_weights[i] : 0.0); + const double& w_i = weights[i]; + reg += C * w_i * w_i; + g[i] += 2 * C * w_i; + + const double diff_i = w_i - prev_w_i; + reg += T * diff_i * diff_i; + g[i] += 2 * T * diff_i; + } + return reg; +} + +double TrainingInference(const vector<weight_t>& x, + const vector<pair<bool, SparseVector<weight_t> > >& corpus, + weight_t* g = NULL) { + double cll = 0; + for (int i = 0; i < corpus.size(); ++i) { + const double dotprod = corpus[i].second.dot(x) + (x.size() ? 
x[0] : weight_t()); // x[0] is bias + double lp_false = dotprod; + double lp_true = -dotprod; + if (0 < lp_true) { + lp_true += log1p(exp(-lp_true)); + lp_false = log1p(exp(lp_false)); + } else { + lp_true = log1p(exp(lp_true)); + lp_false += log1p(exp(-lp_false)); + } + lp_true*=-1; + lp_false*=-1; + if (corpus[i].first) { // true label + cll -= lp_true; + if (g) { + // g -= corpus[i].second * exp(lp_false); + GradAdd(corpus[i].second, -exp(lp_false), g); + g[0] -= exp(lp_false); // bias + } + } else { // false label + cll -= lp_false; + if (g) { + // g += corpus[i].second * exp(lp_true); + GradAdd(corpus[i].second, exp(lp_true), g); + g[0] += exp(lp_true); // bias + } + } + } + return cll; +} + +struct ProLoss { + ProLoss(const vector<pair<bool, SparseVector<weight_t> > >& tr, + const vector<pair<bool, SparseVector<weight_t> > >& te, + const double c, + const double t, + const vector<weight_t>& px) : training(tr), testing(te), C(c), T(t), prev_x(px){} + double operator()(const vector<double>& x, double* g) const { + fill(g, g + x.size(), 0.0); + double cll = TrainingInference(x, training, g); + tppl = 0; + if (testing.size()) + tppl = pow(2.0, TrainingInference(x, testing, g) / (log(2) * testing.size())); + double ppl = cll / log(2); + ppl /= training.size(); + ppl = pow(2.0, ppl); + double reg = ApplyRegularizationTerms(C, T, x, prev_x, g); + return cll + reg; + } + const vector<pair<bool, SparseVector<weight_t> > >& training, testing; + const double C, T; + const vector<double>& prev_x; + mutable double tppl; +}; + +// return held-out log likelihood +double LearnParameters(const vector<pair<bool, SparseVector<weight_t> > >& training, + const vector<pair<bool, SparseVector<weight_t> > >& testing, + const double C, + const double C1, + const double T, + const unsigned memory_buffers, + const vector<weight_t>& prev_x, + vector<weight_t>* px) { + assert(px->size() == prev_x.size()); + ProLoss loss(training, testing, C, T, prev_x); + LBFGS<ProLoss> lbfgs(px, loss, memory_buffers, C1); + lbfgs.MinimizeFunction(); + return loss.tppl; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + string line; + vector<pair<bool, SparseVector<weight_t> > > training, testing; + const bool tune_regularizer = conf.count("tune_regularizer"); + if (tune_regularizer && !conf.count("testset")) { + cerr << "--tune_regularizer requires --testset to be set\n"; + return 1; + } + const double min_reg = conf["min_reg"].as<double>(); + const double max_reg = conf["max_reg"].as<double>(); + double C = conf["regularization_strength"].as<double>(); // will be overridden if parameter is tuned + double C1 = conf["l1"].as<double>(); // will be overridden if parameter is tuned + const double T = conf["regularize_to_weights"].as<double>(); + assert(C >= 0.0); + assert(min_reg >= 0.0); + assert(max_reg >= 0.0); + assert(max_reg > min_reg); + const double psi = conf["interpolate_with_weights"].as<double>(); + if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; return 1; } + ReadCorpus(&cin, &training); + if (conf.count("testset")) { + ReadFile rf(conf["testset"].as<string>()); + ReadCorpus(rf.stream(), &testing); + } + cerr << "Number of features: " << FD::NumFeats() << endl; + + vector<weight_t> x, prev_x; // x[0] is bias + if (conf.count("weights")) { + Weights::InitFromFile(conf["weights"].as<string>(), &x); + x.resize(FD::NumFeats()); + prev_x = x; + } else { + x.resize(FD::NumFeats()); + prev_x = x; + } + cerr << " Number of features: " 
<< x.size() << endl; + cerr << "Number of training examples: " << training.size() << endl; + cerr << "Number of testing examples: " << testing.size() << endl; + double tppl = 0.0; + vector<pair<double,double> > sp; + vector<double> smoothed; + if (tune_regularizer) { + C = min_reg; + const double steps = 18; + double sweep_factor = exp((log(max_reg) - log(min_reg)) / steps); + cerr << "SWEEP FACTOR: " << sweep_factor << endl; + while(C < max_reg) { + cerr << "C=" << C << "\tT=" <<T << endl; + tppl = LearnParameters(training, testing, C, C1, T, conf["memory_buffers"].as<unsigned>(), prev_x, &x); + sp.push_back(make_pair(C, tppl)); + C *= sweep_factor; + } + smoothed.resize(sp.size(), 0); + smoothed[0] = sp[0].second; + smoothed.back() = sp.back().second; + for (int i = 1; i < sp.size()-1; ++i) { + double prev = sp[i-1].second; + double next = sp[i+1].second; + double cur = sp[i].second; + smoothed[i] = (prev*0.2) + cur * 0.6 + (0.2*next); + } + double best_ppl = 9999999; + unsigned best_i = 0; + for (unsigned i = 0; i < sp.size(); ++i) { + if (smoothed[i] < best_ppl) { + best_ppl = smoothed[i]; + best_i = i; + } + } + C = sp[best_i].first; + } // tune regularizer + tppl = LearnParameters(training, testing, C, C1, T, conf["memory_buffers"].as<unsigned>(), prev_x, &x); + if (conf.count("weights")) { + for (int i = 1; i < x.size(); ++i) { + x[i] = (x[i] * psi) + prev_x[i] * (1.0 - psi); + } + } + cout.precision(15); + cout << "# C=" << C << "\theld out perplexity="; + if (tppl) { cout << tppl << endl; } else { cout << "N/A\n"; } + if (sp.size()) { + cout << "# Parameter sweep:\n"; + for (int i = 0; i < sp.size(); ++i) { + cout << "# " << sp[i].first << "\t" << sp[i].second << "\t" << smoothed[i] << endl; + } + } + Weights::WriteToFile("-", x); + return 0; +} diff --git a/training/pro/pro.pl b/training/pro/pro.pl new file mode 100755 index 00000000..3b30c379 --- /dev/null +++ b/training/pro/pro.pl @@ -0,0 +1,555 @@ +#!/usr/bin/env perl +use strict; +use File::Basename qw(basename); +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use IPC::Open2; +use POSIX ":sys_wait_h"; +my $QSUB_CMD = qsub_args(mert_memory()); +my $default_jobs = env_default_jobs(); + +my $UTILS_DIR="$SCRIPT_DIR/../utils"; +require "$UTILS_DIR/libcall.pl"; + +# Default settings +my $srcFile; +my $refFiles; +my $bin_dir = $SCRIPT_DIR; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $MAPINPUT = "$bin_dir/mr_pro_generate_mapper_input.pl"; +my $MAPPER = "$bin_dir/mr_pro_map"; +my $REDUCER = "$bin_dir/mr_pro_reduce"; +my $parallelize = "$UTILS_DIR/parallelize.pl"; +my $libcall = "$UTILS_DIR/libcall.pl"; +my $sentserver = "$UTILS_DIR/sentserver"; +my $sentclient = "$UTILS_DIR/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm"; + +my $SCORER = $FAST_SCORE; +die "Can't find $MAPPER" unless -x $MAPPER; +my $cdec = "$bin_dir/../../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $lines_per_mapper = 30; +my $iteration = 1; +my 
$best_weights; +my $psi = 1; +my $default_max_iter = 30; +my $max_iterations = $default_max_iter; +my $jobs = $default_jobs; # number of decode nodes +my $pmem = "4g"; +my $disable_clean = 0; +my %seen_weights; +my $help = 0; +my $epsilon = 0.0001; +my $dryrun = 0; +my $last_score = -10000000; +my $metric = "ibm_bleu"; +my $dir; +my $iniFile; +my $weights; +my $use_make = 1; # use make to parallelize +my $useqsub = 0; +my $initial_weights; +my $pass_suffix = ''; +my $devset; + +# regularization strength +my $reg = 500; +my $reg_previous = 5000; + +# Process command-line options +if (GetOptions( + "config=s" => \$iniFile, + "weights=s" => \$initial_weights, + "devset=s" => \$devset, + "jobs=i" => \$jobs, + "metric=s" => \$metric, + "pass-suffix=s" => \$pass_suffix, + "qsub" => \$useqsub, + "help" => \$help, + "reg=f" => \$reg, + "reg-previous=f" => \$reg_previous, + "output-dir=s" => \$dir, +) == 0 || @ARGV!=0 || $help) { + print_help(); + exit; +} + +if ($useqsub) { + $use_make = 0; + die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); +} + +my @missing_args = (); +if (!defined $iniFile) { push @missing_args, "--config"; } +if (!defined $devset) { push @missing_args, "--devset"; } +if (!defined $initial_weights) { push @missing_args, "--weights"; } +die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args); + +if ($metric =~ /^(combi|ter)$/i) { + $lines_per_mapper = 5; +} + +my $host =check_output("hostname"); chomp $host; +my $bleu; +my $interval_count = 0; +my $logfile; +my $projected_score; + +# used in sorting scores +my $DIR_FLAG = '-r'; +if ($metric =~ /^ter$|^aer$/i) { + $DIR_FLAG = ''; +} + +unless ($dir){ + $dir = 'pro'; +} +unless ($dir =~ /^\//){ # convert relative path to absolute path + my $basedir = check_output("pwd"); + chomp $basedir; + $dir = "$basedir/$dir"; +} + +# Initializations and helper functions +srand; + +my @childpids = (); +my @cleanupcmds = (); + +sub cleanup { + print STDERR "Cleanup...\n"; + for my $pid (@childpids){ unchecked_call("kill $pid"); } + for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } + exit 1; +}; +# Always call cleanup, no matter how we exit +*CORE::GLOBAL::exit = + sub{ cleanup(); }; +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + +my $decoderBase = check_output("basename $decoder"); chomp $decoderBase; +my $newIniFile = "$dir/$decoderBase.ini"; +my $inputFileName = "$dir/input"; +my $user = $ENV{"USER"}; + + +# process ini file +-e $iniFile || die "Error: could not open $iniFile for reading\n"; +open(INI, $iniFile); + +if (-e $dir) { + die "ERROR: working dir $dir already exists\n\n"; +} else { + mkdir "$dir" or die "Can't mkdir $dir: $!"; + mkdir "$dir/hgs" or die; + mkdir "$dir/scripts" or die; + print STDERR <<EOT; + DECODER: $decoder + INI FILE: $iniFile + WORKING DIR: $dir + DEVSET: $devset + EVAL METRIC: $metric + MAX ITERATIONS: $max_iterations + PARALLEL JOBS: $jobs + HEAD NODE: $host + PMEM (DECODING): $pmem + INITIAL WEIGHTS: $initial_weights +EOT +} + +# Generate initial files and values +check_call("cp $iniFile $newIniFile"); +check_call("cp $initial_weights $dir/weights.0"); +$iniFile = $newIniFile; + +my $refs = "$dir/dev.refs"; +split_devset($devset, "$dir/dev.input.raw", $refs); +my $newsrc = "$dir/dev.input"; +enseg("$dir/dev.input.raw", $newsrc); +$srcFile = $newsrc; +my $devSize = 0; +open F, "<$srcFile" or die "Can't read $srcFile: $!"; +while(<F>) { $devSize++; } +close F; + 
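
[Editor's note] Just above, split_devset (defined further down in this script) separates the --devset file into the raw source side and the reference file consumed by the scorer. As a rough illustration only: the standalone splitter below assumes the usual cdec convention of one "source ||| ref1 ||| ref2 ..." line per sentence; that line format is an assumption of this sketch, not something stated by pro.pl itself, and the actual behavior is whatever split_devset implements later in the script.

    #include <fstream>
    #include <iostream>
    #include <string>

    // Hypothetical devset splitter: everything before the first " ||| " is the
    // source sentence; the remainder (possibly several " ||| "-separated
    // references) is written to the refs file.
    int main(int argc, char** argv) {
      if (argc != 4) { std::cerr << "usage: split_devset IN SRC_OUT REFS_OUT\n"; return 1; }
      std::ifstream in(argv[1]);
      std::ofstream src(argv[2]), refs(argv[3]);
      std::string line;
      const std::string delim = " ||| ";
      while (std::getline(in, line)) {
        const size_t p = line.find(delim);
        if (p == std::string::npos) { std::cerr << "bad devset line: " << line << '\n'; return 1; }
        src << line.substr(0, p) << '\n';
        refs << line.substr(p + delim.size()) << '\n';
      }
      return 0;
    }
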
+unless($best_weights){ $best_weights = $weights; } +unless($projected_score){ $projected_score = 0.0; } +$seen_weights{$weights} = 1; + +my $random_seed = int(time / 1000); +my $lastWeightsFile; +my $lastPScore = 0; +# main optimization loop +my @allweights; +while (1){ + print STDERR "\n\nITERATION $iteration\n==========\n"; + + if ($iteration > $max_iterations){ + print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n"; + last; + } + # iteration-specific files + my $runFile="$dir/run.raw.$iteration"; + my $onebestFile="$dir/1best.$iteration"; + my $logdir="$dir/logs.$iteration"; + my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; + my $scorerLog="$logdir/scorer.log.$iteration"; + check_call("mkdir -p $logdir"); + + + #decode + print STDERR "RUNNING DECODER AT "; + print STDERR unchecked_output("date"); + my $im1 = $iteration - 1; + my $weightsFile="$dir/weights.$im1"; + push @allweights, "-w $dir/weights.$im1"; + `rm -f $dir/hgs/*.gz`; + my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; + my $pcmd; + if ($use_make) { + $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --"; + } else { + $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --"; + } + my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + my $num_hgs; + my $num_topbest; + my $retries = 0; + while($retries < 5) { + $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l"); + $num_topbest = check_output("wc -l < $runFile"); + print STDERR "NUMBER OF HGs: $num_hgs\n"; + print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; + if($devSize == $num_hgs && $devSize == $num_topbest) { + last; + } else { + print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n"; + sleep(3); + } + $retries++; + } + die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); + my $dec_score = check_output("cat $runFile | $SCORER -r $refs -m $metric"); + chomp $dec_score; + print STDERR "DECODER SCORE: $dec_score\n"; + + # save space + check_call("gzip -f $runFile"); + check_call("gzip -f $decoderLog"); + + # run optimizer + print STDERR "RUNNING OPTIMIZER AT "; + print STDERR unchecked_output("date"); + print STDERR " - GENERATE TRAINING EXEMPLARS\n"; + my $mergeLog="$logdir/prune-merge.log.$iteration"; + + my $score = 0; + my $icc = 0; + my $inweights="$dir/weights.$im1"; + $cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1"; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + check_call("mkdir -p $dir/splag.$im1"); + $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1 $dir/splag.$im1/mapinput."; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!"; + my @shards = grep { /^mapinput\./ } readdir(DIR); + closedir DIR; + die "No shards!" 
unless scalar @shards > 0; + my $joblist = ""; + my $nmappers = 0; + @cleanupcmds = (); + my %o2i = (); + my $first_shard = 1; + my $mkfile; # only used with makefiles + my $mkfilename; + if ($use_make) { + $mkfilename = "$dir/splag.$im1/domap.mk"; + open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!"; + print $mkfile "all: $dir/splag.$im1/map.done\n\n"; + } + my @mkouts = (); # only used with makefiles + my @mapoutputs = (); + for my $shard (@shards) { + my $mapoutput = $shard; + my $client_name = $shard; + $client_name =~ s/mapinput.//; + $client_name = "pro.$client_name"; + $mapoutput =~ s/mapinput/mapoutput/; + push @mapoutputs, "$dir/splag.$im1/$mapoutput"; + $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; + my $script = "$MAPPER -s $srcFile -m $metric -r $refs -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput"; + if ($use_make) { + my $script_file = "$dir/scripts/map.$shard"; + open F, ">$script_file" or die "Can't write $script_file: $!"; + print F "#!/bin/bash\n"; + print F "$script\n"; + close F; + my $output = "$dir/splag.$im1/$mapoutput"; + push @mkouts, $output; + chmod(0755, $script_file) or die "Can't chmod $script_file: $!"; + if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } + print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n"; + } else { + my $script_file = "$dir/scripts/map.$shard"; + open F, ">$script_file" or die "Can't write $script_file: $!"; + print F "$script\n"; + close F; + if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } + + $nmappers++; + my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; + my $jobid = check_output("$qcmd"); + chomp $jobid; + $jobid =~ s/^(\d+)(.*?)$/\1/g; + $jobid =~ s/^Your job (\d+) .*$/\1/; + push(@cleanupcmds, "qdel $jobid 2> /dev/null"); + print STDERR " $jobid"; + if ($joblist == "") { $joblist = $jobid; } + else {$joblist = $joblist . "\|" . 
$jobid; } + } + } + my @dev_outs = (); + my @devtest_outs = (); + @dev_outs = @mapoutputs; + if ($use_make) { + print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n"; + close $mkfile; + my $mcmd = "make -j $jobs -f $mkfilename"; + print STDERR "\nExecuting: $mcmd\n"; + check_call($mcmd); + } else { + print STDERR "\nLaunched $nmappers mappers.\n"; + sleep 8; + print STDERR "Waiting for mappers to complete...\n"; + while ($nmappers > 0) { + sleep 5; + my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '"))); + $nmappers = scalar @livejobs; + } + print STDERR "All mappers complete.\n"; + } + my $tol = 0; + my $til = 0; + my $dev_test_file = "$dir/splag.$im1/devtest.gz"; + print STDERR "\nRUNNING CLASSIFIER (REDUCER)\n"; + print STDERR unchecked_output("date"); + $cmd="cat @dev_outs | $REDUCER -w $dir/weights.$im1 -C $reg -y $reg_previous --interpolate_with_weights $psi"; + $cmd .= " > $dir/weights.$iteration"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + $lastWeightsFile = "$dir/weights.$iteration"; + $lastPScore = $score; + $iteration++; + print STDERR "\n==========\n"; +} + + +check_call("cp $lastWeightsFile $dir/weights.final"); +print STDERR "\nFINAL WEIGHTS: $dir/weights.final\n(Use -w <this file> with the decoder)\n\n"; +print STDOUT "$dir/weights.final\n"; + +exit 0; + +sub read_weights_file { + my ($file) = @_; + open F, "<$file" or die "Couldn't read $file: $!"; + my @r = (); + my $pm = -1; + while(<F>) { + next if /^#/; + next if /^\s*$/; + chomp; + if (/^(.+)\s+(.+)$/) { + my $m = $1; + my $w = $2; + die "Weights out of order: $m <= $pm" unless $m > $pm; + push @r, $w; + } else { + warn "Unexpected feature name in weight file: $_"; + } + } + close F; + return join ' ', @r; +} + +sub enseg { + my $src = shift; + my $newsrc = shift; + open(SRC, $src); + open(NEWSRC, ">$newsrc"); + my $i=0; + while (my $line=<SRC>){ + chomp $line; + if ($line =~ /^\s*<seg/i) { + if($line =~ /id="[0-9]+"/) { + print NEWSRC "$line\n"; + } else { + die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute"; + } + } else { + print NEWSRC "<seg id=\"$i\">$line</seg>\n"; + } + $i++; + } + close SRC; + close NEWSRC; + die "Empty dev set!" if ($i == 0); +} + +sub print_help { + + my $executable = basename($0); chomp $executable; + print << "Help"; + +Usage: $executable [options] + + $executable [options] + Runs a complete PRO optimization using the ini file specified. + +Required: + + --config <cdec.ini> + Decoder configuration file. + + --devset <files> + Dev set source and reference data. + + --weights <file> + Initial weights file (use empty file to start from 0) + +General options: + + --help + Print this message and exit. + + --max-iterations <M> + Maximum number of iterations to run. If not specified, defaults + to $default_max_iter. + + --metric <method> + Metric to optimize. + Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + + --pass-suffix <S> + If the decoder is doing multi-pass decoding, the pass suffix "2", + "3", etc., is used to control what iteration of weights is set. + + --workdir <dir> + Directory for intermediate and output files. If not specified, the + name is derived from the ini filename. Assuming that the ini + filename begins with the decoder name and ends with ini, the default + name of the working directory is inferred from the middle part of + the filename. E.g. an ini file named decoder.foo.ini would have + a default working directory name foo. 
+ +Regularization options: + + --reg <F> + l2 regularization strength [default=500]. The greater this value, + the closer to zero the weights will be. + + --reg-previous <F> + l2 penalty for moving away from the weights from the previous + iteration. [default=5000]. The greater this value, the closer + to the previous iteration's weights the next iteration's weights + will be. + +Job control options: + + --jobs <I> + Number of decoder processes to run in parallel. [default=$default_jobs] + + --qsub + Use qsub to run jobs in parallel (qsub must be configured in + environment/LocalEnvironment.pm) + + --pmem <N> + Amount of physical memory requested for parallel decoding jobs + (used with qsub requests only) + +Deprecated options: + + --interpolate-with-weights <F> + [deprecated] At each iteration the resulting weights are + interpolated with the weights from the previous iteration, with + this factor. [default=1.0, i.e., no effect] + +Help +} + +sub convert { + my ($str) = @_; + my @ps = split /;/, $str; + my %dict = (); + for my $p (@ps) { + my ($k, $v) = split /=/, $p; + $dict{$k} = $v; + } + return %dict; +} + + +sub cmdline { + return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? +my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} + +sub escaped_shell_args { + return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { + return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { + return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} + +sub split_devset { + my ($infile, $outsrc, $outref) = @_; + open F, "<$infile" or die "Can't read $infile: $!"; + open S, ">$outsrc" or die "Can't write $outsrc: $!"; + open R, ">$outref" or die "Can't write $outref: $!"; + while(<F>) { + chomp; + my ($src, @refs) = split /\s*\|\|\|\s*/; + die "Malformed devset line: $_\n" unless scalar @refs > 0; + print S "$src\n"; + print R join(' ||| ', @refs) . 
"\n"; + } + close R; + close S; + close F; +} + diff --git a/training/rampion/Makefile.am b/training/rampion/Makefile.am new file mode 100644 index 00000000..c72283cd --- /dev/null +++ b/training/rampion/Makefile.am @@ -0,0 +1,8 @@ +bin_PROGRAMS = rampion_cccp + +rampion_cccp_SOURCES = rampion_cccp.cc +rampion_cccp_LDADD = ../../training/utils/libtraining_utils.a ../../decoder/libcdec.a ../../mteval/libmteval.a ../../utils/libutils.a + +EXTRA_DIST = rampion.pl rampion_generate_input.pl + +AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training/utils diff --git a/training/rampion/rampion.pl b/training/rampion/rampion.pl new file mode 100755 index 00000000..ae084db6 --- /dev/null +++ b/training/rampion/rampion.pl @@ -0,0 +1,540 @@ +#!/usr/bin/env perl +use strict; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use IPC::Open2; +use POSIX ":sys_wait_h"; +my $QSUB_CMD = qsub_args(mert_memory()); +my $default_jobs = env_default_jobs(); + +my $UTILS_DIR="$SCRIPT_DIR/../utils"; +require "$UTILS_DIR/libcall.pl"; + +# Default settings +my $srcFile; +my $refFiles; +my $bin_dir = $SCRIPT_DIR; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $MAPINPUT = "$bin_dir/rampion_generate_input.pl"; +my $MAPPER = "$bin_dir/rampion_cccp"; +my $parallelize = "$UTILS_DIR/parallelize.pl"; +my $libcall = "$UTILS_DIR/libcall.pl"; +my $sentserver = "$UTILS_DIR/sentserver"; +my $sentclient = "$UTILS_DIR/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm"; + +my $SCORER = $FAST_SCORE; +die "Can't find $MAPPER" unless -x $MAPPER; +my $cdec = "$bin_dir/../../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $lines_per_mapper = 30; +my $iteration = 1; +my $best_weights; +my $psi = 1; +my $default_max_iter = 30; +my $max_iterations = $default_max_iter; +my $jobs = $default_jobs; # number of decode nodes +my $pmem = "4g"; +my $disable_clean = 0; +my %seen_weights; +my $help = 0; +my $epsilon = 0.0001; +my $dryrun = 0; +my $last_score = -10000000; +my $metric = "ibm_bleu"; +my $dir; +my $iniFile; +my $weights; +my $use_make = 1; # use make to parallelize +my $useqsub = 0; +my $initial_weights; +my $pass_suffix = ''; +my $cpbin=1; + +# regularization strength +my $tune_regularizer = 0; +my $reg = 500; +my $reg_previous = 5000; +my $dont_accum = 0; + +# Process command-line options +Getopt::Long::Configure("no_auto_abbrev"); +if (GetOptions( + "jobs=i" => \$jobs, + "dont-clean" => \$disable_clean, + "dont-accumulate" => \$dont_accum, + "pass-suffix=s" => \$pass_suffix, + "qsub" => \$useqsub, + "dry-run" => \$dryrun, + "epsilon=s" => \$epsilon, + "help" => \$help, + "weights=s" => \$initial_weights, + "reg=f" => \$reg, + "use-make=i" => \$use_make, + "max-iterations=i" => \$max_iterations, + "pmem=s" => \$pmem, + "cpbin!" 
=> \$cpbin, + "ref-files=s" => \$refFiles, + "metric=s" => \$metric, + "source-file=s" => \$srcFile, + "workdir=s" => \$dir, +) == 0 || @ARGV!=1 || $help) { + print_help(); + exit; +} + +die "--tune-regularizer is no longer supported with --reg-previous and --reg. Please tune manually.\n" if $tune_regularizer; + +if ($useqsub) { + $use_make = 0; + die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); +} + +my @missing_args = (); +if (!defined $srcFile) { push @missing_args, "--source-file"; } +if (!defined $refFiles) { push @missing_args, "--ref-files"; } +if (!defined $initial_weights) { push @missing_args, "--weights"; } +die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args); + +if ($metric =~ /^(combi|ter)$/i) { + $lines_per_mapper = 5; +} + +($iniFile) = @ARGV; + + +sub write_config; +sub enseg; +sub print_help; + +my $nodelist; +my $host =check_output("hostname"); chomp $host; +my $bleu; +my $interval_count = 0; +my $logfile; +my $projected_score; + +# used in sorting scores +my $DIR_FLAG = '-r'; +if ($metric =~ /^ter$|^aer$/i) { + $DIR_FLAG = ''; +} + +my $refs_comma_sep = get_comma_sep_refs('r',$refFiles); + +unless ($dir){ + $dir = "rampion"; +} +unless ($dir =~ /^\//){ # convert relative path to absolute path + my $basedir = check_output("pwd"); + chomp $basedir; + $dir = "$basedir/$dir"; +} + + +# Initializations and helper functions +srand; + +my @childpids = (); +my @cleanupcmds = (); + +sub cleanup { + print STDERR "Cleanup...\n"; + for my $pid (@childpids){ unchecked_call("kill $pid"); } + for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } + exit 1; +}; +# Always call cleanup, no matter how we exit +*CORE::GLOBAL::exit = + sub{ cleanup(); }; +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + +my $decoderBase = check_output("basename $decoder"); chomp $decoderBase; +my $newIniFile = "$dir/$decoderBase.ini"; +my $inputFileName = "$dir/input"; +my $user = $ENV{"USER"}; +# process ini file +-e $iniFile || die "Error: could not open $iniFile for reading\n"; +open(INI, $iniFile); + +use File::Basename qw(basename); +#pass bindir, refs to vars holding bin +sub modbin { + local $_; + my $bindir=shift; + check_call("mkdir -p $bindir"); + -d $bindir || die "couldn't make bindir $bindir"; + for (@_) { + my $src=$$_; + $$_="$bindir/".basename($src); + check_call("cp -p $src $$_"); + } +} +sub dirsize { + opendir ISEMPTY,$_[0]; + return scalar(readdir(ISEMPTY))-1; +} +my @allweights; +if ($dryrun){ + write_config(*STDERR); + exit 0; +} else { + if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-pro.pl outputs + die "ERROR: working dir $dir already exists\n\n"; + } else { + -e $dir || mkdir $dir; + mkdir "$dir/hgs"; + modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin; + mkdir "$dir/scripts"; + my $cmdfile="$dir/rerun-pro.sh"; + open CMD,'>',$cmdfile; + print CMD "cd ",&getcwd,"\n"; +# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted. 
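+      # record the verbatim command line so the run can be reproduced by
+      # re-executing $dir/rerun-pro.sh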
+ my $cline=&cmdline."\n"; + print CMD $cline; + close CMD; + print STDERR $cline; + chmod(0755,$cmdfile); + check_call("cp $initial_weights $dir/weights.0"); + die "Can't find weights.0" unless (-e "$dir/weights.0"); + } + write_config(*STDERR); +} + + +# Generate initial files and values +check_call("cp $iniFile $newIniFile"); +$iniFile = $newIniFile; + +my $newsrc = "$dir/dev.input"; +enseg($srcFile, $newsrc); +$srcFile = $newsrc; +my $devSize = 0; +open F, "<$srcFile" or die "Can't read $srcFile: $!"; +while(<F>) { $devSize++; } +close F; + +unless($best_weights){ $best_weights = $weights; } +unless($projected_score){ $projected_score = 0.0; } +$seen_weights{$weights} = 1; +my $kbest = "$dir/kbest"; +if ($dont_accum) { + $kbest = ''; +} else { + check_call("mkdir -p $kbest"); + $kbest = "--kbest_repository $kbest"; +} + +my $random_seed = int(time / 1000); +my $lastWeightsFile; +my $lastPScore = 0; +# main optimization loop +while (1){ + print STDERR "\n\nITERATION $iteration\n==========\n"; + + if ($iteration > $max_iterations){ + print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n"; + last; + } + # iteration-specific files + my $runFile="$dir/run.raw.$iteration"; + my $onebestFile="$dir/1best.$iteration"; + my $logdir="$dir/logs.$iteration"; + my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; + my $scorerLog="$logdir/scorer.log.$iteration"; + check_call("mkdir -p $logdir"); + + + #decode + print STDERR "RUNNING DECODER AT "; + print STDERR unchecked_output("date"); + my $im1 = $iteration - 1; + my $weightsFile="$dir/weights.$im1"; + push @allweights, "-w $dir/weights.$im1"; + `rm -f $dir/hgs/*.gz`; + my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; + my $pcmd; + if ($use_make) { + $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --"; + } else { + $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --"; + } + my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + my $num_hgs; + my $num_topbest; + my $retries = 0; + while($retries < 5) { + $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l"); + $num_topbest = check_output("wc -l < $runFile"); + print STDERR "NUMBER OF HGs: $num_hgs\n"; + print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; + if($devSize == $num_hgs && $devSize == $num_topbest) { + last; + } else { + print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n"; + sleep(3); + } + $retries++; + } + die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? 
Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest);
+  my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric");
+  chomp $dec_score;
+  print STDERR "DECODER SCORE: $dec_score\n";
+
+  # save space
+  check_call("gzip -f $runFile");
+  check_call("gzip -f $decoderLog");
+
+  # run optimizer
+  print STDERR "RUNNING OPTIMIZER AT ";
+  print STDERR unchecked_output("date");
+  print STDERR " - GENERATE TRAINING EXEMPLARS\n";
+  my $mergeLog="$logdir/prune-merge.log.$iteration";
+
+  my $score = 0;
+  my $icc = 0;
+  my $inweights="$dir/weights.$im1";
+  my $outweights="$dir/weights.$iteration";
+  $cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1";
+  print STDERR "COMMAND:\n$cmd\n";
+  check_call($cmd);
+  $cmd="$MAPPER $refs_comma_sep -m $metric -i $dir/agenda.$im1 $kbest -w $inweights > $outweights";
+  check_call($cmd);
+  $lastWeightsFile = $outweights;
+  $iteration++;
+  `rm -f $dir/hgs/*.gz`;
+  print STDERR "\n==========\n";
+}
+
+print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n";
+
+print STDOUT "$lastWeightsFile\n";
+
+exit 0;
+
+sub get_lines {
+  my $fn = shift @_;
+  open FL, "<$fn" or die "Couldn't read $fn: $!";
+  my $lc = 0;
+  while(<FL>) { $lc++; }
+  return $lc;
+}
+
+sub get_comma_sep_refs {
+  my ($r,$p) = @_;
+  my $o = check_output("echo $p");
+  chomp $o;
+  my @files = split /\s+/, $o;
+  return "-$r " . join(" -$r ", @files);
+}
+
+sub read_weights_file {
+  my ($file) = @_;
+  open F, "<$file" or die "Couldn't read $file: $!";
+  my @r = ();
+  my $pm = -1;
+  while(<F>) {
+    next if /^#/;
+    next if /^\s*$/;
+    chomp;
+    if (/^(.+)\s+(.+)$/) {
+      my $m = $1;
+      my $w = $2;
+      die "Weights out of order: $m <= $pm" unless $m > $pm;
+      push @r, $w;
+    } else {
+      warn "Unexpected feature name in weight file: $_";
+    }
+  }
+  close F;
+  return join ' ', @r;
+}
+
+# subs
+sub write_config {
+  my $fh = shift;
+  my $cleanup = "yes";
+  if ($disable_clean) {$cleanup = "no";}
+
+  print $fh "\n";
+  print $fh "DECODER: $decoder\n";
+  print $fh "INI FILE: $iniFile\n";
+  print $fh "WORKING DIR: $dir\n";
+  print $fh "SOURCE (DEV): $srcFile\n";
+  print $fh "REFS (DEV): $refFiles\n";
+  print $fh "EVAL METRIC: $metric\n";
+  print $fh "MAX ITERATIONS: $max_iterations\n";
+  print $fh "JOBS: $jobs\n";
+  print $fh "HEAD NODE: $host\n";
+  print $fh "PMEM (DECODING): $pmem\n";
+  print $fh "CLEANUP: $cleanup\n";
+}
+
+sub update_weights_file {
+  my ($neww, $rfn, $rpts) = @_;
+  my @feats = @$rfn;
+  my @pts = @$rpts;
+  my $num_feats = scalar @feats;
+  my $num_pts = scalar @pts;
+  die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts;
+  open G, ">$neww" or die;
+  for (my $i = 0; $i < $num_feats; $i++) {
+    my $f = $feats[$i];
+    my $lambda = $pts[$i];
+    print G "$f $lambda\n";
+  }
+  close G;
+}
+
+sub enseg {
+  my $src = shift;
+  my $newsrc = shift;
+  open(SRC, $src);
+  open(NEWSRC, ">$newsrc");
+  my $i=0;
+  while (my $line=<SRC>){
+    chomp $line;
+    if ($line =~ /^\s*<seg/i) {
+      if($line =~ /id="[0-9]+"/) {
+        print NEWSRC "$line\n";
+      } else {
+        die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+      }
+    } else {
+      print NEWSRC "<seg id=\"$i\">$line</seg>\n";
+    }
+    $i++;
+  }
+  close SRC;
+  close NEWSRC;
+  die "Empty dev set!" if ($i == 0);
+}
+
+sub print_help {
+
+  my $executable = check_output("basename $0"); chomp $executable;
+  print << "Help";
+
+Usage: $executable [options] <ini file>
+
+       $executable [options] <ini file>
+          Runs a complete RAMPION optimization using the ini file specified.
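+
+       Example (file names here are placeholders):
+          $executable --source-file dev.src --ref-files 'dev.refs*' \\
+              --weights weights.init --metric ibm_bleu cdec.ini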
+ +Required: + + --ref-files <files> + Dev set ref files. This option takes only a single string argument. + To use multiple files (including file globbing), this argument should + be quoted. + + --source-file <file> + Dev set source file. + + --weights <file> + Initial weights file (use empty file to start from 0) + +General options: + + --help + Print this message and exit. + + --dont-accumulate + Don't accumulate k-best lists from multiple iterations. + + --max-iterations <M> + Maximum number of iterations to run. If not specified, defaults + to $default_max_iter. + + --metric <method> + Metric to optimize. + Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + + --pass-suffix <S> + If the decoder is doing multi-pass decoding, the pass suffix "2", + "3", etc., is used to control what iteration of weights is set. + + --workdir <dir> + Directory for intermediate and output files. If not specified, the + name is derived from the ini filename. Assuming that the ini + filename begins with the decoder name and ends with ini, the default + name of the working directory is inferred from the middle part of + the filename. E.g. an ini file named decoder.foo.ini would have + a default working directory name foo. + +Regularization options: + + --reg <F> + l2 regularization strength [default=500]. The greater this value, + the closer to zero the weights will be. + +Job control options: + + --jobs <I> + Number of decoder processes to run in parallel. [default=$default_jobs] + + --qsub + Use qsub to run jobs in parallel (qsub must be configured in + environment/LocalEnvironment.pm) + + --pmem <N> + Amount of physical memory requested for parallel decoding jobs + (used with qsub requests only) + +Help +} + +sub convert { + my ($str) = @_; + my @ps = split /;/, $str; + my %dict = (); + for my $p (@ps) { + my ($k, $v) = split /=/, $p; + $dict{$k} = $v; + } + return %dict; +} + + +sub cmdline { + return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? 
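+# escape_shell: wrap an argument in double quotes iff it contains shell
+# metacharacters, backslash-escaping the characters that remain special
+# inside double quotes (backslash, double quote, $, backtick, !).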
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} + +sub escaped_shell_args { + return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { + return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { + return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} diff --git a/training/rampion/rampion_cccp.cc b/training/rampion/rampion_cccp.cc new file mode 100644 index 00000000..1e36dc51 --- /dev/null +++ b/training/rampion/rampion_cccp.cc @@ -0,0 +1,168 @@ +#include <sstream> +#include <iostream> +#include <vector> +#include <limits> + +#include <boost/program_options.hpp> +#include <boost/program_options/variables_map.hpp> + +#include "filelib.h" +#include "stringlib.h" +#include "weights.h" +#include "hg_io.h" +#include "kbest.h" +#include "viterbi.h" +#include "ns.h" +#include "ns_docscorer.h" +#include "candidate_set.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)") + ("weights,w",po::value<string>(), "[REQD] Weights files from current iterations") + ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)") + ("evaluation_metric,m",po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)") + ("kbest_repository,R",po::value<string>(), "Accumulate k-best lists from previous iterations (parameter is path to repository)") + ("kbest_size,k",po::value<unsigned>()->default_value(500u), "Top k-hypotheses to extract") + ("cccp_iterations,I", po::value<unsigned>()->default_value(10u), "CCCP iterations (T')") + ("ssd_iterations,J", po::value<unsigned>()->default_value(5u), "Stochastic subgradient iterations (T'')") + ("eta", po::value<double>()->default_value(1e-4), "Step size") + ("regularization_strength,C", po::value<double>()->default_value(1.0), "L2 regularization strength") + ("alpha,a", po::value<double>()->default_value(10.0), "Cost scale (alpha); alpha * [1-metric(y,y')]") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (!conf->count("reference")) { + cerr << "Please specify one or more references using -r <REF.TXT>\n"; + flag = true; + } + if (!conf->count("weights")) { + cerr << "Please specify weights using -w <WEIGHTS.TXT>\n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +struct GainFunction { + explicit GainFunction(const EvaluationMetric* m) : metric(m) {} + float operator()(const SufficientStats& eval_feats) const { + float g = metric->ComputeScore(eval_feats); + if (!metric->IsErrorMetric()) g = 1 - g; + return g; + } + const EvaluationMetric* metric; +}; + +template <typename GainFunc> +void CostAugmentedSearch(const GainFunc& gain, + const training::CandidateSet& cs, + const SparseVector<double>& w, + double alpha, + SparseVector<double>* fmap) { + unsigned best_i = 0; + double best = -numeric_limits<double>::infinity(); + for (unsigned i = 0; i < 
cs.size(); ++i) {
+    double s = cs[i].fmap.dot(w) + alpha * gain(cs[i].eval_feats);
+    if (s > best) {
+      best = s;
+      best_i = i;
+    }
+  }
+  *fmap = cs[best_i].fmap;
+}
+
+// runs lines 4--15 of rampion algorithm
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+  const string evaluation_metric = conf["evaluation_metric"].as<string>();
+
+  EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric);
+  DocumentScorer ds(metric, conf["reference"].as<vector<string> >());
+  cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl;
+  // GainFunction actually returns a loss (e.g. 1-BLEU for gain metrics), so
+  // the hope search scales it by -alpha (goodsign) and the fear search by
+  // +alpha (badsign).
+  double goodsign = -1;
+  double badsign = -goodsign;
+
+  Hypergraph hg;
+  string last_file;
+  ReadFile in_read(conf["input"].as<string>());
+  string kbest_repo;
+  if (conf.count("kbest_repository")) {
+    kbest_repo = conf["kbest_repository"].as<string>();
+    MkDirP(kbest_repo);
+  }
+  istream &in=*in_read.stream();
+  const unsigned kbest_size = conf["kbest_size"].as<unsigned>();
+  const unsigned tp = conf["cccp_iterations"].as<unsigned>();
+  const unsigned tpp = conf["ssd_iterations"].as<unsigned>();
+  const double eta = conf["eta"].as<double>();
+  const double reg = conf["regularization_strength"].as<double>();
+  const double alpha = conf["alpha"].as<double>();
+  SparseVector<weight_t> weights;
+  {
+    vector<weight_t> vweights;
+    const string weightsf = conf["weights"].as<string>();
+    Weights::InitFromFile(weightsf, &vweights);
+    Weights::InitSparseVector(vweights, &weights);
+  }
+  string line, file;
+  vector<training::CandidateSet> kis;
+  cerr << "Loading hypergraphs...\n";
+  while(getline(in, line)) {
+    istringstream is(line);
+    int sent_id;
+    is >> file >> sent_id;  // read "<hg file> <sent id>" before sent_id is used below
+    kis.resize(kis.size() + 1);
+    training::CandidateSet& curkbest = kis.back();
+    string kbest_file;
+    if (kbest_repo.size()) {
+      ostringstream os;
+      os << kbest_repo << "/kbest."
<< sent_id << ".txt.gz"; + kbest_file = os.str(); + if (FileExists(kbest_file)) + curkbest.ReadFromFile(kbest_file); + } + is >> file >> sent_id; + ReadFile rf(file); + if (kis.size() % 5 == 0) { cerr << '.'; } + if (kis.size() % 200 == 0) { cerr << " [" << kis.size() << "]\n"; } + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + hg.Reweight(weights); + curkbest.AddKBestCandidates(hg, kbest_size, ds[sent_id]); + if (kbest_file.size()) + curkbest.WriteToFile(kbest_file); + } + cerr << "\nHypergraphs loaded.\n"; + + vector<SparseVector<weight_t> > goals(kis.size()); // f(x_i,y+,h+) + SparseVector<weight_t> fear; // f(x,y-,h-) + const GainFunction gain(metric); + for (unsigned iterp = 1; iterp <= tp; ++iterp) { + cerr << "CCCP Iteration " << iterp << endl; + for (unsigned i = 0; i < goals.size(); ++i) + CostAugmentedSearch(gain, kis[i], weights, goodsign * alpha, &goals[i]); + for (unsigned iterpp = 1; iterpp <= tpp; ++iterpp) { + cerr << " SSD Iteration " << iterpp << endl; + for (unsigned i = 0; i < goals.size(); ++i) { + CostAugmentedSearch(gain, kis[i], weights, badsign * alpha, &fear); + weights -= weights * (eta * reg / goals.size()); + weights += (goals[i] - fear) * eta; + } + } + } + vector<weight_t> w; + weights.init_vector(&w); + Weights::WriteToFile("-", w); + return 0; +} + diff --git a/training/rampion/rampion_generate_input.pl b/training/rampion/rampion_generate_input.pl new file mode 100755 index 00000000..b30fc4fd --- /dev/null +++ b/training/rampion/rampion_generate_input.pl @@ -0,0 +1,18 @@ +#!/usr/bin/perl -w +use strict; + +die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1; +my $d = shift @ARGV; +die "Can't find directory $d" unless -d $d; + +opendir(DIR, $d) or die "Can't read $d: $!"; +my @hgs = grep { /\.gz$/ } readdir(DIR); +closedir DIR; + +for my $hg (@hgs) { + my $file = $hg; + my $id = $hg; + $id =~ s/(\.json)?\.gz//; + print "$d/$file $id\n"; +} + diff --git a/training/ttables.cc b/training/ttables.cc deleted file mode 100644 index 45bf14c5..00000000 --- a/training/ttables.cc +++ /dev/null @@ -1,31 +0,0 @@ -#include "ttables.h" - -#include <cassert> - -#include "dict.h" - -using namespace std; -using namespace std::tr1; - -void TTable::DeserializeProbsFromText(std::istream* in) { - int c = 0; - while(*in) { - string e; - string f; - double p; - (*in) >> e >> f >> p; - if (e.empty()) break; - ++c; - ttable[TD::Convert(e)][TD::Convert(f)] = p; - } - cerr << "Loaded " << c << " translation parameters.\n"; -} - -void TTable::SerializeHelper(string* out, const Word2Word2Double& o) { - assert(!"not implemented"); -} - -void TTable::DeserializeHelper(const string& in, Word2Word2Double* o) { - assert(!"not implemented"); -} - diff --git a/training/ttables.h b/training/ttables.h deleted file mode 100644 index 9baa13ca..00000000 --- a/training/ttables.h +++ /dev/null @@ -1,101 +0,0 @@ -#ifndef _TTABLES_H_ -#define _TTABLES_H_ - -#include <iostream> -#include <tr1/unordered_map> - -#include "sparse_vector.h" -#include "m.h" -#include "wordid.h" -#include "tdict.h" - -class TTable { - public: - TTable() {} - typedef std::tr1::unordered_map<WordID, double> Word2Double; - typedef std::tr1::unordered_map<WordID, Word2Double> Word2Word2Double; - inline double prob(const int& e, const int& f) const { - const Word2Word2Double::const_iterator cit = ttable.find(e); - if (cit != ttable.end()) { - const Word2Double& cpd = cit->second; - const Word2Double::const_iterator it = cpd.find(f); - if (it == cpd.end()) return 1e-9; - return it->second; - } else { - return 1e-9; - } - } - inline 
void Increment(const int& e, const int& f) { - counts[e][f] += 1.0; - } - inline void Increment(const int& e, const int& f, double x) { - counts[e][f] += x; - } - void NormalizeVB(const double alpha) { - ttable.swap(counts); - for (Word2Word2Double::iterator cit = ttable.begin(); - cit != ttable.end(); ++cit) { - double tot = 0; - Word2Double& cpd = cit->second; - for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it) - tot += it->second + alpha; - for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it) - it->second = exp(Md::digamma(it->second + alpha) - Md::digamma(tot)); - } - counts.clear(); - } - void Normalize() { - ttable.swap(counts); - for (Word2Word2Double::iterator cit = ttable.begin(); - cit != ttable.end(); ++cit) { - double tot = 0; - Word2Double& cpd = cit->second; - for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it) - tot += it->second; - for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it) - it->second /= tot; - } - counts.clear(); - } - // adds counts from another TTable - probabilities remain unchanged - TTable& operator+=(const TTable& rhs) { - for (Word2Word2Double::const_iterator it = rhs.counts.begin(); - it != rhs.counts.end(); ++it) { - const Word2Double& cpd = it->second; - Word2Double& tgt = counts[it->first]; - for (Word2Double::const_iterator j = cpd.begin(); j != cpd.end(); ++j) { - tgt[j->first] += j->second; - } - } - return *this; - } - void ShowTTable() const { - for (Word2Word2Double::const_iterator it = ttable.begin(); it != ttable.end(); ++it) { - const Word2Double& cpd = it->second; - for (Word2Double::const_iterator j = cpd.begin(); j != cpd.end(); ++j) { - std::cerr << "P(" << TD::Convert(j->first) << '|' << TD::Convert(it->first) << ") = " << j->second << std::endl; - } - } - } - void ShowCounts() const { - for (Word2Word2Double::const_iterator it = counts.begin(); it != counts.end(); ++it) { - const Word2Double& cpd = it->second; - for (Word2Double::const_iterator j = cpd.begin(); j != cpd.end(); ++j) { - std::cerr << "c(" << TD::Convert(j->first) << '|' << TD::Convert(it->first) << ") = " << j->second << std::endl; - } - } - } - void DeserializeProbsFromText(std::istream* in); - void SerializeCounts(std::string* out) const { SerializeHelper(out, counts); } - void DeserializeCounts(const std::string& in) { DeserializeHelper(in, &counts); } - void SerializeProbs(std::string* out) const { SerializeHelper(out, ttable); } - void DeserializeProbs(const std::string& in) { DeserializeHelper(in, &ttable); } - private: - static void SerializeHelper(std::string*, const Word2Word2Double& o); - static void DeserializeHelper(const std::string&, Word2Word2Double* o); - public: - Word2Word2Double ttable; - Word2Word2Double counts; -}; - -#endif diff --git a/training/utils/Makefile.am b/training/utils/Makefile.am new file mode 100644 index 00000000..27c6e344 --- /dev/null +++ b/training/utils/Makefile.am @@ -0,0 +1,46 @@ +noinst_LIBRARIES = libtraining_utils.a + +bin_PROGRAMS = \ + sentserver \ + sentclient \ + grammar_convert + +noinst_PROGRAMS = \ + lbfgs_test \ + optimize_test + +EXTRA_DIST = decode-and-evaluate.pl libcall.pl parallelize.pl + +sentserver_SOURCES = sentserver.cc +sentserver_LDFLAGS = -pthread + +sentclient_SOURCES = sentclient.cc +sentclient_LDFLAGS = -pthread + +TESTS = lbfgs_test optimize_test + +libtraining_utils_a_SOURCES = \ + candidate_set.h \ + entropy.h \ + lbfgs.h \ + online_optimizer.h \ + optimize.h \ + risk.h \ + sentserver.h \ + candidate_set.cc \ + entropy.cc \ + optimize.cc 
\ + online_optimizer.cc \ + risk.cc + +optimize_test_SOURCES = optimize_test.cc +optimize_test_LDADD = libtraining_utils.a ../../utils/libutils.a + +grammar_convert_SOURCES = grammar_convert.cc +grammar_convert_LDADD = ../../decoder/libcdec.a ../../mteval/libmteval.a ../../utils/libutils.a + +lbfgs_test_SOURCES = lbfgs_test.cc +lbfgs_test_LDADD = ../../utils/libutils.a + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/decoder -I$(top_srcdir)/utils -I$(top_srcdir)/mteval -I$(top_srcdir)/klm + diff --git a/training/candidate_set.cc b/training/utils/candidate_set.cc index 087efec3..087efec3 100644 --- a/training/candidate_set.cc +++ b/training/utils/candidate_set.cc diff --git a/training/candidate_set.h b/training/utils/candidate_set.h index 9d326ed0..9d326ed0 100644 --- a/training/candidate_set.h +++ b/training/utils/candidate_set.h diff --git a/training/utils/decode-and-evaluate.pl b/training/utils/decode-and-evaluate.pl new file mode 100755 index 00000000..1a332c08 --- /dev/null +++ b/training/utils/decode-and-evaluate.pl @@ -0,0 +1,246 @@ +#!/usr/bin/env perl +use strict; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use File::Basename qw(basename); +my $QSUB_CMD = qsub_args(mert_memory()); + +require "libcall.pl"; + +# Default settings +my $default_jobs = env_default_jobs(); +my $bin_dir = $SCRIPT_DIR; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $parallelize = "$bin_dir/parallelize.pl"; +my $libcall = "$bin_dir/libcall.pl"; +my $sentserver = "$bin_dir/sentserver"; +my $sentclient = "$bin_dir/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm"; + +my $SCORER = $FAST_SCORE; +my $cdec = "$bin_dir/../../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $jobs = $default_jobs; # number of decode nodes +my $pmem = "9g"; +my $help = 0; +my $config; +my $test_set; +my $weights; +my $use_make = 1; +my $useqsub; +my $cpbin=1; +# Process command-line options +if (GetOptions( + "jobs=i" => \$jobs, + "help" => \$help, + "qsub" => \$useqsub, + "input=s" => \$test_set, + "config=s" => \$config, + "weights=s" => \$weights, +) == 0 || @ARGV!=0 || $help) { + print_help(); + exit; +} + +if ($useqsub) { + $use_make = 0; + die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); +} + +my @missing_args = (); + +if (!defined $test_set) { push @missing_args, "--input"; } +if (!defined $config) { push @missing_args, "--config"; } +if (!defined $weights) { push @missing_args, "--weights"; } +die "Please specify missing arguments: " . join (', ', @missing_args) . "\nUse --help for more information.\n" if (@missing_args); + +my @tf = localtime(time); +my $tname = basename($test_set); +$tname =~ s/\.(sgm|sgml|xml)$//i; +my $dir = "eval.$tname." . 
sprintf('%d%02d%02d-%02d%02d%02d', 1900+$tf[5], $tf[4], $tf[3], $tf[2], $tf[1], $tf[0]); + +my $time = unchecked_output("date"); + +check_call("mkdir -p $dir"); + +split_devset($test_set, "$dir/test.input.raw", "$dir/test.refs"); +my $refs = "-r $dir/test.refs"; +my $newsrc = "$dir/test.input"; +enseg("$dir/test.input.raw", $newsrc); +my $src_file = $newsrc; +open F, "<$src_file" or die "Can't read $src_file: $!"; close F; + +my $test_trans="$dir/test.trans"; +my $logdir="$dir/logs"; +my $decoderLog="$logdir/decoder.sentserver.log"; +check_call("mkdir -p $logdir"); + +#decode +print STDERR "RUNNING DECODER AT "; +print STDERR unchecked_output("date"); +my $decoder_cmd = "$decoder -c $config --weights $weights"; +my $pcmd; +if ($use_make) { + $pcmd = "cat $src_file | $parallelize --workdir $dir --use-fork -p $pmem -e $logdir -j $jobs --"; +} else { + $pcmd = "cat $src_file | $parallelize --workdir $dir -p $pmem -e $logdir -j $jobs --"; +} +my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $test_trans"; +check_bash_call($cmd); +print STDERR "DECODER COMPLETED AT "; +print STDERR unchecked_output("date"); +print STDERR "\nOUTPUT: $test_trans\n\n"; +my $bleu = check_output("cat $test_trans | $SCORER $refs -m ibm_bleu"); +chomp $bleu; +print STDERR "BLEU: $bleu\n"; +my $ter = check_output("cat $test_trans | $SCORER $refs -m ter"); +chomp $ter; +print STDERR " TER: $ter\n"; +open TR, ">$dir/test.scores" or die "Can't write $dir/test.scores: $!"; +print TR <<EOT; +### SCORE REPORT ############################################################# + OUTPUT=$test_trans + SCRIPT INPUT=$test_set + DECODER INPUT=$src_file + REFERENCES=$dir/test.refs +------------------------------------------------------------------------------ + BLEU=$bleu + TER=$ter +############################################################################## +EOT +close TR; +my $sr = unchecked_output("cat $dir/test.scores"); +print STDERR "\n\n$sr\n(A copy of this report can be found in $dir/test.scores)\n\n"; +exit 0; + +sub enseg { + my $src = shift; + my $newsrc = shift; + open(SRC, $src); + open(NEWSRC, ">$newsrc"); + my $i=0; + while (my $line=<SRC>){ + chomp $line; + if ($line =~ /^\s*<seg/i) { + if($line =~ /id="[0-9]+"/) { + print NEWSRC "$line\n"; + } else { + die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute"; + } + } else { + print NEWSRC "<seg id=\"$i\">$line</seg>\n"; + } + $i++; + } + close SRC; + close NEWSRC; +} + +sub print_help { + my $executable = basename($0); chomp $executable; + print << "Help"; + +Usage: $executable [options] <ini file> + + $executable --config cdec.ini --weights weights.txt [--jobs N] [--qsub] <testset.in-ref> + +Options: + + --help + Print this message and exit. + + --config <file> + A path to the cdec.ini file. + + --weights <file> + A file specifying feature weights. + + --dir <dir> + Directory for intermediate and output files. + +Job control options: + + --jobs <I> + Number of decoder processes to run in parallel. [default=$default_jobs] + + --qsub + Use qsub to run jobs in parallel (qsub must be configured in + environment/LocalEnvironment.pm) + + --pmem <N> + Amount of physical memory requested for parallel decoding jobs + (used with qsub requests only) + +Help +} + +sub convert { + my ($str) = @_; + my @ps = split /;/, $str; + my %dict = (); + for my $p (@ps) { + my ($k, $v) = split /=/, $p; + $dict{$k} = $v; + } + return %dict; +} + + + +sub cmdline { + return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? 
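+# (These helpers are shared boilerplate with pro.pl/rampion.pl; split_devset
+# below expects one "source ||| ref1 ||| ref2 ..." line per dev sentence.)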
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} + +sub escaped_shell_args { + return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { + return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { + return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} + +sub split_devset { + my ($infile, $outsrc, $outref) = @_; + open F, "<$infile" or die "Can't read $infile: $!"; + open S, ">$outsrc" or die "Can't write $outsrc: $!"; + open R, ">$outref" or die "Can't write $outref: $!"; + while(<F>) { + chomp; + my ($src, @refs) = split /\s*\|\|\|\s*/; + die "Malformed devset line: $_\n" unless scalar @refs > 0; + print S "$src\n"; + print R join(' ||| ', @refs) . "\n"; + } + close R; + close S; + close F; +} + diff --git a/training/entropy.cc b/training/utils/entropy.cc index 4fdbe2be..4fdbe2be 100644 --- a/training/entropy.cc +++ b/training/utils/entropy.cc diff --git a/training/entropy.h b/training/utils/entropy.h index 796589ca..796589ca 100644 --- a/training/entropy.h +++ b/training/utils/entropy.h diff --git a/training/grammar_convert.cc b/training/utils/grammar_convert.cc index 607a7cb9..607a7cb9 100644 --- a/training/grammar_convert.cc +++ b/training/utils/grammar_convert.cc diff --git a/training/lbfgs.h b/training/utils/lbfgs.h index e8baecab..e8baecab 100644 --- a/training/lbfgs.h +++ b/training/utils/lbfgs.h diff --git a/training/lbfgs_test.cc b/training/utils/lbfgs_test.cc index 9678e788..9678e788 100644 --- a/training/lbfgs_test.cc +++ b/training/utils/lbfgs_test.cc diff --git a/training/utils/libcall.pl b/training/utils/libcall.pl new file mode 100644 index 00000000..c7d0f128 --- /dev/null +++ b/training/utils/libcall.pl @@ -0,0 +1,71 @@ +use IPC::Open3; +use Symbol qw(gensym); + +$DUMMY_STDERR = gensym(); +$DUMMY_STDIN = gensym(); + +# Run the command and ignore failures +sub unchecked_call { + system("@_") +} + +# Run the command and return its output, if any ignoring failures +sub unchecked_output { + return `@_` +} + +# WARNING: Do not use this for commands that will return large amounts +# of stdout or stderr -- they might block indefinitely +sub check_output { + print STDERR "Executing and gathering output: @_\n"; + + my $pid = open3($DUMMY_STDIN, \*PH, $DUMMY_STDERR, @_); + my $proc_output = ""; + while( <PH> ) { + $proc_output .= $_; + } + waitpid($pid, 0); + # TODO: Grab signal that the process died from + my $child_exit_status = $? >> 8; + if($child_exit_status == 0) { + return $proc_output; + } else { + print STDERR "ERROR: Execution of @_ failed.\n"; + exit(1); + } +} + +# Based on Moses' safesystem sub +sub check_call { + print STDERR "Executing: @_\n"; + system(@_); + my $exitcode = $? >> 8; + if($exitcode == 0) { + return 0; + } elsif ($? == -1) { + print STDERR "ERROR: Failed to execute: @_\n $!\n"; + exit(1); + + } elsif ($? & 127) { + printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n", + ($? & 127), ($? & 128) ? 
'with' : 'without'; + exit(1); + + } else { + print STDERR "Failed with exit code: $exitcode\n" if $exitcode; + exit($exitcode); + } +} + +sub check_bash_call { + my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_"); + check_call(@args); +} + +sub check_bash_output { + my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_"); + return check_output(@args); +} + +# perl module weirdness... +return 1; diff --git a/training/online_optimizer.cc b/training/utils/online_optimizer.cc index 3ed95452..3ed95452 100644 --- a/training/online_optimizer.cc +++ b/training/utils/online_optimizer.cc diff --git a/training/online_optimizer.h b/training/utils/online_optimizer.h index 28d89344..28d89344 100644 --- a/training/online_optimizer.h +++ b/training/utils/online_optimizer.h diff --git a/training/optimize.cc b/training/utils/optimize.cc index 41ac90d8..41ac90d8 100644 --- a/training/optimize.cc +++ b/training/utils/optimize.cc diff --git a/training/optimize.h b/training/utils/optimize.h index 07943b44..07943b44 100644 --- a/training/optimize.h +++ b/training/utils/optimize.h diff --git a/training/optimize_test.cc b/training/utils/optimize_test.cc index bff2ca03..bff2ca03 100644 --- a/training/optimize_test.cc +++ b/training/utils/optimize_test.cc diff --git a/training/utils/parallelize.pl b/training/utils/parallelize.pl new file mode 100755 index 00000000..4197e0e5 --- /dev/null +++ b/training/utils/parallelize.pl @@ -0,0 +1,423 @@ +#!/usr/bin/env perl + +# Author: Adam Lopez +# +# This script takes a command that processes input +# from stdin one-line-at-time, and parallelizes it +# on the cluster using David Chiang's sentserver/ +# sentclient architecture. +# +# Prerequisites: the command *must* read each line +# without waiting for subsequent lines of input +# (for instance, a command which must read all lines +# of input before processing will not work) and +# return it to the output *without* buffering +# multiple lines. + +#TODO: if -j 1, run immediately, not via sentserver? possible differences in environment might make debugging harder + +#ANNOYANCE: if input is shorter than -j n lines, or at the very last few lines, repeatedly sleeps. time cut down to 15s from 60s + +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; } +use LocalConfig; + +use Cwd qw/ abs_path cwd getcwd /; +use File::Temp qw/ tempfile /; +use Getopt::Long; +use IPC::Open2; +use strict; +use POSIX ":sys_wait_h"; + +use File::Basename; +my $myDir = dirname(__FILE__); +print STDERR __FILE__." -> $myDir\n"; +push(@INC, $myDir); +require "libcall.pl"; + +my $tailn=5; # +0 = concatenate all the client logs. 
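+# Runtime knobs: -j/--jobs sets the number of worker clients and --pmem the
+# per-job memory request; a free TCP port for the sentserver is probed
+# starting at --baseport plus a random offset (see the port scan below).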
5 = last 5 lines +my $recycle_clients; # spawn new clients when previous ones terminate +my $stay_alive; # dont let server die when having zero clients +my $joblist = ""; +my $errordir=""; +my $multiline; +my $workdir = '.'; +my $numnodes = 8; +my $user = $ENV{"USER"}; +my $pmem = "9g"; +my $basep=50300; +my $randp=300; +my $tryp=50; +my $no_which; +my $no_cd; + +my $DEBUG=$ENV{DEBUG}; +print STDERR "DEBUG=$DEBUG output enabled.\n" if $DEBUG; +my $verbose = 1; +sub verbose { + if ($verbose) { + print STDERR @_,"\n"; + } +} +sub debug { + if ($DEBUG) { + my ($package, $filename, $line) = caller; + print STDERR "DEBUG: $filename($line): ",join(' ',@_),"\n"; + } +} +my $is_shell_special=qr.[ \t\n\\><|&;"'`~*?{}$!()].; +my $shell_escape_in_quote=qr.[\\"\$`!].; +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + return '""' unless $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} +sub preview_files { + my ($l,$skipempty,$footer,$n)=@_; + $n=$tailn unless defined $n; + my @f=grep { ! ($skipempty && -z $_) } @$l; + my $fn=join(' ',map {escape_shell($_)} @f); + my $cmd="tail -n $n $fn"; + unchecked_output("$cmd").($footer?"\nNONEMPTY FILES:\n$fn\n":""); +} +sub prefix_dirname($) { + #like `dirname but if ends in / then return the whole thing + local ($_)=@_; + if (/\/$/) { + $_; + } else { + s#/[^/]$##; + $_ ? $_ : ''; + } +} +sub ensure_final_slash($) { + local ($_)=@_; + m#/$# ? $_ : ($_."/"); +} +sub extend_path($$;$$) { + my ($base,$ext,$mkdir,$baseisdir)=@_; + if (-d $base) { + $base.="/"; + } else { + my $dir; + if ($baseisdir) { + $dir=$base; + $base.='/' unless $base =~ /\/$/; + } else { + $dir=prefix_dirname($base); + } + my @cmd=("/bin/mkdir","-p",$dir); + check_call(@cmd) if $mkdir; + } + return $base.$ext; +} + +my $abscwd=abs_path(&getcwd); +sub print_help; + +my $use_fork; +my @pids; + +# Process command-line options +unless (GetOptions( + "stay-alive" => \$stay_alive, + "recycle-clients" => \$recycle_clients, + "error-dir=s" => \$errordir, + "multi-line" => \$multiline, + "workdir=s" => \$workdir, + "use-fork" => \$use_fork, + "verbose" => \$verbose, + "jobs=i" => \$numnodes, + "pmem=s" => \$pmem, + "baseport=i" => \$basep, +# "iport=i" => \$randp, #for short name -i + "no-which!" => \$no_which, + "no-cd!" => \$no_cd, + "tailn=s" => \$tailn, +) && scalar @ARGV){ + print_help(); + die "bad options."; +} + +my $cmd = ""; +my $prog=shift; +if ($no_which) { + $cmd=$prog; +} else { + $cmd=check_output("which $prog"); + chomp $cmd; + die "$prog not found - $cmd" unless $cmd; +} +#$cmd=abs_path($cmd); +for my $arg (@ARGV) { + $cmd .= " ".escape_shell($arg); +} +die "Please specify a command to parallelize\n" if $cmd eq ''; + +my $cdcmd=$no_cd ? 
'' : ("cd ".escape_shell($abscwd)."\n"); + +my $executable = $cmd; +$executable =~ s/^\s*(\S+)($|\s.*)/$1/; +$executable=check_output("basename $executable"); +chomp $executable; + + +print STDERR "Parallelizing ($numnodes ways): $cmd\n\n"; + +# create -e dir and save .sh +use File::Temp qw/tempdir/; +unless ($errordir) { + $errordir=tempdir("$executable.XXXXXX",CLEANUP=>1); +} +if ($errordir) { + my $scriptfile=extend_path("$errordir/","$executable.sh",1,1); + -d $errordir || die "should have created -e dir $errordir"; + open SF,">",$scriptfile || die; + print SF "$cdcmd$cmd\n"; + close SF; + chmod 0755,$scriptfile; + $errordir=abs_path($errordir); + &verbose("-e dir: $errordir"); +} + +# set cleanup handler +my @cleanup_cmds; +sub cleanup; +sub cleanup_and_die; +$SIG{INT} = "cleanup_and_die"; +$SIG{TERM} = "cleanup_and_die"; +$SIG{HUP} = "cleanup_and_die"; + +# other subs: +sub numof_live_jobs; +sub launch_job_on_node; + + +# vars +my $mydir = check_output("dirname $0"); chomp $mydir; +my $sentserver = "$mydir/sentserver"; +my $sentclient = "$mydir/sentclient"; +my $host = check_output("hostname"); +chomp $host; + + +# find open port +srand; +my $port = 50300+int(rand($randp)); +my $endp=$port+$tryp; +sub listening_port_lines { + my $quiet=$verbose?'':'2>/dev/null'; + return unchecked_output("netstat -a -n $quiet | grep LISTENING | grep -i tcp"); +} +my $netstat=&listening_port_lines; + +if ($verbose){ print STDERR "Testing port $port...";} + +while ($netstat=~/$port/ || &listening_port_lines=~/$port/){ + if ($verbose){ print STDERR "port is busy\n";} + $port++; + if ($port > $endp){ + die "Unable to find open port\n"; + } + if ($verbose){ print STDERR "Testing port $port... "; } +} +if ($verbose){ + print STDERR "port $port is available\n"; +} + +my $key = int(rand()*1000000); + +my $multiflag = ""; +if ($multiline){ $multiflag = "-m"; print STDERR "expecting multiline output.\n"; } +my $stay_alive_flag = ""; +if ($stay_alive){ $stay_alive_flag = "--stay-alive"; print STDERR "staying alive while no clients are connected.\n"; } + +my $node_count = 0; +my $script = ""; +# fork == one thread runs the sentserver, while the +# other spawns the sentclient commands. +my $pid = fork; +if ($pid == 0) { # child + sleep 8; # give other thread time to start sentserver + $script = "$cdcmd$sentclient $host:$port:$key $cmd"; + + if ($verbose){ + print STDERR "Client script:\n====\n"; + print STDERR $script; + print STDERR "====\n"; + } + for (my $jobn=0; $jobn<$numnodes; $jobn++){ + launch_job(); + } + if ($recycle_clients) { + my $ret; + my $livejobs; + while (1) { + $ret = waitpid($pid, WNOHANG); + #print STDERR "waitpid $pid ret = $ret \n"; + last if ($ret != 0); + $livejobs = numof_live_jobs(); + if ($numnodes >= $livejobs ) { # a client terminated, OR # lines of input was less than -j + print STDERR "num of requested nodes = $numnodes; num of currently live jobs = $livejobs; Client terminated - launching another.\n"; + launch_job(); + } else { + sleep 15; + } + } + } + print STDERR "CHILD PROCESSES SPAWNED ... 
WAITING\n"; + for my $p (@pids) { + waitpid($p, 0); + } +} else { +# my $todo = "$sentserver -k $key $multiflag $port "; + my $todo = "$sentserver -k $key $multiflag $port $stay_alive_flag "; + if ($verbose){ print STDERR "Running: $todo\n"; } + check_call($todo); + print STDERR "Call to $sentserver returned.\n"; + cleanup(); + exit(0); +} + +sub numof_live_jobs { + if ($use_fork) { + die "not implemented"; + } else { + # We can probably continue decoding if the qstat error is only temporary + my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat"))); + return ($#livejobs + 1); + } +} +my (@errors,@outs,@cmds); + +sub launch_job { + if ($use_fork) { return launch_job_fork(); } + my $errorfile = "/dev/null"; + my $outfile = "/dev/null"; + $node_count++; + my $clientname = $executable; + $clientname =~ s/^(.{4}).*$/$1/; + $clientname = "$clientname.$node_count"; + if ($errordir){ + $errorfile = "$errordir/$clientname.ER"; + $outfile = "$errordir/$clientname.OU"; + push @errors,$errorfile; + push @outs,$outfile; + } + my $todo = qsub_args($pmem) . " -N $clientname -o $outfile -e $errorfile"; + push @cmds,$todo; + + print STDERR "Running: $todo\n"; + local(*QOUT, *QIN); + open2(\*QOUT, \*QIN, $todo) or die "Failed to open2: $!"; + print QIN $script; + close QIN; + while (my $jobid=<QOUT>){ + chomp $jobid; + if ($verbose){ print STDERR "Launched client job: $jobid"; } + $jobid =~ s/^(\d+)(.*?)$/\1/g; + $jobid =~ s/^Your job (\d+) .*$/\1/; + print STDERR " short job id $jobid\n"; + if ($verbose){ + print STDERR "cd: $abscwd\n"; + print STDERR "cmd: $cmd\n"; + } + if ($joblist == "") { $joblist = $jobid; } + else {$joblist = $joblist . "\|" . $jobid; } + my $cleanfn="qdel $jobid 2> /dev/null"; + push(@cleanup_cmds, $cleanfn); + } + close QOUT; +} + +sub launch_job_fork { + my $errorfile = "/dev/null"; + my $outfile = "/dev/null"; + $node_count++; + my $clientname = $executable; + $clientname =~ s/^(.{4}).*$/$1/; + $clientname = "$clientname.$node_count"; + if ($errordir){ + $errorfile = "$errordir/$clientname.ER"; + $outfile = "$errordir/$clientname.OU"; + push @errors,$errorfile; + push @outs,$outfile; + } + my $pid = fork; + if ($pid == 0) { + my ($fh, $scr_name) = get_temp_script(); + print $fh $script; + close $fh; + my $todo = "/bin/bash -xeo pipefail $scr_name 1> $outfile 2> $errorfile"; + print STDERR "EXEC: $todo\n"; + my $out = check_output("$todo"); + unlink $scr_name or warn "Failed to remove $scr_name"; + exit 0; + } else { + push @pids, $pid; + } +} + +sub get_temp_script { + my ($fh, $filename) = tempfile( "$workdir/workXXXX", SUFFIX => '.sh'); + return ($fh, $filename); +} + +sub cleanup_and_die { + cleanup(); + die "\n"; +} + +sub cleanup { + print STDERR "Cleaning up...\n"; + for $cmd (@cleanup_cmds){ + print STDERR " Cleanup command: $cmd\n"; + eval $cmd; + } + print STDERR "outputs:\n",preview_files(\@outs,1),"\n"; + print STDERR "errors:\n",preview_files(\@errors,1),"\n"; + print STDERR "cmd:\n",$cmd,"\n"; + print STDERR " cat $errordir/*.ER\nfor logs.\n"; + print STDERR "Cleanup finished.\n"; +} + +sub print_help +{ + my $name = check_output("basename $0"); chomp $name; + print << "Help"; + +usage: $name [options] + + Automatic black-box parallelization of commands. + +options: + + --use-fork + Instead of using qsub, use fork. + + -e, --error-dir <dir> + Retain output files from jobs in <dir>, rather + than silently deleting them. + + -m, --multi-line + Expect that command may produce multiple output + lines for a single input line. 
diff --git a/training/risk.cc b/training/utils/risk.cc
index d5a12cfd..d5a12cfd 100644
--- a/training/risk.cc
+++ b/training/utils/risk.cc
diff --git a/training/risk.h b/training/utils/risk.h
index 2e8db0fb..2e8db0fb 100644
--- a/training/risk.h
+++ b/training/utils/risk.h
diff --git a/training/utils/sentclient.cc b/training/utils/sentclient.cc
new file mode 100644
index 00000000..91d994ab
--- /dev/null
+++ b/training/utils/sentclient.cc
@@ -0,0 +1,76 @@
+/* Copyright (c) 2001 by David Chiang. All rights reserved. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <string.h>
+
+#include "sentserver.h"
+
+int main (int argc, char *argv[]) {
+  int sock, port;
+  char *s, *key;
+  struct hostent *hp;
+  struct sockaddr_in server;
+  int errors = 0;
+
+  if (argc < 3) {
+    fprintf(stderr, "Usage: sentclient host[:port[:key]] command [args ...]\n");
+    exit(1);
+  }
+
+  s = strchr(argv[1], ':');
+  key = NULL;
+
+  if (s == NULL) {
+    port = DEFAULT_PORT;
+  } else {
+    *s = '\0';
+    s += 1;
+    /* dumb hack */
+    key = strchr(s, ':');
+    if (key != NULL) {
+      *key = '\0';
+      key += 1;
+    }
+    port = atoi(s);
+  }
+
+  sock = socket(AF_INET, SOCK_STREAM, 0);
+
+  hp = gethostbyname(argv[1]);
+  if (hp == NULL) {
+    fprintf(stderr, "unknown host %s\n", argv[1]);
+    exit(1);
+  }
+
+  bzero((char *)&server, sizeof(server));
+  bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length);
+  server.sin_family = hp->h_addrtype;
+  server.sin_port = htons(port);
+
+  while (connect(sock, (struct sockaddr *)&server, sizeof(server)) < 0) {
+    perror("connect()");
+    sleep(1);
+    errors++;
+    if (errors > 5)
+      exit(1);
+  }
+
+  close(0);
+  close(1);
+  dup2(sock, 0);
+  dup2(sock, 1);
+
+  if (key != NULL) {
+    write(1, key, strlen(key));
+    write(1, "\n", 1);
+  }
+
+  execvp(argv[2], argv+2);
+  return 0;
+}
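
The trick at the heart of sentclient is the pair of dup2() calls: the connected socket is spliced over file descriptors 0 and 1 before execvp(), so the wrapped command reads tasks from the server and writes results back without knowing a network is involved. Assuming both binaries are built, a quick smoke test is to use cat as the worker (the port and key values below are arbitrary); the server should then reproduce its input on stdout:

    ./sentserver -k 1234 50299 < input.txt > output.txt &
    ./sentclient localhost:50299:1234 cat
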
diff --git a/training/utils/sentserver.cc b/training/utils/sentserver.cc
new file mode 100644
index 00000000..b425955f
--- /dev/null
+++ b/training/utils/sentserver.cc
@@ -0,0 +1,515 @@
+/* Copyright (c) 2001 by David Chiang. All rights reserved. */
+
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <netinet/in.h>
+#include <sched.h>
+#include <pthread.h>
+#include <errno.h>
+
+#include "sentserver.h"
+
+#define MAX_CLIENTS 64
+
+struct clientinfo {
+  int s;
+  struct sockaddr_in sin;
+};
+
+struct line {
+  int id;
+  char *s;
+  int status;
+  struct line *next;
+} *head, **ptail;
+
+int n_sent = 0, n_received = 0, n_flushed = 0;
+
+#define STATUS_RUNNING 0
+#define STATUS_ABORTED 1
+#define STATUS_FINISHED 2
+
+pthread_mutex_t queue_mutex = PTHREAD_MUTEX_INITIALIZER;
+pthread_mutex_t clients_mutex = PTHREAD_MUTEX_INITIALIZER;
+pthread_mutex_t input_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+int n_clients = 0;
+int s;
+int expect_multiline_output = 0;
+int log_mutex = 0;
+int stay_alive = 0; /* don't panic and die with zero clients */
+
+void queue_finish(struct line *node, char *s, int fid);
+char * read_line(int fd, int multiline);
+void done (int code);
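
The head/ptail pair above is the classic tail-pointer queue: ptail always addresses the next-field slot that a fresh node should be stored into, so appends are O(1) with no empty-list special case, and setting ptail to NULL later doubles as the end-of-input sentinel that queue_finish checks. The idiom in isolation, a sketch reusing the struct line defined above:

    struct line *head = NULL;
    struct line **ptail = &head;    /* empty queue: the tail slot is head itself */

    void append(struct line *node) {
      node->next = NULL;
      *ptail = node;                /* store into whichever next-slot is currently last */
      ptail = &node->next;          /* the new node's next-slot becomes the tail slot */
    }
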
+struct line * queue_get(int fid) {
+  struct line *cur;
+  char *s, *synch;
+
+  if (log_mutex) fprintf(stderr, "Getting data for fid %d\n", fid);
+  if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
+  pthread_mutex_lock(&queue_mutex);
+
+  /* First, check for aborted sentences. */
+
+  if (log_mutex) fprintf(stderr, " Checking queue for aborted jobs (fid %d)\n", fid);
+  for (cur = head; cur != NULL; cur = cur->next) {
+    if (cur->status == STATUS_ABORTED) {
+      cur->status = STATUS_RUNNING;
+
+      if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+      pthread_mutex_unlock(&queue_mutex);
+
+      return cur;
+    }
+  }
+  if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+  pthread_mutex_unlock(&queue_mutex);
+
+  /* Otherwise, read a new one. */
+  if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid);
+  if (log_mutex) fprintf(stderr, " Reading input for new data (fid %d)\n", fid);
+  pthread_mutex_lock(&input_mutex);
+  s = read_line(0,0);
+
+  while (s) {
+    if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
+    pthread_mutex_lock(&queue_mutex);
+    if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid);
+    pthread_mutex_unlock(&input_mutex);
+
+    cur = (line*)malloc(sizeof (struct line));
+    cur->id = n_sent;
+    cur->s = s;
+    cur->next = NULL;
+
+    *ptail = cur;
+    ptail = &cur->next;
+
+    n_sent++;
+
+    if (strcmp(s,"===SYNCH===\n")==0){
+      fprintf(stderr, "Received ===SYNCH=== signal (fid %d)\n", fid);
+      // Note: queue_finish calls free(cur->s).
+      // Therefore we need to create a new string here.
+      synch = (char*)malloc((strlen("===SYNCH===\n")+2) * sizeof (char));
+      synch = strcpy(synch, s);
+
+      if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+      pthread_mutex_unlock(&queue_mutex);
+      queue_finish(cur, synch, fid); /* handles its own lock */
+
+      if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid);
+      if (log_mutex) fprintf(stderr, " Reading input for new data (fid %d)\n", fid);
+      pthread_mutex_lock(&input_mutex);
+
+      s = read_line(0,0);
+    } else {
+      if (log_mutex) fprintf(stderr, " Received new data %d (fid %d)\n", cur->id, fid);
+      cur->status = STATUS_RUNNING;
+      if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+      pthread_mutex_unlock(&queue_mutex);
+      return cur;
+    }
+  }
+
+  if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid);
+  pthread_mutex_unlock(&input_mutex);
+  /* Only way to reach this point: no more input */
+
+  if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
+  pthread_mutex_lock(&queue_mutex);
+  if (head == NULL) {
+    fprintf(stderr, "Reached end of file. Exiting.\n");
+    done(0);
+  } else
+    ptail = NULL; /* This serves as a signal that there is no more input */
+  if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+  pthread_mutex_unlock(&queue_mutex);
+
+  return NULL;
+}
+
+void queue_panic() {
+  struct line *next;
+  while (head && head->status != STATUS_RUNNING) {
+    /* Write out finished sentences */
+    if (head->status == STATUS_FINISHED) {
+      fputs(head->s, stdout);
+      fflush(stdout);
+    }
+    /* Write out blank line for unfinished sentences */
+    if (head->status == STATUS_ABORTED) {
+      fputs("\n", stdout);
+      fflush(stdout);
+    }
+    /* By definition, there cannot be any RUNNING sentences, since
+       this function is only called when n_clients == 0 */
+    free(head->s);
+    next = head->next;
+    free(head);
+    head = next;
+    n_flushed++;
+  }
+  fclose(stdout);
+  fprintf(stderr, "All clients died. Panicking, flushing completed sentences and exiting.\n");
+  done(1);
+}
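
As the comments in queue_get note, queue_finish frees the node's old buffer and takes ownership of the buffer passed in, which is why the ===SYNCH=== branch copies the line before handing it back. The contract in miniature, with strdup standing in for the malloc-plus-strcpy above:

    #include <string.h>

    char *copy = strdup(s);        /* queue_finish will free() this copy later */
    queue_finish(cur, copy, fid);  /* also free()s the buffer previously in cur->s */
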
+void queue_abort(struct line *node, int fid) {
+  if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
+  pthread_mutex_lock(&queue_mutex);
+  node->status = STATUS_ABORTED;
+  if (n_clients == 0) {
+    if (stay_alive) {
+      fprintf(stderr, "Warning! No live clients detected! Staying alive, will retry soon.\n");
+    } else {
+      queue_panic();
+    }
+  }
+  if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+  pthread_mutex_unlock(&queue_mutex);
+}
+
+
+void queue_print() {
+  struct line *cur;
+
+  fprintf(stderr, " Queue\n");
+
+  for (cur = head; cur != NULL; cur = cur->next) {
+    switch(cur->status) {
+      case STATUS_RUNNING:
+        fprintf(stderr, " %d running ", cur->id); break;
+      case STATUS_ABORTED:
+        fprintf(stderr, " %d aborted ", cur->id); break;
+      case STATUS_FINISHED:
+        fprintf(stderr, " %d finished ", cur->id); break;
+    }
+    fprintf(stderr, "\n");
+    //fprintf(stderr, cur->s);
+  }
+}
+
+void queue_finish(struct line *node, char *s, int fid) {
+  struct line *next;
+  if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
+  pthread_mutex_lock(&queue_mutex);
+
+  free(node->s);
+  node->s = s;
+  node->status = STATUS_FINISHED;
+  n_received++;
+
+  /* Flush out finished nodes */
+  while (head && head->status == STATUS_FINISHED) {
+
+    if (log_mutex) fprintf(stderr, " Flushing finished node %d\n", head->id);
+
+    fputs(head->s, stdout);
+    fflush(stdout);
+    if (log_mutex) fprintf(stderr, " Flushed node %d\n", head->id);
+    free(head->s);
+
+    next = head->next;
+    free(head);
+
+    head = next;
+
+    n_flushed++;
+
+    if (head == NULL) { /* empty queue */
+      if (ptail == NULL) { /* This can only happen if set in queue_get as signal that there is no more input. */
+        fprintf(stderr, "All sentences finished. Exiting.\n");
+        done(0);
+      } else /* ptail pointed at something which was just popped off the stack -- reset to head */
+        ptail = &head;
+    }
+  }
+
+  if (log_mutex && head) fprintf(stderr, " Flushing output %d\n", head->id);
+  fflush(stdout);
+  fprintf(stderr, "%d sentences sent, %d sentences finished, %d sentences flushed\n", n_sent, n_received, n_flushed);
+
+  if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+  pthread_mutex_unlock(&queue_mutex);
+}
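
read_line, below, and the key handshake in main both poll a descriptor with the same select()-plus-timeout pattern. Factored out as a hypothetical helper, not part of this commit:

    #include <sys/select.h>

    /* Return nonzero iff fd becomes readable within the given number of
       seconds. The first argument to select() must exceed the largest fd
       in the set; fd+1 is the tight bound (the code in this file passes
       FD_SETSIZE instead, which also works as long as fd < FD_SETSIZE). */
    int readable_within(int fd, int seconds) {
      fd_set set;
      struct timeval timeout;
      FD_ZERO(&set);
      FD_SET(fd, &set);
      timeout.tv_sec = seconds;
      timeout.tv_usec = 0;
      return select(fd + 1, &set, NULL, NULL, &timeout) > 0;
    }
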
+char * read_line(int fd, int multiline) {
+  int size = 80;
+  char errorbuf[100];
+  char *s = (char*)malloc(size+2);
+  int result, errors = 0;
+  int i = 0;
+
+  result = read(fd, s+i, 1);
+
+  while (1) {
+    if (result < 0) {
+      perror("read()");
+      sprintf(errorbuf, "Error code: %d\n", errno);
+      fputs(errorbuf, stderr);
+      errors++;
+      if (errors > 5) {
+        free(s);
+        return NULL;
+      } else {
+        sleep(1); /* retry after delay */
+      }
+    } else if (result == 0) {
+      break;
+    } else if (multiline==0 && s[i] == '\n') {
+      break;
+    } else {
+      if (s[i] == '\n'){
+        /* If we've reached this point, then multiline must be 1, and we're
+           going to poll the fd for an additional line of data. The basic
+           design is to run a select on the file descriptor fd. Select will
+           return under two conditions: if there is data on the fd, or if a
+           timeout is reached. If select returns because there's data ready,
+           keep going; else assume there's no more and return the data we
+           already have. */
+
+        fd_set set;
+        FD_ZERO(&set);
+        FD_SET(fd, &set);
+
+        struct timeval timeout;
+        timeout.tv_sec = 3; // number of seconds for timeout
+        timeout.tv_usec = 0;
+
+        int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout);
+        if (ready < 1){
+          break; // no more data, stop looping
+        }
+      }
+      i++;
+
+      if (i == size) {
+        size = size*2;
+        s = (char*)realloc(s, size+2);
+      }
+    }
+
+    result = read(fd, s+i, 1);
+  }
+
+  if (result == 0 && i == 0) { /* end of file */
+    free(s);
+    return NULL;
+  }
+
+  s[i] = '\n';
+  s[i+1] = '\0';
+
+  return s;
+}
+
+void * new_client(void *arg) {
+  struct clientinfo *client = (struct clientinfo *)arg;
+  struct line *cur;
+  int result;
+  char *s;
+  char errorbuf[100];
+
+  pthread_mutex_lock(&clients_mutex);
+  n_clients++;
+  pthread_mutex_unlock(&clients_mutex);
+
+  fprintf(stderr, "Client connected (%d connected)\n", n_clients);
+
+  for (;;) {
+
+    cur = queue_get(client->s);
+
+    if (cur) {
+      /* fprintf(stderr, "Sending to client: %s", cur->s); */
+      fprintf(stderr, "Sending data %d to client (fid %d)\n", cur->id, client->s);
+      result = write(client->s, cur->s, strlen(cur->s));
+      if (result < 0 || (size_t)result < strlen(cur->s)){ /* error or short write */
+        perror("write()");
+        sprintf(errorbuf, "Error code: %d\n", errno);
+        fputs(errorbuf, stderr);
+
+        pthread_mutex_lock(&clients_mutex);
+        n_clients--;
+        pthread_mutex_unlock(&clients_mutex);
+
+        fprintf(stderr, "Client died (%d connected)\n", n_clients);
+        queue_abort(cur, client->s);
+
+        close(client->s);
+        free(client);
+
+        pthread_exit(NULL);
+      }
+    } else {
+      close(client->s);
+      pthread_mutex_lock(&clients_mutex);
+      n_clients--;
+      pthread_mutex_unlock(&clients_mutex);
+      fprintf(stderr, "Client dismissed (%d connected)\n", n_clients);
+      pthread_exit(NULL);
+    }
+
+    s = read_line(client->s,expect_multiline_output);
+    if (s) {
+      /* fprintf(stderr, "Client (fid %d) returned: %s", client->s, s); */
+      fprintf(stderr, "Client (fid %d) returned data %d\n", client->s, cur->id);
+//    queue_print();
+      queue_finish(cur, s, client->s);
+    } else {
+      pthread_mutex_lock(&clients_mutex);
+      n_clients--;
+      pthread_mutex_unlock(&clients_mutex);
+
+      fprintf(stderr, "Client died (%d connected)\n", n_clients);
+      queue_abort(cur, client->s);
+
+      close(client->s);
+      free(client);
+
+      pthread_exit(NULL);
+    }
+
+  }
+  return 0;
+}
+
+void done (int code) {
+  close(s);
+  exit(code);
+}
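
new_client above treats any short write() as a dead client; on a blocking socket that is almost always right, but POSIX permits short writes and EINTR, so a more defensive variant would loop until the buffer drains. A sketch of such a helper, not part of this commit:

    #include <errno.h>
    #include <sys/types.h>
    #include <unistd.h>

    /* Write all n bytes of buf to fd, retrying short writes and EINTR.
       Returns n on success, -1 on a real error. */
    ssize_t write_all(int fd, const char *buf, size_t n) {
      size_t off = 0;
      while (off < n) {
        ssize_t k = write(fd, buf + off, n - off);
        if (k < 0) {
          if (errno == EINTR) continue;   /* interrupted: just retry */
          return -1;
        }
        off += (size_t)k;
      }
      return (ssize_t)n;
    }
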
+int main (int argc, char *argv[]) {
+  struct sockaddr_in sin, from;
+  int g;
+  socklen_t len;
+  struct clientinfo *client;
+  int port;
+  int opt;
+  int errors = 0;
+  int argi;
+  char *key = NULL, *client_key;
+  int use_key = 0;
+  /* the key stuff here doesn't provide any real measure of security,
+     it's mainly to keep jobs from bumping into each other. */
+
+  pthread_t tid;
+  port = DEFAULT_PORT;
+
+  for (argi=1; argi < argc; argi++){
+    if (strcmp(argv[argi], "-m")==0){
+      expect_multiline_output = 1;
+    } else if (strcmp(argv[argi], "-k")==0){
+      argi++;
+      if (argi == argc){
+        fprintf(stderr, "Key must be specified after -k\n");
+        exit(1);
+      }
+      key = argv[argi];
+      use_key = 1;
+    } else if (strcmp(argv[argi], "--stay-alive")==0){
+      stay_alive = 1; /* don't panic and die with zero clients */
+    } else {
+      port = atoi(argv[argi]);
+    }
+  }
+
+  /* Initialize data structures */
+  head = NULL;
+  ptail = &head;
+
+  /* Set up listener */
+  s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+  opt = 1;
+  setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
+
+  sin.sin_family = AF_INET;
+  sin.sin_addr.s_addr = htonl(INADDR_ANY);
+  sin.sin_port = htons(port);
+  while (bind(s, (struct sockaddr *) &sin, sizeof(sin)) < 0) {
+    perror("bind()");
+    sleep(1);
+    errors++;
+    if (errors > 100)
+      exit(1);
+  }
+
+  len = sizeof(sin);
+  getsockname(s, (struct sockaddr *) &sin, &len);
+
+  fprintf(stderr, "Listening on port %hu\n", ntohs(sin.sin_port));
+
+  while (listen(s, MAX_CLIENTS) < 0) {
+    perror("listen()");
+    sleep(1);
+    errors++;
+    if (errors > 100)
+      exit(1);
+  }
+
+  for (;;) {
+    len = sizeof(from);
+    g = accept(s, (struct sockaddr *)&from, &len);
+    if (g < 0) {
+      perror("accept()");
+      sleep(1);
+      continue;
+    }
+    client = (clientinfo*)malloc(sizeof(struct clientinfo));
+    client->s = g;
+    bcopy(&from, &client->sin, len);
+
+    if (use_key){
+      fd_set set;
+      FD_ZERO(&set);
+      FD_SET(client->s, &set);
+
+      struct timeval timeout;
+      timeout.tv_sec = 3; // number of seconds for timeout
+      timeout.tv_usec = 0;
+
+      int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout);
+      if (ready < 1){
+        fprintf(stderr, "Prospective client failed to respond with correct key.\n");
+        close(client->s);
+        free(client);
+      } else {
+        client_key = read_line(client->s,0);
+        if (client_key == NULL){
+          fprintf(stderr, "Prospective client failed to respond with correct key.\n");
+          close(client->s);
+          free(client);
+          continue;
+        }
+        client_key[strlen(client_key)-1] = '\0'; /* chop trailing newline */
+        if (strcmp(key, client_key)==0){
+          pthread_create(&tid, NULL, new_client, client);
+        } else {
+          fprintf(stderr, "Prospective client failed to respond with correct key.\n");
+          close(client->s);
+          free(client);
+        }
+        free(client_key);
+      }
+    } else {
+      pthread_create(&tid, NULL, new_client, client);
+    }
+  }
+}
diff --git a/training/utils/sentserver.h b/training/utils/sentserver.h
new file mode 100644
index 00000000..cd17a546
--- /dev/null
+++ b/training/utils/sentserver.h
@@ -0,0 +1,6 @@
+#ifndef SENTSERVER_H
+#define SENTSERVER_H
+
+#define DEFAULT_PORT 50000
+
+#endif