author    Avneesh Saluja <asaluja@gmail.com>    2013-03-28 18:28:16 -0700
committer Avneesh Saluja <asaluja@gmail.com>    2013-03-28 18:28:16 -0700
commit    3d8d656fa7911524e0e6885647173474524e0784 (patch)
tree      81b1ee2fcb67980376d03f0aa48e42e53abff222 /training
parent    be7f57fdd484e063775d7abf083b9fa4c403b610 (diff)
parent    96fedabebafe7a38a6d5928be8fff767e411d705 (diff)
fixed conflicts
Diffstat (limited to 'training')
-rw-r--r--  training/Jamfile | 25
-rw-r--r--  training/Makefile.am | 100
-rwxr-xr-x  training/add-model1-features-to-scfg.pl | 93
-rw-r--r--  training/collapse_weights.cc | 110
-rw-r--r--  training/crf/Makefile.am | 31
-rw-r--r--  training/crf/baum_welch_example/README.md | 32
-rw-r--r--  training/crf/baum_welch_example/cdec.ini | 5
-rwxr-xr-x  training/crf/baum_welch_example/random_init.pl | 9
-rw-r--r--  training/crf/baum_welch_example/tagset.txt | 1
-rw-r--r--  training/crf/baum_welch_example/train.txt | 2000
-rw-r--r--  training/crf/cllh_observer.cc (renamed from training/cllh_observer.cc) | 2
-rw-r--r--  training/crf/cllh_observer.h (renamed from training/cllh_observer.h) | 0
-rw-r--r--  training/crf/mpi_batch_optimize.cc (renamed from training/mpi_batch_optimize.cc) | 2
-rw-r--r--  training/crf/mpi_baum_welch.cc | 316
-rw-r--r--  training/crf/mpi_compute_cllh.cc (renamed from training/mpi_compute_cllh.cc) | 0
-rw-r--r--  training/crf/mpi_extract_features.cc (renamed from training/mpi_extract_features.cc) | 0
-rw-r--r--  training/crf/mpi_extract_reachable.cc (renamed from training/mpi_extract_reachable.cc) | 0
-rw-r--r--  training/crf/mpi_flex_optimize.cc (renamed from training/mpi_flex_optimize.cc) | 0
-rw-r--r--  training/crf/mpi_online_optimize.cc (renamed from training/mpi_online_optimize.cc) | 16
-rwxr-xr-x  training/dep-reorder/conll2reordering-forest.pl | 65
-rw-r--r--  training/dep-reorder/george.conll | 4
-rwxr-xr-x  training/dep-reorder/scripts/conll2simplecfg.pl | 57
-rw-r--r--  training/dpmert/Makefile.am | 27
-rw-r--r--  training/dpmert/ces.cc | 90
-rw-r--r--  training/dpmert/ces.h | 16
-rwxr-xr-x  training/dpmert/divide_refs.py | 15
-rwxr-xr-x  training/dpmert/dpmert.pl | 618
-rw-r--r--  training/dpmert/error_surface.cc | 42
-rw-r--r--  training/dpmert/error_surface.h | 24
-rwxr-xr-x  training/dpmert/line_mediator.pl | 116
-rw-r--r--  training/dpmert/line_optimizer.cc | 114
-rw-r--r--  training/dpmert/line_optimizer.h | 48
-rw-r--r--  training/dpmert/lo_test.cc | 229
-rw-r--r--  training/dpmert/mert_geometry.cc | 185
-rw-r--r--  training/dpmert/mert_geometry.h | 81
-rw-r--r--  training/dpmert/mr_dpmert_generate_mapper_input.cc | 81
-rw-r--r--  training/dpmert/mr_dpmert_map.cc | 112
-rw-r--r--  training/dpmert/mr_dpmert_reduce.cc | 77
-rw-r--r--  training/dpmert/test_aer/README | 8
-rw-r--r--  training/dpmert/test_aer/cdec.ini | 3
-rw-r--r--  training/dpmert/test_aer/corpus.src | 3
-rw-r--r--  training/dpmert/test_aer/grammar | 12
-rw-r--r--  training/dpmert/test_aer/ref.0 | 3
-rw-r--r--  training/dpmert/test_aer/weights | 13
-rw-r--r--  training/dpmert/test_data/0.json.gz | bin 0 -> 13709 bytes
-rw-r--r--  training/dpmert/test_data/1.json.gz | bin 0 -> 204803 bytes
-rw-r--r--  training/dpmert/test_data/c2e.txt.0 | 2
-rw-r--r--  training/dpmert/test_data/c2e.txt.1 | 2
-rw-r--r--  training/dpmert/test_data/c2e.txt.2 | 2
-rw-r--r--  training/dpmert/test_data/c2e.txt.3 | 2
-rw-r--r--  training/dpmert/test_data/re.txt.0 | 5
-rw-r--r--  training/dpmert/test_data/re.txt.1 | 5
-rw-r--r--  training/dpmert/test_data/re.txt.2 | 5
-rw-r--r--  training/dpmert/test_data/re.txt.3 | 5
-rw-r--r--  training/dtrain/Makefile.am | 7
-rw-r--r--  training/dtrain/README.md | 30
-rw-r--r--  training/dtrain/dtrain.cc | 553
-rw-r--r--  training/dtrain/dtrain.h | 92
-rw-r--r--  training/dtrain/examples/parallelized/README | 5
-rw-r--r--  training/dtrain/examples/parallelized/cdec.ini | 22
-rw-r--r--  training/dtrain/examples/parallelized/dtrain.ini | 16
-rw-r--r--  training/dtrain/examples/parallelized/grammar/grammar.out.0.gz | bin 0 -> 8318 bytes
-rw-r--r--  training/dtrain/examples/parallelized/grammar/grammar.out.1.gz | bin 0 -> 358560 bytes
-rw-r--r--  training/dtrain/examples/parallelized/grammar/grammar.out.2.gz | bin 0 -> 1014466 bytes
-rw-r--r--  training/dtrain/examples/parallelized/grammar/grammar.out.3.gz | bin 0 -> 391811 bytes
-rw-r--r--  training/dtrain/examples/parallelized/grammar/grammar.out.4.gz | bin 0 -> 149590 bytes
-rw-r--r--  training/dtrain/examples/parallelized/grammar/grammar.out.5.gz | bin 0 -> 537024 bytes
-rw-r--r--  training/dtrain/examples/parallelized/grammar/grammar.out.6.gz | bin 0 -> 291286 bytes
-rw-r--r--  training/dtrain/examples/parallelized/grammar/grammar.out.7.gz | bin 0 -> 1038140 bytes
-rw-r--r--  training/dtrain/examples/parallelized/grammar/grammar.out.8.gz | bin 0 -> 419889 bytes
-rw-r--r--  training/dtrain/examples/parallelized/grammar/grammar.out.9.gz | bin 0 -> 409140 bytes
-rw-r--r--  training/dtrain/examples/parallelized/in | 10
-rw-r--r--  training/dtrain/examples/parallelized/refs | 10
-rw-r--r--  training/dtrain/examples/parallelized/work/out.0.0 | 61
-rw-r--r--  training/dtrain/examples/parallelized/work/out.0.1 | 62
-rw-r--r--  training/dtrain/examples/parallelized/work/out.1.0 | 61
-rw-r--r--  training/dtrain/examples/parallelized/work/out.1.1 | 62
-rw-r--r--  training/dtrain/examples/parallelized/work/shard.0.0.in | 5
-rw-r--r--  training/dtrain/examples/parallelized/work/shard.0.0.refs | 5
-rw-r--r--  training/dtrain/examples/parallelized/work/shard.1.0.in | 5
-rw-r--r--  training/dtrain/examples/parallelized/work/shard.1.0.refs | 5
-rw-r--r--  training/dtrain/examples/parallelized/work/weights.0 | 12
-rw-r--r--  training/dtrain/examples/parallelized/work/weights.0.0 | 12
-rw-r--r--  training/dtrain/examples/parallelized/work/weights.0.1 | 12
-rw-r--r--  training/dtrain/examples/parallelized/work/weights.1 | 12
-rw-r--r--  training/dtrain/examples/parallelized/work/weights.1.0 | 11
-rw-r--r--  training/dtrain/examples/parallelized/work/weights.1.1 | 12
-rw-r--r--  training/dtrain/examples/standard/README | 2
-rw-r--r--  training/dtrain/examples/standard/cdec.ini | 26
-rw-r--r--  training/dtrain/examples/standard/dtrain.ini | 24
-rw-r--r--  training/dtrain/examples/standard/expected-output | 91
-rw-r--r--  training/dtrain/examples/standard/nc-wmt11.de.gz | bin 0 -> 58324 bytes
-rw-r--r--  training/dtrain/examples/standard/nc-wmt11.en.gz | bin 0 -> 49600 bytes
-rw-r--r--  training/dtrain/examples/standard/nc-wmt11.en.srilm.gz | bin 0 -> 16017291 bytes
-rw-r--r--  training/dtrain/examples/standard/nc-wmt11.grammar.gz | bin 0 -> 1399924 bytes
-rw-r--r--  training/dtrain/examples/toy/cdec.ini | 3
-rw-r--r--  training/dtrain/examples/toy/dtrain.ini | 13
-rw-r--r--  training/dtrain/examples/toy/expected-output | 77
-rw-r--r--  training/dtrain/examples/toy/grammar.gz | bin 0 -> 219 bytes
-rw-r--r--  training/dtrain/examples/toy/src | 2
-rw-r--r--  training/dtrain/examples/toy/tgt | 2
-rw-r--r--  training/dtrain/kbestget.h | 152
-rw-r--r--  training/dtrain/ksampler.h | 61
-rwxr-xr-x  training/dtrain/lplp.rb | 123
-rw-r--r--  training/dtrain/pairsampling.h | 140
-rwxr-xr-x  training/dtrain/parallelize.rb | 149
-rw-r--r--  training/dtrain/score.cc | 283
-rw-r--r--  training/dtrain/score.h | 217
-rw-r--r--  training/fast_align.cc | 271
-rw-r--r--  training/feature_expectations.cc | 232
-rw-r--r--  training/lbl_model.cc | 421
-rw-r--r--  training/liblbfgs/Jamfile | 5
-rw-r--r--  training/liblbfgs/Makefile.am | 11
-rw-r--r--  training/minrisk/Makefile.am | 8
-rwxr-xr-x  training/minrisk/minrisk.pl | 540
-rwxr-xr-x  training/minrisk/minrisk_generate_input.pl | 18
-rw-r--r--  training/minrisk/minrisk_optimize.cc | 197
-rw-r--r--  training/mira/Makefile.am | 6
-rw-r--r--  training/mira/kbest_mira.cc | 322
-rw-r--r--  training/mpi_em_optimize.cc | 389
-rw-r--r--  training/mr_em_adapted_reduce.cc | 173
-rw-r--r--  training/mr_em_map_adapter.cc | 160
-rw-r--r--  training/mr_optimize_reduce.cc | 231
-rw-r--r--  training/mr_reduce_to_weights.cc | 109
-rw-r--r--  training/pro/Makefile.am | 13
-rwxr-xr-x  training/pro/mr_pro_generate_mapper_input.pl | 18
-rw-r--r--  training/pro/mr_pro_map.cc | 201
-rw-r--r--  training/pro/mr_pro_reduce.cc | 286
-rwxr-xr-x  training/pro/pro.pl | 555
-rw-r--r--  training/rampion/Makefile.am | 8
-rwxr-xr-x  training/rampion/rampion.pl | 540
-rw-r--r--  training/rampion/rampion_cccp.cc | 168
-rwxr-xr-x  training/rampion/rampion_generate_input.pl | 18
-rw-r--r--  training/ttables.cc | 31
-rw-r--r--  training/ttables.h | 101
-rw-r--r--  training/utils/Makefile.am | 46
-rw-r--r--  training/utils/candidate_set.cc (renamed from training/candidate_set.cc) | 0
-rw-r--r--  training/utils/candidate_set.h (renamed from training/candidate_set.h) | 0
-rwxr-xr-x  training/utils/decode-and-evaluate.pl | 246
-rw-r--r--  training/utils/entropy.cc (renamed from training/entropy.cc) | 0
-rw-r--r--  training/utils/entropy.h (renamed from training/entropy.h) | 0
-rw-r--r--  training/utils/grammar_convert.cc (renamed from training/grammar_convert.cc) | 0
-rw-r--r--  training/utils/lbfgs.h (renamed from training/lbfgs.h) | 0
-rw-r--r--  training/utils/lbfgs_test.cc (renamed from training/lbfgs_test.cc) | 0
-rw-r--r--  training/utils/libcall.pl | 71
-rw-r--r--  training/utils/online_optimizer.cc (renamed from training/online_optimizer.cc) | 0
-rw-r--r--  training/utils/online_optimizer.h (renamed from training/online_optimizer.h) | 0
-rw-r--r--  training/utils/optimize.cc (renamed from training/optimize.cc) | 0
-rw-r--r--  training/utils/optimize.h (renamed from training/optimize.h) | 0
-rw-r--r--  training/utils/optimize_test.cc (renamed from training/optimize_test.cc) | 0
-rwxr-xr-x  training/utils/parallelize.pl | 423
-rw-r--r--  training/utils/risk.cc (renamed from training/risk.cc) | 0
-rw-r--r--  training/utils/risk.h (renamed from training/risk.h) | 0
-rw-r--r--  training/utils/sentclient.cc | 76
-rw-r--r--  training/utils/sentserver.cc | 515
-rw-r--r--  training/utils/sentserver.h | 6
156 files changed, 11101 insertions, 2574 deletions
diff --git a/training/Jamfile b/training/Jamfile
deleted file mode 100644
index 073451fa..00000000
--- a/training/Jamfile
+++ /dev/null
@@ -1,25 +0,0 @@
-import testing ;
-import option ;
-
-lib training :
- ..//utils
- ..//mteval
- ..//decoder
- ../klm/lm//kenlm
- ..//boost_program_options
- ttables.cc
- : <include>.
- : :
- <library>..//decoder
- <library>../klm/lm//kenlm
- <library>..//utils
- <library>..//mteval
- <library>..//boost_program_options
- ;
-
-exe model1 : model1.cc : <include>../decoder ;
-
-# // all_tests [ glob *_test.cc ] : ..//decoder : <testing.arg>$(TOP)/decoder/test_data ;
-
-alias programs : model1 ;
-
diff --git a/training/Makefile.am b/training/Makefile.am
index 5254333a..e95e045f 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -1,91 +1,11 @@
-bin_PROGRAMS = \
- fast_align \
- lbl_model \
- test_ngram \
- mr_em_map_adapter \
- mr_em_adapted_reduce \
- mr_reduce_to_weights \
- mr_optimize_reduce \
- grammar_convert \
- plftools \
- collapse_weights \
- mpi_extract_reachable \
- mpi_extract_features \
- mpi_online_optimize \
- mpi_flex_optimize \
- mpi_batch_optimize \
- mpi_compute_cllh \
- augment_grammar
+SUBDIRS = \
+ liblbfgs \
+ utils \
+ crf \
+ minrisk \
+ dpmert \
+ pro \
+ dtrain \
+ mira \
+ rampion
-noinst_PROGRAMS = \
- lbfgs_test \
- optimize_test
-
-TESTS = lbfgs_test optimize_test
-
-noinst_LIBRARIES = libtraining.a
-libtraining_a_SOURCES = \
- candidate_set.cc \
- entropy.cc \
- optimize.cc \
- online_optimizer.cc \
- risk.cc
-
-mpi_online_optimize_SOURCES = mpi_online_optimize.cc
-mpi_online_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-mpi_flex_optimize_SOURCES = mpi_flex_optimize.cc
-mpi_flex_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-mpi_extract_reachable_SOURCES = mpi_extract_reachable.cc
-mpi_extract_reachable_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-mpi_extract_features_SOURCES = mpi_extract_features.cc
-mpi_extract_features_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc cllh_observer.cc
-mpi_batch_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-mpi_compute_cllh_SOURCES = mpi_compute_cllh.cc cllh_observer.cc
-mpi_compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-augment_grammar_SOURCES = augment_grammar.cc
-augment_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-test_ngram_SOURCES = test_ngram.cc
-test_ngram_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-fast_align_SOURCES = fast_align.cc ttables.cc
-fast_align_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
-lbl_model_SOURCES = lbl_model.cc
-lbl_model_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
-grammar_convert_SOURCES = grammar_convert.cc
-grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
-optimize_test_SOURCES = optimize_test.cc
-optimize_test_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
-collapse_weights_SOURCES = collapse_weights.cc
-collapse_weights_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
-lbfgs_test_SOURCES = lbfgs_test.cc
-lbfgs_test_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
-mr_optimize_reduce_SOURCES = mr_optimize_reduce.cc
-mr_optimize_reduce_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
-mr_em_map_adapter_SOURCES = mr_em_map_adapter.cc
-mr_em_map_adapter_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
-mr_reduce_to_weights_SOURCES = mr_reduce_to_weights.cc
-mr_reduce_to_weights_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
-mr_em_adapted_reduce_SOURCES = mr_em_adapted_reduce.cc
-mr_em_adapted_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
-plftools_SOURCES = plftools.cc
-plftools_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
-
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/utils -I$(top_srcdir)/mteval -I../klm
diff --git a/training/add-model1-features-to-scfg.pl b/training/add-model1-features-to-scfg.pl
deleted file mode 100755
index a0074317..00000000
--- a/training/add-model1-features-to-scfg.pl
+++ /dev/null
@@ -1,93 +0,0 @@
-#!/usr/bin/perl -w
-
-# [X] ||| so [X,1] die [X,2] der ||| as [X,1] existing [X,2] the ||| 2.47712135315 2.53182387352 5.07100057602 ||| 0-0 2-2 4-4
-# [X] ||| so [X,1] die [X,2] der ||| this [X,1] the [X,2] of ||| 2.47712135315 3.19828724861 2.38270020485 ||| 0-0 2-2 4-4
-# [X] ||| so [X,1] die [X,2] der ||| as [X,1] the [X,2] the ||| 2.47712135315 2.53182387352 1.48463630676 ||| 0-0 2-2 4-4
-# [X] ||| so [X,1] die [X,2] der ||| is [X,1] the [X,2] of the ||| 2.47712135315 3.45197868347 2.64251494408 ||| 0-0 2-2 4-4 4-5
-
-die "Usage: $0 model1.f-e model1.e-f < grammar.scfg\n (use trianing/model1 to extract the model files)\n" unless scalar @ARGV == 2;
-
-my $fm1 = shift @ARGV;
-die unless $fm1;
-my $frm1 = shift @ARGV;
-die unless $frm1;
-open M1,"<$fm1" or die;
-open RM1,"<$frm1" or die;
-print STDERR "Loading Model 1 probs from $fm1...\n";
-my %m1;
-while(<M1>) {
- chomp;
- my ($f, $e, $lp) = split /\s+/;
- $m1{$e}->{$f} = exp($lp);
-}
-close M1;
-
-print STDERR "Loading Inverse Model 1 probs from $frm1...\n";
-my %rm1;
-while(<RM1>) {
- chomp;
- my ($e, $f, $lp) = split /\s+/;
- $rm1{$f}->{$e} = exp($lp);
-}
-close RM1;
-
-my @label = qw( EGivenF LexFGivenE LexEGivenF );
-while(<>) {
- chomp;
- my ($l, $f, $e, $sscores, $al) = split / \|\|\| /;
- my @scores = split /\s+/, $sscores;
- unless ($sscores =~ /=/) {
- for (my $i=0; $i<3; $i++) { $scores[$i] = "$label[$i]=$scores[$i]"; }
- }
- push @scores, "RuleCount=1";
- my @fs = split /\s+/, $f;
- my @es = split /\s+/, $e;
- my $flen = scalar @fs;
- my $elen = scalar @es;
- my $pgen = 0;
- my $nongen = 0;
- for (my $i =0; $i < $flen; $i++) {
- my $ftot = 0;
- next if ($fs[$i] =~ /\[X/);
- my $cr = $rm1{$fs[$i]};
- for (my $j=0; $j <= $elen; $j++) {
- my $ej = '<eps>';
- if ($j < $elen) { $ej = $es[$j]; }
- my $p = $cr->{$ej};
- if (defined $p) { $ftot += $p; }
- }
- if ($ftot == 0) { $nongen = 1; last; }
- $pgen += log($ftot) - log($elen);
- }
- my $bad = 0;
- my $good = 0;
- unless ($nongen) { push @scores, "RGood=1"; $good++; } else { push @scores, "RBad=1"; $bad++; }
-
- $nongen = 0;
- $pgen = 0;
- for (my $i =0; $i < $elen; $i++) {
- my $etot = 0;
- next if ($es[$i] =~ /\[X/);
- my $cr = $m1{$es[$i]};
-# print STDERR "$es[$i]\n";
- for (my $j=0; $j <= $flen; $j++) {
- my $fj = '<eps>';
- if ($j < $flen) { $fj = $fs[$j]; }
- my $p = $cr->{$fj};
-# print STDERR " $fs[$j] : $p\n";
- if (defined $p) { $etot += $p; }
- }
- if ($etot == 0) { $nongen = 1; last; }
- $pgen += log($etot) - log($flen);
- }
- unless ($nongen) {
- push @scores, "FGood=1";
- if ($good) { push @scores, "BothGood=1"; } else { push @scores, "SusDel=1"; }
- } else {
- push @scores, "FBad=1";
- if ($bad) { push @scores, "BothBad=1"; } else { push @scores, "SusHall=1"; }
- }
- print "$l ||| $f ||| $e ||| @scores";
- if (defined $al) { print " ||| $al\n"; } else { print "\n"; }
-}
-
diff --git a/training/collapse_weights.cc b/training/collapse_weights.cc
deleted file mode 100644
index dc480f6c..00000000
--- a/training/collapse_weights.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-char const* NOTES =
- "ZF_and_E means unnormalized scaled features.\n"
- "For grammars with one nonterminal: F_and_E is joint,\n"
- "F_given_E and E_given_F are conditional.\n"
- "TODO: group rules by root nonterminal and then normalize.\n";
-
-
-#include <iostream>
-#include <fstream>
-#include <tr1/unordered_map>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-#include <boost/functional/hash.hpp>
-
-#include "prob.h"
-#include "filelib.h"
-#include "trule.h"
-#include "weights.h"
-
-namespace po = boost::program_options;
-using namespace std;
-
-typedef std::tr1::unordered_map<vector<WordID>, prob_t, boost::hash<vector<WordID> > > MarginalMap;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("grammar,g", po::value<string>(), "Grammar file")
- ("weights,w", po::value<string>(), "Weights file")
- ("unnormalized,u", "Always include ZF_and_E unnormalized score (default: only if sum was >1)")
- ;
- po::options_description clo("Command line options");
- clo.add_options()
- ("config,c", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- const string cfg = (*conf)["config"].as<string>();
- cerr << "Configuration file: " << cfg << endl;
- ifstream config(cfg.c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || !conf->count("grammar") || !conf->count("weights")) {
- cerr << dcmdline_options << endl;
- cerr << NOTES << endl;
- exit(1);
- }
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
- const string wfile = conf["weights"].as<string>();
- const string gfile = conf["grammar"].as<string>();
- vector<weight_t> w;
- Weights::InitFromFile(wfile, &w);
- MarginalMap e_tots;
- MarginalMap f_tots;
- prob_t tot;
- {
- ReadFile rf(gfile);
- assert(*rf.stream());
- istream& in = *rf.stream();
- cerr << "Computing marginals...\n";
- int lc = 0;
- while(in) {
- string line;
- getline(in, line);
- ++lc;
- if (line.empty()) continue;
- TRule tr(line, true);
- if (tr.GetFeatureValues().empty())
- cerr << "Line " << lc << ": empty features - may introduce bias\n";
- prob_t prob;
- prob.logeq(tr.GetFeatureValues().dot(w));
- e_tots[tr.e_] += prob;
- f_tots[tr.f_] += prob;
- tot += prob;
- }
- }
- bool normalized = (fabs(log(tot)) < 0.001);
- cerr << "Total: " << tot << (normalized ? " [normalized]" : " [scaled]") << endl;
- ReadFile rf(gfile);
- istream&in = *rf.stream();
- while(in) {
- string line;
- getline(in, line);
- if (line.empty()) continue;
- TRule tr(line, true);
- const double lp = tr.GetFeatureValues().dot(w);
- if (isinf(lp)) { continue; }
- tr.scores_.clear();
-
- cout << tr.AsString() << " ||| F_and_E=" << lp - log(tot);
- if (!normalized || conf.count("unnormalized")) {
- cout << ";ZF_and_E=" << lp;
- }
- cout << ";F_given_E=" << lp - log(e_tots[tr.e_])
- << ";E_given_F=" << lp - log(f_tots[tr.f_]) << endl;
- }
- return 0;
-}
-
diff --git a/training/crf/Makefile.am b/training/crf/Makefile.am
new file mode 100644
index 00000000..4a8c30fd
--- /dev/null
+++ b/training/crf/Makefile.am
@@ -0,0 +1,31 @@
+bin_PROGRAMS = \
+ mpi_batch_optimize \
+ mpi_compute_cllh \
+ mpi_extract_features \
+ mpi_extract_reachable \
+ mpi_flex_optimize \
+ mpi_online_optimize \
+ mpi_baum_welch
+
+mpi_baum_welch_SOURCES = mpi_baum_welch.cc
+mpi_baum_welch_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a -lz
+
+mpi_online_optimize_SOURCES = mpi_online_optimize.cc
+mpi_online_optimize_LDADD = ../../training/utils/libtraining_utils.a ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a -lz
+
+mpi_flex_optimize_SOURCES = mpi_flex_optimize.cc
+mpi_flex_optimize_LDADD = ../../training/utils/libtraining_utils.a ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a -lz
+
+mpi_extract_reachable_SOURCES = mpi_extract_reachable.cc
+mpi_extract_reachable_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a -lz
+
+mpi_extract_features_SOURCES = mpi_extract_features.cc
+mpi_extract_features_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a -lz
+
+mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc cllh_observer.cc cllh_observer.h
+mpi_batch_optimize_LDADD = ../../training/utils/libtraining_utils.a ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a -lz
+
+mpi_compute_cllh_SOURCES = mpi_compute_cllh.cc cllh_observer.cc cllh_observer.h
+mpi_compute_cllh_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a -lz
+
+AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wall -Wno-sign-compare -I$(top_srcdir)/training -I$(top_srcdir)/training/utils -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/training/crf/baum_welch_example/README.md b/training/crf/baum_welch_example/README.md
new file mode 100644
index 00000000..97525da5
--- /dev/null
+++ b/training/crf/baum_welch_example/README.md
@@ -0,0 +1,32 @@
+Here's how to do Baum-Welch training with `cdec`.
+
+## Set the tags you want.
+
+First, set the number of tags you want in tagset.txt (these
+can be any symbols, listed one after another, separated
+by whitespace), e.g.:
+
+ C1 C2 C3 C4
+
+## Extract the parameter feature names
+
+ ../mpi_extract_features -c cdec.ini -t train.txt
+
+If you have compiled with MPI, you can use `mpirun`:
+
+ mpirun -np 8 ../mpi_extract_features -c cdec.ini -t train.txt
+
+## Randomly initialize the weights file
+
+ sort -u features.* | ./random_init.pl > weights.init
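+
+(The `features.*` files are the feature lists written by the
+extraction step above; `sort -u` merges them and removes duplicates.)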
+
+## Run training
+
+ ../mpi_baum_welch -c cdec.ini -t train.txt -w weights.init -n 50
+
+Again, if you have compiled with MPI, you can use `mpirun`:
+
+ mpirun -np 8 ../mpi_baum_welch -c cdec.ini -t train.txt -w weights.init -n 50
+
+The `-n` flag indicates how many iterations to run for.
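+
+Each iteration performs one full EM (Baum-Welch) pass over the
+training data: the decoder computes posterior feature expectations
+under the current weights, and the weights are re-estimated from
+those expectations.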
+
diff --git a/training/crf/baum_welch_example/cdec.ini b/training/crf/baum_welch_example/cdec.ini
new file mode 100644
index 00000000..61203da7
--- /dev/null
+++ b/training/crf/baum_welch_example/cdec.ini
@@ -0,0 +1,5 @@
+feature_function=Tagger_BigramIndicator
+feature_function=LexicalPairIndicator
+formalism=tagger
+tagger_tagset=tagset.txt
+intersection_strategy=full
diff --git a/training/crf/baum_welch_example/random_init.pl b/training/crf/baum_welch_example/random_init.pl
new file mode 100755
index 00000000..98467ed1
--- /dev/null
+++ b/training/crf/baum_welch_example/random_init.pl
@@ -0,0 +1,9 @@
+#!/usr/bin/perl -w
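+# Assign a random initial weight to each feature read on stdin.
+# Input: one feature per line (the name, optionally followed by one value).
+# Output: "<feature> <weight>" with the weight drawn from (-1.5, -0.5];
+# features whose names start with "Uni:" are pinned to 0.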
+while(<>) {
+ chomp;
+ my ($a,$b,@d) =split /\s+/;
+ die "Bad input" if scalar @d > 0;
+ $r = -rand() * rand() - 0.5;
+ $r = 0 if $a =~ /^Uni:/;
+ print "$a $r\n";
+}
diff --git a/training/crf/baum_welch_example/tagset.txt b/training/crf/baum_welch_example/tagset.txt
new file mode 100644
index 00000000..93a48451
--- /dev/null
+++ b/training/crf/baum_welch_example/tagset.txt
@@ -0,0 +1 @@
+1 2 3 4
diff --git a/training/crf/baum_welch_example/train.txt b/training/crf/baum_welch_example/train.txt
new file mode 100644
index 00000000..e9c3455e
--- /dev/null
+++ b/training/crf/baum_welch_example/train.txt
@@ -0,0 +1,2000 @@
+t h e
+t o
+o f
+i n
+a n d
+a
+s a i d
+f o r
+o n
+t h a t
+w i t h
+w a s
+i s
+b y
+a t
+h e
+a s
+f r o m
+i t
+h a s
+b e
+h i s
+h a v e
+w i l l
+a n
+a r e
+w e r e
+b u t
+n o t
+w h o
+a f t e r
+h a d
+y e a r
+i t s
+t w o
+t h i s
+w h i c h
+t h e y
+t h e i r
+g o v e r n m e n t
+b e e n
+w e
+p e r c e n t
+w o u l d
+n e w
+i
+a l s o
+u p
+m o r e
+o n e
+p e o p l e
+f i r s t
+l a s t
+a b o u t
+c h i n a
+p r e s i d e n t
+o v e r
+m i l l i o n
+o r
+o u t
+w o r l d
+w h e n
+a l l
+o t h e r
+m i n i s t e r
+t h r e e
+t h a n
+u n i t e d
+t h e r e
+a g a i n s t
+i n t o
+c o u n t r y
+s o m e
+p o l i c e
+n o
+t i m e
+y e a r s
+s t a t e
+w e d n e s d a y
+t u e s d a y
+t h u r s d a y
+s t a t e s
+m o n d a y
+u s
+c o u l d
+i f
+f r i d a y
+s i n c e
+b i l l i o n
+s h e
+f o r e i g n
+o f f i c i a l s
+d a y
+i n t e r n a t i o n a l
+h e r
+b e t w e e n
+o n l y
+b e f o r e
+s o u t h
+w h i l e
+d u r i n g
+n a t i o n a l
+t o l d
+s e c o n d
+g r o u p
+f o u r
+d o w n
+c i t y
+p a r t y
+t h e m
+s e c u r i t y
+d o
+m a d e
+d o l l a r s
+p o i n t s
+u n d e r
+m i l i t a r y
+b e c a u s e
+w e e k
+c o u n t r i e s
+c a n
+c h i n e s e
+o f f
+s u n d a y
+m o s t
+s o
+h i m
+e c o n o m i c
+f o r m e r
+i r a q
+f i v e
+s a t u r d a y
+a c c o r d i n g
+d i d
+n o w
+o f f i c i a l
+m a y
+n e w s
+w a r
+a n y
+w h e r e
+t e a m
+m e e t i n g
+k i l l e d
+b a n k
+s h o u l d
+j u s t
+r e p o r t e d
+m a n y
+n e x t
+w h a t
+c o m p a n y
+i n c l u d i n g
+b a c k
+m o n t h
+r e p o r t
+o u r
+p r i m e
+m a r k e t
+s t i l l
+b e i n g
+c o u r t
+t r a d e
+h e r e
+p e a c e
+h i g h
+o l d
+s e t
+t h r o u g h
+y o u
+i s r a e l
+t a l k s
+e n d
+t a k e
+e x p e c t e d
+p o l i t i c a l
+s i x
+s u c h
+b o t h
+m a k e
+h o m e
+l o c a l
+j a p a n
+r u s s i a
+s a y i n g
+g e n e r a l
+t o p
+a n o t h e r
+e u r o p e a n
+n o r t h
+h e l d
+t h i r d
+m a j o r
+s t a t e m e n t
+w e l l
+a m e r i c a n
+i s r a e l i
+t a i w a n
+l e a d e r
+c a p i t a l
+l o n g
+o i l
+t h o s e
+c a l l e d
+p a r t
+s p o k e s m a n
+w o r k
+d e v e l o p m e n t
+a d d e d
+s a y s
+w o n
+m e m b e r s
+l e f t
+c h i e f
+g a m e
+l i k e
+t h e n
+h e l p
+s a y
+p a l e s t i n i a n
+v e r y
+c u p
+p u b l i c
+f r a n c e
+c e n t r a l
+l e a d e r s
+w i n
+b u s h
+m i n i s t r y
+m o n t h s
+g e t
+w a y
+d a y s
+r e g i o n
+s u p p o r t
+t r o o p s
+a g e n c y
+f o r c e s
+e a r l i e r
+e v e n
+n a t i o n s
+v i s i t
+g a m e s
+e u
+f i n a l
+a m o n g
+h o u s e
+s e v e r a l
+e a r l y
+l e d
+d l r s
+l a t e r
+w o m e n
+k o n g
+h o n g
+p r e s s
+p o w e r
+t o d a y
+o p e n
+i n d e x
+o f f i c e
+f o l l o w i n g
+a r o u n d
+b a s e d
+c o n f e r e n c e
+b r i t i s h
+c o u n c i l
+u n i o n
+t o o k
+c a m e
+w e s t
+r u n
+h o w e v e r
+e a s t
+l a t e
+s e a s o n
+g o o d
+c l o s e
+g e r m a n y
+l e a d
+p a s t
+d e f e n s e
+p l a c e
+n u m b e r
+a r m y
+r u s s i a n
+l a w
+i n d i a
+m e n
+f i n a n c i a l
+e c o n o m y
+l e a s t
+s e c r e t a r y
+s a m e
+y o r k
+f o u n d
+g o i n g
+r i g h t
+g o
+m y
+o p p o s i t i o n
+f o r c e
+a g r e e m e n t
+e l e c t i o n
+h o w
+b u s i n e s s
+f r e n c h
+a u t h o r i t i e s
+p l a y
+m u c h
+r i g h t s
+t i m e s
+c o m m i t t e e
+r o u n d
+p r o v i n c e
+k o r e a
+h a l f
+a t t a c k
+p r i c e s
+s t o c k
+h i t
+p l a n
+a r e a
+c o o p e r a t i o n
+s e v e n
+n e a r
+e x c h a n g e
+u s e d
+n u c l e a r
+p a k i s t a n
+b e i j i n g
+a n n o u n c e d
+a i r
+a f r i c a
+c e n t e r
+a g o
+t h e s e
+d e c i s i o n
+a t t a c k s
+w i t h o u t
+m a t c h
+m a r c h
+n a t i o n
+h e a d
+t o t a l
+c o m p a n i e s
+m a n
+d e a l
+w a s h i n g t o n
+r e c e n t
+c a s e
+f i r e
+n i g h t
+a u s t r a l i a
+a f r i c a n
+u n t i l
+i r a n
+e l e c t i o n s
+s o u t h e r n
+l e a g u e
+p u t
+e a c h
+m e m b e r
+c h i l d r e n
+h e a l t h
+p c
+p a r l i a m e n t
+l o s t
+t h i n k
+d e a t h
+m u s t
+e i g h t
+w o r k e r s
+u s e
+b r i t a i n
+w a n t
+s y s t e m
+r e c o r d
+d e p a r t m e n t
+p r o g r a m
+e f f o r t s
+g r o w t h
+r e s u l t s
+i r a q i
+i s s u e
+b e s t
+w h e t h e r
+h u m a n
+n o r t h e r n
+c o n t r o l
+f a r
+f u r t h e r
+a l r e a d y
+s h a r e s
+r e l a t i o n s
+m e e t
+s o l d i e r s
+s e e
+f r e e
+c o m e
+j a p a n e s e
+m o n e y
+d o l l a r
+r e p o r t s
+d i r e c t o r
+s h a r e
+g i v e
+j u n e
+c o m m i s s i o n
+l a r g e s t
+i n d u s t r y
+c o n t i n u e
+s t a r t
+c a m p a i g n
+l e a d i n g
+q u a r t e r
+i n f o r m a t i o n
+v i c t o r y
+r o s e
+o t h e r s
+a n t i
+r e t u r n
+f a m i l y
+i s s u e s
+s h o t
+p o l i c y
+e u r o p e
+m e d i a
+p l a n s
+b o r d e r
+n e e d
+p e r
+a r e a s
+j u l y
+v i o l e n c e
+d e s p i t e
+d o e s
+s t r o n g
+c h a i r m a n
+s e r v i c e
+v o t e
+a s k e d
+f o o d
+f e d e r a l
+w e n t
+t a k e n
+d u e
+m o v e
+o w n
+b e g a n
+i t a l y
+g r o u p s
+p o s s i b l e
+l i f e
+c l e a r
+r a t e
+f e l l
+c r i s i s
+r e b e l s
+p o i n t
+d e m o c r a t i c
+w a t e r
+a h e a d
+i n v e s t m e n t
+i n c r e a s e
+s h o w
+p l a y e r s
+r e l e a s e d
+g e r m a n
+t o w n
+n e a r l y
+p r o c e s s
+m i l e s
+f e w
+d i e d
+a d m i n i s t r a t i o n
+a p r i l
+w e e k s
+l e v e l
+k e y
+a s i a
+s a l e s
+w e a p o n s
+c l o s e d
+b e h i n d
+m i n u t e s
+a g r e e d
+p r e s i d e n t i a l
+g r e a t
+m a k i n g
+t o o
+p r i c e
+j o h n
+g o t
+f o u r t h
+a g a i n
+k i l o m e t e r s
+s i t u a t i o n
+m a i n
+w h i t e
+r e p o r t e r s
+h o u r s
+s e n i o r
+a s i a n
+r e p u b l i c
+a w a y
+g l o b a l
+f i g h t i n g
+s e r i e s
+b e t t e r
+n e w s p a p e r
+m e
+c l i n t o n
+a r r e s t e d
+h i g h e r
+k n o w
+f u t u r e
+s c o r e d
+g o l d
+n a t o
+m o r n i n g
+n e v e r
+b e a t
+w i t h i n
+r n
+c u r r e n t
+p e r i o d
+b e c o m e
+w e s t e r n
+i m p o r t a n t
+l o n d o n
+a u s t r a l i a n
+s p a i n
+e n e r g y
+a i d
+a c c u s e d
+o l y m p i c
+n i n e
+a c r o s s
+f a c e
+o r g a n i z a t i o n
+g o a l
+t a k i n g
+i n j u r e d
+s p e c i a l
+r a c e
+d a i l y
+s u m m i t
+s i d e
+l i n e
+o r d e r
+u n i v e r s i t y
+a f g h a n i s t a n
+p l a y e d
+b i g
+c a r
+t r y i n g
+e n g l a n d
+q u o t e d
+d e
+a l o n g
+i s l a m i c
+o u t s i d e
+t r a d i n g
+e d s
+c u t
+a c t i o n
+p r o b l e m s
+v i c e
+w o r k i n g
+y e n
+b u i l d i n g
+s i g n e d
+k n o w n
+c h a n g e
+c h a r g e s
+s m a l l
+l o w e r
+a l t h o u g h
+s e n t
+c o n g r e s s
+h o s p i t a l
+h o l d
+m i g h t
+u n
+e v e r y
+g i v e n
+d e p u t y
+i n t e r e s t
+i s l a n d
+s c h o o l
+d r u g
+k i l l i n g
+r u l i n g
+t o u r
+o p e n i n g
+t e r m
+f u l l
+c l r
+l i t t l e
+m a r k e t s
+c o a c h
+j a n u a r y
+s c h e d u l e d
+k e e p
+t u r k e y
+p r e v i o u s
+e x e c u t i v e
+g a s
+m e t
+j o i n t
+t r i a l
+b o a r d
+p r o d u c t i o n
+i n d o n e s i a
+s e r v i c e s
+l i k e l y
+t h o u s a n d s
+i n d i a n
+p o s t
+a r a b
+c e n t s
+h o p e
+s i n g a p o r e
+p a l e s t i n i a n s
+p a r t i e s
+g a v e
+b i l l
+d e a d
+r o l e
+s e p t e m b e r
+t e l e v i s i o n
+c o m m u n i t y
+r e g i o n a l
+a d d i n g
+a m e r i c a
+o n c e
+y u a n
+t e s t
+s t o c k s
+w h o s e
+p a y
+p r i v a t e
+l a t e s t
+i n v e s t o r s
+f r o n t
+c a n a d a
+r e l e a s e
+r e c e i v e d
+m e a n w h i l e
+l e s s
+t h a i l a n d
+l a n d
+c h a m p i o n
+r e a c h e d
+u r g e d
+d e c e m b e r
+a s s o c i a t i o n
+f i g h t
+s i d e s
+s t a r t e d
+l a r g e
+y e t
+m i d d l e
+c a l l
+p r e s s u r e
+e n d e d
+s o c i a l
+p r o j e c t
+l o w
+h a r d
+c l u b
+p r e m i e r
+t e c h n o l o g y
+f a i l e d
+t o u r n a m e n t
+r e a l
+p r o v i d e
+g a z a
+m i n u t e
+a f f a i r s
+m i n i s t e r s
+p r o d u c t s
+r e s e a r c h
+s e e n
+g e o r g e
+e v e n t
+s t o p
+i n v e s t i g a t i o n
+a i r p o r t
+m e x i c o
+t i t l e
+t o k y o
+e a s t e r n
+b i g g e s t
+y o u n g
+d e m a n d
+t h o u g h
+a r m e d
+s a n
+o p e n e d
+m e a s u r e s
+n o v e m b e r
+a v e r a g e
+m a r k
+o c t o b e r
+k o r e a n
+r a d i o
+b o d y
+s e c t o r
+c a b i n e t
+g m t
+a s s o c i a t e d
+a p
+c i v i l
+t e r r o r i s m
+s h o w e d
+p r i s o n
+s i t e
+p r o b l e m
+s e s s i o n
+b r a z i l
+m u s l i m
+c o a l i t i o n
+b a g h d a d
+b i d
+s t r e e t
+c o m i n g
+b e l i e v e
+m a l a y s i a
+s t u d e n t s
+d e c i d e d
+f i e l d
+r e d
+n e g o t i a t i o n s
+w i n n i n g
+o p e r a t i o n
+c r o s s
+s o o n
+p l a n n e d
+a b l e
+t i e s
+t a x
+j u s t i c e
+d o m e s t i c
+d a v i d
+i n c l u d e
+n a m e
+b o m b
+t r a i n i n g
+j u d g e
+v i c t i m s
+m e d i c a l
+c o n d i t i o n
+f i n d
+r e m a i n
+i s s u e d
+f i n a n c e
+l o t
+l a b o r
+b t
+e n o u g h
+i m m e d i a t e l y
+s h o r t
+l o s s
+a n n u a l
+m o v e d
+r e b e l
+s t r i k e
+r o a d
+r e c e n t l y
+i t a l i a n
+c o n s t r u c t i o n
+t r y
+a u g u s t
+e x p r e s s e d
+m i l i t a n t s
+t o g e t h e r
+w a n t e d
+r a t e s
+f u n d
+f o r w a r d
+m i s s i o n
+d i s c u s s
+r e s u l t
+c a l l s
+k o s o v o
+o p e r a t i o n s
+c a s e s
+z e a l a n d
+s o u r c e s
+i n c r e a s e d
+l e g a l
+b a n k s
+i n v o l v e d
+o f f i c e r s
+l e a v e
+m e t e r s
+w a r n e d
+h a v i n g
+r e a c h
+b r i n g
+h i s t o r y
+d i s t r i c t
+j o b
+a l l o w e d
+a r r i v e d
+t o w a r d
+c l a i m e d
+e g y p t
+t e a m s
+a l l o w
+a l m o s t
+f e b r u a r y
+s e r i o u s
+p o o r
+c o n t i n u e d
+s t e p
+i n t e r v i e w
+e d u c a t i o n
+n o n
+r e a l l y
+s t a r
+l e e
+r e s i d e n t s
+b a n
+s o c c e r
+n e e d e d
+p a r i s
+i n d u s t r i a l
+p l a y e r
+m o s c o w
+s t a t i o n
+o f f e r
+h u n d r e d s
+t a l i b a n
+w o m a n
+m a n a g e m e n t
+l e b a n o n
+n o t e d
+c h e n
+p o s i t i o n
+f i n i s h e d
+c o s t
+e x p e r t s
+e v e r
+m o v e m e n t
+t e r r o r i s t
+p l a n e
+b l a c k
+d i f f e r e n t
+b e l i e v e d
+p l a y i n g
+c a u s e d
+h o p e s
+c o n d i t i o n s
+b r o u g h t
+f o r c e d
+l a u n c h e d
+w e e k e n d
+m i c h a e l
+s e a
+r i s e
+d e t a i l s
+s p o r t s
+e t h n i c
+s t a f f
+c h a n c e
+g o a l s
+b u d g e t
+h a n d
+b a s e
+s e c o n d s
+s r i
+s p e a k i n g
+o f f i c e r
+m a j o r i t y
+w a n t s
+c h a r g e d
+s h a n g h a i
+v i e t n a m
+x i n h u a
+c o m m e n t
+d r o p p e d
+t u r n e d
+p r o t e s t
+r e f o r m
+s u s p e c t e d
+a m i d
+t r i e d
+c i t i e s
+g r o u n d
+t u r k i s h
+s t a g e
+e f f o r t
+s
+c o m m u n i s t
+a n a l y s t s
+h a m a s
+p r o j e c t s
+c o n t r a c t
+i n d e p e n d e n c e
+l o o k i n g
+a m
+s i g n
+f o l l o w e d
+r e m a i n s
+c o m p a r e d
+u s i n g
+h e a v y
+a f t e r n o o n
+s t r a i g h t
+l o o k
+f a l l
+r e a d y
+e u r o
+c h a r g e
+w o u n d e d
+p r o g r e s s
+p a c i f i c
+d e n i e d
+h o u r
+c a r e e r
+c o n f i r m e d
+t h a i
+r u l e
+c o u r s e
+w i f e
+e x p o r t s
+b e c a m e
+a m e r i c a n s
+e m e r g e n c y
+a r a f a t
+r e f u s e d
+l i s t
+a l l e g e d
+c h a m p i o n s h i p
+p o p u l a t i o n
+n e e d s
+c o m p e t i t i o n
+o r d e r e d
+s a f e t y
+a u t h o r i t y
+i l l e g a l
+t v
+d o n e
+e v i d e n c e
+s t a y
+f i f t h
+s e e k i n g
+s t u d y
+l i v e
+r u n s
+c o a s t
+s a u d i
+h e l p e d
+a c t i v i t i e s
+m a n a g e r
+w o r t h
+k i n g
+g r o w i n g
+r u n n i n g
+f i r e d
+i n c l u d e d
+p a u l
+w a l l
+r e t u r n e d
+c o n f l i c t
+m y a n m a r
+d e m o c r a c y
+p r o
+f o r m
+a l w a y s
+a m b a s s a d o r
+m a t c h e s
+t h i n g s
+m a i n l a n d
+s a w
+d i s e a s e
+r e l a t e d
+f u n d s
+i n d e p e n d e n t
+t o n s
+a p p r o v e d
+e m b a s s y
+c u r r e n c y
+b r e a k
+s e n a t e
+c o n c e r n s
+f i g u r e s
+j o i n
+r e s o l u t i o n
+o f t e n
+c o n f i d e n c e
+e s p e c i a l l y
+w i n n e r
+c a r r i e d
+i m p r o v e
+s w e d e n
+z i m b a b w e
+t h r e a t
+c u r r e n t l y
+s i n g l e
+h i m s e l f
+l i v i n g
+r e f u g e e s
+a i m e d
+c o u n t y
+c a n n o t
+a r m s
+b u i l d
+g e t t i n g
+a p p e a r e d
+d i f f i c u l t
+s p a n i s h
+r i v e r
+m i s s i n g
+e s t i m a t e d
+s o m e t h i n g
+p r o p o s e d
+c e r e m o n y
+i n s t e a d
+b r o k e
+c h u r c h
+o l y m p i c s
+s p a c e
+p r o f i t
+v i l l a g e
+l i g h t
+p e r f o r m a n c e
+d e l e g a t i o n
+t r i p
+o v e r a l l
+p a r t s
+a c t
+c o r r u p t i o n
+d i v i s i o n
+s i m i l a r
+p o s i t i v e
+c a m p
+g r a n d
+p o r t
+s u p p o r t e r s
+r e p u b l i c a n
+b e g i n
+j o n e s
+p a r k
+b i l a t e r a l
+c l o u d y
+d i p l o m a t i c
+p r e s e n t
+l o s
+a r g e n t i n a
+t r a v e l
+s p e e c h
+a t t e n t i o n
+n e t
+j o b s
+a r r e s t
+p r o s e c u t o r s
+i n f l a t i o n
+n a m e d
+j o r d a n
+s o n
+g o v e r n m e n t s
+r u l e s
+p r o t e c t i o n
+k e n y a
+h o m e s
+l i v e s
+s e r b
+s a n c t i o n s
+a t t e m p t
+e x p o r t
+m e a n s
+n i g e r i a
+r e m a i n e d
+t u r n
+c r i m e s
+c o n c e r n
+e n v i r o n m e n t
+p l a n t
+l e t t e r
+v a l u e
+r e s p o n s e
+a s s e m b l y
+p r o p o s a l
+h o l d i n g
+b o m b i n g
+e n s u r e
+a f g h a n
+r e s o u r c e s
+f a m i l i e s
+r e s t
+i n s i d e
+t h r o u g h o u t
+m a t t e r
+c a u s e
+l a w m a k e r s
+i i
+f u e l
+c a l i f o r n i a
+e g y p t i a n
+o w n e d
+s u i c i d e
+c z e c h
+c a r e
+a t t o r n e y
+c l a i m s
+v o t e r s
+n e t w o r k
+b a l l
+p h i l i p p i n e
+f o o t b a l l
+s p o k e s w o m a n
+i n c i d e n t
+p r e v e n t
+w h y
+d e v e l o p i n g
+c i v i l i a n s
+e n g l i s h
+o b a m a
+i n t e r n e t
+r i c e
+s a d d a m
+y o u r
+u p d a t e s
+l e t
+d o i n g
+a i r c r a f t
+f l i g h t
+a n g e l e s
+i n t e l l i g e n c e
+p h i l i p p i n e s
+f a t h e r
+c r e d i t
+a l l i a n c e
+t e r m s
+r a i s e d
+i r a n i a n
+c h a n g e s
+s y r i a
+v a r i o u s
+i n d o n e s i a n
+l i
+i r e l a n d
+l e a v i n g
+d e c l i n e d
+c o m m o n
+i n j u r y
+t r e a t m e n t
+a v a i l a b l e
+c h a m p i o n s
+e l e c t e d
+s u m m e r
+d a t a
+o v e r s e a s
+p a i d
+c e n t u r y
+n o t h i n g
+f i r m
+r e l i g i o u s
+s w i t z e r l a n d
+o f f e r e d
+c h a m p i o n s h i p s
+t h o u g h t
+c a n d i d a t e
+c o n s i d e r e d
+r i s k
+c r i m e
+g o v e r n o r
+f i l m
+r a l l y
+f l o r i d a
+t e r r o r
+d o u b l e
+e q u i p m e n t
+j e r u s a l e m
+c a r r y i n g
+p e r s o n
+f e e l
+t e r r i t o r y
+a l
+c o m m e r c i a l
+u k r a i n e
+b o d i e s
+p r o t e s t s
+n e t h e r l a n d s
+f i n i s h
+a c c e s s
+t a r g e t
+a u s t r i a
+s o u r c e
+r e p r e s e n t a t i v e s
+s p e n t
+j e w i s h
+p o t e n t i a l
+r i s i n g
+t r e a t y
+c a n a d i a n
+a g e
+c a
+s p e n d i n g
+n e c e s s a r y
+r a i n
+z o n e
+c a r s
+p r o m o t e
+n a t u r a l
+d a m a g e
+f o c u s
+w e a t h e r
+p o l i c i e s
+p r o t e c t
+a i d s
+c o
+g i v i n g
+b c
+b a c k e d
+l a n k a
+a p p e a l
+r e j e c t e d
+f a n s
+b a d
+s o u t h e a s t
+r i v a l
+p l a n n i n g
+b o s n i a
+c o m e s
+b u y
+s o v i e t
+h o t e l
+d u t c h
+q u e s t i o n
+t a i p e i
+b o o s t
+c o s t s
+i n s t i t u t e
+s o c i e t y
+s h o o t i n g
+t h e m s e l v e s
+e v e n t s
+k i n d
+p a p e r
+w o r k e d
+c o n s t i t u t i o n
+u r g e n t
+s e t t l e m e n t
+e a r n i n g s
+j o s e
+m o t h e r
+a c c i d e n t
+f a c t
+d r o p
+r a n g e
+h a n d s
+s e e k
+h u g e
+l a w y e r
+s t a r t i n g
+h e a r t
+c o m m a n d e r
+t o u r i s m
+p a s s e n g e r s
+s u s p e c t s
+h i g h e s t
+p o p u l a r
+s t a b i l i t y
+s u p r e m e
+b u s
+r o b e r t
+b a t t l e
+p r o g r a m s
+c u b a
+w i n s
+d r u g s
+s u r v e y
+h o s t
+m u r d e r
+d a t e
+g u l f
+w i l l i a m s
+s e n d
+s u f f e r e d
+p e n a l t y
+k e p t
+s t a d i u m
+c i t i z e n s
+f i g u r e
+h e a d q u a r t e r s
+g u a r d
+p u b l i s h e d
+s t a n d
+t e n n i s
+c r e a t e
+b e g i n n i n g
+e v e n i n g
+p h o n e
+f o o t
+r u l e d
+c a s h
+s o l d
+c h i c a g o
+p o l a n d
+d e m o c r a t s
+r e f o r m s
+b o s n i a n
+s u r e
+c h i l d
+m a y o r
+a t t e n d
+l e a d e r s h i p
+e m p l o y e e s
+t e l e p h o n e
+l o s s e s
+b o r n
+a s s i s t a n c e
+t h i n g
+t r a i n
+s u p p l y
+e i t h e r
+b u i l t
+l a u n c h
+c r u d e
+m o v i n g
+g r e e c e
+t r a c k
+r a i s e
+d r i v e
+r e s p o n s i b i l i t y
+f e d e r a t i o n
+c o l o m b i a
+g r e e n
+c o n c e r n e d
+c a n d i d a t e s
+n e w s p a p e r s
+r e v i e w
+i n t e r i o r
+d e b t
+w h o l e
+t e x a s
+m o s t l y
+r e l i e f
+f a r m e r s
+g o o d s
+p a k i s t a n i
+d e g r e e s
+s e l l
+d e t a i n e d
+s w i s s
+c r i m i n a l
+d e c a d e s
+m i s s i l e
+a b o v e
+d r a w
+p a s s e d
+e x p l o s i o n
+m a k e s
+l a w s
+b a n g l a d e s h
+t a l k
+m a d r i d
+m a s s
+c o n v i c t e d
+i t e m s
+m e d a l
+s u c c e s s
+s e a t s
+q u i c k l y
+c a l l i n g
+k i m
+t r a f f i c
+d i r e c t
+o r g a n i z a t i o n s
+l e v e l s
+s e r v e
+a d d r e s s
+s t r e s s e d
+s t a n d i n g
+w a n g
+d e c l a r e d
+j a m e s
+c a p t a i n
+t h r e a t e n e d
+p r o m i s e d
+s u d a n
+v a n
+p a s s
+e n v i r o n m e n t a l
+r a t h e r
+w o r s t
+p o u n d s
+b l u e
+s i x t h
+m e t e r
+i n c l u d e s
+m u s i c
+r e d u c e
+t a k e s
+v o t e s
+r e s c u e
+c o m p l e t e d
+s e a r c h
+i n n i n g s
+v e h i c l e s
+c l a i m
+t r a n s p o r t
+a v o i d
+i n c o m e
+p o l l
+a f f e c t e d
+g e o r g i a
+g a i n e d
+w o
+r e
+v i s i t i n g
+r e s p o n s i b l e
+e f f e c t
+p o l l s
+h e a r i n g
+l o s i n g
+e s t a b l i s h e d
+f a i r
+g i a n t
+c h a l l e n g e
+f e e t
+p r o p e r t y
+t e s t s
+l e g
+a g r i c u l t u r e
+l o n g e r
+d e a t h s
+s q u a r e
+p a r t i c u l a r l y
+d i s p u t e
+b
+e n t e r p r i s e s
+v o l u m e
+c a r r y
+m i d
+s e p a r a t e
+i d e n t i f i e d
+i t s e l f
+h e a d e d
+a n o n y m i t y
+p a r l i a m e n t a r y
+c r a s h
+r e m a i n i n g
+j o u r n a l i s t s
+i n c r e a s i n g
+s t a t i s t i c s
+d e s c r i b e d
+b u r e a u
+i n j u r i e s
+p r o v i d e d
+j o i n e d
+i m m e d i a t e
+d e b a t e
+i m p a c t
+m e s s a g e
+m e e t i n g s
+r e q u e s t
+s c h o o l s
+o c c u r r e d
+r e m a r k s
+c o m m i t t e d
+p r o t e s t e r s
+t o u g h
+s p o k e
+s t r i p
+f a c e s
+c r o w d
+s h o w s
+w a r n i n g
+s t o r y
+q u a l i t y
+p e t e r
+f r e e d o m
+d e v e l o p
+m a r t i n
+p e r s o n a l
+s e r b i a
+a n y t h i n g
+b l a m e d
+i n t e r e s t s
+n e i g h b o r i n g
+d o c t o r s
+f l i g h t s
+s h i p
+r e g i m e
+b l a i r
+u n i t
+a g e n c i e s
+a f p
+s u g g e s t e d
+l a c k
+s e l l i n g
+a n n a n
+y u g o s l a v i a
+l a
+c o n s u m e r
+s u s p e n d e d
+s t o p p e d
+c o m m e n t s
+c o m p u t e r
+c o n s i d e r
+a i r l i n e s
+l e b a n e s e
+p r e p a r e d
+d i a l o g u e
+e x p e c t
+t w i c e
+p u t i n
+a l l e g a t i o n s
+b r o w n
+a c c e p t
+a p p r o v a l
+w i d e
+n e a r b y
+s y s t e m s
+v i e w
+p u s h
+p r o b a b l y
+e v e r y t h i n g
+d r a f t
+t r a d i t i o n a l
+s t a t u s
+s t r u c k
+s e i z e d
+p a r t l y
+s t a n d a r d
+h u s s e i n
+p o v e r t y
+d o z e n s
+r e g i o n s
+c r i c k e t
+l o a n s
+e
+b o o k
+b a s i s
+a n n o u n c e m e n t
+r u r a l
+s e r b s
+a d d i t i o n
+g r e e k
+c o m p l e t e
+r o o m
+g r e a t e r
+a l l e g e d l y
+f i n a l s
+f a c i n g
+l i m i t e d
+c u t s
+r i c h a r d
+b u s i n e s s e s
+l i n k e d
+p e a c e f u l
+c r e w
+t o u r i s t s
+m a i n l y
+p r i s o n e r s
+p o w e r f u l
+c r o a t i a
+f i l e d
+k u w a i t
+f o r u m
+r e s e r v e
+m i l a n
+b l a s t
+a n n i v e r s a r y
+a t t e n d e d
+e n d i n g
+d e v e l o p e d
+c e r t a i n
+b e l o w
+f e l t
+p r o v i n c i a l
+c y p r u s
+c r i t i c i z e d
+o p p o r t u n i t y
+s m i t h
+p o l i t i c s
+s e l f
+h u m a n i t a r i a n
+r e a s o n
+l a w y e r s
+r e v e n u e
+d o c u m e n t s
+w r o t e
+q u e s t i o n s
+n o r w a y
+d o w
+p a n e l
+f e a r
+s e n t e n c e d
+b a n n e d
+c i v i l i a n
+c u l t u r a l
+p e r s o n n e l
+b e l g i u m
+a b u
+c a p a c i t y
+a m o u n t
+s e c u r i t i e s
+b l o o d
+s i g n i f i c a n t
+e x p e r i e n c e
+a s e a n
+h o u s i n g
+j o h n s o n
+p h o t o s
+r o y a l
+i m p o r t s
+a d d i t i o n a l
+y e l t s i n
+c d y
+h e a r d
+t h o m a s
+b a n k i n g
+l e a d s
+v i s i t e d
+f e a r s
+u g a n d a
+d r i v e r
+c o n t r o l l e d
+d e m a n d s
+i n s t i t u t i o n s
+a l i
+c h r i s t i a n
+s t o r m
+f o r e c a s t
+g r a f
+f i g h t e r s
+s t r e e t s
+r e s p e c t
+s p o t
+w e b
+m i s s e d
+s c i e n c e
+h e a d s
+h i t s
+m a s s i v e
+c u l t u r e
+c o u p l e
+v e n e z u e l a
+r e p o r t e d l y
+i n s u r a n c e
+s p r e a d
+s o l u t i o n
+p l a c e d
+s e r v e d
+f a c i l i t i e s
+s t r a t e g y
+t e c h n i c a l
+s t e p s
+d e e p
+h o p e d
+d e c i d e
+s a l e
+j a i l
+d i s c u s s e d
+s a v e
+n e p a l
+a r a b i a
+e n v o y
+a t t a c k e d
+w a y s
+r e c e i v e
+h a p p y
+h a l l
+g u i l t y
+p r a c t i c e
+l o v e
+e u r o s
+o p e r a t i n g
+c h a n g e d
+b o s t o n
+d e c a d e
+d e f i c i t
+p r o d u c t
+l i n e s
+p a t i e n t s
+f r i e n d s
+s y d n e y
+a c c o r d
+t i e d
+s p e e d
+w o r d s
+t i e
+s c o r e
+c o n d u c t e d
+c r i t i c i s m
+m u s l i m s
+b r o t h e r
+c l a s s
+r o m a n i a
+h e l p i n g
+f a s t
+h a p p e n e d
+d e f e n d i n g
+n a v y
+w i t n e s s e s
+f u l l y
+s u s p e c t
+i s l a n d s
+m a i n t a i n
+p r e s e n c e
+j a k a r t a
+p a c k a g e
+y a r d s
+g a i n
+a c c o u n t
+s q u a d
+s h a r o n
+w i n g
+a c t i o n s
+a t h e n s
+s t r a t e g i c
+s t r e n g t h e n
+f r i e n d l y
+d e s t r o y e d
+a p p a r e n t l y
+c o n s e r v a t i v e
+g a i n s
+f a i l u r e
+f u t u r e s
+s h o t s
+r e l a t i o n s h i p
+c o m m i s s i o n e r
+m a l a y s i a n
+r e q u i r e d
+a t l a n t a
+a g r e e
+d e f e a t
+s t r i k e r
+a d v a n c e d
+b r a z i l i a n
+a s s e t s
+h o u s e s
+s u p p l i e s
+s a f e
+m i l l i o n s
+s o u g h t
+f r e s h
+v i d e o
+p r o s e c u t o r
+p u l l e d
+v e h i c l e
+t o l l
+p a r e n t s
+c e a s e
+a c t i v i s t s
+o r g a n i z e d
+e n t e r e d
+s h i i t e
+l a n g u a g e
+a b b a s
+b i n
+p r e v i o u s l y
+c l o s i n g
+w o r k s
+t e r r o r i s t s
+t o n y
+c o v e r
+f o l l o w
+l e g i s l a t i v e
+r i c h
+c l a s h e s
+i m p o s e d
+r a n
+m c c a i n
+s u c c e s s f u l
+s e v e n t h
+s c o r i n g
+c a u g h t
+a p p o i n t e d
+a l l i e s
+a d m i t t e d
+w o r l d w i d e
+o r d e r s
+d e m a n d e d
+c r e a t e d
+r a n k e d
+m i l i t a n t
+i n v e s t i g a t o r s
+s h o w i n g
+p o s s i b i l i t y
+s e a t
+d a u g h t e r
+s i t e s
+s h o r t l y
+c o m m e r c e
+n e t a n y a h u
+a d v a n c e
+a i r l i n e
+f i r m s
+a b r o a d
+f o u n d a t i o n
+c o m m i t m e n t
+p l e d g e d
+k i l l
+r e p r e s e n t a t i v e
+n o r t h w e s t
+s c e n e
+b e a t i n g
+i m p r o v e d
+r e s u m e
+w h o m
+s l i g h t l y
+v o t i n g
+b o m b i n g s
+s e r i o u s l y
+s e t t i n g
+c a r l o s
+e f f e c t i v e
+h k
+r e g u l a r
+j i a n g
+p r i n c e
+d e c l i n e
+b a y
+n o r t h e a s t
+s o l d i e r
+r e a c h i n g
+a g r e e m e n t s
+m i k e
+h u r t
+c r i t i c a l
+i d e a
+m i l o s e v i c
+f i s c a l
+t a r g e t s
+a g r i c u l t u r a l
+m u s h a r r a f
+d e s i g n e d
+o v e r n i g h t
+b o y
+d o z e n
+p r o d u c e
+c a l m
+s t a n d a r d s
+l e g i s l a t i o n
+s e n t e n c e
+w i t h d r a w a l
+s e e d e d
+c o m p o s i t e
+t r a d e d
+w i n t e r
+d a v i s
+t r u s t
+c l i m a t e
+i n d u s t r i e s
+p r o f i t s
+v o t e d
+c a m b o d i a
+s y r i a n
+s i g n s
+l o a n
+s t e e l
+e l e c t r i c i t y
+t e h r a n
+c i t i n g
+h u s b a n d
+b i t
+c o m b a t
+h a n d e d
+f e s t i v a l
+i m f
+p r e s i d e n c y
+c a p t u r e d
+s t u d e n t
+f i n e
+s t a t i o n s
+s i l v e r
+c h a v e z
+i n t e r
+m o m e n t
+t a b l e
+c o u p
+p o p e
+p r o v i n c e s
+a h m e d
+b u i l d i n g s
+o u t p u t
+l i b e r a t i o n
+m o n e t a r y
+c l o s e r
+c o l l e g e
+f l u
+a d v a n t a g e
+a s s i s t a n t
+g o n e
+s e c r e t
+x
+c a t h o l i c
+n a m e s
+l i s t e d
+f i n a l l y
+c a n c e r
+p r o d u c e d
+m e a s u r e
+f l e d
+l a r g e l y
+d e f e a t e d
+c o n g o
+b a s i c
+j e a n
+l o s e
+p r i z e
+b a n g k o k
+a s k
+f r a n c i s c o
+r e g i s t e r e d
+d i s a s t e r
+g o l f
+i n d i v i d u a l
+c o n t i n u e s
+w t o
+i n i t i a l
+a n y o n e
+q u a k e
+f a c e d
+s c i e n t i s t s
+m o b i l e
+p o s i t i o n s
+f i e l d s
+r e c o v e r y
+m u s e u m
+n u m b e r s
+d e n m a r k
+m a n i l a
+h o l d s
+c e n t
+e x
+e s t a b l i s h
+w i d e l y
+o f f i c e s
+i n s i s t e d
+u n i t s
+k a s h m i r
+r e f e r e n d u m
+l o c a t e d
+u p o n
+a l l o w i n g
+s c a l e
+o p p o s e d
+w a t c h
+i n d i c a t e d
+p a r t n e r
+e a r t h q u a k e
+s c a n d a l
+e v e r y o n e
+a p p r o a c h
+t r u c k
+i m p o r t a n c e
+t h r e a t s
+p o r t u g a l
+s e x
+r e c o r d s
+s u p e r
+s t o o d
+c o n t a c t
+m a t e r i a l s
+v i o l e n t
+p l a c e s
+a n a l y s t
+a d d s
+a l o n e
+g o e s
+m o v i e
+e x p e c t s
+a r t
+s e o u l
+m e x i c a n
+y e s t e r d a y
+p l a n e s
+n i n t h
+o n l i n e
+h e l i c o p t e r
+i m m i g r a t i o n
+p a r t n e r s
+i n f r a s t r u c t u r e
+b o a t
+v i s i t s
+n o r m a l
+s t a k e
+g u e r r i l l a s
+m a c a o
+w i l l i n g
+s u n
+a w a r d
+t e l l
+s o u t h w e s t
+s p o r t
+e n t e r
+r e s o l v e
+c h a n c e s
+m i a m i
+e l
+e n t i r e
diff --git a/training/cllh_observer.cc b/training/crf/cllh_observer.cc
index 58232769..4ec2fa65 100644
--- a/training/cllh_observer.cc
+++ b/training/crf/cllh_observer.cc
@@ -45,7 +45,7 @@ void ConditionalLikelihoodObserver::NotifyAlignmentForest(const SentenceMetadata
cerr << "DIFF. ERR! log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl;
exit(1);
}
- assert(!isnan(log_ref_z));
+ assert(!std::isnan(log_ref_z));
acc_obj += (cur_obj - log_ref_z);
trg_words += smeta.GetReference().size();
}
diff --git a/training/cllh_observer.h b/training/crf/cllh_observer.h
index 0de47331..0de47331 100644
--- a/training/cllh_observer.h
+++ b/training/crf/cllh_observer.h
diff --git a/training/mpi_batch_optimize.cc b/training/crf/mpi_batch_optimize.cc
index 6432f4a2..2eff07e4 100644
--- a/training/mpi_batch_optimize.cc
+++ b/training/crf/mpi_batch_optimize.cc
@@ -142,7 +142,7 @@ struct TrainingObserver : public DecoderObserver {
cerr << "DIFF. ERR! log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl;
exit(1);
}
- assert(!isnan(log_ref_z));
+ assert(!std::isnan(log_ref_z));
ref_exp -= cur_model_exp;
acc_grad -= ref_exp;
acc_obj += (cur_obj - log_ref_z);
diff --git a/training/crf/mpi_baum_welch.cc b/training/crf/mpi_baum_welch.cc
new file mode 100644
index 00000000..d69b1769
--- /dev/null
+++ b/training/crf/mpi_baum_welch.cc
@@ -0,0 +1,316 @@
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+
+#include "config.h"
+#ifdef HAVE_MPI
+#include <boost/mpi/timer.hpp>
+#include <boost/mpi.hpp>
+namespace mpi = boost::mpi;
+#endif
+
+#include <boost/unordered_map.hpp>
+#include <boost/functional/hash.hpp>
+#include <boost/shared_ptr.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "sentence_metadata.h"
+#include "verbose.h"
+#include "hg.h"
+#include "prob.h"
+#include "inside_outside.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "stringlib.h"
+#include "fdict.h"
+#include "weights.h"
+#include "sparse_vector.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("input_weights,w",po::value<string>(),"Input feature weights file")
+ ("iterations,n",po::value<unsigned>()->default_value(50), "Number of training iterations")
+ ("training_data,t",po::value<string>(),"Training data")
+ ("decoder_config,c",po::value<string>(),"Decoder configuration file");
+ po::options_description clo("Command line options");
+ clo.add_options()
+ ("config", po::value<string>(), "Configuration file")
+ ("help,h", "Print this help message and exit");
+ po::options_description dconfig_options, dcmdline_options;
+ dconfig_options.add(opts);
+ dcmdline_options.add(opts).add(clo);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("config")) {
+ ifstream config((*conf)["config"].as<string>().c_str());
+ po::store(po::parse_config_file(config, dconfig_options), *conf);
+ }
+ po::notify(*conf);
+
+ if (conf->count("help") || !conf->count("input_weights") || !(conf->count("training_data")) || !conf->count("decoder_config")) {
+ cerr << dcmdline_options << endl;
+ return false;
+ }
+ return true;
+}
+
+void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c) {
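+  // shard the corpus round-robin across MPI processes: rank r keeps
+  // the lines whose (0-based) line number satisfies lc % size == r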
+ ReadFile rf(fname);
+ istream& in = *rf.stream();
+ string line;
+ int lc = 0;
+ while(in) {
+ getline(in, line);
+ if (!in) break;
+ if (lc % size == rank) c->push_back(line);
+ ++lc;
+ }
+}
+
+static const double kMINUS_EPSILON = -1e-6;
+
+struct TrainingObserver : public DecoderObserver {
+ void Reset() {
+ acc_grad.clear();
+ acc_obj = 0;
+ total_complete = 0;
+ trg_words = 0;
+ }
+
+ void SetLocalGradientAndObjective(vector<double>* g, double* o) const {
+ *o = acc_obj;
+ for (SparseVector<double>::const_iterator it = acc_grad.begin(); it != acc_grad.end(); ++it)
+ (*g)[it->first] = it->second;
+ }
+
+ virtual void NotifyDecodingStart(const SentenceMetadata& smeta) {
+ state = 1;
+ }
+
+  // compute model expectations; log Z here is the sentence's marginal log-likelihood
+ virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+ assert(state == 1);
+    trg_words += smeta.GetSourceLength();  // in unsupervised training the observed words are the source side
+ state = 2;
+ SparseVector<prob_t> exps;
+ const prob_t z = InsideOutside<prob_t,
+ EdgeProb,
+ SparseVector<prob_t>,
+ EdgeFeaturesAndProbWeightFunction>(*hg, &exps);
+ exps /= z;
+ for (SparseVector<prob_t>::iterator it = exps.begin(); it != exps.end(); ++it)
+ acc_grad.add_value(it->first, it->second.as_float());
+
+ acc_obj += log(z);
+ }
+
+ // compute "empirical" expectations, numerator of objective
+ virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+ cerr << "Shouldn't get an alignment forest!\n";
+ abort();
+ }
+
+ virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) {
+ ++total_complete;
+ }
+
+ int total_complete;
+ SparseVector<double> acc_grad;
+ double acc_obj;
+ unsigned trg_words;
+ int state;
+};
+
+void ReadConfig(const string& ini, vector<string>* out) {
+ ReadFile rf(ini);
+ istream& in = *rf.stream();
+ while(in) {
+ string line;
+ getline(in, line);
+ if (!in) continue;
+ out->push_back(line);
+ }
+}
+
+void StoreConfig(const vector<string>& cfg, istringstream* o) {
+ ostringstream os;
+ for (int i = 0; i < cfg.size(); ++i) { os << cfg[i] << endl; }
+ o->str(os.str());
+}
+
+#if 0
+template <typename T>
+struct VectorPlus : public binary_function<vector<T>, vector<T>, vector<T> > {
+ vector<T> operator()(const vector<int>& a, const vector<int>& b) const {
+ assert(a.size() == b.size());
+ vector<T> v(a.size());
+ transform(a.begin(), a.end(), b.begin(), v.begin(), plus<T>());
+ return v;
+ }
+};
+#endif
+
+int main(int argc, char** argv) {
+#ifdef HAVE_MPI
+ mpi::environment env(argc, argv);
+ mpi::communicator world;
+ const int size = world.size();
+ const int rank = world.rank();
+#else
+ const int size = 1;
+ const int rank = 0;
+#endif
+ SetSilent(true); // turn off verbose decoder output
+ register_feature_functions();
+
+ po::variables_map conf;
+ if (!InitCommandLine(argc, argv, &conf)) return 1;
+ const unsigned iterations = conf["iterations"].as<unsigned>();
+
+ // load cdec.ini and set up decoder
+ vector<string> cdec_ini;
+ ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini);
+ istringstream ini;
+ StoreConfig(cdec_ini, &ini);
+ Decoder* decoder = new Decoder(&ini);
+ if (decoder->GetConf()["input"].as<string>() != "-") {
+ cerr << "cdec.ini must not set an input file\n";
+ return 1;
+ }
+
+ // load initial weights
+ if (rank == 0) { cerr << "Loading weights...\n"; }
+ vector<weight_t>& lambdas = decoder->CurrentWeightVector();
+ Weights::InitFromFile(conf["input_weights"].as<string>(), &lambdas);
+ if (rank == 0) { cerr << "Done loading weights.\n"; }
+
+ // freeze feature set (should be optional?)
+ const bool freeze_feature_set = true;
+ if (freeze_feature_set) FD::Freeze();
+
+ const int num_feats = FD::NumFeats();
+ if (rank == 0) cerr << "Number of features: " << num_feats << endl;
+ lambdas.resize(num_feats);
+
+ vector<double> gradient(num_feats, 0.0);
+ vector<double> rcv_grad;
+ rcv_grad.clear();
+ bool converged = false;
+
+ vector<string> corpus, test_corpus;
+ ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
+ assert(corpus.size() > 0);
+ if (conf.count("test_data"))
+ ReadTrainingCorpus(conf["test_data"].as<string>(), rank, size, &test_corpus);
+
+ // build map from feature id to the accumulator that should normalize
+ boost::unordered_map<std::string, boost::unordered_map<int, double>, boost::hash<std::string> > ccs;
+ vector<boost::unordered_map<int, double>* > cpd_to_acc;
+ if (rank == 0) {
+ cpd_to_acc.resize(num_feats);
+ for (unsigned f = 1; f < num_feats; ++f) {
+ string normalizer;
+ //0 ||| 7 9 ||| Bi:BOS_7=1 Bi:7_9=1 Bi:9_EOS=1 Id:a:7=1 Uni:7=1 Id:b:9=1 Uni:9=1 ||| 0
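+      // e.g. transitions "Bi:7_9" and "Bi:7_EOS" share normalizer "Bi:7";
+      //      emissions "Id:a:7" and "Id:b:9" go to "Emit::7" and "Emit::9"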
+ const string& fstr = FD::Convert(f);
+ if (fstr.find("Bi:") == 0) {
+ size_t pos = fstr.rfind('_');
+ if (pos < fstr.size())
+ normalizer = fstr.substr(0, pos);
+ } else if (fstr.find("Id:") == 0) {
+ size_t pos = fstr.rfind(':');
+ if (pos < fstr.size()) {
+ normalizer = "Emit:";
+ normalizer += fstr.substr(pos);
+ }
+ }
+ if (normalizer.size() > 0) {
+ boost::unordered_map<int, double>& acc = ccs[normalizer];
+ cpd_to_acc[f] = &acc;
+ }
+ }
+ }
+
+ TrainingObserver observer;
+ int iteration = 0;
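+  // EM loop: E-step on every worker, M-step on rank 0, broadcast, repeat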
+ while (!converged) {
+ ++iteration;
+ observer.Reset();
+#ifdef HAVE_MPI
+ mpi::timer timer;
+ world.barrier();
+#endif
+ if (rank == 0) {
+ cerr << "Starting decoding... (~" << corpus.size() << " sentences / proc)\n";
+      cerr << "  Testset size: " << test_corpus.size() << " sentences / proc\n";
+ for(boost::unordered_map<string, boost::unordered_map<int,double>, boost::hash<string> >::iterator it = ccs.begin(); it != ccs.end(); ++it)
+ it->second.clear();
+ }
+ for (int i = 0; i < corpus.size(); ++i)
+ decoder->Decode(corpus[i], &observer);
+ cerr << " process " << rank << '/' << size << " done\n";
+ fill(gradient.begin(), gradient.end(), 0);
+ double objective = 0;
+ observer.SetLocalGradientAndObjective(&gradient, &objective);
+
+ unsigned total_words = 0;
+#ifdef HAVE_MPI
+ double to = 0;
+ rcv_grad.resize(num_feats, 0.0);
+ mpi::reduce(world, &gradient[0], gradient.size(), &rcv_grad[0], plus<double>(), 0);
+ swap(gradient, rcv_grad);
+ rcv_grad.clear();
+
+ reduce(world, observer.trg_words, total_words, std::plus<unsigned>(), 0);
+ mpi::reduce(world, objective, to, plus<double>(), 0);
+ objective = to;
+#else
+ total_words = observer.trg_words;
+#endif
+ if (rank == 0) { // run optimizer only on rank=0 node
+ cerr << "TRAINING CORPUS: ln p(x)=" << objective << "\t log_2 p(x) = " << (objective/log(2)) << "\t cross entropy = " << (objective/log(2) / total_words) << "\t ppl = " << pow(2, (-objective/log(2) / total_words)) << endl;
+      // M-step, part 1: route each feature's expected count to its normalizer group
+      for (unsigned f = 1; f < num_feats; ++f) {
+        boost::unordered_map<int, double>* m = cpd_to_acc[f];
+        if (m && gradient[f]) {
+          (*m)[f] += gradient[f];
+        }
+      }
+      // M-step, part 2: set each weight to its smoothed log relative frequency
+      for(boost::unordered_map<string, boost::unordered_map<int,double>, boost::hash<string> >::iterator it = ccs.begin(); it != ccs.end(); ++it) {
+        const boost::unordered_map<int,double>& counts = it->second;
+        double z = 0;
+        for (boost::unordered_map<int,double>::const_iterator ci = counts.begin(); ci != counts.end(); ++ci)
+          z += ci->second + 1e-09;
+        const double lz = log(z);
+        for (boost::unordered_map<int,double>::const_iterator ci = counts.begin(); ci != counts.end(); ++ci)
+          lambdas[ci->first] = log(ci->second + 1e-09) - lz;
+      }
+ Weights::SanityCheck(lambdas);
+ Weights::ShowLargestFeatures(lambdas);
+
+ converged = (iteration == iterations);
+
+ string fname = "weights.cur.gz";
+ if (converged) { fname = "weights.final.gz"; }
+ ostringstream vv;
+ vv << "Objective = " << objective << " (eval count=" << iteration << ")";
+ const string svv = vv.str();
+ Weights::WriteToFile(fname, lambdas, true, &svv);
+ } // rank == 0
+ int cint = converged;
+#ifdef HAVE_MPI
+ mpi::broadcast(world, &lambdas[0], lambdas.size(), 0);
+ mpi::broadcast(world, cint, 0);
+ if (rank == 0) { cerr << " ELAPSED TIME THIS ITERATION=" << timer.elapsed() << endl; }
+#endif
+ converged = cint;
+ }
+ return 0;
+}
+
diff --git a/training/mpi_compute_cllh.cc b/training/crf/mpi_compute_cllh.cc
index 066389d0..066389d0 100644
--- a/training/mpi_compute_cllh.cc
+++ b/training/crf/mpi_compute_cllh.cc
diff --git a/training/mpi_extract_features.cc b/training/crf/mpi_extract_features.cc
index 6750aa15..6750aa15 100644
--- a/training/mpi_extract_features.cc
+++ b/training/crf/mpi_extract_features.cc
diff --git a/training/mpi_extract_reachable.cc b/training/crf/mpi_extract_reachable.cc
index 2a7c2b9d..2a7c2b9d 100644
--- a/training/mpi_extract_reachable.cc
+++ b/training/crf/mpi_extract_reachable.cc
diff --git a/training/mpi_flex_optimize.cc b/training/crf/mpi_flex_optimize.cc
index b52decdc..b52decdc 100644
--- a/training/mpi_flex_optimize.cc
+++ b/training/crf/mpi_flex_optimize.cc
diff --git a/training/mpi_online_optimize.cc b/training/crf/mpi_online_optimize.cc
index 993627f0..9e1ae34c 100644
--- a/training/mpi_online_optimize.cc
+++ b/training/crf/mpi_online_optimize.cc
@@ -5,6 +5,7 @@
#include <cassert>
#include <cmath>
#include <tr1/memory>
+#include <ctime>
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
@@ -41,6 +42,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
("training_agenda,a",po::value<string>(), "Text file listing a series of configuration files and the number of iterations to train using each configuration successively")
("minibatch_size_per_proc,s", po::value<unsigned>()->default_value(5), "Number of training instances evaluated per processor in each minibatch")
("optimization_method,m", po::value<string>()->default_value("sgd"), "Optimization method (sgd)")
+ ("max_walltime", po::value<unsigned>(), "Maximum walltime to run (in minutes)")
("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
("eta_0,e", po::value<double>()->default_value(0.2), "Initial learning rate for SGD (eta_0)")
("L1,1","Use L1 regularization")
@@ -143,7 +145,7 @@ struct TrainingObserver : public DecoderObserver {
cerr << "DIFF. ERR! log_model_z < log_ref_z: " << cur_obj << " " << log_ref_z << endl;
exit(1);
}
- assert(!isnan(log_ref_z));
+ assert(!std::isnan(log_ref_z));
ref_exp -= cur_model_exp;
acc_grad += ref_exp;
acc_obj += (cur_obj - log_ref_z);
@@ -304,6 +306,9 @@ int main(int argc, char** argv) {
int write_weights_every_ith = 100; // TODO configure
int titer = -1;
+ unsigned timeout = 0;
+ if (conf.count("max_walltime")) timeout = 60 * conf["max_walltime"].as<unsigned>();
+ const time_t start_time = time(NULL);
for (int ai = 0; ai < agenda.size(); ++ai) {
const string& cur_config = agenda[ai].first;
const unsigned max_iteration = agenda[ai].second;
@@ -330,15 +335,20 @@ int main(int argc, char** argv) {
if (rank == 0) {
converged = (iter == max_iteration);
Weights::SanityCheck(lambdas);
- Weights::ShowLargestFeatures(lambdas);
+      static int cc = 0; ++cc; if (cc > 1) { Weights::ShowLargestFeatures(lambdas); }  // suppress the feature dump on the very first iteration
string fname = "weights.cur.gz";
if (iter % write_weights_every_ith == 0) {
ostringstream o; o << "weights.epoch_" << (ai+1) << '.' << iter << ".gz";
fname = o.str();
}
+ const time_t cur_time = time(NULL);
+ if (timeout) {
+ if ((cur_time - start_time) > timeout) converged = true;
+ }
if (converged && ((ai+1)==agenda.size())) { fname = "weights.final.gz"; }
ostringstream vv;
- vv << "total iter=" << titer << " (of current config iter=" << iter << ") minibatch=" << size_per_proc << " sentences/proc x " << size << " procs. num_feats=" << x.size() << '/' << FD::NumFeats() << " passes_thru_data=" << (titer * size_per_proc / static_cast<double>(corpus.size())) << " eta=" << lr->eta(titer);
+ double minutes = (cur_time - start_time) / 60.0;
+ vv << "total walltime=" << minutes << "min iter=" << titer << " (of current config iter=" << iter << ") minibatch=" << size_per_proc << " sentences/proc x " << size << " procs. num_feats=" << x.size() << '/' << FD::NumFeats() << " passes_thru_data=" << (titer * size_per_proc / static_cast<double>(corpus.size())) << " eta=" << lr->eta(titer);
const string svv = vv.str();
cerr << svv << endl;
Weights::WriteToFile(fname, lambdas, true, &svv);
diff --git a/training/dep-reorder/conll2reordering-forest.pl b/training/dep-reorder/conll2reordering-forest.pl
deleted file mode 100755
index 3cd226be..00000000
--- a/training/dep-reorder/conll2reordering-forest.pl
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-my $script_dir; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
-my $FIRST_CONV = "$script_dir/scripts/conll2simplecfg.pl";
-my $CDEC = "$script_dir/../../decoder/cdec";
-
-our $tfile1 = "grammar1.$$";
-our $tfile2 = "text.$$";
-
-die "Usage: $0 parses.conll\n" unless scalar @ARGV == 1;
-open C, "<$ARGV[0]" or die "Can't read $ARGV[0]: $!";
-
-END { unlink $tfile1; unlink "$tfile1.cfg"; unlink $tfile2; }
-
-my $first = 1;
-open T, ">$tfile1" or die "Can't write $tfile1: $!";
-my $lc = 0;
-my $flag = 0;
-my @words = ();
-while(<C>) {
- print T;
- chomp;
- if (/^$/) {
- if ($first) { $first = undef; } else { if ($flag) { print "\n"; $flag = 0; } }
- $first = undef;
- close T;
- open SO, ">$tfile2" or die "Can't write $tfile2: $!";
- print SO "@words\n";
- close SO;
- @words=();
- `$FIRST_CONV < $tfile1 > $tfile1.cfg`;
- if ($? != 0) {
- die "Error code: $?";
- }
- my $cfg = `$CDEC -n -S 10000 -f scfg -g $tfile1.cfg -i $tfile2 --show_cfg_search_space 2>/dev/null`;
- if ($? != 0) {
- die "Error code: $?";
- }
- my @rules = split /\n/, $cfg;
- shift @rules; # get rid of output
- for my $rule (@rules) {
- my ($lhs, $f, $e, $feats) = split / \|\|\| /, $rule;
- $f =~ s/,\d\]/\]/g;
- $feats = 'TOP=1' unless $feats;
- if ($lhs =~ /\[Goal_\d+\]/) { $lhs = '[S]'; }
- print "$lhs ||| $f ||| $feats\n";
- if ($e eq '[1] [2]') {
- my ($a, $b) = split /\s+/, $f;
- $feats =~ s/=1$//;
- my ($x, $y) = split /_/, $feats;
- print "$lhs ||| $b $a ||| ${y}_$x=1\n";
- }
- $flag = 1;
- }
- open T, ">$tfile1" or die "Can't write $tfile1: $!";
- $lc = -1;
- } else {
- my ($ind, $word, @dmmy) = split /\s+/;
- push @words, $word;
- }
- $lc++;
-}
-close T;
-
diff --git a/training/dep-reorder/george.conll b/training/dep-reorder/george.conll
deleted file mode 100644
index 7eebb360..00000000
--- a/training/dep-reorder/george.conll
+++ /dev/null
@@ -1,4 +0,0 @@
-1 George _ GEORGE _ _ 2 X _ _
-2 hates _ HATES _ _ 0 X _ _
-3 broccoli _ BROC _ _ 2 X _ _
-
diff --git a/training/dep-reorder/scripts/conll2simplecfg.pl b/training/dep-reorder/scripts/conll2simplecfg.pl
deleted file mode 100755
index b101347a..00000000
--- a/training/dep-reorder/scripts/conll2simplecfg.pl
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-# 1 在 _ 10 _ _ 4 X _ _
-# 2 门厅 _ 3 _ _ 1 X _ _
-# 3 下面 _ 23 _ _ 4 X _ _
-# 4 。 _ 45 _ _ 0 X _ _
-
-my @ldeps;
-my @rdeps;
-@ldeps=(); for (my $i =0; $i <1000; $i++) { push @ldeps, []; }
-@rdeps=(); for (my $i =0; $i <1000; $i++) { push @rdeps, []; }
-my $rootcat = 0;
-my @cats = ('S');
-my $len = 0;
-my @noposcats = ('S');
-while(<>) {
- chomp;
- if (/^\s*$/) {
- write_cfg($len);
- $len = 0;
- @cats=('S');
- @noposcats = ('S');
- @ldeps=(); for (my $i =0; $i <1000; $i++) { push @ldeps, []; }
- @rdeps=(); for (my $i =0; $i <1000; $i++) { push @rdeps, []; }
- next;
- }
- $len++;
- my ($pos, $word, $d1, $xcat, $d2, $d3, $headpos, $deptype) = split /\s+/;
- my $cat = "C$xcat";
- my $catpos = $cat . "_$pos";
- push @cats, $catpos;
- push @noposcats, $cat;
- print "[$catpos] ||| $word ||| $word ||| Word=1\n";
- if ($headpos == 0) { $rootcat = $pos; }
- if ($pos < $headpos) {
- push @{$ldeps[$headpos]}, $pos;
- } else {
- push @{$rdeps[$headpos]}, $pos;
- }
-}
-
-sub write_cfg {
- my $len = shift;
- for (my $i = 1; $i <= $len; $i++) {
- my @lds = @{$ldeps[$i]};
- for my $ld (@lds) {
- print "[$cats[$i]] ||| [$cats[$ld],1] [$cats[$i],2] ||| [1] [2] ||| $noposcats[$ld]_$noposcats[$i]=1\n";
- }
- my @rds = @{$rdeps[$i]};
- for my $rd (@rds) {
- print "[$cats[$i]] ||| [$cats[$i],1] [$cats[$rd],2] ||| [1] [2] ||| $noposcats[$i]_$noposcats[$rd]=1\n";
- }
- }
- print "[S] ||| [$cats[$rootcat],1] ||| [1] ||| TOP=1\n";
-}
-
diff --git a/training/dpmert/Makefile.am b/training/dpmert/Makefile.am
new file mode 100644
index 00000000..b85bb275
--- /dev/null
+++ b/training/dpmert/Makefile.am
@@ -0,0 +1,27 @@
+bin_PROGRAMS = \
+ mr_dpmert_map \
+ mr_dpmert_reduce \
+ mr_dpmert_generate_mapper_input
+
+noinst_PROGRAMS = \
+ lo_test
+TESTS = lo_test
+
+mr_dpmert_generate_mapper_input_SOURCES = mr_dpmert_generate_mapper_input.cc line_optimizer.cc
+mr_dpmert_generate_mapper_input_LDADD = ../../decoder/libcdec.a ../../mteval/libmteval.a ../../utils/libutils.a
+
+# nbest2hg_SOURCES = nbest2hg.cc
+# nbest2hg_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lfst
+
+mr_dpmert_map_SOURCES = mert_geometry.cc ces.cc error_surface.cc mr_dpmert_map.cc line_optimizer.cc ces.h error_surface.h line_optimizer.h mert_geometry.h
+mr_dpmert_map_LDADD = ../../decoder/libcdec.a ../../mteval/libmteval.a ../../utils/libutils.a
+
+mr_dpmert_reduce_SOURCES = error_surface.cc ces.cc mr_dpmert_reduce.cc line_optimizer.cc mert_geometry.cc ces.h error_surface.h line_optimizer.h mert_geometry.h
+mr_dpmert_reduce_LDADD = ../../decoder/libcdec.a ../../mteval/libmteval.a ../../utils/libutils.a
+
+lo_test_SOURCES = lo_test.cc ces.cc mert_geometry.cc error_surface.cc line_optimizer.cc ces.h error_surface.h line_optimizer.h mert_geometry.h
+lo_test_LDADD = $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) ../../decoder/libcdec.a ../../mteval/libmteval.a ../../utils/libutils.a
+
+EXTRA_DIST = test_data dpmert.pl
+
+AM_CPPFLAGS = -DTEST_DATA=\"$(top_srcdir)/training/dpmert/test_data\" -DBOOST_TEST_DYN_LINK -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/training/dpmert/ces.cc b/training/dpmert/ces.cc
new file mode 100644
index 00000000..157b2d17
--- /dev/null
+++ b/training/dpmert/ces.cc
@@ -0,0 +1,90 @@
+#include "ces.h"
+
+#include <vector>
+#include <sstream>
+#include <boost/shared_ptr.hpp>
+
+// TODO, if AER is to be optimized again, we will need this
+// #include "aligner.h"
+#include "lattice.h"
+#include "mert_geometry.h"
+#include "error_surface.h"
+#include "ns.h"
+
+using namespace std;
+
+const bool minimize_segments = true; // if adjacent segments have equal scores, merge them
+
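+// Walk the hull's segments in order of increasing x, score each newly-seen
+// translation, and record only the delta of the sufficient statistics
+// relative to the previous segment (identical or equal-scoring neighbors
+// are merged when minimize_segments is set).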
+void ComputeErrorSurface(const SegmentEvaluator& ss,
+ const ConvexHull& ve,
+ ErrorSurface* env,
+ const EvaluationMetric* metric,
+ const Hypergraph& hg) {
+ vector<WordID> prev_trans;
+ const vector<boost::shared_ptr<MERTPoint> >& ienv = ve.GetSortedSegs();
+ env->resize(ienv.size());
+ SufficientStats prev_score; // defaults to 0
+ int j = 0;
+ for (unsigned i = 0; i < ienv.size(); ++i) {
+ const MERTPoint& seg = *ienv[i];
+ vector<WordID> trans;
+#if 0
+ if (type == AER) {
+ vector<bool> edges(hg.edges_.size(), false);
+ seg.CollectEdgesUsed(&edges); // get the set of edges in the viterbi
+ // alignment
+ ostringstream os;
+ const string* psrc = ss.GetSource();
+ if (psrc == NULL) {
+ cerr << "AER scoring in VEST requires source, but it is missing!\n";
+ abort();
+ }
+ size_t pos = psrc->rfind(" ||| ");
+ if (pos == string::npos) {
+ cerr << "Malformed source for AER: expected |||\nINPUT: " << *psrc << endl;
+ abort();
+ }
+ Lattice src;
+ Lattice ref;
+ LatticeTools::ConvertTextOrPLF(psrc->substr(0, pos), &src);
+ LatticeTools::ConvertTextOrPLF(psrc->substr(pos + 5), &ref);
+ AlignerTools::WriteAlignment(src, ref, hg, &os, true, 0, &edges);
+ string tstr = os.str();
+ TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans);
+ } else {
+#endif
+ seg.ConstructTranslation(&trans);
+ //}
+ //cerr << "Scoring: " << TD::GetString(trans) << endl;
+ if (trans == prev_trans) {
+ if (!minimize_segments) {
+ ErrorSegment& out = (*env)[j];
+ out.delta.fields.clear();
+ out.x = seg.x;
+ ++j;
+ }
+ //cerr << "Identical translation, skipping scoring\n";
+ } else {
+ SufficientStats score;
+ ss.Evaluate(trans, &score);
+ // cerr << "score= " << score->ComputeScore() << "\n";
+ //string x1; score.Encode(&x1); cerr << "STATS: " << x1 << endl;
+ const SufficientStats delta = score - prev_score;
+ //string x2; delta.Encode(&x2); cerr << "DELTA: " << x2 << endl;
+ //string xx; delta.Encode(&xx); cerr << xx << endl;
+ prev_trans.swap(trans);
+ prev_score = score;
+ if ((!minimize_segments) || (!delta.IsAdditiveIdentity())) {
+ ErrorSegment& out = (*env)[j];
+ out.delta = delta;
+ out.x = seg.x;
+ ++j;
+ }
+ }
+ }
+ // cerr << " In segments: " << ienv.size() << endl;
+ // cerr << "Out segments: " << j << endl;
+ assert(j > 0);
+ env->resize(j);
+}
+
diff --git a/training/dpmert/ces.h b/training/dpmert/ces.h
new file mode 100644
index 00000000..e4fa2080
--- /dev/null
+++ b/training/dpmert/ces.h
@@ -0,0 +1,16 @@
+#ifndef _CES_H_
+#define _CES_H_
+
+class ConvexHull;
+class Hypergraph;
+class SegmentEvaluator;
+class ErrorSurface;
+class EvaluationMetric;
+
+void ComputeErrorSurface(const SegmentEvaluator& ss,
+ const ConvexHull& convex_hull,
+ ErrorSurface* es,
+ const EvaluationMetric* metric,
+ const Hypergraph& hg);
+
+#endif
diff --git a/training/dpmert/divide_refs.py b/training/dpmert/divide_refs.py
new file mode 100755
index 00000000..b478f918
--- /dev/null
+++ b/training/dpmert/divide_refs.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+import sys
+
+(numRefs, outPrefix) = sys.argv[1:]
+numRefs = int(numRefs)
+
+outs = [open(outPrefix+str(i), "w") for i in range(numRefs)]
+
+i = 0
+for line in sys.stdin:
+ outs[i].write(line)
+ i = (i + 1) % numRefs
+
+for out in outs:
+ out.close()
diff --git a/training/dpmert/dpmert.pl b/training/dpmert/dpmert.pl
new file mode 100755
index 00000000..559420f5
--- /dev/null
+++ b/training/dpmert/dpmert.pl
@@ -0,0 +1,618 @@
+#!/usr/bin/env perl
+use strict;
+my @ORIG_ARGV=@ARGV;
+use Cwd qw(getcwd);
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; }
+
+# Load the local environment configuration (used for distributing jobs)
+use LocalConfig;
+use Getopt::Long;
+use File::Basename qw(basename);
+require "libcall.pl";
+
+my $QSUB_CMD = qsub_args(mert_memory());
+
+# Default settings
+my $srcFile; # deprecated
+my $refFiles; # deprecated
+my $default_jobs = env_default_jobs();
+my $bin_dir = $SCRIPT_DIR;
+my $util_dir = "$SCRIPT_DIR/../utils";
+die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
+my $FAST_SCORE="$bin_dir/../../mteval/fast_score";
+die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
+my $MAPINPUT = "$bin_dir/mr_dpmert_generate_mapper_input";
+my $MAPPER = "$bin_dir/mr_dpmert_map";
+my $REDUCER = "$bin_dir/mr_dpmert_reduce";
+my $parallelize = "$util_dir/parallelize.pl";
+my $libcall = "$util_dir/libcall.pl";
+my $sentserver = "$util_dir/sentserver";
+my $sentclient = "$util_dir/sentclient";
+my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm";
+
+my $SCORER = $FAST_SCORE;
+die "Can't find $MAPPER" unless -x $MAPPER;
+my $cdec = "$bin_dir/../../decoder/cdec";
+die "Can't find decoder in $cdec" unless -x $cdec;
+die "Can't find $parallelize" unless -x $parallelize;
+die "Can't find $libcall" unless -e $libcall;
+my $decoder = $cdec;
+my $lines_per_mapper = 200;
+my $rand_directions = 15;
+my $iteration = 1;
+my $best_weights;
+my $max_iterations = 15;
+my $optimization_iters = 6;
+my $jobs = $default_jobs; # number of decode nodes
+my $pmem = "9g";
+my $disable_clean = 0;
+my %seen_weights;
+my $help = 0;
+my $epsilon = 0.0001;
+my $last_score = -10000000;
+my $metric = "ibm_bleu";
+my $dir;
+my $iniFile;
+my $weights;
+my $initialWeights;
+my $bleu_weight=1;
+my $use_make = 1; # use make to parallelize line search
+my $useqsub;
+my $pass_suffix = '';
+my $devset;
+# Process command-line options
+if (GetOptions(
+ "config=s" => \$iniFile,
+ "weights=s" => \$initialWeights,
+ "devset=s" => \$devset,
+ "jobs=i" => \$jobs,
+ "pass-suffix=s" => \$pass_suffix,
+ "help" => \$help,
+ "qsub" => \$useqsub,
+ "iterations=i" => \$max_iterations,
+ "pmem=s" => \$pmem,
+ "random-directions=i" => \$rand_directions,
+ "metric=s" => \$metric,
+ "source-file=s" => \$srcFile,
+ "output-dir=s" => \$dir,
+) == 0 || @ARGV!=0 || $help) {
+ print_help();
+ exit;
+}
+
+if ($useqsub) {
+ $use_make = 0;
+ die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
+}
+
+my @missing_args = ();
+if (defined $srcFile || defined $refFiles) {
+ die <<EOT;
+
+ The options --ref-files and --source-file are no longer supported.
+ Please specify the input file and its reference translations with
+ --devset FILE
+
+EOT
+}
+
+if (!defined $iniFile) { push @missing_args, "--config"; }
+if (!defined $devset) { push @missing_args, "--devset"; }
+if (!defined $initialWeights) { push @missing_args, "--weights"; }
+die "Please specify missing arguments: " . join (', ', @missing_args) . "\nUse --help for more information.\n" if (@missing_args);
+
+if ($metric =~ /^(combi|ter)$/i) {
+ $lines_per_mapper = 40;
+} elsif ($metric =~ /^meteor$/i) {
+ $lines_per_mapper = 2000; # start up time is really high for METEOR
+}
+
+
+my $nodelist;
+my $host =check_output("hostname"); chomp $host;
+my $bleu;
+my $interval_count = 0;
+my $logfile;
+my $projected_score;
+
+# used in sorting scores
+my $DIR_FLAG = '-r';
+if ($metric =~ /^ter$|^aer$/i) {
+ $DIR_FLAG = '';
+}
+
+unless ($dir){
+ $dir = "dpmert";
+}
+unless ($dir =~ /^\//){ # convert relative path to absolute path
+ my $basedir = check_output("pwd");
+ chomp $basedir;
+ $dir = "$basedir/$dir";
+}
+
+
+# Initializations and helper functions
+srand;
+
+my @childpids = ();
+my @cleanupcmds = ();
+
+sub cleanup {
+ print STDERR "Cleanup...\n";
+ for my $pid (@childpids){ unchecked_call("kill $pid"); }
+ for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); }
+ exit 1;
+};
+# Always call cleanup, no matter how we exit
+*CORE::GLOBAL::exit = sub{ cleanup(); };
+$SIG{INT} = "cleanup";
+$SIG{TERM} = "cleanup";
+$SIG{HUP} = "cleanup";
+
+my $decoderBase = basename($decoder); chomp $decoderBase;
+my $newIniFile = "$dir/$decoderBase.ini";
+my $inputFileName = "$dir/input";
+my $user = $ENV{"USER"};
+
+# process ini file
+-e $iniFile || die "Error: could not open $iniFile for reading\n";
+
+sub dirsize {
+ opendir ISEMPTY,$_[0];
+ return scalar(readdir(ISEMPTY))-1;
+}
+if (-e $dir) {
+  # allow a preexisting logfile and binaries, but not dpmert.pl outputs
+ die "ERROR: output directory $dir already exists (remove or use --output-dir dir)\n\n";
+} else {
+ mkdir "$dir" or die "Can't mkdir $dir: $!";
+ mkdir "$dir/hgs" or die;
+ mkdir "$dir/scripts" or die;
+ print STDERR <<EOT;
+ DECODER: $decoder
+ INI FILE: $iniFile
+ WORKING DIR: $dir
+ DEVSET: $devset
+ EVAL METRIC: $metric
+ MAX ITERATIONS: $max_iterations
+ PARALLEL JOBS: $jobs
+ HEAD NODE: $host
+ PMEM (DECODING): $pmem
+ INITIAL WEIGHTS: $initialWeights
+EOT
+}
+
+# Generate initial files and values
+check_call("cp $iniFile $newIniFile");
+check_call("cp $initialWeights $dir/weights.0");
+$iniFile = $newIniFile;
+
+split_devset($devset, "$dir/dev.input.raw", "$dir/dev.refs");
+my $refs = "-r $dir/dev.refs";
+my $newsrc = "$dir/dev.input";
+enseg("$dir/dev.input.raw", $newsrc);
+$srcFile = $newsrc;
+my $devSize = 0;
+open F, "<$srcFile" or die "Can't read $srcFile: $!";
+while(<F>) { $devSize++; }
+close F;
+
+unless($best_weights){ $best_weights = $weights; }
+unless($projected_score){ $projected_score = 0.0; }
+$seen_weights{$weights} = 1;
+
+my $random_seed = int(time / 1000);
+my $lastWeightsFile;
+my $lastPScore = 0;
+# main optimization loop
+while (1){
+ print STDERR "\n\nITERATION $iteration\n==========\n";
+
+ if ($iteration > $max_iterations){
+ print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n";
+ last;
+ }
+ # iteration-specific files
+ my $runFile="$dir/run.raw.$iteration";
+ my $onebestFile="$dir/1best.$iteration";
+ my $logdir="$dir/logs.$iteration";
+ my $decoderLog="$logdir/decoder.sentserver.log.$iteration";
+ my $scorerLog="$logdir/scorer.log.$iteration";
+ check_call("mkdir -p $logdir");
+
+
+ #decode
+ print STDERR "RUNNING DECODER AT ";
+ print STDERR unchecked_output("date");
+ my $im1 = $iteration - 1;
+ my $weightsFile="$dir/weights.$im1";
+ my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
+ my $pcmd;
+ if ($use_make) {
+ $pcmd = "cat $srcFile | $parallelize --workdir $dir --use-fork -p $pmem -e $logdir -j $jobs --";
+ } else {
+ $pcmd = "cat $srcFile | $parallelize --workdir $dir -p $pmem -e $logdir -j $jobs --";
+ }
+ my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_bash_call($cmd);
+ my $num_hgs;
+ my $num_topbest;
+ my $retries = 0;
+ while($retries < 5) {
+ $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l");
+ $num_topbest = check_output("wc -l < $runFile");
+ print STDERR "NUMBER OF HGs: $num_hgs\n";
+ print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n";
+ if($devSize == $num_hgs && $devSize == $num_topbest) {
+ last;
+ } else {
+ print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n";
+ sleep(3);
+ }
+ $retries++;
+ }
+ die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest);
+ my $dec_score = check_output("cat $runFile | $SCORER $refs -m $metric");
+ chomp $dec_score;
+ print STDERR "DECODER SCORE: $dec_score\n";
+
+ # save space
+ check_call("gzip -f $runFile");
+ check_call("gzip -f $decoderLog");
+
+ # run optimizer
+ print STDERR "RUNNING OPTIMIZER AT ";
+ print STDERR unchecked_output("date");
+ my $mergeLog="$logdir/prune-merge.log.$iteration";
+
+ my $score = 0;
+ my $icc = 0;
+ my $inweights="$dir/weights.$im1";
+ for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) {
+ print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n";
+ print STDERR unchecked_output("date");
+ $icc++;
+ $cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_call($cmd);
+ check_call("mkdir -p $dir/splag.$im1");
+ $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput.";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_call($cmd);
+ opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!";
+ my @shards = grep { /^mapinput\./ } readdir(DIR);
+ closedir DIR;
+ die "No shards!" unless scalar @shards > 0;
+ my $joblist = "";
+ my $nmappers = 0;
+ my @mapoutputs = ();
+ @cleanupcmds = ();
+ my %o2i = ();
+ my $first_shard = 1;
+ my $mkfile; # only used with makefiles
+ my $mkfilename;
+ if ($use_make) {
+ $mkfilename = "$dir/splag.$im1/domap.mk";
+ open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!";
+ print $mkfile "all: $dir/splag.$im1/map.done\n\n";
+ }
+ my @mkouts = (); # only used with makefiles
+ for my $shard (@shards) {
+ my $mapoutput = $shard;
+ my $client_name = $shard;
+ $client_name =~ s/mapinput.//;
+ $client_name = "dpmert.$client_name";
+ $mapoutput =~ s/mapinput/mapoutput/;
+ push @mapoutputs, "$dir/splag.$im1/$mapoutput";
+ $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
+ my $script = "$MAPPER -s $srcFile -m $metric $refs < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput";
+ if ($use_make) {
+ my $script_file = "$dir/scripts/map.$shard";
+ open F, ">$script_file" or die "Can't write $script_file: $!";
+ print F "#!/bin/bash\n";
+ print F "$script\n";
+ close F;
+ my $output = "$dir/splag.$im1/$mapoutput";
+ push @mkouts, $output;
+ chmod(0755, $script_file) or die "Can't chmod $script_file: $!";
+ if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
+ print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n";
+ } else {
+ my $script_file = "$dir/scripts/map.$shard";
+ open F, ">$script_file" or die "Can't write $script_file: $!";
+ print F "$script\n";
+ close F;
+ if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
+
+ $nmappers++;
+ my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
+ my $jobid = check_output("$qcmd");
+ chomp $jobid;
+      $jobid =~ s/^(\d+)(.*?)$/$1/g;
+      $jobid =~ s/^Your job (\d+) .*$/$1/;
+ push(@cleanupcmds, "qdel $jobid 2> /dev/null");
+ print STDERR " $jobid";
+ if ($joblist == "") { $joblist = $jobid; }
+ else {$joblist = $joblist . "\|" . $jobid; }
+ }
+ }
+ if ($use_make) {
+ print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n";
+ close $mkfile;
+ my $mcmd = "make -j $jobs -f $mkfilename";
+ print STDERR "\nExecuting: $mcmd\n";
+ check_call($mcmd);
+ } else {
+ print STDERR "\nLaunched $nmappers mappers.\n";
+ sleep 8;
+ print STDERR "Waiting for mappers to complete...\n";
+ while ($nmappers > 0) {
+ sleep 5;
+ my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '")));
+ $nmappers = scalar @livejobs;
+ }
+ print STDERR "All mappers complete.\n";
+ }
+ my $tol = 0;
+ my $til = 0;
+ for my $mo (@mapoutputs) {
+ my $olines = get_lines($mo);
+ my $ilines = get_lines($o2i{$mo});
+ $tol += $olines;
+ $til += $ilines;
+ die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines;
+ }
+ print STDERR "Results for $tol/$til lines\n";
+ print STDERR "\nSORTING AND RUNNING VEST REDUCER\n";
+ print STDERR unchecked_output("date");
+ $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -m $metric > $dir/redoutput.$im1";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_bash_call($cmd);
+ $cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1";
+ # sort returns failure even when it doesn't fail for some reason
+ my $best=unchecked_output("$cmd"); chomp $best;
+ print STDERR "$best\n";
+ my ($oa, $x, $xscore) = split /\|/, $best;
+ $score = $xscore;
+ print STDERR "PROJECTED SCORE: $score\n";
+ if (abs($x) < $epsilon) {
+ print STDERR "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n";
+ last;
+ }
+ my $psd = $score - $last_score;
+ $last_score = $score;
+ if (abs($psd) < $epsilon) {
+ print STDERR "\nOPTIMIZER: no score improvement: abs($psd) < $epsilon\n";
+ last;
+ }
+ my ($origin, $axis) = split /\s+/, $oa;
+
+ my %ori = convert($origin);
+ my %axi = convert($axis);
+
+ my $finalFile="$dir/weights.$im1-$opt_iter";
+ open W, ">$finalFile" or die "Can't write: $finalFile: $!";
+ my $norm = 0;
+ for my $k (sort keys %ori) {
+ my $dd = $ori{$k} + $axi{$k} * $x;
+ $norm += $dd * $dd;
+ }
+  $norm = sqrt($norm);
+  $norm = 1; # note: length normalization is disabled; the raw optimum point is kept
+ for my $k (sort keys %ori) {
+ my $v = ($ori{$k} + $axi{$k} * $x) / $norm;
+ print W "$k $v\n";
+ }
+ check_call("rm $dir/splag.$im1/*");
+ $inweights = $finalFile;
+ }
+ $lastWeightsFile = "$dir/weights.$iteration";
+ check_call("cp $inweights $lastWeightsFile");
+ if ($icc < 2) {
+ print STDERR "\nREACHED STOPPING CRITERION: score change too little\n";
+ last;
+ }
+ $lastPScore = $score;
+ $iteration++;
+ print STDERR "\n==========\n";
+}
+
+check_call("cp $lastWeightsFile $dir/weights.final");
+print STDERR "\nFINAL WEIGHTS: $dir/weights.final\n(Use -w <this file> with the decoder)\n\n";
+print STDOUT "$dir/weights.final\n";
+exit 0;
+
+
+sub get_lines {
+ my $fn = shift @_;
+ open FL, "<$fn" or die "Couldn't read $fn: $!";
+ my $lc = 0;
+ while(<FL>) { $lc++; }
+ return $lc;
+}
+
+sub read_weights_file {
+ my ($file) = @_;
+ open F, "<$file" or die "Couldn't read $file: $!";
+ my @r = ();
+ my $pm = -1;
+ while(<F>) {
+ next if /^#/;
+ next if /^\s*$/;
+ chomp;
+ if (/^(.+)\s+(.+)$/) {
+ my $m = $1;
+ my $w = $2;
+ die "Weights out of order: $m <= $pm" unless $m > $pm;
+ push @r, $w;
+ } else {
+ warn "Unexpected feature name in weight file: $_";
+ }
+ }
+ close F;
+ return join ' ', @r;
+}
+
+sub update_weights_file {
+ my ($neww, $rfn, $rpts) = @_;
+ my @feats = @$rfn;
+ my @pts = @$rpts;
+ my $num_feats = scalar @feats;
+ my $num_pts = scalar @pts;
+ die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts;
+ open G, ">$neww" or die;
+ for (my $i = 0; $i < $num_feats; $i++) {
+ my $f = $feats[$i];
+ my $lambda = $pts[$i];
+ print G "$f $lambda\n";
+ }
+ close G;
+}
+
+sub enseg {
+ my $src = shift;
+ my $newsrc = shift;
+ open(SRC, $src);
+ open(NEWSRC, ">$newsrc");
+ my $i=0;
+ while (my $line=<SRC>){
+ chomp $line;
+ if ($line =~ /^\s*<seg/i) {
+ if($line =~ /id="[0-9]+"/) {
+ print NEWSRC "$line\n";
+ } else {
+ die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+ }
+ } else {
+ print NEWSRC "<seg id=\"$i\">$line</seg>\n";
+ }
+ $i++;
+ }
+ close SRC;
+ close NEWSRC;
+}
+
+sub print_help {
+
+ my $executable = basename($0); chomp $executable;
+ print << "Help";
+
+Usage: $executable [options]
+
+    Runs a complete MERT optimization. Required options are --weights,
+    --devset, and --config.
+
+Options:
+
+ --config <file> [-c <file>]
+ The decoder configuration file.
+
+ --devset <file> [-d <file>]
+ The source *and* references for the development set.
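+    Each line must contain the source sentence and its reference
+    translation(s), separated by |||:
+      source sentence ||| reference 1 ||| reference 2 ...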
+
+ --weights <file> [-w <file>]
+ A file specifying initial feature weights. The format is
+ FeatureName_1 value1
+ FeatureName_2 value2
+ **All and only the weights listed in <file> will be optimized!**
+
+ --metric <name>
+ Metric to optimize.
+ Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
+
+  --iterations <M>
+    Maximum number of iterations to run. If not specified, defaults
+    to 15.
+
+ --pass-suffix <S>
+ If the decoder is doing multi-pass decoding, the pass suffix "2",
+ "3", etc., is used to control what iteration of weights is set.
+
+  --random-directions <num>
+    MERT will attempt to optimize along all of the principal directions;
+    set this parameter to explore additional random directions as well.
+    Defaults to 15.
+
+ --output-dir <dir>
+ Directory for intermediate and output files.
+
+ --help
+ Print this message and exit.
+
+Job control options:
+
+ --jobs <I>
+ Number of decoder processes to run in parallel. [default=$default_jobs]
+
+  --qsub
+    Use qsub to run jobs in parallel (qsub must be configured in
+    environment/LocalConfig.pm)
+
+ --pmem <N>
+ Amount of physical memory requested for parallel decoding jobs
+ (used with qsub requests only)
+
+Help
+}
+
+sub convert {
+ my ($str) = @_;
+ my @ps = split /;/, $str;
+ my %dict = ();
+ for my $p (@ps) {
+ my ($k, $v) = split /=/, $p;
+ $dict{$k} = $v;
+ }
+ return %dict;
+}
+
+
+
+sub cmdline {
+ return join ' ',($0,@ORIG_ARGV);
+}
+
+#buggy: last arg gets quoted sometimes?
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]};
+my $shell_escape_in_quote=qr{[\\"\$`!]};
+
+sub escape_shell {
+ my ($arg)=@_;
+ return undef unless defined $arg;
+ if ($arg =~ /$is_shell_special/) {
+ $arg =~ s/($shell_escape_in_quote)/\\$1/g;
+ return "\"$arg\"";
+ }
+ return $arg;
+}
+
+sub escaped_shell_args {
+ return map {local $_=$_;chomp;escape_shell($_)} @_;
+}
+
+sub escaped_shell_args_str {
+ return join ' ',&escaped_shell_args(@_);
+}
+
+sub escaped_cmdline {
+ return "$0 ".&escaped_shell_args_str(@ORIG_ARGV);
+}
+
+sub split_devset {
+ my ($infile, $outsrc, $outref) = @_;
+ open F, "<$infile" or die "Can't read $infile: $!";
+ open S, ">$outsrc" or die "Can't write $outsrc: $!";
+ open R, ">$outref" or die "Can't write $outref: $!";
+ while(<F>) {
+ chomp;
+ my ($src, @refs) = split /\s*\|\|\|\s*/;
+ die "Malformed devset line: $_\n" unless scalar @refs > 0;
+ print S "$src\n";
+ print R join(' ||| ', @refs) . "\n";
+ }
+ close R;
+ close S;
+ close F;
+}
+
diff --git a/training/dpmert/error_surface.cc b/training/dpmert/error_surface.cc
new file mode 100644
index 00000000..515b67f8
--- /dev/null
+++ b/training/dpmert/error_surface.cc
@@ -0,0 +1,42 @@
+#include "error_surface.h"
+
+#include <cassert>
+#include <sstream>
+
+using namespace std;
+
+ErrorSurface::~ErrorSurface() {}
+
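+// Binary layout: [int segment count] then, per segment,
+// [double x][unsigned char len][len bytes: encoded SufficientStats delta].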
+void ErrorSurface::Serialize(std::string* out) const {
+ const int segments = this->size();
+ ostringstream os(ios::binary);
+ os.write((const char*)&segments,sizeof(segments));
+ for (int i = 0; i < segments; ++i) {
+ const ErrorSegment& cur = (*this)[i];
+ string senc;
+ cur.delta.Encode(&senc);
+ assert(senc.size() < 1024);
+ unsigned char len = senc.size();
+ os.write((const char*)&cur.x, sizeof(cur.x));
+ os.write((const char*)&len, sizeof(len));
+ os.write((const char*)&senc[0], len);
+ }
+ *out = os.str();
+}
+
+void ErrorSurface::Deserialize(const std::string& in) {
+ istringstream is(in, ios::binary);
+ int segments;
+ is.read((char*)&segments, sizeof(segments));
+ this->resize(segments);
+ for (int i = 0; i < segments; ++i) {
+ ErrorSegment& cur = (*this)[i];
+ unsigned char len;
+ is.read((char*)&cur.x, sizeof(cur.x));
+ is.read((char*)&len, sizeof(len));
+ string senc(len, '\0'); assert(senc.size() == len);
+ is.read((char*)&senc[0], len);
+ cur.delta = SufficientStats(senc);
+ }
+}
+
diff --git a/training/dpmert/error_surface.h b/training/dpmert/error_surface.h
new file mode 100644
index 00000000..bb65847b
--- /dev/null
+++ b/training/dpmert/error_surface.h
@@ -0,0 +1,24 @@
+#ifndef _ERROR_SURFACE_H_
+#define _ERROR_SURFACE_H_
+
+#include <vector>
+#include <string>
+
+#include "ns.h"
+
+class Score;
+
+struct ErrorSegment {
+ double x;
+ SufficientStats delta;
+ ErrorSegment() : x(0), delta() {}
+};
+
+class ErrorSurface : public std::vector<ErrorSegment> {
+ public:
+ ~ErrorSurface();
+ void Serialize(std::string* out) const;
+ void Deserialize(const std::string& in);
+};
+
+#endif
diff --git a/training/dpmert/line_mediator.pl b/training/dpmert/line_mediator.pl
new file mode 100755
index 00000000..bc2bb24c
--- /dev/null
+++ b/training/dpmert/line_mediator.pl
@@ -0,0 +1,116 @@
+#!/usr/bin/perl -w
+# hooks up two processes; the 2nd produces one line of output per line of input,
+# which the 1st expects, and the 1st starts off the communication
+
+# if you don't know how to fork/exec in a C program, this could be helpful under limited circumstances (e.g. to liaise with sentserver)
+
+#WARNING: because it waits for the result from command 2 after sending every line (and especially if command 1 does the same), using sentserver as command 2 won't actually buy you any real parallelism.
+
+use strict;
+use IPC::Open2;
+use POSIX qw(pipe dup2 STDIN_FILENO STDOUT_FILENO);
+
+my $quiet=!$ENV{DEBUG};
+$quiet=1 if $ENV{QUIET};
+sub info {
+ local $,=' ';
+ print STDERR @_ unless $quiet;
+}
+
+my $mode='CROSS';
+my $ser='DIRECT';
+$mode='PIPE' if $ENV{PIPE};
+$mode='SNAKE' if $ENV{SNAKE};
+$mode='CROSS' if $ENV{CROSS};
+$ser='SERIAL' if $ENV{SERIAL};
+$ser='DIRECT' if $ENV{DIRECT};
+$ser='SERIAL' if $mode eq 'SNAKE';
+info("mode: $mode\n");
+info("connection: $ser\n");
+
+
+my @c1;
+if (scalar @ARGV) {
+ do {
+ push @c1,shift
+ } while scalar @ARGV && $c1[$#c1] ne '--';
+}
+pop @c1;
+my @c2=@ARGV;
+@ARGV=();
+(scalar @c1 && scalar @c2) || die qq{
+usage: $0 cmd1 args -- cmd2 args
+all options are environment variables.
+DEBUG=1 env var enables debugging output.
+CROSS=1 (default) crosses stdin/stdout of cmd1 and cmd2 line by line (both must flush on newline); cmd2 produces one line of output per line of input, and cmd1 initiates the conversation (sends the first line). By default (DIRECT) the two streams are crossed directly, via two unidirectional posix pipes created before fork.
+SERIAL=1: no parallelism possible, but the lines exchanged are logged if DEBUG.
+if SNAKE then stdin -> c1 -> c2 -> c1 -> stdout.
+if PIPE then stdin -> c1 -> c2 -> stdout (same as shell c1|c2, but with SERIAL you can see the intermediate lines in real time; you could do similar with c1 | tee /dev/fd/2 | c2).
+DIRECT=1 (default) will override SERIAL=1.
+CROSS=1 (default) will override SNAKE or PIPE.
+};
+
+info("1 cmd:",@c1,"\n");
+info("2 cmd:",@c2,"\n");
+
+sub lineto {
+ select $_[0];
+ $|=1;
+ shift;
+ print @_;
+}
+
+if ($ser eq 'SERIAL') {
+ my ($R1,$W1,$R2,$W2);
+ my $c1p=open2($R1,$W1,@c1); # Open2 R W backward from Open3.
+ my $c2p=open2($R2,$W2,@c2);
+ if ($mode eq 'CROSS') {
+ while(<$R1>) {
+ info("1:",$_);
+ lineto($W2,$_);
+ last unless defined ($_=<$R2>);
+ info("1|2:",$_);
+ lineto($W1,$_);
+ }
+ } else {
+ my $snake=$mode eq 'SNAKE';
+ while(<STDIN>) {
+ info("IN:",$_);
+ lineto($W1,$_);
+ last unless defined ($_=<$R1>);
+ info("IN|1:",$_);
+ lineto($W2,$_);
+ last unless defined ($_=<$R2>);
+ info("IN|1|2:",$_);
+ if ($snake) {
+ lineto($W1,$_);
+ last unless defined ($_=<$R1>);
+ info("IN|1|2|1:",$_);
+ }
+ lineto(*STDOUT,$_);
+ }
+ }
+} else {
+ info("DIRECT mode\n");
+ my @rw1=POSIX::pipe();
+ my @rw2=POSIX::pipe();
+ my $pid=undef;
+ $SIG{CHLD} = sub { wait };
+ while (not defined ($pid=fork())) {
+ sleep 1;
+ }
+ my $pipe = $mode eq 'PIPE';
+ unless ($pipe) {
+ POSIX::close(STDOUT_FILENO);
+ POSIX::close(STDIN_FILENO);
+ }
+ if ($pid) {
+ POSIX::dup2($rw1[1],STDOUT_FILENO);
+ POSIX::dup2($rw2[0],STDIN_FILENO) unless $pipe;
+ exec @c1;
+ } else {
+ POSIX::dup2($rw2[1],STDOUT_FILENO) unless $pipe;
+ POSIX::dup2($rw1[0],STDIN_FILENO);
+ exec @c2;
+ }
+ while (wait()!=-1) {}
+}
diff --git a/training/dpmert/line_optimizer.cc b/training/dpmert/line_optimizer.cc
new file mode 100644
index 00000000..9cf33502
--- /dev/null
+++ b/training/dpmert/line_optimizer.cc
@@ -0,0 +1,114 @@
+#include "line_optimizer.h"
+
+#include <limits>
+#include <algorithm>
+
+#include "sparse_vector.h"
+#include "ns.h"
+
+using namespace std;
+
+typedef ErrorSurface::const_iterator ErrorIter;
+
+// sort by increasing x-ints
+struct IntervalComp {
+ bool operator() (const ErrorIter& a, const ErrorIter& b) const {
+ return a->x < b->x;
+ }
+};
+
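+// Merge all per-sentence error surfaces, sweep the interval boundaries in
+// increasing x while accumulating each segment's delta, score the metric
+// once per distinct interval, and return (the middle of) the best one.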
+double LineOptimizer::LineOptimize(
+ const EvaluationMetric* metric,
+ const vector<ErrorSurface>& surfaces,
+ const LineOptimizer::ScoreType type,
+ float* best_score,
+ const double epsilon) {
+ // cerr << "MIN=" << MINIMIZE_SCORE << " MAX=" << MAXIMIZE_SCORE << " MINE=" << type << endl;
+ vector<ErrorIter> all_ints;
+ for (vector<ErrorSurface>::const_iterator i = surfaces.begin();
+ i != surfaces.end(); ++i) {
+ const ErrorSurface& surface = *i;
+ for (ErrorIter j = surface.begin(); j != surface.end(); ++j)
+ all_ints.push_back(j);
+ }
+ sort(all_ints.begin(), all_ints.end(), IntervalComp());
+ double last_boundary = all_ints.front()->x;
+ SufficientStats acc;
+ float& cur_best_score = *best_score;
+ cur_best_score = (type == MAXIMIZE_SCORE ?
+ -numeric_limits<float>::max() : numeric_limits<float>::max());
+ bool left_edge = true;
+ double pos = numeric_limits<double>::quiet_NaN();
+ for (vector<ErrorIter>::iterator i = all_ints.begin();
+ i != all_ints.end(); ++i) {
+ const ErrorSegment& seg = **i;
+ if (seg.x - last_boundary > epsilon) {
+ float sco = metric->ComputeScore(acc);
+ if ((type == MAXIMIZE_SCORE && sco > cur_best_score) ||
+ (type == MINIMIZE_SCORE && sco < cur_best_score) ) {
+ cur_best_score = sco;
+ if (left_edge) {
+ pos = seg.x - 0.1;
+ left_edge = false;
+ } else {
+ pos = last_boundary + (seg.x - last_boundary) / 2;
+ }
+ //cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n";
+ }
+ // string xx = metric->DetailedScore(acc); cerr << "---- " << xx;
+#undef SHOW_ERROR_SURFACES
+#ifdef SHOW_ERROR_SURFACES
+ cerr << "x=" << seg.x << "\ts=" << sco << "\n";
+#endif
+ last_boundary = seg.x;
+ }
+ // cerr << "x-boundary=" << seg.x << "\n";
+ //string x2; acc.Encode(&x2); cerr << " ACC: " << x2 << endl;
+ //string x1; seg.delta.Encode(&x1); cerr << " DELTA: " << x1 << endl;
+ acc += seg.delta;
+ }
+ float sco = metric->ComputeScore(acc);
+ if ((type == MAXIMIZE_SCORE && sco > cur_best_score) ||
+ (type == MINIMIZE_SCORE && sco < cur_best_score) ) {
+ cur_best_score = sco;
+ if (left_edge) {
+ pos = 0;
+ } else {
+ pos = last_boundary + 1000.0;
+ }
+ }
+ return pos;
+}
+
+void LineOptimizer::RandomUnitVector(const vector<int>& features_to_optimize,
+ SparseVector<double>* axis,
+ RandomNumberGenerator<boost::mt19937>* rng) {
+ axis->clear();
+ for (int i = 0; i < features_to_optimize.size(); ++i)
+ axis->set_value(features_to_optimize[i], rng->NextNormal(0.0,1.0));
+ (*axis) /= axis->l2norm();
+}
+
+void LineOptimizer::CreateOptimizationDirections(
+ const vector<int>& features_to_optimize,
+ int additional_random_directions,
+ RandomNumberGenerator<boost::mt19937>* rng,
+    vector<SparseVector<double> >* dirs,
+    bool include_orthogonal) {
+ dirs->clear();
+ typedef SparseVector<double> Dir;
+ vector<Dir> &out=*dirs;
+ int i=0;
+ if (include_orthogonal)
+ for (;i<features_to_optimize.size();++i) {
+ Dir d;
+ d.set_value(features_to_optimize[i],1.);
+ out.push_back(d);
+ }
+ out.resize(i+additional_random_directions);
+ for (;i<out.size();++i)
+ RandomUnitVector(features_to_optimize, &out[i], rng);
+ cerr << "Generated " << out.size() << " total axes to optimize along.\n";
+}
+
diff --git a/training/dpmert/line_optimizer.h b/training/dpmert/line_optimizer.h
new file mode 100644
index 00000000..83819f41
--- /dev/null
+++ b/training/dpmert/line_optimizer.h
@@ -0,0 +1,48 @@
+#ifndef LINE_OPTIMIZER_H_
+#define LINE_OPTIMIZER_H_
+
+#include <vector>
+
+#include "sparse_vector.h"
+#include "error_surface.h"
+#include "sampler.h"
+
+class EvaluationMetric;
+class Weights;
+
+struct LineOptimizer {
+
+ // use MINIMIZE_SCORE for things like TER, WER
+ // MAXIMIZE_SCORE for things like BLEU
+ enum ScoreType { MAXIMIZE_SCORE, MINIMIZE_SCORE };
+
+ // merge all the error surfaces together into a global
+ // error surface and find (the middle of) the best segment
+ static double LineOptimize(
+ const EvaluationMetric* metric,
+ const std::vector<ErrorSurface>& envs,
+ const LineOptimizer::ScoreType type,
+ float* best_score,
+ const double epsilon = 1.0/65536.0);
+
+ // return a random vector of length 1 where all dimensions
+ // not listed in dimensions will be 0.
+ static void RandomUnitVector(const std::vector<int>& dimensions,
+ SparseVector<double>* axis,
+ RandomNumberGenerator<boost::mt19937>* rng);
+
+ // generate a list of directions to optimize; the list will
+ // contain the orthogonal vectors corresponding to the dimensions in
+ // primary and then additional_random_directions directions in those
+ // dimensions as well. All vectors will be length 1.
+ static void CreateOptimizationDirections(
+ const std::vector<int>& primary,
+ int additional_random_directions,
+ RandomNumberGenerator<boost::mt19937>* rng,
+      std::vector<SparseVector<double> >* dirs,
+      bool include_primary = true);
+
+};
+
+#endif
diff --git a/training/dpmert/lo_test.cc b/training/dpmert/lo_test.cc
new file mode 100644
index 00000000..d89bcd99
--- /dev/null
+++ b/training/dpmert/lo_test.cc
@@ -0,0 +1,229 @@
+#define BOOST_TEST_MODULE LineOptimizerTest
+#include <boost/test/unit_test.hpp>
+#include <boost/test/floating_point_comparison.hpp>
+
+#include <cmath>
+#include <iostream>
+#include <fstream>
+
+#include <boost/shared_ptr.hpp>
+
+#include "ns.h"
+#include "ns_docscorer.h"
+#include "ces.h"
+#include "fdict.h"
+#include "hg.h"
+#include "kbest.h"
+#include "hg_io.h"
+#include "filelib.h"
+#include "inside_outside.h"
+#include "viterbi.h"
+#include "mert_geometry.h"
+#include "line_optimizer.h"
+
+using namespace std;
+
+const char* ref11 = "australia reopens embassy in manila";
+const char* ref12 = "( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack .";
+const char* ref21 = "australia reopened manila embassy";
+const char* ref22 = "( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack .";
+const char* ref31 = "australia to reopen embassy in manila";
+const char* ref32 = "( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so - called confirmed terrorist attack threats .";
+const char* ref41 = "australia to re - open its embassy to manila";
+const char* ref42 = "( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so - called \" clear \" threat of terrorist attack 7 weeks ago .";
+
+BOOST_AUTO_TEST_CASE( TestCheckNaN) {
+ double x = 0;
+ double y = 0;
+ double z = x / y;
+ BOOST_CHECK_EQUAL(true, std::isnan(z));
+}
+
+BOOST_AUTO_TEST_CASE(TestConvexHull) {
+ boost::shared_ptr<MERTPoint> a1(new MERTPoint(-1, 0));
+ boost::shared_ptr<MERTPoint> b1(new MERTPoint(1, 0));
+ boost::shared_ptr<MERTPoint> a2(new MERTPoint(-1, 1));
+ boost::shared_ptr<MERTPoint> b2(new MERTPoint(1, -1));
+ vector<boost::shared_ptr<MERTPoint> > sa; sa.push_back(a1); sa.push_back(b1);
+ vector<boost::shared_ptr<MERTPoint> > sb; sb.push_back(a2); sb.push_back(b2);
+ ConvexHull a(sa);
+ cerr << a << endl;
+ ConvexHull b(sb);
+ ConvexHull c = a;
+ c *= b;
+ cerr << a << " (*) " << b << " = " << c << endl;
+ BOOST_CHECK_EQUAL(3, c.size());
+}
+
+BOOST_AUTO_TEST_CASE(TestConvexHullInside) {
+ const string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}";
+ Hypergraph hg;
+ istringstream instr(json);
+ HypergraphIO::ReadFromJSON(&instr, &hg);
+ SparseVector<double> wts;
+ wts.set_value(FD::Convert("f1"), 0.4);
+ wts.set_value(FD::Convert("f2"), 1.0);
+ hg.Reweight(wts);
+ vector<pair<vector<WordID>, prob_t> > list;
+ std::vector<SparseVector<double> > features;
+ KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10);
+ for (int i = 0; i < 10; ++i) {
+ const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+ kbest.LazyKthBest(hg.nodes_.size() - 1, i);
+ if (!d) break;
+ cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl;
+ }
+ SparseVector<double> dir; dir.set_value(FD::Convert("f1"), 1.0);
+ ConvexHullWeightFunction wf(wts, dir);
+ ConvexHull env = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf);
+ cerr << env << endl;
+ const vector<boost::shared_ptr<MERTPoint> >& segs = env.GetSortedSegs();
+ dir *= segs[1]->x;
+ wts += dir;
+ hg.Reweight(wts);
+ KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest2(hg, 10);
+ for (int i = 0; i < 10; ++i) {
+ const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+ kbest2.LazyKthBest(hg.nodes_.size() - 1, i);
+ if (!d) break;
+ cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl;
+ }
+ for (unsigned i = 0; i < segs.size(); ++i) {
+ cerr << "seg=" << i << endl;
+ vector<WordID> trans;
+ segs[i]->ConstructTranslation(&trans);
+ cerr << TD::GetString(trans) << endl;
+ }
+}
+
+BOOST_AUTO_TEST_CASE( TestS1) {
+ int fPhraseModel_0 = FD::Convert("PhraseModel_0");
+ int fPhraseModel_1 = FD::Convert("PhraseModel_1");
+ int fPhraseModel_2 = FD::Convert("PhraseModel_2");
+ int fLanguageModel = FD::Convert("LanguageModel");
+ int fWordPenalty = FD::Convert("WordPenalty");
+ int fPassThrough = FD::Convert("PassThrough");
+ SparseVector<double> wts;
+ wts.set_value(fWordPenalty, 4.25);
+ wts.set_value(fLanguageModel, -1.1165);
+ wts.set_value(fPhraseModel_0, -0.96);
+ wts.set_value(fPhraseModel_1, -0.65);
+ wts.set_value(fPhraseModel_2, -0.77);
+ wts.set_value(fPassThrough, -10.0);
+
+ vector<int> to_optimize;
+ to_optimize.push_back(fWordPenalty);
+ to_optimize.push_back(fLanguageModel);
+ to_optimize.push_back(fPhraseModel_0);
+ to_optimize.push_back(fPhraseModel_1);
+ to_optimize.push_back(fPhraseModel_2);
+
+ std::string path(boost::unit_test::framework::master_test_suite().argc == 2 ? boost::unit_test::framework::master_test_suite().argv[1] : TEST_DATA);
+
+ Hypergraph hg;
+ ReadFile rf(path + "/0.json.gz");
+ HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+ hg.Reweight(wts);
+
+ Hypergraph hg2;
+ ReadFile rf2(path + "/1.json.gz");
+ HypergraphIO::ReadFromJSON(rf2.stream(), &hg2);
+ hg2.Reweight(wts);
+
+ vector<vector<WordID> > refs1(4);
+ TD::ConvertSentence(ref11, &refs1[0]);
+ TD::ConvertSentence(ref21, &refs1[1]);
+ TD::ConvertSentence(ref31, &refs1[2]);
+ TD::ConvertSentence(ref41, &refs1[3]);
+ vector<vector<WordID> > refs2(4);
+ TD::ConvertSentence(ref12, &refs2[0]);
+ TD::ConvertSentence(ref22, &refs2[1]);
+ TD::ConvertSentence(ref32, &refs2[2]);
+ TD::ConvertSentence(ref42, &refs2[3]);
+ vector<ConvexHull> envs(2);
+
+ RandomNumberGenerator<boost::mt19937> rng;
+
+ vector<SparseVector<double> > axes; // directions to search
+ LineOptimizer::CreateOptimizationDirections(
+ to_optimize,
+ 10,
+ &rng,
+ &axes);
+ assert(axes.size() == 10 + to_optimize.size());
+ for (unsigned i = 0; i < axes.size(); ++i)
+ cerr << axes[i] << endl;
+ const SparseVector<double>& axis = axes[0];
+
+ cerr << "Computing Viterbi envelope using inside algorithm...\n";
+ cerr << "axis: " << axis << endl;
+ clock_t t_start=clock();
+ ConvexHullWeightFunction wf(wts, axis); // wts = starting point, axis = search direction
+ envs[0] = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf);
+ envs[1] = Inside<ConvexHull, ConvexHullWeightFunction>(hg2, NULL, wf);
+
+ vector<ErrorSurface> es(2);
+ EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU");
+ boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(refs1);
+ boost::shared_ptr<SegmentEvaluator> scorer2 = metric->CreateSegmentEvaluator(refs2);
+ ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg);
+ ComputeErrorSurface(*scorer2, envs[1], &es[1], metric, hg2);
+ cerr << envs[0].size() << " " << envs[1].size() << endl;
+ cerr << es[0].size() << " " << es[1].size() << endl;
+ envs.clear();
+ clock_t t_env=clock();
+ float score;
+  double m = LineOptimizer::LineOptimize(metric, es, LineOptimizer::MAXIMIZE_SCORE, &score);
+ clock_t t_opt=clock();
+ cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n";
+ BOOST_CHECK_CLOSE(0.48719698, score, 1e-5);
+ SparseVector<double> res = axis;
+ res *= m;
+ res += wts;
+ cerr << "res: " << res << endl;
+ cerr << "ENVELOPE PROCESSING=" << (static_cast<double>(t_env - t_start) / 1000.0) << endl;
+ cerr << " LINE OPTIMIZATION=" << (static_cast<double>(t_opt - t_env) / 1000.0) << endl;
+ hg.Reweight(res);
+ hg2.Reweight(res);
+ vector<WordID> t1,t2;
+ ViterbiESentence(hg, &t1);
+ ViterbiESentence(hg2, &t2);
+ cerr << TD::GetString(t1) << endl;
+ cerr << TD::GetString(t2) << endl;
+}
+
+BOOST_AUTO_TEST_CASE(TestZeroOrigin) {
+ const string json = "{\"rules\":[1,\"[X7] ||| blA ||| without ||| LHSProb=3.92173 LexE2F=2.90799 LexF2E=1.85003 GenerativeProb=10.5381 RulePenalty=1 XFE=2.77259 XEF=0.441833 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=0.693147\",2,\"[X7] ||| blA ||| except ||| LHSProb=4.92173 LexE2F=3.90799 LexF2E=1.85003 GenerativeProb=11.5381 RulePenalty=1 XFE=2.77259 XEF=1.44183 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=1.69315\",3,\"[S] ||| [X7,1] ||| [1] ||| GlueTop=1\",4,\"[X28] ||| EnwAn ||| title ||| LHSProb=3.96802 LexE2F=2.22462 LexF2E=1.83258 GenerativeProb=10.0863 RulePenalty=1 XFE=0 XEF=1.20397 LabelledEF=1.20397 LabelledFE=-1.98341e-08 LogRuleCount=1.09861\",5,\"[X0] ||| EnwAn ||| funny ||| LHSProb=3.98479 LexE2F=1.79176 LexF2E=3.21888 GenerativeProb=11.1681 RulePenalty=1 XFE=0 XEF=2.30259 LabelledEF=2.30259 LabelledFE=0 LogRuleCount=0 SingletonRule=1\",6,\"[X8] ||| [X7,1] EnwAn ||| entitled [1] ||| LHSProb=3.82533 LexE2F=3.21888 LexF2E=2.52573 GenerativeProb=11.3276 RulePenalty=1 XFE=1.20397 XEF=1.20397 LabelledEF=2.30259 LabelledFE=2.30259 LogRuleCount=0 SingletonRule=1\",7,\"[S] ||| [S,1] [X28,2] ||| [1] [2] ||| Glue=1\",8,\"[S] ||| [S,1] [X0,2] ||| [1] [2] ||| Glue=1\",9,\"[S] ||| [X8,1] ||| [1] ||| GlueTop=1\",10,\"[Goal] ||| [S,1] ||| [1]\"],\"features\":[\"PassThrough\",\"Glue\",\"GlueTop\",\"LanguageModel\",\"WordPenalty\",\"LHSProb\",\"LexE2F\",\"LexF2E\",\"GenerativeProb\",\"RulePenalty\",\"XFE\",\"XEF\",\"LabelledEF\",\"LabelledFE\",\"LogRuleCount\",\"SingletonRule\"],\"edges\":[{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,3.92173,6,2.90799,7,1.85003,8,10.5381,9,1,10,2.77259,11,0.441833,12,2.63906,13,4.96981,14,0.693147],\"rule\":1},{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,4.92173,6,3.90799,7,1.85003,8,11.5381,9,1,10,2.77259,11,1.44183,12,2.63906,13,4.96981,14,1.69315],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X7\"},\"edges\":[{\"tail\":[0],\"spans\":[0,1,-1,-1],\"feats\":[2,1],\"rule\":3}],\"node\":{\"in_edges\":[2],\"cat\":\"S\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.96802,6,2.22462,7,1.83258,8,10.0863,9,1,11,1.20397,12,1.20397,13,-1.98341e-08,14,1.09861],\"rule\":4}],\"node\":{\"in_edges\":[3],\"cat\":\"X28\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.98479,6,1.79176,7,3.21888,8,11.1681,9,1,11,2.30259,12,2.30259,15,1],\"rule\":5}],\"node\":{\"in_edges\":[4],\"cat\":\"X0\"},\"edges\":[{\"tail\":[0],\"spans\":[0,2,-1,-1],\"feats\":[5,3.82533,6,3.21888,7,2.52573,8,11.3276,9,1,10,1.20397,11,1.20397,12,2.30259,13,2.30259,15,1],\"rule\":6}],\"node\":{\"in_edges\":[5],\"cat\":\"X8\"},\"edges\":[{\"tail\":[1,2],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":7},{\"tail\":[1,3],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":8},{\"tail\":[4],\"spans\":[0,2,-1,-1],\"feats\":[2,1],\"rule\":9}],\"node\":{\"in_edges\":[6,7,8],\"cat\":\"S\"},\"edges\":[{\"tail\":[5],\"spans\":[0,2,-1,-1],\"feats\":[],\"rule\":10}],\"node\":{\"in_edges\":[9],\"cat\":\"Goal\"}}";
+ Hypergraph hg;
+ istringstream instr(json);
+ HypergraphIO::ReadFromJSON(&instr, &hg);
+ SparseVector<double> wts;
+ wts.set_value(FD::Convert("PassThrough"), -0.929201533002898);
+ hg.Reweight(wts);
+
+ vector<pair<vector<WordID>, prob_t> > list;
+ std::vector<SparseVector<double> > features;
+ KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, 10);
+ for (int i = 0; i < 10; ++i) {
+ const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+ kbest.LazyKthBest(hg.nodes_.size() - 1, i);
+ if (!d) break;
+ cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl;
+ }
+
+ SparseVector<double> axis; axis.set_value(FD::Convert("Glue"),1.0);
+ ConvexHullWeightFunction wf(wts, axis); // wts = starting point, axis = search direction
+ vector<ConvexHull> envs(1);
+ envs[0] = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf);
+
+ vector<vector<WordID> > mr(4);
+ TD::ConvertSentence("untitled", &mr[0]);
+ TD::ConvertSentence("with no title", &mr[1]);
+ TD::ConvertSentence("without a title", &mr[2]);
+ TD::ConvertSentence("without title", &mr[3]);
+ EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU");
+ boost::shared_ptr<SegmentEvaluator> scorer1 = metric->CreateSegmentEvaluator(mr);
+ vector<ErrorSurface> es(1);
+ ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg);
+}
+
diff --git a/training/dpmert/mert_geometry.cc b/training/dpmert/mert_geometry.cc
new file mode 100644
index 00000000..d6973658
--- /dev/null
+++ b/training/dpmert/mert_geometry.cc
@@ -0,0 +1,185 @@
+#include "mert_geometry.h"
+
+#include <cassert>
+#include <limits>
+
+using namespace std;
+
+ConvexHull::ConvexHull(int i) {
+ if (i == 0) {
+    // semiring 0: the empty envelope, printed as <>
+ } else if (i == 1) {
+ points.push_back(boost::shared_ptr<MERTPoint>(new MERTPoint(0, 0, 0, boost::shared_ptr<MERTPoint>(), boost::shared_ptr<MERTPoint>())));
+ assert(this->IsMultiplicativeIdentity());
+ } else {
+ cerr << "Only can create ConvexHull semiring 0 and 1 with this constructor!\n";
+ abort();
+ }
+}
+
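+// Project an edge onto the search line: as the weights move from 'origin'
+// along 'direction' by x, this edge's score is the line m*x + b.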
+const ConvexHull ConvexHullWeightFunction::operator()(const Hypergraph::Edge& e) const {
+ const double m = direction.dot(e.feature_values_);
+ const double b = origin.dot(e.feature_values_);
+ MERTPoint* point = new MERTPoint(m, b, e);
+ return ConvexHull(1, point);
+}
+
+ostream& operator<<(ostream& os, const ConvexHull& env) {
+ os << '<';
+ const vector<boost::shared_ptr<MERTPoint> >& points = env.GetSortedSegs();
+ for (int i = 0; i < points.size(); ++i)
+ os << (i==0 ? "" : "|") << "x=" << points[i]->x << ",b=" << points[i]->b << ",m=" << points[i]->m << ",p1=" << points[i]->p1 << ",p2=" << points[i]->p2;
+ return os << '>';
+}
+
+#define ORIGINAL_MERT_IMPLEMENTATION 1
+#ifdef ORIGINAL_MERT_IMPLEMENTATION
+
+struct SlopeCompare {
+ bool operator() (const boost::shared_ptr<MERTPoint>& a, const boost::shared_ptr<MERTPoint>& b) const {
+ return a->m < b->m;
+ }
+};
+
+const ConvexHull& ConvexHull::operator+=(const ConvexHull& other) {
+ if (!other.is_sorted) other.Sort();
+ if (points.empty()) {
+ points = other.points;
+ return *this;
+ }
+ is_sorted = false;
+ int j = points.size();
+ points.resize(points.size() + other.points.size());
+ for (int i = 0; i < other.points.size(); ++i)
+ points[j++] = other.points[i];
+ assert(j == points.size());
+ return *this;
+}
+
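+// Sort the lines by slope, then sweep left to right, keeping only the
+// segments that lie on the upper envelope and their x-intersections.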
+void ConvexHull::Sort() const {
+ sort(points.begin(), points.end(), SlopeCompare());
+ const int k = points.size();
+ int j = 0;
+ for (int i = 0; i < k; ++i) {
+ MERTPoint l = *points[i];
+ l.x = kMinusInfinity;
+ // cerr << "m=" << l.m << endl;
+ if (0 < j) {
+ if (points[j-1]->m == l.m) { // lines are parallel
+ if (l.b <= points[j-1]->b) continue;
+ --j;
+ }
+ while(0 < j) {
+ l.x = (l.b - points[j-1]->b) / (points[j-1]->m - l.m);
+ if (points[j-1]->x < l.x) break;
+ --j;
+ }
+ if (0 == j) l.x = kMinusInfinity;
+ }
+ *points[j++] = l;
+ }
+ points.resize(j);
+ is_sorted = true;
+}
+
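+// Semiring product: the pointwise sum of two envelopes. A single-edge
+// envelope just shifts the other one; otherwise the two sorted segment
+// lists are merged, adding slopes and intercepts.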
+const ConvexHull& ConvexHull::operator*=(const ConvexHull& other) {
+ if (other.IsMultiplicativeIdentity()) { return *this; }
+ if (this->IsMultiplicativeIdentity()) { (*this) = other; return *this; }
+
+ if (!is_sorted) Sort();
+ if (!other.is_sorted) other.Sort();
+
+ if (this->IsEdgeEnvelope()) {
+// if (other.size() > 1)
+// cerr << *this << " (TIMES) " << other << endl;
+ boost::shared_ptr<MERTPoint> edge_parent = points[0];
+ const double& edge_b = edge_parent->b;
+ const double& edge_m = edge_parent->m;
+ points.clear();
+ for (int i = 0; i < other.points.size(); ++i) {
+ const MERTPoint& p = *other.points[i];
+ const double m = p.m + edge_m;
+ const double b = p.b + edge_b;
+ const double& x = p.x; // x's don't change with *
+ points.push_back(boost::shared_ptr<MERTPoint>(new MERTPoint(x, m, b, edge_parent, other.points[i])));
+ assert(points.back()->p1->edge);
+ }
+// if (other.size() > 1)
+// cerr << " = " << *this << endl;
+ } else {
+ vector<boost::shared_ptr<MERTPoint> > new_points;
+ int this_i = 0;
+ int other_i = 0;
+ const int this_size = points.size();
+ const int other_size = other.points.size();
+ double cur_x = kMinusInfinity; // moves from left to right across the
+ // real numbers, stopping for all inter-
+ // sections
+ double this_next_val = (1 < this_size ? points[1]->x : kPlusInfinity);
+ double other_next_val = (1 < other_size ? other.points[1]->x : kPlusInfinity);
+ while (this_i < this_size && other_i < other_size) {
+ const MERTPoint& this_point = *points[this_i];
+ const MERTPoint& other_point= *other.points[other_i];
+ const double m = this_point.m + other_point.m;
+ const double b = this_point.b + other_point.b;
+
+ new_points.push_back(boost::shared_ptr<MERTPoint>(new MERTPoint(cur_x, m, b, points[this_i], other.points[other_i])));
+ int comp = 0;
+ if (this_next_val < other_next_val) comp = -1; else
+ if (this_next_val > other_next_val) comp = 1;
+ if (0 == comp) { // the next values are equal, advance both indices
+ ++this_i;
+ ++other_i;
+ cur_x = this_next_val; // could be other_next_val (they're equal!)
+ this_next_val = (this_i+1 < this_size ? points[this_i+1]->x : kPlusInfinity);
+ other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity);
+ } else { // advance the i with the lower x, update cur_x
+ if (-1 == comp) {
+ ++this_i;
+ cur_x = this_next_val;
+ this_next_val = (this_i+1 < this_size ? points[this_i+1]->x : kPlusInfinity);
+ } else {
+ ++other_i;
+ cur_x = other_next_val;
+ other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity);
+ }
+ }
+ }
+ points.swap(new_points);
+ }
+ //cerr << "Multiply: result=" << (*this) << endl;
+ return *this;
+}
+
+// recursively construct translation
+void MERTPoint::ConstructTranslation(vector<WordID>* trans) const {
+ const MERTPoint* cur = this;
+ vector<vector<WordID> > ant_trans;
+ while(!cur->edge) {
+ ant_trans.resize(ant_trans.size() + 1);
+ cur->p2->ConstructTranslation(&ant_trans.back());
+ cur = cur->p1.get();
+ }
+ size_t ant_size = ant_trans.size();
+ vector<const vector<WordID>*> pants(ant_size);
+ assert(ant_size == cur->edge->tail_nodes_.size());
+ --ant_size;
+ for (int i = 0; i < pants.size(); ++i) pants[ant_size - i] = &ant_trans[i];
+ cur->edge->rule_->ESubstitute(pants, trans);
+}
+
+void MERTPoint::CollectEdgesUsed(std::vector<bool>* edges_used) const {
+ if (edge) {
+ assert(edge->id_ < edges_used->size());
+ (*edges_used)[edge->id_] = true;
+ }
+ if (p1) p1->CollectEdgesUsed(edges_used);
+ if (p2) p2->CollectEdgesUsed(edges_used);
+}
+
+#else
+
+// THIS IS THE NEW FASTER IMPLEMENTATION OF THE MERT SEMIRING OPERATIONS
+
+#endif
+
diff --git a/training/dpmert/mert_geometry.h b/training/dpmert/mert_geometry.h
new file mode 100644
index 00000000..a8b6959e
--- /dev/null
+++ b/training/dpmert/mert_geometry.h
@@ -0,0 +1,81 @@
+#ifndef _MERT_GEOMETRY_H_
+#define _MERT_GEOMETRY_H_
+
+#include <vector>
+#include <iostream>
+#include <boost/shared_ptr.hpp>
+
+#include "hg.h"
+#include "sparse_vector.h"
+
+static const double kMinusInfinity = -std::numeric_limits<double>::infinity();
+static const double kPlusInfinity = std::numeric_limits<double>::infinity();
+
+struct MERTPoint {
+ MERTPoint() : x(), m(), b(), edge() {}
+ MERTPoint(double _m, double _b) :
+ x(kMinusInfinity), m(_m), b(_b), edge() {}
+ MERTPoint(double _x, double _m, double _b, const boost::shared_ptr<MERTPoint>& p1_, const boost::shared_ptr<MERTPoint>& p2_) :
+ x(_x), m(_m), b(_b), p1(p1_), p2(p2_), edge() {}
+ MERTPoint(double _m, double _b, const Hypergraph::Edge& edge) :
+ x(kMinusInfinity), m(_m), b(_b), edge(&edge) {}
+
+ double x; // x intersection with previous segment in env, or -inf if none
+ double m; // this line's slope
+ double b; // intercept with y-axis
+
+ // we keep a pointer to the "parents" of this segment so we can reconstruct
+ // the Viterbi translation corresponding to this segment
+ boost::shared_ptr<MERTPoint> p1;
+ boost::shared_ptr<MERTPoint> p2;
+
+ // only MERTPoints created from an edge using the ConvexHullWeightFunction
+ // have rules
+ // TRulePtr rule;
+ const Hypergraph::Edge* edge;
+
+ // recursively recover the Viterbi translation that will result from setting
+ // the weights to origin + axis * x, where x is any value from this->x up
+ // until the next largest x in the containing ConvexHull
+ void ConstructTranslation(std::vector<WordID>* trans) const;
+ void CollectEdgesUsed(std::vector<bool>* edges_used) const;
+};
+
+// this is the semiring value type,
+// it defines constructors for 0, 1, and the operations + and *
+struct ConvexHull {
+ // create semiring zero
+ ConvexHull() : is_sorted(true) {} // zero
+ // for debugging:
+ ConvexHull(const std::vector<boost::shared_ptr<MERTPoint> >& s) : points(s) { Sort(); }
+ // create semiring 1 or 0
+ explicit ConvexHull(int i);
+ ConvexHull(int n, MERTPoint* point) : is_sorted(true), points(n, boost::shared_ptr<MERTPoint>(point)) {}
+ const ConvexHull& operator+=(const ConvexHull& other);
+ const ConvexHull& operator*=(const ConvexHull& other);
+ bool IsMultiplicativeIdentity() const {
+ return size() == 1 && (points[0]->b == 0.0 && points[0]->m == 0.0) && (!points[0]->edge) && (!points[0]->p1) && (!points[0]->p2); }
+ const std::vector<boost::shared_ptr<MERTPoint> >& GetSortedSegs() const {
+ if (!is_sorted) Sort();
+ return points;
+ }
+ size_t size() const { return points.size(); }
+
+ private:
+ bool IsEdgeEnvelope() const {
+ return points.size() == 1 && points[0]->edge; }
+ void Sort() const;
+ mutable bool is_sorted;
+ mutable std::vector<boost::shared_ptr<MERTPoint> > points;
+};
+std::ostream& operator<<(std::ostream& os, const ConvexHull& env);
+
+struct ConvexHullWeightFunction {
+ ConvexHullWeightFunction(const SparseVector<double>& ori,
+ const SparseVector<double>& dir) : origin(ori), direction(dir) {}
+ const ConvexHull operator()(const Hypergraph::Edge& e) const;
+ const SparseVector<double> origin;
+ const SparseVector<double> direction;
+};
+
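+// Typical use (as exercised in the dpmert unit tests): a single inside
+// pass sweeps the line wts + x*dir over all derivations of a hypergraph:
+//   ConvexHullWeightFunction wf(wts, dir);
+//   ConvexHull env = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf);
+//   env.GetSortedSegs()[k]->ConstructTranslation(&trans);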
+#endif
diff --git a/training/dpmert/mr_dpmert_generate_mapper_input.cc b/training/dpmert/mr_dpmert_generate_mapper_input.cc
new file mode 100644
index 00000000..199cd23a
--- /dev/null
+++ b/training/dpmert/mr_dpmert_generate_mapper_input.cc
@@ -0,0 +1,81 @@
+#include <iostream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "filelib.h"
+#include "weights.h"
+#include "line_optimizer.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("dev_set_size,s",po::value<unsigned>(),"[REQD] Development set size (# of parallel sentences)")
+ ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository")
+ ("weights,w",po::value<string>(),"[REQD] Current feature weights file")
+ ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)")
+ ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in")
+ ("help,h", "Help");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ bool flag = false;
+ if (conf->count("dev_set_size") == 0) {
+ cerr << "Please specify the size of the development set using -d N\n";
+ flag = true;
+ }
+ if (conf->count("weights") == 0) {
+ cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n";
+ flag = true;
+ }
+ if (conf->count("forest_repository") == 0) {
+ cerr << "Please specify the forest repository location using -r <DIR>\n";
+ flag = true;
+ }
+ if (flag || conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+int main(int argc, char** argv) {
+ RandomNumberGenerator<boost::mt19937> rng;
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ vector<string> features;
+ SparseVector<weight_t> origin;
+ vector<weight_t> w;
+ Weights::InitFromFile(conf["weights"].as<string>(), &w, &features);
+ Weights::InitSparseVector(w, &origin);
+ const string forest_repository = conf["forest_repository"].as<string>();
+ if (!DirectoryExists(forest_repository)) {
+ cerr << "Forest repository directory " << forest_repository << " not found!\n";
+ return 1;
+ }
+ if (conf.count("optimize_feature") > 0)
+ features=conf["optimize_feature"].as<vector<string> >();
+ vector<SparseVector<weight_t> > directions;
+ vector<int> fids(features.size());
+ for (unsigned i = 0; i < features.size(); ++i)
+ fids[i] = FD::Convert(features[i]);
+ LineOptimizer::CreateOptimizationDirections(
+ fids,
+ conf["random_directions"].as<unsigned int>(),
+ &rng,
+ &directions);
+ unsigned dev_set_size = conf["dev_set_size"].as<unsigned>();
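+  // emit one mapper input line per (sentence, direction):
+  //   <forest_repository>/<i>.json.gz <i> <origin as f=v;f=v> <direction as f=v;f=v>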
+ for (unsigned i = 0; i < dev_set_size; ++i) {
+ for (unsigned j = 0; j < directions.size(); ++j) {
+ cout << forest_repository << '/' << i << ".json.gz " << i << ' ';
+ print(cout, origin, "=", ";");
+ cout << ' ';
+ print(cout, directions[j], "=", ";");
+ cout << endl;
+ }
+ }
+ return 0;
+}
diff --git a/training/dpmert/mr_dpmert_map.cc b/training/dpmert/mr_dpmert_map.cc
new file mode 100644
index 00000000..d1efcf96
--- /dev/null
+++ b/training/dpmert/mr_dpmert_map.cc
@@ -0,0 +1,112 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "ns.h"
+#include "ns_docscorer.h"
+#include "ces.h"
+#include "filelib.h"
+#include "stringlib.h"
+#include "sparse_vector.h"
+#include "mert_geometry.h"
+#include "inside_outside.h"
+#include "error_surface.h"
+#include "b64tools.h"
+#include "hg_io.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
+ ("source,s",po::value<string>(), "Source file (ignored, except for AER)")
+ ("evaluation_metric,m",po::value<string>()->default_value("ibm_bleu"), "Evaluation metric being optimized")
+ ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
+ ("help,h", "Help");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ bool flag = false;
+ if (!conf->count("reference")) {
+ cerr << "Please specify one or more references using -r <REF.TXT>\n";
+ flag = true;
+ }
+ if (flag || conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+bool ReadSparseVectorString(const string& s, SparseVector<double>* v) {
+#if 0
+ // this should work, but untested.
+ std::istringstream i(s);
+ i>>*v;
+#else
+ vector<string> fields;
+ Tokenize(s, ';', &fields);
+ if (fields.empty()) return false;
+ for (unsigned i = 0; i < fields.size(); ++i) {
+ vector<string> pair(2);
+ Tokenize(fields[i], '=', &pair);
+ if (pair.size() != 2) {
+ cerr << "Error parsing vector string: " << fields[i] << endl;
+ return false;
+ }
+ v->set_value(FD::Convert(pair[0]), atof(pair[1].c_str()));
+ }
+ return true;
+#endif
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ const string evaluation_metric = conf["evaluation_metric"].as<string>();
+ EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric);
+ DocumentScorer ds(metric, conf["reference"].as<vector<string> >());
+ cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl;
+ Hypergraph hg;
+ string last_file;
+ ReadFile in_read(conf["input"].as<string>());
+ istream &in=*in_read.stream();
+ while(in) {
+ string line;
+ getline(in, line);
+ if (line.empty()) continue;
+ istringstream is(line);
+ int sent_id;
+ string file, s_origin, s_direction;
+    // input line: <path-to-forest.json.gz> <sent_id> <starting-point> <search-direction>
+ is >> file >> sent_id >> s_origin >> s_direction;
+ SparseVector<double> origin;
+ ReadSparseVectorString(s_origin, &origin);
+ SparseVector<double> direction;
+ ReadSparseVectorString(s_direction, &direction);
+ // cerr << "File: " << file << "\nDir: " << direction << "\n X: " << origin << endl;
+ if (last_file != file) {
+ last_file = file;
+ ReadFile rf(file);
+ HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+ }
+ const ConvexHullWeightFunction wf(origin, direction);
+ const ConvexHull hull = Inside<ConvexHull, ConvexHullWeightFunction>(hg, NULL, wf);
+
+ ErrorSurface es;
+ ComputeErrorSurface(*ds[sent_id], hull, &es, metric, hg);
+ //cerr << "Viterbi envelope has " << ve.size() << " segments\n";
+ // cerr << "Error surface has " << es.size() << " segments\n";
+ string val;
+ es.Serialize(&val);
+ cout << 'M' << ' ' << s_origin << ' ' << s_direction << '\t';
+ B64::b64encode(val.c_str(), val.size(), &cout);
+ cout << endl << flush;
+ }
+ return 0;
+}
diff --git a/training/dpmert/mr_dpmert_reduce.cc b/training/dpmert/mr_dpmert_reduce.cc
new file mode 100644
index 00000000..31512a03
--- /dev/null
+++ b/training/dpmert/mr_dpmert_reduce.cc
@@ -0,0 +1,77 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "sparse_vector.h"
+#include "error_surface.h"
+#include "line_optimizer.h"
+#include "b64tools.h"
+#include "stringlib.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("evaluation_metric,m",po::value<string>(), "Evaluation metric (IBM_BLEU, etc.)")
+ ("help,h", "Help");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ bool flag = conf->count("evaluation_metric") == 0;
+ if (flag || conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ const string evaluation_metric = conf["evaluation_metric"].as<string>();
+ EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric);
+ LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE;
+ if (metric->IsErrorMetric())
+ opt_type = LineOptimizer::MINIMIZE_SCORE;
+
+ vector<ErrorSurface> esv;
+ string last_key, line, key, val;
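+  // input lines (from mr_dpmert_map): "M <origin> <direction>\t<b64(ErrorSurface)>";
+  // surfaces sharing a key are merged and line-optimized jointly;
+  // output: "<origin> <direction>|<best step x>|<score>"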
+ while(getline(cin, line)) {
+ size_t ks = line.find("\t");
+ assert(string::npos != ks);
+ assert(ks > 2);
+ key = line.substr(2, ks - 2);
+ val = line.substr(ks + 1);
+ if (key != last_key) {
+ if (!last_key.empty()) {
+ float score;
+ double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score);
+ cout << last_key << "|" << x << "|" << score << endl;
+ }
+ last_key.swap(key);
+ esv.clear();
+ }
+ if (val.size() % 4 != 0) {
+ cerr << "B64 encoding error 1! Skipping.\n";
+ continue;
+ }
+ string encoded(val.size() / 4 * 3, '\0');
+ if (!B64::b64decode(reinterpret_cast<const unsigned char*>(&val[0]), val.size(), &encoded[0], encoded.size())) {
+ cerr << "B64 encoding error 2! Skipping.\n";
+ continue;
+ }
+ esv.push_back(ErrorSurface());
+ esv.back().Deserialize(encoded);
+ }
+ if (!esv.empty()) {
+ float score;
+ double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score);
+ cout << last_key << "|" << x << "|" << score << endl;
+ }
+ return 0;
+}
diff --git a/training/dpmert/test_aer/README b/training/dpmert/test_aer/README
new file mode 100644
index 00000000..819b2e32
--- /dev/null
+++ b/training/dpmert/test_aer/README
@@ -0,0 +1,8 @@
+To run the test:
+
+../dpmert.pl --local --metric aer cdec.ini --source-file corpus.src --ref-files=ref.0 --weights weights
+
+This will optimize the parameters of the tiny lexical translation model
+so as to minimize the AER of the Viterbi alignment on the development
+set in corpus.src according to the reference alignments in ref.0.
+
diff --git a/training/dpmert/test_aer/cdec.ini b/training/dpmert/test_aer/cdec.ini
new file mode 100644
index 00000000..08187848
--- /dev/null
+++ b/training/dpmert/test_aer/cdec.ini
@@ -0,0 +1,3 @@
+formalism=lextrans
+grammar=grammar
+aligner=true
diff --git a/training/dpmert/test_aer/corpus.src b/training/dpmert/test_aer/corpus.src
new file mode 100644
index 00000000..31b23971
--- /dev/null
+++ b/training/dpmert/test_aer/corpus.src
@@ -0,0 +1,3 @@
+el gato negro ||| the black cat
+el gato ||| the cat
+el libro ||| the book
diff --git a/training/dpmert/test_aer/grammar b/training/dpmert/test_aer/grammar
new file mode 100644
index 00000000..9d857824
--- /dev/null
+++ b/training/dpmert/test_aer/grammar
@@ -0,0 +1,12 @@
+el ||| cat ||| F1=1
+el ||| the ||| F2=1
+el ||| black ||| F3=1
+el ||| book ||| F11=1
+gato ||| cat ||| F4=1 NN=1
+gato ||| black ||| F5=1
+gato ||| the ||| F6=1
+negro ||| the ||| F7=1
+negro ||| cat ||| F8=1
+negro ||| black ||| F9=1
+libro ||| the ||| F10=1
+libro ||| book ||| F12=1 NN=1
diff --git a/training/dpmert/test_aer/ref.0 b/training/dpmert/test_aer/ref.0
new file mode 100644
index 00000000..734a9c5b
--- /dev/null
+++ b/training/dpmert/test_aer/ref.0
@@ -0,0 +1,3 @@
+0-0 1-2 2-1
+0-0 1-1
+0-0 1-1
diff --git a/training/dpmert/test_aer/weights b/training/dpmert/test_aer/weights
new file mode 100644
index 00000000..afc9282e
--- /dev/null
+++ b/training/dpmert/test_aer/weights
@@ -0,0 +1,13 @@
+F1 0.1
+F2 -.5980815
+F3 0.24235
+F4 0.625
+F5 0.4514
+F6 0.112316
+F7 -0.123415
+F8 -0.25390285
+F9 -0.23852
+F10 0.646
+F11 0.413141
+F12 0.343216
+NN -0.1215
diff --git a/training/dpmert/test_data/0.json.gz b/training/dpmert/test_data/0.json.gz
new file mode 100644
index 00000000..30f8dd77
--- /dev/null
+++ b/training/dpmert/test_data/0.json.gz
Binary files differ
diff --git a/training/dpmert/test_data/1.json.gz b/training/dpmert/test_data/1.json.gz
new file mode 100644
index 00000000..c82cc179
--- /dev/null
+++ b/training/dpmert/test_data/1.json.gz
Binary files differ
diff --git a/training/dpmert/test_data/c2e.txt.0 b/training/dpmert/test_data/c2e.txt.0
new file mode 100644
index 00000000..12c4abe9
--- /dev/null
+++ b/training/dpmert/test_data/c2e.txt.0
@@ -0,0 +1,2 @@
+australia reopens embassy in manila
+( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack .
diff --git a/training/dpmert/test_data/c2e.txt.1 b/training/dpmert/test_data/c2e.txt.1
new file mode 100644
index 00000000..4ac12df1
--- /dev/null
+++ b/training/dpmert/test_data/c2e.txt.1
@@ -0,0 +1,2 @@
+australia reopened manila embassy
+( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack .
diff --git a/training/dpmert/test_data/c2e.txt.2 b/training/dpmert/test_data/c2e.txt.2
new file mode 100644
index 00000000..2f67b72f
--- /dev/null
+++ b/training/dpmert/test_data/c2e.txt.2
@@ -0,0 +1,2 @@
+australia to reopen embassy in manila
+( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so-called confirmed terrorist attack threats .
diff --git a/training/dpmert/test_data/c2e.txt.3 b/training/dpmert/test_data/c2e.txt.3
new file mode 100644
index 00000000..5483cef6
--- /dev/null
+++ b/training/dpmert/test_data/c2e.txt.3
@@ -0,0 +1,2 @@
+australia to re - open its embassy to manila
+( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so-called " clear " threat of terrorist attack 7 weeks ago .
diff --git a/training/dpmert/test_data/re.txt.0 b/training/dpmert/test_data/re.txt.0
new file mode 100644
index 00000000..86eff087
--- /dev/null
+++ b/training/dpmert/test_data/re.txt.0
@@ -0,0 +1,5 @@
+erdogan states turkey to reject any pressures to urge it to recognize cyprus
+ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara will reject any pressure by the european union to urge it to recognize cyprus . this comes two weeks before the summit of european union state and government heads who will decide whether or nor membership negotiations with ankara should be opened .
+erdogan told " ntv " television station that " the european union cannot address us by imposing new conditions on us with regard to cyprus .
+we will discuss this dossier in the course of membership negotiations . "
+he added " let me be clear , i cannot sidestep turkey , this is something we cannot accept . "
diff --git a/training/dpmert/test_data/re.txt.1 b/training/dpmert/test_data/re.txt.1
new file mode 100644
index 00000000..2140f198
--- /dev/null
+++ b/training/dpmert/test_data/re.txt.1
@@ -0,0 +1,5 @@
+erdogan confirms turkey will resist any pressure to recognize cyprus
+ankara 12 - 1 ( afp ) - the turkish head of government , recep tayyip erdogan , announced today ( wednesday ) that ankara would resist any pressure the european union might exercise in order to force it into recognizing cyprus . this comes two weeks before a summit of european union heads of state and government , who will decide whether or not to open membership negotiations with ankara .
+erdogan said to the ntv television channel : " the european union cannot engage with us through imposing new conditions on us with regard to cyprus .
+we shall discuss this issue in the course of the membership negotiations . "
+he added : " let me be clear - i cannot confine turkey . this is something we do not accept . "
diff --git a/training/dpmert/test_data/re.txt.2 b/training/dpmert/test_data/re.txt.2
new file mode 100644
index 00000000..94e46286
--- /dev/null
+++ b/training/dpmert/test_data/re.txt.2
@@ -0,0 +1,5 @@
+erdogan confirms that turkey will reject any pressures to encourage it to recognize cyprus
+ankara , 12 / 1 ( afp ) - the turkish prime minister recep tayyip erdogan declared today , wednesday , that ankara will reject any pressures that the european union may apply on it to encourage to recognize cyprus . this comes two weeks before a summit of the heads of countries and governments of the european union , who will decide on whether or not to start negotiations on joining with ankara .
+erdogan told the ntv television station that " it is not possible for the european union to talk to us by imposing new conditions on us regarding cyprus .
+we shall discuss this dossier during the negotiations on joining . "
+and he added , " let me be clear . turkey's arm should not be twisted ; this is something we cannot accept . "
diff --git a/training/dpmert/test_data/re.txt.3 b/training/dpmert/test_data/re.txt.3
new file mode 100644
index 00000000..f87c3308
--- /dev/null
+++ b/training/dpmert/test_data/re.txt.3
@@ -0,0 +1,5 @@
+erdogan stresses that turkey will reject all pressures to force it to recognize cyprus
+ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara would refuse all pressures applied on it by the european union to force it to recognize cyprus . that came two weeks before the summit of the presidents and prime ministers of the european union , who would decide on whether to open negotiations on joining with ankara or not .
+erdogan said to " ntv " tv station that the " european union can not communicate with us by imposing on us new conditions related to cyprus .
+we will discuss this file during the negotiations on joining . "
+he added , " let me be clear . turkey's arm should not be twisted . this is unacceptable to us . "
diff --git a/training/dtrain/Makefile.am b/training/dtrain/Makefile.am
new file mode 100644
index 00000000..844c790d
--- /dev/null
+++ b/training/dtrain/Makefile.am
@@ -0,0 +1,7 @@
+bin_PROGRAMS = dtrain
+
+dtrain_SOURCES = dtrain.cc score.cc dtrain.h kbestget.h ksampler.h pairsampling.h score.h
+dtrain_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
+
diff --git a/training/dtrain/README.md b/training/dtrain/README.md
new file mode 100644
index 00000000..2ab2f232
--- /dev/null
+++ b/training/dtrain/README.md
@@ -0,0 +1,30 @@
+This is a simple (and parallelizable) tuning method for cdec
+which is able to train the weights of very many (sparse) features.
+It was used here:
+ "Joint Feature Selection in Distributed Stochastic
+ Learning for Large-Scale Discriminative Training in
+ SMT"
+(Simianer, Riezler, Dyer; ACL 2012)
+
+
+Building
+--------
+dtrain is built when building cdec; see ../BUILDING .
+To build only the parts needed for dtrain, do
+```
+ autoreconf -ifv
+ ./configure
+ cd training/dtrain/; make
+```
+
+Running
+-------
+See the directories under examples/ .
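+
+For example, a single (non-parallel) run on the files from the
+parallelized example might look like this (a sketch; see that
+directory's README):
+```
+  cd examples/parallelized
+  ../../dtrain -c dtrain.ini --input in --refs refs
+```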
+
+Legal
+-----
+Copyright (c) 2012-2013 by Patrick Simianer <p@simianer.de>
+
+See the file LICENSE.txt in the root folder for the licensing terms that this software is
+released under.
+
diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc
new file mode 100644
index 00000000..149f87d4
--- /dev/null
+++ b/training/dtrain/dtrain.cc
@@ -0,0 +1,553 @@
+#include "dtrain.h"
+
+
+bool
+dtrain_init(int argc, char** argv, po::variables_map* cfg)
+{
+ po::options_description ini("Configuration File Options");
+ ini.add_options()
+ ("input", po::value<string>()->default_value("-"), "input file (src)")
+ ("refs,r", po::value<string>(), "references")
+ ("output", po::value<string>()->default_value("-"), "output weights file, '-' for STDOUT")
+ ("input_weights", po::value<string>(), "input weights file (e.g. from previous iteration)")
+ ("decoder_config", po::value<string>(), "configuration file for cdec")
+ ("print_weights", po::value<string>(), "weights to print on each iteration")
+ ("stop_after", po::value<unsigned>()->default_value(0), "stop after X input sentences")
+ ("keep", po::value<bool>()->zero_tokens(), "keep weights files for each iteration")
+ ("epochs", po::value<unsigned>()->default_value(10), "# of iterations T (per shard)")
+ ("k", po::value<unsigned>()->default_value(100), "how many translations to sample")
+ ("sample_from", po::value<string>()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'")
+ ("filter", po::value<string>()->default_value("uniq"), "filter kbest list: 'not', 'uniq'")
+ ("pair_sampling", po::value<string>()->default_value("XYX"), "how to sample pairs: 'all', 'XYX' or 'PRO'")
+ ("hi_lo", po::value<float>()->default_value(0.1), "hi and lo (X) for XYX (default 0.1), <= 0.5")
+ ("pair_threshold", po::value<score_t>()->default_value(0.), "bleu [0,1] threshold to filter pairs")
+ ("N", po::value<unsigned>()->default_value(4), "N for Ngrams (BLEU)")
+ ("scorer", po::value<string>()->default_value("stupid_bleu"), "scoring: bleu, stupid_, smooth_, approx_, lc_")
+ ("learning_rate", po::value<weight_t>()->default_value(1.0), "learning rate")
+ ("gamma", po::value<weight_t>()->default_value(0.), "gamma for SVM (0 for perceptron)")
+ ("select_weights", po::value<string>()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)")
+ ("rescale", po::value<bool>()->zero_tokens(), "rescale weight vector after each input")
+ ("l1_reg", po::value<string>()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010) UNTESTED")
+ ("l1_reg_strength", po::value<weight_t>(), "l1 regularization strength")
+ ("fselect", po::value<weight_t>()->default_value(-1), "select top x percent (or by threshold) of features after each epoch NOT IMPLEMENTED") // TODO
+ ("approx_bleu_d", po::value<score_t>()->default_value(0.9), "discount for approx. BLEU")
+ ("scale_bleu_diff", po::value<bool>()->zero_tokens(), "learning rate <- bleu diff of a misranked pair")
+ ("loss_margin", po::value<weight_t>()->default_value(0.), "update if no error in pref pair but model scores this near")
+ ("max_pairs", po::value<unsigned>()->default_value(std::numeric_limits<unsigned>::max()), "max. # of pairs per Sent.")
+ ("noup", po::value<bool>()->zero_tokens(), "do not update weights");
+ po::options_description cl("Command Line Options");
+ cl.add_options()
+ ("config,c", po::value<string>(), "dtrain config file")
+ ("quiet,q", po::value<bool>()->zero_tokens(), "be quiet")
+ ("verbose,v", po::value<bool>()->zero_tokens(), "be verbose");
+ cl.add(ini);
+ po::store(parse_command_line(argc, argv, cl), *cfg);
+ if (cfg->count("config")) {
+ ifstream ini_f((*cfg)["config"].as<string>().c_str());
+ po::store(po::parse_config_file(ini_f, ini), *cfg);
+ }
+ po::notify(*cfg);
+ if (!cfg->count("decoder_config")) {
+ cerr << cl << endl;
+ return false;
+ }
+ if ((*cfg)["sample_from"].as<string>() != "kbest"
+ && (*cfg)["sample_from"].as<string>() != "forest") {
+ cerr << "Wrong 'sample_from' param: '" << (*cfg)["sample_from"].as<string>() << "', use 'kbest' or 'forest'." << endl;
+ return false;
+ }
+ if ((*cfg)["sample_from"].as<string>() == "kbest" && (*cfg)["filter"].as<string>() != "uniq" &&
+ (*cfg)["filter"].as<string>() != "not") {
+ cerr << "Wrong 'filter' param: '" << (*cfg)["filter"].as<string>() << "', use 'uniq' or 'not'." << endl;
+ return false;
+ }
+ if ((*cfg)["pair_sampling"].as<string>() != "all" && (*cfg)["pair_sampling"].as<string>() != "XYX" &&
+ (*cfg)["pair_sampling"].as<string>() != "PRO") {
+ cerr << "Wrong 'pair_sampling' param: '" << (*cfg)["pair_sampling"].as<string>() << "'." << endl;
+ return false;
+ }
+ if(cfg->count("hi_lo") && (*cfg)["pair_sampling"].as<string>() != "XYX") {
+ cerr << "Warning: hi_lo only works with pair_sampling XYX." << endl;
+ }
+ if((*cfg)["hi_lo"].as<float>() > 0.5 || (*cfg)["hi_lo"].as<float>() < 0.01) {
+ cerr << "hi_lo must lie in [0.01, 0.5]" << endl;
+ return false;
+ }
+ if ((*cfg)["pair_threshold"].as<score_t>() < 0) {
+ cerr << "The threshold must be >= 0!" << endl;
+ return false;
+ }
+ if ((*cfg)["select_weights"].as<string>() != "last" && (*cfg)["select_weights"].as<string>() != "best" &&
+ (*cfg)["select_weights"].as<string>() != "avg" && (*cfg)["select_weights"].as<string>() != "VOID") {
+ cerr << "Wrong 'select_weights' param: '" << (*cfg)["select_weights"].as<string>() << "', use 'last' or 'best'." << endl;
+ return false;
+ }
+ return true;
+}
+
+int
+main(int argc, char** argv)
+{
+ // handle most parameters
+ po::variables_map cfg;
+ if (!dtrain_init(argc, argv, &cfg)) exit(1); // something is wrong
+ bool quiet = false;
+ if (cfg.count("quiet")) quiet = true;
+ bool verbose = false;
+ if (cfg.count("verbose")) verbose = true;
+ bool noup = false;
+ if (cfg.count("noup")) noup = true;
+ bool rescale = false;
+ if (cfg.count("rescale")) rescale = true;
+ bool keep = false;
+ if (cfg.count("keep")) keep = true;
+
+ const unsigned k = cfg["k"].as<unsigned>();
+ const unsigned N = cfg["N"].as<unsigned>();
+ const unsigned T = cfg["epochs"].as<unsigned>();
+ const unsigned stop_after = cfg["stop_after"].as<unsigned>();
+ const string filter_type = cfg["filter"].as<string>();
+ const string sample_from = cfg["sample_from"].as<string>();
+ const string pair_sampling = cfg["pair_sampling"].as<string>();
+ const score_t pair_threshold = cfg["pair_threshold"].as<score_t>();
+ const string select_weights = cfg["select_weights"].as<string>();
+ const float hi_lo = cfg["hi_lo"].as<float>();
+ const score_t approx_bleu_d = cfg["approx_bleu_d"].as<score_t>();
+ const unsigned max_pairs = cfg["max_pairs"].as<unsigned>();
+ weight_t loss_margin = cfg["loss_margin"].as<weight_t>();
+ if (loss_margin > 9998.) loss_margin = std::numeric_limits<float>::max();
+ bool scale_bleu_diff = false;
+ if (cfg.count("scale_bleu_diff")) scale_bleu_diff = true;
+ bool average = false;
+ if (select_weights == "avg")
+ average = true;
+ vector<string> print_weights;
+ if (cfg.count("print_weights"))
+ boost::split(print_weights, cfg["print_weights"].as<string>(), boost::is_any_of(" "));
+
+
+ // setup decoder
+ register_feature_functions();
+ SetSilent(true);
+ ReadFile ini_rf(cfg["decoder_config"].as<string>());
+ if (!quiet)
+ cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
+ Decoder decoder(ini_rf.stream());
+
+ // scoring metric/scorer
+ string scorer_str = cfg["scorer"].as<string>();
+ LocalScorer* scorer;
+ if (scorer_str == "bleu") {
+ scorer = dynamic_cast<BleuScorer*>(new BleuScorer);
+ } else if (scorer_str == "stupid_bleu") {
+ scorer = dynamic_cast<StupidBleuScorer*>(new StupidBleuScorer);
+ } else if (scorer_str == "fixed_stupid_bleu") {
+ scorer = dynamic_cast<FixedStupidBleuScorer*>(new FixedStupidBleuScorer);
+ } else if (scorer_str == "smooth_bleu") {
+ scorer = dynamic_cast<SmoothBleuScorer*>(new SmoothBleuScorer);
+ } else if (scorer_str == "sum_bleu") {
+ scorer = dynamic_cast<SumBleuScorer*>(new SumBleuScorer);
+ } else if (scorer_str == "sumexp_bleu") {
+ scorer = dynamic_cast<SumExpBleuScorer*>(new SumExpBleuScorer);
+ } else if (scorer_str == "sumwhatever_bleu") {
+ scorer = dynamic_cast<SumWhateverBleuScorer*>(new SumWhateverBleuScorer);
+ } else if (scorer_str == "approx_bleu") {
+ scorer = dynamic_cast<ApproxBleuScorer*>(new ApproxBleuScorer(N, approx_bleu_d));
+ } else if (scorer_str == "lc_bleu") {
+ scorer = dynamic_cast<LinearBleuScorer*>(new LinearBleuScorer(N));
+ } else {
+ cerr << "Don't know scoring metric: '" << scorer_str << "', exiting." << endl;
+ exit(1);
+ }
+ vector<score_t> bleu_weights;
+ scorer->Init(N, bleu_weights);
+
+ // setup decoder observer
+ MT19937 rng; // random number generator, only for forest sampling
+ HypSampler* observer;
+ if (sample_from == "kbest")
+ observer = dynamic_cast<KBestGetter*>(new KBestGetter(k, filter_type));
+ else
+ observer = dynamic_cast<KSampler*>(new KSampler(k, &rng));
+ observer->SetScorer(scorer);
+
+ // init weights
+ vector<weight_t>& dense_weights = decoder.CurrentWeightVector();
+ SparseVector<weight_t> lambdas, cumulative_penalties, w_average;
+ if (cfg.count("input_weights")) Weights::InitFromFile(cfg["input_weights"].as<string>(), &dense_weights);
+ Weights::InitSparseVector(dense_weights, &lambdas);
+
+ // meta params for perceptron, SVM
+ weight_t eta = cfg["learning_rate"].as<weight_t>();
+ weight_t gamma = cfg["gamma"].as<weight_t>();
+
+  // faster perceptron: consider only misranked pairs
+  // DO NOT ENABLE WITH SVM (gamma > 0) OR loss_margin!
+ bool faster_perceptron = false;
+ if (gamma==0 && loss_margin==0) faster_perceptron = true;
+
+ // l1 regularization
+ bool l1naive = false;
+ bool l1clip = false;
+ bool l1cumul = false;
+ weight_t l1_reg = 0;
+ if (cfg["l1_reg"].as<string>() != "none") {
+ string s = cfg["l1_reg"].as<string>();
+ if (s == "naive") l1naive = true;
+ else if (s == "clip") l1clip = true;
+ else if (s == "cumul") l1cumul = true;
+ l1_reg = cfg["l1_reg_strength"].as<weight_t>();
+ }
+
+ // output
+ string output_fn = cfg["output"].as<string>();
+ // input
+ string input_fn = cfg["input"].as<string>();
+ ReadFile input(input_fn);
+ // buffer input for t > 0
+ vector<string> src_str_buf; // source strings (decoder takes only strings)
+ vector<vector<WordID> > ref_ids_buf; // references as WordID vecs
+ string refs_fn = cfg["refs"].as<string>();
+ ReadFile refs(refs_fn);
+
+  unsigned in_sz = std::numeric_limits<unsigned>::max(); // input size (# lines), set after the first epoch
+ vector<pair<score_t, score_t> > all_scores;
+ score_t max_score = 0.;
+ unsigned best_it = 0;
+ float overall_time = 0.;
+
+ // output cfg
+ if (!quiet) {
+ cerr << _p5;
+ cerr << endl << "dtrain" << endl << "Parameters:" << endl;
+ cerr << setw(25) << "k " << k << endl;
+ cerr << setw(25) << "N " << N << endl;
+ cerr << setw(25) << "T " << T << endl;
+ cerr << setw(26) << "scorer '" << scorer_str << "'" << endl;
+ if (scorer_str == "approx_bleu")
+ cerr << setw(25) << "approx. B discount " << approx_bleu_d << endl;
+ cerr << setw(25) << "sample from " << "'" << sample_from << "'" << endl;
+ if (sample_from == "kbest")
+ cerr << setw(25) << "filter " << "'" << filter_type << "'" << endl;
+ if (!scale_bleu_diff) cerr << setw(25) << "learning rate " << eta << endl;
+ else cerr << setw(25) << "learning rate " << "bleu diff" << endl;
+ cerr << setw(25) << "gamma " << gamma << endl;
+ cerr << setw(25) << "loss margin " << loss_margin << endl;
+ cerr << setw(25) << "faster perceptron " << faster_perceptron << endl;
+ cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl;
+ if (pair_sampling == "XYX")
+ cerr << setw(25) << "hi lo " << hi_lo << endl;
+ cerr << setw(25) << "pair threshold " << pair_threshold << endl;
+ cerr << setw(25) << "select weights " << "'" << select_weights << "'" << endl;
+ if (cfg.count("l1_reg"))
+ cerr << setw(25) << "l1 reg " << l1_reg << " '" << cfg["l1_reg"].as<string>() << "'" << endl;
+ if (rescale)
+ cerr << setw(25) << "rescale " << rescale << endl;
+ cerr << setw(25) << "max pairs " << max_pairs << endl;
+ cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
+ cerr << setw(25) << "input " << "'" << input_fn << "'" << endl;
+ cerr << setw(25) << "refs " << "'" << refs_fn << "'" << endl;
+ cerr << setw(25) << "output " << "'" << output_fn << "'" << endl;
+ if (cfg.count("input_weights"))
+ cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as<string>() << "'" << endl;
+ if (stop_after > 0)
+ cerr << setw(25) << "stop_after " << stop_after << endl;
+ if (!verbose) cerr << "(a dot represents " << DTRAIN_DOTS << " inputs)" << endl;
+ }
+
+
+ for (unsigned t = 0; t < T; t++) // T epochs
+ {
+
+ time_t start, end;
+ time(&start);
+ score_t score_sum = 0.;
+ score_t model_sum(0);
+ unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0, f_count = 0, list_sz = 0;
+ if (!quiet) cerr << "Iteration #" << t+1 << " of " << T << "." << endl;
+
+ while(true)
+ {
+
+ string in;
+ bool next = false, stop = false; // next iteration or premature stop
+ if (t == 0) {
+ if(!getline(*input, in)) next = true;
+ } else {
+ if (ii == in_sz) next = true; // stop if we reach the end of our input
+ }
+ // stop after X sentences (but still go on for those)
+ if (stop_after > 0 && stop_after == ii && !next) stop = true;
+
+ // produce some pretty output
+ if (!quiet && !verbose) {
+ if (ii == 0) cerr << " ";
+ if ((ii+1) % (DTRAIN_DOTS) == 0) {
+ cerr << ".";
+ cerr.flush();
+ }
+ if ((ii+1) % (20*DTRAIN_DOTS) == 0) {
+ cerr << " " << ii+1 << endl;
+ if (!next && !stop) cerr << " ";
+ }
+ if (stop) {
+ if (ii % (20*DTRAIN_DOTS) != 0) cerr << " " << ii << endl;
+ cerr << "Stopping after " << stop_after << " input sentences." << endl;
+ } else {
+ if (next) {
+ if (ii % (20*DTRAIN_DOTS) != 0) cerr << " " << ii << endl;
+ }
+ }
+ }
+
+ // next iteration
+ if (next || stop) break;
+
+ // weights
+ lambdas.init_vector(&dense_weights);
+
+ // getting input
+ vector<WordID> ref_ids; // reference as vector<WordID>
+ if (t == 0) {
+ string r_;
+ getline(*refs, r_);
+ vector<string> ref_tok;
+ boost::split(ref_tok, r_, boost::is_any_of(" "));
+ register_and_convert(ref_tok, ref_ids);
+ ref_ids_buf.push_back(ref_ids);
+ src_str_buf.push_back(in);
+ } else {
+ ref_ids = ref_ids_buf[ii];
+ }
+ observer->SetRef(ref_ids);
+ if (t == 0)
+ decoder.Decode(in, observer);
+ else
+ decoder.Decode(src_str_buf[ii], observer);
+
+ // get (scored) samples
+ vector<ScoredHyp>* samples = observer->GetSamples();
+
+ if (verbose) {
+ cerr << "--- ref for " << ii << ": ";
+ if (t > 0) printWordIDVec(ref_ids_buf[ii]);
+ else printWordIDVec(ref_ids);
+ cerr << endl;
+ for (unsigned u = 0; u < samples->size(); u++) {
+ cerr << _p2 << _np << "[" << u << ". '";
+ printWordIDVec((*samples)[u].w);
+ cerr << "'" << endl;
+ cerr << "SCORE=" << (*samples)[u].score << ",model="<< (*samples)[u].model << endl;
+ cerr << "F{" << (*samples)[u].f << "} ]" << endl << endl;
+ }
+ }
+
+ score_sum += (*samples)[0].score; // stats for 1best
+ model_sum += (*samples)[0].model;
+
+ f_count += observer->get_f_count();
+ list_sz += observer->get_sz();
+
+ // weight updates
+ if (!noup) {
+ // get pairs
+ vector<pair<ScoredHyp,ScoredHyp> > pairs;
+ if (pair_sampling == "all")
+ all_pairs(samples, pairs, pair_threshold, max_pairs, faster_perceptron);
+ if (pair_sampling == "XYX")
+ partXYX(samples, pairs, pair_threshold, max_pairs, faster_perceptron, hi_lo);
+ if (pair_sampling == "PRO")
+ PROsampling(samples, pairs, pair_threshold, max_pairs);
+ npairs += pairs.size();
+
+ for (vector<pair<ScoredHyp,ScoredHyp> >::iterator it = pairs.begin();
+ it != pairs.end(); it++) {
+ bool rank_error;
+ score_t margin;
+        if (faster_perceptron) { // we are only considering misranked pairs
+ rank_error = true; // pair sampling already did this for us
+ margin = std::numeric_limits<float>::max();
+ } else {
+ rank_error = it->first.model <= it->second.model;
+ margin = fabs(fabs(it->first.model) - fabs(it->second.model));
+ if (!rank_error && margin < loss_margin) margin_violations++;
+ }
+ if (rank_error) rank_errors++;
+ if (scale_bleu_diff) eta = it->first.score - it->second.score;
+ if (rank_error || margin < loss_margin) {
+ SparseVector<weight_t> diff_vec = it->first.f - it->second.f;
+ lambdas.plus_eq_v_times_s(diff_vec, eta);
+ if (gamma)
+ lambdas.plus_eq_v_times_s(lambdas, -2*gamma*eta*(1./npairs));
+ }
+ }
+
+ // l1 regularization
+ // please note that this penalizes _all_ weights
+ // (contrary to only the ones changed by the last update)
+ // after a _sentence_ (not after each example/pair)
+ if (l1naive) {
+ FastSparseVector<weight_t>::iterator it = lambdas.begin();
+ for (; it != lambdas.end(); ++it) {
+ it->second -= sign(it->second) * l1_reg;
+ }
+ } else if (l1clip) {
+ FastSparseVector<weight_t>::iterator it = lambdas.begin();
+ for (; it != lambdas.end(); ++it) {
+ if (it->second != 0) {
+ weight_t v = it->second;
+ if (v > 0) {
+ it->second = max(0., v - l1_reg);
+ } else {
+ it->second = min(0., v + l1_reg);
+ }
+ }
+ }
+ } else if (l1cumul) {
+ weight_t acc_penalty = (ii+1) * l1_reg; // ii is the index of the current input
+ FastSparseVector<weight_t>::iterator it = lambdas.begin();
+ for (; it != lambdas.end(); ++it) {
+ if (it->second != 0) {
+ weight_t v = it->second;
+ weight_t penalized = 0.;
+ if (v > 0) {
+ penalized = max(0., v-(acc_penalty + cumulative_penalties.get(it->first)));
+ } else {
+ penalized = min(0., v+(acc_penalty - cumulative_penalties.get(it->first)));
+ }
+ it->second = penalized;
+ cumulative_penalties.set_value(it->first, cumulative_penalties.get(it->first)+penalized);
+ }
+ }
+ }
+
+ }
+
+ if (rescale) lambdas /= lambdas.l2norm();
+
+ ++ii;
+
+ } // input loop
+
+ if (average) w_average += lambdas;
+
+ if (scorer_str == "approx_bleu" || scorer_str == "lc_bleu") scorer->Reset();
+
+ if (t == 0) {
+ in_sz = ii; // remember size of input (# lines)
+ }
+
+ // print some stats
+ score_t score_avg = score_sum/(score_t)in_sz;
+ score_t model_avg = model_sum/(score_t)in_sz;
+ score_t score_diff, model_diff;
+ if (t > 0) {
+ score_diff = score_avg - all_scores[t-1].first;
+ model_diff = model_avg - all_scores[t-1].second;
+ } else {
+ score_diff = score_avg;
+ model_diff = model_avg;
+ }
+
+ unsigned nonz = 0;
+ if (!quiet) nonz = (unsigned)lambdas.num_nonzero();
+
+ if (!quiet) {
+ cerr << _p5 << _p << "WEIGHTS" << endl;
+ for (vector<string>::iterator it = print_weights.begin(); it != print_weights.end(); it++) {
+ cerr << setw(18) << *it << " = " << lambdas.get(FD::Convert(*it)) << endl;
+ }
+ cerr << " ---" << endl;
+ cerr << _np << " 1best avg score: " << score_avg;
+ cerr << _p << " (" << score_diff << ")" << endl;
+ cerr << _np << " 1best avg model score: " << model_avg;
+ cerr << _p << " (" << model_diff << ")" << endl;
+ cerr << " avg # pairs: ";
+ cerr << _np << npairs/(float)in_sz;
+ if (faster_perceptron) cerr << " (meaningless)";
+ cerr << endl;
+ cerr << " avg # rank err: ";
+ cerr << rank_errors/(float)in_sz << endl;
+ cerr << " avg # margin viol: ";
+ cerr << margin_violations/(float)in_sz << endl;
+ cerr << " non0 feature count: " << nonz << endl;
+ cerr << " avg list sz: " << list_sz/(float)in_sz << endl;
+ cerr << " avg f count: " << f_count/(float)list_sz << endl;
+ }
+
+ pair<score_t,score_t> remember;
+ remember.first = score_avg;
+ remember.second = model_avg;
+ all_scores.push_back(remember);
+ if (score_avg > max_score) {
+ max_score = score_avg;
+ best_it = t;
+ }
+ time (&end);
+ float time_diff = difftime(end, start);
+ overall_time += time_diff;
+ if (!quiet) {
+ cerr << _p2 << _np << "(time " << time_diff/60. << " min, ";
+ cerr << time_diff/in_sz << " s/S)" << endl;
+ }
+ if (t+1 != T && !quiet) cerr << endl;
+
+ if (noup) break;
+
+ // write weights to file
+ if (select_weights == "best" || keep) {
+ lambdas.init_vector(&dense_weights);
+ string w_fn = "weights." + boost::lexical_cast<string>(t) + ".gz";
+ Weights::WriteToFile(w_fn, dense_weights, true);
+ }
+
+ } // outer loop
+
+ if (average) w_average /= (weight_t)T;
+
+ if (!noup) {
+ if (!quiet) cerr << endl << "Writing weights file to '" << output_fn << "' ..." << endl;
+ if (select_weights == "last" || average) { // last, average
+ WriteFile of(output_fn); // works with '-'
+ ostream& o = *of.stream();
+ o.precision(17);
+ o << _np;
+ if (average) {
+ for (SparseVector<weight_t>::iterator it = w_average.begin(); it != w_average.end(); ++it) {
+ if (it->second == 0) continue;
+ o << FD::Convert(it->first) << '\t' << it->second << endl;
+ }
+ } else {
+ for (SparseVector<weight_t>::iterator it = lambdas.begin(); it != lambdas.end(); ++it) {
+ if (it->second == 0) continue;
+ o << FD::Convert(it->first) << '\t' << it->second << endl;
+ }
+ }
+ } else if (select_weights == "VOID") { // do nothing with the weights
+ } else { // best
+ if (output_fn != "-") {
+ CopyFile("weights."+boost::lexical_cast<string>(best_it)+".gz", output_fn);
+ } else {
+ ReadFile bestw("weights."+boost::lexical_cast<string>(best_it)+".gz");
+ string o;
+ cout.precision(17);
+ cout << _np;
+ while(getline(*bestw, o)) cout << o << endl;
+ }
+ if (!keep) {
+ for (unsigned i = 0; i < T; i++) {
+ string s = "weights." + boost::lexical_cast<string>(i) + ".gz";
+ unlink(s.c_str());
+ }
+ }
+ }
+ if (!quiet) cerr << "done" << endl;
+ }
+
+ if (!quiet) {
+ cerr << _p5 << _np << endl << "---" << endl << "Best iteration: ";
+ cerr << best_it+1 << " [SCORE '" << scorer_str << "'=" << max_score << "]." << endl;
+ cerr << "This took " << overall_time/60. << " min." << endl;
+ }
+}
+
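The weight-writing block above implements the four `select_weights` modes; restated as a sketch (not part of the patch itself):

    // "last":        write the current lambdas to output_fn
    // average flag:  write w_average (per-epoch sum divided by T)
    // "VOID":        write nothing
    // "best":        copy weights.<best_it>.gz to output_fn (or cat it for '-'),
    //                then unlink the per-epoch weights.<t>.gz files unless 'keep' is set
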
diff --git a/training/dtrain/dtrain.h b/training/dtrain/dtrain.h
new file mode 100644
index 00000000..eb0b9f17
--- /dev/null
+++ b/training/dtrain/dtrain.h
@@ -0,0 +1,92 @@
+#ifndef _DTRAIN_H_
+#define _DTRAIN_H_
+
+#define DTRAIN_DOTS 10 // after how many inputs to display a '.'
+#define DTRAIN_SCALE 100000
+
+#include <iomanip>
+#include <climits>
+#include <string.h>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/program_options.hpp>
+
+#include "ksampler.h"
+#include "pairsampling.h"
+
+#include "filelib.h"
+
+
+using namespace std;
+using namespace dtrain;
+namespace po = boost::program_options;
+
+inline void register_and_convert(const vector<string>& strs, vector<WordID>& ids)
+{
+ vector<string>::const_iterator it;
+ for (it = strs.begin(); it < strs.end(); it++)
+ ids.push_back(TD::Convert(*it));
+}
+
+inline string gettmpf(const string path, const string infix)
+{
+ char fn[path.size() + infix.size() + 9]; // "/" + infix + "-XXXXXX" + '\0'
+ strcpy(fn, path.c_str());
+ strcat(fn, "/");
+ strcat(fn, infix.c_str());
+ strcat(fn, "-XXXXXX");
+ if (mkstemp(fn) == -1) { // mkstemp returns -1 on failure, a file descriptor otherwise
+ cerr << "Cannot make temp file in " << path << ", exiting." << endl;
+ exit(1);
+ }
+ return string(fn);
+}
+
+inline void split_in(string& s, vector<string>& parts)
+{
+ unsigned f = 0;
+ for(unsigned i = 0; i < 3; i++) {
+ unsigned e = f;
+ f = s.find("\t", f+1);
+ if (e != 0) parts.push_back(s.substr(e+1, f-e-1));
+ else parts.push_back(s.substr(0, f));
+ }
+ s.erase(0, f+1);
+}
+
+struct HSReporter
+{
+ string task_id_;
+
+ HSReporter(string task_id) : task_id_(task_id) {}
+
+ inline void update_counter(string name, unsigned amount) {
+ cerr << "reporter:counter:" << task_id_ << "," << name << "," << amount << endl;
+ }
+ inline void update_gcounter(string name, unsigned amount) {
+ cerr << "reporter:counter:Global," << name << "," << amount << endl;
+ }
+};
+
+inline ostream& _np(ostream& out) { return out << resetiosflags(ios::showpos); }
+inline ostream& _p(ostream& out) { return out << setiosflags(ios::showpos); }
+inline ostream& _p2(ostream& out) { return out << setprecision(2); }
+inline ostream& _p5(ostream& out) { return out << setprecision(5); }
+
+inline void printWordIDVec(vector<WordID>& v)
+{
+ for (unsigned i = 0; i < v.size(); i++) {
+ cerr << TD::Convert(v[i]);
+ if (i < v.size()-1) cerr << " ";
+ }
+}
+
+template<typename T>
+inline T sign(T z)
+{
+ if (z == 0) return 0;
+ return z < 0 ? -1 : +1;
+}
+
+#endif
+
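The `_p`/`_np`/`_p2`/`_p5` manipulators defined above produce the signed, fixed-precision numbers in dtrain's log output. A minimal standalone sketch of how they chain, assuming the definitions from this header:

    #include <iostream>
    int main()
    {
      double score_diff = 0.27697;
      std::cerr << _p5 << _p << score_diff << std::endl; // prints "+0.27697" (5 significant digits, forced sign)
      std::cerr << _np << score_diff << std::endl;       // prints "0.27697" (showpos reset)
      return 0;
    }
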
diff --git a/training/dtrain/examples/parallelized/README b/training/dtrain/examples/parallelized/README
new file mode 100644
index 00000000..89715105
--- /dev/null
+++ b/training/dtrain/examples/parallelized/README
@@ -0,0 +1,5 @@
+run, for example,
+ ../../parallelize.rb -c ./dtrain.ini -e 4 -s 2 -p 2 -i ./in -r ./refs
+
+the final weights will be in the file work/weights.3
+
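For orientation: a run with 2 shards leaves files like the ones checked in under work/ below, i.e. shard.<shard>.<epoch>.{in,refs} (the split input), out.<shard>.<epoch> (per-shard logs), weights.<shard>.<epoch> (per-shard weights), and weights.<epoch> (the weights merged across shards by lplp.rb after each epoch).
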
diff --git a/training/dtrain/examples/parallelized/cdec.ini b/training/dtrain/examples/parallelized/cdec.ini
new file mode 100644
index 00000000..e43ba1c4
--- /dev/null
+++ b/training/dtrain/examples/parallelized/cdec.ini
@@ -0,0 +1,22 @@
+formalism=scfg
+add_pass_through_rules=true
+intersection_strategy=cube_pruning
+cubepruning_pop_limit=200
+scfg_max_span_limit=15
+feature_function=WordPenalty
+feature_function=KLanguageModel ../example/nc-wmt11.en.srilm.gz
+#feature_function=ArityPenalty
+#feature_function=CMR2008ReorderingFeatures
+#feature_function=Dwarf
+#feature_function=InputIndicator
+#feature_function=LexNullJump
+#feature_function=NewJump
+#feature_function=NgramFeatures
+#feature_function=NonLatinCount
+#feature_function=OutputIndicator
+#feature_function=RuleIdentityFeatures
+#feature_function=RuleNgramFeatures
+#feature_function=RuleShape
+#feature_function=SourceSpanSizeFeatures
+#feature_function=SourceWordPenalty
+#feature_function=SpanFeatures
diff --git a/training/dtrain/examples/parallelized/dtrain.ini b/training/dtrain/examples/parallelized/dtrain.ini
new file mode 100644
index 00000000..f19ef891
--- /dev/null
+++ b/training/dtrain/examples/parallelized/dtrain.ini
@@ -0,0 +1,16 @@
+k=100
+N=4
+learning_rate=0.0001
+gamma=0
+loss_margin=1.0
+epochs=1
+scorer=stupid_bleu
+sample_from=kbest
+filter=uniq
+pair_sampling=XYX
+hi_lo=0.1
+select_weights=last
+print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
+# newer versions of the grammar extractor use different feature names:
+#print_weights=EgivenFCoherent SampleCountF CountEF MaxLexFgivenE MaxLexEgivenF IsSingletonF IsSingletonFE Glue WordPenalty PassThrough LanguageModel LanguageModel_OOV
+decoder_config=cdec.ini
diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.0.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.0.gz
new file mode 100644
index 00000000..1e28a24b
--- /dev/null
+++ b/training/dtrain/examples/parallelized/grammar/grammar.out.0.gz
Binary files differ
diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.1.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.1.gz
new file mode 100644
index 00000000..372f5675
--- /dev/null
+++ b/training/dtrain/examples/parallelized/grammar/grammar.out.1.gz
Binary files differ
diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.2.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.2.gz
new file mode 100644
index 00000000..145d0dc0
--- /dev/null
+++ b/training/dtrain/examples/parallelized/grammar/grammar.out.2.gz
Binary files differ
diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.3.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.3.gz
new file mode 100644
index 00000000..105593ff
--- /dev/null
+++ b/training/dtrain/examples/parallelized/grammar/grammar.out.3.gz
Binary files differ
diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.4.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.4.gz
new file mode 100644
index 00000000..30781f48
--- /dev/null
+++ b/training/dtrain/examples/parallelized/grammar/grammar.out.4.gz
Binary files differ
diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.5.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.5.gz
new file mode 100644
index 00000000..834ee759
--- /dev/null
+++ b/training/dtrain/examples/parallelized/grammar/grammar.out.5.gz
Binary files differ
diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.6.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.6.gz
new file mode 100644
index 00000000..2e76f348
--- /dev/null
+++ b/training/dtrain/examples/parallelized/grammar/grammar.out.6.gz
Binary files differ
diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.7.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.7.gz
new file mode 100644
index 00000000..3741a887
--- /dev/null
+++ b/training/dtrain/examples/parallelized/grammar/grammar.out.7.gz
Binary files differ
diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.8.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.8.gz
new file mode 100644
index 00000000..ebf6bd0c
--- /dev/null
+++ b/training/dtrain/examples/parallelized/grammar/grammar.out.8.gz
Binary files differ
diff --git a/training/dtrain/examples/parallelized/grammar/grammar.out.9.gz b/training/dtrain/examples/parallelized/grammar/grammar.out.9.gz
new file mode 100644
index 00000000..c1791059
--- /dev/null
+++ b/training/dtrain/examples/parallelized/grammar/grammar.out.9.gz
Binary files differ
diff --git a/training/dtrain/examples/parallelized/in b/training/dtrain/examples/parallelized/in
new file mode 100644
index 00000000..51d01fe7
--- /dev/null
+++ b/training/dtrain/examples/parallelized/in
@@ -0,0 +1,10 @@
+<seg grammar="grammar/grammar.out.0.gz" id="0">europas nach rassen geteiltes haus</seg>
+<seg grammar="grammar/grammar.out.1.gz" id="1">ein gemeinsames merkmal aller extremen rechten in europa ist ihr rassismus und die tatsache , daß sie das einwanderungsproblem als politischen hebel benutzen .</seg>
+<seg grammar="grammar/grammar.out.2.gz" id="2">der lega nord in italien , der vlaams block in den niederlanden , die anhänger von le pens nationaler front in frankreich , sind beispiele für parteien oder bewegungen , die sich um das gemeinsame thema : ablehnung der zuwanderung gebildet haben und um forderung nach einer vereinfachten politik , um sie zu regeln .</seg>
+<seg grammar="grammar/grammar.out.3.gz" id="3">während individuen wie jörg haidar und jean @-@ marie le pen kommen und ( leider nicht zu bald ) wieder gehen mögen , wird die rassenfrage aus der europäischer politik nicht so bald verschwinden .</seg>
+<seg grammar="grammar/grammar.out.4.gz" id="4">eine alternde einheimische bevölkerung und immer offenere grenzen vermehren die rassistische zersplitterung in den europäischen ländern .</seg>
+<seg grammar="grammar/grammar.out.5.gz" id="5">die großen parteien der rechten und der linken mitte haben sich dem problem gestellt , in dem sie den kopf in den sand gesteckt und allen aussichten zuwider gehofft haben , es möge bald verschwinden .</seg>
+<seg grammar="grammar/grammar.out.6.gz" id="6">das aber wird es nicht , wie die geschichte des rassismus in amerika deutlich zeigt .</seg>
+<seg grammar="grammar/grammar.out.7.gz" id="7">die beziehungen zwischen den rassen standen in den usa über jahrzehnte - und tun das noch heute - im zentrum der politischen debatte . das ging so weit , daß rassentrennung genauso wichtig wie das einkommen wurde , - wenn nicht sogar noch wichtiger - um politische zuneigungen und einstellungen zu bestimmen .</seg>
+<seg grammar="grammar/grammar.out.8.gz" id="8">der erste schritt , um mit der rassenfrage umzugehen ist , ursache und folgen rassistischer feindseligkeiten zu verstehen , auch dann , wenn das bedeutet , unangenehme tatsachen aufzudecken .</seg>
+<seg grammar="grammar/grammar.out.9.gz" id="9">genau das haben in den usa eine große anzahl an forschungsvorhaben in wirtschaft , soziologie , psychologie und politikwissenschaft geleistet . diese forschungen zeigten , daß menschen unterschiedlicher rasse einander deutlich weniger vertrauen .</seg>
diff --git a/training/dtrain/examples/parallelized/refs b/training/dtrain/examples/parallelized/refs
new file mode 100644
index 00000000..632e27b0
--- /dev/null
+++ b/training/dtrain/examples/parallelized/refs
@@ -0,0 +1,10 @@
+europe 's divided racial house
+a common feature of europe 's extreme right is its racism and use of the immigration issue as a political wedge .
+the lega nord in italy , the vlaams blok in the netherlands , the supporters of le pen 's national front in france , are all examples of parties or movements formed on the common theme of aversion to immigrants and promotion of simplistic policies to control them .
+while individuals like jorg haidar and jean @-@ marie le pen may come and ( never to soon ) go , the race question will not disappear from european politics anytime soon .
+an aging population at home and ever more open borders imply increasing racial fragmentation in european countries .
+mainstream parties of the center left and center right have confronted this prospect by hiding their heads in the ground , hoping against hope that the problem will disappear .
+it will not , as america 's racial history clearly shows .
+race relations in the us have been for decades - and remain - at the center of political debate , to the point that racial cleavages are as important as income , if not more , as determinants of political preferences and attitudes .
+the first step to address racial politics is to understand the origin and consequences of racial animosity , even if it means uncovering unpleasant truths .
+this is precisely what a large amount of research in economics , sociology , psychology and political science has done for the us .
diff --git a/training/dtrain/examples/parallelized/work/out.0.0 b/training/dtrain/examples/parallelized/work/out.0.0
new file mode 100644
index 00000000..7a00ed0f
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/out.0.0
@@ -0,0 +1,61 @@
+ cdec cfg 'cdec.ini'
+Loading the LM will be faster if you build a binary file.
+Reading ../example/nc-wmt11.en.srilm.gz
+----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
+****************************************************************************************************
+Seeding random number sequence to 3121929377
+
+dtrain
+Parameters:
+ k 100
+ N 4
+ T 1
+ scorer 'stupid_bleu'
+ sample from 'kbest'
+ filter 'uniq'
+ learning rate 0.0001
+ gamma 0
+ loss margin 1
+ pairs 'XYX'
+ hi lo 0.1
+ pair threshold 0
+ select weights 'last'
+ l1 reg 0 'none'
+ max pairs 4294967295
+ cdec cfg 'cdec.ini'
+ input 'work/shard.0.0.in'
+ refs 'work/shard.0.0.refs'
+ output 'work/weights.0.0'
+(a dot represents 10 inputs)
+Iteration #1 of 1.
+ 5
+WEIGHTS
+ Glue = +0.2663
+ WordPenalty = -0.0079042
+ LanguageModel = +0.44782
+ LanguageModel_OOV = -0.0401
+ PhraseModel_0 = -0.193
+ PhraseModel_1 = +0.71321
+ PhraseModel_2 = +0.85196
+ PhraseModel_3 = -0.43986
+ PhraseModel_4 = -0.44803
+ PhraseModel_5 = -0.0538
+ PhraseModel_6 = -0.1788
+ PassThrough = -0.1477
+ ---
+ 1best avg score: 0.17521 (+0.17521)
+ 1best avg model score: 21.556 (+21.556)
+ avg # pairs: 1671.2
+ avg # rank err: 1118.6
+ avg # margin viol: 552.6
+ non0 feature count: 12
+ avg list sz: 100
+ avg f count: 11.32
+(time 0.37 min, 4.4 s/S)
+
+Writing weights file to 'work/weights.0.0' ...
+done
+
+---
+Best iteration: 1 [SCORE 'stupid_bleu'=0.17521].
+This took 0.36667 min.
diff --git a/training/dtrain/examples/parallelized/work/out.0.1 b/training/dtrain/examples/parallelized/work/out.0.1
new file mode 100644
index 00000000..e2bd6649
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/out.0.1
@@ -0,0 +1,62 @@
+ cdec cfg 'cdec.ini'
+Loading the LM will be faster if you build a binary file.
+Reading ../example/nc-wmt11.en.srilm.gz
+----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
+****************************************************************************************************
+Seeding random number sequence to 2767202922
+
+dtrain
+Parameters:
+ k 100
+ N 4
+ T 1
+ scorer 'stupid_bleu'
+ sample from 'kbest'
+ filter 'uniq'
+ learning rate 0.0001
+ gamma 0
+ loss margin 1
+ pairs 'XYX'
+ hi lo 0.1
+ pair threshold 0
+ select weights 'last'
+ l1 reg 0 'none'
+ max pairs 4294967295
+ cdec cfg 'cdec.ini'
+ input 'work/shard.0.0.in'
+ refs 'work/shard.0.0.refs'
+ output 'work/weights.0.1'
+ weights in 'work/weights.0'
+(a dot represents 10 inputs)
+Iteration #1 of 1.
+ 5
+WEIGHTS
+ Glue = -0.2699
+ WordPenalty = +0.080605
+ LanguageModel = -0.026572
+ LanguageModel_OOV = -0.30025
+ PhraseModel_0 = -0.32076
+ PhraseModel_1 = +0.67451
+ PhraseModel_2 = +0.92
+ PhraseModel_3 = -0.36402
+ PhraseModel_4 = -0.592
+ PhraseModel_5 = -0.0269
+ PhraseModel_6 = -0.28755
+ PassThrough = -0.33285
+ ---
+ 1best avg score: 0.26638 (+0.26638)
+ 1best avg model score: 53.197 (+53.197)
+ avg # pairs: 2028.6
+ avg # rank err: 998.2
+ avg # margin viol: 918.8
+ non0 feature count: 12
+ avg list sz: 100
+ avg f count: 10.496
+(time 0.32 min, 3.8 s/S)
+
+Writing weights file to 'work/weights.0.1' ...
+done
+
+---
+Best iteration: 1 [SCORE 'stupid_bleu'=0.26638].
+This took 0.31667 min.
diff --git a/training/dtrain/examples/parallelized/work/out.1.0 b/training/dtrain/examples/parallelized/work/out.1.0
new file mode 100644
index 00000000..6e790e38
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/out.1.0
@@ -0,0 +1,61 @@
+ cdec cfg 'cdec.ini'
+Loading the LM will be faster if you build a binary file.
+Reading ../example/nc-wmt11.en.srilm.gz
+----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
+****************************************************************************************************
+Seeding random number sequence to 1432415010
+
+dtrain
+Parameters:
+ k 100
+ N 4
+ T 1
+ scorer 'stupid_bleu'
+ sample from 'kbest'
+ filter 'uniq'
+ learning rate 0.0001
+ gamma 0
+ loss margin 1
+ pairs 'XYX'
+ hi lo 0.1
+ pair threshold 0
+ select weights 'last'
+ l1 reg 0 'none'
+ max pairs 4294967295
+ cdec cfg 'cdec.ini'
+ input 'work/shard.1.0.in'
+ refs 'work/shard.1.0.refs'
+ output 'work/weights.1.0'
+(a dot represents 10 inputs)
+Iteration #1 of 1.
+ 5
+WEIGHTS
+ Glue = -0.3815
+ WordPenalty = +0.20064
+ LanguageModel = +0.95304
+ LanguageModel_OOV = -0.264
+ PhraseModel_0 = -0.22362
+ PhraseModel_1 = +0.12254
+ PhraseModel_2 = +0.26328
+ PhraseModel_3 = +0.38018
+ PhraseModel_4 = -0.48654
+ PhraseModel_5 = +0
+ PhraseModel_6 = -0.3645
+ PassThrough = -0.2216
+ ---
+ 1best avg score: 0.10863 (+0.10863)
+ 1best avg model score: -4.9841 (-4.9841)
+ avg # pairs: 1345.4
+ avg # rank err: 822.4
+ avg # margin viol: 501
+ non0 feature count: 11
+ avg list sz: 100
+ avg f count: 11.814
+(time 0.45 min, 5.4 s/S)
+
+Writing weights file to 'work/weights.1.0' ...
+done
+
+---
+Best iteration: 1 [SCORE 'stupid_bleu'=0.10863].
+This took 0.45 min.
diff --git a/training/dtrain/examples/parallelized/work/out.1.1 b/training/dtrain/examples/parallelized/work/out.1.1
new file mode 100644
index 00000000..0b984761
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/out.1.1
@@ -0,0 +1,62 @@
+ cdec cfg 'cdec.ini'
+Loading the LM will be faster if you build a binary file.
+Reading ../example/nc-wmt11.en.srilm.gz
+----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
+****************************************************************************************************
+Seeding random number sequence to 1771918374
+
+dtrain
+Parameters:
+ k 100
+ N 4
+ T 1
+ scorer 'stupid_bleu'
+ sample from 'kbest'
+ filter 'uniq'
+ learning rate 0.0001
+ gamma 0
+ loss margin 1
+ pairs 'XYX'
+ hi lo 0.1
+ pair threshold 0
+ select weights 'last'
+ l1 reg 0 'none'
+ max pairs 4294967295
+ cdec cfg 'cdec.ini'
+ input 'work/shard.1.0.in'
+ refs 'work/shard.1.0.refs'
+ output 'work/weights.1.1'
+ weights in 'work/weights.0'
+(a dot represents 10 inputs)
+Iteration #1 of 1.
+ 5
+WEIGHTS
+ Glue = -0.3178
+ WordPenalty = +0.11092
+ LanguageModel = +0.17269
+ LanguageModel_OOV = -0.13485
+ PhraseModel_0 = -0.45371
+ PhraseModel_1 = +0.38789
+ PhraseModel_2 = +0.75311
+ PhraseModel_3 = -0.38163
+ PhraseModel_4 = -0.58817
+ PhraseModel_5 = -0.0269
+ PhraseModel_6 = -0.27315
+ PassThrough = -0.16745
+ ---
+ 1best avg score: 0.13169 (+0.13169)
+ 1best avg model score: 24.226 (+24.226)
+ avg # pairs: 1951.2
+ avg # rank err: 985.4
+ avg # margin viol: 951
+ non0 feature count: 12
+ avg list sz: 100
+ avg f count: 11.224
+(time 0.42 min, 5 s/S)
+
+Writing weights file to 'work/weights.1.1' ...
+done
+
+---
+Best iteration: 1 [SCORE 'stupid_bleu'=0.13169].
+This took 0.41667 min.
diff --git a/training/dtrain/examples/parallelized/work/shard.0.0.in b/training/dtrain/examples/parallelized/work/shard.0.0.in
new file mode 100644
index 00000000..92f9c78e
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/shard.0.0.in
@@ -0,0 +1,5 @@
+<seg grammar="grammar/grammar.out.0.gz" id="0">europas nach rassen geteiltes haus</seg>
+<seg grammar="grammar/grammar.out.1.gz" id="1">ein gemeinsames merkmal aller extremen rechten in europa ist ihr rassismus und die tatsache , daß sie das einwanderungsproblem als politischen hebel benutzen .</seg>
+<seg grammar="grammar/grammar.out.2.gz" id="2">der lega nord in italien , der vlaams block in den niederlanden , die anhänger von le pens nationaler front in frankreich , sind beispiele für parteien oder bewegungen , die sich um das gemeinsame thema : ablehnung der zuwanderung gebildet haben und um forderung nach einer vereinfachten politik , um sie zu regeln .</seg>
+<seg grammar="grammar/grammar.out.3.gz" id="3">während individuen wie jörg haidar und jean @-@ marie le pen kommen und ( leider nicht zu bald ) wieder gehen mögen , wird die rassenfrage aus der europäischer politik nicht so bald verschwinden .</seg>
+<seg grammar="grammar/grammar.out.4.gz" id="4">eine alternde einheimische bevölkerung und immer offenere grenzen vermehren die rassistische zersplitterung in den europäischen ländern .</seg>
diff --git a/training/dtrain/examples/parallelized/work/shard.0.0.refs b/training/dtrain/examples/parallelized/work/shard.0.0.refs
new file mode 100644
index 00000000..bef68fee
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/shard.0.0.refs
@@ -0,0 +1,5 @@
+europe 's divided racial house
+a common feature of europe 's extreme right is its racism and use of the immigration issue as a political wedge .
+the lega nord in italy , the vlaams blok in the netherlands , the supporters of le pen 's national front in france , are all examples of parties or movements formed on the common theme of aversion to immigrants and promotion of simplistic policies to control them .
+while individuals like jorg haidar and jean @-@ marie le pen may come and ( never to soon ) go , the race question will not disappear from european politics anytime soon .
+an aging population at home and ever more open borders imply increasing racial fragmentation in european countries .
diff --git a/training/dtrain/examples/parallelized/work/shard.1.0.in b/training/dtrain/examples/parallelized/work/shard.1.0.in
new file mode 100644
index 00000000..b7695ce7
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/shard.1.0.in
@@ -0,0 +1,5 @@
+<seg grammar="grammar/grammar.out.5.gz" id="5">die großen parteien der rechten und der linken mitte haben sich dem problem gestellt , in dem sie den kopf in den sand gesteckt und allen aussichten zuwider gehofft haben , es möge bald verschwinden .</seg>
+<seg grammar="grammar/grammar.out.6.gz" id="6">das aber wird es nicht , wie die geschichte des rassismus in amerika deutlich zeigt .</seg>
+<seg grammar="grammar/grammar.out.7.gz" id="7">die beziehungen zwischen den rassen standen in den usa über jahrzehnte - und tun das noch heute - im zentrum der politischen debatte . das ging so weit , daß rassentrennung genauso wichtig wie das einkommen wurde , - wenn nicht sogar noch wichtiger - um politische zuneigungen und einstellungen zu bestimmen .</seg>
+<seg grammar="grammar/grammar.out.8.gz" id="8">der erste schritt , um mit der rassenfrage umzugehen ist , ursache und folgen rassistischer feindseligkeiten zu verstehen , auch dann , wenn das bedeutet , unangenehme tatsachen aufzudecken .</seg>
+<seg grammar="grammar/grammar.out.9.gz" id="9">genau das haben in den usa eine große anzahl an forschungsvorhaben in wirtschaft , soziologie , psychologie und politikwissenschaft geleistet . diese forschungen zeigten , daß menschen unterschiedlicher rasse einander deutlich weniger vertrauen .</seg>
diff --git a/training/dtrain/examples/parallelized/work/shard.1.0.refs b/training/dtrain/examples/parallelized/work/shard.1.0.refs
new file mode 100644
index 00000000..6076f6d5
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/shard.1.0.refs
@@ -0,0 +1,5 @@
+mainstream parties of the center left and center right have confronted this prospect by hiding their heads in the ground , hoping against hope that the problem will disappear .
+it will not , as america 's racial history clearly shows .
+race relations in the us have been for decades - and remain - at the center of political debate , to the point that racial cleavages are as important as income , if not more , as determinants of political preferences and attitudes .
+the first step to address racial politics is to understand the origin and consequences of racial animosity , even if it means uncovering unpleasant truths .
+this is precisely what a large amount of research in economics , sociology , psychology and political science has done for the us .
diff --git a/training/dtrain/examples/parallelized/work/weights.0 b/training/dtrain/examples/parallelized/work/weights.0
new file mode 100644
index 00000000..ddd595a8
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/weights.0
@@ -0,0 +1,12 @@
+LanguageModel 0.7004298992212881
+PhraseModel_2 0.5576194336478857
+PhraseModel_1 0.41787318415343155
+PhraseModel_4 -0.46728502545635164
+PhraseModel_3 -0.029839521598455515
+Glue -0.05760000000000068
+PhraseModel_6 -0.2716499999999978
+PhraseModel_0 -0.20831031065605327
+LanguageModel_OOV -0.15205000000000077
+PassThrough -0.1846500000000006
+WordPenalty 0.09636994553433414
+PhraseModel_5 -0.026900000000000257
diff --git a/training/dtrain/examples/parallelized/work/weights.0.0 b/training/dtrain/examples/parallelized/work/weights.0.0
new file mode 100644
index 00000000..c9370b18
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/weights.0.0
@@ -0,0 +1,12 @@
+WordPenalty -0.0079041595706392243
+LanguageModel 0.44781580828279532
+LanguageModel_OOV -0.04010000000000042
+Glue 0.26629999999999948
+PhraseModel_0 -0.19299677809125185
+PhraseModel_1 0.71321026861732773
+PhraseModel_2 0.85195540993310537
+PhraseModel_3 -0.43986310822842656
+PhraseModel_4 -0.44802855630415955
+PhraseModel_5 -0.053800000000000514
+PhraseModel_6 -0.17879999999999835
+PassThrough -0.14770000000000036
diff --git a/training/dtrain/examples/parallelized/work/weights.0.1 b/training/dtrain/examples/parallelized/work/weights.0.1
new file mode 100644
index 00000000..8fad3de8
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/weights.0.1
@@ -0,0 +1,12 @@
+WordPenalty 0.080605055841244472
+LanguageModel -0.026571720531022844
+LanguageModel_OOV -0.30024999999999141
+Glue -0.26989999999999842
+PhraseModel_2 0.92000295209089566
+PhraseModel_1 0.67450748692470841
+PhraseModel_4 -0.5920000014976784
+PhraseModel_3 -0.36402437203127397
+PhraseModel_6 -0.28754999999999603
+PhraseModel_0 -0.32076244202907672
+PassThrough -0.33284999999999004
+PhraseModel_5 -0.026900000000000257
diff --git a/training/dtrain/examples/parallelized/work/weights.1 b/training/dtrain/examples/parallelized/work/weights.1
new file mode 100644
index 00000000..03058a16
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/weights.1
@@ -0,0 +1,12 @@
+PhraseModel_2 0.8365578543552836
+PhraseModel_4 -0.5900840266009169
+PhraseModel_1 0.5312000609786991
+PhraseModel_0 -0.3872342271319619
+PhraseModel_3 -0.3728279676912084
+Glue -0.2938500000000036
+PhraseModel_6 -0.2803499999999967
+PassThrough -0.25014999999999626
+LanguageModel_OOV -0.21754999999999702
+LanguageModel 0.07306061161169894
+WordPenalty 0.09576193325966899
+PhraseModel_5 -0.026900000000000257
diff --git a/training/dtrain/examples/parallelized/work/weights.1.0 b/training/dtrain/examples/parallelized/work/weights.1.0
new file mode 100644
index 00000000..6a6a65c1
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/weights.1.0
@@ -0,0 +1,11 @@
+WordPenalty 0.20064405063930751
+LanguageModel 0.9530439901597807
+LanguageModel_OOV -0.26400000000000112
+Glue -0.38150000000000084
+PhraseModel_0 -0.22362384322085468
+PhraseModel_1 0.12253609968953538
+PhraseModel_2 0.26328345736266612
+PhraseModel_3 0.38018406503151553
+PhraseModel_4 -0.48654149460854373
+PhraseModel_6 -0.36449999999999722
+PassThrough -0.22160000000000085
diff --git a/training/dtrain/examples/parallelized/work/weights.1.1 b/training/dtrain/examples/parallelized/work/weights.1.1
new file mode 100644
index 00000000..f56ea4a2
--- /dev/null
+++ b/training/dtrain/examples/parallelized/work/weights.1.1
@@ -0,0 +1,12 @@
+WordPenalty 0.1109188106780935
+LanguageModel 0.17269294375442074
+LanguageModel_OOV -0.13485000000000266
+Glue -0.3178000000000088
+PhraseModel_2 0.75311275661967159
+PhraseModel_1 0.38789263503268989
+PhraseModel_4 -0.58816805170415531
+PhraseModel_3 -0.38163156335114284
+PhraseModel_6 -0.27314999999999739
+PhraseModel_0 -0.45370601223484697
+PassThrough -0.16745000000000249
+PhraseModel_5 -0.026900000000000257
diff --git a/training/dtrain/examples/standard/README b/training/dtrain/examples/standard/README
new file mode 100644
index 00000000..ce37d31a
--- /dev/null
+++ b/training/dtrain/examples/standard/README
@@ -0,0 +1,2 @@
+Call `dtrain` from this folder with `../../dtrain -c dtrain.ini`.
+
diff --git a/training/dtrain/examples/standard/cdec.ini b/training/dtrain/examples/standard/cdec.ini
new file mode 100644
index 00000000..e1edc68d
--- /dev/null
+++ b/training/dtrain/examples/standard/cdec.ini
@@ -0,0 +1,26 @@
+formalism=scfg
+add_pass_through_rules=true
+scfg_max_span_limit=15
+intersection_strategy=cube_pruning
+cubepruning_pop_limit=200
+grammar=nc-wmt11.grammar.gz
+feature_function=WordPenalty
+feature_function=KLanguageModel ./nc-wmt11.en.srilm.gz
+# all currently working feature functions for translation:
+# (with those features active that were used in the ACL paper)
+#feature_function=ArityPenalty
+#feature_function=CMR2008ReorderingFeatures
+#feature_function=Dwarf
+#feature_function=InputIndicator
+#feature_function=LexNullJump
+#feature_function=NewJump
+#feature_function=NgramFeatures
+#feature_function=NonLatinCount
+#feature_function=OutputIndicator
+feature_function=RuleIdentityFeatures
+feature_function=RuleSourceBigramFeatures
+feature_function=RuleTargetBigramFeatures
+feature_function=RuleShape
+#feature_function=SourceSpanSizeFeatures
+#feature_function=SourceWordPenalty
+#feature_function=SpanFeatures
diff --git a/training/dtrain/examples/standard/dtrain.ini b/training/dtrain/examples/standard/dtrain.ini
new file mode 100644
index 00000000..e1072d30
--- /dev/null
+++ b/training/dtrain/examples/standard/dtrain.ini
@@ -0,0 +1,24 @@
+input=./nc-wmt11.de.gz
+refs=./nc-wmt11.en.gz
+output=- # a weights file (add .gz for gzip compression) or STDOUT '-'
+select_weights=VOID # output average (over epochs) weight vector
+decoder_config=./cdec.ini # config for cdec
+# weights for these features will be printed on each iteration
+print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
+# newer versions of the grammar extractor use different feature names:
+#print_weights= EgivenFCoherent SampleCountF CountEF MaxLexFgivenE MaxLexEgivenF IsSingletonF IsSingletonFE Glue WordPenalty PassThrough LanguageModel LanguageModel_OOV
+stop_after=10 # stop epoch after 10 inputs
+
+# interesting stuff
+epochs=2 # run over input 2 times
+k=100 # use 100best lists
+N=4 # optimize (approx) BLEU4
+scorer=stupid_bleu # use 'stupid' BLEU+1
+learning_rate=1.0 # learning rate, don't care if gamma=0 (perceptron)
+gamma=0 # use SVM reg
+sample_from=kbest # use kbest lists (as opposed to forest)
+filter=uniq # only unique entries in kbest (surface form)
+pair_sampling=XYX #
+hi_lo=0.1 # 10 vs 80 vs 10 and 80 vs 10 here
+pair_threshold=0 # minimum distance in BLEU (here: > 0)
+loss_margin=0 # update if correctly ranked, but within this margin
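Note that with gamma=0 and loss_margin=0 this configuration appears to run as a plain perceptron; the expected output below accordingly reports 'faster perceptron 1' and marks the pair counts as meaningless.
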
diff --git a/training/dtrain/examples/standard/expected-output b/training/dtrain/examples/standard/expected-output
new file mode 100644
index 00000000..7cd09dbf
--- /dev/null
+++ b/training/dtrain/examples/standard/expected-output
@@ -0,0 +1,91 @@
+ cdec cfg './cdec.ini'
+Loading the LM will be faster if you build a binary file.
+Reading ./nc-wmt11.en.srilm.gz
+----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
+****************************************************************************************************
+ Example feature: Shape_S00000_T00000
+Seeding random number sequence to 2679584485
+
+dtrain
+Parameters:
+ k 100
+ N 4
+ T 2
+ scorer 'stupid_bleu'
+ sample from 'kbest'
+ filter 'uniq'
+ learning rate 1
+ gamma 0
+ loss margin 0
+ faster perceptron 1
+ pairs 'XYX'
+ hi lo 0.1
+ pair threshold 0
+ select weights 'VOID'
+ l1 reg 0 'none'
+ max pairs 4294967295
+ cdec cfg './cdec.ini'
+ input './nc-wmt11.de.gz'
+ refs './nc-wmt11.en.gz'
+ output '-'
+ stop_after 10
+(a dot represents 10 inputs)
+Iteration #1 of 2.
+ . 10
+Stopping after 10 input sentences.
+WEIGHTS
+ Glue = -576
+ WordPenalty = +417.79
+ LanguageModel = +5117.5
+ LanguageModel_OOV = -1307
+ PhraseModel_0 = -1612
+ PhraseModel_1 = -2159.6
+ PhraseModel_2 = -677.36
+ PhraseModel_3 = +2663.8
+ PhraseModel_4 = -1025.9
+ PhraseModel_5 = -8
+ PhraseModel_6 = +70
+ PassThrough = -1455
+ ---
+ 1best avg score: 0.27697 (+0.27697)
+ 1best avg model score: -47918 (-47918)
+ avg # pairs: 581.9 (meaningless)
+ avg # rank err: 581.9
+ avg # margin viol: 0
+ non0 feature count: 703
+ avg list sz: 90.9
+ avg f count: 100.09
+(time 0.25 min, 1.5 s/S)
+
+Iteration #2 of 2.
+ . 10
+WEIGHTS
+ Glue = -622
+ WordPenalty = +898.56
+ LanguageModel = +8066.2
+ LanguageModel_OOV = -2590
+ PhraseModel_0 = -4335.8
+ PhraseModel_1 = -5864.4
+ PhraseModel_2 = -1729.8
+ PhraseModel_3 = +2831.9
+ PhraseModel_4 = -5384.8
+ PhraseModel_5 = +1449
+ PhraseModel_6 = +480
+ PassThrough = -2578
+ ---
+ 1best avg score: 0.37119 (+0.094226)
+ 1best avg model score: -1.3174e+05 (-83822)
+ avg # pairs: 584.1 (meaningless)
+ avg # rank err: 584.1
+ avg # margin viol: 0
+ non0 feature count: 1115
+ avg list sz: 91.3
+ avg f count: 90.755
+(time 0.3 min, 1.8 s/S)
+
+Writing weights file to '-' ...
+done
+
+---
+Best iteration: 2 [SCORE 'stupid_bleu'=0.37119].
+This took 0.55 min.
diff --git a/training/dtrain/examples/standard/nc-wmt11.de.gz b/training/dtrain/examples/standard/nc-wmt11.de.gz
new file mode 100644
index 00000000..0741fd92
--- /dev/null
+++ b/training/dtrain/examples/standard/nc-wmt11.de.gz
Binary files differ
diff --git a/training/dtrain/examples/standard/nc-wmt11.en.gz b/training/dtrain/examples/standard/nc-wmt11.en.gz
new file mode 100644
index 00000000..1c0bd401
--- /dev/null
+++ b/training/dtrain/examples/standard/nc-wmt11.en.gz
Binary files differ
diff --git a/training/dtrain/examples/standard/nc-wmt11.en.srilm.gz b/training/dtrain/examples/standard/nc-wmt11.en.srilm.gz
new file mode 100644
index 00000000..7ce81057
--- /dev/null
+++ b/training/dtrain/examples/standard/nc-wmt11.en.srilm.gz
Binary files differ
diff --git a/training/dtrain/examples/standard/nc-wmt11.grammar.gz b/training/dtrain/examples/standard/nc-wmt11.grammar.gz
new file mode 100644
index 00000000..ce4024a1
--- /dev/null
+++ b/training/dtrain/examples/standard/nc-wmt11.grammar.gz
Binary files differ
diff --git a/training/dtrain/examples/toy/cdec.ini b/training/dtrain/examples/toy/cdec.ini
new file mode 100644
index 00000000..b14f4819
--- /dev/null
+++ b/training/dtrain/examples/toy/cdec.ini
@@ -0,0 +1,3 @@
+formalism=scfg
+add_pass_through_rules=true
+grammar=grammar.gz
diff --git a/training/dtrain/examples/toy/dtrain.ini b/training/dtrain/examples/toy/dtrain.ini
new file mode 100644
index 00000000..cd715f26
--- /dev/null
+++ b/training/dtrain/examples/toy/dtrain.ini
@@ -0,0 +1,13 @@
+decoder_config=cdec.ini
+input=src
+refs=tgt
+output=-
+print_weights=logp shell_rule house_rule small_rule little_rule PassThrough
+k=4
+N=4
+epochs=2
+scorer=bleu
+sample_from=kbest
+filter=uniq
+pair_sampling=all
+learning_rate=1
diff --git a/training/dtrain/examples/toy/expected-output b/training/dtrain/examples/toy/expected-output
new file mode 100644
index 00000000..1da2aadd
--- /dev/null
+++ b/training/dtrain/examples/toy/expected-output
@@ -0,0 +1,77 @@
+Warning: hi_lo only works with pair_sampling XYX.
+ cdec cfg 'cdec.ini'
+Seeding random number sequence to 1664825829
+
+dtrain
+Parameters:
+ k 4
+ N 4
+ T 2
+ scorer 'bleu'
+ sample from 'kbest'
+ filter 'uniq'
+ learning rate 1
+ gamma 0
+ loss margin 0
+ pairs 'all'
+ pair threshold 0
+ select weights 'last'
+ l1 reg 0 'none'
+ max pairs 4294967295
+ cdec cfg 'cdec.ini'
+ input 'src'
+ refs 'tgt'
+ output '-'
+(a dot represents 10 inputs)
+Iteration #1 of 2.
+ 2
+WEIGHTS
+ logp = +0
+ shell_rule = -1
+ house_rule = +2
+ small_rule = -2
+ little_rule = +3
+ PassThrough = -5
+ ---
+ 1best avg score: 0.5 (+0.5)
+ 1best avg model score: 2.5 (+2.5)
+ avg # pairs: 4
+ avg # rank err: 1.5
+ avg # margin viol: 0
+ non0 feature count: 6
+ avg list sz: 4
+ avg f count: 2.875
+(time 0 min, 0 s/S)
+
+Iteration #2 of 2.
+ 2
+WEIGHTS
+ logp = +0
+ shell_rule = -1
+ house_rule = +2
+ small_rule = -2
+ little_rule = +3
+ PassThrough = -5
+ ---
+ 1best avg score: 1 (+0.5)
+ 1best avg model score: 5 (+2.5)
+ avg # pairs: 5
+ avg # rank err: 0
+ avg # margin viol: 0
+ non0 feature count: 6
+ avg list sz: 4
+ avg f count: 3
+(time 0 min, 0 s/S)
+
+Writing weights file to '-' ...
+house_rule 2
+little_rule 3
+Glue -4
+PassThrough -5
+small_rule -2
+shell_rule -1
+done
+
+---
+Best iteration: 2 [SCORE 'bleu'=1].
+This took 0 min.
diff --git a/training/dtrain/examples/toy/grammar.gz b/training/dtrain/examples/toy/grammar.gz
new file mode 100644
index 00000000..8eb0d29e
--- /dev/null
+++ b/training/dtrain/examples/toy/grammar.gz
Binary files differ
diff --git a/training/dtrain/examples/toy/src b/training/dtrain/examples/toy/src
new file mode 100644
index 00000000..87e39ef2
--- /dev/null
+++ b/training/dtrain/examples/toy/src
@@ -0,0 +1,2 @@
+ich sah ein kleines haus
+ich fand ein kleines haus
diff --git a/training/dtrain/examples/toy/tgt b/training/dtrain/examples/toy/tgt
new file mode 100644
index 00000000..174926b3
--- /dev/null
+++ b/training/dtrain/examples/toy/tgt
@@ -0,0 +1,2 @@
+i saw a little house
+i found a little house
diff --git a/training/dtrain/kbestget.h b/training/dtrain/kbestget.h
new file mode 100644
index 00000000..dd8882e1
--- /dev/null
+++ b/training/dtrain/kbestget.h
@@ -0,0 +1,152 @@
+#ifndef _DTRAIN_KBESTGET_H_
+#define _DTRAIN_KBESTGET_H_
+
+#include "kbest.h" // cdec
+#include "sentence_metadata.h"
+
+#include "verbose.h"
+#include "viterbi.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "weights.h"
+#include "logval.h"
+
+using namespace std;
+
+namespace dtrain
+{
+
+
+typedef double score_t;
+
+struct ScoredHyp
+{
+ vector<WordID> w;
+ SparseVector<double> f;
+ score_t model;
+ score_t score;
+ unsigned rank;
+};
+
+struct LocalScorer
+{
+ unsigned N_;
+ vector<score_t> w_;
+
+ virtual score_t
+ Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned src_len)=0;
+
+ void Reset() {} // only for approx bleu
+
+ inline void
+ Init(unsigned N, vector<score_t> weights)
+ {
+ assert(N > 0);
+ N_ = N;
+ if (weights.empty()) for (unsigned i = 0; i < N_; i++) w_.push_back(1./N_);
+ else w_ = weights;
+ }
+
+ inline score_t
+ brevity_penalty(const unsigned hyp_len, const unsigned ref_len)
+ {
+ if (hyp_len > ref_len) return 1;
+ return exp(1 - (score_t)ref_len/hyp_len);
+ }
+};
+
+struct HypSampler : public DecoderObserver
+{
+ LocalScorer* scorer_;
+ vector<WordID>* ref_;
+ unsigned f_count_, sz_;
+ virtual vector<ScoredHyp>* GetSamples()=0;
+ inline void SetScorer(LocalScorer* scorer) { scorer_ = scorer; }
+ inline void SetRef(vector<WordID>& ref) { ref_ = &ref; }
+ inline unsigned get_f_count() { return f_count_; }
+ inline unsigned get_sz() { return sz_; }
+};
+////////////////////////////////////////////////////////////////////////////////
+
+
+
+
+struct KBestGetter : public HypSampler
+{
+ const unsigned k_;
+ const string filter_type_;
+ vector<ScoredHyp> s_;
+ unsigned src_len_;
+
+ KBestGetter(const unsigned k, const string filter_type) :
+ k_(k), filter_type_(filter_type) {}
+
+ virtual void
+ NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
+ {
+ src_len_ = smeta.GetSourceLength();
+ KBestScored(*hg);
+ }
+
+ vector<ScoredHyp>* GetSamples() { return &s_; }
+
+ void
+ KBestScored(const Hypergraph& forest)
+ {
+ if (filter_type_ == "uniq") {
+ KBestUnique(forest);
+ } else if (filter_type_ == "not") {
+ KBestNoFilter(forest);
+ }
+ }
+
+ void
+ KBestUnique(const Hypergraph& forest)
+ {
+ s_.clear(); sz_ = f_count_ = 0;
+ KBest::KBestDerivations<vector<WordID>, ESentenceTraversal,
+ KBest::FilterUnique, prob_t, EdgeProb> kbest(forest, k_);
+ for (unsigned i = 0; i < k_; ++i) {
+ const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique,
+ prob_t, EdgeProb>::Derivation* d =
+ kbest.LazyKthBest(forest.nodes_.size() - 1, i);
+ if (!d) break;
+ ScoredHyp h;
+ h.w = d->yield;
+ h.f = d->feature_values;
+ h.model = log(d->score);
+ h.rank = i;
+ h.score = scorer_->Score(h.w, *ref_, i, src_len_);
+ s_.push_back(h);
+ sz_++;
+ f_count_ += h.f.size();
+ }
+ }
+
+ void
+ KBestNoFilter(const Hypergraph& forest)
+ {
+ s_.clear(); sz_ = f_count_ = 0;
+ KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, k_);
+ for (unsigned i = 0; i < k_; ++i) {
+ const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+ kbest.LazyKthBest(forest.nodes_.size() - 1, i);
+ if (!d) break;
+ ScoredHyp h;
+ h.w = d->yield;
+ h.f = d->feature_values;
+ h.model = log(d->score);
+ h.rank = i;
+ h.score = scorer_->Score(h.w, *ref_, i, src_len_);
+ s_.push_back(h);
+ sz_++;
+ f_count_ += h.f.size();
+ }
+ }
+};
+
+
+} // namespace
+
+#endif
+
diff --git a/training/dtrain/ksampler.h b/training/dtrain/ksampler.h
new file mode 100644
index 00000000..bc2f56cd
--- /dev/null
+++ b/training/dtrain/ksampler.h
@@ -0,0 +1,61 @@
+#ifndef _DTRAIN_KSAMPLER_H_
+#define _DTRAIN_KSAMPLER_H_
+
+#include "hg_sampler.h" // cdec
+#include "kbestget.h"
+#include "score.h"
+
+namespace dtrain
+{
+
+bool
+cmp_hyp_by_model_d(ScoredHyp a, ScoredHyp b)
+{
+ return a.model > b.model;
+}
+
+struct KSampler : public HypSampler
+{
+ const unsigned k_;
+ vector<ScoredHyp> s_;
+ MT19937* prng_;
+ score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>);
+ unsigned src_len_;
+
+ explicit KSampler(const unsigned k, MT19937* prng) :
+ k_(k), prng_(prng) {}
+
+ virtual void
+ NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
+ {
+ src_len_ = smeta.GetSourceLength();
+ ScoredSamples(*hg);
+ }
+
+ vector<ScoredHyp>* GetSamples() { return &s_; }
+
+ void ScoredSamples(const Hypergraph& forest) {
+ s_.clear(); sz_ = f_count_ = 0;
+ std::vector<HypergraphSampler::Hypothesis> samples;
+ HypergraphSampler::sample_hypotheses(forest, k_, prng_, &samples);
+ for (unsigned i = 0; i < k_; ++i) {
+ ScoredHyp h;
+ h.w = samples[i].words;
+ h.f = samples[i].fmap;
+ h.model = log(samples[i].model_score);
+ h.rank = i;
+ h.score = scorer_->Score(h.w, *ref_, i, src_len_);
+ s_.push_back(h);
+ sz_++;
+ f_count_ += h.f.size();
+ }
+ sort(s_.begin(), s_.end(), cmp_hyp_by_model_d);
+ for (unsigned i = 0; i < s_.size(); i++) s_[i].rank = i;
+ }
+};
+
+
+} // namespace
+
+#endif
+
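KBestGetter (kbestget.h) and KSampler both implement the HypSampler observer interface; dtrain.cc chooses between them via the `sample_from` setting. A hypothetical sketch of that dispatch (the selection code itself is in the portion of dtrain.cc not shown in this diff):

    // hypothetical sketch, not taken from the diff
    HypSampler* observer;
    if (sample_from == "kbest")
      observer = new KBestGetter(k, filter_type); // k-best derivations of the forest
    else
      observer = new KSampler(k, prng);           // k hypotheses sampled from the forest
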
diff --git a/training/dtrain/lplp.rb b/training/dtrain/lplp.rb
new file mode 100755
index 00000000..86e835e8
--- /dev/null
+++ b/training/dtrain/lplp.rb
@@ -0,0 +1,123 @@
+# lplp.rb
+
+# norms
+def l0(feature_column, n)
+ if feature_column.size >= n then return 1 else return 0 end
+end
+
+def l1(feature_column, n=-1)
+ return feature_column.map { |i| i.abs }.reduce { |sum,i| sum+i }
+end
+
+def l2(feature_column, n=-1)
+ return Math.sqrt feature_column.map { |i| i.abs2 }.reduce { |sum,i| sum+i }
+end
+
+def linfty(feature_column, n=-1)
+ return feature_column.map { |i| i.abs }.max
+end
+
+# stats
+def median(feature_column, n)
+ return feature_column.concat(0.step(n-feature_column.size-1).map{|i|0}).sort[feature_column.size/2]
+end
+
+def mean(feature_column, n)
+ return feature_column.reduce { |sum, i| sum+i } / n
+end
+
+# selection
+def select_k(weights, norm_fun, n, k=10000)
+ weights.sort{|a,b| norm_fun.call(b[1], n) <=> norm_fun.call(a[1], n)}.each { |p|
+ puts "#{p[0]}\t#{mean(p[1], n)}"
+ k -= 1
+ if k == 0 then break end
+ }
+end
+
+def cut(weights, norm_fun, n, epsilon=0.0001)
+ weights.each { |k,v|
+ if norm_fun.call(v, n).abs >= epsilon
+ puts "#{k}\t#{mean(v, n)}"
+ end
+ }
+end
+
+# test
+def _test()
+ puts
+ w = {}
+ w["a"] = [1, 2, 3]
+ w["b"] = [1, 2]
+ w["c"] = [66]
+ w["d"] = [10, 20, 30]
+ n = 3
+ puts w.to_s
+ puts
+ puts "select_k"
+ puts "l0 expect ad"
+ select_k(w, method(:l0), n, 2)
+ puts "l1 expect cd"
+ select_k(w, method(:l1), n, 2)
+ puts "l2 expect c"
+ select_k(w, method(:l2), n, 1)
+ puts
+ puts "cut"
+ puts "l1 expect cd"
+ cut(w, method(:l1), n, 7)
+ puts
+ puts "median"
+ a = [1,2,3,4,5]
+ puts a.to_s
+ puts median(a, 5)
+ puts
+ puts "#{median(a, 7)} <- that's because we add missing 0s:"
+ puts a.concat(0.step(7-a.size-1).map{|i|0}).to_s
+ puts
+ puts "mean expect bc"
+ w.clear
+ w["a"] = [2]
+ w["b"] = [2.1]
+ w["c"] = [2.2]
+ cut(w, method(:mean), 1, 2.05)
+ exit
+end
+#_test()
+
+
+def usage()
+ puts "lplp.rb <l0,l1,l2,linfty,mean,median> <cut|select_k> <k|threshold> <#shards> < <input>"
+ puts " l0...: norms for selection"
+ puts "select_k: only output top k (according to the norm of their column vector) features"
+ puts " cut: output features with weight >= threshold"
+ puts " n: if we do not have a shard count use this number for averaging"
+ exit 1
+end
+
+if ARGV.size < 4 then usage end
+norm_fun = method(ARGV[0].to_sym)
+type = ARGV[1]
+x = ARGV[2].to_f
+shard_count = ARGV[3].to_f
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
+w = {}
+while line = STDIN.gets
+ key, val = line.split /\s+/
+ if w.has_key? key
+ w[key].push val.to_f
+ else
+ w[key] = [val.to_f]
+ end
+end
+
+if type == 'cut'
+ cut(w, norm_fun, shard_count, x)
+elsif type == 'select_k'
+ select_k(w, norm_fun, shard_count, x)
+else
+ puts "unknown type '#{type}', use 'cut' or 'select_k'"
+end
+
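parallelize.rb (below) runs this script once per epoch to merge the per-shard weights; with the default lplp_args it amounts to

    ruby lplp.rb l2 select_k 100000 <#shards> < work/weights_cat > work/weights.<epoch>
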
diff --git a/training/dtrain/pairsampling.h b/training/dtrain/pairsampling.h
new file mode 100644
index 00000000..3f67e209
--- /dev/null
+++ b/training/dtrain/pairsampling.h
@@ -0,0 +1,140 @@
+#ifndef _DTRAIN_PAIRSAMPLING_H_
+#define _DTRAIN_PAIRSAMPLING_H_
+
+namespace dtrain
+{
+
+
+bool
+accept_pair(score_t a, score_t b, score_t threshold)
+{
+ if (fabs(a - b) < threshold) return false;
+ return true;
+}
+
+bool
+cmp_hyp_by_score_d(ScoredHyp a, ScoredHyp b)
+{
+ return a.score > b.score;
+}
+
+inline void
+all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, bool misranked_only, float _unused=1)
+{
+ sort(s->begin(), s->end(), cmp_hyp_by_score_d);
+ unsigned sz = s->size();
+ bool b = false;
+ unsigned count = 0;
+ for (unsigned i = 0; i < sz-1; i++) {
+ for (unsigned j = i+1; j < sz; j++) {
+ if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue;
+ if (threshold > 0) {
+ if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
+ training.push_back(make_pair((*s)[i], (*s)[j]));
+ } else {
+ if ((*s)[i].score != (*s)[j].score)
+ training.push_back(make_pair((*s)[i], (*s)[j]));
+ }
+ if (++count == max) {
+ b = true;
+ break;
+ }
+ }
+ if (b) break;
+ }
+}
+
+/*
+ * multipartite ranking
+ * sort (descending) by bleu
+ * compare top X to middle Y and low X
+ * cmp middle Y to low X
+ */
+
+inline void
+partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, bool misranked_only, float hi_lo)
+{
+ unsigned sz = s->size();
+ if (sz < 2) return;
+ sort(s->begin(), s->end(), cmp_hyp_by_score_d);
+ unsigned sep = round(sz*hi_lo);
+ unsigned sep_hi = sep;
+ if (sz > 4) while (sep_hi < sz && (*s)[sep_hi-1].score == (*s)[sep_hi].score) ++sep_hi;
+ else sep_hi = 1;
+ bool b = false;
+ unsigned count = 0;
+ for (unsigned i = 0; i < sep_hi; i++) {
+ for (unsigned j = sep_hi; j < sz; j++) {
+ if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue;
+ if (threshold > 0) {
+ if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
+ training.push_back(make_pair((*s)[i], (*s)[j]));
+ } else {
+ if ((*s)[i].score != (*s)[j].score)
+ training.push_back(make_pair((*s)[i], (*s)[j]));
+ }
+ if (++count == max) {
+ b = true;
+ break;
+ }
+ }
+ if (b) break;
+ }
+ unsigned sep_lo = sz-sep;
+ while (sep_lo > 0 && sep_lo < sz && (*s)[sep_lo-1].score == (*s)[sep_lo].score) --sep_lo; // sep_lo < sz guards against reading past the end when sep == 0
+ for (unsigned i = sep_hi; i < sz-sep_lo; i++) {
+ for (unsigned j = sz-sep_lo; j < sz; j++) {
+ if (misranked_only && !((*s)[i].model <= (*s)[j].model)) continue;
+ if (threshold > 0) {
+ if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
+ training.push_back(make_pair((*s)[i], (*s)[j]));
+ } else {
+ if ((*s)[i].score != (*s)[j].score)
+ training.push_back(make_pair((*s)[i], (*s)[j]));
+ }
+ if (++count == max) return;
+ }
+ }
+}
+
+/*
+ * pair sampling as in
+ * 'Tuning as Ranking' (Hopkins & May, 2011)
+ * count = 5000
+ * threshold = 5% BLEU (0.05 for param 3)
+ * cut = top 50
+ */
+bool
+_PRO_cmp_pair_by_diff_d(pair<ScoredHyp,ScoredHyp> a, pair<ScoredHyp,ScoredHyp> b)
+{
+ return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score));
+}
+inline void
+PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, bool _unused=false, float _also_unused=0)
+{
+ unsigned max_count = 5000, count = 0, sz = s->size();
+ bool b = false;
+ for (unsigned i = 0; i < sz-1; i++) {
+ for (unsigned j = i+1; j < sz; j++) {
+ if (accept_pair((*s)[i].score, (*s)[j].score, threshold)) {
+ training.push_back(make_pair((*s)[i], (*s)[j]));
+ if (++count == max_count) {
+ b = true;
+ break;
+ }
+ }
+ }
+ if (b) break;
+ }
+ if (training.size() > 50) {
+ sort(training.begin(), training.end(), _PRO_cmp_pair_by_diff_d);
+ training.erase(training.begin()+50, training.end());
+ }
+ return;
+}
+
+
+} // namespace
+
+#endif
+
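A worked example of the XYX scheme above: for a 100-best list with hi_lo=0.1, sep = round(100*0.1) = 10, so each of the top 10 hypotheses is paired against the 90 below it, and each of the middle 80 against the bottom 10; this is the '10 vs 80 vs 10 and 80 vs 10' split mentioned in the example dtrain.ini.
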
diff --git a/training/dtrain/parallelize.rb b/training/dtrain/parallelize.rb
new file mode 100755
index 00000000..e661416e
--- /dev/null
+++ b/training/dtrain/parallelize.rb
@@ -0,0 +1,149 @@
+#!/usr/bin/env ruby
+
+require 'trollop'
+
+def usage
+ STDERR.write "Usage: "
+ STDERR.write "ruby parallelize.rb -c <dtrain.ini> [-e <epochs=10>] [--randomize/-z] [--reshard/-y] -s <#shards|0> [-p <at once=9999>] -i <input> -r <refs> [--qsub/-q] [--dtrain_binary <path to dtrain binary>] [-l \"l2 select_k 100000\"]\n"
+ exit 1
+end
+
+opts = Trollop::options do
+ opt :config, "dtrain config file", :type => :string
+ opt :epochs, "number of epochs", :type => :int, :default => 10
+ opt :lplp_args, "arguments for lplp.rb", :type => :string, :default => "l2 select_k 100000"
+ opt :randomize, "randomize shards before each epoch", :type => :bool, :short => '-z', :default => false
+ opt :reshard, "reshard after each epoch", :type => :bool, :short => '-y', :default => false
+ opt :shards, "number of shards", :type => :int
+ opt :processes_at_once, "have this number (max) running at the same time", :type => :int, :default => 9999
+ opt :input, "input", :type => :string
+ opt :references, "references", :type => :string
+ opt :qsub, "use qsub", :type => :bool, :default => false
+ opt :dtrain_binary, "path to dtrain binary", :type => :string
+end
+usage if not (opts[:config] && opts[:shards] && opts[:input] && opts[:references])
+
+
+dtrain_dir = File.expand_path File.dirname(__FILE__)
+if not opts[:dtrain_binary]
+ dtrain_bin = "#{dtrain_dir}/dtrain"
+else
+ dtrain_bin = opts[:dtrain_binary]
+end
+ruby = '/usr/bin/ruby'
+lplp_rb = "#{dtrain_dir}/lplp.rb"
+lplp_args = opts[:lplp_args]
+cat = '/bin/cat'
+
+ini = opts[:config]
+epochs = opts[:epochs]
+rand = opts[:randomize]
+reshard = opts[:reshard]
+predefined_shards = false
+if opts[:shards] == 0
+ predefined_shards = true
+ num_shards = 0
+else
+ num_shards = opts[:shards]
+end
+input = opts[:input]
+refs = opts[:references]
+use_qsub = opts[:qsub]
+shards_at_once = opts[:processes_at_once]
+
+`mkdir work`
+
+def make_shards(input, refs, num_shards, epoch, rand)
+ lc = `wc -l #{input}`.split.first.to_i
+ index = (0..lc-1).to_a
+ index.reverse!
+ index.shuffle! if rand
+ shard_sz = lc / num_shards
+ leftover = lc % num_shards
+ in_f = File.new input, 'r'
+ in_lines = in_f.readlines
+ refs_f = File.new refs, 'r'
+ refs_lines = refs_f.readlines
+ shard_in_files = []
+ shard_refs_files = []
+ in_fns = []
+ refs_fns = []
+ 0.upto(num_shards-1) { |shard|
+ in_fn = "work/shard.#{shard}.#{epoch}.in"
+ shard_in = File.new in_fn, 'w+'
+ in_fns << in_fn
+ refs_fn = "work/shard.#{shard}.#{epoch}.refs"
+ shard_refs = File.new refs_fn, 'w+'
+ refs_fns << refs_fn
+ 0.upto(shard_sz-1) { |i|
+ j = index.pop
+ shard_in.write in_lines[j]
+ shard_refs.write refs_lines[j]
+ }
+ shard_in_files << shard_in
+ shard_refs_files << shard_refs
+ }
+ while leftover > 0
+ j = index.pop
+ shard_in_files[-1].write in_lines[j]
+ shard_refs_files[-1].write refs_lines[j]
+ leftover -= 1
+ end
+ (shard_in_files + shard_refs_files).each do |f| f.close end
+ in_f.close
+ refs_f.close
+ return [in_fns, refs_fns]
+end
+
+input_files = []
+refs_files = []
+if predefined_shards
+ input_files = File.new(input).readlines.map {|i| i.strip }
+ refs_files = File.new(refs).readlines.map {|i| i.strip }
+ num_shards = input_files.size
+else
+ input_files, refs_files = make_shards input, refs, num_shards, 0, rand
+end
+
+0.upto(epochs-1) { |epoch|
+ puts "epoch #{epoch+1}"
+ pids = []
+ input_weights = ''
+ if epoch > 0 then input_weights = "--input_weights work/weights.#{epoch-1}" end
+ weights_files = []
+ shard = 0
+ remaining_shards = num_shards
+ while remaining_shards > 0
+ shards_at_once.times {
+ break if remaining_shards==0
+ qsub_str_start = qsub_str_end = ''
+ local_end = ''
+ if use_qsub
+ qsub_str_start = "qsub -cwd -sync y -b y -j y -o work/out.#{shard}.#{epoch} -N dtrain.#{shard}.#{epoch} \""
+ qsub_str_end = "\""
+ local_end = ''
+ else
+ local_end = "&>work/out.#{shard}.#{epoch}"
+ end
+ pids << Kernel.fork {
+ `#{qsub_str_start}#{dtrain_bin} -c #{ini}\
+ --input #{input_files[shard]}\
+ --refs #{refs_files[shard]} #{input_weights}\
+ --output work/weights.#{shard}.#{epoch}#{qsub_str_end} #{local_end}`
+ }
+ weights_files << "work/weights.#{shard}.#{epoch}"
+ shard += 1
+ remaining_shards -= 1
+ }
+ pids.each { |pid| Process.wait(pid) }
+ pids.clear
+ end
+ `#{cat} work/weights.*.#{epoch} > work/weights_cat`
+ `#{ruby} #{lplp_rb} #{lplp_args} #{num_shards} < work/weights_cat > work/weights.#{epoch}`
+ if rand and reshard and epoch+1!=epochs
+ input_files, refs_files = make_shards input, refs, num_shards, epoch+1, rand
+ end
+}
+
+`rm work/weights_cat`
+
diff --git a/training/dtrain/score.cc b/training/dtrain/score.cc
new file mode 100644
index 00000000..96d6e10a
--- /dev/null
+++ b/training/dtrain/score.cc
@@ -0,0 +1,283 @@
+#include "score.h"
+
+namespace dtrain
+{
+
+
+/*
+ * bleu
+ *
+ * as in "BLEU: a Method for Automatic Evaluation
+ * of Machine Translation"
+ * (Papineni et al. '02)
+ *
+ * NOTE: 0 if the count for any n \in {1..N} is 0
+ */
+score_t
+BleuScorer::Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len)
+{
+ if (hyp_len == 0 || ref_len == 0) return 0.;
+ unsigned M = N_;
+ vector<score_t> v = w_;
+ if (ref_len < N_) {
+ M = ref_len;
+ for (unsigned i = 0; i < M; i++) v[i] = 1/((score_t)M);
+ }
+ score_t sum = 0;
+ for (unsigned i = 0; i < M; i++) {
+ if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) return 0.;
+ sum += v[i] * log((score_t)counts.clipped_[i]/counts.sum_[i]);
+ }
+ return brevity_penalty(hyp_len, ref_len) * exp(sum);
+}
+
+score_t
+BleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+ const unsigned /*rank*/, const unsigned /*src_len*/)
+{
+ unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ if (hyp_len == 0 || ref_len == 0) return 0.;
+ NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+ return Bleu(counts, hyp_len, ref_len);
+}
+
+/*
+ * 'stupid' bleu
+ *
+ * as in "ORANGE: a Method for Evaluating
+ * Automatic Evaluation Metrics
+ * for Machine Translation"
+ * (Lin & Och '04)
+ *
+ * NOTE: 0 iff no 1gram match ('grounded')
+ */
+score_t
+StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+ const unsigned /*rank*/, const unsigned /*src_len*/)
+{
+ unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ if (hyp_len == 0 || ref_len == 0) return 0.;
+ NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+ unsigned M = N_;
+ vector<score_t> v = w_;
+ if (ref_len < N_) {
+ M = ref_len;
+ for (unsigned i = 0; i < M; i++) v[i] = 1/((score_t)M);
+ }
+ score_t sum = 0, add = 0;
+ for (unsigned i = 0; i < M; i++) {
+ if (i == 0 && (counts.sum_[i] == 0 || counts.clipped_[i] == 0)) return 0.;
+ if (i == 1) add = 1;
+ sum += v[i] * log(((score_t)counts.clipped_[i] + add)/((counts.sum_[i] + add)));
+ }
+ return brevity_penalty(hyp_len, ref_len) * exp(sum);
+}
+
+/*
+ * fixed 'stupid' bleu
+ *
+ * as in "Optimizing for Sentence-Level BLEU+1
+ * Yields Short Translations"
+ * (Nakov et al. '12)
+ */
+score_t
+FixedStupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+ const unsigned /*rank*/, const unsigned /*src_len*/)
+{
+ unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ if (hyp_len == 0 || ref_len == 0) return 0.;
+ NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+ unsigned M = N_;
+ vector<score_t> v = w_;
+ if (ref_len < N_) {
+ M = ref_len;
+ for (unsigned i = 0; i < M; i++) v[i] = 1/((score_t)M);
+ }
+ score_t sum = 0, add = 0;
+ for (unsigned i = 0; i < M; i++) {
+ if (i == 0 && (counts.sum_[i] == 0 || counts.clipped_[i] == 0)) return 0.;
+ if (i == 1) add = 1;
+ sum += v[i] * log(((score_t)counts.clipped_[i] + add)/((counts.sum_[i] + add)));
+ }
+ return brevity_penalty(hyp_len, ref_len+1) * exp(sum); // <- the fix: +1 on the reference length
+}
+
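+// Only the brevity penalty differs from StupidBleuScorer: the reference
+// is treated as one word longer, so e.g. a 3-word hypothesis against a
+// 3-word reference gets BP = exp(1 - 4/3) ~= 0.72 instead of 1.0 (with
+// the usual BP definition), biasing tuning toward longer outputs -- the
+// problem Nakov et al. address.
+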
+/*
+ * smooth bleu
+ *
+ * as in "An End-to-End Discriminative Approach
+ * to Machine Translation"
+ * (Liang et al. '06)
+ *
+ * NOTE: max is 0.9375 (with N=4)
+ */
+score_t
+SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+ const unsigned /*rank*/, const unsigned /*src_len*/)
+{
+ unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ if (hyp_len == 0 || ref_len == 0) return 0.;
+ NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+ unsigned M = N_;
+ if (ref_len < N_) M = ref_len;
+ score_t sum = 0.;
+ vector<score_t> i_bleu;
+ for (unsigned i = 0; i < M; i++) i_bleu.push_back(0.);
+ for (unsigned i = 0; i < M; i++) {
+ if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) {
+ break;
+ } else {
+ score_t i_ng = log((score_t)counts.clipped_[i]/counts.sum_[i]);
+ for (unsigned j = i; j < M; j++) {
+ i_bleu[j] += (1/((score_t)j+1)) * i_ng;
+ }
+ }
+ sum += exp(i_bleu[i])/pow(2.0, (double)(N_-i));
+ }
+ return brevity_penalty(hyp_len, ref_len) * sum;
+}
+
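+// exp(i_bleu[i]) is the geometric mean of the first i+1 n-gram
+// precisions, i.e. BLEU-(i+1) without brevity penalty, so the score
+// interpolates BLEU-1..BLEU-N with weights 1/2^(N-i); with all
+// precisions at 1 and N = 4 this is 1/16 + 1/8 + 1/4 + 1/2 = 0.9375,
+// the maximum from the NOTE above.
+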
+/*
+ * 'sum' bleu
+ *
+ * sum up Ngram precisions
+ */
+score_t
+SumBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+ const unsigned /*rank*/, const unsigned /*src_len*/)
+{
+ unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ if (hyp_len == 0 || ref_len == 0) return 0.;
+ NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+ unsigned M = N_;
+ if (ref_len < N_) M = ref_len;
+ score_t sum = 0.;
+ unsigned j = 1;
+ for (unsigned i = 0; i < M; i++) {
+ if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break;
+ sum += ((score_t)counts.clipped_[i]/counts.sum_[i])/pow(2.0, (double) (N_-j+1));
+ j++;
+ }
+ return brevity_penalty(hyp_len, ref_len) * sum;
+}
+
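+// With N = 4, the precisions are weighted 1/16, 1/8, 1/4, 1/2 for
+// n = 1..4 (the pow(2, N-j+1) term), so higher orders dominate; like
+// plain BLEU, the loop stops at the first order with no match.
+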
+/*
+ * 'sum' (exp) bleu
+ *
+ * sum up exp(Ngram precisions)
+ */
+score_t
+SumExpBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+ const unsigned /*rank*/, const unsigned /*src_len*/)
+{
+ unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ if (hyp_len == 0 || ref_len == 0) return 0.;
+ NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+ unsigned M = N_;
+ if (ref_len < N_) M = ref_len;
+ score_t sum = 0.;
+ unsigned j = 1;
+ for (unsigned i = 0; i < M; i++) {
+ if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break;
+ sum += exp(((score_t)counts.clipped_[i]/counts.sum_[i]))/pow(2.0, (double) (N_-j+1));
+ j++;
+ }
+ return brevity_penalty(hyp_len, ref_len) * sum;
+}
+
+/*
+ * 'sum' (whatever) bleu
+ *
+ * sum up exp(weight * log(Ngram precisions))
+ */
+score_t
+SumWhateverBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+ const unsigned /*rank*/, const unsigned /*src_len*/)
+{
+ unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ if (hyp_len == 0 || ref_len == 0) return 0.;
+ NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+ unsigned M = N_;
+ vector<score_t> v = w_;
+ if (ref_len < N_) {
+ M = ref_len;
+ for (unsigned i = 0; i < M; i++) v[i] = 1/((score_t)M);
+ }
+ score_t sum = 0.;
+ unsigned j = 1;
+ for (unsigned i = 0; i < M; i++) {
+ if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break;
+ sum += exp(v[i] * log(((score_t)counts.clipped_[i]/counts.sum_[i])))/pow(2.0, (double) (N_-j+1));
+ j++;
+ }
+ return brevity_penalty(hyp_len, ref_len) * sum;
+}
+
+/*
+ * approx. bleu
+ *
+ * as in "Online Large-Margin Training of Syntactic
+ * and Structural Translation Features"
+ * (Chiang et al. '08)
+ *
+ * NOTE: needs supporting code in dtrain.cc;
+ *       no scaling by source length.
+ */
+score_t
+ApproxBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+ const unsigned rank, const unsigned src_len)
+{
+ unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ if (ref_len == 0) return 0.;
+ score_t score = 0.;
+ NgramCounts counts(N_);
+ if (hyp_len > 0) {
+ counts = make_ngram_counts(hyp, ref, N_);
+ NgramCounts tmp = glob_onebest_counts_ + counts;
+ score = Bleu(tmp, hyp_len, ref_len);
+ }
+ if (rank == 0) { // 'context of 1best translations'
+ glob_onebest_counts_ += counts;
+ glob_onebest_counts_ *= discount_;
+ glob_hyp_len_ = discount_ * (glob_hyp_len_ + hyp_len);
+ glob_ref_len_ = discount_ * (glob_ref_len_ + ref_len);
+ glob_src_len_ = discount_ * (glob_src_len_ + src_len);
+ }
+ return score;
+}
+
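+// The 1-best context decays exponentially: after each source sentence,
+// the 1-best (rank-0) counts are folded in and everything is multiplied
+// by discount_, so statistics from t sentences back carry weight
+// discount_^t (e.g. 0.9^t for a hypothetical discount_ = 0.9; the
+// actual value is passed in by the caller). Note glob_hyp_len_ etc. are
+// unsigned, so the discounted lengths are truncated toward zero.
+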
+/*
+ * Linear (Corpus) Bleu
+ *
+ * as in "Lattice Minimum Bayes-Risk Decoding
+ * for Statistical Machine Translation"
+ * (Tromble et al. '08)
+ *
+ */
+score_t
+LinearBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+ const unsigned rank, const unsigned /*src_len*/)
+{
+ unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ if (ref_len == 0) return 0.;
+ unsigned M = N_;
+ if (ref_len < N_) M = ref_len;
+ NgramCounts counts(M);
+ if (hyp_len > 0)
+ counts = make_ngram_counts(hyp, ref, M);
+ score_t ret = 0.;
+ for (unsigned i = 0; i < M; i++) {
+ if (counts.sum_[i] == 0 || onebest_counts_.sum_[i] == 0) break;
+ ret += counts.sum_[i]/onebest_counts_.sum_[i];
+ }
+ ret = -(hyp_len/(score_t)onebest_len_) + (1./M) * ret;
+ if (rank == 0) {
+ onebest_len_ += hyp_len;
+ onebest_counts_ += counts;
+ }
+ return ret;
+}
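+
+// A linearization of corpus log-BLEU around the running 1-best
+// statistics: a length term plus averaged per-order n-gram counts
+// relative to the accumulated 1-best counts. onebest_len_ and
+// onebest_counts_ start at 1 / One() (see score.h), so the first
+// sentence never divides by zero.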
+
+
+} // namespace
+
diff --git a/training/dtrain/score.h b/training/dtrain/score.h
new file mode 100644
index 00000000..bddaa071
--- /dev/null
+++ b/training/dtrain/score.h
@@ -0,0 +1,217 @@
+#ifndef _DTRAIN_SCORE_H_
+#define _DTRAIN_SCORE_H_
+
+#include "kbestget.h"
+
+using namespace std;
+
+namespace dtrain
+{
+
+
+struct NgramCounts
+{
+ unsigned N_;
+ map<unsigned, score_t> clipped_;
+ map<unsigned, score_t> sum_;
+
+ NgramCounts(const unsigned N) : N_(N) { Zero(); }
+
+ inline void
+ operator+=(const NgramCounts& rhs)
+ {
+ if (rhs.N_ > N_) Resize(rhs.N_);
+ for (unsigned i = 0; i < rhs.N_; i++) { // rhs may track fewer orders; don't read past its maps
+ this->clipped_[i] += rhs.clipped_.find(i)->second;
+ this->sum_[i] += rhs.sum_.find(i)->second;
+ }
+ }
+
+ inline const NgramCounts
+ operator+(const NgramCounts &other) const
+ {
+ NgramCounts result = *this;
+ result += other;
+ return result;
+ }
+
+ inline void
+ operator*=(const score_t rhs)
+ {
+ for (unsigned i = 0; i < N_; i++) {
+ this->clipped_[i] *= rhs;
+ this->sum_[i] *= rhs;
+ }
+ }
+
+ inline void
+ Add(const unsigned count, const unsigned ref_count, const unsigned i)
+ {
+ assert(i < N_);
+ if (count > ref_count) {
+ clipped_[i] += ref_count;
+ } else {
+ clipped_[i] += count;
+ }
+ sum_[i] += count;
+ }
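+
+ // Clipping example: if a hypothesis n-gram occurs 3 times but only
+ // twice in the reference, Add(3, 2, i) records clipped_[i] += 2 and
+ // sum_[i] += 3 -- the standard BLEU clipped count.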
+
+ inline void
+ Zero()
+ {
+ for (unsigned i = 0; i < N_; i++) {
+ clipped_[i] = 0.;
+ sum_[i] = 0.;
+ }
+ }
+
+ inline void
+ One()
+ {
+ for (unsigned i = 0; i < N_; i++) {
+ clipped_[i] = 1.;
+ sum_[i] = 1.;
+ }
+ }
+
+ inline void
+ Print()
+ {
+ for (unsigned i = 0; i < N_; i++) {
+ cout << i+1 << "grams (clipped):\t" << clipped_[i] << endl;
+ cout << i+1 << "grams:\t\t\t" << sum_[i] << endl;
+ }
+ }
+
+ inline void Resize(unsigned N)
+ {
+ if (N == N_) return;
+ else if (N > N_) {
+ for (unsigned i = N_; i < N; i++) {
+ clipped_[i] = 0.;
+ sum_[i] = 0.;
+ }
+ } else { // N < N_
+ for (unsigned i = N_-1; i > N-1; i--) {
+ clipped_.erase(i);
+ sum_.erase(i);
+ }
+ }
+ N_ = N;
+ }
+};
+
+typedef map<vector<WordID>, unsigned> Ngrams;
+
+inline Ngrams
+make_ngrams(const vector<WordID>& s, const unsigned N)
+{
+ Ngrams ngrams;
+ vector<WordID> ng;
+ for (size_t i = 0; i < s.size(); i++) {
+ ng.clear();
+ for (unsigned j = i; j < min(i+N, s.size()); j++) {
+ ng.push_back(s[j]);
+ ngrams[ng]++;
+ }
+ }
+ return ngrams;
+}
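+
+// e.g. make_ngrams({7, 8, 9}, 2) yields the unigrams {7}, {8}, {9} and
+// the bigrams {7,8}, {8,9}, each mapped to its occurrence count (1 here).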
+
+inline NgramCounts
+make_ngram_counts(const vector<WordID>& hyp, const vector<WordID>& ref, const unsigned N)
+{
+ Ngrams hyp_ngrams = make_ngrams(hyp, N);
+ Ngrams ref_ngrams = make_ngrams(ref, N);
+ NgramCounts counts(N);
+ Ngrams::iterator it;
+ Ngrams::iterator ti;
+ for (it = hyp_ngrams.begin(); it != hyp_ngrams.end(); it++) {
+ ti = ref_ngrams.find(it->first);
+ if (ti != ref_ngrams.end()) {
+ counts.Add(it->second, ti->second, it->first.size() - 1);
+ } else {
+ counts.Add(it->second, 0, it->first.size() - 1);
+ }
+ }
+ return counts;
+}
+
+struct BleuScorer : public LocalScorer
+{
+ score_t Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len);
+ score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+};
+
+struct StupidBleuScorer : public LocalScorer
+{
+ score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+};
+
+struct FixedStupidBleuScorer : public LocalScorer
+{
+ score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+};
+
+struct SmoothBleuScorer : public LocalScorer
+{
+ score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+};
+
+struct SumBleuScorer : public LocalScorer
+{
+ score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+};
+
+struct SumExpBleuScorer : public LocalScorer
+{
+ score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+};
+
+struct SumWhateverBleuScorer : public LocalScorer
+{
+ score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+};
+
+struct ApproxBleuScorer : public BleuScorer
+{
+ NgramCounts glob_onebest_counts_;
+ unsigned glob_hyp_len_, glob_ref_len_, glob_src_len_;
+ score_t discount_;
+
+ ApproxBleuScorer(unsigned N, score_t d) : glob_onebest_counts_(NgramCounts(N)), discount_(d)
+ {
+ glob_hyp_len_ = glob_ref_len_ = glob_src_len_ = 0;
+ }
+
+ inline void Reset() {
+ glob_onebest_counts_.Zero();
+ glob_hyp_len_ = glob_ref_len_ = glob_src_len_ = 0.;
+ }
+
+ score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned src_len);
+};
+
+struct LinearBleuScorer : public BleuScorer
+{
+ unsigned onebest_len_;
+ NgramCounts onebest_counts_;
+
+ LinearBleuScorer(unsigned N) : onebest_len_(1), onebest_counts_(N)
+ {
+ onebest_counts_.One();
+ }
+
+ score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned /*src_len*/);
+
+ inline void Reset() {
+ onebest_len_ = 1;
+ onebest_counts_.One();
+ }
+};
+
+
+} // namespace
+
+#endif
+
diff --git a/training/fast_align.cc b/training/fast_align.cc
deleted file mode 100644
index 0d7b0202..00000000
--- a/training/fast_align.cc
+++ /dev/null
@@ -1,271 +0,0 @@
-#include <iostream>
-#include <cmath>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "m.h"
-#include "corpus_tools.h"
-#include "stringlib.h"
-#include "filelib.h"
-#include "ttables.h"
-#include "tdict.h"
-
-namespace po = boost::program_options;
-using namespace std;
-
-bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("input,i",po::value<string>(),"Parallel corpus input file")
- ("reverse,r","Reverse estimation (swap source and target during training)")
- ("iterations,I",po::value<unsigned>()->default_value(5),"Number of iterations of EM training")
- //("bidir,b", "Run bidirectional alignment")
- ("favor_diagonal,d", "Use a static alignment distribution that assigns higher probabilities to alignments near the diagonal")
- ("prob_align_null", po::value<double>()->default_value(0.08), "When --favor_diagonal is set, what's the probability of a null alignment?")
- ("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (<1 = flat >1 = sharp)")
- ("variational_bayes,v","Infer VB estimate of parameters under a symmetric Dirichlet prior")
- ("alpha,a", po::value<double>()->default_value(0.01), "Hyperparameter for optional Dirichlet prior")
- ("no_null_word,N","Do not generate from a null token")
- ("output_parameters,p", "Write model parameters instead of alignments")
- ("beam_threshold,t",po::value<double>()->default_value(-4),"When writing parameters, log_10 of beam threshold for writing parameter (-10000 to include everything, 0 max parameter only)")
- ("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model")
- ("no_add_viterbi,V","When writing model parameters, do not add Viterbi alignment points (may generate a grammar where some training sentence pairs are unreachable)");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || conf->count("input") == 0) {
- cerr << "Usage " << argv[0] << " [OPTIONS] -i corpus.fr-en\n";
- cerr << dcmdline_options << endl;
- return false;
- }
- return true;
-}
-
-double PosteriorInference(const vector<WordID>& src, const vector<WordID>& trg) {
- double llh = 0;
- static vector<double> unnormed_a_i;
- if (src.size() > unnormed_a_i.size())
- unnormed_a_i.resize(src.size());
- return llh;
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- if (!InitCommandLine(argc, argv, &conf)) return 1;
- const string fname = conf["input"].as<string>();
- const bool reverse = conf.count("reverse") > 0;
- const int ITERATIONS = conf["iterations"].as<unsigned>();
- const double BEAM_THRESHOLD = pow(10.0, conf["beam_threshold"].as<double>());
- const bool use_null = (conf.count("no_null_word") == 0);
- const WordID kNULL = TD::Convert("<eps>");
- const bool add_viterbi = (conf.count("no_add_viterbi") == 0);
- const bool variational_bayes = (conf.count("variational_bayes") > 0);
- const bool write_alignments = (conf.count("output_parameters") == 0);
- const double diagonal_tension = conf["diagonal_tension"].as<double>();
- const double prob_align_null = conf["prob_align_null"].as<double>();
- string testset;
- if (conf.count("testset")) testset = conf["testset"].as<string>();
- const double prob_align_not_null = 1.0 - prob_align_null;
- const double alpha = conf["alpha"].as<double>();
- const bool favor_diagonal = conf.count("favor_diagonal");
- if (variational_bayes && alpha <= 0.0) {
- cerr << "--alpha must be > 0\n";
- return 1;
- }
-
- TTable s2t, t2s;
- TTable::Word2Word2Double s2t_viterbi;
- double tot_len_ratio = 0;
- double mean_srclen_multiplier = 0;
- vector<double> unnormed_a_i;
- for (int iter = 0; iter < ITERATIONS; ++iter) {
- const bool final_iteration = (iter == (ITERATIONS - 1));
- cerr << "ITERATION " << (iter + 1) << (final_iteration ? " (FINAL)" : "") << endl;
- ReadFile rf(fname);
- istream& in = *rf.stream();
- double likelihood = 0;
- double denom = 0.0;
- int lc = 0;
- bool flag = false;
- string line;
- string ssrc, strg;
- vector<WordID> src, trg;
- while(true) {
- getline(in, line);
- if (!in) break;
- ++lc;
- if (lc % 1000 == 0) { cerr << '.'; flag = true; }
- if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; }
- src.clear(); trg.clear();
- CorpusTools::ReadLine(line, &src, &trg);
- if (reverse) swap(src, trg);
- if (src.size() == 0 || trg.size() == 0) {
- cerr << "Error: " << lc << "\n" << line << endl;
- return 1;
- }
- if (src.size() > unnormed_a_i.size())
- unnormed_a_i.resize(src.size());
- if (iter == 0)
- tot_len_ratio += static_cast<double>(trg.size()) / static_cast<double>(src.size());
- denom += trg.size();
- vector<double> probs(src.size() + 1);
- bool first_al = true; // used for write_alignments
- for (int j = 0; j < trg.size(); ++j) {
- const WordID& f_j = trg[j];
- double sum = 0;
- const double j_over_ts = double(j) / trg.size();
- double prob_a_i = 1.0 / (src.size() + use_null); // uniform (model 1)
- if (use_null) {
- if (favor_diagonal) prob_a_i = prob_align_null;
- probs[0] = s2t.prob(kNULL, f_j) * prob_a_i;
- sum += probs[0];
- }
- double az = 0;
- if (favor_diagonal) {
- for (int ta = 0; ta < src.size(); ++ta) {
- unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension);
- az += unnormed_a_i[ta];
- }
- az /= prob_align_not_null;
- }
- for (int i = 1; i <= src.size(); ++i) {
- if (favor_diagonal)
- prob_a_i = unnormed_a_i[i-1] / az;
- probs[i] = s2t.prob(src[i-1], f_j) * prob_a_i;
- sum += probs[i];
- }
- if (final_iteration) {
- if (add_viterbi || write_alignments) {
- WordID max_i = 0;
- double max_p = -1;
- int max_index = -1;
- if (use_null) {
- max_i = kNULL;
- max_index = 0;
- max_p = probs[0];
- }
- for (int i = 1; i <= src.size(); ++i) {
- if (probs[i] > max_p) {
- max_index = i;
- max_p = probs[i];
- max_i = src[i-1];
- }
- }
- if (write_alignments) {
- if (max_index > 0) {
- if (first_al) first_al = false; else cout << ' ';
- if (reverse)
- cout << j << '-' << (max_index - 1);
- else
- cout << (max_index - 1) << '-' << j;
- }
- }
- s2t_viterbi[max_i][f_j] = 1.0;
- }
- } else {
- if (use_null)
- s2t.Increment(kNULL, f_j, probs[0] / sum);
- for (int i = 1; i <= src.size(); ++i)
- s2t.Increment(src[i-1], f_j, probs[i] / sum);
- }
- likelihood += log(sum);
- }
- if (write_alignments && final_iteration) cout << endl;
- }
-
- // log(e) = 1.0
- double base2_likelihood = likelihood / log(2);
-
- if (flag) { cerr << endl; }
- if (iter == 0) {
- mean_srclen_multiplier = tot_len_ratio / lc;
- cerr << "expected target length = source length * " << mean_srclen_multiplier << endl;
- }
- cerr << " log_e likelihood: " << likelihood << endl;
- cerr << " log_2 likelihood: " << base2_likelihood << endl;
- cerr << " cross entropy: " << (-base2_likelihood / denom) << endl;
- cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl;
- if (!final_iteration) {
- if (variational_bayes)
- s2t.NormalizeVB(alpha);
- else
- s2t.Normalize();
- }
- }
- if (testset.size()) {
- ReadFile rf(testset);
- istream& in = *rf.stream();
- int lc = 0;
- double tlp = 0;
- string ssrc, strg, line;
- while (getline(in, line)) {
- ++lc;
- vector<WordID> src, trg;
- CorpusTools::ReadLine(line, &src, &trg);
- double log_prob = Md::log_poisson(trg.size(), 0.05 + src.size() * mean_srclen_multiplier);
- if (src.size() > unnormed_a_i.size())
- unnormed_a_i.resize(src.size());
-
- // compute likelihood
- for (int j = 0; j < trg.size(); ++j) {
- const WordID& f_j = trg[j];
- double sum = 0;
- const double j_over_ts = double(j) / trg.size();
- double prob_a_i = 1.0 / (src.size() + use_null); // uniform (model 1)
- if (use_null) {
- if (favor_diagonal) prob_a_i = prob_align_null;
- sum += s2t.prob(kNULL, f_j) * prob_a_i;
- }
- double az = 0;
- if (favor_diagonal) {
- for (int ta = 0; ta < src.size(); ++ta) {
- unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension);
- az += unnormed_a_i[ta];
- }
- az /= prob_align_not_null;
- }
- for (int i = 1; i <= src.size(); ++i) {
- if (favor_diagonal)
- prob_a_i = unnormed_a_i[i-1] / az;
- sum += s2t.prob(src[i-1], f_j) * prob_a_i;
- }
- log_prob += log(sum);
- }
- tlp += log_prob;
- cerr << ssrc << " ||| " << strg << " ||| " << log_prob << endl;
- }
- cerr << "TOTAL LOG PROB " << tlp << endl;
- }
-
- if (write_alignments) return 0;
-
- for (TTable::Word2Word2Double::iterator ei = s2t.ttable.begin(); ei != s2t.ttable.end(); ++ei) {
- const TTable::Word2Double& cpd = ei->second;
- const TTable::Word2Double& vit = s2t_viterbi[ei->first];
- const string& esym = TD::Convert(ei->first);
- double max_p = -1;
- for (TTable::Word2Double::const_iterator fi = cpd.begin(); fi != cpd.end(); ++fi)
- if (fi->second > max_p) max_p = fi->second;
- const double threshold = max_p * BEAM_THRESHOLD;
- for (TTable::Word2Double::const_iterator fi = cpd.begin(); fi != cpd.end(); ++fi) {
- if (fi->second > threshold || (vit.find(fi->first) != vit.end())) {
- cout << esym << ' ' << TD::Convert(fi->first) << ' ' << log(fi->second) << endl;
- }
- }
- }
- return 0;
-}
-
diff --git a/training/feature_expectations.cc b/training/feature_expectations.cc
deleted file mode 100644
index f1a85495..00000000
--- a/training/feature_expectations.cc
+++ /dev/null
@@ -1,232 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <cassert>
-#include <cmath>
-#include <tr1/memory>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "verbose.h"
-#include "hg.h"
-#include "prob.h"
-#include "inside_outside.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "filelib.h"
-#include "online_optimizer.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-#include "sampler.h"
-
-#ifdef HAVE_MPI
-#include <boost/mpi/timer.hpp>
-#include <boost/mpi.hpp>
-namespace mpi = boost::mpi;
-#endif
-
-using namespace std;
-namespace po = boost::program_options;
-
-struct FComp {
- const vector<double>& w_;
- FComp(const vector<double>& w) : w_(w) {}
- bool operator()(int a, int b) const {
- return fabs(w_[a]) > fabs(w_[b]);
- }
-};
-
-void ShowFeatures(const vector<double>& w) {
- vector<int> fnums(w.size());
- for (int i = 0; i < w.size(); ++i)
- fnums[i] = i;
- sort(fnums.begin(), fnums.end(), FComp(w));
- for (vector<int>::iterator i = fnums.begin(); i != fnums.end(); ++i) {
- if (w[*i]) cout << FD::Convert(*i) << ' ' << w[*i] << endl;
- }
-}
-
-void ReadConfig(const string& ini, vector<string>* out) {
- ReadFile rf(ini);
- istream& in = *rf.stream();
- while(in) {
- string line;
- getline(in, line);
- if (!in) continue;
- out->push_back(line);
- }
-}
-
-void StoreConfig(const vector<string>& cfg, istringstream* o) {
- ostringstream os;
- for (int i = 0; i < cfg.size(); ++i) { os << cfg[i] << endl; }
- o->str(os.str());
-}
-
-bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("input,i",po::value<string>(),"Corpus of source language sentences")
- ("weights,w",po::value<string>(),"Input feature weights file")
- ("decoder_config,c",po::value<string>(), "cdec.ini file");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || !conf->count("input") || !conf->count("decoder_config")) {
- cerr << dcmdline_options << endl;
- return false;
- }
- return true;
-}
-
-void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* order) {
- ReadFile rf(fname);
- istream& in = *rf.stream();
- string line;
- int id = 0;
- while(in) {
- getline(in, line);
- if (!in) break;
- if (id % size == rank) {
- c->push_back(line);
- order->push_back(id);
- }
- ++id;
- }
-}
-
-static const double kMINUS_EPSILON = -1e-6;
-
-struct TrainingObserver : public DecoderObserver {
- void Reset() {
- acc_exp.clear();
- total_complete = 0;
- }
-
- virtual void NotifyDecodingStart(const SentenceMetadata& smeta) {
- cur_model_exp.clear();
- state = 1;
- }
-
- // compute model expectations, denominator of objective
- virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
- assert(state == 1);
- state = 2;
- const prob_t z = InsideOutside<prob_t,
- EdgeProb,
- SparseVector<prob_t>,
- EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp);
- cur_model_exp /= z;
- acc_exp += cur_model_exp;
- }
-
- virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
- cerr << "IGNORING ALIGNMENT FOREST!\n";
- }
-
- virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) {
- if (state == 2) {
- ++total_complete;
- }
- }
-
- void GetExpectations(SparseVector<double>* g) const {
- g->clear();
- for (SparseVector<prob_t>::const_iterator it = acc_exp.begin(); it != acc_exp.end(); ++it)
- g->set_value(it->first, it->second);
- }
-
- int total_complete;
- SparseVector<prob_t> cur_model_exp;
- SparseVector<prob_t> acc_exp;
- int state;
-};
-
-#ifdef HAVE_MPI
-namespace boost { namespace mpi {
- template<>
- struct is_commutative<std::plus<SparseVector<double> >, SparseVector<double> >
- : mpl::true_ { };
-} } // end namespace boost::mpi
-#endif
-
-int main(int argc, char** argv) {
-#ifdef HAVE_MPI
- mpi::environment env(argc, argv);
- mpi::communicator world;
- const int size = world.size();
- const int rank = world.rank();
-#else
- const int size = 1;
- const int rank = 0;
-#endif
- if (size > 1) SetSilent(true); // turn off verbose decoder output
- register_feature_functions();
-
- po::variables_map conf;
- if (!InitCommandLine(argc, argv, &conf))
- return 1;
-
- // load initial weights
- Weights weights;
- if (conf.count("weights"))
- weights.InitFromFile(conf["weights"].as<string>());
-
- vector<string> corpus;
- vector<int> ids;
- ReadTrainingCorpus(conf["input"].as<string>(), rank, size, &corpus, &ids);
- assert(corpus.size() > 0);
-
- vector<string> cdec_ini;
- ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini);
- istringstream ini;
- StoreConfig(cdec_ini, &ini);
- Decoder decoder(&ini);
- if (decoder.GetConf()["input"].as<string>() != "-") {
- cerr << "cdec.ini must not set an input file\n";
- return 1;
- }
-
- SparseVector<double> x;
- weights.InitSparseVector(&x);
- TrainingObserver observer;
-
- weights.InitFromVector(x);
- vector<double> lambdas;
- weights.InitVector(&lambdas);
- decoder.SetWeights(lambdas);
- observer.Reset();
- for (unsigned i = 0; i < corpus.size(); ++i) {
- int id = ids[i];
- decoder.SetId(id);
- decoder.Decode(corpus[i], &observer);
- }
- SparseVector<double> local_exps, exps;
- observer.GetExpectations(&local_exps);
-#ifdef HAVE_MPI
- reduce(world, local_exps, exps, std::plus<SparseVector<double> >(), 0);
-#else
- exps.swap(local_exps);
-#endif
-
- weights.InitFromVector(exps);
- weights.InitVector(&lambdas);
- ShowFeatures(lambdas);
-
- return 0;
-}
diff --git a/training/lbl_model.cc b/training/lbl_model.cc
deleted file mode 100644
index a46ce33c..00000000
--- a/training/lbl_model.cc
+++ /dev/null
@@ -1,421 +0,0 @@
-#include <iostream>
-
-#include "config.h"
-#ifndef HAVE_EIGEN
- int main() { std::cerr << "Please rebuild with --with-eigen PATH\n"; return 1; }
-#else
-
-#include <cstdlib>
-#include <algorithm>
-#include <cmath>
-#include <set>
-#include <cstring> // memset
-#include <ctime>
-
-#ifdef HAVE_MPI
-#include <boost/mpi/timer.hpp>
-#include <boost/mpi.hpp>
-#include <boost/archive/text_oarchive.hpp>
-namespace mpi = boost::mpi;
-#endif
-#include <boost/math/special_functions/fpclassify.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-#include <Eigen/Dense>
-
-#include "corpus_tools.h"
-#include "optimize.h"
-#include "array2d.h"
-#include "m.h"
-#include "lattice.h"
-#include "stringlib.h"
-#include "filelib.h"
-#include "tdict.h"
-
-namespace po = boost::program_options;
-using namespace std;
-
-#define kDIMENSIONS 10
-typedef Eigen::Matrix<double, kDIMENSIONS, 1> RVector;
-typedef Eigen::Matrix<double, 1, kDIMENSIONS> RTVector;
-typedef Eigen::Matrix<double, kDIMENSIONS, kDIMENSIONS> TMatrix;
-vector<RVector> r_src, r_trg;
-
-#if HAVE_MPI
-namespace boost {
-namespace serialization {
-
-template<class Archive>
-void serialize(Archive & ar, RVector & v, const unsigned int version) {
- for (unsigned i = 0; i < kDIMENSIONS; ++i)
- ar & v[i];
-}
-
-} // namespace serialization
-} // namespace boost
-#endif
-
-bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("input,i",po::value<string>(),"Input file")
- ("iterations,I",po::value<unsigned>()->default_value(1000),"Number of iterations of training")
- ("regularization_strength,C",po::value<double>()->default_value(0.1),"L2 regularization strength (0 for no regularization)")
- ("eta", po::value<double>()->default_value(0.1f), "Eta for SGD")
- ("source_embeddings,f", po::value<string>(), "File containing source embeddings (if unset, random vectors will be used)")
- ("target_embeddings,e", po::value<string>(), "File containing target embeddings (if unset, random vectors will be used)")
- ("random_seed,s", po::value<unsigned>(), "Random seed")
- ("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (0 = uniform, >0 sharpens)")
- ("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (argc < 2 || conf->count("help")) {
- cerr << "Usage " << argv[0] << " [OPTIONS] -i corpus.fr-en\n";
- cerr << dcmdline_options << endl;
- return false;
- }
- return true;
-}
-
-void Normalize(RVector* v) {
- double norm = v->norm();
- assert(norm > 0.0f);
- *v /= norm;
-}
-
-void Flatten(const TMatrix& m, vector<double>* v) {
- unsigned c = 0;
- v->resize(kDIMENSIONS * kDIMENSIONS);
- for (unsigned i = 0; i < kDIMENSIONS; ++i)
- for (unsigned j = 0; j < kDIMENSIONS; ++j) {
- assert(boost::math::isfinite(m(i, j)));
- (*v)[c++] = m(i,j);
- }
-}
-
-void Unflatten(const vector<double>& v, TMatrix* m) {
- unsigned c = 0;
- for (unsigned i = 0; i < kDIMENSIONS; ++i)
- for (unsigned j = 0; j < kDIMENSIONS; ++j) {
- assert(boost::math::isfinite(v[c]));
- (*m)(i, j) = v[c++];
- }
-}
-
-double ApplyRegularization(const double C,
- const vector<double>& weights,
- vector<double>* g) {
- assert(weights.size() == g->size());
- double reg = 0;
- for (size_t i = 0; i < weights.size(); ++i) {
- const double& w_i = weights[i];
- double& g_i = (*g)[i];
- reg += C * w_i * w_i;
- g_i += 2 * C * w_i;
- }
- return reg;
-}
-
-void LoadEmbeddings(const string& filename, vector<RVector>* pv) {
- vector<RVector>& v = *pv;
- cerr << "Reading embeddings from " << filename << " ...\n";
- ReadFile rf(filename);
- istream& in = *rf.stream();
- string line;
- unsigned lc = 0;
- while(getline(in, line)) {
- ++lc;
- size_t cur = line.find(' ');
- if (cur == string::npos || cur == 0) {
- cerr << "Parse error reading line " << lc << ":\n" << line << endl;
- abort();
- }
- WordID w = TD::Convert(line.substr(0, cur));
- if (w >= v.size()) continue;
- RVector& curv = v[w];
- line[cur] = 0;
- size_t start = cur + 1;
- cur = start + 1;
- size_t c = 0;
- while(cur < line.size()) {
- if (line[cur] == ' ') {
- line[cur] = 0;
- curv[c++] = strtod(&line[start], NULL);
- start = cur + 1;
- cur = start;
- if (c == kDIMENSIONS) break;
- }
- ++cur;
- }
- if (c < kDIMENSIONS && cur != start) {
- if (cur < line.size()) line[cur] = 0;
- curv[c++] = strtod(&line[start], NULL);
- }
- if (c != kDIMENSIONS) {
- static bool first = true;
- if (first) {
- cerr << " read " << c << " dimensions from embedding file, but built with " << kDIMENSIONS << " (filling in with random values)\n";
- first = false;
- }
- for (; c < kDIMENSIONS; ++c) curv[c] = rand();
- }
- if (c == kDIMENSIONS && cur != line.size()) {
- static bool first = true;
- if (first) {
- cerr << " embedding file contains more dimensions than configured with, truncating.\n";
- first = false;
- }
- }
- }
-}
-
-int main(int argc, char** argv) {
-#ifdef HAVE_MPI
- std::cerr << "**MPI enabled.\n";
- mpi::environment env(argc, argv);
- mpi::communicator world;
- const int size = world.size();
- const int rank = world.rank();
-#else
- std::cerr << "**MPI disabled.\n";
- const int rank = 0;
- const int size = 1;
-#endif
- po::variables_map conf;
- if (!InitCommandLine(argc, argv, &conf)) return 1;
- const string fname = conf["input"].as<string>();
- const double reg_strength = conf["regularization_strength"].as<double>();
- const bool has_l2 = reg_strength;
- assert(reg_strength >= 0.0f);
- const int ITERATIONS = conf["iterations"].as<unsigned>();
- const double eta = conf["eta"].as<double>();
- const double diagonal_tension = conf["diagonal_tension"].as<double>();
- bool SGD = false;
- if (diagonal_tension < 0.0) {
- cerr << "Invalid value for diagonal_tension: must be >= 0\n";
- return 1;
- }
- string testset;
- if (conf.count("testset")) testset = conf["testset"].as<string>();
-
- unsigned lc = 0;
- vector<double> unnormed_a_i;
- bool flag = false;
- vector<vector<WordID> > srcs, trgs;
- vector<WordID> vocab_e;
- {
- set<WordID> svocab_e, svocab_f;
- CorpusTools::ReadFromFile(fname, &srcs, NULL, &trgs, &svocab_e, rank, size);
- copy(svocab_e.begin(), svocab_e.end(), back_inserter(vocab_e));
- }
- cerr << "Number of target word types: " << vocab_e.size() << endl;
- const double num_examples = lc;
-
- boost::shared_ptr<LBFGSOptimizer> lbfgs;
- if (rank == 0)
- lbfgs.reset(new LBFGSOptimizer(kDIMENSIONS * kDIMENSIONS, 100));
- r_trg.resize(TD::NumWords() + 1);
- r_src.resize(TD::NumWords() + 1);
- vector<set<unsigned> > trg_pos(TD::NumWords() + 1);
-
- if (conf.count("random_seed")) {
- srand(conf["random_seed"].as<unsigned>());
- } else {
- unsigned seed = time(NULL) + rank * 100;
- cerr << "Random seed: " << seed << endl;
- srand(seed);
- }
-
- TMatrix t = TMatrix::Zero();
- if (rank == 0) {
- t = TMatrix::Random() / 50.0;
- for (unsigned i = 1; i < r_trg.size(); ++i) {
- r_trg[i] = RVector::Random();
- r_src[i] = RVector::Random();
- }
- if (conf.count("source_embeddings"))
- LoadEmbeddings(conf["source_embeddings"].as<string>(), &r_src);
- if (conf.count("target_embeddings"))
- LoadEmbeddings(conf["target_embeddings"].as<string>(), &r_trg);
- }
-
- // do optimization
- TMatrix g = TMatrix::Zero();
- vector<TMatrix> exp_src;
- vector<double> z_src;
- vector<double> flat_g, flat_t, rcv_grad;
- Flatten(t, &flat_t);
- bool converged = false;
-#if HAVE_MPI
- mpi::broadcast(world, &flat_t[0], flat_t.size(), 0);
- mpi::broadcast(world, r_trg, 0);
- mpi::broadcast(world, r_src, 0);
-#endif
- cerr << "rank=" << rank << ": " << r_trg[0][4] << endl;
- for (int iter = 0; !converged && iter < ITERATIONS; ++iter) {
- if (rank == 0) cerr << "ITERATION " << (iter + 1) << endl;
- Unflatten(flat_t, &t);
- double likelihood = 0;
- double denom = 0.0;
- lc = 0;
- flag = false;
- g *= 0;
- for (unsigned i = 0; i < srcs.size(); ++i) {
- const vector<WordID>& src = srcs[i];
- const vector<WordID>& trg = trgs[i];
- ++lc;
- if (rank == 0 && lc % 1000 == 0) { cerr << '.'; flag = true; }
- if (rank == 0 && lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; }
- denom += trg.size();
-
- exp_src.clear(); exp_src.resize(src.size(), TMatrix::Zero());
- z_src.clear(); z_src.resize(src.size(), 0.0);
- Array2D<TMatrix> exp_refs(src.size(), trg.size(), TMatrix::Zero());
- Array2D<double> z_refs(src.size(), trg.size(), 0.0);
- for (unsigned j = 0; j < trg.size(); ++j)
- trg_pos[trg[j]].insert(j);
-
- for (unsigned i = 0; i < src.size(); ++i) {
- const RVector& r_s = r_src[src[i]];
- const RTVector pred = r_s.transpose() * t;
- TMatrix& exp_m = exp_src[i];
- double& z = z_src[i];
- for (unsigned k = 0; k < vocab_e.size(); ++k) {
- const WordID v_k = vocab_e[k];
- const RVector& r_t = r_trg[v_k];
- const double dot_prod = pred * r_t;
- const double u = exp(dot_prod);
- z += u;
- const TMatrix v = r_s * r_t.transpose() * u;
- exp_m += v;
- set<unsigned>& ref_locs = trg_pos[v_k];
- if (!ref_locs.empty()) {
- for (set<unsigned>::iterator it = ref_locs.begin(); it != ref_locs.end(); ++it) {
- TMatrix& exp_ref_ij = exp_refs(i, *it);
- double& z_ref_ij = z_refs(i, *it);
- z_ref_ij += u;
- exp_ref_ij += v;
- }
- }
- }
- }
- for (unsigned j = 0; j < trg.size(); ++j)
- trg_pos[trg[j]].clear();
-
- // model expectations for a single target generation with
- // uniform alignment prior
- // TODO: when using a non-uniform alignment, m_exp will be
- // a function of j (below)
- double m_z = 0;
- TMatrix m_exp = TMatrix::Zero();
- for (unsigned i = 0; i < src.size(); ++i) {
- m_exp += exp_src[i];
- m_z += z_src[i];
- }
- m_exp /= m_z;
-
- Array2D<bool> al(src.size(), trg.size(), false);
- for (unsigned j = 0; j < trg.size(); ++j) {
- double ref_z = 0;
- TMatrix ref_exp = TMatrix::Zero();
- int max_i = 0;
- double max_s = -9999999;
- for (unsigned i = 0; i < src.size(); ++i) {
- ref_exp += exp_refs(i, j);
- ref_z += z_refs(i, j);
- if (log(z_refs(i, j)) > max_s) {
- max_s = log(z_refs(i, j));
- max_i = i;
- }
- // TODO handle alignment prob
- }
- if (ref_z <= 0) {
- cerr << "TRG=" << TD::Convert(trg[j]) << endl;
- cerr << " LINE=" << lc << " (RANK=" << rank << "/" << size << ")" << endl;
- cerr << " REF_EXP=\n" << ref_exp << endl;
- cerr << " M_EXP=\n" << m_exp << endl;
- abort();
- }
- al(max_i, j) = true;
- ref_exp /= ref_z;
- g += m_exp - ref_exp;
- likelihood += log(ref_z) - log(m_z);
- if (SGD) {
- t -= g * eta / num_examples;
- g *= 0;
- }
- }
-
- if (rank == 0 && (iter == (ITERATIONS - 1) || lc < 12)) { cerr << al << endl; }
- }
- if (flag && rank == 0) { cerr << endl; }
-
- double obj = 0;
- if (!SGD) {
- Flatten(g, &flat_g);
- obj = -likelihood;
-#if HAVE_MPI
- rcv_grad.resize(flat_g.size(), 0.0);
- mpi::reduce(world, &flat_g[0], flat_g.size(), &rcv_grad[0], plus<double>(), 0);
- swap(flat_g, rcv_grad);
- rcv_grad.clear();
-
- double to = 0;
- mpi::reduce(world, obj, to, plus<double>(), 0);
- obj = to;
- double tlh = 0;
- mpi::reduce(world, likelihood, tlh, plus<double>(), 0);
- likelihood = tlh;
- double td = 0;
- mpi::reduce(world, denom, td, plus<double>(), 0);
- denom = td;
-#endif
- }
-
- if (rank == 0) {
- double gn = 0;
- for (unsigned i = 0; i < flat_g.size(); ++i)
- gn += flat_g[i]*flat_g[i];
- const double base2_likelihood = likelihood / log(2);
- cerr << " log_e likelihood: " << likelihood << endl;
- cerr << " log_2 likelihood: " << base2_likelihood << endl;
- cerr << " cross entropy: " << (-base2_likelihood / denom) << endl;
- cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl;
- cerr << " gradient norm: " << sqrt(gn) << endl;
- if (!SGD) {
- if (has_l2) {
- const double r = ApplyRegularization(reg_strength,
- flat_t,
- &flat_g);
- obj += r;
- cerr << " regularization: " << r << endl;
- }
- lbfgs->Optimize(obj, flat_g, &flat_t);
- converged = (lbfgs->HasConverged());
- }
- }
-#ifdef HAVE_MPI
- mpi::broadcast(world, &flat_t[0], flat_t.size(), 0);
- mpi::broadcast(world, converged, 0);
-#endif
- }
- if (rank == 0)
- cerr << "TRANSLATION MATRIX:" << endl << t << endl;
- return 0;
-}
-
-#endif
-
diff --git a/training/liblbfgs/Jamfile b/training/liblbfgs/Jamfile
deleted file mode 100644
index 49c82748..00000000
--- a/training/liblbfgs/Jamfile
+++ /dev/null
@@ -1,5 +0,0 @@
-import testing ;
-
-lib liblbfgs : lbfgs.c : <include>.. ;
-
-unit-test ll_test : ll_test.cc liblbfgs : <include>.. ;
diff --git a/training/liblbfgs/Makefile.am b/training/liblbfgs/Makefile.am
index 64a3794d..272d6f56 100644
--- a/training/liblbfgs/Makefile.am
+++ b/training/liblbfgs/Makefile.am
@@ -6,10 +6,17 @@ ll_test_LDADD = liblbfgs.a -lz
noinst_LIBRARIES = liblbfgs.a
-liblbfgs_a_SOURCES = lbfgs.c
+liblbfgs_a_SOURCES = \
+ lbfgs.c \
+ arithmetic_ansi.h \
+ arithmetic_sse_double.h \
+ arithmetic_sse_float.h \
+ lbfgs++.h \
+ lbfgs.h
################################################################
# do NOT NOT NOT add any other -I includes NO NO NO NO NO ######
AM_LDFLAGS = liblbfgs.a -lz
-AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wall -I. -I..
+AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wall -I$(top_srcdir)/training -I$(top_srcdir)/training/liblbfgs
################################################################
+
diff --git a/training/minrisk/Makefile.am b/training/minrisk/Makefile.am
new file mode 100644
index 00000000..ebf6fa91
--- /dev/null
+++ b/training/minrisk/Makefile.am
@@ -0,0 +1,8 @@
+bin_PROGRAMS = minrisk_optimize
+
+minrisk_optimize_SOURCES = minrisk_optimize.cc
+minrisk_optimize_LDADD = ../../training/utils/libtraining_utils.a ../../decoder/libcdec.a ../../mteval/libmteval.a ../../utils/libutils.a ../../training/liblbfgs/liblbfgs.a
+
+EXTRA_DIST = minrisk.pl minrisk_generate_input.pl
+
+AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training -I$(top_srcdir)/training/utils
diff --git a/training/minrisk/minrisk.pl b/training/minrisk/minrisk.pl
new file mode 100755
index 00000000..0f8bacd0
--- /dev/null
+++ b/training/minrisk/minrisk.pl
@@ -0,0 +1,540 @@
+#!/usr/bin/env perl
+use strict;
+my @ORIG_ARGV=@ARGV;
+use Cwd qw(getcwd);
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; }
+
+# Skip local config (used for distributing jobs) if we're running in local-only mode
+use LocalConfig;
+use Getopt::Long;
+use IPC::Open2;
+use POSIX ":sys_wait_h";
+my $QSUB_CMD = qsub_args(mert_memory());
+my $default_jobs = env_default_jobs();
+
+my $UTILS_DIR="$SCRIPT_DIR/../utils";
+require "$UTILS_DIR/libcall.pl";
+
+# Default settings
+my $srcFile;
+my $refFiles;
+my $bin_dir = $SCRIPT_DIR;
+die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
+my $FAST_SCORE="$bin_dir/../../mteval/fast_score";
+die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
+my $MAPINPUT = "$bin_dir/minrisk_generate_input.pl";
+my $MAPPER = "$bin_dir/minrisk_optimize";
+my $parallelize = "$UTILS_DIR/parallelize.pl";
+my $libcall = "$UTILS_DIR/libcall.pl";
+my $sentserver = "$UTILS_DIR/sentserver";
+my $sentclient = "$UTILS_DIR/sentclient";
+my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm";
+
+my $SCORER = $FAST_SCORE;
+die "Can't find $MAPPER" unless -x $MAPPER;
+my $cdec = "$bin_dir/../../decoder/cdec";
+die "Can't find decoder in $cdec" unless -x $cdec;
+die "Can't find $parallelize" unless -x $parallelize;
+die "Can't find $libcall" unless -e $libcall;
+my $decoder = $cdec;
+my $lines_per_mapper = 30;
+my $iteration = 1;
+my $best_weights;
+my $psi = 1;
+my $default_max_iter = 30;
+my $max_iterations = $default_max_iter;
+my $jobs = $default_jobs; # number of decode nodes
+my $pmem = "4g";
+my $disable_clean = 0;
+my %seen_weights;
+my $help = 0;
+my $epsilon = 0.0001;
+my $dryrun = 0;
+my $last_score = -10000000;
+my $metric = "ibm_bleu";
+my $dir;
+my $iniFile;
+my $weights;
+my $use_make = 1; # use make to parallelize
+my $useqsub = 0;
+my $initial_weights;
+my $pass_suffix = '';
+my $cpbin=1;
+
+# regularization strength
+my $tune_regularizer = 0;
+my $reg = 500;
+my $reg_previous = 5000;
+my $dont_accum = 0;
+
+# Process command-line options
+Getopt::Long::Configure("no_auto_abbrev");
+if (GetOptions(
+ "jobs=i" => \$jobs,
+ "dont-clean" => \$disable_clean,
+ "dont-accumulate" => \$dont_accum,
+ "pass-suffix=s" => \$pass_suffix,
+ "qsub" => \$useqsub,
+ "dry-run" => \$dryrun,
+ "epsilon=s" => \$epsilon,
+ "help" => \$help,
+ "weights=s" => \$initial_weights,
+ "reg=f" => \$reg,
+ "use-make=i" => \$use_make,
+ "max-iterations=i" => \$max_iterations,
+ "pmem=s" => \$pmem,
+ "cpbin!" => \$cpbin,
+ "ref-files=s" => \$refFiles,
+ "metric=s" => \$metric,
+ "source-file=s" => \$srcFile,
+ "workdir=s" => \$dir,
+) == 0 || @ARGV!=1 || $help) {
+ print_help();
+ exit;
+}
+
+die "--tune-regularizer is no longer supported with --reg-previous and --reg. Please tune manually.\n" if $tune_regularizer;
+
+if ($useqsub) {
+ $use_make = 0;
+ die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
+}
+
+my @missing_args = ();
+if (!defined $srcFile) { push @missing_args, "--source-file"; }
+if (!defined $refFiles) { push @missing_args, "--ref-files"; }
+if (!defined $initial_weights) { push @missing_args, "--weights"; }
+die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args);
+
+if ($metric =~ /^(combi|ter)$/i) {
+ $lines_per_mapper = 5;
+}
+
+($iniFile) = @ARGV;
+
+
+sub write_config;
+sub enseg;
+sub print_help;
+
+my $nodelist;
+my $host = check_output("hostname"); chomp $host;
+my $bleu;
+my $interval_count = 0;
+my $logfile;
+my $projected_score;
+
+# used in sorting scores
+my $DIR_FLAG = '-r';
+if ($metric =~ /^ter$|^aer$/i) {
+ $DIR_FLAG = '';
+}
+
+my $refs_comma_sep = get_comma_sep_refs('r',$refFiles);
+
+unless ($dir){
+ $dir = "minrisk";
+}
+unless ($dir =~ /^\//){ # convert relative path to absolute path
+ my $basedir = check_output("pwd");
+ chomp $basedir;
+ $dir = "$basedir/$dir";
+}
+
+
+# Initializations and helper functions
+srand;
+
+my @childpids = ();
+my @cleanupcmds = ();
+
+sub cleanup {
+ print STDERR "Cleanup...\n";
+ for my $pid (@childpids){ unchecked_call("kill $pid"); }
+ for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); }
+ exit 1;
+};
+# Always call cleanup, no matter how we exit
+*CORE::GLOBAL::exit =
+ sub{ cleanup(); };
+$SIG{INT} = "cleanup";
+$SIG{TERM} = "cleanup";
+$SIG{HUP} = "cleanup";
+
+my $decoderBase = check_output("basename $decoder"); chomp $decoderBase;
+my $newIniFile = "$dir/$decoderBase.ini";
+my $inputFileName = "$dir/input";
+my $user = $ENV{"USER"};
+# process ini file
+-e $iniFile || die "Error: could not open $iniFile for reading\n";
+open(INI, $iniFile);
+
+use File::Basename qw(basename);
+#pass bindir, refs to vars holding bin
+sub modbin {
+ local $_;
+ my $bindir=shift;
+ check_call("mkdir -p $bindir");
+ -d $bindir || die "couldn't make bindir $bindir";
+ for (@_) {
+ my $src=$$_;
+ $$_="$bindir/".basename($src);
+ check_call("cp -p $src $$_");
+ }
+}
+sub dirsize {
+ opendir ISEMPTY,$_[0];
+ return scalar(readdir(ISEMPTY))-1;
+}
+my @allweights;
+if ($dryrun){
+ write_config(*STDERR);
+ exit 0;
+} else {
+ if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not previous optimizer outputs
+ die "ERROR: working dir $dir already exists\n\n";
+ } else {
+ -e $dir || mkdir $dir;
+ mkdir "$dir/hgs";
+ modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin;
+ mkdir "$dir/scripts";
+ my $cmdfile="$dir/rerun-pro.sh";
+ open CMD,'>',$cmdfile;
+ print CMD "cd ",&getcwd,"\n";
+# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted.
+ my $cline=&cmdline."\n";
+ print CMD $cline;
+ close CMD;
+ print STDERR $cline;
+ chmod(0755,$cmdfile);
+ check_call("cp $initial_weights $dir/weights.0");
+ die "Can't find weights.0" unless (-e "$dir/weights.0");
+ }
+ write_config(*STDERR);
+}
+
+
+# Generate initial files and values
+check_call("cp $iniFile $newIniFile");
+$iniFile = $newIniFile;
+
+my $newsrc = "$dir/dev.input";
+enseg($srcFile, $newsrc);
+$srcFile = $newsrc;
+my $devSize = 0;
+open F, "<$srcFile" or die "Can't read $srcFile: $!";
+while(<F>) { $devSize++; }
+close F;
+
+unless($best_weights){ $best_weights = $weights; }
+unless($projected_score){ $projected_score = 0.0; }
+$seen_weights{$weights} = 1;
+my $kbest = "$dir/kbest";
+if ($dont_accum) {
+ $kbest = '';
+} else {
+ check_call("mkdir -p $kbest");
+ $kbest = "--kbest_repository $kbest";
+}
+
+my $random_seed = int(time / 1000);
+my $lastWeightsFile;
+my $lastPScore = 0;
+# main optimization loop
+while (1){
+ print STDERR "\n\nITERATION $iteration\n==========\n";
+
+ if ($iteration > $max_iterations){
+ print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n";
+ last;
+ }
+ # iteration-specific files
+ my $runFile="$dir/run.raw.$iteration";
+ my $onebestFile="$dir/1best.$iteration";
+ my $logdir="$dir/logs.$iteration";
+ my $decoderLog="$logdir/decoder.sentserver.log.$iteration";
+ my $scorerLog="$logdir/scorer.log.$iteration";
+ check_call("mkdir -p $logdir");
+
+
+ #decode
+ print STDERR "RUNNING DECODER AT ";
+ print STDERR unchecked_output("date");
+ my $im1 = $iteration - 1;
+ my $weightsFile="$dir/weights.$im1";
+ push @allweights, "-w $dir/weights.$im1";
+ `rm -f $dir/hgs/*.gz`;
+ my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
+ my $pcmd;
+ if ($use_make) {
+ $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --";
+ } else {
+ $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --";
+ }
+ my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_bash_call($cmd);
+ my $num_hgs;
+ my $num_topbest;
+ my $retries = 0;
+ while($retries < 5) {
+ $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l");
+ $num_topbest = check_output("wc -l < $runFile");
+ print STDERR "NUMBER OF HGs: $num_hgs\n";
+ print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n";
+ if($devSize == $num_hgs && $devSize == $num_topbest) {
+ last;
+ } else {
+ print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n";
+ sleep(3);
+ }
+ $retries++;
+ }
+ die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest);
+ my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric");
+ chomp $dec_score;
+ print STDERR "DECODER SCORE: $dec_score\n";
+
+ # save space
+ check_call("gzip -f $runFile");
+ check_call("gzip -f $decoderLog");
+
+ # run optimizer
+ print STDERR "RUNNING OPTIMIZER AT ";
+ print STDERR unchecked_output("date");
+ print STDERR " - GENERATE TRAINING EXEMPLARS\n";
+ my $mergeLog="$logdir/prune-merge.log.$iteration";
+
+ my $score = 0;
+ my $icc = 0;
+ my $inweights="$dir/weights.$im1";
+ my $outweights="$dir/weights.$iteration";
+ $cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_call($cmd);
+ $cmd="$MAPPER $refs_comma_sep -m $metric -i $dir/agenda.$im1 $kbest -w $inweights > $outweights";
+ check_call($cmd);
+ $lastWeightsFile = $outweights;
+ $iteration++;
+ `rm -f $dir/hgs/*.gz`; # remove this iteration's hypergraphs
+ print STDERR "\n==========\n";
+}
+
+print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n";
+
+print STDOUT "$lastWeightsFile\n";
+
+exit 0;
+
+sub get_lines {
+ my $fn = shift @_;
+ open FL, "<$fn" or die "Couldn't read $fn: $!";
+ my $lc = 0;
+ while(<FL>) { $lc++; }
+ return $lc;
+}
+
+sub get_comma_sep_refs {
+ my ($r,$p) = @_;
+ my $o = check_output("echo $p");
+ chomp $o;
+ my @files = split /\s+/, $o;
+ return "-$r " . join(" -$r ", @files);
+}
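+
+# e.g. with reference files ref.0 and ref.1 on disk, get_comma_sep_refs('r',
+# 'ref.*') returns "-r ref.0 -r ref.1" (the echo expands any shell glob).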
+
+sub read_weights_file {
+ my ($file) = @_;
+ open F, "<$file" or die "Couldn't read $file: $!";
+ my @r = ();
+ my $pm = -1;
+ while(<F>) {
+ next if /^#/;
+ next if /^\s*$/;
+ chomp;
+ if (/^(.+)\s+(.+)$/) {
+ my $m = $1;
+ my $w = $2;
+ die "Weights out of order: $m <= $pm" unless $m > $pm;
+ push @r, $w;
+ } else {
+ warn "Unexpected feature name in weight file: $_";
+ }
+ }
+ close F;
+ return join ' ', @r;
+}
+
+# subs
+sub write_config {
+ my $fh = shift;
+ my $cleanup = "yes";
+ if ($disable_clean) {$cleanup = "no";}
+
+ print $fh "\n";
+ print $fh "DECODER: $decoder\n";
+ print $fh "INI FILE: $iniFile\n";
+ print $fh "WORKING DIR: $dir\n";
+ print $fh "SOURCE (DEV): $srcFile\n";
+ print $fh "REFS (DEV): $refFiles\n";
+ print $fh "EVAL METRIC: $metric\n";
+ print $fh "MAX ITERATIONS: $max_iterations\n";
+ print $fh "JOBS: $jobs\n";
+ print $fh "HEAD NODE: $host\n";
+ print $fh "PMEM (DECODING): $pmem\n";
+ print $fh "CLEANUP: $cleanup\n";
+}
+
+sub update_weights_file {
+ my ($neww, $rfn, $rpts) = @_;
+ my @feats = @$rfn;
+ my @pts = @$rpts;
+ my $num_feats = scalar @feats;
+ my $num_pts = scalar @pts;
+ die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts;
+ open G, ">$neww" or die;
+ for (my $i = 0; $i < $num_feats; $i++) {
+ my $f = $feats[$i];
+ my $lambda = $pts[$i];
+ print G "$f $lambda\n";
+ }
+ close G;
+}
+
+sub enseg {
+ my $src = shift;
+ my $newsrc = shift;
+ open(SRC, $src);
+ open(NEWSRC, ">$newsrc");
+ my $i=0;
+ while (my $line=<SRC>){
+ chomp $line;
+ if ($line =~ /^\s*<seg/i) {
+ if($line =~ /id="[0-9]+"/) {
+ print NEWSRC "$line\n";
+ } else {
+ die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+ }
+ } else {
+ print NEWSRC "<seg id=\"$i\">$line</seg>\n";
+ }
+ $i++;
+ }
+ close SRC;
+ close NEWSRC;
+ die "Empty dev set!" if ($i == 0);
+}
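+
+# e.g. a plain input line "el gato" becomes <seg id="0">el gato</seg>;
+# lines that already carry a numeric <seg id="..."> tag pass through
+# unchanged, and any other <seg ...> line is a fatal error.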
+
+sub print_help {
+
+ my $executable = check_output("basename $0"); chomp $executable;
+ print << "Help";
+
+Usage: $executable [options] <ini file>
+
+ $executable [options] <ini file>
+ Runs a complete minimum-risk optimization using the ini file specified.
+
+Required:
+
+ --ref-files <files>
+ Dev set ref files. This option takes only a single string argument.
+ To use multiple files (including file globbing), this argument should
+ be quoted.
+
+ --source-file <file>
+ Dev set source file.
+
+ --weights <file>
+ Initial weights file (use empty file to start from 0)
+
+General options:
+
+ --help
+ Print this message and exit.
+
+ --dont-accumulate
+ Don't accumulate k-best lists from multiple iterations.
+
+ --max-iterations <M>
+ Maximum number of iterations to run. If not specified, defaults
+ to $default_max_iter.
+
+ --metric <method>
+ Metric to optimize.
+ Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
+
+ --pass-suffix <S>
+ If the decoder is doing multi-pass decoding, the pass suffix "2",
+ "3", etc., is used to control what iteration of weights is set.
+
+ --workdir <dir>
+ Directory for intermediate and output files. If not specified,
+ defaults to ./minrisk (a relative path is converted to an
+ absolute one).
+
+Regularization options:
+
+ --reg <F>
+ l2 regularization strength [default=500]. The greater this value,
+ the closer to zero the weights will be.
+
+Job control options:
+
+ --jobs <I>
+ Number of decoder processes to run in parallel. [default=$default_jobs]
+
+ --qsub
+ Use qsub to run jobs in parallel (qsub must be configured in
+ environment/LocalEnvironment.pm)
+
+ --pmem <N>
+ Amount of physical memory requested for parallel decoding jobs
+ (used with qsub requests only)
+
+Help
+}
+
+sub convert {
+ my ($str) = @_;
+ my @ps = split /;/, $str;
+ my %dict = ();
+ for my $p (@ps) {
+ my ($k, $v) = split /=/, $p;
+ $dict{$k} = $v;
+ }
+ return %dict;
+}
+
+
+sub cmdline {
+ return join ' ',($0,@ORIG_ARGV);
+}
+
+#buggy: last arg gets quoted sometimes?
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]};
+my $shell_escape_in_quote=qr{[\\"\$`!]};
+
+sub escape_shell {
+ my ($arg)=@_;
+ return undef unless defined $arg;
+ if ($arg =~ /$is_shell_special/) {
+ $arg =~ s/($shell_escape_in_quote)/\\$1/g;
+ return "\"$arg\"";
+ }
+ return $arg;
+}
+
+sub escaped_shell_args {
+ return map {local $_=$_;chomp;escape_shell($_)} @_;
+}
+
+sub escaped_shell_args_str {
+ return join ' ',&escaped_shell_args(@_);
+}
+
+sub escaped_cmdline {
+ return "$0 ".&escaped_shell_args_str(@ORIG_ARGV);
+}
diff --git a/training/minrisk/minrisk_generate_input.pl b/training/minrisk/minrisk_generate_input.pl
new file mode 100755
index 00000000..b30fc4fd
--- /dev/null
+++ b/training/minrisk/minrisk_generate_input.pl
@@ -0,0 +1,18 @@
+#!/usr/bin/perl -w
+use strict;
+
+die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1;
+my $d = shift @ARGV;
+die "Can't find directory $d" unless -d $d;
+
+opendir(DIR, $d) or die "Can't read $d: $!";
+my @hgs = grep { /\.gz$/ } readdir(DIR);
+closedir DIR;
+
+for my $hg (@hgs) {
+ my $file = $hg;
+ my $id = $hg;
+ $id =~ s/(\.json)?\.gz//;
+ print "$d/$file $id\n";
+}
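+
+# e.g. a directory holding 0.json.gz and 1.json.gz produces, one pair per
+# line: "HG_DIR/0.json.gz 0" and "HG_DIR/1.json.gz 1".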
+
diff --git a/training/minrisk/minrisk_optimize.cc b/training/minrisk/minrisk_optimize.cc
new file mode 100644
index 00000000..da8b5260
--- /dev/null
+++ b/training/minrisk/minrisk_optimize.cc
@@ -0,0 +1,197 @@
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <limits>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "liblbfgs/lbfgs++.h"
+#include "filelib.h"
+#include "stringlib.h"
+#include "weights.h"
+#include "hg_io.h"
+#include "kbest.h"
+#include "viterbi.h"
+#include "ns.h"
+#include "ns_docscorer.h"
+#include "candidate_set.h"
+#include "risk.h"
+#include "entropy.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
+        ("weights,w",po::value<string>(), "[REQD] Weights file from the current iteration")
+ ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
+ ("evaluation_metric,m",po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)")
+ ("temperature,T",po::value<double>()->default_value(0.0), "Temperature parameter for objective (>0 increases the entropy)")
+ ("l1_strength,C",po::value<double>()->default_value(0.0), "L1 regularization strength")
+ ("memory_buffers,M",po::value<unsigned>()->default_value(20), "Memory buffers used in LBFGS")
+ ("kbest_repository,R",po::value<string>(), "Accumulate k-best lists from previous iterations (parameter is path to repository)")
+ ("kbest_size,k",po::value<unsigned>()->default_value(500u), "Top k-hypotheses to extract")
+ ("help,h", "Help");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ bool flag = false;
+ if (!conf->count("reference")) {
+ cerr << "Please specify one or more references using -r <REF.TXT>\n";
+ flag = true;
+ }
+ if (!conf->count("weights")) {
+ cerr << "Please specify weights using -w <WEIGHTS.TXT>\n";
+ flag = true;
+ }
+ if (flag || conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+EvaluationMetric* metric = NULL;
+
+struct RiskObjective {
+ explicit RiskObjective(const vector<training::CandidateSet>& tr, const double temp) : training(tr), T(temp) {}
+ double operator()(const vector<double>& x, double* g) const {
+ fill(g, g + x.size(), 0.0);
+ double obj = 0;
+ double h = 0;
+ for (unsigned i = 0; i < training.size(); ++i) {
+ training::CandidateSetRisk risk(training[i], *metric);
+ training::CandidateSetEntropy entropy(training[i]);
+ SparseVector<double> tg, hg;
+ double r = risk(x, &tg);
+ double hh = entropy(x, &hg);
+ h += hh;
+ obj += r;
+ for (SparseVector<double>::iterator it = tg.begin(); it != tg.end(); ++it)
+ g[it->first] += it->second;
+      if (T) {
+        // the objective returned below is risk - T * entropy, so the
+        // entropy gradient enters with a negative sign
+        for (SparseVector<double>::iterator it = hg.begin(); it != hg.end(); ++it)
+          g[it->first] -= T * it->second;
+      }
+ }
+    // report average expected score (1 - mean risk) and total entropy
+    cerr << (1-(obj / training.size())) << " H=" << h << endl;
+ return obj - T * h;
+ }
+ const vector<training::CandidateSet>& training;
+ const double T; // temperature for entropy regularization
+};
+
+double LearnParameters(const vector<training::CandidateSet>& training,
+ const double temp, // > 0 increases the entropy, < 0 decreases the entropy
+ const double C1,
+ const unsigned memory_buffers,
+ vector<weight_t>* px) {
+ RiskObjective obj(training, temp);
+ LBFGS<RiskObjective> lbfgs(px, obj, memory_buffers, C1);
+ lbfgs.MinimizeFunction();
+ return 0;
+}
+
+#if 0
+struct FooLoss {
+ double operator()(const vector<double>& x, double* g) const {
+ fill(g, g + x.size(), 0.0);
+ training::CandidateSet cs;
+ training::CandidateSetEntropy cse(cs);
+ cs.cs.resize(3);
+ cs.cs[0].fmap.set_value(FD::Convert("F1"), -1.0);
+ cs.cs[1].fmap.set_value(FD::Convert("F2"), 1.0);
+ cs.cs[2].fmap.set_value(FD::Convert("F1"), 2.0);
+ cs.cs[2].fmap.set_value(FD::Convert("F2"), 0.5);
+ SparseVector<double> xx;
+ double h = cse(x, &xx);
+ cerr << cse(x, &xx) << endl; cerr << "G: " << xx << endl;
+ for (SparseVector<double>::iterator i = xx.begin(); i != xx.end(); ++i)
+ g[i->first] += i->second;
+ return -h;
+ }
+};
+#endif
+
+int main(int argc, char** argv) {
+#if 0
+ training::CandidateSet cs;
+ training::CandidateSetEntropy cse(cs);
+ cs.cs.resize(3);
+ cs.cs[0].fmap.set_value(FD::Convert("F1"), -1.0);
+ cs.cs[1].fmap.set_value(FD::Convert("F2"), 1.0);
+ cs.cs[2].fmap.set_value(FD::Convert("F1"), 2.0);
+ cs.cs[2].fmap.set_value(FD::Convert("F2"), 0.5);
+ FooLoss foo;
+ vector<double> ww(FD::NumFeats()); ww[FD::Convert("F1")] = 1.0;
+ LBFGS<FooLoss> lbfgs(&ww, foo, 100, 0.0);
+ lbfgs.MinimizeFunction();
+ return 1;
+#endif
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ const string evaluation_metric = conf["evaluation_metric"].as<string>();
+
+ metric = EvaluationMetric::Instance(evaluation_metric);
+ DocumentScorer ds(metric, conf["reference"].as<vector<string> >());
+ cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl;
+
+ Hypergraph hg;
+ string last_file;
+ ReadFile in_read(conf["input"].as<string>());
+ string kbest_repo;
+ if (conf.count("kbest_repository")) {
+ kbest_repo = conf["kbest_repository"].as<string>();
+ MkDirP(kbest_repo);
+ }
+ istream &in=*in_read.stream();
+ const unsigned kbest_size = conf["kbest_size"].as<unsigned>();
+ vector<weight_t> weights;
+ const string weightsf = conf["weights"].as<string>();
+ Weights::InitFromFile(weightsf, &weights);
+ double t = 0;
+ for (unsigned i = 0; i < weights.size(); ++i)
+ t += weights[i] * weights[i];
+ if (t > 0) {
+ for (unsigned i = 0; i < weights.size(); ++i)
+ weights[i] /= sqrt(t);
+ }
+ string line, file;
+ vector<training::CandidateSet> kis;
+ cerr << "Loading hypergraphs...\n";
+ while(getline(in, line)) {
+    istringstream is(line);
+    int sent_id;
+    // parse "path-to-hypergraph sent_id" first: sent_id is used to locate
+    // the k-best repository file below
+    is >> file >> sent_id;
+    kis.resize(kis.size() + 1);
+    training::CandidateSet& curkbest = kis.back();
+    string kbest_file;
+    if (kbest_repo.size()) {
+      ostringstream os;
+      os << kbest_repo << "/kbest." << sent_id << ".txt.gz";
+      kbest_file = os.str();
+      if (FileExists(kbest_file))
+        curkbest.ReadFromFile(kbest_file);
+    }
+ ReadFile rf(file);
+ if (kis.size() % 5 == 0) { cerr << '.'; }
+ if (kis.size() % 200 == 0) { cerr << " [" << kis.size() << "]\n"; }
+ HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+ hg.Reweight(weights);
+ curkbest.AddKBestCandidates(hg, kbest_size, ds[sent_id]);
+ if (kbest_file.size())
+ curkbest.WriteToFile(kbest_file);
+ }
+ cerr << "\nHypergraphs loaded.\n";
+ weights.resize(FD::NumFeats());
+
+ double c1 = conf["l1_strength"].as<double>();
+ double temp = conf["temperature"].as<double>();
+ unsigned m = conf["memory_buffers"].as<unsigned>();
+ LearnParameters(kis, temp, c1, m, &weights);
+ Weights::WriteToFile("-", weights);
+ return 0;
+}
+
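In the notation of RiskObjective above, with the log-linear distribution
p_w(y|x_i) induced over each candidate set by hg.Reweight(weights), the
quantity being minimized is, as a sketch,

    L(\mathbf{w}) \;=\; \sum_i \mathbb{E}_{p_{\mathbf{w}}(y \mid x_i)}\bigl[\ell(y)\bigr]
                   \;-\; T \sum_i H\bigl(p_{\mathbf{w}}(\cdot \mid x_i)\bigr)

where \ell(y) is the metric loss whose expectation (and gradient) risk()
returns, and H is the entropy that entropy() returns. A temperature T > 0
therefore favors flatter distributions over each k-best list, matching the
help text of the --temperature option.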
diff --git a/training/mira/Makefile.am b/training/mira/Makefile.am
new file mode 100644
index 00000000..fa4fb22d
--- /dev/null
+++ b/training/mira/Makefile.am
@@ -0,0 +1,6 @@
+bin_PROGRAMS = kbest_mira
+
+kbest_mira_SOURCES = kbest_mira.cc
+kbest_mira_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/training/mira/kbest_mira.cc b/training/mira/kbest_mira.cc
new file mode 100644
index 00000000..d59b4224
--- /dev/null
+++ b/training/mira/kbest_mira.cc
@@ -0,0 +1,322 @@
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <cmath>
+#include <tr1/memory>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "stringlib.h"
+#include "hg_sampler.h"
+#include "sentence_metadata.h"
+#include "ns.h"
+#include "ns_docscorer.h"
+#include "verbose.h"
+#include "viterbi.h"
+#include "hg.h"
+#include "prob.h"
+#include "kbest.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "filelib.h"
+#include "fdict.h"
+#include "weights.h"
+#include "sparse_vector.h"
+#include "sampler.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+bool invert_score;
+std::tr1::shared_ptr<MT19937> rng;
+
+void RandomPermutation(int len, vector<int>* p_ids) {
+ vector<int>& ids = *p_ids;
+ ids.resize(len);
+ for (int i = 0; i < len; ++i) ids[i] = i;
+ for (int i = len; i > 0; --i) {
+    int j = rng->next() * i;  // uniform in [0, i)
+    if (j == i) j = i - 1;    // guard against rng->next() returning exactly 1.0
+ swap(ids[i-1], ids[j]);
+ }
+}
+
+bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("input_weights,w",po::value<string>(),"Input feature weights file")
+ ("source,i",po::value<string>(),"Source file for development set")
+ ("passes,p", po::value<int>()->default_value(15), "Number of passes through the training data")
+ ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation(s) (tokenized text file)")
+ ("mt_metric,m",po::value<string>()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)")
+ ("max_step_size,C", po::value<double>()->default_value(0.01), "regularization strength (C)")
+ ("mt_metric_scale,s", po::value<double>()->default_value(1.0), "Amount to scale MT loss function by")
+ ("k_best_size,k", po::value<int>()->default_value(250), "Size of hypothesis list to search for oracles")
+ ("sample_forest,f", "Instead of a k-best list, sample k hypotheses from the decoder's forest")
+ ("sample_forest_unit_weight_vector,x", "Before sampling (must use -f option), rescale the weight vector used so it has unit length; this may improve the quality of the samples")
+ ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
+ ("decoder_config,c",po::value<string>(),"Decoder configuration file");
+ po::options_description clo("Command line options");
+ clo.add_options()
+ ("config", po::value<string>(), "Configuration file")
+ ("help,h", "Print this help message and exit");
+ po::options_description dconfig_options, dcmdline_options;
+ dconfig_options.add(opts);
+ dcmdline_options.add(opts).add(clo);
+
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("config")) {
+ ifstream config((*conf)["config"].as<string>().c_str());
+ po::store(po::parse_config_file(config, dconfig_options), *conf);
+ }
+ po::notify(*conf);
+
+ if (conf->count("help") || !conf->count("input_weights") || !conf->count("source") || !conf->count("decoder_config") || !conf->count("reference")) {
+ cerr << dcmdline_options << endl;
+ return false;
+ }
+ return true;
+}
+
+static const double kMINUS_EPSILON = -1e-6;
+
+struct HypothesisInfo {
+ SparseVector<double> features;
+ double mt_metric;
+};
+
+struct GoodBadOracle {
+ std::tr1::shared_ptr<HypothesisInfo> good;
+ std::tr1::shared_ptr<HypothesisInfo> bad;
+};
+
+struct TrainingObserver : public DecoderObserver {
+ TrainingObserver(const int k, const DocumentScorer& d, const EvaluationMetric& m, bool sf, vector<GoodBadOracle>* o) : ds(d), metric(m), oracles(*o), kbest_size(k), sample_forest(sf) {}
+ const DocumentScorer& ds;
+ const EvaluationMetric& metric;
+ vector<GoodBadOracle>& oracles;
+ std::tr1::shared_ptr<HypothesisInfo> cur_best;
+ const int kbest_size;
+ const bool sample_forest;
+
+ const HypothesisInfo& GetCurrentBestHypothesis() const {
+ return *cur_best;
+ }
+
+ virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
+ UpdateOracles(smeta.GetSentenceID(), *hg);
+ }
+
+ std::tr1::shared_ptr<HypothesisInfo> MakeHypothesisInfo(const SparseVector<double>& feats, const double score) {
+ std::tr1::shared_ptr<HypothesisInfo> h(new HypothesisInfo);
+ h->features = feats;
+ h->mt_metric = score;
+ return h;
+ }
+
+ void UpdateOracles(int sent_id, const Hypergraph& forest) {
+ std::tr1::shared_ptr<HypothesisInfo>& cur_good = oracles[sent_id].good;
+ std::tr1::shared_ptr<HypothesisInfo>& cur_bad = oracles[sent_id].bad;
+ cur_bad.reset(); // TODO get rid of??
+
+ if (sample_forest) {
+ vector<WordID> cur_prediction;
+ ViterbiESentence(forest, &cur_prediction);
+ SufficientStats sstats;
+ ds[sent_id]->Evaluate(cur_prediction, &sstats);
+      float sentscore = metric.ComputeScore(sstats);
+      if (invert_score) sentscore *= -1.0;  // keep signs consistent with the k-best branch
+      cur_best = MakeHypothesisInfo(ViterbiFeatures(forest), sentscore);
+
+ vector<HypergraphSampler::Hypothesis> samples;
+ HypergraphSampler::sample_hypotheses(forest, kbest_size, &*rng, &samples);
+ for (unsigned i = 0; i < samples.size(); ++i) {
+ ds[sent_id]->Evaluate(samples[i].words, &sstats);
+ float sentscore = metric.ComputeScore(sstats);
+ if (invert_score) sentscore *= -1.0;
+ if (!cur_good || sentscore > cur_good->mt_metric)
+ cur_good = MakeHypothesisInfo(samples[i].fmap, sentscore);
+ if (!cur_bad || sentscore < cur_bad->mt_metric)
+ cur_bad = MakeHypothesisInfo(samples[i].fmap, sentscore);
+ }
+ } else {
+ KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, kbest_size);
+ SufficientStats sstats;
+ for (int i = 0; i < kbest_size; ++i) {
+ const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+ kbest.LazyKthBest(forest.nodes_.size() - 1, i);
+ if (!d) break;
+ ds[sent_id]->Evaluate(d->yield, &sstats);
+ float sentscore = metric.ComputeScore(sstats);
+ if (invert_score) sentscore *= -1.0;
+ // cerr << TD::GetString(d->yield) << " ||| " << d->score << " ||| " << sentscore << endl;
+ if (i == 0)
+ cur_best = MakeHypothesisInfo(d->feature_values, sentscore);
+ if (!cur_good || sentscore > cur_good->mt_metric)
+ cur_good = MakeHypothesisInfo(d->feature_values, sentscore);
+ if (!cur_bad || sentscore < cur_bad->mt_metric)
+ cur_bad = MakeHypothesisInfo(d->feature_values, sentscore);
+ }
+ //cerr << "GOOD: " << cur_good->mt_metric << endl;
+ //cerr << " CUR: " << cur_best->mt_metric << endl;
+ //cerr << " BAD: " << cur_bad->mt_metric << endl;
+ }
+ }
+};
+
+void ReadTrainingCorpus(const string& fname, vector<string>* c) {
+ ReadFile rf(fname);
+ istream& in = *rf.stream();
+ string line;
+ while(in) {
+ getline(in, line);
+ if (!in) break;
+ c->push_back(line);
+ }
+}
+
+bool ApproxEqual(double a, double b) {
+ if (a == b) return true;
+ return (fabs(a-b)/fabs(b)) < 0.000001;
+}
+
+int main(int argc, char** argv) {
+ register_feature_functions();
+ SetSilent(true); // turn off verbose decoder output
+
+ po::variables_map conf;
+ if (!InitCommandLine(argc, argv, &conf)) return 1;
+
+ if (conf.count("random_seed"))
+ rng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+ else
+ rng.reset(new MT19937);
+ const bool sample_forest = conf.count("sample_forest") > 0;
+ const bool sample_forest_unit_weight_vector = conf.count("sample_forest_unit_weight_vector") > 0;
+ if (sample_forest_unit_weight_vector && !sample_forest) {
+ cerr << "Cannot --sample_forest_unit_weight_vector without --sample_forest" << endl;
+ return 1;
+ }
+ vector<string> corpus;
+ ReadTrainingCorpus(conf["source"].as<string>(), &corpus);
+
+ string metric_name = UppercaseString(conf["mt_metric"].as<string>());
+ if (metric_name == "COMBI") {
+ cerr << "WARNING: 'combi' metric is no longer supported, switching to 'COMB:TER=-0.5;IBM_BLEU=0.5'\n";
+ metric_name = "COMB:TER=-0.5;IBM_BLEU=0.5";
+ } else if (metric_name == "BLEU") {
+ cerr << "WARNING: 'BLEU' is ambiguous, assuming 'IBM_BLEU'\n";
+ metric_name = "IBM_BLEU";
+ }
+ EvaluationMetric* metric = EvaluationMetric::Instance(metric_name);
+ DocumentScorer ds(metric, conf["reference"].as<vector<string> >());
+ cerr << "Loaded " << ds.size() << " references for scoring with " << metric_name << endl;
+ invert_score = metric->IsErrorMetric();
+
+ if (ds.size() != corpus.size()) {
+ cerr << "Mismatched number of references (" << ds.size() << ") and sources (" << corpus.size() << ")\n";
+ return 1;
+ }
+
+ ReadFile ini_rf(conf["decoder_config"].as<string>());
+ Decoder decoder(ini_rf.stream());
+
+ // load initial weights
+ vector<weight_t>& dense_weights = decoder.CurrentWeightVector();
+ SparseVector<weight_t> lambdas;
+ Weights::InitFromFile(conf["input_weights"].as<string>(), &dense_weights);
+ Weights::InitSparseVector(dense_weights, &lambdas);
+
+ const double max_step_size = conf["max_step_size"].as<double>();
+ const double mt_metric_scale = conf["mt_metric_scale"].as<double>();
+
+ assert(corpus.size() > 0);
+ vector<GoodBadOracle> oracles(corpus.size());
+
+ TrainingObserver observer(conf["k_best_size"].as<int>(), ds, *metric, sample_forest, &oracles);
+ int cur_sent = 0;
+ int lcount = 0;
+ int normalizer = 0;
+ double tot_loss = 0;
+ int dots = 0;
+ int cur_pass = 0;
+ SparseVector<double> tot;
+ tot += lambdas; // initial weights
+ normalizer++; // count for initial weights
+ int max_iteration = conf["passes"].as<int>() * corpus.size();
+ string msg = "# MIRA tuned weights";
+ string msga = "# MIRA tuned weights AVERAGED";
+ vector<int> order;
+ RandomPermutation(corpus.size(), &order);
+ while (lcount <= max_iteration) {
+ lambdas.init_vector(&dense_weights);
+ if ((cur_sent * 40 / corpus.size()) > dots) { ++dots; cerr << '.'; }
+ if (corpus.size() == cur_sent) {
+ cerr << " [AVG METRIC LAST PASS=" << (tot_loss / corpus.size()) << "]\n";
+ Weights::ShowLargestFeatures(dense_weights);
+ cur_sent = 0;
+ tot_loss = 0;
+ dots = 0;
+      ostringstream os;
+      os << "weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << ".gz";
+      Weights::WriteToFile(os.str(), dense_weights, true, &msg);  // current weights
+      SparseVector<double> x = tot;
+      x /= normalizer;
+      ostringstream sa;
+      sa << "weights.mira-pass" << (cur_pass < 10 ? "0" : "") << cur_pass << "-avg.gz";
+      x.init_vector(&dense_weights);
+      Weights::WriteToFile(sa.str(), dense_weights, true, &msga);  // averaged weights
+ ++cur_pass;
+ RandomPermutation(corpus.size(), &order);
+ }
+ if (cur_sent == 0) {
+ cerr << "PASS " << (lcount / corpus.size() + 1) << endl;
+ }
+ decoder.SetId(order[cur_sent]);
+ double sc = 1.0;
+ if (sample_forest_unit_weight_vector) {
+ sc = lambdas.l2norm();
+ if (sc > 0) {
+ for (unsigned i = 0; i < dense_weights.size(); ++i)
+ dense_weights[i] /= sc;
+ }
+ }
+ decoder.Decode(corpus[order[cur_sent]], &observer); // update oracles
+ if (sc && sc != 1.0) {
+ for (unsigned i = 0; i < dense_weights.size(); ++i)
+ dense_weights[i] *= sc;
+ }
+ const HypothesisInfo& cur_hyp = observer.GetCurrentBestHypothesis();
+ const HypothesisInfo& cur_good = *oracles[order[cur_sent]].good;
+ const HypothesisInfo& cur_bad = *oracles[order[cur_sent]].bad;
+ tot_loss += cur_hyp.mt_metric;
+ if (!ApproxEqual(cur_hyp.mt_metric, cur_good.mt_metric)) {
+ const double loss = cur_bad.features.dot(dense_weights) - cur_good.features.dot(dense_weights) +
+ mt_metric_scale * (cur_good.mt_metric - cur_bad.mt_metric);
+ //cerr << "LOSS: " << loss << endl;
+ if (loss > 0.0) {
+ SparseVector<double> diff = cur_good.features;
+ diff -= cur_bad.features;
+ double step_size = loss / diff.l2norm_sq();
+ //cerr << loss << " " << step_size << " " << diff << endl;
+ if (step_size > max_step_size) step_size = max_step_size;
+ lambdas += (cur_good.features * step_size);
+ lambdas -= (cur_bad.features * step_size);
+ //cerr << "L: " << lambdas << endl;
+ }
+ }
+ tot += lambdas;
+ ++normalizer;
+ ++lcount;
+ ++cur_sent;
+ }
+ cerr << endl;
+ Weights::WriteToFile("weights.mira-final.gz", dense_weights, true, &msg);
+ tot /= normalizer;
+  tot.init_vector(&dense_weights);
+ msg = "# MIRA tuned weights (averaged vector)";
+ Weights::WriteToFile("weights.mira-final-avg.gz", dense_weights, true, &msg);
+ cerr << "Optimization complete.\nAVERAGED WEIGHTS: weights.mira-final-avg.gz\n";
+ return 0;
+}
+
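The update inside the training loop is the standard single-oracle MIRA step.
Writing y+ for cur_good, y- for cur_bad, g(.) for the metric score, s for
mt_metric_scale, and C for max_step_size, the hinge loss computed above is

    \ell \;=\; \mathbf{w}^\top \mathbf{f}(y^-) - \mathbf{w}^\top \mathbf{f}(y^+)
          \;+\; s \,\bigl( g(y^+) - g(y^-) \bigr)

and, whenever \ell > 0, the clipped step

    \eta \;=\; \min\!\Bigl( C,\; \frac{\ell}{\lVert \mathbf{f}(y^+) - \mathbf{f}(y^-) \rVert^2} \Bigr),
    \qquad
    \mathbf{w} \;\leftarrow\; \mathbf{w} + \eta \,\bigl( \mathbf{f}(y^+) - \mathbf{f}(y^-) \bigr)

is exactly the loss / diff.l2norm_sq() computation above, with step_size
capped at max_step_size.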
diff --git a/training/mpi_em_optimize.cc b/training/mpi_em_optimize.cc
deleted file mode 100644
index 48683b15..00000000
--- a/training/mpi_em_optimize.cc
+++ /dev/null
@@ -1,389 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <vector>
-#include <cassert>
-#include <cmath>
-
-#ifdef HAVE_MPI
-#include <mpi.h>
-#endif
-
-#include <boost/shared_ptr.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "verbose.h"
-#include "hg.h"
-#include "prob.h"
-#include "inside_outside.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "filelib.h"
-#include "optimize.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-
-using namespace std;
-using boost::shared_ptr;
-namespace po = boost::program_options;
-
-void SanityCheck(const vector<double>& w) {
- for (int i = 0; i < w.size(); ++i) {
- assert(!isnan(w[i]));
- assert(!isinf(w[i]));
- }
-}
-
-struct FComp {
- const vector<double>& w_;
- FComp(const vector<double>& w) : w_(w) {}
- bool operator()(int a, int b) const {
- return fabs(w_[a]) > fabs(w_[b]);
- }
-};
-
-void ShowLargestFeatures(const vector<double>& w) {
- vector<int> fnums(w.size());
- for (int i = 0; i < w.size(); ++i)
- fnums[i] = i;
- vector<int>::iterator mid = fnums.begin();
- mid += (w.size() > 10 ? 10 : w.size());
- partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
- cerr << "TOP FEATURES:";
- for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {
- cerr << ' ' << FD::Convert(*i) << '=' << w[*i];
- }
- cerr << endl;
-}
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("input_weights,w",po::value<string>(),"Input feature weights file")
- ("training_data,t",po::value<string>(),"Training data")
- ("decoder_config,c",po::value<string>(),"Decoder configuration file")
- ("output_weights,o",po::value<string>()->default_value("-"),"Output feature weights file");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || !(conf->count("training_data")) || !conf->count("decoder_config")) {
- cerr << dcmdline_options << endl;
-#ifdef HAVE_MPI
- MPI::Finalize();
-#endif
- exit(1);
- }
-}
-
-void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c) {
- ReadFile rf(fname);
- istream& in = *rf.stream();
- string line;
- int lc = 0;
- while(in) {
- getline(in, line);
- if (!in) break;
- if (lc % size == rank) c->push_back(line);
- ++lc;
- }
-}
-
-static const double kMINUS_EPSILON = -1e-6;
-
-struct TrainingObserver : public DecoderObserver {
- void Reset() {
- total_complete = 0;
- cur_obj = 0;
- tot_obj = 0;
- tot.clear();
- }
-
- void SetLocalGradientAndObjective(SparseVector<double>* g, double* o) const {
- *o = tot_obj;
- *g = tot;
- }
-
- virtual void NotifyDecodingStart(const SentenceMetadata& smeta) {
- cur_obj = 0;
- state = 1;
- }
-
- void ExtractExpectedCounts(Hypergraph* hg) {
- vector<prob_t> posts;
- cur.clear();
- const prob_t z = hg->ComputeEdgePosteriors(1.0, &posts);
- cur_obj = log(z);
- for (int i = 0; i < posts.size(); ++i) {
- const SparseVector<double>& efeats = hg->edges_[i].feature_values_;
- const double post = static_cast<double>(posts[i] / z);
- for (SparseVector<double>::const_iterator j = efeats.begin(); j != efeats.end(); ++j)
- cur.add_value(j->first, post);
- }
- }
-
- // compute model expectations, denominator of objective
- virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
- assert(state == 1);
- state = 2;
- ExtractExpectedCounts(hg);
- }
-
- // replace translation forest, since we're doing EM training (we don't know which)
- virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
- assert(state == 2);
- state = 3;
- ExtractExpectedCounts(hg);
- }
-
- virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) {
- ++total_complete;
- tot_obj += cur_obj;
- tot += cur;
- }
-
- int total_complete;
- double cur_obj;
- double tot_obj;
- SparseVector<double> cur, tot;
- int state;
-};
-
-void ReadConfig(const string& ini, vector<string>* out) {
- ReadFile rf(ini);
- istream& in = *rf.stream();
- while(in) {
- string line;
- getline(in, line);
- if (!in) continue;
- out->push_back(line);
- }
-}
-
-void StoreConfig(const vector<string>& cfg, istringstream* o) {
- ostringstream os;
- for (int i = 0; i < cfg.size(); ++i) { os << cfg[i] << endl; }
- o->str(os.str());
-}
-
-struct OptimizableMultinomialFamily {
- struct CPD {
- CPD() : z() {}
- double z;
- map<WordID, double> c2counts;
- };
- map<WordID, CPD> counts;
- double Value(WordID conditioning, WordID generated) const {
- map<WordID, CPD>::const_iterator it = counts.find(conditioning);
- assert(it != counts.end());
- map<WordID,double>::const_iterator r = it->second.c2counts.find(generated);
- if (r == it->second.c2counts.end()) return 0;
- return r->second;
- }
- void Increment(WordID conditioning, WordID generated, double count) {
- CPD& cc = counts[conditioning];
- cc.z += count;
- cc.c2counts[generated] += count;
- }
- void Optimize() {
- for (map<WordID, CPD>::iterator i = counts.begin(); i != counts.end(); ++i) {
- CPD& cpd = i->second;
- for (map<WordID, double>::iterator j = cpd.c2counts.begin(); j != cpd.c2counts.end(); ++j) {
- j->second /= cpd.z;
- // cerr << "P(" << TD::Convert(j->first) << " | " << TD::Convert(i->first) << " ) = " << j->second << endl;
- }
- }
- }
- void Clear() {
- counts.clear();
- }
-};
-
-struct CountManager {
- CountManager(size_t num_types) : oms_(num_types) {}
- virtual ~CountManager();
- virtual void AddCounts(const SparseVector<double>& c) = 0;
- void Optimize(SparseVector<double>* weights) {
- for (int i = 0; i < oms_.size(); ++i) {
- oms_[i].Optimize();
- }
- GetOptimalValues(weights);
- for (int i = 0; i < oms_.size(); ++i) {
- oms_[i].Clear();
- }
- }
- virtual void GetOptimalValues(SparseVector<double>* wv) const = 0;
- vector<OptimizableMultinomialFamily> oms_;
-};
-CountManager::~CountManager() {}
-
-struct TaggerCountManager : public CountManager {
- // 0 = transitions, 2 = emissions
- TaggerCountManager() : CountManager(2) {}
- void AddCounts(const SparseVector<double>& c);
- void GetOptimalValues(SparseVector<double>* wv) const {
- for (set<int>::const_iterator it = fids_.begin(); it != fids_.end(); ++it) {
- int ftype;
- WordID cond, gen;
- bool is_optimized = TaggerCountManager::GetFeature(*it, &ftype, &cond, &gen);
- assert(is_optimized);
- wv->set_value(*it, log(oms_[ftype].Value(cond, gen)));
- }
- }
- // Id:0:a=1 Bi:a_b=1 Bi:b_c=1 Bi:c_d=1 Uni:a=1 Uni:b=1 Uni:c=1 Uni:d=1 Id:1:b=1 Bi:BOS_a=1 Id:2:c=1
- static bool GetFeature(const int fid, int* feature_type, WordID* cond, WordID* gen) {
- const string& feat = FD::Convert(fid);
- if (feat.size() > 5 && feat[0] == 'I' && feat[1] == 'd' && feat[2] == ':') {
- // emission
- const size_t p = feat.rfind(':');
- assert(p != string::npos);
- *cond = TD::Convert(feat.substr(p+1));
- *gen = TD::Convert(feat.substr(3, p - 3));
- *feature_type = 1;
- return true;
- } else if (feat[0] == 'B' && feat.size() > 5 && feat[2] == ':' && feat[1] == 'i') {
- // transition
- const size_t p = feat.rfind('_');
- assert(p != string::npos);
- *gen = TD::Convert(feat.substr(p+1));
- *cond = TD::Convert(feat.substr(3, p - 3));
- *feature_type = 0;
- return true;
- } else if (feat[0] == 'U' && feat.size() > 4 && feat[1] == 'n' && feat[2] == 'i' && feat[3] == ':') {
- // ignore
- return false;
- } else {
- cerr << "Don't know how to deal with feature of type: " << feat << endl;
- abort();
- }
- }
- set<int> fids_;
-};
-
-void TaggerCountManager::AddCounts(const SparseVector<double>& c) {
- for (SparseVector<double>::const_iterator it = c.begin(); it != c.end(); ++it) {
- const double& val = it->second;
- int ftype;
- WordID cond, gen;
- if (GetFeature(it->first, &ftype, &cond, &gen)) {
- oms_[ftype].Increment(cond, gen, val);
- fids_.insert(it->first);
- }
- }
-}
-
-int main(int argc, char** argv) {
-#ifdef HAVE_MPI
- MPI::Init(argc, argv);
- const int size = MPI::COMM_WORLD.Get_size();
- const int rank = MPI::COMM_WORLD.Get_rank();
-#else
- const int size = 1;
- const int rank = 0;
-#endif
- SetSilent(true); // turn off verbose decoder output
- register_feature_functions();
-
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
-
- TaggerCountManager tcm;
-
- // load cdec.ini and set up decoder
- vector<string> cdec_ini;
- ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini);
- istringstream ini;
- StoreConfig(cdec_ini, &ini);
- if (rank == 0) cerr << "Loading grammar...\n";
- Decoder* decoder = new Decoder(&ini);
- if (decoder->GetConf()["input"].as<string>() != "-") {
- cerr << "cdec.ini must not set an input file\n";
-#ifdef HAVE_MPI
- MPI::COMM_WORLD.Abort(1);
-#endif
- }
- if (rank == 0) cerr << "Done loading grammar!\n";
- Weights w;
- if (conf.count("input_weights"))
- w.InitFromFile(conf["input_weights"].as<string>());
-
- double objective = 0;
- bool converged = false;
-
- vector<double> lambdas;
- w.InitVector(&lambdas);
- vector<string> corpus;
- ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
- assert(corpus.size() > 0);
-
- int iteration = 0;
- TrainingObserver observer;
- while (!converged) {
- ++iteration;
- observer.Reset();
- if (rank == 0) {
- cerr << "Starting decoding... (~" << corpus.size() << " sentences / proc)\n";
- }
- decoder->SetWeights(lambdas);
- for (int i = 0; i < corpus.size(); ++i)
- decoder->Decode(corpus[i], &observer);
-
- SparseVector<double> x;
- observer.SetLocalGradientAndObjective(&x, &objective);
- cerr << "COUNTS = " << x << endl;
- cerr << " OBJ = " << objective << endl;
- tcm.AddCounts(x);
-
-#if 0
-#ifdef HAVE_MPI
- MPI::COMM_WORLD.Reduce(const_cast<double*>(&gradient.data()[0]), &rcv_grad[0], num_feats, MPI::DOUBLE, MPI::SUM, 0);
- MPI::COMM_WORLD.Reduce(&objective, &to, 1, MPI::DOUBLE, MPI::SUM, 0);
- swap(gradient, rcv_grad);
- objective = to;
-#endif
-#endif
-
- if (rank == 0) {
- SparseVector<double> wsv;
- tcm.Optimize(&wsv);
-
- w.InitFromVector(wsv);
- w.InitVector(&lambdas);
-
- ShowLargestFeatures(lambdas);
-
- converged = iteration > 100;
- if (converged) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; }
-
- string fname = "weights.cur.gz";
- if (converged) { fname = "weights.final.gz"; }
- ostringstream vv;
- vv << "Objective = " << objective << " (ITERATION=" << iteration << ")";
- const string svv = vv.str();
- w.WriteToFile(fname, true, &svv);
- } // rank == 0
- int cint = converged;
-#ifdef HAVE_MPI
- MPI::COMM_WORLD.Bcast(const_cast<double*>(&lambdas.data()[0]), num_feats, MPI::DOUBLE, 0);
- MPI::COMM_WORLD.Bcast(&cint, 1, MPI::INT, 0);
- MPI::COMM_WORLD.Barrier();
-#endif
- converged = cint;
- }
-#ifdef HAVE_MPI
- MPI::Finalize();
-#endif
- return 0;
-}
diff --git a/training/mr_em_adapted_reduce.cc b/training/mr_em_adapted_reduce.cc
deleted file mode 100644
index f65b5440..00000000
--- a/training/mr_em_adapted_reduce.cc
+++ /dev/null
@@ -1,173 +0,0 @@
-#include <iostream>
-#include <vector>
-#include <cassert>
-#include <cmath>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "filelib.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-#include "m.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("optimization_method,m", po::value<string>()->default_value("em"), "Optimization method (em, vb)")
- ("input_format,f",po::value<string>()->default_value("b64"),"Encoding of the input (b64 or text)");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help")) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-double NoZero(const double& x) {
- if (x) return x;
- return 1e-35;
-}
-
-void Maximize(const bool use_vb,
- const double& alpha,
- const int total_event_types,
- SparseVector<double>* pc) {
- const SparseVector<double>& counts = *pc;
-
- if (use_vb)
- assert(total_event_types >= counts.size());
-
- double tot = 0;
- for (SparseVector<double>::const_iterator it = counts.begin();
- it != counts.end(); ++it)
- tot += it->second;
-// cerr << " = " << tot << endl;
- assert(tot > 0.0);
- double ltot = log(tot);
- if (use_vb)
- ltot = Md::digamma(tot + total_event_types * alpha);
- for (SparseVector<double>::const_iterator it = counts.begin();
- it != counts.end(); ++it) {
- if (use_vb) {
- pc->set_value(it->first, NoZero(Md::digamma(it->second + alpha) - ltot));
- } else {
- pc->set_value(it->first, NoZero(log(it->second) - ltot));
- }
- }
-#if 0
- if (counts.size() < 50) {
- for (SparseVector<double>::const_iterator it = counts.begin();
- it != counts.end(); ++it) {
- cerr << " p(" << FD::Convert(it->first) << ")=" << exp(it->second);
- }
- cerr << endl;
- }
-#endif
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
-
- const bool use_b64 = conf["input_format"].as<string>() == "b64";
- const bool use_vb = conf["optimization_method"].as<string>() == "vb";
- const double alpha = 1e-09;
- if (use_vb)
- cerr << "Using variational Bayes, make sure alphas are set\n";
-
- const string s_obj = "**OBJ**";
- // E-step
- string cur_key = "";
- SparseVector<double> acc;
- double logprob = 0;
- while(cin) {
- string line;
- getline(cin, line);
- if (line.empty()) continue;
- int feat;
- double val;
- size_t i = line.find("\t");
- const string key = line.substr(0, i);
- assert(i != string::npos);
- ++i;
- if (key != cur_key) {
- if (cur_key.size() > 0) {
- // TODO shouldn't be num_active, should be total number
- // of events
- Maximize(use_vb, alpha, acc.size(), &acc);
- cout << cur_key << '\t';
- if (use_b64)
- B64::Encode(0.0, acc, &cout);
- else
- cout << acc;
- cout << endl;
- acc.clear();
- }
- cur_key = key;
- }
- if (use_b64) {
- SparseVector<double> g;
- double obj;
- if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) {
- cerr << "B64 decoder returned error, skipping!\n";
- continue;
- }
- logprob += obj;
- acc += g;
- } else { // text encoding - your counts will not be accurate!
- while (i < line.size()) {
- size_t start = i;
- while (line[i] != '=' && i < line.size()) ++i;
- if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; }
- string fname = line.substr(start, i - start);
- if (fname == s_obj) {
- feat = -1;
- } else {
- feat = FD::Convert(line.substr(start, i - start));
- }
- ++i;
- start = i;
- while (line[i] != ';' && i < line.size()) ++i;
- if (i - start == 0) continue;
- val = atof(line.substr(start, i - start).c_str());
- ++i;
- if (feat == -1) {
- logprob += val;
- } else {
- acc.add_value(feat, val);
- }
- }
- }
- }
- // TODO shouldn't be num_active, should be total number
- // of events
- Maximize(use_vb, alpha, acc.size(), &acc);
- cout << cur_key << '\t';
- if (use_b64)
- B64::Encode(0.0, acc, &cout);
- else
- cout << acc;
- cout << endl << flush;
-
- cerr << "LOGPROB: " << logprob << endl;
-
- return 0;
-}
diff --git a/training/mr_em_map_adapter.cc b/training/mr_em_map_adapter.cc
deleted file mode 100644
index ead4598d..00000000
--- a/training/mr_em_map_adapter.cc
+++ /dev/null
@@ -1,160 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <cassert>
-#include <cmath>
-
-#include <boost/utility.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-#include "boost/tuple/tuple.hpp"
-
-#include "fdict.h"
-#include "sparse_vector.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-// useful for EM models parameterized by a bunch of multinomials
-// this converts event counts (returned from cdec as feature expectations)
-// into different keys and values (which are lists of all the events,
-// conditioned on the key) for summing and normalization by a reducer
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("buffer_size,b", po::value<int>()->default_value(1), "Buffer size (in # of counts) before emitting counts")
- ("format,f",po::value<string>()->default_value("b64"), "Encoding of the input (b64 or text)");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help")) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-struct EventMapper {
- int Map(int fid) {
- int& cv = map_[fid];
- if (!cv) {
- cv = GetConditioningVariable(fid);
- }
- return cv;
- }
- void Clear() { map_.clear(); }
- protected:
- virtual int GetConditioningVariable(int fid) const = 0;
- private:
- map<int, int> map_;
-};
-
-struct LexAlignEventMapper : public EventMapper {
- protected:
- virtual int GetConditioningVariable(int fid) const {
- const string& str = FD::Convert(fid);
- size_t pos = str.rfind("_");
- if (pos == string::npos || pos == 0 || pos >= str.size() - 1) {
- cerr << "Bad feature for EM adapter: " << str << endl;
- abort();
- }
- return FD::Convert(str.substr(0, pos));
- }
-};
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
-
- const bool use_b64 = conf["format"].as<string>() == "b64";
- const int buffer_size = conf["buffer_size"].as<int>();
-
- const string s_obj = "**OBJ**";
- // 0<TAB>**OBJ**=12.2;Feat1=2.3;Feat2=-0.2;
- // 0<TAB>**OBJ**=1.1;Feat1=1.0;
-
- EventMapper* event_mapper = new LexAlignEventMapper;
- map<int, SparseVector<double> > counts;
- size_t total = 0;
- while(cin) {
- string line;
- getline(cin, line);
- if (line.empty()) continue;
- int feat;
- double val;
- size_t i = line.find("\t");
- assert(i != string::npos);
- ++i;
- SparseVector<double> g;
- double obj = 0;
- if (use_b64) {
- if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) {
- cerr << "B64 decoder returned error, skipping!\n";
- continue;
- }
- } else { // text encoding - your counts will not be accurate!
- while (i < line.size()) {
- size_t start = i;
- while (line[i] != '=' && i < line.size()) ++i;
- if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; }
- string fname = line.substr(start, i - start);
- if (fname == s_obj) {
- feat = -1;
- } else {
- feat = FD::Convert(line.substr(start, i - start));
- }
- ++i;
- start = i;
- while (line[i] != ';' && i < line.size()) ++i;
- if (i - start == 0) continue;
- val = atof(line.substr(start, i - start).c_str());
- ++i;
- if (feat == -1) {
- obj = val;
- } else {
- g.set_value(feat, val);
- }
- }
- }
- //cerr << "OBJ: " << obj << endl;
- const SparseVector<double>& cg = g;
- for (SparseVector<double>::const_iterator it = cg.begin(); it != cg.end(); ++it) {
- const int cond_var = event_mapper->Map(it->first);
- SparseVector<double>& cond_counts = counts[cond_var];
- int delta = cond_counts.size();
- cond_counts.add_value(it->first, it->second);
- delta = cond_counts.size() - delta;
- total += delta;
- }
- if (total > buffer_size) {
- for (map<int, SparseVector<double> >::iterator it = counts.begin();
- it != counts.end(); ++it) {
- const SparseVector<double>& cc = it->second;
- cout << FD::Convert(it->first) << '\t';
- if (use_b64) {
- B64::Encode(0.0, cc, &cout);
- } else {
- abort();
- }
- cout << endl;
- }
- cout << flush;
- total = 0;
- counts.clear();
- }
- }
-
- return 0;
-}
-
diff --git a/training/mr_optimize_reduce.cc b/training/mr_optimize_reduce.cc
deleted file mode 100644
index 461e6b5f..00000000
--- a/training/mr_optimize_reduce.cc
+++ /dev/null
@@ -1,231 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <cassert>
-#include <cmath>
-
-#include <boost/shared_ptr.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "optimize.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-void SanityCheck(const vector<double>& w) {
- for (int i = 0; i < w.size(); ++i) {
- assert(!isnan(w[i]));
- assert(!isinf(w[i]));
- }
-}
-
-struct FComp {
- const vector<double>& w_;
- FComp(const vector<double>& w) : w_(w) {}
- bool operator()(int a, int b) const {
- return fabs(w_[a]) > fabs(w_[b]);
- }
-};
-
-void ShowLargestFeatures(const vector<double>& w) {
- vector<int> fnums(w.size());
- for (int i = 0; i < w.size(); ++i)
- fnums[i] = i;
- vector<int>::iterator mid = fnums.begin();
- mid += (w.size() > 10 ? 10 : w.size());
- partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
- cerr << "TOP FEATURES:";
- for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {
- cerr << ' ' << FD::Convert(*i) << '=' << w[*i];
- }
- cerr << endl;
-}
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("input_weights,i",po::value<string>(),"Input feature weights file")
- ("output_weights,o",po::value<string>()->default_value("-"),"Output feature weights file")
- ("optimization_method,m", po::value<string>()->default_value("lbfgs"), "Optimization method (sgd, lbfgs, rprop)")
- ("state,s",po::value<string>(),"Read (and write if output_state is not set) optimizer state from this state file. In the first iteration, the file should not exist.")
- ("input_format,f",po::value<string>()->default_value("b64"),"Encoding of the input (b64 or text)")
- ("output_state,S", po::value<string>(), "Output state file (optional override)")
- ("correction_buffers,M", po::value<int>()->default_value(10), "Number of gradients for LBFGS to maintain in memory")
- ("eta,e", po::value<double>()->default_value(0.1), "Learning rate for SGD (eta)")
- ("gaussian_prior,p","Use a Gaussian prior on the weights")
- ("means,u", po::value<string>(), "File containing the means for Gaussian prior")
- ("sigma_squared", po::value<double>()->default_value(1.0), "Sigma squared term for spherical Gaussian prior");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || !conf->count("input_weights") || !conf->count("state")) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
-
- const bool use_b64 = conf["input_format"].as<string>() == "b64";
-
- vector<weight_t> lambdas;
- Weights::InitFromFile(conf["input_weights"].as<string>(), &lambdas);
- const string s_obj = "**OBJ**";
- int num_feats = FD::NumFeats();
- cerr << "Number of features: " << num_feats << endl;
- const bool gaussian_prior = conf.count("gaussian_prior");
- vector<weight_t> means(num_feats, 0);
- if (conf.count("means")) {
- if (!gaussian_prior) {
- cerr << "Don't use --means without --gaussian_prior!\n";
- exit(1);
- }
- Weights::InitFromFile(conf["means"].as<string>(), &means);
- }
- boost::shared_ptr<BatchOptimizer> o;
- const string omethod = conf["optimization_method"].as<string>();
- if (omethod == "rprop")
- o.reset(new RPropOptimizer(num_feats)); // TODO add configuration
- else
- o.reset(new LBFGSOptimizer(num_feats, conf["correction_buffers"].as<int>()));
- cerr << "Optimizer: " << o->Name() << endl;
- string state_file = conf["state"].as<string>();
- {
- ifstream in(state_file.c_str(), ios::binary);
- if (in)
- o->Load(&in);
- else
- cerr << "No state file found, assuming ITERATION 1\n";
- }
-
- double objective = 0;
- vector<double> gradient(num_feats, 0);
- // 0<TAB>**OBJ**=12.2;Feat1=2.3;Feat2=-0.2;
- // 0<TAB>**OBJ**=1.1;Feat1=1.0;
- int total_lines = 0; // TODO - this should be a count of the
- // training instances!!
- while(cin) {
- string line;
- getline(cin, line);
- if (line.empty()) continue;
- ++total_lines;
- int feat;
- double val;
- size_t i = line.find("\t");
- assert(i != string::npos);
- ++i;
- if (use_b64) {
- SparseVector<double> g;
- double obj;
- if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) {
- cerr << "B64 decoder returned error, skipping gradient!\n";
- cerr << " START: " << line.substr(0,line.size() > 200 ? 200 : line.size()) << endl;
- if (line.size() > 200)
- cerr << " END: " << line.substr(line.size() - 200, 200) << endl;
- cout << "-1\tRESTART\n";
- exit(99);
- }
- objective += obj;
- const SparseVector<double>& cg = g;
- for (SparseVector<double>::const_iterator it = cg.begin(); it != cg.end(); ++it) {
- if (it->first >= num_feats) {
- cerr << "Unexpected feature in gradient: " << FD::Convert(it->first) << endl;
- abort();
- }
- gradient[it->first] -= it->second;
- }
- } else { // text encoding - your gradients will not be accurate!
- while (i < line.size()) {
- size_t start = i;
- while (line[i] != '=' && i < line.size()) ++i;
- if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; }
- string fname = line.substr(start, i - start);
- if (fname == s_obj) {
- feat = -1;
- } else {
- feat = FD::Convert(line.substr(start, i - start));
- if (feat >= num_feats) {
- cerr << "Unexpected feature in gradient: " << line.substr(start, i - start) << endl;
- abort();
- }
- }
- ++i;
- start = i;
- while (line[i] != ';' && i < line.size()) ++i;
- if (i - start == 0) continue;
- val = atof(line.substr(start, i - start).c_str());
- ++i;
- if (feat == -1) {
- objective += val;
- } else {
- gradient[feat] -= val;
- }
- }
- }
- }
-
- if (gaussian_prior) {
- const double sigsq = conf["sigma_squared"].as<double>();
- double norm = 0;
- for (int k = 1; k < lambdas.size(); ++k) {
- const double& lambda_k = lambdas[k];
- if (lambda_k) {
- const double param = (lambda_k - means[k]);
- norm += param * param;
- gradient[k] += param / sigsq;
- }
- }
- const double reg = norm / (2.0 * sigsq);
- cerr << "REGULARIZATION TERM: " << reg << endl;
- objective += reg;
- }
- cerr << "EVALUATION #" << o->EvaluationCount() << " OBJECTIVE: " << objective << endl;
- double gnorm = 0;
- for (int i = 0; i < gradient.size(); ++i)
- gnorm += gradient[i] * gradient[i];
- cerr << " GNORM=" << sqrt(gnorm) << endl;
- vector<double> old = lambdas;
- int c = 0;
- while (old == lambdas) {
- ++c;
- if (c > 1) { cerr << "Same lambdas, repeating optimization\n"; }
- o->Optimize(objective, gradient, &lambdas);
- assert(c < 5);
- }
- old.clear();
- SanityCheck(lambdas);
- ShowLargestFeatures(lambdas);
- Weights::WriteToFile(conf["output_weights"].as<string>(), lambdas, false);
-
- const bool conv = o->HasConverged();
- if (conv) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; }
-
- if (conf.count("output_state"))
- state_file = conf["output_state"].as<string>();
- ofstream out(state_file.c_str(), ios::binary);
- cerr << "Writing state to: " << state_file << endl;
- o->Save(&out);
- out.close();
-
- cout << o->EvaluationCount() << "\t" << conv << endl;
- return 0;
-}
diff --git a/training/mr_reduce_to_weights.cc b/training/mr_reduce_to_weights.cc
deleted file mode 100644
index 16b47720..00000000
--- a/training/mr_reduce_to_weights.cc
+++ /dev/null
@@ -1,109 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <cassert>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "filelib.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("input_format,f",po::value<string>()->default_value("b64"),"Encoding of the input (b64 or text)")
- ("input,i",po::value<string>()->default_value("-"),"Read file from")
- ("output,o",po::value<string>()->default_value("-"),"Write weights to");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help")) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-void WriteWeights(const SparseVector<double>& weights, ostream* out) {
- for (SparseVector<double>::const_iterator it = weights.begin();
- it != weights.end(); ++it) {
- (*out) << FD::Convert(it->first) << " " << it->second << endl;
- }
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
-
- const bool use_b64 = conf["input_format"].as<string>() == "b64";
-
- const string s_obj = "**OBJ**";
- // E-step
- ReadFile rf(conf["input"].as<string>());
- istream* in = rf.stream();
- assert(*in);
- WriteFile wf(conf["output"].as<string>());
- ostream* out = wf.stream();
- out->precision(17);
- while(*in) {
- string line;
- getline(*in, line);
- if (line.empty()) continue;
- int feat;
- double val;
- size_t i = line.find("\t");
- assert(i != string::npos);
- ++i;
- if (use_b64) {
- SparseVector<double> g;
- double obj;
- if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) {
- cerr << "B64 decoder returned error, skipping!\n";
- continue;
- }
- WriteWeights(g, out);
- } else { // text encoding - your counts will not be accurate!
- SparseVector<double> weights;
- while (i < line.size()) {
- size_t start = i;
- while (line[i] != '=' && i < line.size()) ++i;
- if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; }
- string fname = line.substr(start, i - start);
- if (fname == s_obj) {
- feat = -1;
- } else {
- feat = FD::Convert(line.substr(start, i - start));
- }
- ++i;
- start = i;
- while (line[i] != ';' && i < line.size()) ++i;
- if (i - start == 0) continue;
- val = atof(line.substr(start, i - start).c_str());
- ++i;
- if (feat != -1) {
- weights.set_value(feat, val);
- }
- }
- WriteWeights(weights, out);
- }
- }
-
- return 0;
-}
diff --git a/training/pro/Makefile.am b/training/pro/Makefile.am
new file mode 100644
index 00000000..09364804
--- /dev/null
+++ b/training/pro/Makefile.am
@@ -0,0 +1,13 @@
+bin_PROGRAMS = \
+ mr_pro_map \
+ mr_pro_reduce
+
+mr_pro_map_SOURCES = mr_pro_map.cc
+mr_pro_map_LDADD = ../../training/utils/libtraining_utils.a ../../decoder/libcdec.a ../../mteval/libmteval.a ../../utils/libutils.a
+
+mr_pro_reduce_SOURCES = mr_pro_reduce.cc
+mr_pro_reduce_LDADD = ../../training/liblbfgs/liblbfgs.a ../../utils/libutils.a
+
+EXTRA_DIST = mr_pro_generate_mapper_input.pl pro.pl
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training/utils -I$(top_srcdir)/training
diff --git a/training/pro/mr_pro_generate_mapper_input.pl b/training/pro/mr_pro_generate_mapper_input.pl
new file mode 100755
index 00000000..b30fc4fd
--- /dev/null
+++ b/training/pro/mr_pro_generate_mapper_input.pl
@@ -0,0 +1,18 @@
+#!/usr/bin/perl -w
+use strict;
+
+die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1;
+my $d = shift @ARGV;
+die "Can't find directory $d" unless -d $d;
+
+opendir(DIR, $d) or die "Can't read $d: $!";
+my @hgs = grep { /\.gz$/ } readdir(DIR);
+closedir DIR;
+
+for my $hg (@hgs) {
+ my $file = $hg;
+ my $id = $hg;
+ $id =~ s/(\.json)?\.gz//;
+ print "$d/$file $id\n";
+}
+
diff --git a/training/pro/mr_pro_map.cc b/training/pro/mr_pro_map.cc
new file mode 100644
index 00000000..eef40b8a
--- /dev/null
+++ b/training/pro/mr_pro_map.cc
@@ -0,0 +1,201 @@
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <tr1/unordered_map>
+
+#include <boost/functional/hash.hpp>
+#include <boost/shared_ptr.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "candidate_set.h"
+#include "sampler.h"
+#include "filelib.h"
+#include "stringlib.h"
+#include "weights.h"
+#include "inside_outside.h"
+#include "hg_io.h"
+#include "ns.h"
+#include "ns_docscorer.h"
+
+// This is Figure 4 (Algorithm Sampler) from Hopkins&May (2011)
+
+using namespace std;
+namespace po = boost::program_options;
+
+boost::shared_ptr<MT19937> rng;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
+        ("weights,w",po::value<string>(), "[REQD] Weights file from the current iteration")
+ ("kbest_repository,K",po::value<string>()->default_value("./kbest"),"K-best list repository (directory)")
+ ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
+ ("source,s",po::value<string>()->default_value(""), "Source file (ignored, except for AER)")
+ ("evaluation_metric,m",po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)")
+ ("kbest_size,k",po::value<unsigned>()->default_value(1500u), "Top k-hypotheses to extract")
+ ("candidate_pairs,G", po::value<unsigned>()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)")
+ ("best_pairs,X", po::value<unsigned>()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)")
+ ("random_seed,S", po::value<uint32_t>(), "Random seed (if not specified, /dev/random will be used)")
+ ("help,h", "Help");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ bool flag = false;
+ if (!conf->count("reference")) {
+ cerr << "Please specify one or more references using -r <REF.TXT>\n";
+ flag = true;
+ }
+ if (!conf->count("weights")) {
+ cerr << "Please specify weights using -w <WEIGHTS.TXT>\n";
+ flag = true;
+ }
+ if (flag || conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
+struct ThresholdAlpha {
+ explicit ThresholdAlpha(double t = 0.05) : threshold(t) {}
+ double operator()(double mag) const {
+ if (mag < threshold) return 0.0; else return 1.0;
+ }
+ const double threshold;
+};
+
+struct TrainingInstance {
+ TrainingInstance(const SparseVector<weight_t>& feats, bool positive, float diff) : x(feats), y(positive), gdiff(diff) {}
+ SparseVector<weight_t> x;
+#undef DEBUGGING_PRO
+#ifdef DEBUGGING_PRO
+ vector<WordID> a;
+ vector<WordID> b;
+#endif
+ bool y;
+ float gdiff;
+};
+#ifdef DEBUGGING_PRO
+ostream& operator<<(ostream& os, const TrainingInstance& d) {
+ return os << d.gdiff << " y=" << d.y << "\tA:" << TD::GetString(d.a) << "\n\tB: " << TD::GetString(d.b) << "\n\tX: " << d.x;
+}
+#endif
+
+struct DiffOrder {
+ bool operator()(const TrainingInstance& a, const TrainingInstance& b) const {
+ return a.gdiff > b.gdiff;
+ }
+};
+
+void Sample(const unsigned gamma,
+ const unsigned xi,
+ const training::CandidateSet& J_i,
+ const EvaluationMetric* metric,
+ vector<TrainingInstance>* pv) {
+ const bool invert_score = metric->IsErrorMetric();
+ vector<TrainingInstance> v1, v2;
+ float avg_diff = 0;
+ for (unsigned i = 0; i < gamma; ++i) {
+ const size_t a = rng->inclusive(0, J_i.size() - 1)();
+ const size_t b = rng->inclusive(0, J_i.size() - 1)();
+ if (a == b) continue;
+ float ga = metric->ComputeScore(J_i[a].eval_feats);
+ float gb = metric->ComputeScore(J_i[b].eval_feats);
+ bool positive = gb < ga;
+ if (invert_score) positive = !positive;
+ const float gdiff = fabs(ga - gb);
+ if (!gdiff) continue;
+ avg_diff += gdiff;
+ SparseVector<weight_t> xdiff = (J_i[a].fmap - J_i[b].fmap).erase_zeros();
+ if (xdiff.empty()) {
+ cerr << "Empty diff:\n " << TD::GetString(J_i[a].ewords) << endl << "x=" << J_i[a].fmap << endl;
+ cerr << " " << TD::GetString(J_i[b].ewords) << endl << "x=" << J_i[b].fmap << endl;
+ continue;
+ }
+ v1.push_back(TrainingInstance(xdiff, positive, gdiff));
+#ifdef DEBUGGING_PRO
+ v1.back().a = J_i[a].hyp;
+ v1.back().b = J_i[b].hyp;
+ cerr << "N: " << v1.back() << endl;
+#endif
+ }
+  if (!v1.empty()) avg_diff /= v1.size();
+
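+  // keep a pair with probability sigma(avg_diff + gdiff), i.e. a sigmoid in
+  // the score gap shifted by the mean gap, rather than the step-function
+  // ThresholdAlpha defined above, so large-gap pairs are kept more often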
+ for (unsigned i = 0; i < v1.size(); ++i) {
+ double p = 1.0 / (1.0 + exp(-avg_diff - v1[i].gdiff));
+ // cerr << "avg_diff=" << avg_diff << " gdiff=" << v1[i].gdiff << " p=" << p << endl;
+ if (rng->next() < p) v2.push_back(v1[i]);
+ }
+  vector<TrainingInstance>::iterator mid = v2.end();
+  if (xi < v2.size()) mid = v2.begin() + xi;
+ partial_sort(v2.begin(), mid, v2.end(), DiffOrder());
+ copy(v2.begin(), mid, back_inserter(*pv));
+#ifdef DEBUGGING_PRO
+ if (v2.size() >= 5) {
+ for (int i =0; i < (mid - v2.begin()); ++i) {
+ cerr << v2[i] << endl;
+ }
+ cerr << pv->back() << endl;
+ }
+#endif
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ if (conf.count("random_seed"))
+ rng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+ else
+ rng.reset(new MT19937);
+ const string evaluation_metric = conf["evaluation_metric"].as<string>();
+
+ EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric);
+ DocumentScorer ds(metric, conf["reference"].as<vector<string> >());
+ cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl;
+
+ Hypergraph hg;
+ string last_file;
+ ReadFile in_read(conf["input"].as<string>());
+ istream &in=*in_read.stream();
+ const unsigned kbest_size = conf["kbest_size"].as<unsigned>();
+ const unsigned gamma = conf["candidate_pairs"].as<unsigned>();
+ const unsigned xi = conf["best_pairs"].as<unsigned>();
+ string weightsf = conf["weights"].as<string>();
+ vector<weight_t> weights;
+ Weights::InitFromFile(weightsf, &weights);
+ string kbest_repo = conf["kbest_repository"].as<string>();
+ MkDirP(kbest_repo);
+ while(in) {
+ vector<TrainingInstance> v;
+ string line;
+ getline(in, line);
+ if (line.empty()) continue;
+ istringstream is(line);
+ int sent_id;
+ string file;
+ // path-to-file (JSON) sent_id
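+    // e.g. "/work/pro/hgs/12.json.gz 12" (path is illustrative; one line
+    // per sentence)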
+ is >> file >> sent_id;
+ ReadFile rf(file);
+ ostringstream os;
+ training::CandidateSet J_i;
+ os << kbest_repo << "/kbest." << sent_id << ".txt.gz";
+ const string kbest_file = os.str();
+ if (FileExists(kbest_file))
+ J_i.ReadFromFile(kbest_file);
+ HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+ hg.Reweight(weights);
+ J_i.AddKBestCandidates(hg, kbest_size, ds[sent_id]);
+ J_i.WriteToFile(kbest_file);
+
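+    // J_i now holds the union of this iteration's k-best list and any
+    // candidates accumulated in the repository from earlier iterations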
+ Sample(gamma, xi, J_i, metric, &v);
+ for (unsigned i = 0; i < v.size(); ++i) {
+ const TrainingInstance& vi = v[i];
+ cout << vi.y << "\t" << vi.x << endl;
+ cout << (!vi.y) << "\t" << (vi.x * -1.0) << endl;
+ }
+ }
+ return 0;
+}
+
diff --git a/training/pro/mr_pro_reduce.cc b/training/pro/mr_pro_reduce.cc
new file mode 100644
index 00000000..5ef9b470
--- /dev/null
+++ b/training/pro/mr_pro_reduce.cc
@@ -0,0 +1,286 @@
+#include <cstdlib>
+#include <sstream>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "filelib.h"
+#include "weights.h"
+#include "sparse_vector.h"
+#include "optimize.h"
+#include "liblbfgs/lbfgs++.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+// since this is a ranking model, there should be equal numbers of
+// positive and negative examples, so the bias should be 0
+static const double MAX_BIAS = 1e-10;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation")
+ ("regularization_strength,C",po::value<double>()->default_value(500.0), "l2 regularization strength")
+ ("l1",po::value<double>()->default_value(0.0), "l1 regularization strength")
+ ("regularize_to_weights,y",po::value<double>()->default_value(5000.0), "Differences in learned weights to previous weights are penalized with an l2 penalty with this strength; 0.0 = no effect")
+ ("memory_buffers,m",po::value<unsigned>()->default_value(100), "Number of memory buffers (LBFGS)")
+ ("min_reg,r",po::value<double>()->default_value(0.01), "When tuning (-T) regularization strength, minimum regularization strenght")
+ ("max_reg,R",po::value<double>()->default_value(1e6), "When tuning (-T) regularization strength, maximum regularization strenght")
+ ("testset,t",po::value<string>(), "Optional held-out test set")
+ ("tune_regularizer,T", "Use the held out test set (-t) to tune the regularization strength")
+ ("interpolate_with_weights,p",po::value<double>()->default_value(1.0), "[deprecated] Output weights are p*w + (1-p)*w_prev; 1.0 = no effect")
+ ("help,h", "Help");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ if (conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
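+// Parses the feature-vector half of a training line: a space-separated list
+// of Name=value pairs, e.g. "LogP=-1.5 WordPenalty=3" (feature names are
+// illustrative). last_comma marks the position of '='; the name appears to
+// be a holdover from an older delimiter.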
+void ParseSparseVector(string& line, size_t cur, SparseVector<weight_t>* out) {
+ SparseVector<weight_t>& x = *out;
+ size_t last_start = cur;
+ size_t last_comma = string::npos;
+ while(cur <= line.size()) {
+ if (line[cur] == ' ' || cur == line.size()) {
+ if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {
+ cerr << "[ERROR] " << line << endl << " position = " << cur << endl;
+ exit(1);
+ }
+ const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
+ if (cur < line.size()) line[cur] = 0;
+ const weight_t val = strtod(&line[last_comma + 1], NULL);
+ x.set_value(fid, val);
+
+ last_comma = string::npos;
+ last_start = cur+1;
+ } else {
+ if (line[cur] == '=')
+ last_comma = cur;
+ }
+ ++cur;
+ }
+}
+
+void ReadCorpus(istream* pin, vector<pair<bool, SparseVector<weight_t> > >* corpus) {
+ istream& in = *pin;
+ corpus->clear();
+ bool flag = false;
+ int lc = 0;
+ string line;
+ SparseVector<weight_t> x;
+ while(getline(in, line)) {
+ ++lc;
+ if (lc % 1000 == 0) { cerr << '.'; flag = true; }
+ if (lc % 40000 == 0) { cerr << " [" << lc << "]\n"; flag = false; }
+ if (line.empty()) continue;
+ const size_t ks = line.find("\t");
+ assert(string::npos != ks);
+ assert(ks == 1);
+ const bool y = line[0] == '1';
+ x.clear();
+ ParseSparseVector(line, ks + 1, &x);
+ corpus->push_back(make_pair(y, x));
+ }
+ if (flag) cerr << endl;
+}
+
+void GradAdd(const SparseVector<weight_t>& v, const double scale, weight_t* acc) {
+ for (SparseVector<weight_t>::const_iterator it = v.begin();
+ it != v.end(); ++it) {
+ acc[it->first] += it->second * scale;
+ }
+}
+
+double ApplyRegularizationTerms(const double C,
+ const double T,
+ const vector<weight_t>& weights,
+ const vector<weight_t>& prev_weights,
+ weight_t* g) {
+ double reg = 0;
+ for (size_t i = 0; i < weights.size(); ++i) {
+ const double prev_w_i = (i < prev_weights.size() ? prev_weights[i] : 0.0);
+ const double& w_i = weights[i];
+ reg += C * w_i * w_i;
+ g[i] += 2 * C * w_i;
+
+ const double diff_i = w_i - prev_w_i;
+ reg += T * diff_i * diff_i;
+ g[i] += 2 * T * diff_i;
+ }
+ return reg;
+}
+
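+// Computes the negative conditional log-likelihood of the corpus under a
+// logistic regression model and, if g is non-NULL, accumulates its gradient.
+// log1p(exp(.)) is branched on the sign of the dot product so the larger
+// exponent is never exponentiated directly, avoiding overflow.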
+double TrainingInference(const vector<weight_t>& x,
+ const vector<pair<bool, SparseVector<weight_t> > >& corpus,
+ weight_t* g = NULL) {
+ double cll = 0;
+ for (int i = 0; i < corpus.size(); ++i) {
+ const double dotprod = corpus[i].second.dot(x) + (x.size() ? x[0] : weight_t()); // x[0] is bias
+ double lp_false = dotprod;
+ double lp_true = -dotprod;
+ if (0 < lp_true) {
+ lp_true += log1p(exp(-lp_true));
+ lp_false = log1p(exp(lp_false));
+ } else {
+ lp_true = log1p(exp(lp_true));
+ lp_false += log1p(exp(-lp_false));
+ }
+ lp_true*=-1;
+ lp_false*=-1;
+ if (corpus[i].first) { // true label
+ cll -= lp_true;
+ if (g) {
+ // g -= corpus[i].second * exp(lp_false);
+ GradAdd(corpus[i].second, -exp(lp_false), g);
+ g[0] -= exp(lp_false); // bias
+ }
+ } else { // false label
+ cll -= lp_false;
+ if (g) {
+ // g += corpus[i].second * exp(lp_true);
+ GradAdd(corpus[i].second, exp(lp_true), g);
+ g[0] += exp(lp_true); // bias
+ }
+ }
+ }
+ return cll;
+}
+
+struct ProLoss {
+ ProLoss(const vector<pair<bool, SparseVector<weight_t> > >& tr,
+ const vector<pair<bool, SparseVector<weight_t> > >& te,
+ const double c,
+ const double t,
+ const vector<weight_t>& px) : training(tr), testing(te), C(c), T(t), prev_x(px){}
+ double operator()(const vector<double>& x, double* g) const {
+ fill(g, g + x.size(), 0.0);
+ double cll = TrainingInference(x, training, g);
+ tppl = 0;
+ if (testing.size())
+      tppl = pow(2.0, TrainingInference(x, testing) / (log(2) * testing.size())); // held-out data must not contribute to the gradient
+ double ppl = cll / log(2);
+ ppl /= training.size();
+ ppl = pow(2.0, ppl);
+ double reg = ApplyRegularizationTerms(C, T, x, prev_x, g);
+ return cll + reg;
+ }
+ const vector<pair<bool, SparseVector<weight_t> > >& training, testing;
+ const double C, T;
+ const vector<double>& prev_x;
+ mutable double tppl;
+};
+
+// returns the held-out perplexity
+double LearnParameters(const vector<pair<bool, SparseVector<weight_t> > >& training,
+ const vector<pair<bool, SparseVector<weight_t> > >& testing,
+ const double C,
+ const double C1,
+ const double T,
+ const unsigned memory_buffers,
+ const vector<weight_t>& prev_x,
+ vector<weight_t>* px) {
+ assert(px->size() == prev_x.size());
+ ProLoss loss(training, testing, C, T, prev_x);
+ LBFGS<ProLoss> lbfgs(px, loss, memory_buffers, C1);
+ lbfgs.MinimizeFunction();
+ return loss.tppl;
+}
+
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ string line;
+ vector<pair<bool, SparseVector<weight_t> > > training, testing;
+ const bool tune_regularizer = conf.count("tune_regularizer");
+ if (tune_regularizer && !conf.count("testset")) {
+ cerr << "--tune_regularizer requires --testset to be set\n";
+ return 1;
+ }
+ const double min_reg = conf["min_reg"].as<double>();
+ const double max_reg = conf["max_reg"].as<double>();
+ double C = conf["regularization_strength"].as<double>(); // will be overridden if parameter is tuned
+ double C1 = conf["l1"].as<double>(); // will be overridden if parameter is tuned
+ const double T = conf["regularize_to_weights"].as<double>();
+ assert(C >= 0.0);
+ assert(min_reg >= 0.0);
+ assert(max_reg >= 0.0);
+ assert(max_reg > min_reg);
+ const double psi = conf["interpolate_with_weights"].as<double>();
+ if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; return 1; }
+ ReadCorpus(&cin, &training);
+ if (conf.count("testset")) {
+ ReadFile rf(conf["testset"].as<string>());
+ ReadCorpus(rf.stream(), &testing);
+ }
+ cerr << "Number of features: " << FD::NumFeats() << endl;
+
+ vector<weight_t> x, prev_x; // x[0] is bias
+ if (conf.count("weights")) {
+ Weights::InitFromFile(conf["weights"].as<string>(), &x);
+ x.resize(FD::NumFeats());
+ prev_x = x;
+ } else {
+ x.resize(FD::NumFeats());
+ prev_x = x;
+ }
+ cerr << " Number of features: " << x.size() << endl;
+ cerr << "Number of training examples: " << training.size() << endl;
+ cerr << "Number of testing examples: " << testing.size() << endl;
+ double tppl = 0.0;
+ vector<pair<double,double> > sp;
+ vector<double> smoothed;
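+  // With -T, sweep C geometrically from min_reg to max_reg in 18 steps,
+  // smooth the held-out perplexities with a 0.2/0.6/0.2 window, and keep the
+  // C with the lowest smoothed perplexity for the final fit below.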
+ if (tune_regularizer) {
+ C = min_reg;
+ const double steps = 18;
+ double sweep_factor = exp((log(max_reg) - log(min_reg)) / steps);
+ cerr << "SWEEP FACTOR: " << sweep_factor << endl;
+ while(C < max_reg) {
+ cerr << "C=" << C << "\tT=" <<T << endl;
+ tppl = LearnParameters(training, testing, C, C1, T, conf["memory_buffers"].as<unsigned>(), prev_x, &x);
+ sp.push_back(make_pair(C, tppl));
+ C *= sweep_factor;
+ }
+ smoothed.resize(sp.size(), 0);
+ smoothed[0] = sp[0].second;
+ smoothed.back() = sp.back().second;
+ for (int i = 1; i < sp.size()-1; ++i) {
+ double prev = sp[i-1].second;
+ double next = sp[i+1].second;
+ double cur = sp[i].second;
+ smoothed[i] = (prev*0.2) + cur * 0.6 + (0.2*next);
+ }
+ double best_ppl = 9999999;
+ unsigned best_i = 0;
+ for (unsigned i = 0; i < sp.size(); ++i) {
+ if (smoothed[i] < best_ppl) {
+ best_ppl = smoothed[i];
+ best_i = i;
+ }
+ }
+ C = sp[best_i].first;
+ } // tune regularizer
+ tppl = LearnParameters(training, testing, C, C1, T, conf["memory_buffers"].as<unsigned>(), prev_x, &x);
+ if (conf.count("weights")) {
+ for (int i = 1; i < x.size(); ++i) {
+ x[i] = (x[i] * psi) + prev_x[i] * (1.0 - psi);
+ }
+ }
+ cout.precision(15);
+ cout << "# C=" << C << "\theld out perplexity=";
+ if (tppl) { cout << tppl << endl; } else { cout << "N/A\n"; }
+ if (sp.size()) {
+ cout << "# Parameter sweep:\n";
+ for (int i = 0; i < sp.size(); ++i) {
+ cout << "# " << sp[i].first << "\t" << sp[i].second << "\t" << smoothed[i] << endl;
+ }
+ }
+ Weights::WriteToFile("-", x);
+ return 0;
+}
diff --git a/training/pro/pro.pl b/training/pro/pro.pl
new file mode 100755
index 00000000..3b30c379
--- /dev/null
+++ b/training/pro/pro.pl
@@ -0,0 +1,555 @@
+#!/usr/bin/env perl
+use strict;
+use File::Basename qw(basename);
+my @ORIG_ARGV=@ARGV;
+use Cwd qw(getcwd);
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; }
+
+# Skip local config (used for distributing jobs) if we're running in local-only mode
+use LocalConfig;
+use Getopt::Long;
+use IPC::Open2;
+use POSIX ":sys_wait_h";
+my $QSUB_CMD = qsub_args(mert_memory());
+my $default_jobs = env_default_jobs();
+
+my $UTILS_DIR="$SCRIPT_DIR/../utils";
+require "$UTILS_DIR/libcall.pl";
+
+# Default settings
+my $srcFile;
+my $refFiles;
+my $bin_dir = $SCRIPT_DIR;
+die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
+my $FAST_SCORE="$bin_dir/../../mteval/fast_score";
+die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
+my $MAPINPUT = "$bin_dir/mr_pro_generate_mapper_input.pl";
+my $MAPPER = "$bin_dir/mr_pro_map";
+my $REDUCER = "$bin_dir/mr_pro_reduce";
+my $parallelize = "$UTILS_DIR/parallelize.pl";
+my $libcall = "$UTILS_DIR/libcall.pl";
+my $sentserver = "$UTILS_DIR/sentserver";
+my $sentclient = "$UTILS_DIR/sentclient";
+my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm";
+
+my $SCORER = $FAST_SCORE;
+die "Can't find $MAPPER" unless -x $MAPPER;
+my $cdec = "$bin_dir/../../decoder/cdec";
+die "Can't find decoder in $cdec" unless -x $cdec;
+die "Can't find $parallelize" unless -x $parallelize;
+die "Can't find $libcall" unless -e $libcall;
+my $decoder = $cdec;
+my $lines_per_mapper = 30;
+my $iteration = 1;
+my $best_weights;
+my $psi = 1;
+my $default_max_iter = 30;
+my $max_iterations = $default_max_iter;
+my $jobs = $default_jobs; # number of decode nodes
+my $pmem = "4g";
+my $disable_clean = 0;
+my %seen_weights;
+my $help = 0;
+my $epsilon = 0.0001;
+my $dryrun = 0;
+my $last_score = -10000000;
+my $metric = "ibm_bleu";
+my $dir;
+my $iniFile;
+my $weights;
+my $use_make = 1; # use make to parallelize
+my $useqsub = 0;
+my $initial_weights;
+my $pass_suffix = '';
+my $devset;
+
+# regularization strength
+my $reg = 500;
+my $reg_previous = 5000;
+
+# Process command-line options
+if (GetOptions(
+ "config=s" => \$iniFile,
+ "weights=s" => \$initial_weights,
+ "devset=s" => \$devset,
+ "jobs=i" => \$jobs,
+ "metric=s" => \$metric,
+ "pass-suffix=s" => \$pass_suffix,
+ "qsub" => \$useqsub,
+ "help" => \$help,
+ "reg=f" => \$reg,
+ "reg-previous=f" => \$reg_previous,
+ "output-dir=s" => \$dir,
+) == 0 || @ARGV!=0 || $help) {
+ print_help();
+ exit;
+}
+
+if ($useqsub) {
+ $use_make = 0;
+ die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
+}
+
+my @missing_args = ();
+if (!defined $iniFile) { push @missing_args, "--config"; }
+if (!defined $devset) { push @missing_args, "--devset"; }
+if (!defined $initial_weights) { push @missing_args, "--weights"; }
+die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args);
+
+if ($metric =~ /^(combi|ter)$/i) {
+ $lines_per_mapper = 5;
+}
+
+my $host = check_output("hostname"); chomp $host;
+my $bleu;
+my $interval_count = 0;
+my $logfile;
+my $projected_score;
+
+# used in sorting scores
+my $DIR_FLAG = '-r';
+if ($metric =~ /^ter$|^aer$/i) {
+ $DIR_FLAG = '';
+}
+
+unless ($dir){
+ $dir = 'pro';
+}
+unless ($dir =~ /^\//){ # convert relative path to absolute path
+ my $basedir = check_output("pwd");
+ chomp $basedir;
+ $dir = "$basedir/$dir";
+}
+
+# Initializations and helper functions
+srand;
+
+my @childpids = ();
+my @cleanupcmds = ();
+
+sub cleanup {
+ print STDERR "Cleanup...\n";
+ for my $pid (@childpids){ unchecked_call("kill $pid"); }
+ for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); }
+ exit 1;
+};
+# Always call cleanup, no matter how we exit
+*CORE::GLOBAL::exit =
+ sub{ cleanup(); };
+$SIG{INT} = "cleanup";
+$SIG{TERM} = "cleanup";
+$SIG{HUP} = "cleanup";
+
+my $decoderBase = check_output("basename $decoder"); chomp $decoderBase;
+my $newIniFile = "$dir/$decoderBase.ini";
+my $inputFileName = "$dir/input";
+my $user = $ENV{"USER"};
+
+
+# process ini file
+-e $iniFile || die "Error: could not open $iniFile for reading\n";
+open(INI, $iniFile);
+
+if (-e $dir) {
+ die "ERROR: working dir $dir already exists\n\n";
+} else {
+ mkdir "$dir" or die "Can't mkdir $dir: $!";
+ mkdir "$dir/hgs" or die;
+ mkdir "$dir/scripts" or die;
+ print STDERR <<EOT;
+ DECODER: $decoder
+ INI FILE: $iniFile
+ WORKING DIR: $dir
+ DEVSET: $devset
+ EVAL METRIC: $metric
+ MAX ITERATIONS: $max_iterations
+ PARALLEL JOBS: $jobs
+ HEAD NODE: $host
+ PMEM (DECODING): $pmem
+ INITIAL WEIGHTS: $initial_weights
+EOT
+}
+
+# Generate initial files and values
+check_call("cp $iniFile $newIniFile");
+check_call("cp $initial_weights $dir/weights.0");
+$iniFile = $newIniFile;
+
+my $refs = "$dir/dev.refs";
+split_devset($devset, "$dir/dev.input.raw", $refs);
+my $newsrc = "$dir/dev.input";
+enseg("$dir/dev.input.raw", $newsrc);
+$srcFile = $newsrc;
+my $devSize = 0;
+open F, "<$srcFile" or die "Can't read $srcFile: $!";
+while(<F>) { $devSize++; }
+close F;
+
+unless($best_weights){ $best_weights = $weights; }
+unless($projected_score){ $projected_score = 0.0; }
+$seen_weights{$weights} = 1;
+
+my $random_seed = int(time / 1000);
+my $lastWeightsFile;
+my $lastPScore = 0;
+# main optimization loop
+my @allweights;
+while (1){
+ print STDERR "\n\nITERATION $iteration\n==========\n";
+
+ if ($iteration > $max_iterations){
+ print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n";
+ last;
+ }
+ # iteration-specific files
+ my $runFile="$dir/run.raw.$iteration";
+ my $onebestFile="$dir/1best.$iteration";
+ my $logdir="$dir/logs.$iteration";
+ my $decoderLog="$logdir/decoder.sentserver.log.$iteration";
+ my $scorerLog="$logdir/scorer.log.$iteration";
+ check_call("mkdir -p $logdir");
+
+
+ #decode
+ print STDERR "RUNNING DECODER AT ";
+ print STDERR unchecked_output("date");
+ my $im1 = $iteration - 1;
+ my $weightsFile="$dir/weights.$im1";
+ push @allweights, "-w $dir/weights.$im1";
+ `rm -f $dir/hgs/*.gz`;
+ my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
+ my $pcmd;
+ if ($use_make) {
+ $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --";
+ } else {
+ $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --";
+ }
+ my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_bash_call($cmd);
+ my $num_hgs;
+ my $num_topbest;
+ my $retries = 0;
+ while($retries < 5) {
+ $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l");
+ $num_topbest = check_output("wc -l < $runFile");
+ print STDERR "NUMBER OF HGs: $num_hgs\n";
+ print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n";
+ if($devSize == $num_hgs && $devSize == $num_topbest) {
+ last;
+ } else {
+ print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n";
+ sleep(3);
+ }
+ $retries++;
+ }
+ die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest);
+ my $dec_score = check_output("cat $runFile | $SCORER -r $refs -m $metric");
+ chomp $dec_score;
+ print STDERR "DECODER SCORE: $dec_score\n";
+
+ # save space
+ check_call("gzip -f $runFile");
+ check_call("gzip -f $decoderLog");
+
+ # run optimizer
+ print STDERR "RUNNING OPTIMIZER AT ";
+ print STDERR unchecked_output("date");
+ print STDERR " - GENERATE TRAINING EXEMPLARS\n";
+ my $mergeLog="$logdir/prune-merge.log.$iteration";
+
+ my $score = 0;
+ my $icc = 0;
+ my $inweights="$dir/weights.$im1";
+ $cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_call($cmd);
+ check_call("mkdir -p $dir/splag.$im1");
+ $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1 $dir/splag.$im1/mapinput.";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_call($cmd);
+ opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!";
+ my @shards = grep { /^mapinput\./ } readdir(DIR);
+ closedir DIR;
+ die "No shards!" unless scalar @shards > 0;
+ my $joblist = "";
+ my $nmappers = 0;
+ @cleanupcmds = ();
+ my %o2i = ();
+ my $first_shard = 1;
+ my $mkfile; # only used with makefiles
+ my $mkfilename;
+ if ($use_make) {
+ $mkfilename = "$dir/splag.$im1/domap.mk";
+ open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!";
+ print $mkfile "all: $dir/splag.$im1/map.done\n\n";
+ }
+ my @mkouts = (); # only used with makefiles
+ my @mapoutputs = ();
+ for my $shard (@shards) {
+ my $mapoutput = $shard;
+ my $client_name = $shard;
+ $client_name =~ s/mapinput.//;
+ $client_name = "pro.$client_name";
+ $mapoutput =~ s/mapinput/mapoutput/;
+ push @mapoutputs, "$dir/splag.$im1/$mapoutput";
+ $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
+ my $script = "$MAPPER -s $srcFile -m $metric -r $refs -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput";
+ if ($use_make) {
+ my $script_file = "$dir/scripts/map.$shard";
+ open F, ">$script_file" or die "Can't write $script_file: $!";
+ print F "#!/bin/bash\n";
+ print F "$script\n";
+ close F;
+ my $output = "$dir/splag.$im1/$mapoutput";
+ push @mkouts, $output;
+ chmod(0755, $script_file) or die "Can't chmod $script_file: $!";
+ if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
+ print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n";
+ } else {
+ my $script_file = "$dir/scripts/map.$shard";
+ open F, ">$script_file" or die "Can't write $script_file: $!";
+ print F "$script\n";
+ close F;
+ if ($first_shard) { print STDERR "$script\n"; $first_shard=0; }
+
+ $nmappers++;
+ my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file";
+ my $jobid = check_output("$qcmd");
+ chomp $jobid;
+ $jobid =~ s/^(\d+)(.*?)$/\1/g;
+ $jobid =~ s/^Your job (\d+) .*$/\1/;
+ push(@cleanupcmds, "qdel $jobid 2> /dev/null");
+ print STDERR " $jobid";
+ if ($joblist == "") { $joblist = $jobid; }
+ else {$joblist = $joblist . "\|" . $jobid; }
+ }
+ }
+ my @dev_outs = ();
+ my @devtest_outs = ();
+ @dev_outs = @mapoutputs;
+ if ($use_make) {
+ print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n";
+ close $mkfile;
+ my $mcmd = "make -j $jobs -f $mkfilename";
+ print STDERR "\nExecuting: $mcmd\n";
+ check_call($mcmd);
+ } else {
+ print STDERR "\nLaunched $nmappers mappers.\n";
+ sleep 8;
+ print STDERR "Waiting for mappers to complete...\n";
+ while ($nmappers > 0) {
+ sleep 5;
+ my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '")));
+ $nmappers = scalar @livejobs;
+ }
+ print STDERR "All mappers complete.\n";
+ }
+ my $tol = 0;
+ my $til = 0;
+ my $dev_test_file = "$dir/splag.$im1/devtest.gz";
+ print STDERR "\nRUNNING CLASSIFIER (REDUCER)\n";
+ print STDERR unchecked_output("date");
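+  # Each mapper output line is a training example of the form
+  #   <label> TAB <Name=value ...>
+  # (the format mr_pro_reduce expects); the reducer fits an l2-regularized
+  # logistic regression on these examples to produce the next weight vector.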
+ $cmd="cat @dev_outs | $REDUCER -w $dir/weights.$im1 -C $reg -y $reg_previous --interpolate_with_weights $psi";
+ $cmd .= " > $dir/weights.$iteration";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_bash_call($cmd);
+ $lastWeightsFile = "$dir/weights.$iteration";
+ $lastPScore = $score;
+ $iteration++;
+ print STDERR "\n==========\n";
+}
+
+
+check_call("cp $lastWeightsFile $dir/weights.final");
+print STDERR "\nFINAL WEIGHTS: $dir/weights.final\n(Use -w <this file> with the decoder)\n\n";
+print STDOUT "$dir/weights.final\n";
+
+exit 0;
+
+sub read_weights_file {
+ my ($file) = @_;
+ open F, "<$file" or die "Couldn't read $file: $!";
+ my @r = ();
+ my $pm = -1;
+ while(<F>) {
+ next if /^#/;
+ next if /^\s*$/;
+ chomp;
+ if (/^(.+)\s+(.+)$/) {
+ my $m = $1;
+ my $w = $2;
+ die "Weights out of order: $m <= $pm" unless $m > $pm;
+ push @r, $w;
+ } else {
+ warn "Unexpected feature name in weight file: $_";
+ }
+ }
+ close F;
+ return join ' ', @r;
+}
+
+sub enseg {
+ my $src = shift;
+ my $newsrc = shift;
+ open(SRC, $src);
+ open(NEWSRC, ">$newsrc");
+ my $i=0;
+ while (my $line=<SRC>){
+ chomp $line;
+ if ($line =~ /^\s*<seg/i) {
+ if($line =~ /id="[0-9]+"/) {
+ print NEWSRC "$line\n";
+ } else {
+ die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+ }
+ } else {
+ print NEWSRC "<seg id=\"$i\">$line</seg>\n";
+ }
+ $i++;
+ }
+ close SRC;
+ close NEWSRC;
+ die "Empty dev set!" if ($i == 0);
+}
+
+sub print_help {
+
+ my $executable = basename($0); chomp $executable;
+ print << "Help";
+
+Usage: $executable [options]
+
+ $executable [options]
+ Runs a complete PRO optimization using the ini file specified.
+
+Required:
+
+ --config <cdec.ini>
+ Decoder configuration file.
+
+ --devset <files>
+ Dev set source and reference data.
+
+ --weights <file>
+ Initial weights file (use empty file to start from 0)
+
+General options:
+
+ --help
+ Print this message and exit.
+
+ --max-iterations <M>
+ Maximum number of iterations to run. If not specified, defaults
+ to $default_max_iter.
+
+ --metric <method>
+ Metric to optimize.
+ Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
+
+ --pass-suffix <S>
+ If the decoder is doing multi-pass decoding, the pass suffix "2",
+ "3", etc., is used to control what iteration of weights is set.
+
+ --workdir <dir>
+ Directory for intermediate and output files. If not specified, the
+ name is derived from the ini filename. Assuming that the ini
+ filename begins with the decoder name and ends with ini, the default
+ name of the working directory is inferred from the middle part of
+ the filename. E.g. an ini file named decoder.foo.ini would have
+ a default working directory name foo.
+
+Regularization options:
+
+ --reg <F>
+ l2 regularization strength [default=500]. The greater this value,
+ the closer to zero the weights will be.
+
+ --reg-previous <F>
+ l2 penalty for moving away from the weights from the previous
+ iteration. [default=5000]. The greater this value, the closer
+ to the previous iteration's weights the next iteration's weights
+ will be.
+
+Job control options:
+
+ --jobs <I>
+ Number of decoder processes to run in parallel. [default=$default_jobs]
+
+ --qsub
+ Use qsub to run jobs in parallel (qsub must be configured in
+ environment/LocalEnvironment.pm)
+
+ --pmem <N>
+ Amount of physical memory requested for parallel decoding jobs
+ (used with qsub requests only)
+
+Deprecated options:
+
+ --interpolate-with-weights <F>
+ [deprecated] At each iteration the resulting weights are
+ interpolated with the weights from the previous iteration, with
+ this factor. [default=1.0, i.e., no effect]
+
+Help
+}
+
+sub convert {
+ my ($str) = @_;
+ my @ps = split /;/, $str;
+ my %dict = ();
+ for my $p (@ps) {
+ my ($k, $v) = split /=/, $p;
+ $dict{$k} = $v;
+ }
+ return %dict;
+}
+
+
+sub cmdline {
+ return join ' ',($0,@ORIG_ARGV);
+}
+
+#buggy: last arg gets quoted sometimes?
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]};
+my $shell_escape_in_quote=qr{[\\"\$`!]};
+
+sub escape_shell {
+ my ($arg)=@_;
+ return undef unless defined $arg;
+ if ($arg =~ /$is_shell_special/) {
+ $arg =~ s/($shell_escape_in_quote)/\\$1/g;
+ return "\"$arg\"";
+ }
+ return $arg;
+}
+
+sub escaped_shell_args {
+ return map {local $_=$_;chomp;escape_shell($_)} @_;
+}
+
+sub escaped_shell_args_str {
+ return join ' ',&escaped_shell_args(@_);
+}
+
+sub escaped_cmdline {
+ return "$0 ".&escaped_shell_args_str(@ORIG_ARGV);
+}
+
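+# Splits a devset whose lines have the form
+#   <source> ||| <ref1> ||| <ref2> ...
+# into a source file and a parallel reference file.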
+sub split_devset {
+ my ($infile, $outsrc, $outref) = @_;
+ open F, "<$infile" or die "Can't read $infile: $!";
+ open S, ">$outsrc" or die "Can't write $outsrc: $!";
+ open R, ">$outref" or die "Can't write $outref: $!";
+ while(<F>) {
+ chomp;
+ my ($src, @refs) = split /\s*\|\|\|\s*/;
+ die "Malformed devset line: $_\n" unless scalar @refs > 0;
+ print S "$src\n";
+ print R join(' ||| ', @refs) . "\n";
+ }
+ close R;
+ close S;
+ close F;
+}
+
diff --git a/training/rampion/Makefile.am b/training/rampion/Makefile.am
new file mode 100644
index 00000000..c72283cd
--- /dev/null
+++ b/training/rampion/Makefile.am
@@ -0,0 +1,8 @@
+bin_PROGRAMS = rampion_cccp
+
+rampion_cccp_SOURCES = rampion_cccp.cc
+rampion_cccp_LDADD = ../../training/utils/libtraining_utils.a ../../decoder/libcdec.a ../../mteval/libmteval.a ../../utils/libutils.a
+
+EXTRA_DIST = rampion.pl rampion_generate_input.pl
+
+AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training/utils
diff --git a/training/rampion/rampion.pl b/training/rampion/rampion.pl
new file mode 100755
index 00000000..ae084db6
--- /dev/null
+++ b/training/rampion/rampion.pl
@@ -0,0 +1,540 @@
+#!/usr/bin/env perl
+use strict;
+my @ORIG_ARGV=@ARGV;
+use Cwd qw(getcwd);
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; }
+
+# Skip local config (used for distributing jobs) if we're running in local-only mode
+use LocalConfig;
+use Getopt::Long;
+use IPC::Open2;
+use POSIX ":sys_wait_h";
+my $QSUB_CMD = qsub_args(mert_memory());
+my $default_jobs = env_default_jobs();
+
+my $UTILS_DIR="$SCRIPT_DIR/../utils";
+require "$UTILS_DIR/libcall.pl";
+
+# Default settings
+my $srcFile;
+my $refFiles;
+my $bin_dir = $SCRIPT_DIR;
+die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
+my $FAST_SCORE="$bin_dir/../../mteval/fast_score";
+die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
+my $MAPINPUT = "$bin_dir/rampion_generate_input.pl";
+my $MAPPER = "$bin_dir/rampion_cccp";
+my $parallelize = "$UTILS_DIR/parallelize.pl";
+my $libcall = "$UTILS_DIR/libcall.pl";
+my $sentserver = "$UTILS_DIR/sentserver";
+my $sentclient = "$UTILS_DIR/sentclient";
+my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm";
+
+my $SCORER = $FAST_SCORE;
+die "Can't find $MAPPER" unless -x $MAPPER;
+my $cdec = "$bin_dir/../../decoder/cdec";
+die "Can't find decoder in $cdec" unless -x $cdec;
+die "Can't find $parallelize" unless -x $parallelize;
+die "Can't find $libcall" unless -e $libcall;
+my $decoder = $cdec;
+my $lines_per_mapper = 30;
+my $iteration = 1;
+my $best_weights;
+my $psi = 1;
+my $default_max_iter = 30;
+my $max_iterations = $default_max_iter;
+my $jobs = $default_jobs; # number of decode nodes
+my $pmem = "4g";
+my $disable_clean = 0;
+my %seen_weights;
+my $help = 0;
+my $epsilon = 0.0001;
+my $dryrun = 0;
+my $last_score = -10000000;
+my $metric = "ibm_bleu";
+my $dir;
+my $iniFile;
+my $weights;
+my $use_make = 1; # use make to parallelize
+my $useqsub = 0;
+my $initial_weights;
+my $pass_suffix = '';
+my $cpbin=1;
+
+# regularization strength
+my $tune_regularizer = 0;
+my $reg = 500;
+my $reg_previous = 5000;
+my $dont_accum = 0;
+
+# Process command-line options
+Getopt::Long::Configure("no_auto_abbrev");
+if (GetOptions(
+ "jobs=i" => \$jobs,
+ "dont-clean" => \$disable_clean,
+ "dont-accumulate" => \$dont_accum,
+ "pass-suffix=s" => \$pass_suffix,
+ "qsub" => \$useqsub,
+ "dry-run" => \$dryrun,
+ "epsilon=s" => \$epsilon,
+ "help" => \$help,
+ "weights=s" => \$initial_weights,
+ "reg=f" => \$reg,
+ "use-make=i" => \$use_make,
+ "max-iterations=i" => \$max_iterations,
+ "pmem=s" => \$pmem,
+ "cpbin!" => \$cpbin,
+ "ref-files=s" => \$refFiles,
+ "metric=s" => \$metric,
+ "source-file=s" => \$srcFile,
+ "workdir=s" => \$dir,
+) == 0 || @ARGV!=1 || $help) {
+ print_help();
+ exit;
+}
+
+die "--tune-regularizer is no longer supported with --reg-previous and --reg. Please tune manually.\n" if $tune_regularizer;
+
+if ($useqsub) {
+ $use_make = 0;
+ die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
+}
+
+my @missing_args = ();
+if (!defined $srcFile) { push @missing_args, "--source-file"; }
+if (!defined $refFiles) { push @missing_args, "--ref-files"; }
+if (!defined $initial_weights) { push @missing_args, "--weights"; }
+die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args);
+
+if ($metric =~ /^(combi|ter)$/i) {
+ $lines_per_mapper = 5;
+}
+
+($iniFile) = @ARGV;
+
+
+sub write_config;
+sub enseg;
+sub print_help;
+
+my $nodelist;
+my $host = check_output("hostname"); chomp $host;
+my $bleu;
+my $interval_count = 0;
+my $logfile;
+my $projected_score;
+
+# used in sorting scores
+my $DIR_FLAG = '-r';
+if ($metric =~ /^ter$|^aer$/i) {
+ $DIR_FLAG = '';
+}
+
+my $refs_comma_sep = get_comma_sep_refs('r',$refFiles);
+
+unless ($dir){
+ $dir = "rampion";
+}
+unless ($dir =~ /^\//){ # convert relative path to absolute path
+ my $basedir = check_output("pwd");
+ chomp $basedir;
+ $dir = "$basedir/$dir";
+}
+
+
+# Initializations and helper functions
+srand;
+
+my @childpids = ();
+my @cleanupcmds = ();
+
+sub cleanup {
+ print STDERR "Cleanup...\n";
+ for my $pid (@childpids){ unchecked_call("kill $pid"); }
+ for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); }
+ exit 1;
+};
+# Always call cleanup, no matter how we exit
+*CORE::GLOBAL::exit =
+ sub{ cleanup(); };
+$SIG{INT} = "cleanup";
+$SIG{TERM} = "cleanup";
+$SIG{HUP} = "cleanup";
+
+my $decoderBase = check_output("basename $decoder"); chomp $decoderBase;
+my $newIniFile = "$dir/$decoderBase.ini";
+my $inputFileName = "$dir/input";
+my $user = $ENV{"USER"};
+# process ini file
+-e $iniFile || die "Error: could not open $iniFile for reading\n";
+open(INI, $iniFile);
+
+use File::Basename qw(basename);
+#pass bindir, refs to vars holding bin
+sub modbin {
+ local $_;
+ my $bindir=shift;
+ check_call("mkdir -p $bindir");
+ -d $bindir || die "couldn't make bindir $bindir";
+ for (@_) {
+ my $src=$$_;
+ $$_="$bindir/".basename($src);
+ check_call("cp -p $src $$_");
+ }
+}
+sub dirsize {
+ opendir ISEMPTY,$_[0];
+ return scalar(readdir(ISEMPTY))-1;
+}
+my @allweights;
+if ($dryrun){
+ write_config(*STDERR);
+ exit 0;
+} else {
+ if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-pro.pl outputs
+ die "ERROR: working dir $dir already exists\n\n";
+ } else {
+ -e $dir || mkdir $dir;
+ mkdir "$dir/hgs";
+ modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin;
+ mkdir "$dir/scripts";
+    my $cmdfile="$dir/rerun-rampion.sh";
+ open CMD,'>',$cmdfile;
+ print CMD "cd ",&getcwd,"\n";
+# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted.
+ my $cline=&cmdline."\n";
+ print CMD $cline;
+ close CMD;
+ print STDERR $cline;
+ chmod(0755,$cmdfile);
+ check_call("cp $initial_weights $dir/weights.0");
+ die "Can't find weights.0" unless (-e "$dir/weights.0");
+ }
+ write_config(*STDERR);
+}
+
+
+# Generate initial files and values
+check_call("cp $iniFile $newIniFile");
+$iniFile = $newIniFile;
+
+my $newsrc = "$dir/dev.input";
+enseg($srcFile, $newsrc);
+$srcFile = $newsrc;
+my $devSize = 0;
+open F, "<$srcFile" or die "Can't read $srcFile: $!";
+while(<F>) { $devSize++; }
+close F;
+
+unless($best_weights){ $best_weights = $weights; }
+unless($projected_score){ $projected_score = 0.0; }
+$seen_weights{$weights} = 1;
+my $kbest = "$dir/kbest";
+if ($dont_accum) {
+ $kbest = '';
+} else {
+ check_call("mkdir -p $kbest");
+ $kbest = "--kbest_repository $kbest";
+}
+
+my $random_seed = int(time / 1000);
+my $lastWeightsFile;
+my $lastPScore = 0;
+# main optimization loop
+while (1){
+ print STDERR "\n\nITERATION $iteration\n==========\n";
+
+ if ($iteration > $max_iterations){
+ print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n";
+ last;
+ }
+ # iteration-specific files
+ my $runFile="$dir/run.raw.$iteration";
+ my $onebestFile="$dir/1best.$iteration";
+ my $logdir="$dir/logs.$iteration";
+ my $decoderLog="$logdir/decoder.sentserver.log.$iteration";
+ my $scorerLog="$logdir/scorer.log.$iteration";
+ check_call("mkdir -p $logdir");
+
+
+ #decode
+ print STDERR "RUNNING DECODER AT ";
+ print STDERR unchecked_output("date");
+ my $im1 = $iteration - 1;
+ my $weightsFile="$dir/weights.$im1";
+ push @allweights, "-w $dir/weights.$im1";
+ `rm -f $dir/hgs/*.gz`;
+ my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
+ my $pcmd;
+ if ($use_make) {
+ $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --";
+ } else {
+ $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --";
+ }
+ my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_bash_call($cmd);
+ my $num_hgs;
+ my $num_topbest;
+ my $retries = 0;
+ while($retries < 5) {
+ $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l");
+ $num_topbest = check_output("wc -l < $runFile");
+ print STDERR "NUMBER OF HGs: $num_hgs\n";
+ print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n";
+ if($devSize == $num_hgs && $devSize == $num_topbest) {
+ last;
+ } else {
+ print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n";
+ sleep(3);
+ }
+ $retries++;
+ }
+ die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest);
+ my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric");
+ chomp $dec_score;
+ print STDERR "DECODER SCORE: $dec_score\n";
+
+ # save space
+ check_call("gzip -f $runFile");
+ check_call("gzip -f $decoderLog");
+
+ # run optimizer
+ print STDERR "RUNNING OPTIMIZER AT ";
+ print STDERR unchecked_output("date");
+ print STDERR " - GENERATE TRAINING EXEMPLARS\n";
+ my $mergeLog="$logdir/prune-merge.log.$iteration";
+
+ my $score = 0;
+ my $icc = 0;
+ my $inweights="$dir/weights.$im1";
+ my $outweights="$dir/weights.$iteration";
+ $cmd="$MAPINPUT $dir/hgs > $dir/agenda.$im1";
+ print STDERR "COMMAND:\n$cmd\n";
+ check_call($cmd);
+ $cmd="$MAPPER $refs_comma_sep -m $metric -i $dir/agenda.$im1 $kbest -w $inweights > $outweights";
+ check_call($cmd);
+ $lastWeightsFile = $outweights;
+ $iteration++;
+  `rm -f $dir/hgs/*.gz`;
+ print STDERR "\n==========\n";
+}
+
+print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w <this file> with the decoder)\n\n";
+
+print STDOUT "$lastWeightsFile\n";
+
+exit 0;
+
+sub get_lines {
+ my $fn = shift @_;
+ open FL, "<$fn" or die "Couldn't read $fn: $!";
+ my $lc = 0;
+ while(<FL>) { $lc++; }
+  close FL;
+  return $lc;
+}
+
+sub get_comma_sep_refs {
+ my ($r,$p) = @_;
+ my $o = check_output("echo $p");
+ chomp $o;
+ my @files = split /\s+/, $o;
+ return "-$r " . join(" -$r ", @files);
+}
+
+sub read_weights_file {
+ my ($file) = @_;
+ open F, "<$file" or die "Couldn't read $file: $!";
+ my @r = ();
+ my $pm = -1;
+ while(<F>) {
+ next if /^#/;
+ next if /^\s*$/;
+ chomp;
+ if (/^(.+)\s+(.+)$/) {
+ my $m = $1;
+ my $w = $2;
+ die "Weights out of order: $m <= $pm" unless $m > $pm;
+ push @r, $w;
+ } else {
+ warn "Unexpected feature name in weight file: $_";
+ }
+ }
+ close F;
+ return join ' ', @r;
+}
+
+# subs
+sub write_config {
+ my $fh = shift;
+ my $cleanup = "yes";
+ if ($disable_clean) {$cleanup = "no";}
+
+ print $fh "\n";
+ print $fh "DECODER: $decoder\n";
+ print $fh "INI FILE: $iniFile\n";
+ print $fh "WORKING DIR: $dir\n";
+ print $fh "SOURCE (DEV): $srcFile\n";
+ print $fh "REFS (DEV): $refFiles\n";
+ print $fh "EVAL METRIC: $metric\n";
+ print $fh "MAX ITERATIONS: $max_iterations\n";
+ print $fh "JOBS: $jobs\n";
+ print $fh "HEAD NODE: $host\n";
+ print $fh "PMEM (DECODING): $pmem\n";
+ print $fh "CLEANUP: $cleanup\n";
+}
+
+sub update_weights_file {
+ my ($neww, $rfn, $rpts) = @_;
+ my @feats = @$rfn;
+ my @pts = @$rpts;
+ my $num_feats = scalar @feats;
+ my $num_pts = scalar @pts;
+ die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts;
+ open G, ">$neww" or die;
+ for (my $i = 0; $i < $num_feats; $i++) {
+ my $f = $feats[$i];
+ my $lambda = $pts[$i];
+ print G "$f $lambda\n";
+ }
+ close G;
+}
+
+sub enseg {
+ my $src = shift;
+ my $newsrc = shift;
+ open(SRC, $src);
+ open(NEWSRC, ">$newsrc");
+ my $i=0;
+ while (my $line=<SRC>){
+ chomp $line;
+ if ($line =~ /^\s*<seg/i) {
+ if($line =~ /id="[0-9]+"/) {
+ print NEWSRC "$line\n";
+ } else {
+ die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+ }
+ } else {
+ print NEWSRC "<seg id=\"$i\">$line</seg>\n";
+ }
+ $i++;
+ }
+ close SRC;
+ close NEWSRC;
+ die "Empty dev set!" if ($i == 0);
+}
+
+sub print_help {
+
+ my $executable = check_output("basename $0"); chomp $executable;
+ print << "Help";
+
+Usage: $executable [options] <ini file>
+
+ $executable [options] <ini file>
+  Runs a complete RAMPION optimization using the ini file specified.
+
+Required:
+
+ --ref-files <files>
+ Dev set ref files. This option takes only a single string argument.
+ To use multiple files (including file globbing), this argument should
+ be quoted.
+
+ --source-file <file>
+ Dev set source file.
+
+ --weights <file>
+ Initial weights file (use empty file to start from 0)
+
+General options:
+
+ --help
+ Print this message and exit.
+
+ --dont-accumulate
+ Don't accumulate k-best lists from multiple iterations.
+
+ --max-iterations <M>
+ Maximum number of iterations to run. If not specified, defaults
+ to $default_max_iter.
+
+ --metric <method>
+ Metric to optimize.
+ Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
+
+ --pass-suffix <S>
+ If the decoder is doing multi-pass decoding, the pass suffix "2",
+ "3", etc., is used to control what iteration of weights is set.
+
+ --workdir <dir>
+ Directory for intermediate and output files. If not specified, the
+ name is derived from the ini filename. Assuming that the ini
+ filename begins with the decoder name and ends with ini, the default
+ name of the working directory is inferred from the middle part of
+ the filename. E.g. an ini file named decoder.foo.ini would have
+ a default working directory name foo.
+
+Regularization options:
+
+ --reg <F>
+ l2 regularization strength [default=500]. The greater this value,
+ the closer to zero the weights will be.
+
+Job control options:
+
+ --jobs <I>
+ Number of decoder processes to run in parallel. [default=$default_jobs]
+
+ --qsub
+ Use qsub to run jobs in parallel (qsub must be configured in
+ environment/LocalEnvironment.pm)
+
+ --pmem <N>
+ Amount of physical memory requested for parallel decoding jobs
+ (used with qsub requests only)
+
+Help
+}
+
+sub convert {
+ my ($str) = @_;
+ my @ps = split /;/, $str;
+ my %dict = ();
+ for my $p (@ps) {
+ my ($k, $v) = split /=/, $p;
+ $dict{$k} = $v;
+ }
+ return %dict;
+}
+
+
+sub cmdline {
+ return join ' ',($0,@ORIG_ARGV);
+}
+
+#buggy: last arg gets quoted sometimes?
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]};
+my $shell_escape_in_quote=qr{[\\"\$`!]};
+
+sub escape_shell {
+ my ($arg)=@_;
+ return undef unless defined $arg;
+ if ($arg =~ /$is_shell_special/) {
+ $arg =~ s/($shell_escape_in_quote)/\\$1/g;
+ return "\"$arg\"";
+ }
+ return $arg;
+}
+
+sub escaped_shell_args {
+ return map {local $_=$_;chomp;escape_shell($_)} @_;
+}
+
+sub escaped_shell_args_str {
+ return join ' ',&escaped_shell_args(@_);
+}
+
+sub escaped_cmdline {
+ return "$0 ".&escaped_shell_args_str(@ORIG_ARGV);
+}
diff --git a/training/rampion/rampion_cccp.cc b/training/rampion/rampion_cccp.cc
new file mode 100644
index 00000000..1e36dc51
--- /dev/null
+++ b/training/rampion/rampion_cccp.cc
@@ -0,0 +1,168 @@
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <limits>
+
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "filelib.h"
+#include "stringlib.h"
+#include "weights.h"
+#include "hg_io.h"
+#include "kbest.h"
+#include "viterbi.h"
+#include "ns.h"
+#include "ns_docscorer.h"
+#include "candidate_set.h"
+
+using namespace std;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
+ ("weights,w",po::value<string>(), "[REQD] Weights files from current iterations")
+ ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
+ ("evaluation_metric,m",po::value<string>()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)")
+ ("kbest_repository,R",po::value<string>(), "Accumulate k-best lists from previous iterations (parameter is path to repository)")
+ ("kbest_size,k",po::value<unsigned>()->default_value(500u), "Top k-hypotheses to extract")
+ ("cccp_iterations,I", po::value<unsigned>()->default_value(10u), "CCCP iterations (T')")
+ ("ssd_iterations,J", po::value<unsigned>()->default_value(5u), "Stochastic subgradient iterations (T'')")
+ ("eta", po::value<double>()->default_value(1e-4), "Step size")
+ ("regularization_strength,C", po::value<double>()->default_value(1.0), "L2 regularization strength")
+ ("alpha,a", po::value<double>()->default_value(10.0), "Cost scale (alpha); alpha * [1-metric(y,y')]")
+ ("help,h", "Help");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ bool flag = false;
+ if (!conf->count("reference")) {
+ cerr << "Please specify one or more references using -r <REF.TXT>\n";
+ flag = true;
+ }
+ if (!conf->count("weights")) {
+ cerr << "Please specify weights using -w <WEIGHTS.TXT>\n";
+ flag = true;
+ }
+ if (flag || conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+}
+
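+// Maps a hypothesis' sufficient statistics to a loss: error metrics
+// (e.g. TER) are used directly, gain metrics (e.g. BLEU) are flipped to
+// 1 - score.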
+struct GainFunction {
+ explicit GainFunction(const EvaluationMetric* m) : metric(m) {}
+ float operator()(const SufficientStats& eval_feats) const {
+ float g = metric->ComputeScore(eval_feats);
+ if (!metric->IsErrorMetric()) g = 1 - g;
+ return g;
+ }
+ const EvaluationMetric* metric;
+};
+
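+// Returns the candidate maximizing model score + alpha * loss. Called with
+// a negative alpha (goodsign) this selects the "hope" derivation (high
+// score, low loss); with a positive alpha (badsign) it selects the "fear"
+// derivation (high score, high loss).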
+template <typename GainFunc>
+void CostAugmentedSearch(const GainFunc& gain,
+ const training::CandidateSet& cs,
+ const SparseVector<double>& w,
+ double alpha,
+ SparseVector<double>* fmap) {
+ unsigned best_i = 0;
+ double best = -numeric_limits<double>::infinity();
+ for (unsigned i = 0; i < cs.size(); ++i) {
+ double s = cs[i].fmap.dot(w) + alpha * gain(cs[i].eval_feats);
+ if (s > best) {
+ best = s;
+ best_i = i;
+ }
+ }
+ *fmap = cs[best_i].fmap;
+}
+
+
+
+// runs lines 4--15 of rampion algorithm
+int main(int argc, char** argv) {
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ const string evaluation_metric = conf["evaluation_metric"].as<string>();
+
+ EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric);
+ DocumentScorer ds(metric, conf["reference"].as<vector<string> >());
+ cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl;
+ double goodsign = -1;
+ double badsign = -goodsign;
+
+ Hypergraph hg;
+ string last_file;
+ ReadFile in_read(conf["input"].as<string>());
+ string kbest_repo;
+ if (conf.count("kbest_repository")) {
+ kbest_repo = conf["kbest_repository"].as<string>();
+ MkDirP(kbest_repo);
+ }
+ istream &in=*in_read.stream();
+ const unsigned kbest_size = conf["kbest_size"].as<unsigned>();
+ const unsigned tp = conf["cccp_iterations"].as<unsigned>();
+ const unsigned tpp = conf["ssd_iterations"].as<unsigned>();
+ const double eta = conf["eta"].as<double>();
+ const double reg = conf["regularization_strength"].as<double>();
+ const double alpha = conf["alpha"].as<double>();
+ SparseVector<weight_t> weights;
+ {
+ vector<weight_t> vweights;
+ const string weightsf = conf["weights"].as<string>();
+ Weights::InitFromFile(weightsf, &vweights);
+ Weights::InitSparseVector(vweights, &weights);
+ }
+ string line, file;
+ vector<training::CandidateSet> kis;
+ cerr << "Loading hypergraphs...\n";
+ while(getline(in, line)) {
+ istringstream is(line);
+    int sent_id;
+    is >> file >> sent_id;  // read the id before building the k-best path below
+    kis.resize(kis.size() + 1);
+    training::CandidateSet& curkbest = kis.back();
+    string kbest_file;
+    if (kbest_repo.size()) {
+      ostringstream os;
+      os << kbest_repo << "/kbest." << sent_id << ".txt.gz";
+      kbest_file = os.str();
+      if (FileExists(kbest_file))
+        curkbest.ReadFromFile(kbest_file);
+    }
+ ReadFile rf(file);
+ if (kis.size() % 5 == 0) { cerr << '.'; }
+ if (kis.size() % 200 == 0) { cerr << " [" << kis.size() << "]\n"; }
+ HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+ hg.Reweight(weights);
+ curkbest.AddKBestCandidates(hg, kbest_size, ds[sent_id]);
+ if (kbest_file.size())
+ curkbest.WriteToFile(kbest_file);
+ }
+ cerr << "\nHypergraphs loaded.\n";
+
+ vector<SparseVector<weight_t> > goals(kis.size()); // f(x_i,y+,h+)
+ SparseVector<weight_t> fear; // f(x,y-,h-)
+ const GainFunction gain(metric);
+ for (unsigned iterp = 1; iterp <= tp; ++iterp) {
+ cerr << "CCCP Iteration " << iterp << endl;
+ for (unsigned i = 0; i < goals.size(); ++i)
+ CostAugmentedSearch(gain, kis[i], weights, goodsign * alpha, &goals[i]);
+ for (unsigned iterpp = 1; iterpp <= tpp; ++iterpp) {
+ cerr << " SSD Iteration " << iterpp << endl;
+ for (unsigned i = 0; i < goals.size(); ++i) {
+ CostAugmentedSearch(gain, kis[i], weights, badsign * alpha, &fear);
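+        // l2 shrinkage of the weights followed by a subgradient step toward
+        // the hope features and away from the fear features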
+ weights -= weights * (eta * reg / goals.size());
+ weights += (goals[i] - fear) * eta;
+ }
+ }
+ }
+ vector<weight_t> w;
+ weights.init_vector(&w);
+ Weights::WriteToFile("-", w);
+ return 0;
+}
+
diff --git a/training/rampion/rampion_generate_input.pl b/training/rampion/rampion_generate_input.pl
new file mode 100755
index 00000000..b30fc4fd
--- /dev/null
+++ b/training/rampion/rampion_generate_input.pl
@@ -0,0 +1,18 @@
+#!/usr/bin/perl -w
+use strict;
+
+die "Usage: $0 HG_DIR\n" unless scalar @ARGV == 1;
+my $d = shift @ARGV;
+die "Can't find directory $d" unless -d $d;
+
+opendir(DIR, $d) or die "Can't read $d: $!";
+my @hgs = grep { /\.gz$/ } readdir(DIR);
+closedir DIR;
+
+for my $hg (@hgs) {
+ my $file = $hg;
+ my $id = $hg;
+ $id =~ s/(\.json)?\.gz//;
+ print "$d/$file $id\n";
+}
+
diff --git a/training/ttables.cc b/training/ttables.cc
deleted file mode 100644
index 45bf14c5..00000000
--- a/training/ttables.cc
+++ /dev/null
@@ -1,31 +0,0 @@
-#include "ttables.h"
-
-#include <cassert>
-
-#include "dict.h"
-
-using namespace std;
-using namespace std::tr1;
-
-void TTable::DeserializeProbsFromText(std::istream* in) {
- int c = 0;
- while(*in) {
- string e;
- string f;
- double p;
- (*in) >> e >> f >> p;
- if (e.empty()) break;
- ++c;
- ttable[TD::Convert(e)][TD::Convert(f)] = p;
- }
- cerr << "Loaded " << c << " translation parameters.\n";
-}
-
-void TTable::SerializeHelper(string* out, const Word2Word2Double& o) {
- assert(!"not implemented");
-}
-
-void TTable::DeserializeHelper(const string& in, Word2Word2Double* o) {
- assert(!"not implemented");
-}
-
diff --git a/training/ttables.h b/training/ttables.h
deleted file mode 100644
index 9baa13ca..00000000
--- a/training/ttables.h
+++ /dev/null
@@ -1,101 +0,0 @@
-#ifndef _TTABLES_H_
-#define _TTABLES_H_
-
-#include <iostream>
-#include <tr1/unordered_map>
-
-#include "sparse_vector.h"
-#include "m.h"
-#include "wordid.h"
-#include "tdict.h"
-
-class TTable {
- public:
- TTable() {}
- typedef std::tr1::unordered_map<WordID, double> Word2Double;
- typedef std::tr1::unordered_map<WordID, Word2Double> Word2Word2Double;
- inline double prob(const int& e, const int& f) const {
- const Word2Word2Double::const_iterator cit = ttable.find(e);
- if (cit != ttable.end()) {
- const Word2Double& cpd = cit->second;
- const Word2Double::const_iterator it = cpd.find(f);
- if (it == cpd.end()) return 1e-9;
- return it->second;
- } else {
- return 1e-9;
- }
- }
- inline void Increment(const int& e, const int& f) {
- counts[e][f] += 1.0;
- }
- inline void Increment(const int& e, const int& f, double x) {
- counts[e][f] += x;
- }
- void NormalizeVB(const double alpha) {
- ttable.swap(counts);
- for (Word2Word2Double::iterator cit = ttable.begin();
- cit != ttable.end(); ++cit) {
- double tot = 0;
- Word2Double& cpd = cit->second;
- for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it)
- tot += it->second + alpha;
- for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it)
- it->second = exp(Md::digamma(it->second + alpha) - Md::digamma(tot));
- }
- counts.clear();
- }
- void Normalize() {
- ttable.swap(counts);
- for (Word2Word2Double::iterator cit = ttable.begin();
- cit != ttable.end(); ++cit) {
- double tot = 0;
- Word2Double& cpd = cit->second;
- for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it)
- tot += it->second;
- for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it)
- it->second /= tot;
- }
- counts.clear();
- }
- // adds counts from another TTable - probabilities remain unchanged
- TTable& operator+=(const TTable& rhs) {
- for (Word2Word2Double::const_iterator it = rhs.counts.begin();
- it != rhs.counts.end(); ++it) {
- const Word2Double& cpd = it->second;
- Word2Double& tgt = counts[it->first];
- for (Word2Double::const_iterator j = cpd.begin(); j != cpd.end(); ++j) {
- tgt[j->first] += j->second;
- }
- }
- return *this;
- }
- void ShowTTable() const {
- for (Word2Word2Double::const_iterator it = ttable.begin(); it != ttable.end(); ++it) {
- const Word2Double& cpd = it->second;
- for (Word2Double::const_iterator j = cpd.begin(); j != cpd.end(); ++j) {
- std::cerr << "P(" << TD::Convert(j->first) << '|' << TD::Convert(it->first) << ") = " << j->second << std::endl;
- }
- }
- }
- void ShowCounts() const {
- for (Word2Word2Double::const_iterator it = counts.begin(); it != counts.end(); ++it) {
- const Word2Double& cpd = it->second;
- for (Word2Double::const_iterator j = cpd.begin(); j != cpd.end(); ++j) {
- std::cerr << "c(" << TD::Convert(j->first) << '|' << TD::Convert(it->first) << ") = " << j->second << std::endl;
- }
- }
- }
- void DeserializeProbsFromText(std::istream* in);
- void SerializeCounts(std::string* out) const { SerializeHelper(out, counts); }
- void DeserializeCounts(const std::string& in) { DeserializeHelper(in, &counts); }
- void SerializeProbs(std::string* out) const { SerializeHelper(out, ttable); }
- void DeserializeProbs(const std::string& in) { DeserializeHelper(in, &ttable); }
- private:
- static void SerializeHelper(std::string*, const Word2Word2Double& o);
- static void DeserializeHelper(const std::string&, Word2Word2Double* o);
- public:
- Word2Word2Double ttable;
- Word2Word2Double counts;
-};
-
-#endif
diff --git a/training/utils/Makefile.am b/training/utils/Makefile.am
new file mode 100644
index 00000000..27c6e344
--- /dev/null
+++ b/training/utils/Makefile.am
@@ -0,0 +1,46 @@
+noinst_LIBRARIES = libtraining_utils.a
+
+bin_PROGRAMS = \
+ sentserver \
+ sentclient \
+ grammar_convert
+
+noinst_PROGRAMS = \
+ lbfgs_test \
+ optimize_test
+
+EXTRA_DIST = decode-and-evaluate.pl libcall.pl parallelize.pl
+
+sentserver_SOURCES = sentserver.cc
+sentserver_LDFLAGS = -pthread
+
+sentclient_SOURCES = sentclient.cc
+sentclient_LDFLAGS = -pthread
+
+TESTS = lbfgs_test optimize_test
+
+libtraining_utils_a_SOURCES = \
+ candidate_set.h \
+ entropy.h \
+ lbfgs.h \
+ online_optimizer.h \
+ optimize.h \
+ risk.h \
+ sentserver.h \
+ candidate_set.cc \
+ entropy.cc \
+ optimize.cc \
+ online_optimizer.cc \
+ risk.cc
+
+optimize_test_SOURCES = optimize_test.cc
+optimize_test_LDADD = libtraining_utils.a ../../utils/libutils.a
+
+grammar_convert_SOURCES = grammar_convert.cc
+grammar_convert_LDADD = ../../decoder/libcdec.a ../../mteval/libmteval.a ../../utils/libutils.a
+
+lbfgs_test_SOURCES = lbfgs_test.cc
+lbfgs_test_LDADD = ../../utils/libutils.a
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/decoder -I$(top_srcdir)/utils -I$(top_srcdir)/mteval -I$(top_srcdir)/klm
+
diff --git a/training/candidate_set.cc b/training/utils/candidate_set.cc
index 087efec3..087efec3 100644
--- a/training/candidate_set.cc
+++ b/training/utils/candidate_set.cc
diff --git a/training/candidate_set.h b/training/utils/candidate_set.h
index 9d326ed0..9d326ed0 100644
--- a/training/candidate_set.h
+++ b/training/utils/candidate_set.h
diff --git a/training/utils/decode-and-evaluate.pl b/training/utils/decode-and-evaluate.pl
new file mode 100755
index 00000000..1a332c08
--- /dev/null
+++ b/training/utils/decode-and-evaluate.pl
@@ -0,0 +1,246 @@
+#!/usr/bin/env perl
+use strict;
+my @ORIG_ARGV=@ARGV;
+use Cwd qw(getcwd);
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; }
+
+# Skip local config (used for distributing jobs) if we're running in local-only mode
+use LocalConfig;
+use Getopt::Long;
+use File::Basename qw(basename);
+my $QSUB_CMD = qsub_args(mert_memory());
+
+require "libcall.pl";
+
+# Default settings
+my $default_jobs = env_default_jobs();
+my $bin_dir = $SCRIPT_DIR;
+die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
+my $FAST_SCORE="$bin_dir/../../mteval/fast_score";
+die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
+my $parallelize = "$bin_dir/parallelize.pl";
+my $libcall = "$bin_dir/libcall.pl";
+my $sentserver = "$bin_dir/sentserver";
+my $sentclient = "$bin_dir/sentclient";
+my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm";
+
+my $SCORER = $FAST_SCORE;
+my $cdec = "$bin_dir/../../decoder/cdec";
+die "Can't find decoder in $cdec" unless -x $cdec;
+die "Can't find $parallelize" unless -x $parallelize;
+die "Can't find $libcall" unless -e $libcall;
+my $decoder = $cdec;
+my $jobs = $default_jobs; # number of decode nodes
+my $pmem = "9g";
+my $help = 0;
+my $config;
+my $test_set;
+my $weights;
+my $use_make = 1;
+my $useqsub;
+my $cpbin=1;
+# Process command-line options
+if (GetOptions(
+ "jobs=i" => \$jobs,
+ "help" => \$help,
+ "qsub" => \$useqsub,
+ "input=s" => \$test_set,
+ "config=s" => \$config,
+ "weights=s" => \$weights,
+) == 0 || @ARGV!=0 || $help) {
+ print_help();
+ exit;
+}
+
+if ($useqsub) {
+ $use_make = 0;
+ die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
+}
+
+my @missing_args = ();
+
+if (!defined $test_set) { push @missing_args, "--input"; }
+if (!defined $config) { push @missing_args, "--config"; }
+if (!defined $weights) { push @missing_args, "--weights"; }
+die "Please specify missing arguments: " . join (', ', @missing_args) . "\nUse --help for more information.\n" if (@missing_args);
+
+my @tf = localtime(time);
+my $tname = basename($test_set);
+$tname =~ s/\.(sgm|sgml|xml)$//i;
+my $dir = "eval.$tname." . sprintf('%d%02d%02d-%02d%02d%02d', 1900+$tf[5], $tf[4], $tf[3], $tf[2], $tf[1], $tf[0]);
+
+my $time = unchecked_output("date");
+
+check_call("mkdir -p $dir");
+
+split_devset($test_set, "$dir/test.input.raw", "$dir/test.refs");
+my $refs = "-r $dir/test.refs";
+my $newsrc = "$dir/test.input";
+enseg("$dir/test.input.raw", $newsrc);
+my $src_file = $newsrc;
+open F, "<$src_file" or die "Can't read $src_file: $!"; close F;
+
+my $test_trans="$dir/test.trans";
+my $logdir="$dir/logs";
+my $decoderLog="$logdir/decoder.sentserver.log";
+check_call("mkdir -p $logdir");
+
+#decode
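+# Each input segment is piped through $jobs parallel decoder instances
+# via parallelize.pl; sentserver keeps the outputs in input order.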
+print STDERR "RUNNING DECODER AT ";
+print STDERR unchecked_output("date");
+my $decoder_cmd = "$decoder -c $config --weights $weights";
+my $pcmd;
+if ($use_make) {
+ $pcmd = "cat $src_file | $parallelize --workdir $dir --use-fork -p $pmem -e $logdir -j $jobs --";
+} else {
+ $pcmd = "cat $src_file | $parallelize --workdir $dir -p $pmem -e $logdir -j $jobs --";
+}
+my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $test_trans";
+check_bash_call($cmd);
+print STDERR "DECODER COMPLETED AT ";
+print STDERR unchecked_output("date");
+print STDERR "\nOUTPUT: $test_trans\n\n";
+my $bleu = check_output("cat $test_trans | $SCORER $refs -m ibm_bleu");
+chomp $bleu;
+print STDERR "BLEU: $bleu\n";
+my $ter = check_output("cat $test_trans | $SCORER $refs -m ter");
+chomp $ter;
+print STDERR " TER: $ter\n";
+open TR, ">$dir/test.scores" or die "Can't write $dir/test.scores: $!";
+print TR <<EOT;
+### SCORE REPORT #############################################################
+ OUTPUT=$test_trans
+ SCRIPT INPUT=$test_set
+ DECODER INPUT=$src_file
+ REFERENCES=$dir/test.refs
+------------------------------------------------------------------------------
+ BLEU=$bleu
+ TER=$ter
+##############################################################################
+EOT
+close TR;
+my $sr = unchecked_output("cat $dir/test.scores");
+print STDERR "\n\n$sr\n(A copy of this report can be found in $dir/test.scores)\n\n";
+exit 0;
+
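+# Wrap each raw input line in a numbered <seg> element, e.g. a first
+# line "hello world" becomes <seg id="0">hello world</seg>; lines that
+# already carry a <seg id="..."> tag are passed through unchanged.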
+sub enseg {
+ my $src = shift;
+ my $newsrc = shift;
+ open(SRC, $src);
+ open(NEWSRC, ">$newsrc");
+ my $i=0;
+ while (my $line=<SRC>){
+ chomp $line;
+ if ($line =~ /^\s*<seg/i) {
+ if($line =~ /id="[0-9]+"/) {
+ print NEWSRC "$line\n";
+ } else {
+ die "When using segments with pre-generated <seg> tags, you must include a zero-based id attribute";
+ }
+ } else {
+ print NEWSRC "<seg id=\"$i\">$line</seg>\n";
+ }
+ $i++;
+ }
+ close SRC;
+ close NEWSRC;
+}
+
+sub print_help {
+ my $executable = basename($0); chomp $executable;
+ print << "Help";
+
+Usage: $executable [options]
+
+       $executable --config cdec.ini --weights weights.txt --input testset.in-ref [--jobs N] [--qsub]
+
+Options:
+
+ --help
+ Print this message and exit.
+
+ --config <file>
+ A path to the cdec.ini file.
+
+ --weights <file>
+ A file specifying feature weights.
+
+  --input <file>
+    The test set to decode and score: one segment per line, with the
+    source and reference(s) separated by |||.
+
+Job control options:
+
+ --jobs <I>
+ Number of decoder processes to run in parallel. [default=$default_jobs]
+
+ --qsub
+ Use qsub to run jobs in parallel (qsub must be configured in
+    environment/LocalConfig.pm)
+
+ --pmem <N>
+ Amount of physical memory requested for parallel decoding jobs
+ (used with qsub requests only)
+
+Help
+}
+
+sub convert {
+ my ($str) = @_;
+ my @ps = split /;/, $str;
+ my %dict = ();
+ for my $p (@ps) {
+ my ($k, $v) = split /=/, $p;
+ $dict{$k} = $v;
+ }
+ return %dict;
+}
+
+
+
+sub cmdline {
+ return join ' ',($0,@ORIG_ARGV);
+}
+
+#buggy: last arg gets quoted sometimes?
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]};
+my $shell_escape_in_quote=qr{[\\"\$`!]};
+
+sub escape_shell {
+ my ($arg)=@_;
+ return undef unless defined $arg;
+ if ($arg =~ /$is_shell_special/) {
+ $arg =~ s/($shell_escape_in_quote)/\\$1/g;
+ return "\"$arg\"";
+ }
+ return $arg;
+}
+
+sub escaped_shell_args {
+ return map {local $_=$_;chomp;escape_shell($_)} @_;
+}
+
+sub escaped_shell_args_str {
+ return join ' ',&escaped_shell_args(@_);
+}
+
+sub escaped_cmdline {
+ return "$0 ".&escaped_shell_args_str(@ORIG_ARGV);
+}
+
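+# Split a devset into a source file and a reference file; each input
+# line holds the source and one or more references separated by |||,
+# e.g. (hypothetical) "el gato ||| the cat ||| a cat".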
+sub split_devset {
+ my ($infile, $outsrc, $outref) = @_;
+ open F, "<$infile" or die "Can't read $infile: $!";
+ open S, ">$outsrc" or die "Can't write $outsrc: $!";
+ open R, ">$outref" or die "Can't write $outref: $!";
+ while(<F>) {
+ chomp;
+ my ($src, @refs) = split /\s*\|\|\|\s*/;
+ die "Malformed devset line: $_\n" unless scalar @refs > 0;
+ print S "$src\n";
+ print R join(' ||| ', @refs) . "\n";
+ }
+ close R;
+ close S;
+ close F;
+}
+
diff --git a/training/entropy.cc b/training/utils/entropy.cc
index 4fdbe2be..4fdbe2be 100644
--- a/training/entropy.cc
+++ b/training/utils/entropy.cc
diff --git a/training/entropy.h b/training/utils/entropy.h
index 796589ca..796589ca 100644
--- a/training/entropy.h
+++ b/training/utils/entropy.h
diff --git a/training/grammar_convert.cc b/training/utils/grammar_convert.cc
index 607a7cb9..607a7cb9 100644
--- a/training/grammar_convert.cc
+++ b/training/utils/grammar_convert.cc
diff --git a/training/lbfgs.h b/training/utils/lbfgs.h
index e8baecab..e8baecab 100644
--- a/training/lbfgs.h
+++ b/training/utils/lbfgs.h
diff --git a/training/lbfgs_test.cc b/training/utils/lbfgs_test.cc
index 9678e788..9678e788 100644
--- a/training/lbfgs_test.cc
+++ b/training/utils/lbfgs_test.cc
diff --git a/training/utils/libcall.pl b/training/utils/libcall.pl
new file mode 100644
index 00000000..c7d0f128
--- /dev/null
+++ b/training/utils/libcall.pl
@@ -0,0 +1,71 @@
+use IPC::Open3;
+use Symbol qw(gensym);
+
+$DUMMY_STDERR = gensym();
+$DUMMY_STDIN = gensym();
+
+# Run the command and ignore failures
+sub unchecked_call {
+ system("@_")
+}
+
+# Run the command and return its output (if any), ignoring failures
+sub unchecked_output {
+ return `@_`
+}
+
+# WARNING: Do not use this for commands that will return large amounts
+# of stdout or stderr -- they might block indefinitely
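+# (open3 gives the child a stderr pipe that is never drained here, and
+# only stdout is read -- a chatty child can fill a pipe buffer and
+# stall.)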
+sub check_output {
+ print STDERR "Executing and gathering output: @_\n";
+
+ my $pid = open3($DUMMY_STDIN, \*PH, $DUMMY_STDERR, @_);
+ my $proc_output = "";
+ while( <PH> ) {
+ $proc_output .= $_;
+ }
+ waitpid($pid, 0);
+ # TODO: Grab signal that the process died from
+ my $child_exit_status = $? >> 8;
+ if($child_exit_status == 0) {
+ return $proc_output;
+ } else {
+ print STDERR "ERROR: Execution of @_ failed.\n";
+ exit(1);
+ }
+}
+
+# Based on Moses' safesystem sub
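+# ($? packs the child's status: exit code in the high 8 bits, the
+# terminating signal in the low 7 bits, and bit 7 set when a core was
+# dumped -- hence the masks below.)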
+sub check_call {
+ print STDERR "Executing: @_\n";
+ system(@_);
+ my $exitcode = $? >> 8;
+ if($exitcode == 0) {
+ return 0;
+ } elsif ($? == -1) {
+ print STDERR "ERROR: Failed to execute: @_\n $!\n";
+ exit(1);
+
+ } elsif ($? & 127) {
+ printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n",
+ ($? & 127), ($? & 128) ? 'with' : 'without';
+ exit(1);
+
+ } else {
+ print STDERR "Failed with exit code: $exitcode\n" if $exitcode;
+ exit($exitcode);
+ }
+}
+
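+# Run a command line under bash so that -e and -o pipefail abort the
+# call when any stage of a pipeline fails (-aux additionally exports
+# all variables, rejects unset ones, and traces execution to stderr).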
+sub check_bash_call {
+ my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_");
+ check_call(@args);
+}
+
+sub check_bash_output {
+ my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_");
+ return check_output(@args);
+}
+
+# perl requires that a file loaded with require return a true value
+return 1;
diff --git a/training/online_optimizer.cc b/training/utils/online_optimizer.cc
index 3ed95452..3ed95452 100644
--- a/training/online_optimizer.cc
+++ b/training/utils/online_optimizer.cc
diff --git a/training/online_optimizer.h b/training/utils/online_optimizer.h
index 28d89344..28d89344 100644
--- a/training/online_optimizer.h
+++ b/training/utils/online_optimizer.h
diff --git a/training/optimize.cc b/training/utils/optimize.cc
index 41ac90d8..41ac90d8 100644
--- a/training/optimize.cc
+++ b/training/utils/optimize.cc
diff --git a/training/optimize.h b/training/utils/optimize.h
index 07943b44..07943b44 100644
--- a/training/optimize.h
+++ b/training/utils/optimize.h
diff --git a/training/optimize_test.cc b/training/utils/optimize_test.cc
index bff2ca03..bff2ca03 100644
--- a/training/optimize_test.cc
+++ b/training/utils/optimize_test.cc
diff --git a/training/utils/parallelize.pl b/training/utils/parallelize.pl
new file mode 100755
index 00000000..4197e0e5
--- /dev/null
+++ b/training/utils/parallelize.pl
@@ -0,0 +1,423 @@
+#!/usr/bin/env perl
+
+# Author: Adam Lopez
+#
+# This script takes a command that processes input
+# from stdin one-line-at-time, and parallelizes it
+# on the cluster using David Chiang's sentserver/
+# sentclient architecture.
+#
+# Prerequisites: the command *must* process each input
+# line without waiting for subsequent lines of input
+# (for instance, a command which must read all lines
+# of input before processing will not work) and must
+# write its result to the output *without* buffering
+# multiple lines.
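+#
+# For example (hypothetical invocation), a line-at-a-time filter works:
+#   cat corpus.txt | parallelize.pl -j 8 -- tr a-z A-Z
+# whereas a command like `sort`, which consumes all of stdin before
+# writing anything, does not.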
+
+#TODO: if -j 1, run immediately, not via sentserver? possible differences in environment might make debugging harder
+
+#ANNOYANCE: if the input is shorter than -j n lines, or at the very last few lines, this repeatedly sleeps; the sleep time was cut down to 15s from 60s
+
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; }
+use LocalConfig;
+
+use Cwd qw/ abs_path cwd getcwd /;
+use File::Temp qw/ tempfile /;
+use Getopt::Long;
+use IPC::Open2;
+use strict;
+use POSIX ":sys_wait_h";
+
+use File::Basename;
+my $myDir = dirname(__FILE__);
+print STDERR __FILE__." -> $myDir\n";
+push(@INC, $myDir);
+require "libcall.pl";
+
+my $tailn=5; # +0 = concatenate all the client logs. 5 = last 5 lines
+my $recycle_clients; # spawn new clients when previous ones terminate
+my $stay_alive; # don't let the server die when it has zero clients
+my $joblist = "";
+my $errordir="";
+my $multiline;
+my $workdir = '.';
+my $numnodes = 8;
+my $user = $ENV{"USER"};
+my $pmem = "9g";
+my $basep=50300;
+my $randp=300;
+my $tryp=50;
+my $no_which;
+my $no_cd;
+
+my $DEBUG=$ENV{DEBUG};
+print STDERR "DEBUG=$DEBUG output enabled.\n" if $DEBUG;
+my $verbose = 1;
+sub verbose {
+ if ($verbose) {
+ print STDERR @_,"\n";
+ }
+}
+sub debug {
+ if ($DEBUG) {
+ my ($package, $filename, $line) = caller;
+ print STDERR "DEBUG: $filename($line): ",join(' ',@_),"\n";
+ }
+}
+my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]};
+my $shell_escape_in_quote=qr{[\\"\$`!]};
+sub escape_shell {
+ my ($arg)=@_;
+ return undef unless defined $arg;
+ return '""' unless $arg;
+ if ($arg =~ /$is_shell_special/) {
+ $arg =~ s/($shell_escape_in_quote)/\\$1/g;
+ return "\"$arg\"";
+ }
+ return $arg;
+}
+sub preview_files {
+ my ($l,$skipempty,$footer,$n)=@_;
+ $n=$tailn unless defined $n;
+ my @f=grep { ! ($skipempty && -z $_) } @$l;
+ my $fn=join(' ',map {escape_shell($_)} @f);
+ my $cmd="tail -n $n $fn";
+ unchecked_output("$cmd").($footer?"\nNONEMPTY FILES:\n$fn\n":"");
+}
+sub prefix_dirname($) {
+    # like `dirname` but if the path ends in / then return the whole thing
+ local ($_)=@_;
+ if (/\/$/) {
+ $_;
+ } else {
+        s#/[^/]*$##;
+ $_ ? $_ : '';
+ }
+}
+sub ensure_final_slash($) {
+ local ($_)=@_;
+ m#/$# ? $_ : ($_."/");
+}
+sub extend_path($$;$$) {
+ my ($base,$ext,$mkdir,$baseisdir)=@_;
+ if (-d $base) {
+ $base.="/";
+ } else {
+ my $dir;
+ if ($baseisdir) {
+ $dir=$base;
+ $base.='/' unless $base =~ /\/$/;
+ } else {
+ $dir=prefix_dirname($base);
+ }
+ my @cmd=("/bin/mkdir","-p",$dir);
+ check_call(@cmd) if $mkdir;
+ }
+ return $base.$ext;
+}
+
+my $abscwd=abs_path(&getcwd);
+sub print_help;
+
+my $use_fork;
+my @pids;
+
+# Process command-line options
+unless (GetOptions(
+ "stay-alive" => \$stay_alive,
+ "recycle-clients" => \$recycle_clients,
+ "error-dir=s" => \$errordir,
+ "multi-line" => \$multiline,
+ "workdir=s" => \$workdir,
+ "use-fork" => \$use_fork,
+ "verbose" => \$verbose,
+ "jobs=i" => \$numnodes,
+ "pmem=s" => \$pmem,
+ "baseport=i" => \$basep,
+# "iport=i" => \$randp, #for short name -i
+ "no-which!" => \$no_which,
+ "no-cd!" => \$no_cd,
+ "tailn=s" => \$tailn,
+) && scalar @ARGV){
+ print_help();
+ die "bad options.";
+}
+
+my $cmd = "";
+my $prog=shift;
+if ($no_which) {
+ $cmd=$prog;
+} else {
+ $cmd=check_output("which $prog");
+ chomp $cmd;
+ die "$prog not found - $cmd" unless $cmd;
+}
+#$cmd=abs_path($cmd);
+for my $arg (@ARGV) {
+ $cmd .= " ".escape_shell($arg);
+}
+die "Please specify a command to parallelize\n" if $cmd eq '';
+
+my $cdcmd=$no_cd ? '' : ("cd ".escape_shell($abscwd)."\n");
+
+my $executable = $cmd;
+$executable =~ s/^\s*(\S+)($|\s.*)/$1/;
+$executable=check_output("basename $executable");
+chomp $executable;
+
+
+print STDERR "Parallelizing ($numnodes ways): $cmd\n\n";
+
+# create -e dir and save .sh
+use File::Temp qw/tempdir/;
+unless ($errordir) {
+ $errordir=tempdir("$executable.XXXXXX",CLEANUP=>1);
+}
+if ($errordir) {
+ my $scriptfile=extend_path("$errordir/","$executable.sh",1,1);
+ -d $errordir || die "should have created -e dir $errordir";
+ open SF,">",$scriptfile || die;
+ print SF "$cdcmd$cmd\n";
+ close SF;
+ chmod 0755,$scriptfile;
+ $errordir=abs_path($errordir);
+ &verbose("-e dir: $errordir");
+}
+
+# set cleanup handler
+my @cleanup_cmds;
+sub cleanup;
+sub cleanup_and_die;
+$SIG{INT} = "cleanup_and_die";
+$SIG{TERM} = "cleanup_and_die";
+$SIG{HUP} = "cleanup_and_die";
+
+# other subs:
+sub numof_live_jobs;
+sub launch_job_on_node;
+
+
+# vars
+my $mydir = check_output("dirname $0"); chomp $mydir;
+my $sentserver = "$mydir/sentserver";
+my $sentclient = "$mydir/sentclient";
+my $host = check_output("hostname");
+chomp $host;
+
+
+# find open port
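+# (start at a random offset in [$basep, $basep+$randp) and walk upward
+# past ports that netstat reports busy, giving up after $tryp tries)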
+srand;
+my $port = $basep+int(rand($randp));
+my $endp=$port+$tryp;
+sub listening_port_lines {
+ my $quiet=$verbose?'':'2>/dev/null';
+ return unchecked_output("netstat -a -n $quiet | grep LISTENING | grep -i tcp");
+}
+my $netstat=&listening_port_lines;
+
+if ($verbose){ print STDERR "Testing port $port...";}
+
+while ($netstat=~/$port/ || &listening_port_lines=~/$port/){
+ if ($verbose){ print STDERR "port is busy\n";}
+ $port++;
+ if ($port > $endp){
+ die "Unable to find open port\n";
+ }
+ if ($verbose){ print STDERR "Testing port $port... "; }
+}
+if ($verbose){
+ print STDERR "port $port is available\n";
+}
+
+my $key = int(rand()*1000000);
+
+my $multiflag = "";
+if ($multiline){ $multiflag = "-m"; print STDERR "expecting multiline output.\n"; }
+my $stay_alive_flag = "";
+if ($stay_alive){ $stay_alive_flag = "--stay-alive"; print STDERR "staying alive while no clients are connected.\n"; }
+
+my $node_count = 0;
+my $script = "";
+# fork == one thread runs the sentserver, while the
+# other spawns the sentclient commands.
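+#   child:  sleeps briefly, then launches $numnodes sentclient jobs,
+#           each running one copy of $cmd against the server
+#   parent: runs sentserver, which hands stdin lines to the clients
+#           and writes their results to stdout in input order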
+my $pid = fork;
+if ($pid == 0) { # child
+ sleep 8; # give other thread time to start sentserver
+ $script = "$cdcmd$sentclient $host:$port:$key $cmd";
+
+ if ($verbose){
+ print STDERR "Client script:\n====\n";
+ print STDERR $script;
+ print STDERR "====\n";
+ }
+ for (my $jobn=0; $jobn<$numnodes; $jobn++){
+ launch_job();
+ }
+ if ($recycle_clients) {
+ my $ret;
+ my $livejobs;
+ while (1) {
+ $ret = waitpid($pid, WNOHANG);
+ #print STDERR "waitpid $pid ret = $ret \n";
+ last if ($ret != 0);
+ $livejobs = numof_live_jobs();
+      if ($numnodes >= $livejobs ) { # a client terminated, or the number of input lines was less than -j
+ print STDERR "num of requested nodes = $numnodes; num of currently live jobs = $livejobs; Client terminated - launching another.\n";
+ launch_job();
+ } else {
+ sleep 15;
+ }
+ }
+ }
+ print STDERR "CHILD PROCESSES SPAWNED ... WAITING\n";
+ for my $p (@pids) {
+ waitpid($p, 0);
+ }
+} else {
+# my $todo = "$sentserver -k $key $multiflag $port ";
+ my $todo = "$sentserver -k $key $multiflag $port $stay_alive_flag ";
+ if ($verbose){ print STDERR "Running: $todo\n"; }
+ check_call($todo);
+ print STDERR "Call to $sentserver returned.\n";
+ cleanup();
+ exit(0);
+}
+
+sub numof_live_jobs {
+ if ($use_fork) {
+ die "not implemented";
+ } else {
+ # We can probably continue decoding if the qstat error is only temporary
+ my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat")));
+ return ($#livejobs + 1);
+ }
+}
+my (@errors,@outs,@cmds);
+
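+# Submit one client via qsub: the client script is written to qsub's
+# stdin and the job id is scraped from its stdout so the job can be
+# qdel'ed at cleanup time.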
+sub launch_job {
+ if ($use_fork) { return launch_job_fork(); }
+ my $errorfile = "/dev/null";
+ my $outfile = "/dev/null";
+ $node_count++;
+ my $clientname = $executable;
+ $clientname =~ s/^(.{4}).*$/$1/;
+ $clientname = "$clientname.$node_count";
+ if ($errordir){
+ $errorfile = "$errordir/$clientname.ER";
+ $outfile = "$errordir/$clientname.OU";
+ push @errors,$errorfile;
+ push @outs,$outfile;
+ }
+ my $todo = qsub_args($pmem) . " -N $clientname -o $outfile -e $errorfile";
+ push @cmds,$todo;
+
+ print STDERR "Running: $todo\n";
+ local(*QOUT, *QIN);
+ open2(\*QOUT, \*QIN, $todo) or die "Failed to open2: $!";
+ print QIN $script;
+ close QIN;
+ while (my $jobid=<QOUT>){
+ chomp $jobid;
+ if ($verbose){ print STDERR "Launched client job: $jobid"; }
+    $jobid =~ s/^(\d+)(.*?)$/$1/g;
+    $jobid =~ s/^Your job (\d+) .*$/$1/;
+ print STDERR " short job id $jobid\n";
+ if ($verbose){
+ print STDERR "cd: $abscwd\n";
+ print STDERR "cmd: $cmd\n";
+ }
+    if ($joblist eq "") { $joblist = $jobid; }
+ else {$joblist = $joblist . "\|" . $jobid; }
+ my $cleanfn="qdel $jobid 2> /dev/null";
+ push(@cleanup_cmds, $cleanfn);
+ }
+ close QOUT;
+}
+
+sub launch_job_fork {
+ my $errorfile = "/dev/null";
+ my $outfile = "/dev/null";
+ $node_count++;
+ my $clientname = $executable;
+ $clientname =~ s/^(.{4}).*$/$1/;
+ $clientname = "$clientname.$node_count";
+ if ($errordir){
+ $errorfile = "$errordir/$clientname.ER";
+ $outfile = "$errordir/$clientname.OU";
+ push @errors,$errorfile;
+ push @outs,$outfile;
+ }
+ my $pid = fork;
+ if ($pid == 0) {
+ my ($fh, $scr_name) = get_temp_script();
+ print $fh $script;
+ close $fh;
+ my $todo = "/bin/bash -xeo pipefail $scr_name 1> $outfile 2> $errorfile";
+ print STDERR "EXEC: $todo\n";
+ my $out = check_output("$todo");
+ unlink $scr_name or warn "Failed to remove $scr_name";
+ exit 0;
+ } else {
+ push @pids, $pid;
+ }
+}
+
+sub get_temp_script {
+ my ($fh, $filename) = tempfile( "$workdir/workXXXX", SUFFIX => '.sh');
+ return ($fh, $filename);
+}
+
+sub cleanup_and_die {
+ cleanup();
+ die "\n";
+}
+
+sub cleanup {
+ print STDERR "Cleaning up...\n";
+ for $cmd (@cleanup_cmds){
+ print STDERR " Cleanup command: $cmd\n";
+ eval $cmd;
+ }
+ print STDERR "outputs:\n",preview_files(\@outs,1),"\n";
+ print STDERR "errors:\n",preview_files(\@errors,1),"\n";
+ print STDERR "cmd:\n",$cmd,"\n";
+ print STDERR " cat $errordir/*.ER\nfor logs.\n";
+ print STDERR "Cleanup finished.\n";
+}
+
+sub print_help
+{
+ my $name = check_output("basename $0"); chomp $name;
+ print << "Help";
+
+usage: $name [options]
+
+ Automatic black-box parallelization of commands.
+
+options:
+
+ --use-fork
+ Instead of using qsub, use fork.
+
+ -e, --error-dir <dir>
+ Retain output files from jobs in <dir>, rather
+ than silently deleting them.
+
+ -m, --multi-line
+ Expect that command may produce multiple output
+ lines for a single input line. $name makes a
+ reasonable attempt to obtain all output before
+ processing additional inputs. However, use of this
+ option is inherently unsafe.
+
+ -v, --verbose
+    Print diagnostic information on stderr.
+
+ -j, --jobs
+ Number of jobs to use.
+
+ -p, --pmem
+ pmem setting for each job.
+
+Help
+}
diff --git a/training/risk.cc b/training/utils/risk.cc
index d5a12cfd..d5a12cfd 100644
--- a/training/risk.cc
+++ b/training/utils/risk.cc
diff --git a/training/risk.h b/training/utils/risk.h
index 2e8db0fb..2e8db0fb 100644
--- a/training/risk.h
+++ b/training/utils/risk.h
diff --git a/training/utils/sentclient.cc b/training/utils/sentclient.cc
new file mode 100644
index 00000000..91d994ab
--- /dev/null
+++ b/training/utils/sentclient.cc
@@ -0,0 +1,76 @@
+/* Copyright (c) 2001 by David Chiang. All rights reserved.*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <string.h>
+
+#include "sentserver.h"
+
+int main (int argc, char *argv[]) {
+ int sock, port;
+ char *s, *key;
+ struct hostent *hp;
+ struct sockaddr_in server;
+ int errors = 0;
+
+ if (argc < 3) {
+ fprintf(stderr, "Usage: sentclient host[:port[:key]] command [args ...]\n");
+ exit(1);
+ }
+
+ s = strchr(argv[1], ':');
+ key = NULL;
+
+ if (s == NULL) {
+ port = DEFAULT_PORT;
+ } else {
+ *s = '\0';
+ s+=1;
+    /* argv[1] is host:port[:key]; split off the optional key after the second ':' */
+ key = strchr(s, ':');
+ if (key != NULL){
+ *key = '\0';
+ key += 1;
+ }
+ port = atoi(s);
+ }
+
+ sock = socket(AF_INET, SOCK_STREAM, 0);
+
+ hp = gethostbyname(argv[1]);
+ if (hp == NULL) {
+ fprintf(stderr, "unknown host %s\n", argv[1]);
+ exit(1);
+ }
+
+ bzero((char *)&server, sizeof(server));
+ bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length);
+ server.sin_family = hp->h_addrtype;
+ server.sin_port = htons(port);
+
+ while (connect(sock, (struct sockaddr *)&server, sizeof(server)) < 0) {
+ perror("connect()");
+ sleep(1);
+ errors++;
+ if (errors > 5)
+ exit(1);
+ }
+
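+  /* Redirect stdin/stdout onto the socket: the exec'd command reads
+     sentences from the server and writes results straight back. */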
+ close(0);
+ close(1);
+ dup2(sock, 0);
+ dup2(sock, 1);
+
+ if (key != NULL){
+ write(1, key, strlen(key));
+ write(1, "\n", 1);
+ }
+
+  execvp(argv[2], argv+2);
+  /* execvp only returns on failure */
+  perror("execvp()");
+  return 1;
+}
diff --git a/training/utils/sentserver.cc b/training/utils/sentserver.cc
new file mode 100644
index 00000000..b425955f
--- /dev/null
+++ b/training/utils/sentserver.cc
@@ -0,0 +1,515 @@
+/* Copyright (c) 2001 by David Chiang. All rights reserved.*/
+
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <netinet/in.h>
+#include <sched.h>
+#include <pthread.h>
+#include <errno.h>
+
+#include "sentserver.h"
+
+#define MAX_CLIENTS 64
+
+struct clientinfo {
+ int s;
+ struct sockaddr_in sin;
+};
+
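+/* Singly linked work queue of input lines; ptail points at the last
+   node's next pointer (or at head when empty) so appends are O(1). */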
+struct line {
+ int id;
+ char *s;
+ int status;
+ struct line *next;
+} *head, **ptail;
+
+int n_sent = 0, n_received=0, n_flushed=0;
+
+#define STATUS_RUNNING 0
+#define STATUS_ABORTED 1
+#define STATUS_FINISHED 2
+
+pthread_mutex_t queue_mutex = PTHREAD_MUTEX_INITIALIZER;
+pthread_mutex_t clients_mutex = PTHREAD_MUTEX_INITIALIZER;
+pthread_mutex_t input_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+int n_clients = 0;
+int s;
+int expect_multiline_output = 0;
+int log_mutex = 0;
+int stay_alive = 0; /* don't panic and die with zero clients */
+
+void queue_finish(struct line *node, char *s, int fid);
+char * read_line(int fd, int multiline);
+void done (int code);
+
+struct line * queue_get(int fid) {
+ struct line *cur;
+ char *s, *synch;
+
+ if (log_mutex) fprintf(stderr, "Getting for data for fid %d\n", fid);
+ if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
+ pthread_mutex_lock(&queue_mutex);
+
+ /* First, check for aborted sentences. */
+
+ if (log_mutex) fprintf(stderr, " Checking queue for aborted jobs (fid %d)\n", fid);
+ for (cur = head; cur != NULL; cur = cur->next) {
+ if (cur->status == STATUS_ABORTED) {
+ cur->status = STATUS_RUNNING;
+
+ if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+ pthread_mutex_unlock(&queue_mutex);
+
+ return cur;
+ }
+ }
+ if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+ pthread_mutex_unlock(&queue_mutex);
+
+ /* Otherwise, read a new one. */
+ if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid);
+ if (log_mutex) fprintf(stderr, " Reading input for new data (fid %d)\n", fid);
+ pthread_mutex_lock(&input_mutex);
+ s = read_line(0,0);
+
+ while (s) {
+ if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
+ pthread_mutex_lock(&queue_mutex);
+ if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid);
+ pthread_mutex_unlock(&input_mutex);
+
+ cur = (line*)malloc(sizeof (struct line));
+ cur->id = n_sent;
+ cur->s = s;
+ cur->next = NULL;
+
+ *ptail = cur;
+ ptail = &cur->next;
+
+ n_sent++;
+
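+    /* A literal ===SYNCH=== line is marked finished immediately, so
+       it is flushed to stdout in order and acts as a barrier marker
+       in the output stream. */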
+ if (strcmp(s,"===SYNCH===\n")==0){
+ fprintf(stderr, "Received ===SYNCH=== signal (fid %d)\n", fid);
+ // Note: queue_finish calls free(cur->s).
+ // Therefore we need to create a new string here.
+ synch = (char*)malloc((strlen("===SYNCH===\n")+2) * sizeof (char));
+ synch = strcpy(synch, s);
+
+ if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+ pthread_mutex_unlock(&queue_mutex);
+ queue_finish(cur, synch, fid); /* handles its own lock */
+
+ if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid);
+ if (log_mutex) fprintf(stderr, " Reading input for new data (fid %d)\n", fid);
+ pthread_mutex_lock(&input_mutex);
+
+ s = read_line(0,0);
+ } else {
+ if (log_mutex) fprintf(stderr, " Received new data %d (fid %d)\n", cur->id, fid);
+ cur->status = STATUS_RUNNING;
+ if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+ pthread_mutex_unlock(&queue_mutex);
+ return cur;
+ }
+ }
+
+ if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid);
+ pthread_mutex_unlock(&input_mutex);
+ /* Only way to reach this point: no more output */
+
+ if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
+ pthread_mutex_lock(&queue_mutex);
+ if (head == NULL) {
+ fprintf(stderr, "Reached end of file. Exiting.\n");
+ done(0);
+ } else
+ ptail = NULL; /* This serves as a signal that there is no more input */
+ if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+ pthread_mutex_unlock(&queue_mutex);
+
+ return NULL;
+}
+
+void queue_panic() {
+ struct line *next;
+  while (head) {
+ /* Write out finished sentences */
+ if (head->status == STATUS_FINISHED) {
+ fputs(head->s, stdout);
+ fflush(stdout);
+ }
+ /* Write out blank line for unfinished sentences */
+ if (head->status == STATUS_ABORTED) {
+ fputs("\n", stdout);
+ fflush(stdout);
+ }
+    /* By definition, there cannot be any RUNNING sentences, since this
+       function is only called when n_clients == 0 */
+ free(head->s);
+ next = head->next;
+ free(head);
+ head = next;
+ n_flushed++;
+ }
+ fclose(stdout);
+ fprintf(stderr, "All clients died. Panicking, flushing completed sentences and exiting.\n");
+ done(1);
+}
+
+void queue_abort(struct line *node, int fid) {
+ if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
+ pthread_mutex_lock(&queue_mutex);
+ node->status = STATUS_ABORTED;
+ if (n_clients == 0) {
+ if (stay_alive) {
+ fprintf(stderr, "Warning! No live clients detected! Staying alive, will retry soon.\n");
+ } else {
+ queue_panic();
+ }
+ }
+ if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+ pthread_mutex_unlock(&queue_mutex);
+}
+
+
+void queue_print() {
+ struct line *cur;
+
+ fprintf(stderr, " Queue\n");
+
+ for (cur = head; cur != NULL; cur = cur->next) {
+ switch(cur->status) {
+ case STATUS_RUNNING:
+ fprintf(stderr, " %d running ", cur->id); break;
+ case STATUS_ABORTED:
+ fprintf(stderr, " %d aborted ", cur->id); break;
+ case STATUS_FINISHED:
+ fprintf(stderr, " %d finished ", cur->id); break;
+
+ }
+ fprintf(stderr, "\n");
+ //fprintf(stderr, cur->s);
+ }
+}
+
+void queue_finish(struct line *node, char *s, int fid) {
+ struct line *next;
+ if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid);
+ pthread_mutex_lock(&queue_mutex);
+
+ free(node->s);
+ node->s = s;
+ node->status = STATUS_FINISHED;
+ n_received++;
+
+ /* Flush out finished nodes */
+ while (head && head->status == STATUS_FINISHED) {
+
+ if (log_mutex) fprintf(stderr, " Flushing finished node %d\n", head->id);
+
+ fputs(head->s, stdout);
+ fflush(stdout);
+ if (log_mutex) fprintf(stderr, " Flushed node %d\n", head->id);
+ free(head->s);
+
+ next = head->next;
+ free(head);
+
+ head = next;
+
+ n_flushed++;
+
+ if (head == NULL) { /* empty queue */
+ if (ptail == NULL) { /* This can only happen if set in queue_get as signal that there is no more input. */
+ fprintf(stderr, "All sentences finished. Exiting.\n");
+ done(0);
+ } else /* ptail pointed at something which was just popped off the stack -- reset to head*/
+ ptail = &head;
+ }
+ }
+
+ if (log_mutex) fprintf(stderr, " Flushing output %d\n", head->id);
+ fflush(stdout);
+ fprintf(stderr, "%d sentences sent, %d sentences finished, %d sentences flushed\n", n_sent, n_received, n_flushed);
+
+ if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid);
+ pthread_mutex_unlock(&queue_mutex);
+
+}
+
+char * read_line(int fd, int multiline) {
+ int size = 80;
+ char errorbuf[100];
+ char *s = (char*)malloc(size+2);
+ int result, errors=0;
+ int i = 0;
+
+ result = read(fd, s+i, 1);
+
+ while (1) {
+ if (result < 0) {
+ perror("read()");
+ sprintf(errorbuf, "Error code: %d\n", errno);
+ fputs(errorbuf, stderr);
+ errors++;
+ if (errors > 5) {
+ free(s);
+ return NULL;
+ } else {
+ sleep(1); /* retry after delay */
+ }
+ } else if (result == 0) {
+ break;
+ } else if (multiline==0 && s[i] == '\n') {
+ break;
+ } else {
+ if (s[i] == '\n'){
+ /* if we've reached this point,
+ then multiline must be 1, and we're
+ going to poll the fd for an additional
+ line of data. The basic design is to
+ run a select on the filedescriptor fd.
+ Select will return under two conditions:
+ if there is data on the fd, or if a
+ timeout is reached. We'll select on this
+ fd. If select returns because there's data
+ ready, keep going; else assume there's no
+ more and return the data we already have.
+ */
+
+ fd_set set;
+ FD_ZERO(&set);
+ FD_SET(fd, &set);
+
+ struct timeval timeout;
+ timeout.tv_sec = 3; // number of seconds for timeout
+ timeout.tv_usec = 0;
+
+ int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout);
+ if (ready<1){
+ break; // no more data, stop looping
+ }
+ }
+ i++;
+
+ if (i == size) {
+ size = size*2;
+ s = (char*)realloc(s, size+2);
+ }
+ }
+
+ result = read(fd, s+i, 1);
+ }
+
+ if (result == 0 && i == 0) { /* end of file */
+ free(s);
+ return NULL;
+ }
+
+ s[i] = '\n';
+ s[i+1] = '\0';
+
+ return s;
+}
+
+void * new_client(void *arg) {
+ struct clientinfo *client = (struct clientinfo *)arg;
+ struct line *cur;
+ int result;
+ char *s;
+ char errorbuf[100];
+
+ pthread_mutex_lock(&clients_mutex);
+ n_clients++;
+ pthread_mutex_unlock(&clients_mutex);
+
+ fprintf(stderr, "Client connected (%d connected)\n", n_clients);
+
+ for (;;) {
+
+ cur = queue_get(client->s);
+
+ if (cur) {
+ /* fprintf(stderr, "Sending to client: %s", cur->s); */
+ fprintf(stderr, "Sending data %d to client (fid %d)\n", cur->id, client->s);
+ result = write(client->s, cur->s, strlen(cur->s));
+      if (result < 0 || (size_t)result < strlen(cur->s)){
+ perror("write()");
+ sprintf(errorbuf, "Error code: %d\n", errno);
+ fputs(errorbuf, stderr);
+
+ pthread_mutex_lock(&clients_mutex);
+ n_clients--;
+ pthread_mutex_unlock(&clients_mutex);
+
+ fprintf(stderr, "Client died (%d connected)\n", n_clients);
+ queue_abort(cur, client->s);
+
+ close(client->s);
+ free(client);
+
+ pthread_exit(NULL);
+ }
+ } else {
+ close(client->s);
+ pthread_mutex_lock(&clients_mutex);
+ n_clients--;
+ pthread_mutex_unlock(&clients_mutex);
+ fprintf(stderr, "Client dismissed (%d connected)\n", n_clients);
+ pthread_exit(NULL);
+ }
+
+ s = read_line(client->s,expect_multiline_output);
+ if (s) {
+ /* fprintf(stderr, "Client (fid %d) returned: %s", client->s, s); */
+ fprintf(stderr, "Client (fid %d) returned data %d\n", client->s, cur->id);
+// queue_print();
+ queue_finish(cur, s, client->s);
+ } else {
+ pthread_mutex_lock(&clients_mutex);
+ n_clients--;
+ pthread_mutex_unlock(&clients_mutex);
+
+ fprintf(stderr, "Client died (%d connected)\n", n_clients);
+ queue_abort(cur, client->s);
+
+ close(client->s);
+ free(client);
+
+ pthread_exit(NULL);
+ }
+
+ }
+ return 0;
+}
+
+void done (int code) {
+ close(s);
+ exit(code);
+}
+
+
+
+int main (int argc, char *argv[]) {
+ struct sockaddr_in sin, from;
+ int g;
+ socklen_t len;
+ struct clientinfo *client;
+ int port;
+ int opt;
+ int errors = 0;
+ int argi;
+ char *key = NULL, *client_key;
+ int use_key = 0;
+ /* the key stuff here doesn't provide any
+ real measure of security, it's mainly to keep
+ jobs from bumping into each other. */
+
+ pthread_t tid;
+ port = DEFAULT_PORT;
+
+ for (argi=1; argi < argc; argi++){
+ if (strcmp(argv[argi], "-m")==0){
+ expect_multiline_output = 1;
+ } else if (strcmp(argv[argi], "-k")==0){
+ argi++;
+ if (argi == argc){
+ fprintf(stderr, "Key must be specified after -k\n");
+ exit(1);
+ }
+ key = argv[argi];
+ use_key = 1;
+ } else if (strcmp(argv[argi], "--stay-alive")==0){
+      stay_alive = 1; /* don't panic and die with zero clients */
+ } else {
+ port = atoi(argv[argi]);
+ }
+ }
+
+ /* Initialize data structures */
+ head = NULL;
+ ptail = &head;
+
+ /* Set up listener */
+ s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+ opt = 1;
+ setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
+
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = htonl(INADDR_ANY);
+ sin.sin_port = htons(port);
+ while (bind(s, (struct sockaddr *) &sin, sizeof(sin)) < 0) {
+ perror("bind()");
+ sleep(1);
+ errors++;
+ if (errors > 100)
+ exit(1);
+ }
+
+ len = sizeof(sin);
+ getsockname(s, (struct sockaddr *) &sin, &len);
+
+ fprintf(stderr, "Listening on port %hu\n", ntohs(sin.sin_port));
+
+ while (listen(s, MAX_CLIENTS) < 0) {
+ perror("listen()");
+ sleep(1);
+ errors++;
+ if (errors > 100)
+ exit(1);
+ }
+
+ for (;;) {
+ len = sizeof(from);
+ g = accept(s, (struct sockaddr *)&from, &len);
+ if (g < 0) {
+ perror("accept()");
+ sleep(1);
+ continue;
+ }
+ client = (clientinfo*)malloc(sizeof(struct clientinfo));
+ client->s = g;
+ bcopy(&from, &client->sin, len);
+
+ if (use_key){
+ fd_set set;
+ FD_ZERO(&set);
+ FD_SET(client->s, &set);
+
+ struct timeval timeout;
+ timeout.tv_sec = 3; // number of seconds for timeout
+ timeout.tv_usec = 0;
+
+ int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout);
+ if (ready<1){
+ fprintf(stderr, "Prospective client failed to respond with correct key.\n");
+ close(client->s);
+ free(client);
+ } else {
+        client_key = read_line(client->s,0);
+        if (client_key == NULL) { /* client hung up before sending a key */
+          fprintf(stderr, "Prospective client disconnected before sending a key.\n");
+          close(client->s);
+          free(client);
+        } else {
+          client_key[strlen(client_key)-1]='\0'; /* chop trailing newline */
+          if (strcmp(key, client_key)==0){
+            pthread_create(&tid, NULL, new_client, client);
+          } else {
+            fprintf(stderr, "Prospective client failed to respond with correct key.\n");
+            close(client->s);
+            free(client);
+          }
+          free(client_key);
+        }
+ }
+ } else {
+ pthread_create(&tid, NULL, new_client, client);
+ }
+ }
+
+}
+
+
+
diff --git a/training/utils/sentserver.h b/training/utils/sentserver.h
new file mode 100644
index 00000000..cd17a546
--- /dev/null
+++ b/training/utils/sentserver.h
@@ -0,0 +1,6 @@
+#ifndef SENTSERVER_H
+#define SENTSERVER_H
+
+#define DEFAULT_PORT 50000
+
+#endif