-rw-r--r--  .gitignore | 28
-rw-r--r--  Makefile.am | 7
-rw-r--r--  configure.ac | 32
-rw-r--r--  dpmert/README.shared-mem | 9
-rw-r--r--  minrisk/Makefile.am | 6
-rw-r--r--  pro/README.shared-mem | 9
-rw-r--r--  training/Makefile.am | 100
-rwxr-xr-x  training/add-model1-features-to-scfg.pl | 93
-rw-r--r--  training/collapse_weights.cc | 110
-rw-r--r--  training/crf/Makefile.am | 27
-rw-r--r--  training/crf/cllh_observer.cc (renamed from training/cllh_observer.cc) | 0
-rw-r--r--  training/crf/cllh_observer.h (renamed from training/cllh_observer.h) | 0
-rw-r--r--  training/crf/mpi_batch_optimize.cc (renamed from training/mpi_batch_optimize.cc) | 0
-rw-r--r--  training/crf/mpi_compute_cllh.cc (renamed from training/mpi_compute_cllh.cc) | 0
-rw-r--r--  training/crf/mpi_extract_features.cc (renamed from training/mpi_extract_features.cc) | 0
-rw-r--r--  training/crf/mpi_extract_reachable.cc (renamed from training/mpi_extract_reachable.cc) | 0
-rw-r--r--  training/crf/mpi_flex_optimize.cc (renamed from training/mpi_flex_optimize.cc) | 0
-rw-r--r--  training/crf/mpi_online_optimize.cc (renamed from training/mpi_online_optimize.cc) | 0
-rwxr-xr-x  training/dep-reorder/conll2reordering-forest.pl | 65
-rw-r--r--  training/dep-reorder/george.conll | 4
-rwxr-xr-x  training/dep-reorder/scripts/conll2simplecfg.pl | 57
-rw-r--r--  training/dpmert/Makefile.am (renamed from dpmert/Makefile.am) | 10
-rw-r--r--  training/dpmert/ces.cc (renamed from dpmert/ces.cc) | 0
-rw-r--r--  training/dpmert/ces.h (renamed from dpmert/ces.h) | 0
-rwxr-xr-x  training/dpmert/divide_refs.py (renamed from dpmert/divide_refs.py) | 0
-rwxr-xr-x  training/dpmert/dpmert.pl (renamed from dpmert/dpmert.pl) | 17
-rw-r--r--  training/dpmert/error_surface.cc (renamed from dpmert/error_surface.cc) | 0
-rw-r--r--  training/dpmert/error_surface.h (renamed from dpmert/error_surface.h) | 0
-rwxr-xr-x  training/dpmert/line_mediator.pl (renamed from dpmert/line_mediator.pl) | 0
-rw-r--r--  training/dpmert/line_optimizer.cc (renamed from dpmert/line_optimizer.cc) | 0
-rw-r--r--  training/dpmert/line_optimizer.h (renamed from dpmert/line_optimizer.h) | 0
-rw-r--r--  training/dpmert/lo_test.cc (renamed from dpmert/lo_test.cc) | 0
-rw-r--r--  training/dpmert/mert_geometry.cc (renamed from dpmert/mert_geometry.cc) | 0
-rw-r--r--  training/dpmert/mert_geometry.h (renamed from dpmert/mert_geometry.h) | 0
-rw-r--r--  training/dpmert/mr_dpmert_generate_mapper_input.cc (renamed from dpmert/mr_dpmert_generate_mapper_input.cc) | 0
-rw-r--r--  training/dpmert/mr_dpmert_map.cc (renamed from dpmert/mr_dpmert_map.cc) | 0
-rw-r--r--  training/dpmert/mr_dpmert_reduce.cc (renamed from dpmert/mr_dpmert_reduce.cc) | 0
-rw-r--r--  training/dpmert/test_aer/README (renamed from dpmert/test_aer/README) | 0
-rw-r--r--  training/dpmert/test_aer/cdec.ini (renamed from dpmert/test_aer/cdec.ini) | 0
-rw-r--r--  training/dpmert/test_aer/corpus.src (renamed from dpmert/test_aer/corpus.src) | 0
-rw-r--r--  training/dpmert/test_aer/grammar (renamed from dpmert/test_aer/grammar) | 0
-rw-r--r--  training/dpmert/test_aer/ref.0 (renamed from dpmert/test_aer/ref.0) | 0
-rw-r--r--  training/dpmert/test_aer/weights (renamed from dpmert/test_aer/weights) | 0
-rw-r--r--  training/dpmert/test_data/0.json.gz (renamed from dpmert/test_data/0.json.gz) | bin 13709 -> 13709 bytes
-rw-r--r--  training/dpmert/test_data/1.json.gz (renamed from dpmert/test_data/1.json.gz) | bin 204803 -> 204803 bytes
-rw-r--r--  training/dpmert/test_data/c2e.txt.0 (renamed from dpmert/test_data/c2e.txt.0) | 0
-rw-r--r--  training/dpmert/test_data/c2e.txt.1 (renamed from dpmert/test_data/c2e.txt.1) | 0
-rw-r--r--  training/dpmert/test_data/c2e.txt.2 (renamed from dpmert/test_data/c2e.txt.2) | 0
-rw-r--r--  training/dpmert/test_data/c2e.txt.3 (renamed from dpmert/test_data/c2e.txt.3) | 0
-rw-r--r--  training/dpmert/test_data/re.txt.0 (renamed from dpmert/test_data/re.txt.0) | 0
-rw-r--r--  training/dpmert/test_data/re.txt.1 (renamed from dpmert/test_data/re.txt.1) | 0
-rw-r--r--  training/dpmert/test_data/re.txt.2 (renamed from dpmert/test_data/re.txt.2) | 0
-rw-r--r--  training/dpmert/test_data/re.txt.3 (renamed from dpmert/test_data/re.txt.3) | 0
-rw-r--r--  training/dtrain/Makefile.am (renamed from dtrain/Makefile.am) | 2
-rw-r--r--  training/dtrain/README.md (renamed from dtrain/README.md) | 0
-rw-r--r--  training/dtrain/dtrain.cc (renamed from dtrain/dtrain.cc) | 0
-rw-r--r--  training/dtrain/dtrain.h (renamed from dtrain/dtrain.h) | 0
-rwxr-xr-x  training/dtrain/hstreaming/avg.rb (renamed from dtrain/hstreaming/avg.rb) | 0
-rw-r--r--  training/dtrain/hstreaming/cdec.ini (renamed from dtrain/hstreaming/cdec.ini) | 0
-rw-r--r--  training/dtrain/hstreaming/dtrain.ini (renamed from dtrain/hstreaming/dtrain.ini) | 0
-rwxr-xr-x  training/dtrain/hstreaming/dtrain.sh (renamed from dtrain/hstreaming/dtrain.sh) | 0
-rwxr-xr-x  training/dtrain/hstreaming/hadoop-streaming-job.sh (renamed from dtrain/hstreaming/hadoop-streaming-job.sh) | 0
-rwxr-xr-x  training/dtrain/hstreaming/lplp.rb (renamed from dtrain/hstreaming/lplp.rb) | 0
-rw-r--r--  training/dtrain/hstreaming/red-test (renamed from dtrain/hstreaming/red-test) | 0
-rw-r--r--  training/dtrain/kbestget.h (renamed from dtrain/kbestget.h) | 0
-rw-r--r--  training/dtrain/ksampler.h (renamed from dtrain/ksampler.h) | 0
-rw-r--r--  training/dtrain/pairsampling.h (renamed from dtrain/pairsampling.h) | 0
-rwxr-xr-x  training/dtrain/parallelize.rb (renamed from dtrain/parallelize.rb) | 0
-rw-r--r--  training/dtrain/parallelize/test/cdec.ini (renamed from dtrain/parallelize/test/cdec.ini) | 0
-rw-r--r--  training/dtrain/parallelize/test/dtrain.ini (renamed from dtrain/parallelize/test/dtrain.ini) | 0
-rw-r--r--  training/dtrain/parallelize/test/in (renamed from dtrain/parallelize/test/in) | 0
-rw-r--r--  training/dtrain/parallelize/test/refs (renamed from dtrain/parallelize/test/refs) | 0
-rw-r--r--  training/dtrain/score.cc (renamed from dtrain/score.cc) | 0
-rw-r--r--  training/dtrain/score.h (renamed from dtrain/score.h) | 0
-rw-r--r--  training/dtrain/test/example/README (renamed from dtrain/test/example/README) | 0
-rw-r--r--  training/dtrain/test/example/cdec.ini (renamed from dtrain/test/example/cdec.ini) | 0
-rw-r--r--  training/dtrain/test/example/dtrain.ini (renamed from dtrain/test/example/dtrain.ini) | 0
-rw-r--r--  training/dtrain/test/example/expected-output (renamed from dtrain/test/example/expected-output) | 0
-rw-r--r--  training/dtrain/test/parallelize/cdec.ini (renamed from dtrain/test/parallelize/cdec.ini) | 0
-rw-r--r--  training/dtrain/test/parallelize/dtrain.ini (renamed from dtrain/test/parallelize/dtrain.ini) | 0
-rw-r--r--  training/dtrain/test/parallelize/in (renamed from dtrain/test/parallelize/in) | 0
-rw-r--r--  training/dtrain/test/parallelize/refs (renamed from dtrain/test/parallelize/refs) | 0
-rw-r--r--  training/dtrain/test/toy/cdec.ini (renamed from dtrain/test/toy/cdec.ini) | 0
-rw-r--r--  training/dtrain/test/toy/dtrain.ini (renamed from dtrain/test/toy/dtrain.ini) | 0
-rw-r--r--  training/dtrain/test/toy/input (renamed from dtrain/test/toy/input) | 0
-rw-r--r--  training/feature_expectations.cc | 232
-rw-r--r--  training/lbl_model.cc | 421
-rw-r--r--  training/minrisk/Makefile.am | 6
-rwxr-xr-x  training/minrisk/minrisk.pl (renamed from minrisk/minrisk.pl) | 20
-rwxr-xr-x  training/minrisk/minrisk_generate_input.pl (renamed from minrisk/minrisk_generate_input.pl) | 0
-rw-r--r--  training/minrisk/minrisk_optimize.cc (renamed from minrisk/minrisk_optimize.cc) | 0
-rw-r--r--  training/mira/Makefile.am (renamed from mira/Makefile.am) | 2
-rw-r--r--  training/mira/kbest_mira.cc (renamed from mira/kbest_mira.cc) | 0
-rw-r--r--  training/mpi_em_optimize.cc | 389
-rw-r--r--  training/mr_em_adapted_reduce.cc | 173
-rw-r--r--  training/mr_em_map_adapter.cc | 160
-rw-r--r--  training/mr_optimize_reduce.cc | 231
-rw-r--r--  training/mr_reduce_to_weights.cc | 109
-rw-r--r--  training/pro/Makefile.am (renamed from pro/Makefile.am) | 4
-rwxr-xr-x  training/pro/mr_pro_generate_mapper_input.pl (renamed from pro/mr_pro_generate_mapper_input.pl) | 0
-rw-r--r--  training/pro/mr_pro_map.cc (renamed from pro/mr_pro_map.cc) | 0
-rw-r--r--  training/pro/mr_pro_reduce.cc (renamed from pro/mr_pro_reduce.cc) | 0
-rwxr-xr-x  training/pro/pro.pl (renamed from pro/pro.pl) | 20
-rw-r--r--  training/rampion/Makefile.am (renamed from rampion/Makefile.am) | 4
-rwxr-xr-x  training/rampion/rampion.pl (renamed from rampion/rampion.pl) | 20
-rw-r--r--  training/rampion/rampion_cccp.cc (renamed from rampion/rampion_cccp.cc) | 0
-rwxr-xr-x  training/rampion/rampion_generate_input.pl (renamed from rampion/rampion_generate_input.pl) | 0
-rw-r--r--  training/utils/candidate_set.cc (renamed from training/candidate_set.cc) | 0
-rw-r--r--  training/utils/candidate_set.h (renamed from training/candidate_set.h) | 0
-rwxr-xr-x  training/utils/decode-and-evaluate.pl (renamed from dpmert/decode-and-evaluate.pl) | 8
-rw-r--r--  training/utils/entropy.cc (renamed from training/entropy.cc) | 0
-rw-r--r--  training/utils/entropy.h (renamed from training/entropy.h) | 0
-rw-r--r--  training/utils/grammar_convert.cc (renamed from training/grammar_convert.cc) | 0
-rw-r--r--  training/utils/lbfgs.h (renamed from training/lbfgs.h) | 0
-rw-r--r--  training/utils/lbfgs_test.cc (renamed from training/lbfgs_test.cc) | 0
-rwxr-xr-x  training/utils/libcall.pl (renamed from dpmert/libcall.pl) | 0
-rw-r--r--  training/utils/online_optimizer.cc (renamed from training/online_optimizer.cc) | 0
-rw-r--r--  training/utils/online_optimizer.h (renamed from training/online_optimizer.h) | 0
-rw-r--r--  training/utils/optimize.cc (renamed from training/optimize.cc) | 0
-rw-r--r--  training/utils/optimize.h (renamed from training/optimize.h) | 0
-rw-r--r--  training/utils/optimize_test.cc (renamed from training/optimize_test.cc) | 0
-rwxr-xr-x  training/utils/parallelize.pl (renamed from dpmert/parallelize.pl) | 2
-rw-r--r--  training/utils/risk.cc (renamed from training/risk.cc) | 0
-rw-r--r--  training/utils/risk.h (renamed from training/risk.h) | 0
-rw-r--r--  training/utils/sentclient.c (renamed from dpmert/sentclient.c) | 0
-rw-r--r--  training/utils/sentserver.c (renamed from dpmert/sentserver.c) | 0
-rw-r--r--  training/utils/sentserver.h (renamed from dpmert/sentserver.h) | 0
-rw-r--r--  word-aligner/Makefile.am | 6
-rw-r--r--  word-aligner/fast_align.cc (renamed from training/fast_align.cc) | 0
-rw-r--r--  word-aligner/makefiles/makefile.grammars | 2
-rwxr-xr-x  word-aligner/paste-parallel-files.pl | 35
-rw-r--r--  word-aligner/ttables.cc (renamed from training/ttables.cc) | 0
-rw-r--r--  word-aligner/ttables.h (renamed from training/ttables.h) | 0
133 files changed, 149 insertions, 2271 deletions
diff --git a/.gitignore b/.gitignore
index aa2e64eb..c6023822 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+example_extff/ff_example.lo
+example_extff/libff_example.la
+mteval/meteor_jar.cc
*.a
*.aux
*.bbl
@@ -176,4 +179,27 @@ utils/reconstruct_weights
utils/small_vector_test
utils/ts
utils/weights_test
-utils/unigram_pyp_lm
+training/crf/mpi_batch_optimize
+training/crf/mpi_compute_cllh
+training/crf/mpi_extract_features
+training/crf/mpi_extract_reachable
+training/crf/mpi_flex_optimize
+training/crf/mpi_online_optimize
+training/dpmert/lo_test
+training/dpmert/mr_dpmert_generate_mapper_input
+training/dpmert/mr_dpmert_map
+training/dpmert/mr_dpmert_reduce
+training/dpmert/sentclient
+training/dpmert/sentserver
+training/dtrain/dtrain
+training/minrisk/minrisk_optimize
+training/mira/kbest_mira
+training/pro/mr_pro_map
+training/pro/mr_pro_reduce
+training/rampion/rampion_cccp
+training/utils/Makefile.am
+training/utils/lbfgs_test
+training/utils/optimize_test
+training/utils/sentclient
+training/utils/sentserver
+word-aligner/fast_align
diff --git a/Makefile.am b/Makefile.am
index 7ca7268a..dbf604a1 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -10,12 +10,7 @@ SUBDIRS = \
decoder \
training \
training/liblbfgs \
- mira \
- dtrain \
- dpmert \
- pro \
- rampion \
- minrisk \
+ word-aligner \
example_extff
#gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava
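
[Note: after this change the top-level build recurses into a single training/ tree (plus word-aligner/); the individual tool directories are registered by the new training/Makefile.am later in this patch. A minimal sketch of building one relocated subtree, using only standard automake recursion rather than anything cdec-specific:

    ./configure
    make -C training           # descends into utils, crf, dpmert, pro, ...
    make -C training/dpmert    # or build a single relocated tool directly
]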
diff --git a/configure.ac b/configure.ac
index 09fc5c5b..366112a3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -82,26 +82,34 @@ AC_PROG_INSTALL
CPPFLAGS="-DPIC -fPIC $CPPFLAGS -DHAVE_CONFIG_H"
+# core cdec stuff
AC_CONFIG_FILES([Makefile])
AC_CONFIG_FILES([utils/Makefile])
AC_CONFIG_FILES([mteval/Makefile])
+AC_CONFIG_FILES([mteval/meteor_jar.cc])
AC_CONFIG_FILES([decoder/Makefile])
-AC_CONFIG_FILES([training/Makefile])
-AC_CONFIG_FILES([training/liblbfgs/Makefile])
-AC_CONFIG_FILES([dpmert/Makefile])
-AC_CONFIG_FILES([pro/Makefile])
-AC_CONFIG_FILES([rampion/Makefile])
-AC_CONFIG_FILES([minrisk/Makefile])
+AC_CONFIG_FILES([python/setup.py])
+AC_CONFIG_FILES([word-aligner/Makefile])
+
+# KenLM stuff
AC_CONFIG_FILES([klm/util/Makefile])
AC_CONFIG_FILES([klm/lm/Makefile])
AC_CONFIG_FILES([klm/search/Makefile])
-AC_CONFIG_FILES([mira/Makefile])
-AC_CONFIG_FILES([dtrain/Makefile])
-AC_CONFIG_FILES([example_extff/Makefile])
-AC_CONFIG_FILES([mteval/meteor_jar.cc])
-
-AC_CONFIG_FILES([python/setup.py])
+# training stuff
+AC_CONFIG_FILES([training/Makefile])
+AC_CONFIG_FILES([training/utils/Makefile])
+AC_CONFIG_FILES([training/liblbfgs/Makefile])
+AC_CONFIG_FILES([training/crf/Makefile])
+AC_CONFIG_FILES([training/dpmert/Makefile])
+AC_CONFIG_FILES([training/pro/Makefile])
+AC_CONFIG_FILES([training/rampion/Makefile])
+AC_CONFIG_FILES([training/minrisk/Makefile])
+AC_CONFIG_FILES([training/mira/Makefile])
+AC_CONFIG_FILES([training/dtrain/Makefile])
+
+# external feature function example code
+AC_CONFIG_FILES([example_extff/Makefile])
AC_OUTPUT
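
[Note: each relocated directory must have its Makefile registered via AC_CONFIG_FILES, as above, before it can be built. A sketch of the standard autotools regeneration this reorganization requires (generic workflow, not specific to this patch):

    autoreconf -i    # rerun aclocal/autoconf/automake to emit the new Makefile.in files
    ./configure      # instantiates training/*/Makefile from AC_CONFIG_FILES
    make
]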
diff --git a/dpmert/README.shared-mem b/dpmert/README.shared-mem
deleted file mode 100644
index 7728efc0..00000000
--- a/dpmert/README.shared-mem
+++ /dev/null
@@ -1,9 +0,0 @@
-If you want to run dist-vest.pl on a very large shared memory machine, do the
-following:
-
- ./dist-vest.pl --use-make I --decode-nodes J --weights weights.init --source-file=dev.src --ref-files=dev.ref.* cdec.ini
-
-This will use I jobs for doing the line search and J jobs to run the decoder. Typically, since the
-decoder must load grammars, language models, etc., J should be smaller than I, but this will depend
-on the system you are running on and the complexity of the models used for decoding.
-
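
[Note: for concreteness, a hypothetical instantiation of the deleted README's recipe; the job counts 16 and 4 are illustrative values, not from the source:

    ./dist-vest.pl --use-make 16 --decode-nodes 4 --weights weights.init \
        --source-file=dev.src --ref-files=dev.ref.* cdec.ini

Here 16 processes run the line search while 4 run the decoder, following the README's guidance that the decoder job count J should stay below the line-search count I.]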
diff --git a/minrisk/Makefile.am b/minrisk/Makefile.am
deleted file mode 100644
index a24f047c..00000000
--- a/minrisk/Makefile.am
+++ /dev/null
@@ -1,6 +0,0 @@
-bin_PROGRAMS = minrisk_optimize
-
-minrisk_optimize_SOURCES = minrisk_optimize.cc
-minrisk_optimize_LDADD = $(top_srcdir)/training/libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/training/liblbfgs/liblbfgs.a -lz
-
-AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training
diff --git a/pro/README.shared-mem b/pro/README.shared-mem
deleted file mode 100644
index 7728efc0..00000000
--- a/pro/README.shared-mem
+++ /dev/null
@@ -1,9 +0,0 @@
-If you want to run dist-vest.pl on a very large shared memory machine, do the
-following:
-
- ./dist-vest.pl --use-make I --decode-nodes J --weights weights.init --source-file=dev.src --ref-files=dev.ref.* cdec.ini
-
-This will use I jobs for doing the line search and J jobs to run the decoder. Typically, since the
-decoder must load grammars, language models, etc., J should be smaller than I, but this will depend
-on the system you are running on and the complexity of the models used for decoding.
-
diff --git a/training/Makefile.am b/training/Makefile.am
index f9c25391..e95e045f 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -1,91 +1,11 @@
-bin_PROGRAMS = \
- fast_align \
- lbl_model \
- test_ngram \
- mr_em_map_adapter \
- mr_em_adapted_reduce \
- mr_reduce_to_weights \
- mr_optimize_reduce \
- grammar_convert \
- plftools \
- collapse_weights \
- mpi_extract_reachable \
- mpi_extract_features \
- mpi_online_optimize \
- mpi_flex_optimize \
- mpi_batch_optimize \
- mpi_compute_cllh \
- augment_grammar
+SUBDIRS = \
+ liblbfgs \
+ utils \
+ crf \
+ minrisk \
+ dpmert \
+ pro \
+ dtrain \
+ mira \
+ rampion
-noinst_PROGRAMS = \
- lbfgs_test \
- optimize_test
-
-TESTS = lbfgs_test optimize_test
-
-noinst_LIBRARIES = libtraining.a
-libtraining_a_SOURCES = \
- candidate_set.cc \
- entropy.cc \
- optimize.cc \
- online_optimizer.cc \
- risk.cc
-
-mpi_online_optimize_SOURCES = mpi_online_optimize.cc
-mpi_online_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-mpi_flex_optimize_SOURCES = mpi_flex_optimize.cc
-mpi_flex_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-mpi_extract_reachable_SOURCES = mpi_extract_reachable.cc
-mpi_extract_reachable_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-mpi_extract_features_SOURCES = mpi_extract_features.cc
-mpi_extract_features_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc cllh_observer.cc
-mpi_batch_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-mpi_compute_cllh_SOURCES = mpi_compute_cllh.cc cllh_observer.cc
-mpi_compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-augment_grammar_SOURCES = augment_grammar.cc
-augment_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-test_ngram_SOURCES = test_ngram.cc
-test_ngram_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
-fast_align_SOURCES = fast_align.cc ttables.cc
-fast_align_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz
-
-lbl_model_SOURCES = lbl_model.cc
-lbl_model_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz
-
-grammar_convert_SOURCES = grammar_convert.cc
-grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz
-
-optimize_test_SOURCES = optimize_test.cc
-optimize_test_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz
-
-collapse_weights_SOURCES = collapse_weights.cc
-collapse_weights_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz
-
-lbfgs_test_SOURCES = lbfgs_test.cc
-lbfgs_test_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz
-
-mr_optimize_reduce_SOURCES = mr_optimize_reduce.cc
-mr_optimize_reduce_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz
-
-mr_em_map_adapter_SOURCES = mr_em_map_adapter.cc
-mr_em_map_adapter_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz
-
-mr_reduce_to_weights_SOURCES = mr_reduce_to_weights.cc
-mr_reduce_to_weights_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz
-
-mr_em_adapted_reduce_SOURCES = mr_em_adapted_reduce.cc
-mr_em_adapted_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz
-
-plftools_SOURCES = plftools.cc
-plftools_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz
-
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/utils -I$(top_srcdir)/mteval -I../klm
diff --git a/training/add-model1-features-to-scfg.pl b/training/add-model1-features-to-scfg.pl
deleted file mode 100755
index a0074317..00000000
--- a/training/add-model1-features-to-scfg.pl
+++ /dev/null
@@ -1,93 +0,0 @@
-#!/usr/bin/perl -w
-
-# [X] ||| so [X,1] die [X,2] der ||| as [X,1] existing [X,2] the ||| 2.47712135315 2.53182387352 5.07100057602 ||| 0-0 2-2 4-4
-# [X] ||| so [X,1] die [X,2] der ||| this [X,1] the [X,2] of ||| 2.47712135315 3.19828724861 2.38270020485 ||| 0-0 2-2 4-4
-# [X] ||| so [X,1] die [X,2] der ||| as [X,1] the [X,2] the ||| 2.47712135315 2.53182387352 1.48463630676 ||| 0-0 2-2 4-4
-# [X] ||| so [X,1] die [X,2] der ||| is [X,1] the [X,2] of the ||| 2.47712135315 3.45197868347 2.64251494408 ||| 0-0 2-2 4-4 4-5
-
-die "Usage: $0 model1.f-e model1.e-f < grammar.scfg\n (use trianing/model1 to extract the model files)\n" unless scalar @ARGV == 2;
-
-my $fm1 = shift @ARGV;
-die unless $fm1;
-my $frm1 = shift @ARGV;
-die unless $frm1;
-open M1,"<$fm1" or die;
-open RM1,"<$frm1" or die;
-print STDERR "Loading Model 1 probs from $fm1...\n";
-my %m1;
-while(<M1>) {
- chomp;
- my ($f, $e, $lp) = split /\s+/;
- $m1{$e}->{$f} = exp($lp);
-}
-close M1;
-
-print STDERR "Loading Inverse Model 1 probs from $frm1...\n";
-my %rm1;
-while(<RM1>) {
- chomp;
- my ($e, $f, $lp) = split /\s+/;
- $rm1{$f}->{$e} = exp($lp);
-}
-close RM1;
-
-my @label = qw( EGivenF LexFGivenE LexEGivenF );
-while(<>) {
- chomp;
- my ($l, $f, $e, $sscores, $al) = split / \|\|\| /;
- my @scores = split /\s+/, $sscores;
- unless ($sscores =~ /=/) {
- for (my $i=0; $i<3; $i++) { $scores[$i] = "$label[$i]=$scores[$i]"; }
- }
- push @scores, "RuleCount=1";
- my @fs = split /\s+/, $f;
- my @es = split /\s+/, $e;
- my $flen = scalar @fs;
- my $elen = scalar @es;
- my $pgen = 0;
- my $nongen = 0;
- for (my $i =0; $i < $flen; $i++) {
- my $ftot = 0;
- next if ($fs[$i] =~ /\[X/);
- my $cr = $rm1{$fs[$i]};
- for (my $j=0; $j <= $elen; $j++) {
- my $ej = '<eps>';
- if ($j < $elen) { $ej = $es[$j]; }
- my $p = $cr->{$ej};
- if (defined $p) { $ftot += $p; }
- }
- if ($ftot == 0) { $nongen = 1; last; }
- $pgen += log($ftot) - log($elen);
- }
- my $bad = 0;
- my $good = 0;
- unless ($nongen) { push @scores, "RGood=1"; $good++; } else { push @scores, "RBad=1"; $bad++; }
-
- $nongen = 0;
- $pgen = 0;
- for (my $i =0; $i < $elen; $i++) {
- my $etot = 0;
- next if ($es[$i] =~ /\[X/);
- my $cr = $m1{$es[$i]};
-# print STDERR "$es[$i]\n";
- for (my $j=0; $j <= $flen; $j++) {
- my $fj = '<eps>';
- if ($j < $flen) { $fj = $fs[$j]; }
- my $p = $cr->{$fj};
-# print STDERR " $fs[$j] : $p\n";
- if (defined $p) { $etot += $p; }
- }
- if ($etot == 0) { $nongen = 1; last; }
- $pgen += log($etot) - log($flen);
- }
- unless ($nongen) {
- push @scores, "FGood=1";
- if ($good) { push @scores, "BothGood=1"; } else { push @scores, "SusDel=1"; }
- } else {
- push @scores, "FBad=1";
- if ($bad) { push @scores, "BothBad=1"; } else { push @scores, "SusHall=1"; }
- }
- print "$l ||| $f ||| $e ||| @scores";
- if (defined $al) { print " ||| $al\n"; } else { print "\n"; }
-}
-
diff --git a/training/collapse_weights.cc b/training/collapse_weights.cc
deleted file mode 100644
index c03eb031..00000000
--- a/training/collapse_weights.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-char const* NOTES =
- "ZF_and_E means unnormalized scaled features.\n"
- "For grammars with one nonterminal: F_and_E is joint,\n"
- "F_given_E and E_given_F are conditional.\n"
- "TODO: group rules by root nonterminal and then normalize.\n";
-
-
-#include <iostream>
-#include <fstream>
-#include <tr1/unordered_map>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-#include <boost/functional/hash.hpp>
-
-#include "prob.h"
-#include "filelib.h"
-#include "trule.h"
-#include "weights.h"
-
-namespace po = boost::program_options;
-using namespace std;
-
-typedef std::tr1::unordered_map<vector<WordID>, prob_t, boost::hash<vector<WordID> > > MarginalMap;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("grammar,g", po::value<string>(), "Grammar file")
- ("weights,w", po::value<string>(), "Weights file")
- ("unnormalized,u", "Always include ZF_and_E unnormalized score (default: only if sum was >1)")
- ;
- po::options_description clo("Command line options");
- clo.add_options()
- ("config,c", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- const string cfg = (*conf)["config"].as<string>();
- cerr << "Configuration file: " << cfg << endl;
- ifstream config(cfg.c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || !conf->count("grammar") || !conf->count("weights")) {
- cerr << dcmdline_options << endl;
- cerr << NOTES << endl;
- exit(1);
- }
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
- const string wfile = conf["weights"].as<string>();
- const string gfile = conf["grammar"].as<string>();
- vector<weight_t> w;
- Weights::InitFromFile(wfile, &w);
- MarginalMap e_tots;
- MarginalMap f_tots;
- prob_t tot;
- {
- ReadFile rf(gfile);
- assert(*rf.stream());
- istream& in = *rf.stream();
- cerr << "Computing marginals...\n";
- int lc = 0;
- while(in) {
- string line;
- getline(in, line);
- ++lc;
- if (line.empty()) continue;
- TRule tr(line, true);
- if (tr.GetFeatureValues().empty())
- cerr << "Line " << lc << ": empty features - may introduce bias\n";
- prob_t prob;
- prob.logeq(tr.GetFeatureValues().dot(w));
- e_tots[tr.e_] += prob;
- f_tots[tr.f_] += prob;
- tot += prob;
- }
- }
- bool normalized = (fabs(log(tot)) < 0.001);
- cerr << "Total: " << tot << (normalized ? " [normalized]" : " [scaled]") << endl;
- ReadFile rf(gfile);
- istream&in = *rf.stream();
- while(in) {
- string line;
- getline(in, line);
- if (line.empty()) continue;
- TRule tr(line, true);
- const double lp = tr.GetFeatureValues().dot(w);
- if (std::isinf(lp)) { continue; }
- tr.scores_.clear();
-
- cout << tr.AsString() << " ||| F_and_E=" << lp - log(tot);
- if (!normalized || conf.count("unnormalized")) {
- cout << ";ZF_and_E=" << lp;
- }
- cout << ";F_given_E=" << lp - log(e_tots[tr.e_])
- << ";E_given_F=" << lp - log(f_tots[tr.f_]) << endl;
- }
- return 0;
-}
-
diff --git a/training/crf/Makefile.am b/training/crf/Makefile.am
new file mode 100644
index 00000000..d203df25
--- /dev/null
+++ b/training/crf/Makefile.am
@@ -0,0 +1,27 @@
+bin_PROGRAMS = \
+ mpi_batch_optimize \
+ mpi_compute_cllh \
+ mpi_extract_features \
+ mpi_extract_reachable \
+ mpi_flex_optimize \
+ mpi_online_optimize
+
+mpi_online_optimize_SOURCES = mpi_online_optimize.cc
+mpi_online_optimize_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
+
+mpi_flex_optimize_SOURCES = mpi_flex_optimize.cc
+mpi_flex_optimize_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
+
+mpi_extract_reachable_SOURCES = mpi_extract_reachable.cc
+mpi_extract_reachable_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
+
+mpi_extract_features_SOURCES = mpi_extract_features.cc
+mpi_extract_features_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
+
+mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc cllh_observer.cc
+mpi_batch_optimize_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
+
+mpi_compute_cllh_SOURCES = mpi_compute_cllh.cc cllh_observer.cc
+mpi_compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
+
+AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wall -Wno-sign-compare -I$(top_srcdir)/training -I$(top_srcdir)/training/utils -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
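
[Note: the LDADD lines above link each relocated CRF trainer against the new training/utils/libtraining_utils.a plus the decoder/KenLM stack. These are MPI programs, so a hedged launch sketch (mpirun is standard MPI; the process count is arbitrary, and the --help flag is an assumption based on the Boost program_options convention used elsewhere in this tree, not shown in this patch):

    make -C training/crf
    mpirun -np 8 training/crf/mpi_batch_optimize --help
]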
diff --git a/training/cllh_observer.cc b/training/crf/cllh_observer.cc
index 4ec2fa65..4ec2fa65 100644
--- a/training/cllh_observer.cc
+++ b/training/crf/cllh_observer.cc
diff --git a/training/cllh_observer.h b/training/crf/cllh_observer.h
index 0de47331..0de47331 100644
--- a/training/cllh_observer.h
+++ b/training/crf/cllh_observer.h
diff --git a/training/mpi_batch_optimize.cc b/training/crf/mpi_batch_optimize.cc
index 2eff07e4..2eff07e4 100644
--- a/training/mpi_batch_optimize.cc
+++ b/training/crf/mpi_batch_optimize.cc
diff --git a/training/mpi_compute_cllh.cc b/training/crf/mpi_compute_cllh.cc
index 066389d0..066389d0 100644
--- a/training/mpi_compute_cllh.cc
+++ b/training/crf/mpi_compute_cllh.cc
diff --git a/training/mpi_extract_features.cc b/training/crf/mpi_extract_features.cc
index 6750aa15..6750aa15 100644
--- a/training/mpi_extract_features.cc
+++ b/training/crf/mpi_extract_features.cc
diff --git a/training/mpi_extract_reachable.cc b/training/crf/mpi_extract_reachable.cc
index 2a7c2b9d..2a7c2b9d 100644
--- a/training/mpi_extract_reachable.cc
+++ b/training/crf/mpi_extract_reachable.cc
diff --git a/training/mpi_flex_optimize.cc b/training/crf/mpi_flex_optimize.cc
index b52decdc..b52decdc 100644
--- a/training/mpi_flex_optimize.cc
+++ b/training/crf/mpi_flex_optimize.cc
diff --git a/training/mpi_online_optimize.cc b/training/crf/mpi_online_optimize.cc
index d6968848..d6968848 100644
--- a/training/mpi_online_optimize.cc
+++ b/training/crf/mpi_online_optimize.cc
diff --git a/training/dep-reorder/conll2reordering-forest.pl b/training/dep-reorder/conll2reordering-forest.pl
deleted file mode 100755
index 3cd226be..00000000
--- a/training/dep-reorder/conll2reordering-forest.pl
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-my $script_dir; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
-my $FIRST_CONV = "$script_dir/scripts/conll2simplecfg.pl";
-my $CDEC = "$script_dir/../../decoder/cdec";
-
-our $tfile1 = "grammar1.$$";
-our $tfile2 = "text.$$";
-
-die "Usage: $0 parses.conll\n" unless scalar @ARGV == 1;
-open C, "<$ARGV[0]" or die "Can't read $ARGV[0]: $!";
-
-END { unlink $tfile1; unlink "$tfile1.cfg"; unlink $tfile2; }
-
-my $first = 1;
-open T, ">$tfile1" or die "Can't write $tfile1: $!";
-my $lc = 0;
-my $flag = 0;
-my @words = ();
-while(<C>) {
- print T;
- chomp;
- if (/^$/) {
- if ($first) { $first = undef; } else { if ($flag) { print "\n"; $flag = 0; } }
- $first = undef;
- close T;
- open SO, ">$tfile2" or die "Can't write $tfile2: $!";
- print SO "@words\n";
- close SO;
- @words=();
- `$FIRST_CONV < $tfile1 > $tfile1.cfg`;
- if ($? != 0) {
- die "Error code: $?";
- }
- my $cfg = `$CDEC -n -S 10000 -f scfg -g $tfile1.cfg -i $tfile2 --show_cfg_search_space 2>/dev/null`;
- if ($? != 0) {
- die "Error code: $?";
- }
- my @rules = split /\n/, $cfg;
- shift @rules; # get rid of output
- for my $rule (@rules) {
- my ($lhs, $f, $e, $feats) = split / \|\|\| /, $rule;
- $f =~ s/,\d\]/\]/g;
- $feats = 'TOP=1' unless $feats;
- if ($lhs =~ /\[Goal_\d+\]/) { $lhs = '[S]'; }
- print "$lhs ||| $f ||| $feats\n";
- if ($e eq '[1] [2]') {
- my ($a, $b) = split /\s+/, $f;
- $feats =~ s/=1$//;
- my ($x, $y) = split /_/, $feats;
- print "$lhs ||| $b $a ||| ${y}_$x=1\n";
- }
- $flag = 1;
- }
- open T, ">$tfile1" or die "Can't write $tfile1: $!";
- $lc = -1;
- } else {
- my ($ind, $word, @dmmy) = split /\s+/;
- push @words, $word;
- }
- $lc++;
-}
-close T;
-
diff --git a/training/dep-reorder/george.conll b/training/dep-reorder/george.conll
deleted file mode 100644
index 7eebb360..00000000
--- a/training/dep-reorder/george.conll
+++ /dev/null
@@ -1,4 +0,0 @@
-1 George _ GEORGE _ _ 2 X _ _
-2 hates _ HATES _ _ 0 X _ _
-3 broccoli _ BROC _ _ 2 X _ _
-
diff --git a/training/dep-reorder/scripts/conll2simplecfg.pl b/training/dep-reorder/scripts/conll2simplecfg.pl
deleted file mode 100755
index b101347a..00000000
--- a/training/dep-reorder/scripts/conll2simplecfg.pl
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-# 1 在 _ 10 _ _ 4 X _ _
-# 2 门厅 _ 3 _ _ 1 X _ _
-# 3 下面 _ 23 _ _ 4 X _ _
-# 4 。 _ 45 _ _ 0 X _ _
-
-my @ldeps;
-my @rdeps;
-@ldeps=(); for (my $i =0; $i <1000; $i++) { push @ldeps, []; }
-@rdeps=(); for (my $i =0; $i <1000; $i++) { push @rdeps, []; }
-my $rootcat = 0;
-my @cats = ('S');
-my $len = 0;
-my @noposcats = ('S');
-while(<>) {
- chomp;
- if (/^\s*$/) {
- write_cfg($len);
- $len = 0;
- @cats=('S');
- @noposcats = ('S');
- @ldeps=(); for (my $i =0; $i <1000; $i++) { push @ldeps, []; }
- @rdeps=(); for (my $i =0; $i <1000; $i++) { push @rdeps, []; }
- next;
- }
- $len++;
- my ($pos, $word, $d1, $xcat, $d2, $d3, $headpos, $deptype) = split /\s+/;
- my $cat = "C$xcat";
- my $catpos = $cat . "_$pos";
- push @cats, $catpos;
- push @noposcats, $cat;
- print "[$catpos] ||| $word ||| $word ||| Word=1\n";
- if ($headpos == 0) { $rootcat = $pos; }
- if ($pos < $headpos) {
- push @{$ldeps[$headpos]}, $pos;
- } else {
- push @{$rdeps[$headpos]}, $pos;
- }
-}
-
-sub write_cfg {
- my $len = shift;
- for (my $i = 1; $i <= $len; $i++) {
- my @lds = @{$ldeps[$i]};
- for my $ld (@lds) {
- print "[$cats[$i]] ||| [$cats[$ld],1] [$cats[$i],2] ||| [1] [2] ||| $noposcats[$ld]_$noposcats[$i]=1\n";
- }
- my @rds = @{$rdeps[$i]};
- for my $rd (@rds) {
- print "[$cats[$i]] ||| [$cats[$i],1] [$cats[$rd],2] ||| [1] [2] ||| $noposcats[$i]_$noposcats[$rd]=1\n";
- }
- }
- print "[S] ||| [$cats[$rootcat],1] ||| [1] ||| TOP=1\n";
-}
-
diff --git a/dpmert/Makefile.am b/training/dpmert/Makefile.am
index 00768271..ff318bef 100644
--- a/dpmert/Makefile.am
+++ b/training/dpmert/Makefile.am
@@ -1,20 +1,12 @@
bin_PROGRAMS = \
mr_dpmert_map \
mr_dpmert_reduce \
- mr_dpmert_generate_mapper_input \
- sentserver \
- sentclient
+ mr_dpmert_generate_mapper_input
noinst_PROGRAMS = \
lo_test
TESTS = lo_test
-sentserver_SOURCES = sentserver.c
-sentserver_LDFLAGS = -pthread
-
-sentclient_SOURCES = sentclient.c
-sentclient_LDFLAGS = -pthread
-
mr_dpmert_generate_mapper_input_SOURCES = mr_dpmert_generate_mapper_input.cc line_optimizer.cc
mr_dpmert_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
diff --git a/dpmert/ces.cc b/training/dpmert/ces.cc
index 157b2d17..157b2d17 100644
--- a/dpmert/ces.cc
+++ b/training/dpmert/ces.cc
diff --git a/dpmert/ces.h b/training/dpmert/ces.h
index e4fa2080..e4fa2080 100644
--- a/dpmert/ces.h
+++ b/training/dpmert/ces.h
diff --git a/dpmert/divide_refs.py b/training/dpmert/divide_refs.py
index b478f918..b478f918 100755
--- a/dpmert/divide_refs.py
+++ b/training/dpmert/divide_refs.py
diff --git a/dpmert/dpmert.pl b/training/dpmert/dpmert.pl
index c4f98870..559420f5 100755
--- a/dpmert/dpmert.pl
+++ b/training/dpmert/dpmert.pl
@@ -2,7 +2,7 @@
use strict;
my @ORIG_ARGV=@ARGV;
use Cwd qw(getcwd);
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; }
# Skip local config (used for distributing jobs) if we're running in local-only mode
use LocalConfig;
@@ -17,21 +17,22 @@ my $srcFile; # deprecated
my $refFiles; # deprecated
my $default_jobs = env_default_jobs();
my $bin_dir = $SCRIPT_DIR;
+my $util_dir = "$SCRIPT_DIR/../utils";
die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
-my $FAST_SCORE="$bin_dir/../mteval/fast_score";
+my $FAST_SCORE="$bin_dir/../../mteval/fast_score";
die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
my $MAPINPUT = "$bin_dir/mr_dpmert_generate_mapper_input";
my $MAPPER = "$bin_dir/mr_dpmert_map";
my $REDUCER = "$bin_dir/mr_dpmert_reduce";
-my $parallelize = "$bin_dir/parallelize.pl";
-my $libcall = "$bin_dir/libcall.pl";
-my $sentserver = "$bin_dir/sentserver";
-my $sentclient = "$bin_dir/sentclient";
-my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm";
+my $parallelize = "$util_dir/parallelize.pl";
+my $libcall = "$util_dir/libcall.pl";
+my $sentserver = "$util_dir/sentserver";
+my $sentclient = "$util_dir/sentclient";
+my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm";
my $SCORER = $FAST_SCORE;
die "Can't find $MAPPER" unless -x $MAPPER;
-my $cdec = "$bin_dir/../decoder/cdec";
+my $cdec = "$bin_dir/../../decoder/cdec";
die "Can't find decoder in $cdec" unless -x $cdec;
die "Can't find $parallelize" unless -x $parallelize;
die "Can't find $libcall" unless -e $libcall;
diff --git a/dpmert/error_surface.cc b/training/dpmert/error_surface.cc
index 515b67f8..515b67f8 100644
--- a/dpmert/error_surface.cc
+++ b/training/dpmert/error_surface.cc
diff --git a/dpmert/error_surface.h b/training/dpmert/error_surface.h
index bb65847b..bb65847b 100644
--- a/dpmert/error_surface.h
+++ b/training/dpmert/error_surface.h
diff --git a/dpmert/line_mediator.pl b/training/dpmert/line_mediator.pl
index bc2bb24c..bc2bb24c 100755
--- a/dpmert/line_mediator.pl
+++ b/training/dpmert/line_mediator.pl
diff --git a/dpmert/line_optimizer.cc b/training/dpmert/line_optimizer.cc
index 9cf33502..9cf33502 100644
--- a/dpmert/line_optimizer.cc
+++ b/training/dpmert/line_optimizer.cc
diff --git a/dpmert/line_optimizer.h b/training/dpmert/line_optimizer.h
index 83819f41..83819f41 100644
--- a/dpmert/line_optimizer.h
+++ b/training/dpmert/line_optimizer.h
diff --git a/dpmert/lo_test.cc b/training/dpmert/lo_test.cc
index 95a08d3d..95a08d3d 100644
--- a/dpmert/lo_test.cc
+++ b/training/dpmert/lo_test.cc
diff --git a/dpmert/mert_geometry.cc b/training/dpmert/mert_geometry.cc
index d6973658..d6973658 100644
--- a/dpmert/mert_geometry.cc
+++ b/training/dpmert/mert_geometry.cc
diff --git a/dpmert/mert_geometry.h b/training/dpmert/mert_geometry.h
index a8b6959e..a8b6959e 100644
--- a/dpmert/mert_geometry.h
+++ b/training/dpmert/mert_geometry.h
diff --git a/dpmert/mr_dpmert_generate_mapper_input.cc b/training/dpmert/mr_dpmert_generate_mapper_input.cc
index 199cd23a..199cd23a 100644
--- a/dpmert/mr_dpmert_generate_mapper_input.cc
+++ b/training/dpmert/mr_dpmert_generate_mapper_input.cc
diff --git a/dpmert/mr_dpmert_map.cc b/training/dpmert/mr_dpmert_map.cc
index d1efcf96..d1efcf96 100644
--- a/dpmert/mr_dpmert_map.cc
+++ b/training/dpmert/mr_dpmert_map.cc
diff --git a/dpmert/mr_dpmert_reduce.cc b/training/dpmert/mr_dpmert_reduce.cc
index 31512a03..31512a03 100644
--- a/dpmert/mr_dpmert_reduce.cc
+++ b/training/dpmert/mr_dpmert_reduce.cc
diff --git a/dpmert/test_aer/README b/training/dpmert/test_aer/README
index 819b2e32..819b2e32 100644
--- a/dpmert/test_aer/README
+++ b/training/dpmert/test_aer/README
diff --git a/dpmert/test_aer/cdec.ini b/training/dpmert/test_aer/cdec.ini
index 08187848..08187848 100644
--- a/dpmert/test_aer/cdec.ini
+++ b/training/dpmert/test_aer/cdec.ini
diff --git a/dpmert/test_aer/corpus.src b/training/dpmert/test_aer/corpus.src
index 31b23971..31b23971 100644
--- a/dpmert/test_aer/corpus.src
+++ b/training/dpmert/test_aer/corpus.src
diff --git a/dpmert/test_aer/grammar b/training/dpmert/test_aer/grammar
index 9d857824..9d857824 100644
--- a/dpmert/test_aer/grammar
+++ b/training/dpmert/test_aer/grammar
diff --git a/dpmert/test_aer/ref.0 b/training/dpmert/test_aer/ref.0
index 734a9c5b..734a9c5b 100644
--- a/dpmert/test_aer/ref.0
+++ b/training/dpmert/test_aer/ref.0
diff --git a/dpmert/test_aer/weights b/training/dpmert/test_aer/weights
index afc9282e..afc9282e 100644
--- a/dpmert/test_aer/weights
+++ b/training/dpmert/test_aer/weights
diff --git a/dpmert/test_data/0.json.gz b/training/dpmert/test_data/0.json.gz
index 30f8dd77..30f8dd77 100644
--- a/dpmert/test_data/0.json.gz
+++ b/training/dpmert/test_data/0.json.gz
Binary files differ
diff --git a/dpmert/test_data/1.json.gz b/training/dpmert/test_data/1.json.gz
index c82cc179..c82cc179 100644
--- a/dpmert/test_data/1.json.gz
+++ b/training/dpmert/test_data/1.json.gz
Binary files differ
diff --git a/dpmert/test_data/c2e.txt.0 b/training/dpmert/test_data/c2e.txt.0
index 12c4abe9..12c4abe9 100644
--- a/dpmert/test_data/c2e.txt.0
+++ b/training/dpmert/test_data/c2e.txt.0
diff --git a/dpmert/test_data/c2e.txt.1 b/training/dpmert/test_data/c2e.txt.1
index 4ac12df1..4ac12df1 100644
--- a/dpmert/test_data/c2e.txt.1
+++ b/training/dpmert/test_data/c2e.txt.1
diff --git a/dpmert/test_data/c2e.txt.2 b/training/dpmert/test_data/c2e.txt.2
index 2f67b72f..2f67b72f 100644
--- a/dpmert/test_data/c2e.txt.2
+++ b/training/dpmert/test_data/c2e.txt.2
diff --git a/dpmert/test_data/c2e.txt.3 b/training/dpmert/test_data/c2e.txt.3
index 5483cef6..5483cef6 100644
--- a/dpmert/test_data/c2e.txt.3
+++ b/training/dpmert/test_data/c2e.txt.3
diff --git a/dpmert/test_data/re.txt.0 b/training/dpmert/test_data/re.txt.0
index 86eff087..86eff087 100644
--- a/dpmert/test_data/re.txt.0
+++ b/training/dpmert/test_data/re.txt.0
diff --git a/dpmert/test_data/re.txt.1 b/training/dpmert/test_data/re.txt.1
index 2140f198..2140f198 100644
--- a/dpmert/test_data/re.txt.1
+++ b/training/dpmert/test_data/re.txt.1
diff --git a/dpmert/test_data/re.txt.2 b/training/dpmert/test_data/re.txt.2
index 94e46286..94e46286 100644
--- a/dpmert/test_data/re.txt.2
+++ b/training/dpmert/test_data/re.txt.2
diff --git a/dpmert/test_data/re.txt.3 b/training/dpmert/test_data/re.txt.3
index f87c3308..f87c3308 100644
--- a/dpmert/test_data/re.txt.3
+++ b/training/dpmert/test_data/re.txt.3
diff --git a/dtrain/Makefile.am b/training/dtrain/Makefile.am
index ca9581f5..5b48e756 100644
--- a/dtrain/Makefile.am
+++ b/training/dtrain/Makefile.am
@@ -1,7 +1,7 @@
bin_PROGRAMS = dtrain
dtrain_SOURCES = dtrain.cc score.cc
-dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/dtrain/README.md b/training/dtrain/README.md
index 7edabbf1..7edabbf1 100644
--- a/dtrain/README.md
+++ b/training/dtrain/README.md
diff --git a/dtrain/dtrain.cc b/training/dtrain/dtrain.cc
index 18286668..18286668 100644
--- a/dtrain/dtrain.cc
+++ b/training/dtrain/dtrain.cc
diff --git a/dtrain/dtrain.h b/training/dtrain/dtrain.h
index 4b6f415c..4b6f415c 100644
--- a/dtrain/dtrain.h
+++ b/training/dtrain/dtrain.h
diff --git a/dtrain/hstreaming/avg.rb b/training/dtrain/hstreaming/avg.rb
index 2599c732..2599c732 100755
--- a/dtrain/hstreaming/avg.rb
+++ b/training/dtrain/hstreaming/avg.rb
diff --git a/dtrain/hstreaming/cdec.ini b/training/dtrain/hstreaming/cdec.ini
index d4f5cecd..d4f5cecd 100644
--- a/dtrain/hstreaming/cdec.ini
+++ b/training/dtrain/hstreaming/cdec.ini
diff --git a/dtrain/hstreaming/dtrain.ini b/training/dtrain/hstreaming/dtrain.ini
index a2c219a1..a2c219a1 100644
--- a/dtrain/hstreaming/dtrain.ini
+++ b/training/dtrain/hstreaming/dtrain.ini
diff --git a/dtrain/hstreaming/dtrain.sh b/training/dtrain/hstreaming/dtrain.sh
index 877ff94c..877ff94c 100755
--- a/dtrain/hstreaming/dtrain.sh
+++ b/training/dtrain/hstreaming/dtrain.sh
diff --git a/dtrain/hstreaming/hadoop-streaming-job.sh b/training/dtrain/hstreaming/hadoop-streaming-job.sh
index 92419956..92419956 100755
--- a/dtrain/hstreaming/hadoop-streaming-job.sh
+++ b/training/dtrain/hstreaming/hadoop-streaming-job.sh
diff --git a/dtrain/hstreaming/lplp.rb b/training/dtrain/hstreaming/lplp.rb
index f0cd58c5..f0cd58c5 100755
--- a/dtrain/hstreaming/lplp.rb
+++ b/training/dtrain/hstreaming/lplp.rb
diff --git a/dtrain/hstreaming/red-test b/training/dtrain/hstreaming/red-test
index 2623d697..2623d697 100644
--- a/dtrain/hstreaming/red-test
+++ b/training/dtrain/hstreaming/red-test
diff --git a/dtrain/kbestget.h b/training/dtrain/kbestget.h
index dd8882e1..dd8882e1 100644
--- a/dtrain/kbestget.h
+++ b/training/dtrain/kbestget.h
diff --git a/dtrain/ksampler.h b/training/dtrain/ksampler.h
index bc2f56cd..bc2f56cd 100644
--- a/dtrain/ksampler.h
+++ b/training/dtrain/ksampler.h
diff --git a/dtrain/pairsampling.h b/training/dtrain/pairsampling.h
index 84be1efb..84be1efb 100644
--- a/dtrain/pairsampling.h
+++ b/training/dtrain/pairsampling.h
diff --git a/dtrain/parallelize.rb b/training/dtrain/parallelize.rb
index 1d277ff6..1d277ff6 100755
--- a/dtrain/parallelize.rb
+++ b/training/dtrain/parallelize.rb
diff --git a/dtrain/parallelize/test/cdec.ini b/training/dtrain/parallelize/test/cdec.ini
index 72e99dc5..72e99dc5 100644
--- a/dtrain/parallelize/test/cdec.ini
+++ b/training/dtrain/parallelize/test/cdec.ini
diff --git a/dtrain/parallelize/test/dtrain.ini b/training/dtrain/parallelize/test/dtrain.ini
index 03f9d240..03f9d240 100644
--- a/dtrain/parallelize/test/dtrain.ini
+++ b/training/dtrain/parallelize/test/dtrain.ini
diff --git a/dtrain/parallelize/test/in b/training/dtrain/parallelize/test/in
index a312809f..a312809f 100644
--- a/dtrain/parallelize/test/in
+++ b/training/dtrain/parallelize/test/in
diff --git a/dtrain/parallelize/test/refs b/training/dtrain/parallelize/test/refs
index 4d3128cb..4d3128cb 100644
--- a/dtrain/parallelize/test/refs
+++ b/training/dtrain/parallelize/test/refs
diff --git a/dtrain/score.cc b/training/dtrain/score.cc
index 34fc86a9..34fc86a9 100644
--- a/dtrain/score.cc
+++ b/training/dtrain/score.cc
diff --git a/dtrain/score.h b/training/dtrain/score.h
index f317c903..f317c903 100644
--- a/dtrain/score.h
+++ b/training/dtrain/score.h
diff --git a/dtrain/test/example/README b/training/dtrain/test/example/README
index 6937b11b..6937b11b 100644
--- a/dtrain/test/example/README
+++ b/training/dtrain/test/example/README
diff --git a/dtrain/test/example/cdec.ini b/training/dtrain/test/example/cdec.ini
index d5955f0e..d5955f0e 100644
--- a/dtrain/test/example/cdec.ini
+++ b/training/dtrain/test/example/cdec.ini
diff --git a/dtrain/test/example/dtrain.ini b/training/dtrain/test/example/dtrain.ini
index 72d50ca1..72d50ca1 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/training/dtrain/test/example/dtrain.ini
diff --git a/dtrain/test/example/expected-output b/training/dtrain/test/example/expected-output
index 05326763..05326763 100644
--- a/dtrain/test/example/expected-output
+++ b/training/dtrain/test/example/expected-output
diff --git a/dtrain/test/parallelize/cdec.ini b/training/dtrain/test/parallelize/cdec.ini
index 72e99dc5..72e99dc5 100644
--- a/dtrain/test/parallelize/cdec.ini
+++ b/training/dtrain/test/parallelize/cdec.ini
diff --git a/dtrain/test/parallelize/dtrain.ini b/training/dtrain/test/parallelize/dtrain.ini
index 03f9d240..03f9d240 100644
--- a/dtrain/test/parallelize/dtrain.ini
+++ b/training/dtrain/test/parallelize/dtrain.ini
diff --git a/dtrain/test/parallelize/in b/training/dtrain/test/parallelize/in
index a312809f..a312809f 100644
--- a/dtrain/test/parallelize/in
+++ b/training/dtrain/test/parallelize/in
diff --git a/dtrain/test/parallelize/refs b/training/dtrain/test/parallelize/refs
index 4d3128cb..4d3128cb 100644
--- a/dtrain/test/parallelize/refs
+++ b/training/dtrain/test/parallelize/refs
diff --git a/dtrain/test/toy/cdec.ini b/training/dtrain/test/toy/cdec.ini
index 98b02d44..98b02d44 100644
--- a/dtrain/test/toy/cdec.ini
+++ b/training/dtrain/test/toy/cdec.ini
diff --git a/dtrain/test/toy/dtrain.ini b/training/dtrain/test/toy/dtrain.ini
index a091732f..a091732f 100644
--- a/dtrain/test/toy/dtrain.ini
+++ b/training/dtrain/test/toy/dtrain.ini
diff --git a/dtrain/test/toy/input b/training/dtrain/test/toy/input
index 4d10a9ea..4d10a9ea 100644
--- a/dtrain/test/toy/input
+++ b/training/dtrain/test/toy/input
diff --git a/training/feature_expectations.cc b/training/feature_expectations.cc
deleted file mode 100644
index f1a85495..00000000
--- a/training/feature_expectations.cc
+++ /dev/null
@@ -1,232 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <cassert>
-#include <cmath>
-#include <tr1/memory>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "verbose.h"
-#include "hg.h"
-#include "prob.h"
-#include "inside_outside.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "filelib.h"
-#include "online_optimizer.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-#include "sampler.h"
-
-#ifdef HAVE_MPI
-#include <boost/mpi/timer.hpp>
-#include <boost/mpi.hpp>
-namespace mpi = boost::mpi;
-#endif
-
-using namespace std;
-namespace po = boost::program_options;
-
-struct FComp {
- const vector<double>& w_;
- FComp(const vector<double>& w) : w_(w) {}
- bool operator()(int a, int b) const {
- return fabs(w_[a]) > fabs(w_[b]);
- }
-};
-
-void ShowFeatures(const vector<double>& w) {
- vector<int> fnums(w.size());
- for (int i = 0; i < w.size(); ++i)
- fnums[i] = i;
- sort(fnums.begin(), fnums.end(), FComp(w));
- for (vector<int>::iterator i = fnums.begin(); i != fnums.end(); ++i) {
- if (w[*i]) cout << FD::Convert(*i) << ' ' << w[*i] << endl;
- }
-}
-
-void ReadConfig(const string& ini, vector<string>* out) {
- ReadFile rf(ini);
- istream& in = *rf.stream();
- while(in) {
- string line;
- getline(in, line);
- if (!in) continue;
- out->push_back(line);
- }
-}
-
-void StoreConfig(const vector<string>& cfg, istringstream* o) {
- ostringstream os;
- for (int i = 0; i < cfg.size(); ++i) { os << cfg[i] << endl; }
- o->str(os.str());
-}
-
-bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("input,i",po::value<string>(),"Corpus of source language sentences")
- ("weights,w",po::value<string>(),"Input feature weights file")
- ("decoder_config,c",po::value<string>(), "cdec.ini file");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || !conf->count("input") || !conf->count("decoder_config")) {
- cerr << dcmdline_options << endl;
- return false;
- }
- return true;
-}
-
-void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* order) {
- ReadFile rf(fname);
- istream& in = *rf.stream();
- string line;
- int id = 0;
- while(in) {
- getline(in, line);
- if (!in) break;
- if (id % size == rank) {
- c->push_back(line);
- order->push_back(id);
- }
- ++id;
- }
-}
-
-static const double kMINUS_EPSILON = -1e-6;
-
-struct TrainingObserver : public DecoderObserver {
- void Reset() {
- acc_exp.clear();
- total_complete = 0;
- }
-
- virtual void NotifyDecodingStart(const SentenceMetadata& smeta) {
- cur_model_exp.clear();
- state = 1;
- }
-
- // compute model expectations, denominator of objective
- virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
- assert(state == 1);
- state = 2;
- const prob_t z = InsideOutside<prob_t,
- EdgeProb,
- SparseVector<prob_t>,
- EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp);
- cur_model_exp /= z;
- acc_exp += cur_model_exp;
- }
-
- virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
- cerr << "IGNORING ALIGNMENT FOREST!\n";
- }
-
- virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) {
- if (state == 2) {
- ++total_complete;
- }
- }
-
- void GetExpectations(SparseVector<double>* g) const {
- g->clear();
- for (SparseVector<prob_t>::const_iterator it = acc_exp.begin(); it != acc_exp.end(); ++it)
- g->set_value(it->first, it->second);
- }
-
- int total_complete;
- SparseVector<prob_t> cur_model_exp;
- SparseVector<prob_t> acc_exp;
- int state;
-};
-
-#ifdef HAVE_MPI
-namespace boost { namespace mpi {
- template<>
- struct is_commutative<std::plus<SparseVector<double> >, SparseVector<double> >
- : mpl::true_ { };
-} } // end namespace boost::mpi
-#endif
-
-int main(int argc, char** argv) {
-#ifdef HAVE_MPI
- mpi::environment env(argc, argv);
- mpi::communicator world;
- const int size = world.size();
- const int rank = world.rank();
-#else
- const int size = 1;
- const int rank = 0;
-#endif
- if (size > 1) SetSilent(true); // turn off verbose decoder output
- register_feature_functions();
-
- po::variables_map conf;
- if (!InitCommandLine(argc, argv, &conf))
- return 1;
-
- // load initial weights
- Weights weights;
- if (conf.count("weights"))
- weights.InitFromFile(conf["weights"].as<string>());
-
- vector<string> corpus;
- vector<int> ids;
- ReadTrainingCorpus(conf["input"].as<string>(), rank, size, &corpus, &ids);
- assert(corpus.size() > 0);
-
- vector<string> cdec_ini;
- ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini);
- istringstream ini;
- StoreConfig(cdec_ini, &ini);
- Decoder decoder(&ini);
- if (decoder.GetConf()["input"].as<string>() != "-") {
- cerr << "cdec.ini must not set an input file\n";
- return 1;
- }
-
- SparseVector<double> x;
- weights.InitSparseVector(&x);
- TrainingObserver observer;
-
- weights.InitFromVector(x);
- vector<double> lambdas;
- weights.InitVector(&lambdas);
- decoder.SetWeights(lambdas);
- observer.Reset();
- for (unsigned i = 0; i < corpus.size(); ++i) {
- int id = ids[i];
- decoder.SetId(id);
- decoder.Decode(corpus[i], &observer);
- }
- SparseVector<double> local_exps, exps;
- observer.GetExpectations(&local_exps);
-#ifdef HAVE_MPI
- reduce(world, local_exps, exps, std::plus<SparseVector<double> >(), 0);
-#else
- exps.swap(local_exps);
-#endif
-
- weights.InitFromVector(exps);
- weights.InitVector(&lambdas);
- ShowFeatures(lambdas);
-
- return 0;
-}
diff --git a/training/lbl_model.cc b/training/lbl_model.cc
deleted file mode 100644
index a46ce33c..00000000
--- a/training/lbl_model.cc
+++ /dev/null
@@ -1,421 +0,0 @@
-#include <iostream>
-
-#include "config.h"
-#ifndef HAVE_EIGEN
- int main() { std::cerr << "Please rebuild with --with-eigen PATH\n"; return 1; }
-#else
-
-#include <cstdlib>
-#include <algorithm>
-#include <cmath>
-#include <set>
-#include <cstring> // memset
-#include <ctime>
-
-#ifdef HAVE_MPI
-#include <boost/mpi/timer.hpp>
-#include <boost/mpi.hpp>
-#include <boost/archive/text_oarchive.hpp>
-namespace mpi = boost::mpi;
-#endif
-#include <boost/math/special_functions/fpclassify.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-#include <Eigen/Dense>
-
-#include "corpus_tools.h"
-#include "optimize.h"
-#include "array2d.h"
-#include "m.h"
-#include "lattice.h"
-#include "stringlib.h"
-#include "filelib.h"
-#include "tdict.h"
-
-namespace po = boost::program_options;
-using namespace std;
-
-#define kDIMENSIONS 10
-typedef Eigen::Matrix<double, kDIMENSIONS, 1> RVector;
-typedef Eigen::Matrix<double, 1, kDIMENSIONS> RTVector;
-typedef Eigen::Matrix<double, kDIMENSIONS, kDIMENSIONS> TMatrix;
-vector<RVector> r_src, r_trg;
-
-#if HAVE_MPI
-namespace boost {
-namespace serialization {
-
-template<class Archive>
-void serialize(Archive & ar, RVector & v, const unsigned int version) {
- for (unsigned i = 0; i < kDIMENSIONS; ++i)
- ar & v[i];
-}
-
-} // namespace serialization
-} // namespace boost
-#endif
-
-bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("input,i",po::value<string>(),"Input file")
- ("iterations,I",po::value<unsigned>()->default_value(1000),"Number of iterations of training")
- ("regularization_strength,C",po::value<double>()->default_value(0.1),"L2 regularization strength (0 for no regularization)")
- ("eta", po::value<double>()->default_value(0.1f), "Eta for SGD")
- ("source_embeddings,f", po::value<string>(), "File containing source embeddings (if unset, random vectors will be used)")
- ("target_embeddings,e", po::value<string>(), "File containing target embeddings (if unset, random vectors will be used)")
- ("random_seed,s", po::value<unsigned>(), "Random seed")
- ("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (0 = uniform, >0 sharpens)")
- ("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (argc < 2 || conf->count("help")) {
- cerr << "Usage " << argv[0] << " [OPTIONS] -i corpus.fr-en\n";
- cerr << dcmdline_options << endl;
- return false;
- }
- return true;
-}
-
-void Normalize(RVector* v) {
- double norm = v->norm();
- assert(norm > 0.0f);
- *v /= norm;
-}
-
-void Flatten(const TMatrix& m, vector<double>* v) {
- unsigned c = 0;
- v->resize(kDIMENSIONS * kDIMENSIONS);
- for (unsigned i = 0; i < kDIMENSIONS; ++i)
- for (unsigned j = 0; j < kDIMENSIONS; ++j) {
- assert(boost::math::isfinite(m(i, j)));
- (*v)[c++] = m(i,j);
- }
-}
-
-void Unflatten(const vector<double>& v, TMatrix* m) {
- unsigned c = 0;
- for (unsigned i = 0; i < kDIMENSIONS; ++i)
- for (unsigned j = 0; j < kDIMENSIONS; ++j) {
- assert(boost::math::isfinite(v[c]));
- (*m)(i, j) = v[c++];
- }
-}
-
-double ApplyRegularization(const double C,
- const vector<double>& weights,
- vector<double>* g) {
- assert(weights.size() == g->size());
- double reg = 0;
- for (size_t i = 0; i < weights.size(); ++i) {
- const double& w_i = weights[i];
- double& g_i = (*g)[i];
- reg += C * w_i * w_i;
- g_i += 2 * C * w_i;
- }
- return reg;
-}
-
-void LoadEmbeddings(const string& filename, vector<RVector>* pv) {
- vector<RVector>& v = *pv;
- cerr << "Reading embeddings from " << filename << " ...\n";
- ReadFile rf(filename);
- istream& in = *rf.stream();
- string line;
- unsigned lc = 0;
- while(getline(in, line)) {
- ++lc;
- size_t cur = line.find(' ');
- if (cur == string::npos || cur == 0) {
- cerr << "Parse error reading line " << lc << ":\n" << line << endl;
- abort();
- }
- WordID w = TD::Convert(line.substr(0, cur));
- if (w >= v.size()) continue;
- RVector& curv = v[w];
- line[cur] = 0;
- size_t start = cur + 1;
- cur = start + 1;
- size_t c = 0;
- while(cur < line.size()) {
- if (line[cur] == ' ') {
- line[cur] = 0;
- curv[c++] = strtod(&line[start], NULL);
- start = cur + 1;
- cur = start;
- if (c == kDIMENSIONS) break;
- }
- ++cur;
- }
- if (c < kDIMENSIONS && cur != start) {
- if (cur < line.size()) line[cur] = 0;
- curv[c++] = strtod(&line[start], NULL);
- }
- if (c != kDIMENSIONS) {
- static bool first = true;
- if (first) {
- cerr << " read " << c << " dimensions from embedding file, but built with " << kDIMENSIONS << " (filling in with random values)\n";
- first = false;
- }
- for (; c < kDIMENSIONS; ++c) curv[c] = rand();
- }
- if (c == kDIMENSIONS && cur != line.size()) {
- static bool first = true;
- if (first) {
- cerr << " embedding file contains more dimensions than configured with, truncating.\n";
- first = false;
- }
- }
- }
-}
-
-int main(int argc, char** argv) {
-#ifdef HAVE_MPI
- std::cerr << "**MPI enabled.\n";
- mpi::environment env(argc, argv);
- mpi::communicator world;
- const int size = world.size();
- const int rank = world.rank();
-#else
- std::cerr << "**MPI disabled.\n";
- const int rank = 0;
- const int size = 1;
-#endif
- po::variables_map conf;
- if (!InitCommandLine(argc, argv, &conf)) return 1;
- const string fname = conf["input"].as<string>();
- const double reg_strength = conf["regularization_strength"].as<double>();
- const bool has_l2 = (reg_strength > 0.0);
- assert(reg_strength >= 0.0);
- const int ITERATIONS = conf["iterations"].as<unsigned>();
- const double eta = conf["eta"].as<double>();
- const double diagonal_tension = conf["diagonal_tension"].as<double>();
- bool SGD = false;
- if (diagonal_tension < 0.0) {
- cerr << "Invalid value for diagonal_tension: must be >= 0\n";
- return 1;
- }
- string testset;
- if (conf.count("testset")) testset = conf["testset"].as<string>();
-
- unsigned lc = 0;
- vector<double> unnormed_a_i;
- bool flag = false;
- vector<vector<WordID> > srcs, trgs;
- vector<WordID> vocab_e;
- {
- set<WordID> svocab_e, svocab_f;
- CorpusTools::ReadFromFile(fname, &srcs, NULL, &trgs, &svocab_e, rank, size);
- copy(svocab_e.begin(), svocab_e.end(), back_inserter(vocab_e));
- }
- cerr << "Number of target word types: " << vocab_e.size() << endl;
- const double num_examples = srcs.size(); // lc is not incremented until the training loop below
-
- boost::shared_ptr<LBFGSOptimizer> lbfgs;
- if (rank == 0)
- lbfgs.reset(new LBFGSOptimizer(kDIMENSIONS * kDIMENSIONS, 100));
- r_trg.resize(TD::NumWords() + 1);
- r_src.resize(TD::NumWords() + 1);
- vector<set<unsigned> > trg_pos(TD::NumWords() + 1);
-
- if (conf.count("random_seed")) {
- srand(conf["random_seed"].as<unsigned>());
- } else {
- unsigned seed = time(NULL) + rank * 100;
- cerr << "Random seed: " << seed << endl;
- srand(seed);
- }
-
- TMatrix t = TMatrix::Zero();
- if (rank == 0) {
- t = TMatrix::Random() / 50.0;
- for (unsigned i = 1; i < r_trg.size(); ++i) {
- r_trg[i] = RVector::Random();
- r_src[i] = RVector::Random();
- }
- if (conf.count("source_embeddings"))
- LoadEmbeddings(conf["source_embeddings"].as<string>(), &r_src);
- if (conf.count("target_embeddings"))
- LoadEmbeddings(conf["target_embeddings"].as<string>(), &r_trg);
- }
-
- // do optimization
- TMatrix g = TMatrix::Zero();
- vector<TMatrix> exp_src;
- vector<double> z_src;
- vector<double> flat_g, flat_t, rcv_grad;
- Flatten(t, &flat_t);
- bool converged = false;
-#ifdef HAVE_MPI
- mpi::broadcast(world, &flat_t[0], flat_t.size(), 0);
- mpi::broadcast(world, r_trg, 0);
- mpi::broadcast(world, r_src, 0);
-#endif
- cerr << "rank=" << rank << ": " << r_trg[0][4] << endl;
- for (int iter = 0; !converged && iter < ITERATIONS; ++iter) {
- if (rank == 0) cerr << "ITERATION " << (iter + 1) << endl;
- Unflatten(flat_t, &t);
- double likelihood = 0;
- double denom = 0.0;
- lc = 0;
- flag = false;
- g *= 0;
- for (unsigned i = 0; i < srcs.size(); ++i) {
- const vector<WordID>& src = srcs[i];
- const vector<WordID>& trg = trgs[i];
- ++lc;
- if (rank == 0 && lc % 1000 == 0) { cerr << '.'; flag = true; }
- if (rank == 0 && lc % 50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; }
- denom += trg.size();
-
- exp_src.clear(); exp_src.resize(src.size(), TMatrix::Zero());
- z_src.clear(); z_src.resize(src.size(), 0.0);
- Array2D<TMatrix> exp_refs(src.size(), trg.size(), TMatrix::Zero());
- Array2D<double> z_refs(src.size(), trg.size(), 0.0);
- for (unsigned j = 0; j < trg.size(); ++j)
- trg_pos[trg[j]].insert(j);
-
- for (unsigned i = 0; i < src.size(); ++i) {
- const RVector& r_s = r_src[src[i]];
- const RTVector pred = r_s.transpose() * t;
- TMatrix& exp_m = exp_src[i];
- double& z = z_src[i];
- for (unsigned k = 0; k < vocab_e.size(); ++k) {
- const WordID v_k = vocab_e[k];
- const RVector& r_t = r_trg[v_k];
- const double dot_prod = pred * r_t;
- const double u = exp(dot_prod);
- z += u;
- const TMatrix v = r_s * r_t.transpose() * u;
- exp_m += v;
- set<unsigned>& ref_locs = trg_pos[v_k];
- if (!ref_locs.empty()) {
- for (set<unsigned>::iterator it = ref_locs.begin(); it != ref_locs.end(); ++it) {
- TMatrix& exp_ref_ij = exp_refs(i, *it);
- double& z_ref_ij = z_refs(i, *it);
- z_ref_ij += u;
- exp_ref_ij += v;
- }
- }
- }
- }
- for (unsigned j = 0; j < trg.size(); ++j)
- trg_pos[trg[j]].clear();
-
- // model expectations for a single target generation with
- // uniform alignment prior
- // TODO: when using a non-uniform alignment, m_exp will be
- // a function of j (below)
- double m_z = 0;
- TMatrix m_exp = TMatrix::Zero();
- for (unsigned i = 0; i < src.size(); ++i) {
- m_exp += exp_src[i];
- m_z += z_src[i];
- }
- m_exp /= m_z;
-
- Array2D<bool> al(src.size(), trg.size(), false);
- for (unsigned j = 0; j < trg.size(); ++j) {
- double ref_z = 0;
- TMatrix ref_exp = TMatrix::Zero();
- int max_i = 0;
- double max_s = -9999999;
- for (unsigned i = 0; i < src.size(); ++i) {
- ref_exp += exp_refs(i, j);
- ref_z += z_refs(i, j);
- if (log(z_refs(i, j)) > max_s) {
- max_s = log(z_refs(i, j));
- max_i = i;
- }
- // TODO handle alignment prob
- }
- if (ref_z <= 0) {
- cerr << "TRG=" << TD::Convert(trg[j]) << endl;
- cerr << " LINE=" << lc << " (RANK=" << rank << "/" << size << ")" << endl;
- cerr << " REF_EXP=\n" << ref_exp << endl;
- cerr << " M_EXP=\n" << m_exp << endl;
- abort();
- }
- al(max_i, j) = true;
- ref_exp /= ref_z;
- g += m_exp - ref_exp;
- likelihood += log(ref_z) - log(m_z);
- if (SGD) {
- t -= g * eta / num_examples;
- g *= 0;
- }
- }
-
- if (rank == 0 && (iter == (ITERATIONS - 1) || lc < 12)) { cerr << al << endl; }
- }
- if (flag && rank == 0) { cerr << endl; }
-
- double obj = 0;
- if (!SGD) {
- Flatten(g, &flat_g);
- obj = -likelihood;
-#ifdef HAVE_MPI
- rcv_grad.resize(flat_g.size(), 0.0);
- mpi::reduce(world, &flat_g[0], flat_g.size(), &rcv_grad[0], plus<double>(), 0);
- swap(flat_g, rcv_grad);
- rcv_grad.clear();
-
- double to = 0;
- mpi::reduce(world, obj, to, plus<double>(), 0);
- obj = to;
- double tlh = 0;
- mpi::reduce(world, likelihood, tlh, plus<double>(), 0);
- likelihood = tlh;
- double td = 0;
- mpi::reduce(world, denom, td, plus<double>(), 0);
- denom = td;
-#endif
- }
-
- if (rank == 0) {
- double gn = 0;
- for (unsigned i = 0; i < flat_g.size(); ++i)
- gn += flat_g[i]*flat_g[i];
- const double base2_likelihood = likelihood / log(2);
- cerr << " log_e likelihood: " << likelihood << endl;
- cerr << " log_2 likelihood: " << base2_likelihood << endl;
- cerr << " cross entropy: " << (-base2_likelihood / denom) << endl;
- cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl;
- cerr << " gradient norm: " << sqrt(gn) << endl;
- if (!SGD) {
- if (has_l2) {
- const double r = ApplyRegularization(reg_strength,
- flat_t,
- &flat_g);
- obj += r;
- cerr << " regularization: " << r << endl;
- }
- lbfgs->Optimize(obj, flat_g, &flat_t);
- converged = (lbfgs->HasConverged());
- }
- }
-#ifdef HAVE_MPI
- mpi::broadcast(world, &flat_t[0], flat_t.size(), 0);
- mpi::broadcast(world, converged, 0);
-#endif
- }
- if (rank == 0)
- cerr << "TRANSLATION MATRIX:" << endl << t << endl;
- return 0;
-}
-
-#endif
-
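For orientation, the matrix g accumulated in the training loop of lbl_model.cc is the gradient of the negative log-likelihood of the log-bilinear score under the uniform alignment prior; the following is read directly off the loop above rather than quoted from a paper. With u_{ik} = exp(r_{f_i}^T T r_{e_k}) for source position i and target vocabulary item e_k:

$$ \log p(e_j \mid \mathbf{f}) \;=\; \log \sum_{i} u_{i e_j} \;-\; \log \sum_{i,k} u_{ik} $$

$$ \frac{\partial}{\partial T}\Bigl(-\log p(e_j \mid \mathbf{f})\Bigr) \;=\; \frac{\sum_{i,k} u_{ik}\, r_{f_i} r_{e_k}^{\top}}{\sum_{i,k} u_{ik}} \;-\; \frac{\sum_{i} u_{i e_j}\, r_{f_i} r_{e_j}^{\top}}{\sum_{i} u_{i e_j}} $$

The first term is m_exp after the division by m_z and the second is ref_exp after the division by ref_z, so per target position the code's g += m_exp - ref_exp and likelihood += log(ref_z) - log(m_z) follow.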
diff --git a/training/minrisk/Makefile.am b/training/minrisk/Makefile.am
new file mode 100644
index 00000000..a15e821e
--- /dev/null
+++ b/training/minrisk/Makefile.am
@@ -0,0 +1,6 @@
+bin_PROGRAMS = minrisk_optimize
+
+minrisk_optimize_SOURCES = minrisk_optimize.cc
+minrisk_optimize_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/training/liblbfgs/liblbfgs.a -lz
+
+AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training -I$(top_srcdir)/training/utils
diff --git a/minrisk/minrisk.pl b/training/minrisk/minrisk.pl
index d05b9595..0f8bacd0 100755
--- a/minrisk/minrisk.pl
+++ b/training/minrisk/minrisk.pl
@@ -2,7 +2,7 @@
use strict;
my @ORIG_ARGV=@ARGV;
use Cwd qw(getcwd);
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; }
# Skip local config (used for distributing jobs) if we're running in local-only mode
use LocalConfig;
@@ -12,27 +12,27 @@ use POSIX ":sys_wait_h";
my $QSUB_CMD = qsub_args(mert_memory());
my $default_jobs = env_default_jobs();
-my $VEST_DIR="$SCRIPT_DIR/../dpmert";
-require "$VEST_DIR/libcall.pl";
+my $UTILS_DIR="$SCRIPT_DIR/../utils";
+require "$UTILS_DIR/libcall.pl";
# Default settings
my $srcFile;
my $refFiles;
my $bin_dir = $SCRIPT_DIR;
die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
-my $FAST_SCORE="$bin_dir/../mteval/fast_score";
+my $FAST_SCORE="$bin_dir/../../mteval/fast_score";
die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
my $MAPINPUT = "$bin_dir/minrisk_generate_input.pl";
my $MAPPER = "$bin_dir/minrisk_optimize";
-my $parallelize = "$VEST_DIR/parallelize.pl";
-my $libcall = "$VEST_DIR/libcall.pl";
-my $sentserver = "$VEST_DIR/sentserver";
-my $sentclient = "$VEST_DIR/sentclient";
-my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm";
+my $parallelize = "$UTILS_DIR/parallelize.pl";
+my $libcall = "$UTILS_DIR/libcall.pl";
+my $sentserver = "$UTILS_DIR/sentserver";
+my $sentclient = "$UTILS_DIR/sentclient";
+my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm";
my $SCORER = $FAST_SCORE;
die "Can't find $MAPPER" unless -x $MAPPER;
-my $cdec = "$bin_dir/../decoder/cdec";
+my $cdec = "$bin_dir/../../decoder/cdec";
die "Can't find decoder in $cdec" unless -x $cdec;
die "Can't find $parallelize" unless -x $parallelize;
die "Can't find $libcall" unless -e $libcall;
diff --git a/minrisk/minrisk_generate_input.pl b/training/minrisk/minrisk_generate_input.pl
index b30fc4fd..b30fc4fd 100755
--- a/minrisk/minrisk_generate_input.pl
+++ b/training/minrisk/minrisk_generate_input.pl
diff --git a/minrisk/minrisk_optimize.cc b/training/minrisk/minrisk_optimize.cc
index da8b5260..da8b5260 100644
--- a/minrisk/minrisk_optimize.cc
+++ b/training/minrisk/minrisk_optimize.cc
diff --git a/mira/Makefile.am b/training/mira/Makefile.am
index 3f8f17cd..ae609ede 100644
--- a/mira/Makefile.am
+++ b/training/mira/Makefile.am
@@ -1,6 +1,6 @@
bin_PROGRAMS = kbest_mira
kbest_mira_SOURCES = kbest_mira.cc
-kbest_mira_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
+kbest_mira_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz
AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/mira/kbest_mira.cc b/training/mira/kbest_mira.cc
index 8b7993dd..8b7993dd 100644
--- a/mira/kbest_mira.cc
+++ b/training/mira/kbest_mira.cc
diff --git a/training/mpi_em_optimize.cc b/training/mpi_em_optimize.cc
deleted file mode 100644
index 48683b15..00000000
--- a/training/mpi_em_optimize.cc
+++ /dev/null
@@ -1,389 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <vector>
-#include <cassert>
-#include <cmath>
-
-#ifdef HAVE_MPI
-#include <mpi.h>
-#endif
-
-#include <boost/shared_ptr.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "verbose.h"
-#include "hg.h"
-#include "prob.h"
-#include "inside_outside.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "filelib.h"
-#include "optimize.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-
-using namespace std;
-using boost::shared_ptr;
-namespace po = boost::program_options;
-
-void SanityCheck(const vector<double>& w) {
- for (int i = 0; i < w.size(); ++i) {
- assert(!isnan(w[i]));
- assert(!isinf(w[i]));
- }
-}
-
-struct FComp {
- const vector<double>& w_;
- FComp(const vector<double>& w) : w_(w) {}
- bool operator()(int a, int b) const {
- return fabs(w_[a]) > fabs(w_[b]);
- }
-};
-
-void ShowLargestFeatures(const vector<double>& w) {
- vector<int> fnums(w.size());
- for (int i = 0; i < w.size(); ++i)
- fnums[i] = i;
- vector<int>::iterator mid = fnums.begin();
- mid += (w.size() > 10 ? 10 : w.size());
- partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
- cerr << "TOP FEATURES:";
- for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {
- cerr << ' ' << FD::Convert(*i) << '=' << w[*i];
- }
- cerr << endl;
-}
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("input_weights,w",po::value<string>(),"Input feature weights file")
- ("training_data,t",po::value<string>(),"Training data")
- ("decoder_config,c",po::value<string>(),"Decoder configuration file")
- ("output_weights,o",po::value<string>()->default_value("-"),"Output feature weights file");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || !(conf->count("training_data")) || !conf->count("decoder_config")) {
- cerr << dcmdline_options << endl;
-#ifdef HAVE_MPI
- MPI::Finalize();
-#endif
- exit(1);
- }
-}
-
-void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c) {
- ReadFile rf(fname);
- istream& in = *rf.stream();
- string line;
- int lc = 0;
- while(in) {
- getline(in, line);
- if (!in) break;
- if (lc % size == rank) c->push_back(line);
- ++lc;
- }
-}
-
-static const double kMINUS_EPSILON = -1e-6;
-
-struct TrainingObserver : public DecoderObserver {
- void Reset() {
- total_complete = 0;
- cur_obj = 0;
- tot_obj = 0;
- tot.clear();
- }
-
- void SetLocalGradientAndObjective(SparseVector<double>* g, double* o) const {
- *o = tot_obj;
- *g = tot;
- }
-
- virtual void NotifyDecodingStart(const SentenceMetadata& smeta) {
- cur_obj = 0;
- state = 1;
- }
-
- void ExtractExpectedCounts(Hypergraph* hg) {
- vector<prob_t> posts;
- cur.clear();
- const prob_t z = hg->ComputeEdgePosteriors(1.0, &posts);
- cur_obj = log(z);
- for (int i = 0; i < posts.size(); ++i) {
- const SparseVector<double>& efeats = hg->edges_[i].feature_values_;
- const double post = static_cast<double>(posts[i] / z);
- for (SparseVector<double>::const_iterator j = efeats.begin(); j != efeats.end(); ++j)
- cur.add_value(j->first, post);
- }
- }
-
- // compute model expectations, denominator of objective
- virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) {
- assert(state == 1);
- state = 2;
- ExtractExpectedCounts(hg);
- }
-
- // the alignment forest's counts replace the translation forest's: in EM training the true derivation is unobserved
- virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) {
- assert(state == 2);
- state = 3;
- ExtractExpectedCounts(hg);
- }
-
- virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) {
- ++total_complete;
- tot_obj += cur_obj;
- tot += cur;
- }
-
- int total_complete;
- double cur_obj;
- double tot_obj;
- SparseVector<double> cur, tot;
- int state;
-};
-
-void ReadConfig(const string& ini, vector<string>* out) {
- ReadFile rf(ini);
- istream& in = *rf.stream();
- while(in) {
- string line;
- getline(in, line);
- if (!in) continue;
- out->push_back(line);
- }
-}
-
-void StoreConfig(const vector<string>& cfg, istringstream* o) {
- ostringstream os;
- for (int i = 0; i < cfg.size(); ++i) { os << cfg[i] << endl; }
- o->str(os.str());
-}
-
-struct OptimizableMultinomialFamily {
- struct CPD {
- CPD() : z() {}
- double z;
- map<WordID, double> c2counts;
- };
- map<WordID, CPD> counts;
- double Value(WordID conditioning, WordID generated) const {
- map<WordID, CPD>::const_iterator it = counts.find(conditioning);
- assert(it != counts.end());
- map<WordID,double>::const_iterator r = it->second.c2counts.find(generated);
- if (r == it->second.c2counts.end()) return 0;
- return r->second;
- }
- void Increment(WordID conditioning, WordID generated, double count) {
- CPD& cc = counts[conditioning];
- cc.z += count;
- cc.c2counts[generated] += count;
- }
- void Optimize() {
- for (map<WordID, CPD>::iterator i = counts.begin(); i != counts.end(); ++i) {
- CPD& cpd = i->second;
- for (map<WordID, double>::iterator j = cpd.c2counts.begin(); j != cpd.c2counts.end(); ++j) {
- j->second /= cpd.z;
- // cerr << "P(" << TD::Convert(j->first) << " | " << TD::Convert(i->first) << " ) = " << j->second << endl;
- }
- }
- }
- void Clear() {
- counts.clear();
- }
-};
-
-struct CountManager {
- CountManager(size_t num_types) : oms_(num_types) {}
- virtual ~CountManager();
- virtual void AddCounts(const SparseVector<double>& c) = 0;
- void Optimize(SparseVector<double>* weights) {
- for (int i = 0; i < oms_.size(); ++i) {
- oms_[i].Optimize();
- }
- GetOptimalValues(weights);
- for (int i = 0; i < oms_.size(); ++i) {
- oms_[i].Clear();
- }
- }
- virtual void GetOptimalValues(SparseVector<double>* wv) const = 0;
- vector<OptimizableMultinomialFamily> oms_;
-};
-CountManager::~CountManager() {}
-
-struct TaggerCountManager : public CountManager {
- // 0 = transitions, 1 = emissions
- TaggerCountManager() : CountManager(2) {}
- void AddCounts(const SparseVector<double>& c);
- void GetOptimalValues(SparseVector<double>* wv) const {
- for (set<int>::const_iterator it = fids_.begin(); it != fids_.end(); ++it) {
- int ftype;
- WordID cond, gen;
- bool is_optimized = TaggerCountManager::GetFeature(*it, &ftype, &cond, &gen);
- assert(is_optimized);
- wv->set_value(*it, log(oms_[ftype].Value(cond, gen)));
- }
- }
- // Id:0:a=1 Bi:a_b=1 Bi:b_c=1 Bi:c_d=1 Uni:a=1 Uni:b=1 Uni:c=1 Uni:d=1 Id:1:b=1 Bi:BOS_a=1 Id:2:c=1
- static bool GetFeature(const int fid, int* feature_type, WordID* cond, WordID* gen) {
- const string& feat = FD::Convert(fid);
- if (feat.size() > 5 && feat[0] == 'I' && feat[1] == 'd' && feat[2] == ':') {
- // emission
- const size_t p = feat.rfind(':');
- assert(p != string::npos);
- *cond = TD::Convert(feat.substr(p+1));
- *gen = TD::Convert(feat.substr(3, p - 3));
- *feature_type = 1;
- return true;
- } else if (feat.size() > 5 && feat[0] == 'B' && feat[1] == 'i' && feat[2] == ':') {
- // transition
- const size_t p = feat.rfind('_');
- assert(p != string::npos);
- *gen = TD::Convert(feat.substr(p+1));
- *cond = TD::Convert(feat.substr(3, p - 3));
- *feature_type = 0;
- return true;
- } else if (feat.size() > 4 && feat[0] == 'U' && feat[1] == 'n' && feat[2] == 'i' && feat[3] == ':') {
- // ignore
- return false;
- } else {
- cerr << "Don't know how to deal with feature of type: " << feat << endl;
- abort();
- }
- }
- set<int> fids_;
-};
-
-void TaggerCountManager::AddCounts(const SparseVector<double>& c) {
- for (SparseVector<double>::const_iterator it = c.begin(); it != c.end(); ++it) {
- const double& val = it->second;
- int ftype;
- WordID cond, gen;
- if (GetFeature(it->first, &ftype, &cond, &gen)) {
- oms_[ftype].Increment(cond, gen, val);
- fids_.insert(it->first);
- }
- }
-}
-
-int main(int argc, char** argv) {
-#ifdef HAVE_MPI
- MPI::Init(argc, argv);
- const int size = MPI::COMM_WORLD.Get_size();
- const int rank = MPI::COMM_WORLD.Get_rank();
-#else
- const int size = 1;
- const int rank = 0;
-#endif
- SetSilent(true); // turn off verbose decoder output
- register_feature_functions();
-
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
-
- TaggerCountManager tcm;
-
- // load cdec.ini and set up decoder
- vector<string> cdec_ini;
- ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini);
- istringstream ini;
- StoreConfig(cdec_ini, &ini);
- if (rank == 0) cerr << "Loading grammar...\n";
- Decoder* decoder = new Decoder(&ini);
- if (decoder->GetConf()["input"].as<string>() != "-") {
- cerr << "cdec.ini must not set an input file\n";
-#ifdef HAVE_MPI
- MPI::COMM_WORLD.Abort(1);
-#endif
- }
- if (rank == 0) cerr << "Done loading grammar!\n";
- Weights w;
- if (conf.count("input_weights"))
- w.InitFromFile(conf["input_weights"].as<string>());
-
- double objective = 0;
- bool converged = false;
-
- vector<double> lambdas;
- w.InitVector(&lambdas);
- vector<string> corpus;
- ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus);
- assert(corpus.size() > 0);
-
- int iteration = 0;
- TrainingObserver observer;
- while (!converged) {
- ++iteration;
- observer.Reset();
- if (rank == 0) {
- cerr << "Starting decoding... (~" << corpus.size() << " sentences / proc)\n";
- }
- decoder->SetWeights(lambdas);
- for (int i = 0; i < corpus.size(); ++i)
- decoder->Decode(corpus[i], &observer);
-
- SparseVector<double> x;
- observer.SetLocalGradientAndObjective(&x, &objective);
- cerr << "COUNTS = " << x << endl;
- cerr << " OBJ = " << objective << endl;
- tcm.AddCounts(x);
-
-#if 0
-#ifdef HAVE_MPI
- MPI::COMM_WORLD.Reduce(const_cast<double*>(&gradient.data()[0]), &rcv_grad[0], num_feats, MPI::DOUBLE, MPI::SUM, 0);
- MPI::COMM_WORLD.Reduce(&objective, &to, 1, MPI::DOUBLE, MPI::SUM, 0);
- swap(gradient, rcv_grad);
- objective = to;
-#endif
-#endif
-
- if (rank == 0) {
- SparseVector<double> wsv;
- tcm.Optimize(&wsv);
-
- w.InitFromVector(wsv);
- w.InitVector(&lambdas);
-
- ShowLargestFeatures(lambdas);
-
- converged = iteration > 100;
- if (converged) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; }
-
- string fname = "weights.cur.gz";
- if (converged) { fname = "weights.final.gz"; }
- ostringstream vv;
- vv << "Objective = " << objective << " (ITERATION=" << iteration << ")";
- const string svv = vv.str();
- w.WriteToFile(fname, true, &svv);
- } // rank == 0
- int cint = converged;
-#ifdef HAVE_MPI
- MPI::COMM_WORLD.Bcast(&lambdas[0], lambdas.size(), MPI::DOUBLE, 0);
- MPI::COMM_WORLD.Bcast(&cint, 1, MPI::INT, 0);
- MPI::COMM_WORLD.Barrier();
-#endif
- converged = cint;
- }
-#ifdef HAVE_MPI
- MPI::Finalize();
-#endif
- return 0;
-}
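The EM loop above never takes a gradient step: OptimizableMultinomialFamily simply renormalizes expected counts per conditioning event. A self-contained sketch of that M-step, with std::string keys standing in for WordIDs (illustrative only, not the cdec API):

    // sketch: multinomial M-step = divide each expected count by its row total
    #include <iostream>
    #include <map>
    #include <string>

    int main() {
      // counts[conditioning][generated] = expected count from the E-step
      std::map<std::string, std::map<std::string, double> > counts;
      counts["le"]["the"] += 2.0;
      counts["le"]["a"]   += 1.0;
      for (std::map<std::string, std::map<std::string, double> >::iterator i =
               counts.begin(); i != counts.end(); ++i) {
        double z = 0;
        for (std::map<std::string, double>::iterator j = i->second.begin();
             j != i->second.end(); ++j) z += j->second;
        for (std::map<std::string, double>::iterator j = i->second.begin();
             j != i->second.end(); ++j) {
          j->second /= z;  // now p(generated | conditioning)
          std::cout << "p(" << j->first << " | " << i->first << ") = "
                    << j->second << std::endl;
        }
      }
      return 0;
    }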
diff --git a/training/mr_em_adapted_reduce.cc b/training/mr_em_adapted_reduce.cc
deleted file mode 100644
index f65b5440..00000000
--- a/training/mr_em_adapted_reduce.cc
+++ /dev/null
@@ -1,173 +0,0 @@
-#include <iostream>
-#include <vector>
-#include <cassert>
-#include <cmath>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "filelib.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-#include "m.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("optimization_method,m", po::value<string>()->default_value("em"), "Optimization method (em, vb)")
- ("input_format,f",po::value<string>()->default_value("b64"),"Encoding of the input (b64 or text)");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help")) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-double NoZero(const double& x) {
- if (x) return x;
- return 1e-35;
-}
-
-void Maximize(const bool use_vb,
- const double& alpha,
- const int total_event_types,
- SparseVector<double>* pc) {
- const SparseVector<double>& counts = *pc;
-
- if (use_vb)
- assert(total_event_types >= counts.size());
-
- double tot = 0;
- for (SparseVector<double>::const_iterator it = counts.begin();
- it != counts.end(); ++it)
- tot += it->second;
-// cerr << " = " << tot << endl;
- assert(tot > 0.0);
- double ltot = log(tot);
- if (use_vb)
- ltot = Md::digamma(tot + total_event_types * alpha);
- for (SparseVector<double>::const_iterator it = counts.begin();
- it != counts.end(); ++it) {
- if (use_vb) {
- pc->set_value(it->first, NoZero(Md::digamma(it->second + alpha) - ltot));
- } else {
- pc->set_value(it->first, NoZero(log(it->second) - ltot));
- }
- }
-#if 0
- if (counts.size() < 50) {
- for (SparseVector<double>::const_iterator it = counts.begin();
- it != counts.end(); ++it) {
- cerr << " p(" << FD::Convert(it->first) << ")=" << exp(it->second);
- }
- cerr << endl;
- }
-#endif
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
-
- const bool use_b64 = conf["input_format"].as<string>() == "b64";
- const bool use_vb = conf["optimization_method"].as<string>() == "vb";
- const double alpha = 1e-09;
- if (use_vb)
- cerr << "Using variational Bayes, make sure alphas are set\n";
-
- const string s_obj = "**OBJ**";
- // E-step
- string cur_key = "";
- SparseVector<double> acc;
- double logprob = 0;
- while(cin) {
- string line;
- getline(cin, line);
- if (line.empty()) continue;
- int feat;
- double val;
- size_t i = line.find("\t");
- assert(i != string::npos);
- const string key = line.substr(0, i);
- ++i;
- if (key != cur_key) {
- if (cur_key.size() > 0) {
- // TODO shouldn't be num_active, should be total number
- // of events
- Maximize(use_vb, alpha, acc.size(), &acc);
- cout << cur_key << '\t';
- if (use_b64)
- B64::Encode(0.0, acc, &cout);
- else
- cout << acc;
- cout << endl;
- acc.clear();
- }
- cur_key = key;
- }
- if (use_b64) {
- SparseVector<double> g;
- double obj;
- if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) {
- cerr << "B64 decoder returned error, skipping!\n";
- continue;
- }
- logprob += obj;
- acc += g;
- } else { // text encoding - your counts will not be accurate!
- while (i < line.size()) {
- size_t start = i;
- while (i < line.size() && line[i] != '=') ++i;
- if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; }
- string fname = line.substr(start, i - start);
- if (fname == s_obj) {
- feat = -1;
- } else {
- feat = FD::Convert(line.substr(start, i - start));
- }
- ++i;
- start = i;
- while (i < line.size() && line[i] != ';') ++i;
- if (i - start == 0) continue;
- val = atof(line.substr(start, i - start).c_str());
- ++i;
- if (feat == -1) {
- logprob += val;
- } else {
- acc.add_value(feat, val);
- }
- }
- }
- }
- // TODO shouldn't be num_active, should be total number
- // of events
- Maximize(use_vb, alpha, acc.size(), &acc);
- cout << cur_key << '\t';
- if (use_b64)
- B64::Encode(0.0, acc, &cout);
- else
- cout << acc;
- cout << endl << flush;
-
- cerr << "LOGPROB: " << logprob << endl;
-
- return 0;
-}
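The two branches of Maximize above are the standard EM M-step and its mean-field variational-Bayes counterpart under a symmetric Dirichlet(alpha) prior over the K possible events of each multinomial (Md::digamma is the digamma function psi). In log space:

$$ \text{EM:}\qquad \log\theta_k \;=\; \log c_k \;-\; \log \sum_{k'} c_{k'} $$

$$ \text{VB:}\qquad \log\hat\theta_k \;=\; \psi(c_k + \alpha) \;-\; \psi\Bigl(\sum_{k'} c_{k'} + K\alpha\Bigr) $$

Here the c_k are accumulated expected counts and NoZero merely keeps an exact zero from propagating; per the TODOs in the code, K is approximated by the number of active events rather than the true event count.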
diff --git a/training/mr_em_map_adapter.cc b/training/mr_em_map_adapter.cc
deleted file mode 100644
index ead4598d..00000000
--- a/training/mr_em_map_adapter.cc
+++ /dev/null
@@ -1,160 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <cassert>
-#include <cmath>
-
-#include <boost/utility.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-#include "boost/tuple/tuple.hpp"
-
-#include "fdict.h"
-#include "sparse_vector.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-// useful for EM models parameterized by a bunch of multinomials
-// this converts event counts (returned from cdec as feature expectations)
-// into different keys and values (which are lists of all the events,
-// conditioned on the key) for summing and normalization by a reducer
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("buffer_size,b", po::value<int>()->default_value(1), "Buffer size (in # of counts) before emitting counts")
- ("format,f",po::value<string>()->default_value("b64"), "Encoding of the input (b64 or text)");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help")) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-struct EventMapper {
- int Map(int fid) {
- int& cv = map_[fid];
- if (!cv) {
- cv = GetConditioningVariable(fid);
- }
- return cv;
- }
- void Clear() { map_.clear(); }
- protected:
- virtual int GetConditioningVariable(int fid) const = 0;
- private:
- map<int, int> map_;
-};
-
-struct LexAlignEventMapper : public EventMapper {
- protected:
- virtual int GetConditioningVariable(int fid) const {
- const string& str = FD::Convert(fid);
- size_t pos = str.rfind("_");
- if (pos == string::npos || pos == 0 || pos >= str.size() - 1) {
- cerr << "Bad feature for EM adapter: " << str << endl;
- abort();
- }
- return FD::Convert(str.substr(0, pos));
- }
-};
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
-
- const bool use_b64 = conf["format"].as<string>() == "b64";
- const int buffer_size = conf["buffer_size"].as<int>();
-
- const string s_obj = "**OBJ**";
- // 0<TAB>**OBJ**=12.2;Feat1=2.3;Feat2=-0.2;
- // 0<TAB>**OBJ**=1.1;Feat1=1.0;
-
- EventMapper* event_mapper = new LexAlignEventMapper;
- map<int, SparseVector<double> > counts;
- size_t total = 0;
- while(cin) {
- string line;
- getline(cin, line);
- if (line.empty()) continue;
- int feat;
- double val;
- size_t i = line.find("\t");
- assert(i != string::npos);
- ++i;
- SparseVector<double> g;
- double obj = 0;
- if (use_b64) {
- if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) {
- cerr << "B64 decoder returned error, skipping!\n";
- continue;
- }
- } else { // text encoding - your counts will not be accurate!
- while (i < line.size()) {
- size_t start = i;
- while (i < line.size() && line[i] != '=') ++i;
- if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; }
- string fname = line.substr(start, i - start);
- if (fname == s_obj) {
- feat = -1;
- } else {
- feat = FD::Convert(line.substr(start, i - start));
- }
- ++i;
- start = i;
- while (i < line.size() && line[i] != ';') ++i;
- if (i - start == 0) continue;
- val = atof(line.substr(start, i - start).c_str());
- ++i;
- if (feat == -1) {
- obj = val;
- } else {
- g.set_value(feat, val);
- }
- }
- }
- //cerr << "OBJ: " << obj << endl;
- const SparseVector<double>& cg = g;
- for (SparseVector<double>::const_iterator it = cg.begin(); it != cg.end(); ++it) {
- const int cond_var = event_mapper->Map(it->first);
- SparseVector<double>& cond_counts = counts[cond_var];
- int delta = cond_counts.size();
- cond_counts.add_value(it->first, it->second);
- delta = cond_counts.size() - delta;
- total += delta;
- }
- if (total > buffer_size) {
- for (map<int, SparseVector<double> >::iterator it = counts.begin();
- it != counts.end(); ++it) {
- const SparseVector<double>& cc = it->second;
- cout << FD::Convert(it->first) << '\t';
- if (use_b64) {
- B64::Encode(0.0, cc, &cout);
- } else {
- abort();
- }
- cout << endl;
- }
- cout << flush;
- total = 0;
- counts.clear();
- }
- }
-
- return 0;
-}
-
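LexAlignEventMapper keys every event feature on the substring before its final underscore, so all counts sharing a conditioning word reach the same reducer. A stand-alone sketch of that mapping, with plain strings instead of FD ids (names illustrative):

    // sketch: map an event feature like "chien_dog" to its key "chien"
    #include <cstdlib>
    #include <iostream>
    #include <string>

    std::string ConditioningKey(const std::string& feat) {
      size_t pos = feat.rfind('_');
      if (pos == std::string::npos || pos == 0 || pos >= feat.size() - 1) {
        std::cerr << "Bad feature for EM adapter: " << feat << std::endl;
        std::abort();
      }
      return feat.substr(0, pos);
    }

    int main() {
      std::cout << ConditioningKey("chien_dog") << std::endl;  // chien
      std::cout << ConditioningKey("chien_cat") << std::endl;  // chien
      return 0;
    }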
diff --git a/training/mr_optimize_reduce.cc b/training/mr_optimize_reduce.cc
deleted file mode 100644
index d490192f..00000000
--- a/training/mr_optimize_reduce.cc
+++ /dev/null
@@ -1,231 +0,0 @@
-#include <sstream>
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <cassert>
-#include <cmath>
-
-#include <boost/shared_ptr.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "optimize.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-void SanityCheck(const vector<double>& w) {
- for (int i = 0; i < w.size(); ++i) {
- assert(!std::isnan(w[i]));
- assert(!std::isinf(w[i]));
- }
-}
-
-struct FComp {
- const vector<double>& w_;
- FComp(const vector<double>& w) : w_(w) {}
- bool operator()(int a, int b) const {
- return fabs(w_[a]) > fabs(w_[b]);
- }
-};
-
-void ShowLargestFeatures(const vector<double>& w) {
- vector<int> fnums(w.size());
- for (int i = 0; i < w.size(); ++i)
- fnums[i] = i;
- vector<int>::iterator mid = fnums.begin();
- mid += (w.size() > 10 ? 10 : w.size());
- partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
- cerr << "TOP FEATURES:";
- for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {
- cerr << ' ' << FD::Convert(*i) << '=' << w[*i];
- }
- cerr << endl;
-}
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("input_weights,i",po::value<string>(),"Input feature weights file")
- ("output_weights,o",po::value<string>()->default_value("-"),"Output feature weights file")
- ("optimization_method,m", po::value<string>()->default_value("lbfgs"), "Optimization method (sgd, lbfgs, rprop)")
- ("state,s",po::value<string>(),"Read (and write if output_state is not set) optimizer state from this state file. In the first iteration, the file should not exist.")
- ("input_format,f",po::value<string>()->default_value("b64"),"Encoding of the input (b64 or text)")
- ("output_state,S", po::value<string>(), "Output state file (optional override)")
- ("correction_buffers,M", po::value<int>()->default_value(10), "Number of gradients for LBFGS to maintain in memory")
- ("eta,e", po::value<double>()->default_value(0.1), "Learning rate for SGD (eta)")
- ("gaussian_prior,p","Use a Gaussian prior on the weights")
- ("means,u", po::value<string>(), "File containing the means for Gaussian prior")
- ("sigma_squared", po::value<double>()->default_value(1.0), "Sigma squared term for spherical Gaussian prior");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help") || !conf->count("input_weights") || !conf->count("state")) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
-
- const bool use_b64 = conf["input_format"].as<string>() == "b64";
-
- vector<weight_t> lambdas;
- Weights::InitFromFile(conf["input_weights"].as<string>(), &lambdas);
- const string s_obj = "**OBJ**";
- int num_feats = FD::NumFeats();
- cerr << "Number of features: " << num_feats << endl;
- const bool gaussian_prior = conf.count("gaussian_prior");
- vector<weight_t> means(num_feats, 0);
- if (conf.count("means")) {
- if (!gaussian_prior) {
- cerr << "Don't use --means without --gaussian_prior!\n";
- exit(1);
- }
- Weights::InitFromFile(conf["means"].as<string>(), &means);
- }
- boost::shared_ptr<BatchOptimizer> o;
- const string omethod = conf["optimization_method"].as<string>();
- if (omethod == "rprop")
- o.reset(new RPropOptimizer(num_feats)); // TODO add configuration
- else
- o.reset(new LBFGSOptimizer(num_feats, conf["correction_buffers"].as<int>()));
- cerr << "Optimizer: " << o->Name() << endl;
- string state_file = conf["state"].as<string>();
- {
- ifstream in(state_file.c_str(), ios::binary);
- if (in)
- o->Load(&in);
- else
- cerr << "No state file found, assuming ITERATION 1\n";
- }
-
- double objective = 0;
- vector<double> gradient(num_feats, 0);
- // 0<TAB>**OBJ**=12.2;Feat1=2.3;Feat2=-0.2;
- // 0<TAB>**OBJ**=1.1;Feat1=1.0;
- int total_lines = 0; // TODO - this should be a count of the
- // training instances!!
- while(cin) {
- string line;
- getline(cin, line);
- if (line.empty()) continue;
- ++total_lines;
- int feat;
- double val;
- size_t i = line.find("\t");
- assert(i != string::npos);
- ++i;
- if (use_b64) {
- SparseVector<double> g;
- double obj;
- if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) {
- cerr << "B64 decoder returned error, skipping gradient!\n";
- cerr << " START: " << line.substr(0,line.size() > 200 ? 200 : line.size()) << endl;
- if (line.size() > 200)
- cerr << " END: " << line.substr(line.size() - 200, 200) << endl;
- cout << "-1\tRESTART\n";
- exit(99);
- }
- objective += obj;
- const SparseVector<double>& cg = g;
- for (SparseVector<double>::const_iterator it = cg.begin(); it != cg.end(); ++it) {
- if (it->first >= num_feats) {
- cerr << "Unexpected feature in gradient: " << FD::Convert(it->first) << endl;
- abort();
- }
- gradient[it->first] -= it->second;
- }
- } else { // text encoding - your gradients will not be accurate!
- while (i < line.size()) {
- size_t start = i;
- while (i < line.size() && line[i] != '=') ++i;
- if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; }
- string fname = line.substr(start, i - start);
- if (fname == s_obj) {
- feat = -1;
- } else {
- feat = FD::Convert(line.substr(start, i - start));
- if (feat >= num_feats) {
- cerr << "Unexpected feature in gradient: " << line.substr(start, i - start) << endl;
- abort();
- }
- }
- ++i;
- start = i;
- while (i < line.size() && line[i] != ';') ++i;
- if (i - start == 0) continue;
- val = atof(line.substr(start, i - start).c_str());
- ++i;
- if (feat == -1) {
- objective += val;
- } else {
- gradient[feat] -= val;
- }
- }
- }
- }
-
- if (gaussian_prior) {
- const double sigsq = conf["sigma_squared"].as<double>();
- double norm = 0;
- for (int k = 1; k < lambdas.size(); ++k) {
- const double& lambda_k = lambdas[k];
- if (lambda_k) {
- const double param = (lambda_k - means[k]);
- norm += param * param;
- gradient[k] += param / sigsq;
- }
- }
- const double reg = norm / (2.0 * sigsq);
- cerr << "REGULARIZATION TERM: " << reg << endl;
- objective += reg;
- }
- cerr << "EVALUATION #" << o->EvaluationCount() << " OBJECTIVE: " << objective << endl;
- double gnorm = 0;
- for (int i = 0; i < gradient.size(); ++i)
- gnorm += gradient[i] * gradient[i];
- cerr << " GNORM=" << sqrt(gnorm) << endl;
- vector<double> old = lambdas;
- int c = 0;
- while (old == lambdas) {
- ++c;
- if (c > 1) { cerr << "Same lambdas, repeating optimization\n"; }
- o->Optimize(objective, gradient, &lambdas);
- assert(c < 5);
- }
- old.clear();
- SanityCheck(lambdas);
- ShowLargestFeatures(lambdas);
- Weights::WriteToFile(conf["output_weights"].as<string>(), lambdas, false);
-
- const bool conv = o->HasConverged();
- if (conv) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; }
-
- if (conf.count("output_state"))
- state_file = conf["output_state"].as<string>();
- ofstream out(state_file.c_str(), ios::binary);
- cerr << "Writing state to: " << state_file << endl;
- o->Save(&out);
- out.close();
-
- cout << o->EvaluationCount() << "\t" << conv << endl;
- return 0;
-}
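The gaussian_prior branch above is the usual spherical Gaussian (L2) penalty, with means mu read from --means (zero otherwise) and variance sigma^2 from --sigma_squared; the terms it accumulates into objective and gradient are:

$$ R(\lambda) \;=\; \frac{1}{2\sigma^2} \sum_k (\lambda_k - \mu_k)^2, \qquad \frac{\partial R}{\partial \lambda_k} \;=\; \frac{\lambda_k - \mu_k}{\sigma^2} $$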
diff --git a/training/mr_reduce_to_weights.cc b/training/mr_reduce_to_weights.cc
deleted file mode 100644
index 16b47720..00000000
--- a/training/mr_reduce_to_weights.cc
+++ /dev/null
@@ -1,109 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <cassert>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "filelib.h"
-#include "fdict.h"
-#include "weights.h"
-#include "sparse_vector.h"
-
-using namespace std;
-namespace po = boost::program_options;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("input_format,f",po::value<string>()->default_value("b64"),"Encoding of the input (b64 or text)")
- ("input,i",po::value<string>()->default_value("-"),"Read file from")
- ("output,o",po::value<string>()->default_value("-"),"Write weights to");
- po::options_description clo("Command line options");
- clo.add_options()
- ("config", po::value<string>(), "Configuration file")
- ("help,h", "Print this help message and exit");
- po::options_description dconfig_options, dcmdline_options;
- dconfig_options.add(opts);
- dcmdline_options.add(opts).add(clo);
-
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- if (conf->count("config")) {
- ifstream config((*conf)["config"].as<string>().c_str());
- po::store(po::parse_config_file(config, dconfig_options), *conf);
- }
- po::notify(*conf);
-
- if (conf->count("help")) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-void WriteWeights(const SparseVector<double>& weights, ostream* out) {
- for (SparseVector<double>::const_iterator it = weights.begin();
- it != weights.end(); ++it) {
- (*out) << FD::Convert(it->first) << " " << it->second << endl;
- }
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
-
- const bool use_b64 = conf["input_format"].as<string>() == "b64";
-
- const string s_obj = "**OBJ**";
- // E-step
- ReadFile rf(conf["input"].as<string>());
- istream* in = rf.stream();
- assert(*in);
- WriteFile wf(conf["output"].as<string>());
- ostream* out = wf.stream();
- out->precision(17);
- while(*in) {
- string line;
- getline(*in, line);
- if (line.empty()) continue;
- int feat;
- double val;
- size_t i = line.find("\t");
- assert(i != string::npos);
- ++i;
- if (use_b64) {
- SparseVector<double> g;
- double obj;
- if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) {
- cerr << "B64 decoder returned error, skipping!\n";
- continue;
- }
- WriteWeights(g, out);
- } else { // text encoding - your counts will not be accurate!
- SparseVector<double> weights;
- while (i < line.size()) {
- size_t start = i;
- while (i < line.size() && line[i] != '=') ++i;
- if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; }
- string fname = line.substr(start, i - start);
- if (fname == s_obj) {
- feat = -1;
- } else {
- feat = FD::Convert(line.substr(start, i - start));
- }
- ++i;
- start = i;
- while (i < line.size() && line[i] != ';') ++i;
- if (i - start == 0) continue;
- val = atof(line.substr(start, i - start).c_str());
- ++i;
- if (feat != -1) {
- weights.set_value(feat, val);
- }
- }
- WriteWeights(weights, out);
- }
- }
-
- return 0;
-}
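mr_reduce_to_weights.cc and the map/reduce tools deleted above all read one wire format: key<TAB>name=val;name=val;..., with **OBJ** reserved for the objective value. A minimal stand-alone parser for the text encoding, for illustration only (the b64 path uses cdec's B64 helpers instead):

    // sketch: split one mapper output line into key and feature/value pairs
    #include <cstdlib>
    #include <iostream>
    #include <sstream>
    #include <string>

    int main() {
      const std::string line = "0\t**OBJ**=12.2;Feat1=2.3;Feat2=-0.2;";
      const size_t tab = line.find('\t');
      std::cout << "key: " << line.substr(0, tab) << std::endl;
      std::istringstream fields(line.substr(tab + 1));
      std::string field;
      while (std::getline(fields, field, ';')) {
        const size_t eq = field.find('=');
        if (eq == std::string::npos) continue;
        std::cout << field.substr(0, eq) << " -> "
                  << std::atof(field.substr(eq + 1).c_str()) << std::endl;
      }
      return 0;
    }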
diff --git a/pro/Makefile.am b/training/pro/Makefile.am
index 1e9d46b0..1916b6b2 100644
--- a/pro/Makefile.am
+++ b/training/pro/Makefile.am
@@ -3,9 +3,9 @@ bin_PROGRAMS = \
mr_pro_reduce
mr_pro_map_SOURCES = mr_pro_map.cc
-mr_pro_map_LDADD = $(top_srcdir)/training/libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+mr_pro_map_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
mr_pro_reduce_SOURCES = mr_pro_reduce.cc
mr_pro_reduce_LDADD = $(top_srcdir)/training/liblbfgs/liblbfgs.a $(top_srcdir)/utils/libutils.a -lz
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training/utils -I$(top_srcdir)/training
diff --git a/pro/mr_pro_generate_mapper_input.pl b/training/pro/mr_pro_generate_mapper_input.pl
index b30fc4fd..b30fc4fd 100755
--- a/pro/mr_pro_generate_mapper_input.pl
+++ b/training/pro/mr_pro_generate_mapper_input.pl
diff --git a/pro/mr_pro_map.cc b/training/pro/mr_pro_map.cc
index eef40b8a..eef40b8a 100644
--- a/pro/mr_pro_map.cc
+++ b/training/pro/mr_pro_map.cc
diff --git a/pro/mr_pro_reduce.cc b/training/pro/mr_pro_reduce.cc
index 5ef9b470..5ef9b470 100644
--- a/pro/mr_pro_reduce.cc
+++ b/training/pro/mr_pro_reduce.cc
diff --git a/pro/pro.pl b/training/pro/pro.pl
index 891b7e4c..3b30c379 100755
--- a/pro/pro.pl
+++ b/training/pro/pro.pl
@@ -3,7 +3,7 @@ use strict;
use File::Basename qw(basename);
my @ORIG_ARGV=@ARGV;
use Cwd qw(getcwd);
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; }
# Skip local config (used for distributing jobs) if we're running in local-only mode
use LocalConfig;
@@ -13,28 +13,28 @@ use POSIX ":sys_wait_h";
my $QSUB_CMD = qsub_args(mert_memory());
my $default_jobs = env_default_jobs();
-my $VEST_DIR="$SCRIPT_DIR/../dpmert";
-require "$VEST_DIR/libcall.pl";
+my $UTILS_DIR="$SCRIPT_DIR/../utils";
+require "$UTILS_DIR/libcall.pl";
# Default settings
my $srcFile;
my $refFiles;
my $bin_dir = $SCRIPT_DIR;
die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
-my $FAST_SCORE="$bin_dir/../mteval/fast_score";
+my $FAST_SCORE="$bin_dir/../../mteval/fast_score";
die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
my $MAPINPUT = "$bin_dir/mr_pro_generate_mapper_input.pl";
my $MAPPER = "$bin_dir/mr_pro_map";
my $REDUCER = "$bin_dir/mr_pro_reduce";
-my $parallelize = "$VEST_DIR/parallelize.pl";
-my $libcall = "$VEST_DIR/libcall.pl";
-my $sentserver = "$VEST_DIR/sentserver";
-my $sentclient = "$VEST_DIR/sentclient";
-my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm";
+my $parallelize = "$UTILS_DIR/parallelize.pl";
+my $libcall = "$UTILS_DIR/libcall.pl";
+my $sentserver = "$UTILS_DIR/sentserver";
+my $sentclient = "$UTILS_DIR/sentclient";
+my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm";
my $SCORER = $FAST_SCORE;
die "Can't find $MAPPER" unless -x $MAPPER;
-my $cdec = "$bin_dir/../decoder/cdec";
+my $cdec = "$bin_dir/../../decoder/cdec";
die "Can't find decoder in $cdec" unless -x $cdec;
die "Can't find $parallelize" unless -x $parallelize;
die "Can't find $libcall" unless -e $libcall;
diff --git a/rampion/Makefile.am b/training/rampion/Makefile.am
index f4dbb7cc..1633d0f7 100644
--- a/rampion/Makefile.am
+++ b/training/rampion/Makefile.am
@@ -1,6 +1,6 @@
bin_PROGRAMS = rampion_cccp
rampion_cccp_SOURCES = rampion_cccp.cc
-rampion_cccp_LDADD = $(top_srcdir)/training/libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
+rampion_cccp_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
-AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training
+AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training/utils
diff --git a/rampion/rampion.pl b/training/rampion/rampion.pl
index 55f7b3f1..ae084db6 100755
--- a/rampion/rampion.pl
+++ b/training/rampion/rampion.pl
@@ -2,7 +2,7 @@
use strict;
my @ORIG_ARGV=@ARGV;
use Cwd qw(getcwd);
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; }
# Skip local config (used for distributing jobs) if we're running in local-only mode
use LocalConfig;
@@ -12,27 +12,27 @@ use POSIX ":sys_wait_h";
my $QSUB_CMD = qsub_args(mert_memory());
my $default_jobs = env_default_jobs();
-my $VEST_DIR="$SCRIPT_DIR/../dpmert";
-require "$VEST_DIR/libcall.pl";
+my $UTILS_DIR="$SCRIPT_DIR/../utils";
+require "$UTILS_DIR/libcall.pl";
# Default settings
my $srcFile;
my $refFiles;
my $bin_dir = $SCRIPT_DIR;
die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
-my $FAST_SCORE="$bin_dir/../mteval/fast_score";
+my $FAST_SCORE="$bin_dir/../../mteval/fast_score";
die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
my $MAPINPUT = "$bin_dir/rampion_generate_input.pl";
my $MAPPER = "$bin_dir/rampion_cccp";
-my $parallelize = "$VEST_DIR/parallelize.pl";
-my $libcall = "$VEST_DIR/libcall.pl";
-my $sentserver = "$VEST_DIR/sentserver";
-my $sentclient = "$VEST_DIR/sentclient";
-my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm";
+my $parallelize = "$UTILS_DIR/parallelize.pl";
+my $libcall = "$UTILS_DIR/libcall.pl";
+my $sentserver = "$UTILS_DIR/sentserver";
+my $sentclient = "$UTILS_DIR/sentclient";
+my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm";
my $SCORER = $FAST_SCORE;
die "Can't find $MAPPER" unless -x $MAPPER;
-my $cdec = "$bin_dir/../decoder/cdec";
+my $cdec = "$bin_dir/../../decoder/cdec";
die "Can't find decoder in $cdec" unless -x $cdec;
die "Can't find $parallelize" unless -x $parallelize;
die "Can't find $libcall" unless -e $libcall;
diff --git a/rampion/rampion_cccp.cc b/training/rampion/rampion_cccp.cc
index 1e36dc51..1e36dc51 100644
--- a/rampion/rampion_cccp.cc
+++ b/training/rampion/rampion_cccp.cc
diff --git a/rampion/rampion_generate_input.pl b/training/rampion/rampion_generate_input.pl
index b30fc4fd..b30fc4fd 100755
--- a/rampion/rampion_generate_input.pl
+++ b/training/rampion/rampion_generate_input.pl
diff --git a/training/candidate_set.cc b/training/utils/candidate_set.cc
index 087efec3..087efec3 100644
--- a/training/candidate_set.cc
+++ b/training/utils/candidate_set.cc
diff --git a/training/candidate_set.h b/training/utils/candidate_set.h
index 9d326ed0..9d326ed0 100644
--- a/training/candidate_set.h
+++ b/training/utils/candidate_set.h
diff --git a/dpmert/decode-and-evaluate.pl b/training/utils/decode-and-evaluate.pl
index fe765d00..1a332c08 100755
--- a/dpmert/decode-and-evaluate.pl
+++ b/training/utils/decode-and-evaluate.pl
@@ -2,7 +2,7 @@
use strict;
my @ORIG_ARGV=@ARGV;
use Cwd qw(getcwd);
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; }
# Skip local config (used for distributing jobs) if we're running in local-only mode
use LocalConfig;
@@ -16,16 +16,16 @@ require "libcall.pl";
my $default_jobs = env_default_jobs();
my $bin_dir = $SCRIPT_DIR;
die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
-my $FAST_SCORE="$bin_dir/../mteval/fast_score";
+my $FAST_SCORE="$bin_dir/../../mteval/fast_score";
die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;
my $parallelize = "$bin_dir/parallelize.pl";
my $libcall = "$bin_dir/libcall.pl";
my $sentserver = "$bin_dir/sentserver";
my $sentclient = "$bin_dir/sentclient";
-my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm";
+my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm";
my $SCORER = $FAST_SCORE;
-my $cdec = "$bin_dir/../decoder/cdec";
+my $cdec = "$bin_dir/../../decoder/cdec";
die "Can't find decoder in $cdec" unless -x $cdec;
die "Can't find $parallelize" unless -x $parallelize;
die "Can't find $libcall" unless -e $libcall;
diff --git a/training/entropy.cc b/training/utils/entropy.cc
index 4fdbe2be..4fdbe2be 100644
--- a/training/entropy.cc
+++ b/training/utils/entropy.cc
diff --git a/training/entropy.h b/training/utils/entropy.h
index 796589ca..796589ca 100644
--- a/training/entropy.h
+++ b/training/utils/entropy.h
diff --git a/training/grammar_convert.cc b/training/utils/grammar_convert.cc
index 607a7cb9..607a7cb9 100644
--- a/training/grammar_convert.cc
+++ b/training/utils/grammar_convert.cc
diff --git a/training/lbfgs.h b/training/utils/lbfgs.h
index e8baecab..e8baecab 100644
--- a/training/lbfgs.h
+++ b/training/utils/lbfgs.h
diff --git a/training/lbfgs_test.cc b/training/utils/lbfgs_test.cc
index 9678e788..9678e788 100644
--- a/training/lbfgs_test.cc
+++ b/training/utils/lbfgs_test.cc
diff --git a/dpmert/libcall.pl b/training/utils/libcall.pl
index c7d0f128..c7d0f128 100644
--- a/dpmert/libcall.pl
+++ b/training/utils/libcall.pl
diff --git a/training/online_optimizer.cc b/training/utils/online_optimizer.cc
index 3ed95452..3ed95452 100644
--- a/training/online_optimizer.cc
+++ b/training/utils/online_optimizer.cc
diff --git a/training/online_optimizer.h b/training/utils/online_optimizer.h
index 28d89344..28d89344 100644
--- a/training/online_optimizer.h
+++ b/training/utils/online_optimizer.h
diff --git a/training/optimize.cc b/training/utils/optimize.cc
index 41ac90d8..41ac90d8 100644
--- a/training/optimize.cc
+++ b/training/utils/optimize.cc
diff --git a/training/optimize.h b/training/utils/optimize.h
index 07943b44..07943b44 100644
--- a/training/optimize.h
+++ b/training/utils/optimize.h
diff --git a/training/optimize_test.cc b/training/utils/optimize_test.cc
index bff2ca03..bff2ca03 100644
--- a/training/optimize_test.cc
+++ b/training/utils/optimize_test.cc
diff --git a/dpmert/parallelize.pl b/training/utils/parallelize.pl
index d2ebaeea..4197e0e5 100755
--- a/dpmert/parallelize.pl
+++ b/training/utils/parallelize.pl
@@ -18,7 +18,7 @@
#ANNOYANCE: if input is shorter than -j n lines, or at the very last few lines, repeatedly sleeps. time cut down to 15s from 60s
-my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; }
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; }
use LocalConfig;
use Cwd qw/ abs_path cwd getcwd /;
diff --git a/training/risk.cc b/training/utils/risk.cc
index d5a12cfd..d5a12cfd 100644
--- a/training/risk.cc
+++ b/training/utils/risk.cc
diff --git a/training/risk.h b/training/utils/risk.h
index 2e8db0fb..2e8db0fb 100644
--- a/training/risk.h
+++ b/training/utils/risk.h
diff --git a/dpmert/sentclient.c b/training/utils/sentclient.c
index 91d994ab..91d994ab 100644
--- a/dpmert/sentclient.c
+++ b/training/utils/sentclient.c
diff --git a/dpmert/sentserver.c b/training/utils/sentserver.c
index c20b4fa6..c20b4fa6 100644
--- a/dpmert/sentserver.c
+++ b/training/utils/sentserver.c
diff --git a/dpmert/sentserver.h b/training/utils/sentserver.h
index cd17a546..cd17a546 100644
--- a/dpmert/sentserver.h
+++ b/training/utils/sentserver.h
diff --git a/word-aligner/Makefile.am b/word-aligner/Makefile.am
new file mode 100644
index 00000000..280d3ae7
--- /dev/null
+++ b/word-aligner/Makefile.am
@@ -0,0 +1,6 @@
+bin_PROGRAMS = fast_align
+
+fast_align_SOURCES = fast_align.cc ttables.cc
+fast_align_LDADD = $(top_srcdir)/utils/libutils.a -lz
+
+AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/training
diff --git a/training/fast_align.cc b/word-aligner/fast_align.cc
index 7492d26f..7492d26f 100644
--- a/training/fast_align.cc
+++ b/word-aligner/fast_align.cc
diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars
index 08ff33e1..ce3e1638 100644
--- a/word-aligner/makefiles/makefile.grammars
+++ b/word-aligner/makefiles/makefile.grammars
@@ -16,7 +16,7 @@ STEM_E = $(SCRIPT_DIR)/stemmers/$(E_LANG).pl

CLASSIFY = $(SUPPORT_DIR)/classify.pl
MAKE_LEX_GRAMMAR = $(SUPPORT_DIR)/make_lex_grammar.pl
-MODEL1 = $(TRAINING_DIR)/fast_align
+MODEL1 = $(SCRIPT_DIR)/fast_align
MERGE_CORPUS = $(SUPPORT_DIR)/merge_corpus.pl

e.voc: corpus.e
diff --git a/word-aligner/paste-parallel-files.pl b/word-aligner/paste-parallel-files.pl
deleted file mode 100755
index ce53b325..00000000
--- a/word-aligner/paste-parallel-files.pl
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-my @fs = ();
-for my $file (@ARGV) {
- my $fh;
- open $fh, "<$file" or die "Can't open $file for reading: $!";
- push @fs, $fh;
-}
-my $num = scalar @fs;
-die "Usage: $0 file1.txt file2.txt [...]\n" unless $num > 1;
-
-my $first = $fs[0];
-while(<$first>) {
- chomp;
- my @out = ();
- push @out, $_;
- for (my $i=1; $i < $num; $i++) {
- my $f = $fs[$i];
- my $line = <$f>;
- die "Mismatched number of lines!" unless defined $line;
- chomp $line;
- push @out, $line;
- }
- print join(' ||| ', @out) . "\n";
-}
-
-for my $fh (@fs) {
- my $x=<$fh>;
- die "Mismatched number of lines!" if defined $x;
- close $fh;
-}
-
-exit 0;
-
diff --git a/training/ttables.cc b/word-aligner/ttables.cc
index 45bf14c5..45bf14c5 100644
--- a/training/ttables.cc
+++ b/word-aligner/ttables.cc
diff --git a/training/ttables.h b/word-aligner/ttables.h
index 9baa13ca..9baa13ca 100644
--- a/training/ttables.h
+++ b/word-aligner/ttables.h