diff options
| -rw-r--r-- | .gitignore | 28 | ||||
| -rw-r--r-- | Makefile.am | 7 | ||||
| -rw-r--r-- | configure.ac | 32 | ||||
| -rw-r--r-- | dpmert/README.shared-mem | 9 | ||||
| -rw-r--r-- | minrisk/Makefile.am | 6 | ||||
| -rw-r--r-- | pro/README.shared-mem | 9 | ||||
| -rw-r--r-- | training/Makefile.am | 100 | ||||
| -rwxr-xr-x | training/add-model1-features-to-scfg.pl | 93 | ||||
| -rw-r--r-- | training/collapse_weights.cc | 110 | ||||
| -rw-r--r-- | training/crf/Makefile.am | 27 | ||||
| -rw-r--r-- | training/crf/cllh_observer.cc (renamed from training/cllh_observer.cc) | 0 | ||||
| -rw-r--r-- | training/crf/cllh_observer.h (renamed from training/cllh_observer.h) | 0 | ||||
| -rw-r--r-- | training/crf/mpi_batch_optimize.cc (renamed from training/mpi_batch_optimize.cc) | 0 | ||||
| -rw-r--r-- | training/crf/mpi_compute_cllh.cc (renamed from training/mpi_compute_cllh.cc) | 0 | ||||
| -rw-r--r-- | training/crf/mpi_extract_features.cc (renamed from training/mpi_extract_features.cc) | 0 | ||||
| -rw-r--r-- | training/crf/mpi_extract_reachable.cc (renamed from training/mpi_extract_reachable.cc) | 0 | ||||
| -rw-r--r-- | training/crf/mpi_flex_optimize.cc (renamed from training/mpi_flex_optimize.cc) | 0 | ||||
| -rw-r--r-- | training/crf/mpi_online_optimize.cc (renamed from training/mpi_online_optimize.cc) | 0 | ||||
| -rwxr-xr-x | training/dep-reorder/conll2reordering-forest.pl | 65 | ||||
| -rw-r--r-- | training/dep-reorder/george.conll | 4 | ||||
| -rwxr-xr-x | training/dep-reorder/scripts/conll2simplecfg.pl | 57 | ||||
| -rw-r--r-- | training/dpmert/Makefile.am (renamed from dpmert/Makefile.am) | 10 | ||||
| -rw-r--r-- | training/dpmert/ces.cc (renamed from dpmert/ces.cc) | 0 | ||||
| -rw-r--r-- | training/dpmert/ces.h (renamed from dpmert/ces.h) | 0 | ||||
| -rwxr-xr-x | training/dpmert/divide_refs.py (renamed from dpmert/divide_refs.py) | 0 | ||||
| -rwxr-xr-x | training/dpmert/dpmert.pl (renamed from dpmert/dpmert.pl) | 17 | ||||
| -rw-r--r-- | training/dpmert/error_surface.cc (renamed from dpmert/error_surface.cc) | 0 | ||||
| -rw-r--r-- | training/dpmert/error_surface.h (renamed from dpmert/error_surface.h) | 0 | ||||
| -rwxr-xr-x | training/dpmert/line_mediator.pl (renamed from dpmert/line_mediator.pl) | 0 | ||||
| -rw-r--r-- | training/dpmert/line_optimizer.cc (renamed from dpmert/line_optimizer.cc) | 0 | ||||
| -rw-r--r-- | training/dpmert/line_optimizer.h (renamed from dpmert/line_optimizer.h) | 0 | ||||
| -rw-r--r-- | training/dpmert/lo_test.cc (renamed from dpmert/lo_test.cc) | 0 | ||||
| -rw-r--r-- | training/dpmert/mert_geometry.cc (renamed from dpmert/mert_geometry.cc) | 0 | ||||
| -rw-r--r-- | training/dpmert/mert_geometry.h (renamed from dpmert/mert_geometry.h) | 0 | ||||
| -rw-r--r-- | training/dpmert/mr_dpmert_generate_mapper_input.cc (renamed from dpmert/mr_dpmert_generate_mapper_input.cc) | 0 | ||||
| -rw-r--r-- | training/dpmert/mr_dpmert_map.cc (renamed from dpmert/mr_dpmert_map.cc) | 0 | ||||
| -rw-r--r-- | training/dpmert/mr_dpmert_reduce.cc (renamed from dpmert/mr_dpmert_reduce.cc) | 0 | ||||
| -rw-r--r-- | training/dpmert/test_aer/README (renamed from dpmert/test_aer/README) | 0 | ||||
| -rw-r--r-- | training/dpmert/test_aer/cdec.ini (renamed from dpmert/test_aer/cdec.ini) | 0 | ||||
| -rw-r--r-- | training/dpmert/test_aer/corpus.src (renamed from dpmert/test_aer/corpus.src) | 0 | ||||
| -rw-r--r-- | training/dpmert/test_aer/grammar (renamed from dpmert/test_aer/grammar) | 0 | ||||
| -rw-r--r-- | training/dpmert/test_aer/ref.0 (renamed from dpmert/test_aer/ref.0) | 0 | ||||
| -rw-r--r-- | training/dpmert/test_aer/weights (renamed from dpmert/test_aer/weights) | 0 | ||||
| -rw-r--r-- | training/dpmert/test_data/0.json.gz (renamed from dpmert/test_data/0.json.gz) | bin | 13709 -> 13709 bytes | |||
| -rw-r--r-- | training/dpmert/test_data/1.json.gz (renamed from dpmert/test_data/1.json.gz) | bin | 204803 -> 204803 bytes | |||
| -rw-r--r-- | training/dpmert/test_data/c2e.txt.0 (renamed from dpmert/test_data/c2e.txt.0) | 0 | ||||
| -rw-r--r-- | training/dpmert/test_data/c2e.txt.1 (renamed from dpmert/test_data/c2e.txt.1) | 0 | ||||
| -rw-r--r-- | training/dpmert/test_data/c2e.txt.2 (renamed from dpmert/test_data/c2e.txt.2) | 0 | ||||
| -rw-r--r-- | training/dpmert/test_data/c2e.txt.3 (renamed from dpmert/test_data/c2e.txt.3) | 0 | ||||
| -rw-r--r-- | training/dpmert/test_data/re.txt.0 (renamed from dpmert/test_data/re.txt.0) | 0 | ||||
| -rw-r--r-- | training/dpmert/test_data/re.txt.1 (renamed from dpmert/test_data/re.txt.1) | 0 | ||||
| -rw-r--r-- | training/dpmert/test_data/re.txt.2 (renamed from dpmert/test_data/re.txt.2) | 0 | ||||
| -rw-r--r-- | training/dpmert/test_data/re.txt.3 (renamed from dpmert/test_data/re.txt.3) | 0 | ||||
| -rw-r--r-- | training/dtrain/Makefile.am (renamed from dtrain/Makefile.am) | 2 | ||||
| -rw-r--r-- | training/dtrain/README.md (renamed from dtrain/README.md) | 0 | ||||
| -rw-r--r-- | training/dtrain/dtrain.cc (renamed from dtrain/dtrain.cc) | 0 | ||||
| -rw-r--r-- | training/dtrain/dtrain.h (renamed from dtrain/dtrain.h) | 0 | ||||
| -rwxr-xr-x | training/dtrain/hstreaming/avg.rb (renamed from dtrain/hstreaming/avg.rb) | 0 | ||||
| -rw-r--r-- | training/dtrain/hstreaming/cdec.ini (renamed from dtrain/hstreaming/cdec.ini) | 0 | ||||
| -rw-r--r-- | training/dtrain/hstreaming/dtrain.ini (renamed from dtrain/hstreaming/dtrain.ini) | 0 | ||||
| -rwxr-xr-x | training/dtrain/hstreaming/dtrain.sh (renamed from dtrain/hstreaming/dtrain.sh) | 0 | ||||
| -rwxr-xr-x | training/dtrain/hstreaming/hadoop-streaming-job.sh (renamed from dtrain/hstreaming/hadoop-streaming-job.sh) | 0 | ||||
| -rwxr-xr-x | training/dtrain/hstreaming/lplp.rb (renamed from dtrain/hstreaming/lplp.rb) | 0 | ||||
| -rw-r--r-- | training/dtrain/hstreaming/red-test (renamed from dtrain/hstreaming/red-test) | 0 | ||||
| -rw-r--r-- | training/dtrain/kbestget.h (renamed from dtrain/kbestget.h) | 0 | ||||
| -rw-r--r-- | training/dtrain/ksampler.h (renamed from dtrain/ksampler.h) | 0 | ||||
| -rw-r--r-- | training/dtrain/pairsampling.h (renamed from dtrain/pairsampling.h) | 0 | ||||
| -rwxr-xr-x | training/dtrain/parallelize.rb (renamed from dtrain/parallelize.rb) | 0 | ||||
| -rw-r--r-- | training/dtrain/parallelize/test/cdec.ini (renamed from dtrain/parallelize/test/cdec.ini) | 0 | ||||
| -rw-r--r-- | training/dtrain/parallelize/test/dtrain.ini (renamed from dtrain/parallelize/test/dtrain.ini) | 0 | ||||
| -rw-r--r-- | training/dtrain/parallelize/test/in (renamed from dtrain/parallelize/test/in) | 0 | ||||
| -rw-r--r-- | training/dtrain/parallelize/test/refs (renamed from dtrain/parallelize/test/refs) | 0 | ||||
| -rw-r--r-- | training/dtrain/score.cc (renamed from dtrain/score.cc) | 0 | ||||
| -rw-r--r-- | training/dtrain/score.h (renamed from dtrain/score.h) | 0 | ||||
| -rw-r--r-- | training/dtrain/test/example/README (renamed from dtrain/test/example/README) | 0 | ||||
| -rw-r--r-- | training/dtrain/test/example/cdec.ini (renamed from dtrain/test/example/cdec.ini) | 0 | ||||
| -rw-r--r-- | training/dtrain/test/example/dtrain.ini (renamed from dtrain/test/example/dtrain.ini) | 0 | ||||
| -rw-r--r-- | training/dtrain/test/example/expected-output (renamed from dtrain/test/example/expected-output) | 0 | ||||
| -rw-r--r-- | training/dtrain/test/parallelize/cdec.ini (renamed from dtrain/test/parallelize/cdec.ini) | 0 | ||||
| -rw-r--r-- | training/dtrain/test/parallelize/dtrain.ini (renamed from dtrain/test/parallelize/dtrain.ini) | 0 | ||||
| -rw-r--r-- | training/dtrain/test/parallelize/in (renamed from dtrain/test/parallelize/in) | 0 | ||||
| -rw-r--r-- | training/dtrain/test/parallelize/refs (renamed from dtrain/test/parallelize/refs) | 0 | ||||
| -rw-r--r-- | training/dtrain/test/toy/cdec.ini (renamed from dtrain/test/toy/cdec.ini) | 0 | ||||
| -rw-r--r-- | training/dtrain/test/toy/dtrain.ini (renamed from dtrain/test/toy/dtrain.ini) | 0 | ||||
| -rw-r--r-- | training/dtrain/test/toy/input (renamed from dtrain/test/toy/input) | 0 | ||||
| -rw-r--r-- | training/feature_expectations.cc | 232 | ||||
| -rw-r--r-- | training/lbl_model.cc | 421 | ||||
| -rw-r--r-- | training/minrisk/Makefile.am | 6 | ||||
| -rwxr-xr-x | training/minrisk/minrisk.pl (renamed from minrisk/minrisk.pl) | 20 | ||||
| -rwxr-xr-x | training/minrisk/minrisk_generate_input.pl (renamed from minrisk/minrisk_generate_input.pl) | 0 | ||||
| -rw-r--r-- | training/minrisk/minrisk_optimize.cc (renamed from minrisk/minrisk_optimize.cc) | 0 | ||||
| -rw-r--r-- | training/mira/Makefile.am (renamed from mira/Makefile.am) | 2 | ||||
| -rw-r--r-- | training/mira/kbest_mira.cc (renamed from mira/kbest_mira.cc) | 0 | ||||
| -rw-r--r-- | training/mpi_em_optimize.cc | 389 | ||||
| -rw-r--r-- | training/mr_em_adapted_reduce.cc | 173 | ||||
| -rw-r--r-- | training/mr_em_map_adapter.cc | 160 | ||||
| -rw-r--r-- | training/mr_optimize_reduce.cc | 231 | ||||
| -rw-r--r-- | training/mr_reduce_to_weights.cc | 109 | ||||
| -rw-r--r-- | training/pro/Makefile.am (renamed from pro/Makefile.am) | 4 | ||||
| -rwxr-xr-x | training/pro/mr_pro_generate_mapper_input.pl (renamed from pro/mr_pro_generate_mapper_input.pl) | 0 | ||||
| -rw-r--r-- | training/pro/mr_pro_map.cc (renamed from pro/mr_pro_map.cc) | 0 | ||||
| -rw-r--r-- | training/pro/mr_pro_reduce.cc (renamed from pro/mr_pro_reduce.cc) | 0 | ||||
| -rwxr-xr-x | training/pro/pro.pl (renamed from pro/pro.pl) | 20 | ||||
| -rw-r--r-- | training/rampion/Makefile.am (renamed from rampion/Makefile.am) | 4 | ||||
| -rwxr-xr-x | training/rampion/rampion.pl (renamed from rampion/rampion.pl) | 20 | ||||
| -rw-r--r-- | training/rampion/rampion_cccp.cc (renamed from rampion/rampion_cccp.cc) | 0 | ||||
| -rwxr-xr-x | training/rampion/rampion_generate_input.pl (renamed from rampion/rampion_generate_input.pl) | 0 | ||||
| -rw-r--r-- | training/utils/candidate_set.cc (renamed from training/candidate_set.cc) | 0 | ||||
| -rw-r--r-- | training/utils/candidate_set.h (renamed from training/candidate_set.h) | 0 | ||||
| -rwxr-xr-x | training/utils/decode-and-evaluate.pl (renamed from dpmert/decode-and-evaluate.pl) | 8 | ||||
| -rw-r--r-- | training/utils/entropy.cc (renamed from training/entropy.cc) | 0 | ||||
| -rw-r--r-- | training/utils/entropy.h (renamed from training/entropy.h) | 0 | ||||
| -rw-r--r-- | training/utils/grammar_convert.cc (renamed from training/grammar_convert.cc) | 0 | ||||
| -rw-r--r-- | training/utils/lbfgs.h (renamed from training/lbfgs.h) | 0 | ||||
| -rw-r--r-- | training/utils/lbfgs_test.cc (renamed from training/lbfgs_test.cc) | 0 | ||||
| -rw-r--r-- | training/utils/libcall.pl (renamed from dpmert/libcall.pl) | 0 | ||||
| -rw-r--r-- | training/utils/online_optimizer.cc (renamed from training/online_optimizer.cc) | 0 | ||||
| -rw-r--r-- | training/utils/online_optimizer.h (renamed from training/online_optimizer.h) | 0 | ||||
| -rw-r--r-- | training/utils/optimize.cc (renamed from training/optimize.cc) | 0 | ||||
| -rw-r--r-- | training/utils/optimize.h (renamed from training/optimize.h) | 0 | ||||
| -rw-r--r-- | training/utils/optimize_test.cc (renamed from training/optimize_test.cc) | 0 | ||||
| -rwxr-xr-x | training/utils/parallelize.pl (renamed from dpmert/parallelize.pl) | 2 | ||||
| -rw-r--r-- | training/utils/risk.cc (renamed from training/risk.cc) | 0 | ||||
| -rw-r--r-- | training/utils/risk.h (renamed from training/risk.h) | 0 | ||||
| -rw-r--r-- | training/utils/sentclient.c (renamed from dpmert/sentclient.c) | 0 | ||||
| -rw-r--r-- | training/utils/sentserver.c (renamed from dpmert/sentserver.c) | 0 | ||||
| -rw-r--r-- | training/utils/sentserver.h (renamed from dpmert/sentserver.h) | 0 | ||||
| -rw-r--r-- | word-aligner/Makefile.am | 6 | ||||
| -rw-r--r-- | word-aligner/fast_align.cc (renamed from training/fast_align.cc) | 0 | ||||
| -rw-r--r-- | word-aligner/makefiles/makefile.grammars | 2 | ||||
| -rwxr-xr-x | word-aligner/paste-parallel-files.pl | 35 | ||||
| -rw-r--r-- | word-aligner/ttables.cc (renamed from training/ttables.cc) | 0 | ||||
| -rw-r--r-- | word-aligner/ttables.h (renamed from training/ttables.h) | 0 | 
133 files changed, 149 insertions, 2271 deletions
| @@ -1,3 +1,6 @@ +example_extff/ff_example.lo +example_extff/libff_example.la +mteval/meteor_jar.cc  *.a  *.aux  *.bbl @@ -176,4 +179,27 @@ utils/reconstruct_weights  utils/small_vector_test  utils/ts  utils/weights_test -utils/unigram_pyp_lm +training/crf/mpi_batch_optimize +training/crf/mpi_compute_cllh +training/crf/mpi_extract_features +training/crf/mpi_extract_reachable +training/crf/mpi_flex_optimize +training/crf/mpi_online_optimize +training/dpmert/lo_test +training/dpmert/mr_dpmert_generate_mapper_input +training/dpmert/mr_dpmert_map +training/dpmert/mr_dpmert_reduce +training/dpmert/sentclient +training/dpmert/sentserver +training/dtrain/dtrain +training/minrisk/minrisk_optimize +training/mira/kbest_mira +training/pro/mr_pro_map +training/pro/mr_pro_reduce +training/rampion/rampion_cccp +training/utils/Makefile.am +training/utils/lbfgs_test +training/utils/optimize_test +training/utils/sentclient +training/utils/sentserver +word-aligner/fast_align diff --git a/Makefile.am b/Makefile.am index 7ca7268a..dbf604a1 100644 --- a/Makefile.am +++ b/Makefile.am @@ -10,12 +10,7 @@ SUBDIRS = \    decoder \    training \    training/liblbfgs \ -  mira \ -  dtrain \ -  dpmert \ -  pro \ -  rampion \ -  minrisk \ +  word-aligner \    example_extff  #gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava diff --git a/configure.ac b/configure.ac index 09fc5c5b..366112a3 100644 --- a/configure.ac +++ b/configure.ac @@ -82,26 +82,34 @@ AC_PROG_INSTALL  CPPFLAGS="-DPIC -fPIC $CPPFLAGS -DHAVE_CONFIG_H" +# core cdec stuff  AC_CONFIG_FILES([Makefile])  AC_CONFIG_FILES([utils/Makefile])  AC_CONFIG_FILES([mteval/Makefile]) +AC_CONFIG_FILES([mteval/meteor_jar.cc])  AC_CONFIG_FILES([decoder/Makefile]) -AC_CONFIG_FILES([training/Makefile]) -AC_CONFIG_FILES([training/liblbfgs/Makefile]) -AC_CONFIG_FILES([dpmert/Makefile]) -AC_CONFIG_FILES([pro/Makefile]) -AC_CONFIG_FILES([rampion/Makefile]) -AC_CONFIG_FILES([minrisk/Makefile]) +AC_CONFIG_FILES([python/setup.py]) +AC_CONFIG_FILES([word-aligner/Makefile]) + +# KenLM stuff  AC_CONFIG_FILES([klm/util/Makefile])  AC_CONFIG_FILES([klm/lm/Makefile])  AC_CONFIG_FILES([klm/search/Makefile]) -AC_CONFIG_FILES([mira/Makefile]) -AC_CONFIG_FILES([dtrain/Makefile]) -AC_CONFIG_FILES([example_extff/Makefile]) -AC_CONFIG_FILES([mteval/meteor_jar.cc]) - -AC_CONFIG_FILES([python/setup.py]) +# training stuff +AC_CONFIG_FILES([training/Makefile]) +AC_CONFIG_FILES([training/utils/Makefile]) +AC_CONFIG_FILES([training/liblbfgs/Makefile]) +AC_CONFIG_FILES([training/crf/Makefile]) +AC_CONFIG_FILES([training/dpmert/Makefile]) +AC_CONFIG_FILES([training/pro/Makefile]) +AC_CONFIG_FILES([training/rampion/Makefile]) +AC_CONFIG_FILES([training/minrisk/Makefile]) +AC_CONFIG_FILES([training/mira/Makefile]) +AC_CONFIG_FILES([training/dtrain/Makefile]) + +# external feature function example code +AC_CONFIG_FILES([example_extff/Makefile])  AC_OUTPUT diff --git a/dpmert/README.shared-mem b/dpmert/README.shared-mem deleted file mode 100644 index 7728efc0..00000000 --- a/dpmert/README.shared-mem +++ /dev/null @@ -1,9 +0,0 @@ -If you want to run dist-vest.pl on a very large shared memory machine, do the -following: - -  ./dist-vest.pl --use-make I --decode-nodes J --weights weights.init --source-file=dev.src --ref-files=dev.ref.* cdec.ini - -This will use I jobs for doing the line search and J jobs to run the decoder. Typically, since the -decoder must load grammars, language models, etc., J should be smaller than I, but this will depend -on the system you are running on and the complexity of the models used for decoding. - diff --git a/minrisk/Makefile.am b/minrisk/Makefile.am deleted file mode 100644 index a24f047c..00000000 --- a/minrisk/Makefile.am +++ /dev/null @@ -1,6 +0,0 @@ -bin_PROGRAMS = minrisk_optimize - -minrisk_optimize_SOURCES = minrisk_optimize.cc -minrisk_optimize_LDADD = $(top_srcdir)/training/libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/training/liblbfgs/liblbfgs.a -lz - -AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training diff --git a/pro/README.shared-mem b/pro/README.shared-mem deleted file mode 100644 index 7728efc0..00000000 --- a/pro/README.shared-mem +++ /dev/null @@ -1,9 +0,0 @@ -If you want to run dist-vest.pl on a very large shared memory machine, do the -following: - -  ./dist-vest.pl --use-make I --decode-nodes J --weights weights.init --source-file=dev.src --ref-files=dev.ref.* cdec.ini - -This will use I jobs for doing the line search and J jobs to run the decoder. Typically, since the -decoder must load grammars, language models, etc., J should be smaller than I, but this will depend -on the system you are running on and the complexity of the models used for decoding. - diff --git a/training/Makefile.am b/training/Makefile.am index f9c25391..e95e045f 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -1,91 +1,11 @@ -bin_PROGRAMS = \ -  fast_align \ -  lbl_model \ -  test_ngram \ -  mr_em_map_adapter \ -  mr_em_adapted_reduce \ -  mr_reduce_to_weights \ -  mr_optimize_reduce \ -  grammar_convert \ -  plftools \ -  collapse_weights \ -  mpi_extract_reachable \ -  mpi_extract_features \ -  mpi_online_optimize \ -  mpi_flex_optimize \ -  mpi_batch_optimize \ -  mpi_compute_cllh \ -  augment_grammar +SUBDIRS = \ +  liblbfgs \ +  utils \ +  crf \ +  minrisk \ +  dpmert \ +  pro \ +  dtrain \ +  mira \ +  rampion -noinst_PROGRAMS = \ -  lbfgs_test \ -  optimize_test - -TESTS = lbfgs_test optimize_test - -noinst_LIBRARIES = libtraining.a -libtraining_a_SOURCES = \ -  candidate_set.cc \ -  entropy.cc \ -  optimize.cc \ -  online_optimizer.cc \ -  risk.cc - -mpi_online_optimize_SOURCES = mpi_online_optimize.cc -mpi_online_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -mpi_flex_optimize_SOURCES = mpi_flex_optimize.cc -mpi_flex_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -mpi_extract_reachable_SOURCES = mpi_extract_reachable.cc -mpi_extract_reachable_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -mpi_extract_features_SOURCES = mpi_extract_features.cc -mpi_extract_features_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc cllh_observer.cc -mpi_batch_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -mpi_compute_cllh_SOURCES = mpi_compute_cllh.cc cllh_observer.cc -mpi_compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -augment_grammar_SOURCES = augment_grammar.cc -augment_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -test_ngram_SOURCES = test_ngram.cc -test_ngram_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -fast_align_SOURCES = fast_align.cc ttables.cc -fast_align_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -lbl_model_SOURCES = lbl_model.cc -lbl_model_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -grammar_convert_SOURCES = grammar_convert.cc -grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -optimize_test_SOURCES = optimize_test.cc -optimize_test_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -collapse_weights_SOURCES = collapse_weights.cc -collapse_weights_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -lbfgs_test_SOURCES = lbfgs_test.cc -lbfgs_test_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -mr_optimize_reduce_SOURCES = mr_optimize_reduce.cc -mr_optimize_reduce_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -mr_em_map_adapter_SOURCES = mr_em_map_adapter.cc -mr_em_map_adapter_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -mr_reduce_to_weights_SOURCES = mr_reduce_to_weights.cc -mr_reduce_to_weights_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -mr_em_adapted_reduce_SOURCES = mr_em_adapted_reduce.cc -mr_em_adapted_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -plftools_SOURCES = plftools.cc -plftools_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/utils -I$(top_srcdir)/mteval -I../klm diff --git a/training/add-model1-features-to-scfg.pl b/training/add-model1-features-to-scfg.pl deleted file mode 100755 index a0074317..00000000 --- a/training/add-model1-features-to-scfg.pl +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/perl -w - -# [X] ||| so [X,1] die [X,2] der ||| as [X,1] existing [X,2] the ||| 2.47712135315 2.53182387352 5.07100057602 ||| 0-0 2-2 4-4 -# [X] ||| so [X,1] die [X,2] der ||| this [X,1] the [X,2] of ||| 2.47712135315 3.19828724861 2.38270020485 ||| 0-0 2-2 4-4 -# [X] ||| so [X,1] die [X,2] der ||| as [X,1] the [X,2] the ||| 2.47712135315 2.53182387352 1.48463630676 ||| 0-0 2-2 4-4 -# [X] ||| so [X,1] die [X,2] der ||| is [X,1] the [X,2] of the ||| 2.47712135315 3.45197868347 2.64251494408 ||| 0-0 2-2 4-4 4-5 - -die "Usage: $0 model1.f-e model1.e-f < grammar.scfg\n  (use trianing/model1 to extract the model files)\n" unless scalar @ARGV == 2; - -my $fm1 = shift @ARGV; -die unless $fm1; -my $frm1 = shift @ARGV; -die unless $frm1; -open M1,"<$fm1" or die; -open RM1,"<$frm1" or die; -print STDERR "Loading Model 1 probs from $fm1...\n"; -my %m1; -while(<M1>) { -  chomp; -  my ($f, $e, $lp) = split /\s+/; -  $m1{$e}->{$f} = exp($lp); -} -close M1; - -print STDERR "Loading Inverse Model 1 probs from $frm1...\n"; -my %rm1; -while(<RM1>) { -  chomp; -  my ($e, $f, $lp) = split /\s+/; -  $rm1{$f}->{$e} = exp($lp); -} -close RM1; - -my @label = qw( EGivenF LexFGivenE LexEGivenF ); -while(<>) { -  chomp; -  my ($l, $f, $e, $sscores, $al) = split / \|\|\| /; -  my @scores = split /\s+/, $sscores; -  unless ($sscores =~ /=/) { -    for (my $i=0; $i<3; $i++) { $scores[$i] = "$label[$i]=$scores[$i]"; } -  } -  push @scores, "RuleCount=1"; -  my @fs = split /\s+/, $f; -  my @es = split /\s+/, $e; -  my $flen = scalar @fs; -  my $elen = scalar @es; -  my $pgen = 0; -  my $nongen = 0; -  for (my $i =0; $i < $flen; $i++) { -    my $ftot = 0; -    next if ($fs[$i] =~ /\[X/); -    my $cr = $rm1{$fs[$i]}; -    for (my $j=0; $j <= $elen; $j++) { -      my $ej = '<eps>'; -      if ($j < $elen) { $ej = $es[$j]; } -      my $p = $cr->{$ej}; -      if (defined $p) { $ftot += $p; } -    } -    if ($ftot == 0) { $nongen = 1; last; } -    $pgen += log($ftot) - log($elen); -  } -  my $bad = 0; -  my $good = 0; -  unless ($nongen) { push @scores, "RGood=1"; $good++; } else { push @scores, "RBad=1"; $bad++; } - -  $nongen = 0; -  $pgen = 0; -  for (my $i =0; $i < $elen; $i++) { -    my $etot = 0; -    next if ($es[$i] =~ /\[X/); -    my $cr = $m1{$es[$i]}; -#    print STDERR "$es[$i]\n"; -    for (my $j=0; $j <= $flen; $j++) { -      my $fj = '<eps>'; -      if ($j < $flen) { $fj = $fs[$j]; } -      my $p = $cr->{$fj}; -#      print STDERR "  $fs[$j] : $p\n"; -      if (defined $p) { $etot += $p; } -    } -    if ($etot == 0) { $nongen = 1; last; } -    $pgen += log($etot) - log($flen); -  } -  unless ($nongen) { -    push @scores, "FGood=1"; -    if ($good) { push @scores, "BothGood=1"; } else { push @scores, "SusDel=1"; } -  } else { -    push @scores, "FBad=1"; -    if ($bad) { push @scores, "BothBad=1"; } else { push @scores, "SusHall=1"; } -  } -  print "$l ||| $f ||| $e ||| @scores"; -  if (defined $al) { print " ||| $al\n"; } else { print "\n"; } -} - diff --git a/training/collapse_weights.cc b/training/collapse_weights.cc deleted file mode 100644 index c03eb031..00000000 --- a/training/collapse_weights.cc +++ /dev/null @@ -1,110 +0,0 @@ -char const* NOTES = -  "ZF_and_E means unnormalized scaled features.\n" -  "For grammars with one nonterminal: F_and_E is joint,\n" -  "F_given_E and E_given_F are conditional.\n" -  "TODO: group rules by root nonterminal and then normalize.\n"; - - -#include <iostream> -#include <fstream> -#include <tr1/unordered_map> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> -#include <boost/functional/hash.hpp> - -#include "prob.h" -#include "filelib.h" -#include "trule.h" -#include "weights.h" - -namespace po = boost::program_options; -using namespace std; - -typedef std::tr1::unordered_map<vector<WordID>, prob_t, boost::hash<vector<WordID> > > MarginalMap; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("grammar,g", po::value<string>(), "Grammar file") -        ("weights,w", po::value<string>(), "Weights file") -    ("unnormalized,u", "Always include ZF_and_E unnormalized score (default: only if sum was >1)") -    ; -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config,c", po::value<string>(), "Configuration file") -        ("help,h", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); - -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    const string cfg = (*conf)["config"].as<string>(); -    cerr << "Configuration file: " << cfg << endl; -    ifstream config(cfg.c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help") || !conf->count("grammar") || !conf->count("weights")) { -    cerr << dcmdline_options << endl; -    cerr << NOTES << endl; -    exit(1); -  } -} - -int main(int argc, char** argv) { -  po::variables_map conf; -  InitCommandLine(argc, argv, &conf); -  const string wfile = conf["weights"].as<string>(); -  const string gfile = conf["grammar"].as<string>(); -  vector<weight_t> w; -  Weights::InitFromFile(wfile, &w); -  MarginalMap e_tots; -  MarginalMap f_tots; -  prob_t tot; -  { -    ReadFile rf(gfile); -    assert(*rf.stream()); -    istream& in = *rf.stream(); -    cerr << "Computing marginals...\n"; -    int lc = 0; -    while(in) { -      string line; -      getline(in, line); -      ++lc; -      if (line.empty()) continue; -      TRule tr(line, true); -      if (tr.GetFeatureValues().empty()) -        cerr << "Line " << lc << ": empty features - may introduce bias\n"; -      prob_t prob; -      prob.logeq(tr.GetFeatureValues().dot(w)); -      e_tots[tr.e_] += prob; -      f_tots[tr.f_] += prob; -      tot += prob; -    } -  } -  bool normalized = (fabs(log(tot)) < 0.001); -  cerr << "Total: " << tot << (normalized ? " [normalized]" : " [scaled]") << endl; -  ReadFile rf(gfile); -  istream&in = *rf.stream(); -  while(in) { -    string line; -    getline(in, line); -    if (line.empty()) continue; -    TRule tr(line, true); -    const double lp = tr.GetFeatureValues().dot(w); -    if (std::isinf(lp)) { continue; } -    tr.scores_.clear(); - -    cout << tr.AsString() << " ||| F_and_E=" << lp - log(tot); -    if (!normalized || conf.count("unnormalized")) { -      cout << ";ZF_and_E=" << lp; -    } -    cout << ";F_given_E=" << lp - log(e_tots[tr.e_]) -         << ";E_given_F=" << lp - log(f_tots[tr.f_]) << endl; -  } -  return 0; -} - diff --git a/training/crf/Makefile.am b/training/crf/Makefile.am new file mode 100644 index 00000000..d203df25 --- /dev/null +++ b/training/crf/Makefile.am @@ -0,0 +1,27 @@ +bin_PROGRAMS = \ +  mpi_batch_optimize \ +  mpi_compute_cllh \ +  mpi_extract_features \ +  mpi_extract_reachable \ +  mpi_flex_optimize \ +  mpi_online_optimize + +mpi_online_optimize_SOURCES = mpi_online_optimize.cc +mpi_online_optimize_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +mpi_flex_optimize_SOURCES = mpi_flex_optimize.cc +mpi_flex_optimize_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +mpi_extract_reachable_SOURCES = mpi_extract_reachable.cc +mpi_extract_reachable_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +mpi_extract_features_SOURCES = mpi_extract_features.cc +mpi_extract_features_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc cllh_observer.cc +mpi_batch_optimize_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +mpi_compute_cllh_SOURCES = mpi_compute_cllh.cc cllh_observer.cc +mpi_compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wall -Wno-sign-compare -I$(top_srcdir)/training -I$(top_srcdir)/training/utils -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/training/cllh_observer.cc b/training/crf/cllh_observer.cc index 4ec2fa65..4ec2fa65 100644 --- a/training/cllh_observer.cc +++ b/training/crf/cllh_observer.cc diff --git a/training/cllh_observer.h b/training/crf/cllh_observer.h index 0de47331..0de47331 100644 --- a/training/cllh_observer.h +++ b/training/crf/cllh_observer.h diff --git a/training/mpi_batch_optimize.cc b/training/crf/mpi_batch_optimize.cc index 2eff07e4..2eff07e4 100644 --- a/training/mpi_batch_optimize.cc +++ b/training/crf/mpi_batch_optimize.cc diff --git a/training/mpi_compute_cllh.cc b/training/crf/mpi_compute_cllh.cc index 066389d0..066389d0 100644 --- a/training/mpi_compute_cllh.cc +++ b/training/crf/mpi_compute_cllh.cc diff --git a/training/mpi_extract_features.cc b/training/crf/mpi_extract_features.cc index 6750aa15..6750aa15 100644 --- a/training/mpi_extract_features.cc +++ b/training/crf/mpi_extract_features.cc diff --git a/training/mpi_extract_reachable.cc b/training/crf/mpi_extract_reachable.cc index 2a7c2b9d..2a7c2b9d 100644 --- a/training/mpi_extract_reachable.cc +++ b/training/crf/mpi_extract_reachable.cc diff --git a/training/mpi_flex_optimize.cc b/training/crf/mpi_flex_optimize.cc index b52decdc..b52decdc 100644 --- a/training/mpi_flex_optimize.cc +++ b/training/crf/mpi_flex_optimize.cc diff --git a/training/mpi_online_optimize.cc b/training/crf/mpi_online_optimize.cc index d6968848..d6968848 100644 --- a/training/mpi_online_optimize.cc +++ b/training/crf/mpi_online_optimize.cc diff --git a/training/dep-reorder/conll2reordering-forest.pl b/training/dep-reorder/conll2reordering-forest.pl deleted file mode 100755 index 3cd226be..00000000 --- a/training/dep-reorder/conll2reordering-forest.pl +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my $script_dir; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; } -my $FIRST_CONV = "$script_dir/scripts/conll2simplecfg.pl"; -my $CDEC = "$script_dir/../../decoder/cdec"; - -our $tfile1 = "grammar1.$$"; -our $tfile2 = "text.$$"; - -die "Usage: $0 parses.conll\n" unless scalar @ARGV == 1; -open C, "<$ARGV[0]" or die "Can't read $ARGV[0]: $!"; - -END { unlink $tfile1; unlink "$tfile1.cfg"; unlink $tfile2; } - -my $first = 1; -open T, ">$tfile1" or die "Can't write $tfile1: $!"; -my $lc = 0; -my $flag = 0; -my @words = (); -while(<C>) { -  print T; -  chomp; -  if (/^$/) { -    if ($first) { $first = undef; } else { if ($flag) { print "\n"; $flag = 0; } } -    $first = undef; -    close T; -    open SO, ">$tfile2" or die "Can't write $tfile2: $!"; -    print SO "@words\n"; -    close SO; -    @words=(); -    `$FIRST_CONV < $tfile1 > $tfile1.cfg`; -    if ($? != 0) { -      die "Error code: $?"; -    } -    my $cfg = `$CDEC -n -S 10000 -f scfg -g $tfile1.cfg -i $tfile2 --show_cfg_search_space 2>/dev/null`; -    if ($? != 0) { -      die "Error code: $?"; -    } -    my @rules = split /\n/, $cfg; -    shift @rules; # get rid of output -    for my $rule (@rules) { -      my ($lhs, $f, $e, $feats) = split / \|\|\| /, $rule; -      $f =~ s/,\d\]/\]/g; -      $feats = 'TOP=1' unless $feats; -      if ($lhs =~ /\[Goal_\d+\]/) { $lhs = '[S]'; } -      print "$lhs ||| $f ||| $feats\n"; -      if ($e eq '[1] [2]') { -        my ($a, $b) = split /\s+/, $f; -        $feats =~ s/=1$//; -        my ($x, $y) = split /_/, $feats; -        print "$lhs ||| $b $a ||| ${y}_$x=1\n"; -      } -      $flag = 1; -    } -    open T, ">$tfile1" or die "Can't write $tfile1: $!"; -    $lc = -1; -  } else { -    my ($ind, $word, @dmmy) = split /\s+/; -    push @words, $word; -  } -  $lc++; -} -close T; - diff --git a/training/dep-reorder/george.conll b/training/dep-reorder/george.conll deleted file mode 100644 index 7eebb360..00000000 --- a/training/dep-reorder/george.conll +++ /dev/null @@ -1,4 +0,0 @@ -1	George	_	GEORGE	_	_	2	X	_	_ -2	hates	_	HATES	_	_	0	X	_	_ -3	broccoli	_	BROC	_	_	2	X	_	_ - diff --git a/training/dep-reorder/scripts/conll2simplecfg.pl b/training/dep-reorder/scripts/conll2simplecfg.pl deleted file mode 100755 index b101347a..00000000 --- a/training/dep-reorder/scripts/conll2simplecfg.pl +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -# 1	在	_	10	_	_	4	X	_	_ -# 2	门厅	_	3	_	_	1	X	_	_ -# 3	下面	_	23	_	_	4	X	_	_ -# 4	。	_	45	_	_	0	X	_	_ - -my @ldeps; -my @rdeps; -@ldeps=(); for (my $i =0; $i <1000; $i++) { push @ldeps, []; } -@rdeps=(); for (my $i =0; $i <1000; $i++) { push @rdeps, []; } -my $rootcat = 0; -my @cats = ('S'); -my $len = 0; -my @noposcats = ('S'); -while(<>) { -  chomp; -  if (/^\s*$/) { -    write_cfg($len); -    $len = 0; -    @cats=('S'); -    @noposcats = ('S'); -    @ldeps=(); for (my $i =0; $i <1000; $i++) { push @ldeps, []; } -    @rdeps=(); for (my $i =0; $i <1000; $i++) { push @rdeps, []; } -    next; -  } -  $len++; -  my ($pos, $word, $d1, $xcat, $d2, $d3, $headpos, $deptype) = split /\s+/; -  my $cat = "C$xcat"; -  my $catpos = $cat . "_$pos"; -  push @cats, $catpos; -  push @noposcats, $cat; -  print "[$catpos] ||| $word ||| $word ||| Word=1\n"; -  if ($headpos == 0) { $rootcat = $pos; } -  if ($pos < $headpos) { -    push @{$ldeps[$headpos]}, $pos; -  } else { -    push @{$rdeps[$headpos]}, $pos; -  } -} - -sub write_cfg { -  my $len = shift; -  for (my $i = 1; $i <= $len; $i++) { -    my @lds = @{$ldeps[$i]}; -    for my $ld (@lds) { -      print "[$cats[$i]] ||| [$cats[$ld],1] [$cats[$i],2] ||| [1] [2] ||| $noposcats[$ld]_$noposcats[$i]=1\n"; -    } -    my @rds = @{$rdeps[$i]}; -    for my $rd (@rds) { -      print "[$cats[$i]] ||| [$cats[$i],1] [$cats[$rd],2] ||| [1] [2] ||| $noposcats[$i]_$noposcats[$rd]=1\n"; -    } -  } -  print "[S] ||| [$cats[$rootcat],1] ||| [1] ||| TOP=1\n"; -} - diff --git a/dpmert/Makefile.am b/training/dpmert/Makefile.am index 00768271..ff318bef 100644 --- a/dpmert/Makefile.am +++ b/training/dpmert/Makefile.am @@ -1,20 +1,12 @@  bin_PROGRAMS = \    mr_dpmert_map \    mr_dpmert_reduce \ -  mr_dpmert_generate_mapper_input \ -  sentserver \ -  sentclient +  mr_dpmert_generate_mapper_input  noinst_PROGRAMS = \    lo_test  TESTS = lo_test -sentserver_SOURCES = sentserver.c -sentserver_LDFLAGS = -pthread - -sentclient_SOURCES = sentclient.c -sentclient_LDFLAGS = -pthread -  mr_dpmert_generate_mapper_input_SOURCES = mr_dpmert_generate_mapper_input.cc line_optimizer.cc  mr_dpmert_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz diff --git a/dpmert/ces.cc b/training/dpmert/ces.cc index 157b2d17..157b2d17 100644 --- a/dpmert/ces.cc +++ b/training/dpmert/ces.cc diff --git a/dpmert/ces.h b/training/dpmert/ces.h index e4fa2080..e4fa2080 100644 --- a/dpmert/ces.h +++ b/training/dpmert/ces.h diff --git a/dpmert/divide_refs.py b/training/dpmert/divide_refs.py index b478f918..b478f918 100755 --- a/dpmert/divide_refs.py +++ b/training/dpmert/divide_refs.py diff --git a/dpmert/dpmert.pl b/training/dpmert/dpmert.pl index c4f98870..559420f5 100755 --- a/dpmert/dpmert.pl +++ b/training/dpmert/dpmert.pl @@ -2,7 +2,7 @@  use strict;  my @ORIG_ARGV=@ARGV;  use Cwd qw(getcwd); -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; }  # Skip local config (used for distributing jobs) if we're running in local-only mode  use LocalConfig; @@ -17,21 +17,22 @@ my $srcFile;  # deprecated  my $refFiles; # deprecated  my $default_jobs = env_default_jobs();  my $bin_dir = $SCRIPT_DIR; +my $util_dir = "$SCRIPT_DIR/../utils";  die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; -my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score";  die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;  my $MAPINPUT = "$bin_dir/mr_dpmert_generate_mapper_input";  my $MAPPER = "$bin_dir/mr_dpmert_map";  my $REDUCER = "$bin_dir/mr_dpmert_reduce"; -my $parallelize = "$bin_dir/parallelize.pl"; -my $libcall = "$bin_dir/libcall.pl"; -my $sentserver = "$bin_dir/sentserver"; -my $sentclient = "$bin_dir/sentclient"; -my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; +my $parallelize = "$util_dir/parallelize.pl"; +my $libcall = "$util_dir/libcall.pl"; +my $sentserver = "$util_dir/sentserver"; +my $sentclient = "$util_dir/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm";  my $SCORER = $FAST_SCORE;  die "Can't find $MAPPER" unless -x $MAPPER; -my $cdec = "$bin_dir/../decoder/cdec"; +my $cdec = "$bin_dir/../../decoder/cdec";  die "Can't find decoder in $cdec" unless -x $cdec;  die "Can't find $parallelize" unless -x $parallelize;  die "Can't find $libcall" unless -e $libcall; diff --git a/dpmert/error_surface.cc b/training/dpmert/error_surface.cc index 515b67f8..515b67f8 100644 --- a/dpmert/error_surface.cc +++ b/training/dpmert/error_surface.cc diff --git a/dpmert/error_surface.h b/training/dpmert/error_surface.h index bb65847b..bb65847b 100644 --- a/dpmert/error_surface.h +++ b/training/dpmert/error_surface.h diff --git a/dpmert/line_mediator.pl b/training/dpmert/line_mediator.pl index bc2bb24c..bc2bb24c 100755 --- a/dpmert/line_mediator.pl +++ b/training/dpmert/line_mediator.pl diff --git a/dpmert/line_optimizer.cc b/training/dpmert/line_optimizer.cc index 9cf33502..9cf33502 100644 --- a/dpmert/line_optimizer.cc +++ b/training/dpmert/line_optimizer.cc diff --git a/dpmert/line_optimizer.h b/training/dpmert/line_optimizer.h index 83819f41..83819f41 100644 --- a/dpmert/line_optimizer.h +++ b/training/dpmert/line_optimizer.h diff --git a/dpmert/lo_test.cc b/training/dpmert/lo_test.cc index 95a08d3d..95a08d3d 100644 --- a/dpmert/lo_test.cc +++ b/training/dpmert/lo_test.cc diff --git a/dpmert/mert_geometry.cc b/training/dpmert/mert_geometry.cc index d6973658..d6973658 100644 --- a/dpmert/mert_geometry.cc +++ b/training/dpmert/mert_geometry.cc diff --git a/dpmert/mert_geometry.h b/training/dpmert/mert_geometry.h index a8b6959e..a8b6959e 100644 --- a/dpmert/mert_geometry.h +++ b/training/dpmert/mert_geometry.h diff --git a/dpmert/mr_dpmert_generate_mapper_input.cc b/training/dpmert/mr_dpmert_generate_mapper_input.cc index 199cd23a..199cd23a 100644 --- a/dpmert/mr_dpmert_generate_mapper_input.cc +++ b/training/dpmert/mr_dpmert_generate_mapper_input.cc diff --git a/dpmert/mr_dpmert_map.cc b/training/dpmert/mr_dpmert_map.cc index d1efcf96..d1efcf96 100644 --- a/dpmert/mr_dpmert_map.cc +++ b/training/dpmert/mr_dpmert_map.cc diff --git a/dpmert/mr_dpmert_reduce.cc b/training/dpmert/mr_dpmert_reduce.cc index 31512a03..31512a03 100644 --- a/dpmert/mr_dpmert_reduce.cc +++ b/training/dpmert/mr_dpmert_reduce.cc diff --git a/dpmert/test_aer/README b/training/dpmert/test_aer/README index 819b2e32..819b2e32 100644 --- a/dpmert/test_aer/README +++ b/training/dpmert/test_aer/README diff --git a/dpmert/test_aer/cdec.ini b/training/dpmert/test_aer/cdec.ini index 08187848..08187848 100644 --- a/dpmert/test_aer/cdec.ini +++ b/training/dpmert/test_aer/cdec.ini diff --git a/dpmert/test_aer/corpus.src b/training/dpmert/test_aer/corpus.src index 31b23971..31b23971 100644 --- a/dpmert/test_aer/corpus.src +++ b/training/dpmert/test_aer/corpus.src diff --git a/dpmert/test_aer/grammar b/training/dpmert/test_aer/grammar index 9d857824..9d857824 100644 --- a/dpmert/test_aer/grammar +++ b/training/dpmert/test_aer/grammar diff --git a/dpmert/test_aer/ref.0 b/training/dpmert/test_aer/ref.0 index 734a9c5b..734a9c5b 100644 --- a/dpmert/test_aer/ref.0 +++ b/training/dpmert/test_aer/ref.0 diff --git a/dpmert/test_aer/weights b/training/dpmert/test_aer/weights index afc9282e..afc9282e 100644 --- a/dpmert/test_aer/weights +++ b/training/dpmert/test_aer/weights diff --git a/dpmert/test_data/0.json.gz b/training/dpmert/test_data/0.json.gzBinary files differ index 30f8dd77..30f8dd77 100644 --- a/dpmert/test_data/0.json.gz +++ b/training/dpmert/test_data/0.json.gz diff --git a/dpmert/test_data/1.json.gz b/training/dpmert/test_data/1.json.gzBinary files differ index c82cc179..c82cc179 100644 --- a/dpmert/test_data/1.json.gz +++ b/training/dpmert/test_data/1.json.gz diff --git a/dpmert/test_data/c2e.txt.0 b/training/dpmert/test_data/c2e.txt.0 index 12c4abe9..12c4abe9 100644 --- a/dpmert/test_data/c2e.txt.0 +++ b/training/dpmert/test_data/c2e.txt.0 diff --git a/dpmert/test_data/c2e.txt.1 b/training/dpmert/test_data/c2e.txt.1 index 4ac12df1..4ac12df1 100644 --- a/dpmert/test_data/c2e.txt.1 +++ b/training/dpmert/test_data/c2e.txt.1 diff --git a/dpmert/test_data/c2e.txt.2 b/training/dpmert/test_data/c2e.txt.2 index 2f67b72f..2f67b72f 100644 --- a/dpmert/test_data/c2e.txt.2 +++ b/training/dpmert/test_data/c2e.txt.2 diff --git a/dpmert/test_data/c2e.txt.3 b/training/dpmert/test_data/c2e.txt.3 index 5483cef6..5483cef6 100644 --- a/dpmert/test_data/c2e.txt.3 +++ b/training/dpmert/test_data/c2e.txt.3 diff --git a/dpmert/test_data/re.txt.0 b/training/dpmert/test_data/re.txt.0 index 86eff087..86eff087 100644 --- a/dpmert/test_data/re.txt.0 +++ b/training/dpmert/test_data/re.txt.0 diff --git a/dpmert/test_data/re.txt.1 b/training/dpmert/test_data/re.txt.1 index 2140f198..2140f198 100644 --- a/dpmert/test_data/re.txt.1 +++ b/training/dpmert/test_data/re.txt.1 diff --git a/dpmert/test_data/re.txt.2 b/training/dpmert/test_data/re.txt.2 index 94e46286..94e46286 100644 --- a/dpmert/test_data/re.txt.2 +++ b/training/dpmert/test_data/re.txt.2 diff --git a/dpmert/test_data/re.txt.3 b/training/dpmert/test_data/re.txt.3 index f87c3308..f87c3308 100644 --- a/dpmert/test_data/re.txt.3 +++ b/training/dpmert/test_data/re.txt.3 diff --git a/dtrain/Makefile.am b/training/dtrain/Makefile.am index ca9581f5..5b48e756 100644 --- a/dtrain/Makefile.am +++ b/training/dtrain/Makefile.am @@ -1,7 +1,7 @@  bin_PROGRAMS = dtrain  dtrain_SOURCES = dtrain.cc score.cc -dtrain_LDADD   = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz +dtrain_LDADD   = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz  AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/dtrain/README.md b/training/dtrain/README.md index 7edabbf1..7edabbf1 100644 --- a/dtrain/README.md +++ b/training/dtrain/README.md diff --git a/dtrain/dtrain.cc b/training/dtrain/dtrain.cc index 18286668..18286668 100644 --- a/dtrain/dtrain.cc +++ b/training/dtrain/dtrain.cc diff --git a/dtrain/dtrain.h b/training/dtrain/dtrain.h index 4b6f415c..4b6f415c 100644 --- a/dtrain/dtrain.h +++ b/training/dtrain/dtrain.h diff --git a/dtrain/hstreaming/avg.rb b/training/dtrain/hstreaming/avg.rb index 2599c732..2599c732 100755 --- a/dtrain/hstreaming/avg.rb +++ b/training/dtrain/hstreaming/avg.rb diff --git a/dtrain/hstreaming/cdec.ini b/training/dtrain/hstreaming/cdec.ini index d4f5cecd..d4f5cecd 100644 --- a/dtrain/hstreaming/cdec.ini +++ b/training/dtrain/hstreaming/cdec.ini diff --git a/dtrain/hstreaming/dtrain.ini b/training/dtrain/hstreaming/dtrain.ini index a2c219a1..a2c219a1 100644 --- a/dtrain/hstreaming/dtrain.ini +++ b/training/dtrain/hstreaming/dtrain.ini diff --git a/dtrain/hstreaming/dtrain.sh b/training/dtrain/hstreaming/dtrain.sh index 877ff94c..877ff94c 100755 --- a/dtrain/hstreaming/dtrain.sh +++ b/training/dtrain/hstreaming/dtrain.sh diff --git a/dtrain/hstreaming/hadoop-streaming-job.sh b/training/dtrain/hstreaming/hadoop-streaming-job.sh index 92419956..92419956 100755 --- a/dtrain/hstreaming/hadoop-streaming-job.sh +++ b/training/dtrain/hstreaming/hadoop-streaming-job.sh diff --git a/dtrain/hstreaming/lplp.rb b/training/dtrain/hstreaming/lplp.rb index f0cd58c5..f0cd58c5 100755 --- a/dtrain/hstreaming/lplp.rb +++ b/training/dtrain/hstreaming/lplp.rb diff --git a/dtrain/hstreaming/red-test b/training/dtrain/hstreaming/red-test index 2623d697..2623d697 100644 --- a/dtrain/hstreaming/red-test +++ b/training/dtrain/hstreaming/red-test diff --git a/dtrain/kbestget.h b/training/dtrain/kbestget.h index dd8882e1..dd8882e1 100644 --- a/dtrain/kbestget.h +++ b/training/dtrain/kbestget.h diff --git a/dtrain/ksampler.h b/training/dtrain/ksampler.h index bc2f56cd..bc2f56cd 100644 --- a/dtrain/ksampler.h +++ b/training/dtrain/ksampler.h diff --git a/dtrain/pairsampling.h b/training/dtrain/pairsampling.h index 84be1efb..84be1efb 100644 --- a/dtrain/pairsampling.h +++ b/training/dtrain/pairsampling.h diff --git a/dtrain/parallelize.rb b/training/dtrain/parallelize.rb index 1d277ff6..1d277ff6 100755 --- a/dtrain/parallelize.rb +++ b/training/dtrain/parallelize.rb diff --git a/dtrain/parallelize/test/cdec.ini b/training/dtrain/parallelize/test/cdec.ini index 72e99dc5..72e99dc5 100644 --- a/dtrain/parallelize/test/cdec.ini +++ b/training/dtrain/parallelize/test/cdec.ini diff --git a/dtrain/parallelize/test/dtrain.ini b/training/dtrain/parallelize/test/dtrain.ini index 03f9d240..03f9d240 100644 --- a/dtrain/parallelize/test/dtrain.ini +++ b/training/dtrain/parallelize/test/dtrain.ini diff --git a/dtrain/parallelize/test/in b/training/dtrain/parallelize/test/in index a312809f..a312809f 100644 --- a/dtrain/parallelize/test/in +++ b/training/dtrain/parallelize/test/in diff --git a/dtrain/parallelize/test/refs b/training/dtrain/parallelize/test/refs index 4d3128cb..4d3128cb 100644 --- a/dtrain/parallelize/test/refs +++ b/training/dtrain/parallelize/test/refs diff --git a/dtrain/score.cc b/training/dtrain/score.cc index 34fc86a9..34fc86a9 100644 --- a/dtrain/score.cc +++ b/training/dtrain/score.cc diff --git a/dtrain/score.h b/training/dtrain/score.h index f317c903..f317c903 100644 --- a/dtrain/score.h +++ b/training/dtrain/score.h diff --git a/dtrain/test/example/README b/training/dtrain/test/example/README index 6937b11b..6937b11b 100644 --- a/dtrain/test/example/README +++ b/training/dtrain/test/example/README diff --git a/dtrain/test/example/cdec.ini b/training/dtrain/test/example/cdec.ini index d5955f0e..d5955f0e 100644 --- a/dtrain/test/example/cdec.ini +++ b/training/dtrain/test/example/cdec.ini diff --git a/dtrain/test/example/dtrain.ini b/training/dtrain/test/example/dtrain.ini index 72d50ca1..72d50ca1 100644 --- a/dtrain/test/example/dtrain.ini +++ b/training/dtrain/test/example/dtrain.ini diff --git a/dtrain/test/example/expected-output b/training/dtrain/test/example/expected-output index 05326763..05326763 100644 --- a/dtrain/test/example/expected-output +++ b/training/dtrain/test/example/expected-output diff --git a/dtrain/test/parallelize/cdec.ini b/training/dtrain/test/parallelize/cdec.ini index 72e99dc5..72e99dc5 100644 --- a/dtrain/test/parallelize/cdec.ini +++ b/training/dtrain/test/parallelize/cdec.ini diff --git a/dtrain/test/parallelize/dtrain.ini b/training/dtrain/test/parallelize/dtrain.ini index 03f9d240..03f9d240 100644 --- a/dtrain/test/parallelize/dtrain.ini +++ b/training/dtrain/test/parallelize/dtrain.ini diff --git a/dtrain/test/parallelize/in b/training/dtrain/test/parallelize/in index a312809f..a312809f 100644 --- a/dtrain/test/parallelize/in +++ b/training/dtrain/test/parallelize/in diff --git a/dtrain/test/parallelize/refs b/training/dtrain/test/parallelize/refs index 4d3128cb..4d3128cb 100644 --- a/dtrain/test/parallelize/refs +++ b/training/dtrain/test/parallelize/refs diff --git a/dtrain/test/toy/cdec.ini b/training/dtrain/test/toy/cdec.ini index 98b02d44..98b02d44 100644 --- a/dtrain/test/toy/cdec.ini +++ b/training/dtrain/test/toy/cdec.ini diff --git a/dtrain/test/toy/dtrain.ini b/training/dtrain/test/toy/dtrain.ini index a091732f..a091732f 100644 --- a/dtrain/test/toy/dtrain.ini +++ b/training/dtrain/test/toy/dtrain.ini diff --git a/dtrain/test/toy/input b/training/dtrain/test/toy/input index 4d10a9ea..4d10a9ea 100644 --- a/dtrain/test/toy/input +++ b/training/dtrain/test/toy/input diff --git a/training/feature_expectations.cc b/training/feature_expectations.cc deleted file mode 100644 index f1a85495..00000000 --- a/training/feature_expectations.cc +++ /dev/null @@ -1,232 +0,0 @@ -#include <sstream> -#include <iostream> -#include <fstream> -#include <vector> -#include <cassert> -#include <cmath> -#include <tr1/memory> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "verbose.h" -#include "hg.h" -#include "prob.h" -#include "inside_outside.h" -#include "ff_register.h" -#include "decoder.h" -#include "filelib.h" -#include "online_optimizer.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" -#include "sampler.h" - -#ifdef HAVE_MPI -#include <boost/mpi/timer.hpp> -#include <boost/mpi.hpp> -namespace mpi = boost::mpi; -#endif - -using namespace std; -namespace po = boost::program_options; - -struct FComp { -  const vector<double>& w_; -  FComp(const vector<double>& w) : w_(w) {} -  bool operator()(int a, int b) const { -    return fabs(w_[a]) > fabs(w_[b]); -  } -}; - -void ShowFeatures(const vector<double>& w) { -  vector<int> fnums(w.size()); -  for (int i = 0; i < w.size(); ++i) -    fnums[i] = i; -  sort(fnums.begin(), fnums.end(), FComp(w)); -  for (vector<int>::iterator i = fnums.begin(); i != fnums.end(); ++i) { -    if (w[*i]) cout << FD::Convert(*i) << ' ' << w[*i] << endl; -  } -} - -void ReadConfig(const string& ini, vector<string>* out) { -  ReadFile rf(ini); -  istream& in = *rf.stream(); -  while(in) { -    string line; -    getline(in, line); -    if (!in) continue; -    out->push_back(line); -  } -} - -void StoreConfig(const vector<string>& cfg, istringstream* o) { -  ostringstream os; -  for (int i = 0; i < cfg.size(); ++i) { os << cfg[i] << endl; } -  o->str(os.str()); -} - -bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("input,i",po::value<string>(),"Corpus of source language sentences") -        ("weights,w",po::value<string>(),"Input feature weights file") -        ("decoder_config,c",po::value<string>(), "cdec.ini file"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help,h", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help") || !conf->count("input") || !conf->count("decoder_config")) { -    cerr << dcmdline_options << endl; -    return false; -  } -  return true; -} - -void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* order) { -  ReadFile rf(fname); -  istream& in = *rf.stream(); -  string line; -  int id = 0; -  while(in) { -    getline(in, line); -    if (!in) break; -    if (id % size == rank) { -      c->push_back(line); -      order->push_back(id); -    } -    ++id; -  } -} - -static const double kMINUS_EPSILON = -1e-6; - -struct TrainingObserver : public DecoderObserver { -  void Reset() { -    acc_exp.clear(); -    total_complete = 0; -  }  - -  virtual void NotifyDecodingStart(const SentenceMetadata& smeta) { -    cur_model_exp.clear(); -    state = 1; -  } - -  // compute model expectations, denominator of objective -  virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { -    assert(state == 1); -    state = 2; -    const prob_t z = InsideOutside<prob_t, -                                   EdgeProb, -                                   SparseVector<prob_t>, -                                   EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp); -    cur_model_exp /= z; -    acc_exp += cur_model_exp; -  } - -  virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) { -    cerr << "IGNORING ALIGNMENT FOREST!\n"; -  } - -  virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) { -    if (state == 2) { -      ++total_complete; -    } -  } - -  void GetExpectations(SparseVector<double>* g) const { -    g->clear(); -    for (SparseVector<prob_t>::const_iterator it = acc_exp.begin(); it != acc_exp.end(); ++it) -      g->set_value(it->first, it->second); -  } - -  int total_complete; -  SparseVector<prob_t> cur_model_exp; -  SparseVector<prob_t> acc_exp; -  int state; -}; - -#ifdef HAVE_MPI -namespace boost { namespace mpi { -  template<> -  struct is_commutative<std::plus<SparseVector<double> >, SparseVector<double> >  -    : mpl::true_ { }; -} } // end namespace boost::mpi -#endif - -int main(int argc, char** argv) { -#ifdef HAVE_MPI -  mpi::environment env(argc, argv); -  mpi::communicator world; -  const int size = world.size();  -  const int rank = world.rank(); -#else -  const int size = 1; -  const int rank = 0; -#endif -  if (size > 1) SetSilent(true);  // turn off verbose decoder output -  register_feature_functions(); - -  po::variables_map conf; -  if (!InitCommandLine(argc, argv, &conf)) -    return 1; - -  // load initial weights -  Weights weights; -  if (conf.count("weights")) -    weights.InitFromFile(conf["weights"].as<string>()); - -  vector<string> corpus; -  vector<int> ids; -  ReadTrainingCorpus(conf["input"].as<string>(), rank, size, &corpus, &ids); -  assert(corpus.size() > 0); - -  vector<string> cdec_ini; -  ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini); -  istringstream ini; -  StoreConfig(cdec_ini, &ini); -  Decoder decoder(&ini); -  if (decoder.GetConf()["input"].as<string>() != "-") { -    cerr << "cdec.ini must not set an input file\n"; -    return 1; -  } - -  SparseVector<double> x; -  weights.InitSparseVector(&x); -  TrainingObserver observer; - -  weights.InitFromVector(x); -  vector<double> lambdas; -  weights.InitVector(&lambdas); -  decoder.SetWeights(lambdas); -  observer.Reset(); -  for (unsigned i = 0; i < corpus.size(); ++i) { -    int id = ids[i]; -    decoder.SetId(id); -    decoder.Decode(corpus[i], &observer); -  } -  SparseVector<double> local_exps, exps; -  observer.GetExpectations(&local_exps); -#ifdef HAVE_MPI -  reduce(world, local_exps, exps, std::plus<SparseVector<double> >(), 0); -#else -  exps.swap(local_exps); -#endif - -  weights.InitFromVector(exps); -  weights.InitVector(&lambdas); -  ShowFeatures(lambdas); - -  return 0; -} diff --git a/training/lbl_model.cc b/training/lbl_model.cc deleted file mode 100644 index a46ce33c..00000000 --- a/training/lbl_model.cc +++ /dev/null @@ -1,421 +0,0 @@ -#include <iostream> - -#include "config.h" -#ifndef HAVE_EIGEN -  int main() { std::cerr << "Please rebuild with --with-eigen PATH\n"; return 1; } -#else - -#include <cstdlib> -#include <algorithm> -#include <cmath> -#include <set> -#include <cstring> // memset -#include <ctime> - -#ifdef HAVE_MPI -#include <boost/mpi/timer.hpp> -#include <boost/mpi.hpp> -#include <boost/archive/text_oarchive.hpp> -namespace mpi = boost::mpi; -#endif -#include <boost/math/special_functions/fpclassify.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> -#include <Eigen/Dense> - -#include "corpus_tools.h" -#include "optimize.h" -#include "array2d.h" -#include "m.h" -#include "lattice.h" -#include "stringlib.h" -#include "filelib.h" -#include "tdict.h" - -namespace po = boost::program_options; -using namespace std; - -#define kDIMENSIONS 10 -typedef Eigen::Matrix<double, kDIMENSIONS, 1> RVector; -typedef Eigen::Matrix<double, 1, kDIMENSIONS> RTVector; -typedef Eigen::Matrix<double, kDIMENSIONS, kDIMENSIONS> TMatrix; -vector<RVector> r_src, r_trg; - -#if HAVE_MPI -namespace boost { -namespace serialization { - -template<class Archive> -void serialize(Archive & ar, RVector & v, const unsigned int version) { -  for (unsigned i = 0; i < kDIMENSIONS; ++i) -    ar & v[i]; -} - -} // namespace serialization -} // namespace boost -#endif - -bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("input,i",po::value<string>(),"Input file") -        ("iterations,I",po::value<unsigned>()->default_value(1000),"Number of iterations of training") -        ("regularization_strength,C",po::value<double>()->default_value(0.1),"L2 regularization strength (0 for no regularization)") -        ("eta", po::value<double>()->default_value(0.1f), "Eta for SGD") -        ("source_embeddings,f", po::value<string>(), "File containing source embeddings (if unset, random vectors will be used)") -        ("target_embeddings,e", po::value<string>(), "File containing target embeddings (if unset, random vectors will be used)") -        ("random_seed,s", po::value<unsigned>(), "Random seed") -        ("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (0 = uniform, >0 sharpens)") -        ("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help,h", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (argc < 2 || conf->count("help")) { -    cerr << "Usage " << argv[0] << " [OPTIONS] -i corpus.fr-en\n"; -    cerr << dcmdline_options << endl; -    return false; -  } -  return true; -} - -void Normalize(RVector* v) { -  double norm = v->norm(); -  assert(norm > 0.0f); -  *v /= norm; -} - -void Flatten(const TMatrix& m, vector<double>* v) { -  unsigned c = 0; -  v->resize(kDIMENSIONS * kDIMENSIONS); -  for (unsigned i = 0; i < kDIMENSIONS; ++i) -    for (unsigned j = 0; j < kDIMENSIONS; ++j) { -      assert(boost::math::isfinite(m(i, j))); -      (*v)[c++] = m(i,j); -    } -} - -void Unflatten(const vector<double>& v, TMatrix* m) { -  unsigned c = 0; -  for (unsigned i = 0; i < kDIMENSIONS; ++i) -    for (unsigned j = 0; j < kDIMENSIONS; ++j) { -      assert(boost::math::isfinite(v[c])); -      (*m)(i, j) = v[c++]; -    } -} - -double ApplyRegularization(const double C, -                           const vector<double>& weights, -                           vector<double>* g) { -  assert(weights.size() == g->size()); -  double reg = 0; -  for (size_t i = 0; i < weights.size(); ++i) { -    const double& w_i = weights[i]; -    double& g_i = (*g)[i]; -    reg += C * w_i * w_i; -    g_i += 2 * C * w_i; -  } -  return reg; -} - -void LoadEmbeddings(const string& filename, vector<RVector>* pv) { -  vector<RVector>& v = *pv; -  cerr << "Reading embeddings from " << filename << " ...\n"; -  ReadFile rf(filename); -  istream& in = *rf.stream(); -  string line; -  unsigned lc = 0; -  while(getline(in, line)) { -    ++lc; -    size_t cur = line.find(' '); -    if (cur == string::npos || cur == 0) { -      cerr << "Parse error reading line " << lc << ":\n" << line << endl; -      abort(); -    } -    WordID w = TD::Convert(line.substr(0, cur)); -    if (w >= v.size()) continue; -    RVector& curv = v[w]; -    line[cur] = 0; -    size_t start = cur + 1; -    cur = start + 1; -    size_t c = 0; -    while(cur < line.size()) { -      if (line[cur] == ' ') { -        line[cur] = 0; -        curv[c++] = strtod(&line[start], NULL); -        start = cur + 1; -        cur = start; -        if (c == kDIMENSIONS) break; -      } -      ++cur; -    } -    if (c < kDIMENSIONS && cur != start) { -      if (cur < line.size()) line[cur] = 0; -      curv[c++] = strtod(&line[start], NULL); -    } -    if (c != kDIMENSIONS) { -      static bool first = true; -      if (first) { -        cerr << " read " << c << " dimensions from embedding file, but built with " << kDIMENSIONS << " (filling in with random values)\n"; -        first = false; -      } -      for (; c < kDIMENSIONS; ++c) curv[c] = rand(); -    } -    if (c == kDIMENSIONS && cur != line.size()) { -      static bool first = true; -      if (first) { -        cerr << " embedding file contains more dimensions than configured with, truncating.\n"; -        first = false; -      } -    } -  } -} - -int main(int argc, char** argv) { -#ifdef HAVE_MPI -  std::cerr << "**MPI enabled.\n"; -  mpi::environment env(argc, argv); -  mpi::communicator world; -  const int size = world.size();  -  const int rank = world.rank(); -#else -  std::cerr << "**MPI disabled.\n"; -  const int rank = 0; -  const int size = 1; -#endif -  po::variables_map conf; -  if (!InitCommandLine(argc, argv, &conf)) return 1; -  const string fname = conf["input"].as<string>(); -  const double reg_strength = conf["regularization_strength"].as<double>(); -  const bool has_l2 = reg_strength; -  assert(reg_strength >= 0.0f); -  const int ITERATIONS = conf["iterations"].as<unsigned>(); -  const double eta = conf["eta"].as<double>(); -  const double diagonal_tension = conf["diagonal_tension"].as<double>(); -  bool SGD = false; -  if (diagonal_tension < 0.0) { -    cerr << "Invalid value for diagonal_tension: must be >= 0\n"; -    return 1; -  } -  string testset; -  if (conf.count("testset")) testset = conf["testset"].as<string>(); - -  unsigned lc = 0; -  vector<double> unnormed_a_i; -  bool flag = false; -  vector<vector<WordID> > srcs, trgs; -  vector<WordID> vocab_e; -  { -    set<WordID> svocab_e, svocab_f; -    CorpusTools::ReadFromFile(fname, &srcs, NULL, &trgs, &svocab_e, rank, size); -    copy(svocab_e.begin(), svocab_e.end(), back_inserter(vocab_e)); -  } -  cerr << "Number of target word types: " << vocab_e.size() << endl; -  const double num_examples = lc; - -  boost::shared_ptr<LBFGSOptimizer> lbfgs; -  if (rank == 0) -    lbfgs.reset(new LBFGSOptimizer(kDIMENSIONS * kDIMENSIONS, 100)); -  r_trg.resize(TD::NumWords() + 1); -  r_src.resize(TD::NumWords() + 1); -  vector<set<unsigned> > trg_pos(TD::NumWords() + 1); - -  if (conf.count("random_seed")) { -    srand(conf["random_seed"].as<unsigned>()); -  } else { -    unsigned seed = time(NULL) + rank * 100; -    cerr << "Random seed: " << seed << endl; -    srand(seed); -  } -   -  TMatrix t = TMatrix::Zero(); -  if (rank == 0) { -    t = TMatrix::Random() / 50.0; -    for (unsigned i = 1; i < r_trg.size(); ++i) { -      r_trg[i] = RVector::Random(); -      r_src[i] = RVector::Random(); -    } -    if (conf.count("source_embeddings")) -      LoadEmbeddings(conf["source_embeddings"].as<string>(), &r_src); -    if (conf.count("target_embeddings")) -      LoadEmbeddings(conf["target_embeddings"].as<string>(), &r_trg); -  } - -  // do optimization -  TMatrix g = TMatrix::Zero(); -  vector<TMatrix> exp_src; -  vector<double> z_src; -  vector<double> flat_g, flat_t, rcv_grad; -  Flatten(t, &flat_t); -  bool converged = false; -#if HAVE_MPI -  mpi::broadcast(world, &flat_t[0], flat_t.size(), 0); -  mpi::broadcast(world, r_trg, 0); -  mpi::broadcast(world, r_src, 0); -#endif -  cerr << "rank=" << rank << ": " << r_trg[0][4] << endl; -  for (int iter = 0; !converged && iter < ITERATIONS; ++iter) { -    if (rank == 0) cerr << "ITERATION " << (iter + 1) << endl; -    Unflatten(flat_t, &t); -    double likelihood = 0; -    double denom = 0.0; -    lc = 0; -    flag = false; -    g *= 0; -    for (unsigned i = 0; i < srcs.size(); ++i) { -      const vector<WordID>& src = srcs[i]; -      const vector<WordID>& trg = trgs[i]; -      ++lc; -      if (rank == 0 && lc % 1000 == 0) { cerr << '.'; flag = true; } -      if (rank == 0 && lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; } -      denom += trg.size(); - -      exp_src.clear(); exp_src.resize(src.size(), TMatrix::Zero()); -      z_src.clear(); z_src.resize(src.size(), 0.0); -      Array2D<TMatrix> exp_refs(src.size(), trg.size(), TMatrix::Zero()); -      Array2D<double> z_refs(src.size(), trg.size(), 0.0); -      for (unsigned j = 0; j < trg.size(); ++j) -        trg_pos[trg[j]].insert(j); - -      for (unsigned i = 0; i < src.size(); ++i) { -        const RVector& r_s = r_src[src[i]]; -        const RTVector pred = r_s.transpose() * t; -        TMatrix& exp_m = exp_src[i]; -        double& z = z_src[i]; -        for (unsigned k = 0; k < vocab_e.size(); ++k) { -          const WordID v_k = vocab_e[k]; -          const RVector& r_t = r_trg[v_k]; -          const double dot_prod = pred * r_t; -          const double u = exp(dot_prod); -          z += u; -          const TMatrix v = r_s * r_t.transpose() * u; -          exp_m += v; -          set<unsigned>& ref_locs = trg_pos[v_k]; -          if (!ref_locs.empty()) { -            for (set<unsigned>::iterator it = ref_locs.begin(); it != ref_locs.end(); ++it) { -              TMatrix& exp_ref_ij = exp_refs(i, *it); -              double& z_ref_ij = z_refs(i, *it); -              z_ref_ij += u; -              exp_ref_ij += v; -            } -          } -        } -      } -      for (unsigned j = 0; j < trg.size(); ++j) -        trg_pos[trg[j]].clear(); - -      // model expectations for a single target generation with -      // uniform alignment prior -      // TODO: when using a non-uniform alignment, m_exp will be -      // a function of j (below) -      double m_z = 0; -      TMatrix m_exp = TMatrix::Zero(); -      for (unsigned i = 0; i < src.size(); ++i) { -        m_exp += exp_src[i]; -        m_z += z_src[i]; -      } -      m_exp /= m_z; - -      Array2D<bool> al(src.size(), trg.size(), false); -      for (unsigned j = 0; j < trg.size(); ++j) { -        double ref_z = 0; -        TMatrix ref_exp = TMatrix::Zero(); -        int max_i = 0; -        double max_s = -9999999; -        for (unsigned i = 0; i < src.size(); ++i) { -          ref_exp += exp_refs(i, j); -          ref_z += z_refs(i, j); -          if (log(z_refs(i, j)) > max_s) { -            max_s = log(z_refs(i, j)); -            max_i = i; -          } -          // TODO handle alignment prob -        } -        if (ref_z <= 0) {  -          cerr << "TRG=" << TD::Convert(trg[j]) << endl; -          cerr << " LINE=" << lc << " (RANK=" << rank << "/" << size << ")" << endl; -          cerr << " REF_EXP=\n" << ref_exp << endl; -          cerr << " M_EXP=\n" << m_exp << endl; -          abort(); -        } -        al(max_i, j) = true; -        ref_exp /= ref_z; -        g += m_exp - ref_exp; -        likelihood += log(ref_z) - log(m_z); -        if (SGD) { -          t -= g * eta / num_examples; -          g *= 0; -        } -      } -       -      if (rank == 0 && (iter == (ITERATIONS - 1) || lc < 12)) { cerr << al << endl; } -    } -    if (flag && rank == 0) { cerr << endl; } - -    double obj = 0; -    if (!SGD) { -      Flatten(g, &flat_g); -      obj = -likelihood; -#if HAVE_MPI -      rcv_grad.resize(flat_g.size(), 0.0); -      mpi::reduce(world, &flat_g[0], flat_g.size(), &rcv_grad[0], plus<double>(), 0); -      swap(flat_g, rcv_grad); -      rcv_grad.clear(); - -      double to = 0; -      mpi::reduce(world, obj, to, plus<double>(), 0); -      obj = to; -      double tlh = 0; -      mpi::reduce(world, likelihood, tlh, plus<double>(), 0); -      likelihood = tlh; -      double td = 0; -      mpi::reduce(world, denom, td, plus<double>(), 0); -      denom = td; -#endif -    } - -    if (rank == 0) { -      double gn = 0; -      for (unsigned i = 0; i < flat_g.size(); ++i) -        gn += flat_g[i]*flat_g[i]; -      const double base2_likelihood = likelihood / log(2); -      cerr << "  log_e likelihood: " << likelihood << endl; -      cerr << "  log_2 likelihood: " << base2_likelihood << endl; -      cerr << "     cross entropy: " << (-base2_likelihood / denom) << endl; -      cerr << "        perplexity: " << pow(2.0, -base2_likelihood / denom) << endl; -      cerr << "     gradient norm: " << sqrt(gn) << endl; -      if (!SGD) { -        if (has_l2) { -          const double r = ApplyRegularization(reg_strength, -                                               flat_t, -                                               &flat_g); -          obj += r; -          cerr << "    regularization: " << r << endl; -        } -        lbfgs->Optimize(obj, flat_g, &flat_t); -        converged = (lbfgs->HasConverged()); -      } -    } -#ifdef HAVE_MPI -    mpi::broadcast(world, &flat_t[0], flat_t.size(), 0); -    mpi::broadcast(world, converged, 0); -#endif -  } -  if (rank == 0) -    cerr << "TRANSLATION MATRIX:" << endl << t << endl; -  return 0; -} - -#endif - diff --git a/training/minrisk/Makefile.am b/training/minrisk/Makefile.am new file mode 100644 index 00000000..a15e821e --- /dev/null +++ b/training/minrisk/Makefile.am @@ -0,0 +1,6 @@ +bin_PROGRAMS = minrisk_optimize + +minrisk_optimize_SOURCES = minrisk_optimize.cc +minrisk_optimize_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/training/liblbfgs/liblbfgs.a -lz + +AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training -I$(top_srcdir)/training/utils diff --git a/minrisk/minrisk.pl b/training/minrisk/minrisk.pl index d05b9595..0f8bacd0 100755 --- a/minrisk/minrisk.pl +++ b/training/minrisk/minrisk.pl @@ -2,7 +2,7 @@  use strict;  my @ORIG_ARGV=@ARGV;  use Cwd qw(getcwd); -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; }  # Skip local config (used for distributing jobs) if we're running in local-only mode  use LocalConfig; @@ -12,27 +12,27 @@ use POSIX ":sys_wait_h";  my $QSUB_CMD = qsub_args(mert_memory());  my $default_jobs = env_default_jobs(); -my $VEST_DIR="$SCRIPT_DIR/../dpmert"; -require "$VEST_DIR/libcall.pl"; +my $UTILS_DIR="$SCRIPT_DIR/../utils"; +require "$UTILS_DIR/libcall.pl";  # Default settings  my $srcFile;  my $refFiles;  my $bin_dir = $SCRIPT_DIR;  die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; -my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score";  die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;  my $MAPINPUT = "$bin_dir/minrisk_generate_input.pl";  my $MAPPER = "$bin_dir/minrisk_optimize"; -my $parallelize = "$VEST_DIR/parallelize.pl"; -my $libcall = "$VEST_DIR/libcall.pl"; -my $sentserver = "$VEST_DIR/sentserver"; -my $sentclient = "$VEST_DIR/sentclient"; -my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; +my $parallelize = "$UTILS_DIR/parallelize.pl"; +my $libcall = "$UTILS_DIR/libcall.pl"; +my $sentserver = "$UTILS_DIR/sentserver"; +my $sentclient = "$UTILS_DIR/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm";  my $SCORER = $FAST_SCORE;  die "Can't find $MAPPER" unless -x $MAPPER; -my $cdec = "$bin_dir/../decoder/cdec"; +my $cdec = "$bin_dir/../../decoder/cdec";  die "Can't find decoder in $cdec" unless -x $cdec;  die "Can't find $parallelize" unless -x $parallelize;  die "Can't find $libcall" unless -e $libcall; diff --git a/minrisk/minrisk_generate_input.pl b/training/minrisk/minrisk_generate_input.pl index b30fc4fd..b30fc4fd 100755 --- a/minrisk/minrisk_generate_input.pl +++ b/training/minrisk/minrisk_generate_input.pl diff --git a/minrisk/minrisk_optimize.cc b/training/minrisk/minrisk_optimize.cc index da8b5260..da8b5260 100644 --- a/minrisk/minrisk_optimize.cc +++ b/training/minrisk/minrisk_optimize.cc diff --git a/mira/Makefile.am b/training/mira/Makefile.am index 3f8f17cd..ae609ede 100644 --- a/mira/Makefile.am +++ b/training/mira/Makefile.am @@ -1,6 +1,6 @@  bin_PROGRAMS = kbest_mira  kbest_mira_SOURCES = kbest_mira.cc -kbest_mira_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz +kbest_mira_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz  AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/mira/kbest_mira.cc b/training/mira/kbest_mira.cc index 8b7993dd..8b7993dd 100644 --- a/mira/kbest_mira.cc +++ b/training/mira/kbest_mira.cc diff --git a/training/mpi_em_optimize.cc b/training/mpi_em_optimize.cc deleted file mode 100644 index 48683b15..00000000 --- a/training/mpi_em_optimize.cc +++ /dev/null @@ -1,389 +0,0 @@ -#include <sstream> -#include <iostream> -#include <vector> -#include <cassert> -#include <cmath> - -#ifdef HAVE_MPI -#include <mpi.h> -#endif - -#include <boost/shared_ptr.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "verbose.h" -#include "hg.h" -#include "prob.h" -#include "inside_outside.h" -#include "ff_register.h" -#include "decoder.h" -#include "filelib.h" -#include "optimize.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" - -using namespace std; -using boost::shared_ptr; -namespace po = boost::program_options; - -void SanityCheck(const vector<double>& w) { -  for (int i = 0; i < w.size(); ++i) { -    assert(!isnan(w[i])); -    assert(!isinf(w[i])); -  } -} - -struct FComp { -  const vector<double>& w_; -  FComp(const vector<double>& w) : w_(w) {} -  bool operator()(int a, int b) const { -    return fabs(w_[a]) > fabs(w_[b]); -  } -}; - -void ShowLargestFeatures(const vector<double>& w) { -  vector<int> fnums(w.size()); -  for (int i = 0; i < w.size(); ++i) -    fnums[i] = i; -  vector<int>::iterator mid = fnums.begin(); -  mid += (w.size() > 10 ? 10 : w.size()); -  partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); -  cerr << "TOP FEATURES:"; -  for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) { -    cerr << ' ' << FD::Convert(*i) << '=' << w[*i]; -  } -  cerr << endl; -} - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("input_weights,w",po::value<string>(),"Input feature weights file") -        ("training_data,t",po::value<string>(),"Training data") -        ("decoder_config,c",po::value<string>(),"Decoder configuration file") -        ("output_weights,o",po::value<string>()->default_value("-"),"Output feature weights file"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help,h", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help") || !(conf->count("training_data")) || !conf->count("decoder_config")) { -    cerr << dcmdline_options << endl; -#ifdef HAVE_MPI -    MPI::Finalize(); -#endif -    exit(1); -  } -} - -void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c) { -  ReadFile rf(fname); -  istream& in = *rf.stream(); -  string line; -  int lc = 0; -  while(in) { -    getline(in, line); -    if (!in) break; -    if (lc % size == rank) c->push_back(line); -    ++lc; -  } -} - -static const double kMINUS_EPSILON = -1e-6; - -struct TrainingObserver : public DecoderObserver { -  void Reset() { -    total_complete = 0; -    cur_obj = 0; -    tot_obj = 0; -    tot.clear(); -  }  - -  void SetLocalGradientAndObjective(SparseVector<double>* g, double* o) const { -    *o = tot_obj; -    *g = tot; -  } - -  virtual void NotifyDecodingStart(const SentenceMetadata& smeta) { -    cur_obj = 0; -    state = 1; -  } - -  void ExtractExpectedCounts(Hypergraph* hg) { -    vector<prob_t> posts; -    cur.clear(); -    const prob_t z = hg->ComputeEdgePosteriors(1.0, &posts); -    cur_obj = log(z); -    for (int i = 0; i < posts.size(); ++i) { -      const SparseVector<double>& efeats = hg->edges_[i].feature_values_; -      const double post = static_cast<double>(posts[i] / z); -      for (SparseVector<double>::const_iterator j = efeats.begin(); j != efeats.end(); ++j) -        cur.add_value(j->first, post); -    } -  } - -  // compute model expectations, denominator of objective -  virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { -    assert(state == 1); -    state = 2; -    ExtractExpectedCounts(hg); -  } - -  // replace translation forest, since we're doing EM training (we don't know which) -  virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) { -    assert(state == 2); -    state = 3; -    ExtractExpectedCounts(hg); -  } - -  virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) { -    ++total_complete; -    tot_obj += cur_obj; -    tot += cur; -  } - -  int total_complete; -  double cur_obj; -  double tot_obj; -  SparseVector<double> cur, tot; -  int state; -}; - -void ReadConfig(const string& ini, vector<string>* out) { -  ReadFile rf(ini); -  istream& in = *rf.stream(); -  while(in) { -    string line; -    getline(in, line); -    if (!in) continue; -    out->push_back(line); -  } -} - -void StoreConfig(const vector<string>& cfg, istringstream* o) { -  ostringstream os; -  for (int i = 0; i < cfg.size(); ++i) { os << cfg[i] << endl; } -  o->str(os.str()); -} - -struct OptimizableMultinomialFamily { -  struct CPD { -    CPD() : z() {} -    double z; -    map<WordID, double> c2counts; -  }; -  map<WordID, CPD> counts; -  double Value(WordID conditioning, WordID generated) const { -    map<WordID, CPD>::const_iterator it = counts.find(conditioning); -    assert(it != counts.end()); -    map<WordID,double>::const_iterator r = it->second.c2counts.find(generated); -    if (r == it->second.c2counts.end()) return 0; -    return r->second; -  } -  void Increment(WordID conditioning, WordID generated, double count) { -    CPD& cc = counts[conditioning]; -    cc.z += count; -    cc.c2counts[generated] += count; -  } -  void Optimize() { -    for (map<WordID, CPD>::iterator i = counts.begin(); i != counts.end(); ++i) { -      CPD& cpd = i->second; -      for (map<WordID, double>::iterator j = cpd.c2counts.begin(); j != cpd.c2counts.end(); ++j) { -        j->second /= cpd.z; -        // cerr << "P(" << TD::Convert(j->first) << " | " << TD::Convert(i->first) << " ) =  " << j->second << endl; -      } -    } -  } -  void Clear() { -    counts.clear(); -  } -}; - -struct CountManager { -  CountManager(size_t num_types) : oms_(num_types) {} -  virtual ~CountManager(); -  virtual void AddCounts(const SparseVector<double>& c) = 0; -  void Optimize(SparseVector<double>* weights) { -    for (int i = 0; i < oms_.size(); ++i) { -      oms_[i].Optimize(); -    } -    GetOptimalValues(weights); -    for (int i = 0; i < oms_.size(); ++i) { -      oms_[i].Clear(); -    } -  } -  virtual void GetOptimalValues(SparseVector<double>* wv) const = 0; -  vector<OptimizableMultinomialFamily> oms_; -}; -CountManager::~CountManager() {} - -struct TaggerCountManager : public CountManager { -  // 0 = transitions, 2 = emissions -  TaggerCountManager() : CountManager(2) {} -  void AddCounts(const SparseVector<double>& c); -  void GetOptimalValues(SparseVector<double>* wv) const { -    for (set<int>::const_iterator it = fids_.begin(); it != fids_.end(); ++it) { -      int ftype; -      WordID cond, gen; -      bool is_optimized = TaggerCountManager::GetFeature(*it, &ftype, &cond, &gen); -      assert(is_optimized); -      wv->set_value(*it, log(oms_[ftype].Value(cond, gen))); -    } -  } -  // Id:0:a=1 Bi:a_b=1 Bi:b_c=1 Bi:c_d=1 Uni:a=1 Uni:b=1 Uni:c=1 Uni:d=1 Id:1:b=1 Bi:BOS_a=1 Id:2:c=1 -  static bool GetFeature(const int fid, int* feature_type, WordID* cond, WordID* gen) { -    const string& feat = FD::Convert(fid); -    if (feat.size() > 5 && feat[0] == 'I' && feat[1] == 'd' && feat[2] == ':') { -      // emission -      const size_t p = feat.rfind(':'); -      assert(p != string::npos); -      *cond = TD::Convert(feat.substr(p+1)); -      *gen = TD::Convert(feat.substr(3, p - 3)); -      *feature_type = 1; -      return true; -    } else if (feat[0] == 'B' && feat.size() > 5 && feat[2] == ':' && feat[1] == 'i') { -      // transition -      const size_t p = feat.rfind('_'); -      assert(p != string::npos); -      *gen = TD::Convert(feat.substr(p+1)); -      *cond = TD::Convert(feat.substr(3, p - 3)); -      *feature_type = 0; -      return true; -    } else if (feat[0] == 'U' && feat.size() > 4 && feat[1] == 'n' && feat[2] == 'i' && feat[3] == ':') { -      // ignore -      return false; -    } else { -      cerr << "Don't know how to deal with feature of type: " << feat << endl; -      abort(); -    } -  } -  set<int> fids_; -}; - -void TaggerCountManager::AddCounts(const SparseVector<double>& c) { -  for (SparseVector<double>::const_iterator it = c.begin(); it != c.end(); ++it) { -    const double& val = it->second; -    int ftype; -    WordID cond, gen; -    if (GetFeature(it->first, &ftype, &cond, &gen)) { -      oms_[ftype].Increment(cond, gen, val); -      fids_.insert(it->first); -    } -  } -} - -int main(int argc, char** argv) { -#ifdef HAVE_MPI -  MPI::Init(argc, argv); -  const int size = MPI::COMM_WORLD.Get_size();  -  const int rank = MPI::COMM_WORLD.Get_rank(); -#else -  const int size = 1; -  const int rank = 0; -#endif -  SetSilent(true);  // turn off verbose decoder output -  register_feature_functions(); - -  po::variables_map conf; -  InitCommandLine(argc, argv, &conf); - -  TaggerCountManager tcm; - -  // load cdec.ini and set up decoder -  vector<string> cdec_ini; -  ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini); -  istringstream ini; -  StoreConfig(cdec_ini, &ini); -  if (rank == 0) cerr << "Loading grammar...\n"; -  Decoder* decoder = new Decoder(&ini); -  if (decoder->GetConf()["input"].as<string>() != "-") { -    cerr << "cdec.ini must not set an input file\n"; -#ifdef HAVE_MPI -    MPI::COMM_WORLD.Abort(1); -#endif -  } -  if (rank == 0) cerr << "Done loading grammar!\n"; -  Weights w; -  if (conf.count("input_weights")) -    w.InitFromFile(conf["input_weights"].as<string>()); - -  double objective = 0; -  bool converged = false; - -  vector<double> lambdas; -  w.InitVector(&lambdas); -  vector<string> corpus; -  ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus); -  assert(corpus.size() > 0); - -  int iteration = 0; -  TrainingObserver observer; -  while (!converged) { -    ++iteration; -    observer.Reset(); -    if (rank == 0) { -      cerr << "Starting decoding... (~" << corpus.size() << " sentences / proc)\n"; -    } -    decoder->SetWeights(lambdas); -    for (int i = 0; i < corpus.size(); ++i) -      decoder->Decode(corpus[i], &observer); - -    SparseVector<double> x; -    observer.SetLocalGradientAndObjective(&x, &objective); -    cerr << "COUNTS = " << x << endl; -    cerr << "   OBJ = " << objective << endl; -    tcm.AddCounts(x); - -#if 0 -#ifdef HAVE_MPI -    MPI::COMM_WORLD.Reduce(const_cast<double*>(&gradient.data()[0]), &rcv_grad[0], num_feats, MPI::DOUBLE, MPI::SUM, 0); -    MPI::COMM_WORLD.Reduce(&objective, &to, 1, MPI::DOUBLE, MPI::SUM, 0); -    swap(gradient, rcv_grad); -    objective = to; -#endif -#endif - -    if (rank == 0) { -      SparseVector<double> wsv; -      tcm.Optimize(&wsv); - -      w.InitFromVector(wsv); -      w.InitVector(&lambdas); - -      ShowLargestFeatures(lambdas); - -      converged = iteration > 100; -      if (converged) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; } - -      string fname = "weights.cur.gz"; -      if (converged) { fname = "weights.final.gz"; } -      ostringstream vv; -      vv << "Objective = " << objective << "  (ITERATION=" << iteration << ")"; -      const string svv = vv.str(); -      w.WriteToFile(fname, true, &svv); -    }  // rank == 0 -    int cint = converged; -#ifdef HAVE_MPI -    MPI::COMM_WORLD.Bcast(const_cast<double*>(&lambdas.data()[0]), num_feats, MPI::DOUBLE, 0); -    MPI::COMM_WORLD.Bcast(&cint, 1, MPI::INT, 0); -    MPI::COMM_WORLD.Barrier(); -#endif -    converged = cint; -  } -#ifdef HAVE_MPI -  MPI::Finalize();  -#endif -  return 0; -} diff --git a/training/mr_em_adapted_reduce.cc b/training/mr_em_adapted_reduce.cc deleted file mode 100644 index f65b5440..00000000 --- a/training/mr_em_adapted_reduce.cc +++ /dev/null @@ -1,173 +0,0 @@ -#include <iostream> -#include <vector> -#include <cassert> -#include <cmath> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "filelib.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" -#include "m.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("optimization_method,m", po::value<string>()->default_value("em"), "Optimization method (em, vb)") -        ("input_format,f",po::value<string>()->default_value("b64"),"Encoding of the input (b64 or text)"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help,h", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help")) { -    cerr << dcmdline_options << endl; -    exit(1); -  } -} - -double NoZero(const double& x) { -  if (x) return x; -  return 1e-35; -} - -void Maximize(const bool use_vb, -              const double& alpha, -              const int total_event_types, -              SparseVector<double>* pc) { -  const SparseVector<double>& counts = *pc; - -  if (use_vb) -    assert(total_event_types >= counts.size()); - -  double tot = 0; -  for (SparseVector<double>::const_iterator it = counts.begin(); -       it != counts.end(); ++it) -    tot += it->second; -//  cerr << " = " << tot << endl; -  assert(tot > 0.0); -  double ltot = log(tot); -  if (use_vb) -    ltot = Md::digamma(tot + total_event_types * alpha); -  for (SparseVector<double>::const_iterator it = counts.begin(); -       it != counts.end(); ++it) { -    if (use_vb) { -      pc->set_value(it->first, NoZero(Md::digamma(it->second + alpha) - ltot)); -    } else { -      pc->set_value(it->first, NoZero(log(it->second) - ltot)); -    } -  } -#if 0 -  if (counts.size() < 50) { -    for (SparseVector<double>::const_iterator it = counts.begin(); -         it != counts.end(); ++it) { -      cerr << " p(" << FD::Convert(it->first) << ")=" << exp(it->second); -    } -    cerr << endl; -  } -#endif -} - -int main(int argc, char** argv) { -  po::variables_map conf; -  InitCommandLine(argc, argv, &conf); - -  const bool use_b64 = conf["input_format"].as<string>() == "b64"; -  const bool use_vb = conf["optimization_method"].as<string>() == "vb"; -  const double alpha = 1e-09; -  if (use_vb) -    cerr << "Using variational Bayes, make sure alphas are set\n"; - -  const string s_obj = "**OBJ**"; -  // E-step -  string cur_key = ""; -  SparseVector<double> acc; -  double logprob = 0; -  while(cin) { -    string line; -    getline(cin, line); -    if (line.empty()) continue; -    int feat; -    double val; -    size_t i = line.find("\t"); -    const string key = line.substr(0, i); -    assert(i != string::npos); -    ++i; -    if (key != cur_key) { -      if  (cur_key.size() > 0) { -        // TODO shouldn't be num_active, should be total number -        // of events -        Maximize(use_vb, alpha, acc.size(), &acc); -        cout << cur_key << '\t'; -        if (use_b64) -          B64::Encode(0.0, acc, &cout); -        else -          cout << acc; -        cout << endl; -        acc.clear(); -      } -      cur_key = key; -    } -    if (use_b64) { -      SparseVector<double> g; -      double obj; -      if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) { -        cerr << "B64 decoder returned error, skipping!\n"; -        continue; -      } -      logprob += obj; -      acc += g; -    } else {       // text encoding - your counts will not be accurate! -      while (i < line.size()) { -        size_t start = i; -        while (line[i] != '=' && i < line.size()) ++i; -        if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; } -        string fname = line.substr(start, i - start); -        if (fname == s_obj) { -          feat = -1; -        } else { -          feat = FD::Convert(line.substr(start, i - start)); -        } -        ++i; -        start = i; -        while (line[i] != ';' && i < line.size()) ++i; -        if (i - start == 0) continue; -        val = atof(line.substr(start, i - start).c_str()); -        ++i; -        if (feat == -1) { -          logprob += val; -        } else { -          acc.add_value(feat, val); -        } -      } -    } -  } -  // TODO shouldn't be num_active, should be total number -  // of events -  Maximize(use_vb, alpha, acc.size(), &acc); -  cout << cur_key << '\t'; -  if (use_b64) -    B64::Encode(0.0, acc, &cout); -  else -    cout << acc; -  cout << endl << flush; - -  cerr << "LOGPROB: " << logprob << endl; - -  return 0; -} diff --git a/training/mr_em_map_adapter.cc b/training/mr_em_map_adapter.cc deleted file mode 100644 index ead4598d..00000000 --- a/training/mr_em_map_adapter.cc +++ /dev/null @@ -1,160 +0,0 @@ -#include <iostream> -#include <fstream> -#include <cassert> -#include <cmath> - -#include <boost/utility.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> -#include "boost/tuple/tuple.hpp" - -#include "fdict.h" -#include "sparse_vector.h" - -using namespace std; -namespace po = boost::program_options; - -// useful for EM models parameterized by a bunch of multinomials -// this converts event counts (returned from cdec as feature expectations) -// into different keys and values (which are lists of all the events, -// conditioned on the key) for summing and normalization by a reducer - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("buffer_size,b", po::value<int>()->default_value(1), "Buffer size (in # of counts) before emitting counts") -        ("format,f",po::value<string>()->default_value("b64"), "Encoding of the input (b64 or text)"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help,h", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help")) { -    cerr << dcmdline_options << endl; -    exit(1); -  } -} - -struct EventMapper { -  int Map(int fid) { -    int& cv = map_[fid]; -    if (!cv) { -      cv = GetConditioningVariable(fid); -    } -    return cv; -  } -  void Clear() { map_.clear(); } - protected: -  virtual int GetConditioningVariable(int fid) const = 0; - private: -  map<int, int> map_; -}; - -struct LexAlignEventMapper : public EventMapper { - protected: -  virtual int GetConditioningVariable(int fid) const { -    const string& str = FD::Convert(fid); -    size_t pos = str.rfind("_"); -    if (pos == string::npos || pos == 0 || pos >= str.size() - 1) { -      cerr << "Bad feature for EM adapter: " << str << endl; -      abort(); -    } -    return FD::Convert(str.substr(0, pos)); -  } -}; - -int main(int argc, char** argv) { -  po::variables_map conf; -  InitCommandLine(argc, argv, &conf); - -  const bool use_b64 = conf["format"].as<string>() == "b64"; -  const int buffer_size = conf["buffer_size"].as<int>(); - -  const string s_obj = "**OBJ**"; -  // 0<TAB>**OBJ**=12.2;Feat1=2.3;Feat2=-0.2; -  // 0<TAB>**OBJ**=1.1;Feat1=1.0; - -  EventMapper* event_mapper = new LexAlignEventMapper; -  map<int, SparseVector<double> > counts; -  size_t total = 0; -  while(cin) { -    string line; -    getline(cin, line); -    if (line.empty()) continue; -    int feat; -    double val; -    size_t i = line.find("\t"); -    assert(i != string::npos); -    ++i; -    SparseVector<double> g; -    double obj = 0; -    if (use_b64) { -      if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) { -        cerr << "B64 decoder returned error, skipping!\n"; -        continue; -      } -    } else {       // text encoding - your counts will not be accurate! -      while (i < line.size()) { -        size_t start = i; -        while (line[i] != '=' && i < line.size()) ++i; -        if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; } -        string fname = line.substr(start, i - start); -        if (fname == s_obj) { -          feat = -1; -        } else { -          feat = FD::Convert(line.substr(start, i - start)); -        } -        ++i; -        start = i; -        while (line[i] != ';' && i < line.size()) ++i; -        if (i - start == 0) continue; -        val = atof(line.substr(start, i - start).c_str()); -        ++i; -        if (feat == -1) { -          obj = val; -        } else { -          g.set_value(feat, val); -        } -      } -    } -    //cerr << "OBJ: " << obj << endl; -    const SparseVector<double>& cg = g; -    for (SparseVector<double>::const_iterator it = cg.begin(); it != cg.end(); ++it) { -      const int cond_var = event_mapper->Map(it->first); -      SparseVector<double>& cond_counts = counts[cond_var]; -      int delta = cond_counts.size(); -      cond_counts.add_value(it->first, it->second); -      delta = cond_counts.size() - delta; -      total += delta; -    } -    if (total > buffer_size) { -      for (map<int, SparseVector<double> >::iterator it = counts.begin(); -           it != counts.end(); ++it) { -        const SparseVector<double>& cc = it->second; -        cout << FD::Convert(it->first) << '\t'; -        if (use_b64) { -          B64::Encode(0.0, cc, &cout); -        } else { -          abort(); -        } -        cout << endl; -      } -      cout << flush; -      total = 0; -      counts.clear(); -    } -  } - -  return 0; -} - diff --git a/training/mr_optimize_reduce.cc b/training/mr_optimize_reduce.cc deleted file mode 100644 index d490192f..00000000 --- a/training/mr_optimize_reduce.cc +++ /dev/null @@ -1,231 +0,0 @@ -#include <sstream> -#include <iostream> -#include <fstream> -#include <vector> -#include <cassert> -#include <cmath> - -#include <boost/shared_ptr.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "optimize.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" - -using namespace std; -namespace po = boost::program_options; - -void SanityCheck(const vector<double>& w) { -  for (int i = 0; i < w.size(); ++i) { -    assert(!std::isnan(w[i])); -    assert(!std::isinf(w[i])); -  } -} - -struct FComp { -  const vector<double>& w_; -  FComp(const vector<double>& w) : w_(w) {} -  bool operator()(int a, int b) const { -    return fabs(w_[a]) > fabs(w_[b]); -  } -}; - -void ShowLargestFeatures(const vector<double>& w) { -  vector<int> fnums(w.size()); -  for (int i = 0; i < w.size(); ++i) -    fnums[i] = i; -  vector<int>::iterator mid = fnums.begin(); -  mid += (w.size() > 10 ? 10 : w.size()); -  partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); -  cerr << "TOP FEATURES:"; -  for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) { -    cerr << ' ' << FD::Convert(*i) << '=' << w[*i]; -  } -  cerr << endl; -} - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("input_weights,i",po::value<string>(),"Input feature weights file") -        ("output_weights,o",po::value<string>()->default_value("-"),"Output feature weights file") -        ("optimization_method,m", po::value<string>()->default_value("lbfgs"), "Optimization method (sgd, lbfgs, rprop)") -        ("state,s",po::value<string>(),"Read (and write if output_state is not set) optimizer state from this state file. In the first iteration, the file should not exist.") -        ("input_format,f",po::value<string>()->default_value("b64"),"Encoding of the input (b64 or text)") -        ("output_state,S", po::value<string>(), "Output state file (optional override)") -	("correction_buffers,M", po::value<int>()->default_value(10), "Number of gradients for LBFGS to maintain in memory") -        ("eta,e", po::value<double>()->default_value(0.1), "Learning rate for SGD (eta)") -        ("gaussian_prior,p","Use a Gaussian prior on the weights") -        ("means,u", po::value<string>(), "File containing the means for Gaussian prior") -        ("sigma_squared", po::value<double>()->default_value(1.0), "Sigma squared term for spherical Gaussian prior"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help,h", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help") || !conf->count("input_weights") || !conf->count("state")) { -    cerr << dcmdline_options << endl; -    exit(1); -  } -} - -int main(int argc, char** argv) { -  po::variables_map conf; -  InitCommandLine(argc, argv, &conf); - -  const bool use_b64 = conf["input_format"].as<string>() == "b64"; - -  vector<weight_t> lambdas; -  Weights::InitFromFile(conf["input_weights"].as<string>(), &lambdas); -  const string s_obj = "**OBJ**"; -  int num_feats = FD::NumFeats(); -  cerr << "Number of features: " << num_feats << endl; -  const bool gaussian_prior = conf.count("gaussian_prior"); -  vector<weight_t> means(num_feats, 0); -  if (conf.count("means")) { -    if (!gaussian_prior) { -      cerr << "Don't use --means without --gaussian_prior!\n"; -      exit(1); -    } -    Weights::InitFromFile(conf["means"].as<string>(), &means); -  } -  boost::shared_ptr<BatchOptimizer> o; -  const string omethod = conf["optimization_method"].as<string>(); -  if (omethod == "rprop") -    o.reset(new RPropOptimizer(num_feats));  // TODO add configuration -  else -    o.reset(new LBFGSOptimizer(num_feats, conf["correction_buffers"].as<int>())); -  cerr << "Optimizer: " << o->Name() << endl; -  string state_file = conf["state"].as<string>(); -  { -    ifstream in(state_file.c_str(), ios::binary); -    if (in) -      o->Load(&in); -    else -      cerr << "No state file found, assuming ITERATION 1\n"; -  } - -  double objective = 0; -  vector<double> gradient(num_feats, 0); -  // 0<TAB>**OBJ**=12.2;Feat1=2.3;Feat2=-0.2; -  // 0<TAB>**OBJ**=1.1;Feat1=1.0; -  int total_lines = 0;  // TODO - this should be a count of the -                        // training instances!! -  while(cin) { -    string line; -    getline(cin, line); -    if (line.empty()) continue; -    ++total_lines; -    int feat; -    double val; -    size_t i = line.find("\t"); -    assert(i != string::npos); -    ++i; -    if (use_b64) { -      SparseVector<double> g; -      double obj; -      if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) { -        cerr << "B64 decoder returned error, skipping gradient!\n"; -	cerr << "  START: " << line.substr(0,line.size() > 200 ? 200 : line.size()) << endl; -	if (line.size() > 200) -	  cerr << "    END: " << line.substr(line.size() - 200, 200) << endl; -        cout << "-1\tRESTART\n"; -        exit(99); -      } -      objective += obj; -      const SparseVector<double>& cg = g; -      for (SparseVector<double>::const_iterator it = cg.begin(); it != cg.end(); ++it) { -        if (it->first >= num_feats) { -	  cerr << "Unexpected feature in gradient: " << FD::Convert(it->first) << endl; -	  abort(); -        } -        gradient[it->first] -= it->second; -      } -    } else {       // text encoding - your gradients will not be accurate! -      while (i < line.size()) { -        size_t start = i; -        while (line[i] != '=' && i < line.size()) ++i; -        if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; } -        string fname = line.substr(start, i - start); -        if (fname == s_obj) { -          feat = -1; -        } else { -          feat = FD::Convert(line.substr(start, i - start)); -          if (feat >= num_feats) { -	    cerr << "Unexpected feature in gradient: " << line.substr(start, i - start) << endl; -	    abort(); -	  } -        } -        ++i; -        start = i; -        while (line[i] != ';' && i < line.size()) ++i; -        if (i - start == 0) continue; -        val = atof(line.substr(start, i - start).c_str()); -        ++i; -        if (feat == -1) { -          objective += val; -        } else { -          gradient[feat] -= val; -        } -      } -    } -  } - -  if (gaussian_prior) { -    const double sigsq = conf["sigma_squared"].as<double>(); -    double norm = 0; -    for (int k = 1; k < lambdas.size(); ++k) { -      const double& lambda_k = lambdas[k]; -      if (lambda_k) { -        const double param = (lambda_k - means[k]); -        norm += param * param; -        gradient[k] += param / sigsq; -      } -    } -    const double reg = norm / (2.0 * sigsq); -    cerr << "REGULARIZATION TERM: " << reg << endl; -    objective += reg; -  } -  cerr << "EVALUATION #" << o->EvaluationCount() << " OBJECTIVE: " << objective << endl; -  double gnorm = 0; -  for (int i = 0; i < gradient.size(); ++i) -    gnorm += gradient[i] * gradient[i]; -  cerr << "  GNORM=" << sqrt(gnorm) << endl; -  vector<double> old = lambdas; -  int c = 0; -  while (old == lambdas) { -    ++c; -    if (c > 1) { cerr << "Same lambdas, repeating optimization\n"; } -    o->Optimize(objective, gradient, &lambdas); -    assert(c < 5); -  } -  old.clear(); -  SanityCheck(lambdas); -  ShowLargestFeatures(lambdas); -  Weights::WriteToFile(conf["output_weights"].as<string>(), lambdas, false); - -  const bool conv = o->HasConverged(); -  if (conv) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; } -   -  if (conf.count("output_state")) -    state_file = conf["output_state"].as<string>(); -  ofstream out(state_file.c_str(), ios::binary); -  cerr << "Writing state to: " << state_file << endl; -  o->Save(&out); -  out.close(); - -  cout << o->EvaluationCount() << "\t" << conv << endl; -  return 0; -} diff --git a/training/mr_reduce_to_weights.cc b/training/mr_reduce_to_weights.cc deleted file mode 100644 index 16b47720..00000000 --- a/training/mr_reduce_to_weights.cc +++ /dev/null @@ -1,109 +0,0 @@ -#include <iostream> -#include <fstream> -#include <vector> -#include <cassert> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "filelib.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { -  po::options_description opts("Configuration options"); -  opts.add_options() -        ("input_format,f",po::value<string>()->default_value("b64"),"Encoding of the input (b64 or text)") -        ("input,i",po::value<string>()->default_value("-"),"Read file from") -        ("output,o",po::value<string>()->default_value("-"),"Write weights to"); -  po::options_description clo("Command line options"); -  clo.add_options() -        ("config", po::value<string>(), "Configuration file") -        ("help,h", "Print this help message and exit"); -  po::options_description dconfig_options, dcmdline_options; -  dconfig_options.add(opts); -  dcmdline_options.add(opts).add(clo); -   -  po::store(parse_command_line(argc, argv, dcmdline_options), *conf); -  if (conf->count("config")) { -    ifstream config((*conf)["config"].as<string>().c_str()); -    po::store(po::parse_config_file(config, dconfig_options), *conf); -  } -  po::notify(*conf); - -  if (conf->count("help")) { -    cerr << dcmdline_options << endl; -    exit(1); -  } -} - -void WriteWeights(const SparseVector<double>& weights, ostream* out) { -  for (SparseVector<double>::const_iterator it = weights.begin(); -       it != weights.end(); ++it) { -    (*out) << FD::Convert(it->first) << " " << it->second << endl; -  } -} - -int main(int argc, char** argv) { -  po::variables_map conf; -  InitCommandLine(argc, argv, &conf); - -  const bool use_b64 = conf["input_format"].as<string>() == "b64"; - -  const string s_obj = "**OBJ**"; -  // E-step -  ReadFile rf(conf["input"].as<string>()); -  istream* in = rf.stream(); -  assert(*in); -  WriteFile wf(conf["output"].as<string>()); -  ostream* out = wf.stream(); -  out->precision(17); -  while(*in) { -    string line; -    getline(*in, line); -    if (line.empty()) continue; -    int feat; -    double val; -    size_t i = line.find("\t"); -    assert(i != string::npos); -    ++i; -    if (use_b64) { -      SparseVector<double> g; -      double obj; -      if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) { -        cerr << "B64 decoder returned error, skipping!\n"; -        continue; -      } -      WriteWeights(g, out); -    } else {       // text encoding - your counts will not be accurate! -      SparseVector<double> weights; -      while (i < line.size()) { -        size_t start = i; -        while (line[i] != '=' && i < line.size()) ++i; -        if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; } -        string fname = line.substr(start, i - start); -        if (fname == s_obj) { -          feat = -1; -        } else { -          feat = FD::Convert(line.substr(start, i - start)); -        } -        ++i; -        start = i; -        while (line[i] != ';' && i < line.size()) ++i; -        if (i - start == 0) continue; -        val = atof(line.substr(start, i - start).c_str()); -        ++i; -        if (feat != -1) { -          weights.set_value(feat, val); -        } -      } -      WriteWeights(weights, out); -    } -  } - -  return 0; -} diff --git a/pro/Makefile.am b/training/pro/Makefile.am index 1e9d46b0..1916b6b2 100644 --- a/pro/Makefile.am +++ b/training/pro/Makefile.am @@ -3,9 +3,9 @@ bin_PROGRAMS = \    mr_pro_reduce  mr_pro_map_SOURCES = mr_pro_map.cc -mr_pro_map_LDADD = $(top_srcdir)/training/libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz +mr_pro_map_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz  mr_pro_reduce_SOURCES = mr_pro_reduce.cc  mr_pro_reduce_LDADD = $(top_srcdir)/training/liblbfgs/liblbfgs.a $(top_srcdir)/utils/libutils.a -lz -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training/utils -I$(top_srcdir)/training diff --git a/pro/mr_pro_generate_mapper_input.pl b/training/pro/mr_pro_generate_mapper_input.pl index b30fc4fd..b30fc4fd 100755 --- a/pro/mr_pro_generate_mapper_input.pl +++ b/training/pro/mr_pro_generate_mapper_input.pl diff --git a/pro/mr_pro_map.cc b/training/pro/mr_pro_map.cc index eef40b8a..eef40b8a 100644 --- a/pro/mr_pro_map.cc +++ b/training/pro/mr_pro_map.cc diff --git a/pro/mr_pro_reduce.cc b/training/pro/mr_pro_reduce.cc index 5ef9b470..5ef9b470 100644 --- a/pro/mr_pro_reduce.cc +++ b/training/pro/mr_pro_reduce.cc diff --git a/pro/pro.pl b/training/pro/pro.pl index 891b7e4c..3b30c379 100755 --- a/pro/pro.pl +++ b/training/pro/pro.pl @@ -3,7 +3,7 @@ use strict;  use File::Basename qw(basename);  my @ORIG_ARGV=@ARGV;  use Cwd qw(getcwd); -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; }  # Skip local config (used for distributing jobs) if we're running in local-only mode  use LocalConfig; @@ -13,28 +13,28 @@ use POSIX ":sys_wait_h";  my $QSUB_CMD = qsub_args(mert_memory());  my $default_jobs = env_default_jobs(); -my $VEST_DIR="$SCRIPT_DIR/../dpmert"; -require "$VEST_DIR/libcall.pl"; +my $UTILS_DIR="$SCRIPT_DIR/../utils"; +require "$UTILS_DIR/libcall.pl";  # Default settings  my $srcFile;  my $refFiles;  my $bin_dir = $SCRIPT_DIR;  die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; -my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score";  die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;  my $MAPINPUT = "$bin_dir/mr_pro_generate_mapper_input.pl";  my $MAPPER = "$bin_dir/mr_pro_map";  my $REDUCER = "$bin_dir/mr_pro_reduce"; -my $parallelize = "$VEST_DIR/parallelize.pl"; -my $libcall = "$VEST_DIR/libcall.pl"; -my $sentserver = "$VEST_DIR/sentserver"; -my $sentclient = "$VEST_DIR/sentclient"; -my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; +my $parallelize = "$UTILS_DIR/parallelize.pl"; +my $libcall = "$UTILS_DIR/libcall.pl"; +my $sentserver = "$UTILS_DIR/sentserver"; +my $sentclient = "$UTILS_DIR/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm";  my $SCORER = $FAST_SCORE;  die "Can't find $MAPPER" unless -x $MAPPER; -my $cdec = "$bin_dir/../decoder/cdec"; +my $cdec = "$bin_dir/../../decoder/cdec";  die "Can't find decoder in $cdec" unless -x $cdec;  die "Can't find $parallelize" unless -x $parallelize;  die "Can't find $libcall" unless -e $libcall; diff --git a/rampion/Makefile.am b/training/rampion/Makefile.am index f4dbb7cc..1633d0f7 100644 --- a/rampion/Makefile.am +++ b/training/rampion/Makefile.am @@ -1,6 +1,6 @@  bin_PROGRAMS = rampion_cccp  rampion_cccp_SOURCES = rampion_cccp.cc -rampion_cccp_LDADD = $(top_srcdir)/training/libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz +rampion_cccp_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz -AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training +AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training/utils diff --git a/rampion/rampion.pl b/training/rampion/rampion.pl index 55f7b3f1..ae084db6 100755 --- a/rampion/rampion.pl +++ b/training/rampion/rampion.pl @@ -2,7 +2,7 @@  use strict;  my @ORIG_ARGV=@ARGV;  use Cwd qw(getcwd); -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; }  # Skip local config (used for distributing jobs) if we're running in local-only mode  use LocalConfig; @@ -12,27 +12,27 @@ use POSIX ":sys_wait_h";  my $QSUB_CMD = qsub_args(mert_memory());  my $default_jobs = env_default_jobs(); -my $VEST_DIR="$SCRIPT_DIR/../dpmert"; -require "$VEST_DIR/libcall.pl"; +my $UTILS_DIR="$SCRIPT_DIR/../utils"; +require "$UTILS_DIR/libcall.pl";  # Default settings  my $srcFile;  my $refFiles;  my $bin_dir = $SCRIPT_DIR;  die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; -my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score";  die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;  my $MAPINPUT = "$bin_dir/rampion_generate_input.pl";  my $MAPPER = "$bin_dir/rampion_cccp"; -my $parallelize = "$VEST_DIR/parallelize.pl"; -my $libcall = "$VEST_DIR/libcall.pl"; -my $sentserver = "$VEST_DIR/sentserver"; -my $sentclient = "$VEST_DIR/sentclient"; -my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; +my $parallelize = "$UTILS_DIR/parallelize.pl"; +my $libcall = "$UTILS_DIR/libcall.pl"; +my $sentserver = "$UTILS_DIR/sentserver"; +my $sentclient = "$UTILS_DIR/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm";  my $SCORER = $FAST_SCORE;  die "Can't find $MAPPER" unless -x $MAPPER; -my $cdec = "$bin_dir/../decoder/cdec"; +my $cdec = "$bin_dir/../../decoder/cdec";  die "Can't find decoder in $cdec" unless -x $cdec;  die "Can't find $parallelize" unless -x $parallelize;  die "Can't find $libcall" unless -e $libcall; diff --git a/rampion/rampion_cccp.cc b/training/rampion/rampion_cccp.cc index 1e36dc51..1e36dc51 100644 --- a/rampion/rampion_cccp.cc +++ b/training/rampion/rampion_cccp.cc diff --git a/rampion/rampion_generate_input.pl b/training/rampion/rampion_generate_input.pl index b30fc4fd..b30fc4fd 100755 --- a/rampion/rampion_generate_input.pl +++ b/training/rampion/rampion_generate_input.pl diff --git a/training/candidate_set.cc b/training/utils/candidate_set.cc index 087efec3..087efec3 100644 --- a/training/candidate_set.cc +++ b/training/utils/candidate_set.cc diff --git a/training/candidate_set.h b/training/utils/candidate_set.h index 9d326ed0..9d326ed0 100644 --- a/training/candidate_set.h +++ b/training/utils/candidate_set.h diff --git a/dpmert/decode-and-evaluate.pl b/training/utils/decode-and-evaluate.pl index fe765d00..1a332c08 100755 --- a/dpmert/decode-and-evaluate.pl +++ b/training/utils/decode-and-evaluate.pl @@ -2,7 +2,7 @@  use strict;  my @ORIG_ARGV=@ARGV;  use Cwd qw(getcwd); -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; }  # Skip local config (used for distributing jobs) if we're running in local-only mode  use LocalConfig; @@ -16,16 +16,16 @@ require "libcall.pl";  my $default_jobs = env_default_jobs();  my $bin_dir = $SCRIPT_DIR;  die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; -my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score";  die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE;  my $parallelize = "$bin_dir/parallelize.pl";  my $libcall = "$bin_dir/libcall.pl";  my $sentserver = "$bin_dir/sentserver";  my $sentclient = "$bin_dir/sentclient"; -my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm";  my $SCORER = $FAST_SCORE; -my $cdec = "$bin_dir/../decoder/cdec"; +my $cdec = "$bin_dir/../../decoder/cdec";  die "Can't find decoder in $cdec" unless -x $cdec;  die "Can't find $parallelize" unless -x $parallelize;  die "Can't find $libcall" unless -e $libcall; diff --git a/training/entropy.cc b/training/utils/entropy.cc index 4fdbe2be..4fdbe2be 100644 --- a/training/entropy.cc +++ b/training/utils/entropy.cc diff --git a/training/entropy.h b/training/utils/entropy.h index 796589ca..796589ca 100644 --- a/training/entropy.h +++ b/training/utils/entropy.h diff --git a/training/grammar_convert.cc b/training/utils/grammar_convert.cc index 607a7cb9..607a7cb9 100644 --- a/training/grammar_convert.cc +++ b/training/utils/grammar_convert.cc diff --git a/training/lbfgs.h b/training/utils/lbfgs.h index e8baecab..e8baecab 100644 --- a/training/lbfgs.h +++ b/training/utils/lbfgs.h diff --git a/training/lbfgs_test.cc b/training/utils/lbfgs_test.cc index 9678e788..9678e788 100644 --- a/training/lbfgs_test.cc +++ b/training/utils/lbfgs_test.cc diff --git a/dpmert/libcall.pl b/training/utils/libcall.pl index c7d0f128..c7d0f128 100644 --- a/dpmert/libcall.pl +++ b/training/utils/libcall.pl diff --git a/training/online_optimizer.cc b/training/utils/online_optimizer.cc index 3ed95452..3ed95452 100644 --- a/training/online_optimizer.cc +++ b/training/utils/online_optimizer.cc diff --git a/training/online_optimizer.h b/training/utils/online_optimizer.h index 28d89344..28d89344 100644 --- a/training/online_optimizer.h +++ b/training/utils/online_optimizer.h diff --git a/training/optimize.cc b/training/utils/optimize.cc index 41ac90d8..41ac90d8 100644 --- a/training/optimize.cc +++ b/training/utils/optimize.cc diff --git a/training/optimize.h b/training/utils/optimize.h index 07943b44..07943b44 100644 --- a/training/optimize.h +++ b/training/utils/optimize.h diff --git a/training/optimize_test.cc b/training/utils/optimize_test.cc index bff2ca03..bff2ca03 100644 --- a/training/optimize_test.cc +++ b/training/utils/optimize_test.cc diff --git a/dpmert/parallelize.pl b/training/utils/parallelize.pl index d2ebaeea..4197e0e5 100755 --- a/dpmert/parallelize.pl +++ b/training/utils/parallelize.pl @@ -18,7 +18,7 @@  #ANNOYANCE: if input is shorter than -j n lines, or at the very last few lines, repeatedly sleeps.  time cut down to 15s from 60s -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; }  use LocalConfig;  use Cwd qw/ abs_path cwd getcwd /;  diff --git a/training/risk.cc b/training/utils/risk.cc index d5a12cfd..d5a12cfd 100644 --- a/training/risk.cc +++ b/training/utils/risk.cc diff --git a/training/risk.h b/training/utils/risk.h index 2e8db0fb..2e8db0fb 100644 --- a/training/risk.h +++ b/training/utils/risk.h diff --git a/dpmert/sentclient.c b/training/utils/sentclient.c index 91d994ab..91d994ab 100644 --- a/dpmert/sentclient.c +++ b/training/utils/sentclient.c diff --git a/dpmert/sentserver.c b/training/utils/sentserver.c index c20b4fa6..c20b4fa6 100644 --- a/dpmert/sentserver.c +++ b/training/utils/sentserver.c diff --git a/dpmert/sentserver.h b/training/utils/sentserver.h index cd17a546..cd17a546 100644 --- a/dpmert/sentserver.h +++ b/training/utils/sentserver.h diff --git a/word-aligner/Makefile.am b/word-aligner/Makefile.am new file mode 100644 index 00000000..280d3ae7 --- /dev/null +++ b/word-aligner/Makefile.am @@ -0,0 +1,6 @@ +bin_PROGRAMS = fast_align + +fast_align_SOURCES = fast_align.cc ttables.cc +fast_align_LDADD = $(top_srcdir)/utils/libutils.a -lz + +AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/training diff --git a/training/fast_align.cc b/word-aligner/fast_align.cc index 7492d26f..7492d26f 100644 --- a/training/fast_align.cc +++ b/word-aligner/fast_align.cc diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars index 08ff33e1..ce3e1638 100644 --- a/word-aligner/makefiles/makefile.grammars +++ b/word-aligner/makefiles/makefile.grammars @@ -16,7 +16,7 @@ STEM_E = $(SCRIPT_DIR)/stemmers/$(E_LANG).pl  CLASSIFY = $(SUPPORT_DIR)/classify.pl  MAKE_LEX_GRAMMAR = $(SUPPORT_DIR)/make_lex_grammar.pl -MODEL1 = $(TRAINING_DIR)/fast_align +MODEL1 = $(SCRIPT_DIR)/fast_align  MERGE_CORPUS = $(SUPPORT_DIR)/merge_corpus.pl  e.voc: corpus.e diff --git a/word-aligner/paste-parallel-files.pl b/word-aligner/paste-parallel-files.pl deleted file mode 100755 index ce53b325..00000000 --- a/word-aligner/paste-parallel-files.pl +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my @fs = (); -for my $file (@ARGV) { -  my $fh; -  open $fh, "<$file" or die "Can't open $file for reading: $!"; -  push @fs, $fh; -} -my $num = scalar @fs; -die "Usage: $0 file1.txt file2.txt [...]\n" unless $num > 1; - -my $first = $fs[0]; -while(<$first>) { -  chomp; -  my @out = (); -  push @out, $_; -  for (my $i=1; $i < $num; $i++) { -    my $f = $fs[$i]; -    my $line = <$f>; -    die "Mismatched number of lines!" unless defined $line; -    chomp $line; -    push @out, $line; -  } -  print join(' ||| ', @out) . "\n"; -} - -for my $fh (@fs) { -  my $x=<$fh>; -  die "Mismatched number of lines!" if defined $x; -  close $fh; -} - -exit 0; - diff --git a/training/ttables.cc b/word-aligner/ttables.cc index 45bf14c5..45bf14c5 100644 --- a/training/ttables.cc +++ b/word-aligner/ttables.cc diff --git a/training/ttables.h b/word-aligner/ttables.h index 9baa13ca..9baa13ca 100644 --- a/training/ttables.h +++ b/word-aligner/ttables.h | 
