diff options
author | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2012-11-18 13:35:42 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2012-11-18 13:35:42 -0500 |
commit | 8aa29810bb77611cc20b7a384897ff6703783ea1 (patch) | |
tree | 8635daa8fffb3f2cd90e30b41e27f4f9e0909447 | |
parent | fbdacabc85bea65d735f2cb7f92b98e08ce72d04 (diff) |
major restructure of the training code
-rw-r--r-- | .gitignore | 28 | ||||
-rw-r--r-- | Makefile.am | 7 | ||||
-rw-r--r-- | configure.ac | 32 | ||||
-rw-r--r-- | dpmert/README.shared-mem | 9 | ||||
-rw-r--r-- | minrisk/Makefile.am | 6 | ||||
-rw-r--r-- | pro/README.shared-mem | 9 | ||||
-rw-r--r-- | training/Makefile.am | 100 | ||||
-rwxr-xr-x | training/add-model1-features-to-scfg.pl | 93 | ||||
-rw-r--r-- | training/collapse_weights.cc | 110 | ||||
-rw-r--r-- | training/crf/Makefile.am | 27 | ||||
-rw-r--r-- | training/crf/cllh_observer.cc (renamed from training/cllh_observer.cc) | 0 | ||||
-rw-r--r-- | training/crf/cllh_observer.h (renamed from training/cllh_observer.h) | 0 | ||||
-rw-r--r-- | training/crf/mpi_batch_optimize.cc (renamed from training/mpi_batch_optimize.cc) | 0 | ||||
-rw-r--r-- | training/crf/mpi_compute_cllh.cc (renamed from training/mpi_compute_cllh.cc) | 0 | ||||
-rw-r--r-- | training/crf/mpi_extract_features.cc (renamed from training/mpi_extract_features.cc) | 0 | ||||
-rw-r--r-- | training/crf/mpi_extract_reachable.cc (renamed from training/mpi_extract_reachable.cc) | 0 | ||||
-rw-r--r-- | training/crf/mpi_flex_optimize.cc (renamed from training/mpi_flex_optimize.cc) | 0 | ||||
-rw-r--r-- | training/crf/mpi_online_optimize.cc (renamed from training/mpi_online_optimize.cc) | 0 | ||||
-rwxr-xr-x | training/dep-reorder/conll2reordering-forest.pl | 65 | ||||
-rw-r--r-- | training/dep-reorder/george.conll | 4 | ||||
-rwxr-xr-x | training/dep-reorder/scripts/conll2simplecfg.pl | 57 | ||||
-rw-r--r-- | training/dpmert/Makefile.am (renamed from dpmert/Makefile.am) | 10 | ||||
-rw-r--r-- | training/dpmert/ces.cc (renamed from dpmert/ces.cc) | 0 | ||||
-rw-r--r-- | training/dpmert/ces.h (renamed from dpmert/ces.h) | 0 | ||||
-rwxr-xr-x | training/dpmert/divide_refs.py (renamed from dpmert/divide_refs.py) | 0 | ||||
-rwxr-xr-x | training/dpmert/dpmert.pl (renamed from dpmert/dpmert.pl) | 17 | ||||
-rw-r--r-- | training/dpmert/error_surface.cc (renamed from dpmert/error_surface.cc) | 0 | ||||
-rw-r--r-- | training/dpmert/error_surface.h (renamed from dpmert/error_surface.h) | 0 | ||||
-rwxr-xr-x | training/dpmert/line_mediator.pl (renamed from dpmert/line_mediator.pl) | 0 | ||||
-rw-r--r-- | training/dpmert/line_optimizer.cc (renamed from dpmert/line_optimizer.cc) | 0 | ||||
-rw-r--r-- | training/dpmert/line_optimizer.h (renamed from dpmert/line_optimizer.h) | 0 | ||||
-rw-r--r-- | training/dpmert/lo_test.cc (renamed from dpmert/lo_test.cc) | 0 | ||||
-rw-r--r-- | training/dpmert/mert_geometry.cc (renamed from dpmert/mert_geometry.cc) | 0 | ||||
-rw-r--r-- | training/dpmert/mert_geometry.h (renamed from dpmert/mert_geometry.h) | 0 | ||||
-rw-r--r-- | training/dpmert/mr_dpmert_generate_mapper_input.cc (renamed from dpmert/mr_dpmert_generate_mapper_input.cc) | 0 | ||||
-rw-r--r-- | training/dpmert/mr_dpmert_map.cc (renamed from dpmert/mr_dpmert_map.cc) | 0 | ||||
-rw-r--r-- | training/dpmert/mr_dpmert_reduce.cc (renamed from dpmert/mr_dpmert_reduce.cc) | 0 | ||||
-rw-r--r-- | training/dpmert/test_aer/README (renamed from dpmert/test_aer/README) | 0 | ||||
-rw-r--r-- | training/dpmert/test_aer/cdec.ini (renamed from dpmert/test_aer/cdec.ini) | 0 | ||||
-rw-r--r-- | training/dpmert/test_aer/corpus.src (renamed from dpmert/test_aer/corpus.src) | 0 | ||||
-rw-r--r-- | training/dpmert/test_aer/grammar (renamed from dpmert/test_aer/grammar) | 0 | ||||
-rw-r--r-- | training/dpmert/test_aer/ref.0 (renamed from dpmert/test_aer/ref.0) | 0 | ||||
-rw-r--r-- | training/dpmert/test_aer/weights (renamed from dpmert/test_aer/weights) | 0 | ||||
-rw-r--r-- | training/dpmert/test_data/0.json.gz (renamed from dpmert/test_data/0.json.gz) | bin | 13709 -> 13709 bytes | |||
-rw-r--r-- | training/dpmert/test_data/1.json.gz (renamed from dpmert/test_data/1.json.gz) | bin | 204803 -> 204803 bytes | |||
-rw-r--r-- | training/dpmert/test_data/c2e.txt.0 (renamed from dpmert/test_data/c2e.txt.0) | 0 | ||||
-rw-r--r-- | training/dpmert/test_data/c2e.txt.1 (renamed from dpmert/test_data/c2e.txt.1) | 0 | ||||
-rw-r--r-- | training/dpmert/test_data/c2e.txt.2 (renamed from dpmert/test_data/c2e.txt.2) | 0 | ||||
-rw-r--r-- | training/dpmert/test_data/c2e.txt.3 (renamed from dpmert/test_data/c2e.txt.3) | 0 | ||||
-rw-r--r-- | training/dpmert/test_data/re.txt.0 (renamed from dpmert/test_data/re.txt.0) | 0 | ||||
-rw-r--r-- | training/dpmert/test_data/re.txt.1 (renamed from dpmert/test_data/re.txt.1) | 0 | ||||
-rw-r--r-- | training/dpmert/test_data/re.txt.2 (renamed from dpmert/test_data/re.txt.2) | 0 | ||||
-rw-r--r-- | training/dpmert/test_data/re.txt.3 (renamed from dpmert/test_data/re.txt.3) | 0 | ||||
-rw-r--r-- | training/dtrain/Makefile.am (renamed from dtrain/Makefile.am) | 2 | ||||
-rw-r--r-- | training/dtrain/README.md (renamed from dtrain/README.md) | 0 | ||||
-rw-r--r-- | training/dtrain/dtrain.cc (renamed from dtrain/dtrain.cc) | 0 | ||||
-rw-r--r-- | training/dtrain/dtrain.h (renamed from dtrain/dtrain.h) | 0 | ||||
-rwxr-xr-x | training/dtrain/hstreaming/avg.rb (renamed from dtrain/hstreaming/avg.rb) | 0 | ||||
-rw-r--r-- | training/dtrain/hstreaming/cdec.ini (renamed from dtrain/hstreaming/cdec.ini) | 0 | ||||
-rw-r--r-- | training/dtrain/hstreaming/dtrain.ini (renamed from dtrain/hstreaming/dtrain.ini) | 0 | ||||
-rwxr-xr-x | training/dtrain/hstreaming/dtrain.sh (renamed from dtrain/hstreaming/dtrain.sh) | 0 | ||||
-rwxr-xr-x | training/dtrain/hstreaming/hadoop-streaming-job.sh (renamed from dtrain/hstreaming/hadoop-streaming-job.sh) | 0 | ||||
-rwxr-xr-x | training/dtrain/hstreaming/lplp.rb (renamed from dtrain/hstreaming/lplp.rb) | 0 | ||||
-rw-r--r-- | training/dtrain/hstreaming/red-test (renamed from dtrain/hstreaming/red-test) | 0 | ||||
-rw-r--r-- | training/dtrain/kbestget.h (renamed from dtrain/kbestget.h) | 0 | ||||
-rw-r--r-- | training/dtrain/ksampler.h (renamed from dtrain/ksampler.h) | 0 | ||||
-rw-r--r-- | training/dtrain/pairsampling.h (renamed from dtrain/pairsampling.h) | 0 | ||||
-rwxr-xr-x | training/dtrain/parallelize.rb (renamed from dtrain/parallelize.rb) | 0 | ||||
-rw-r--r-- | training/dtrain/parallelize/test/cdec.ini (renamed from dtrain/parallelize/test/cdec.ini) | 0 | ||||
-rw-r--r-- | training/dtrain/parallelize/test/dtrain.ini (renamed from dtrain/parallelize/test/dtrain.ini) | 0 | ||||
-rw-r--r-- | training/dtrain/parallelize/test/in (renamed from dtrain/parallelize/test/in) | 0 | ||||
-rw-r--r-- | training/dtrain/parallelize/test/refs (renamed from dtrain/parallelize/test/refs) | 0 | ||||
-rw-r--r-- | training/dtrain/score.cc (renamed from dtrain/score.cc) | 0 | ||||
-rw-r--r-- | training/dtrain/score.h (renamed from dtrain/score.h) | 0 | ||||
-rw-r--r-- | training/dtrain/test/example/README (renamed from dtrain/test/example/README) | 0 | ||||
-rw-r--r-- | training/dtrain/test/example/cdec.ini (renamed from dtrain/test/example/cdec.ini) | 0 | ||||
-rw-r--r-- | training/dtrain/test/example/dtrain.ini (renamed from dtrain/test/example/dtrain.ini) | 0 | ||||
-rw-r--r-- | training/dtrain/test/example/expected-output (renamed from dtrain/test/example/expected-output) | 0 | ||||
-rw-r--r-- | training/dtrain/test/parallelize/cdec.ini (renamed from dtrain/test/parallelize/cdec.ini) | 0 | ||||
-rw-r--r-- | training/dtrain/test/parallelize/dtrain.ini (renamed from dtrain/test/parallelize/dtrain.ini) | 0 | ||||
-rw-r--r-- | training/dtrain/test/parallelize/in (renamed from dtrain/test/parallelize/in) | 0 | ||||
-rw-r--r-- | training/dtrain/test/parallelize/refs (renamed from dtrain/test/parallelize/refs) | 0 | ||||
-rw-r--r-- | training/dtrain/test/toy/cdec.ini (renamed from dtrain/test/toy/cdec.ini) | 0 | ||||
-rw-r--r-- | training/dtrain/test/toy/dtrain.ini (renamed from dtrain/test/toy/dtrain.ini) | 0 | ||||
-rw-r--r-- | training/dtrain/test/toy/input (renamed from dtrain/test/toy/input) | 0 | ||||
-rw-r--r-- | training/feature_expectations.cc | 232 | ||||
-rw-r--r-- | training/lbl_model.cc | 421 | ||||
-rw-r--r-- | training/minrisk/Makefile.am | 6 | ||||
-rwxr-xr-x | training/minrisk/minrisk.pl (renamed from minrisk/minrisk.pl) | 20 | ||||
-rwxr-xr-x | training/minrisk/minrisk_generate_input.pl (renamed from minrisk/minrisk_generate_input.pl) | 0 | ||||
-rw-r--r-- | training/minrisk/minrisk_optimize.cc (renamed from minrisk/minrisk_optimize.cc) | 0 | ||||
-rw-r--r-- | training/mira/Makefile.am (renamed from mira/Makefile.am) | 2 | ||||
-rw-r--r-- | training/mira/kbest_mira.cc (renamed from mira/kbest_mira.cc) | 0 | ||||
-rw-r--r-- | training/mpi_em_optimize.cc | 389 | ||||
-rw-r--r-- | training/mr_em_adapted_reduce.cc | 173 | ||||
-rw-r--r-- | training/mr_em_map_adapter.cc | 160 | ||||
-rw-r--r-- | training/mr_optimize_reduce.cc | 231 | ||||
-rw-r--r-- | training/mr_reduce_to_weights.cc | 109 | ||||
-rw-r--r-- | training/pro/Makefile.am (renamed from pro/Makefile.am) | 4 | ||||
-rwxr-xr-x | training/pro/mr_pro_generate_mapper_input.pl (renamed from pro/mr_pro_generate_mapper_input.pl) | 0 | ||||
-rw-r--r-- | training/pro/mr_pro_map.cc (renamed from pro/mr_pro_map.cc) | 0 | ||||
-rw-r--r-- | training/pro/mr_pro_reduce.cc (renamed from pro/mr_pro_reduce.cc) | 0 | ||||
-rwxr-xr-x | training/pro/pro.pl (renamed from pro/pro.pl) | 20 | ||||
-rw-r--r-- | training/rampion/Makefile.am (renamed from rampion/Makefile.am) | 4 | ||||
-rwxr-xr-x | training/rampion/rampion.pl (renamed from rampion/rampion.pl) | 20 | ||||
-rw-r--r-- | training/rampion/rampion_cccp.cc (renamed from rampion/rampion_cccp.cc) | 0 | ||||
-rwxr-xr-x | training/rampion/rampion_generate_input.pl (renamed from rampion/rampion_generate_input.pl) | 0 | ||||
-rw-r--r-- | training/utils/candidate_set.cc (renamed from training/candidate_set.cc) | 0 | ||||
-rw-r--r-- | training/utils/candidate_set.h (renamed from training/candidate_set.h) | 0 | ||||
-rwxr-xr-x | training/utils/decode-and-evaluate.pl (renamed from dpmert/decode-and-evaluate.pl) | 8 | ||||
-rw-r--r-- | training/utils/entropy.cc (renamed from training/entropy.cc) | 0 | ||||
-rw-r--r-- | training/utils/entropy.h (renamed from training/entropy.h) | 0 | ||||
-rw-r--r-- | training/utils/grammar_convert.cc (renamed from training/grammar_convert.cc) | 0 | ||||
-rw-r--r-- | training/utils/lbfgs.h (renamed from training/lbfgs.h) | 0 | ||||
-rw-r--r-- | training/utils/lbfgs_test.cc (renamed from training/lbfgs_test.cc) | 0 | ||||
-rw-r--r-- | training/utils/libcall.pl (renamed from dpmert/libcall.pl) | 0 | ||||
-rw-r--r-- | training/utils/online_optimizer.cc (renamed from training/online_optimizer.cc) | 0 | ||||
-rw-r--r-- | training/utils/online_optimizer.h (renamed from training/online_optimizer.h) | 0 | ||||
-rw-r--r-- | training/utils/optimize.cc (renamed from training/optimize.cc) | 0 | ||||
-rw-r--r-- | training/utils/optimize.h (renamed from training/optimize.h) | 0 | ||||
-rw-r--r-- | training/utils/optimize_test.cc (renamed from training/optimize_test.cc) | 0 | ||||
-rwxr-xr-x | training/utils/parallelize.pl (renamed from dpmert/parallelize.pl) | 2 | ||||
-rw-r--r-- | training/utils/risk.cc (renamed from training/risk.cc) | 0 | ||||
-rw-r--r-- | training/utils/risk.h (renamed from training/risk.h) | 0 | ||||
-rw-r--r-- | training/utils/sentclient.c (renamed from dpmert/sentclient.c) | 0 | ||||
-rw-r--r-- | training/utils/sentserver.c (renamed from dpmert/sentserver.c) | 0 | ||||
-rw-r--r-- | training/utils/sentserver.h (renamed from dpmert/sentserver.h) | 0 | ||||
-rw-r--r-- | word-aligner/Makefile.am | 6 | ||||
-rw-r--r-- | word-aligner/fast_align.cc (renamed from training/fast_align.cc) | 0 | ||||
-rw-r--r-- | word-aligner/makefiles/makefile.grammars | 2 | ||||
-rwxr-xr-x | word-aligner/paste-parallel-files.pl | 35 | ||||
-rw-r--r-- | word-aligner/ttables.cc (renamed from training/ttables.cc) | 0 | ||||
-rw-r--r-- | word-aligner/ttables.h (renamed from training/ttables.h) | 0 |
133 files changed, 149 insertions, 2271 deletions
@@ -1,3 +1,6 @@ +example_extff/ff_example.lo +example_extff/libff_example.la +mteval/meteor_jar.cc *.a *.aux *.bbl @@ -176,4 +179,27 @@ utils/reconstruct_weights utils/small_vector_test utils/ts utils/weights_test -utils/unigram_pyp_lm +training/crf/mpi_batch_optimize +training/crf/mpi_compute_cllh +training/crf/mpi_extract_features +training/crf/mpi_extract_reachable +training/crf/mpi_flex_optimize +training/crf/mpi_online_optimize +training/dpmert/lo_test +training/dpmert/mr_dpmert_generate_mapper_input +training/dpmert/mr_dpmert_map +training/dpmert/mr_dpmert_reduce +training/dpmert/sentclient +training/dpmert/sentserver +training/dtrain/dtrain +training/minrisk/minrisk_optimize +training/mira/kbest_mira +training/pro/mr_pro_map +training/pro/mr_pro_reduce +training/rampion/rampion_cccp +training/utils/Makefile.am +training/utils/lbfgs_test +training/utils/optimize_test +training/utils/sentclient +training/utils/sentserver +word-aligner/fast_align diff --git a/Makefile.am b/Makefile.am index 7ca7268a..dbf604a1 100644 --- a/Makefile.am +++ b/Makefile.am @@ -10,12 +10,7 @@ SUBDIRS = \ decoder \ training \ training/liblbfgs \ - mira \ - dtrain \ - dpmert \ - pro \ - rampion \ - minrisk \ + word-aligner \ example_extff #gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava diff --git a/configure.ac b/configure.ac index 09fc5c5b..366112a3 100644 --- a/configure.ac +++ b/configure.ac @@ -82,26 +82,34 @@ AC_PROG_INSTALL CPPFLAGS="-DPIC -fPIC $CPPFLAGS -DHAVE_CONFIG_H" +# core cdec stuff AC_CONFIG_FILES([Makefile]) AC_CONFIG_FILES([utils/Makefile]) AC_CONFIG_FILES([mteval/Makefile]) +AC_CONFIG_FILES([mteval/meteor_jar.cc]) AC_CONFIG_FILES([decoder/Makefile]) -AC_CONFIG_FILES([training/Makefile]) -AC_CONFIG_FILES([training/liblbfgs/Makefile]) -AC_CONFIG_FILES([dpmert/Makefile]) -AC_CONFIG_FILES([pro/Makefile]) -AC_CONFIG_FILES([rampion/Makefile]) -AC_CONFIG_FILES([minrisk/Makefile]) +AC_CONFIG_FILES([python/setup.py]) +AC_CONFIG_FILES([word-aligner/Makefile]) + +# KenLM stuff AC_CONFIG_FILES([klm/util/Makefile]) AC_CONFIG_FILES([klm/lm/Makefile]) AC_CONFIG_FILES([klm/search/Makefile]) -AC_CONFIG_FILES([mira/Makefile]) -AC_CONFIG_FILES([dtrain/Makefile]) -AC_CONFIG_FILES([example_extff/Makefile]) -AC_CONFIG_FILES([mteval/meteor_jar.cc]) - -AC_CONFIG_FILES([python/setup.py]) +# training stuff +AC_CONFIG_FILES([training/Makefile]) +AC_CONFIG_FILES([training/utils/Makefile]) +AC_CONFIG_FILES([training/liblbfgs/Makefile]) +AC_CONFIG_FILES([training/crf/Makefile]) +AC_CONFIG_FILES([training/dpmert/Makefile]) +AC_CONFIG_FILES([training/pro/Makefile]) +AC_CONFIG_FILES([training/rampion/Makefile]) +AC_CONFIG_FILES([training/minrisk/Makefile]) +AC_CONFIG_FILES([training/mira/Makefile]) +AC_CONFIG_FILES([training/dtrain/Makefile]) + +# external feature function example code +AC_CONFIG_FILES([example_extff/Makefile]) AC_OUTPUT diff --git a/dpmert/README.shared-mem b/dpmert/README.shared-mem deleted file mode 100644 index 7728efc0..00000000 --- a/dpmert/README.shared-mem +++ /dev/null @@ -1,9 +0,0 @@ -If you want to run dist-vest.pl on a very large shared memory machine, do the -following: - - ./dist-vest.pl --use-make I --decode-nodes J --weights weights.init --source-file=dev.src --ref-files=dev.ref.* cdec.ini - -This will use I jobs for doing the line search and J jobs to run the decoder. Typically, since the -decoder must load grammars, language models, etc., J should be smaller than I, but this will depend -on the system you are running on and the complexity of the models used for decoding. - diff --git a/minrisk/Makefile.am b/minrisk/Makefile.am deleted file mode 100644 index a24f047c..00000000 --- a/minrisk/Makefile.am +++ /dev/null @@ -1,6 +0,0 @@ -bin_PROGRAMS = minrisk_optimize - -minrisk_optimize_SOURCES = minrisk_optimize.cc -minrisk_optimize_LDADD = $(top_srcdir)/training/libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/training/liblbfgs/liblbfgs.a -lz - -AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training diff --git a/pro/README.shared-mem b/pro/README.shared-mem deleted file mode 100644 index 7728efc0..00000000 --- a/pro/README.shared-mem +++ /dev/null @@ -1,9 +0,0 @@ -If you want to run dist-vest.pl on a very large shared memory machine, do the -following: - - ./dist-vest.pl --use-make I --decode-nodes J --weights weights.init --source-file=dev.src --ref-files=dev.ref.* cdec.ini - -This will use I jobs for doing the line search and J jobs to run the decoder. Typically, since the -decoder must load grammars, language models, etc., J should be smaller than I, but this will depend -on the system you are running on and the complexity of the models used for decoding. - diff --git a/training/Makefile.am b/training/Makefile.am index f9c25391..e95e045f 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -1,91 +1,11 @@ -bin_PROGRAMS = \ - fast_align \ - lbl_model \ - test_ngram \ - mr_em_map_adapter \ - mr_em_adapted_reduce \ - mr_reduce_to_weights \ - mr_optimize_reduce \ - grammar_convert \ - plftools \ - collapse_weights \ - mpi_extract_reachable \ - mpi_extract_features \ - mpi_online_optimize \ - mpi_flex_optimize \ - mpi_batch_optimize \ - mpi_compute_cllh \ - augment_grammar +SUBDIRS = \ + liblbfgs \ + utils \ + crf \ + minrisk \ + dpmert \ + pro \ + dtrain \ + mira \ + rampion -noinst_PROGRAMS = \ - lbfgs_test \ - optimize_test - -TESTS = lbfgs_test optimize_test - -noinst_LIBRARIES = libtraining.a -libtraining_a_SOURCES = \ - candidate_set.cc \ - entropy.cc \ - optimize.cc \ - online_optimizer.cc \ - risk.cc - -mpi_online_optimize_SOURCES = mpi_online_optimize.cc -mpi_online_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -mpi_flex_optimize_SOURCES = mpi_flex_optimize.cc -mpi_flex_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -mpi_extract_reachable_SOURCES = mpi_extract_reachable.cc -mpi_extract_reachable_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -mpi_extract_features_SOURCES = mpi_extract_features.cc -mpi_extract_features_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc cllh_observer.cc -mpi_batch_optimize_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -mpi_compute_cllh_SOURCES = mpi_compute_cllh.cc cllh_observer.cc -mpi_compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -augment_grammar_SOURCES = augment_grammar.cc -augment_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -test_ngram_SOURCES = test_ngram.cc -test_ngram_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - -fast_align_SOURCES = fast_align.cc ttables.cc -fast_align_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -lbl_model_SOURCES = lbl_model.cc -lbl_model_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -grammar_convert_SOURCES = grammar_convert.cc -grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -optimize_test_SOURCES = optimize_test.cc -optimize_test_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -collapse_weights_SOURCES = collapse_weights.cc -collapse_weights_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -lbfgs_test_SOURCES = lbfgs_test.cc -lbfgs_test_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -mr_optimize_reduce_SOURCES = mr_optimize_reduce.cc -mr_optimize_reduce_LDADD = libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -mr_em_map_adapter_SOURCES = mr_em_map_adapter.cc -mr_em_map_adapter_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -mr_reduce_to_weights_SOURCES = mr_reduce_to_weights.cc -mr_reduce_to_weights_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -mr_em_adapted_reduce_SOURCES = mr_em_adapted_reduce.cc -mr_em_adapted_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -plftools_SOURCES = plftools.cc -plftools_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/utils/libutils.a -lz - -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/utils -I$(top_srcdir)/mteval -I../klm diff --git a/training/add-model1-features-to-scfg.pl b/training/add-model1-features-to-scfg.pl deleted file mode 100755 index a0074317..00000000 --- a/training/add-model1-features-to-scfg.pl +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/perl -w - -# [X] ||| so [X,1] die [X,2] der ||| as [X,1] existing [X,2] the ||| 2.47712135315 2.53182387352 5.07100057602 ||| 0-0 2-2 4-4 -# [X] ||| so [X,1] die [X,2] der ||| this [X,1] the [X,2] of ||| 2.47712135315 3.19828724861 2.38270020485 ||| 0-0 2-2 4-4 -# [X] ||| so [X,1] die [X,2] der ||| as [X,1] the [X,2] the ||| 2.47712135315 2.53182387352 1.48463630676 ||| 0-0 2-2 4-4 -# [X] ||| so [X,1] die [X,2] der ||| is [X,1] the [X,2] of the ||| 2.47712135315 3.45197868347 2.64251494408 ||| 0-0 2-2 4-4 4-5 - -die "Usage: $0 model1.f-e model1.e-f < grammar.scfg\n (use trianing/model1 to extract the model files)\n" unless scalar @ARGV == 2; - -my $fm1 = shift @ARGV; -die unless $fm1; -my $frm1 = shift @ARGV; -die unless $frm1; -open M1,"<$fm1" or die; -open RM1,"<$frm1" or die; -print STDERR "Loading Model 1 probs from $fm1...\n"; -my %m1; -while(<M1>) { - chomp; - my ($f, $e, $lp) = split /\s+/; - $m1{$e}->{$f} = exp($lp); -} -close M1; - -print STDERR "Loading Inverse Model 1 probs from $frm1...\n"; -my %rm1; -while(<RM1>) { - chomp; - my ($e, $f, $lp) = split /\s+/; - $rm1{$f}->{$e} = exp($lp); -} -close RM1; - -my @label = qw( EGivenF LexFGivenE LexEGivenF ); -while(<>) { - chomp; - my ($l, $f, $e, $sscores, $al) = split / \|\|\| /; - my @scores = split /\s+/, $sscores; - unless ($sscores =~ /=/) { - for (my $i=0; $i<3; $i++) { $scores[$i] = "$label[$i]=$scores[$i]"; } - } - push @scores, "RuleCount=1"; - my @fs = split /\s+/, $f; - my @es = split /\s+/, $e; - my $flen = scalar @fs; - my $elen = scalar @es; - my $pgen = 0; - my $nongen = 0; - for (my $i =0; $i < $flen; $i++) { - my $ftot = 0; - next if ($fs[$i] =~ /\[X/); - my $cr = $rm1{$fs[$i]}; - for (my $j=0; $j <= $elen; $j++) { - my $ej = '<eps>'; - if ($j < $elen) { $ej = $es[$j]; } - my $p = $cr->{$ej}; - if (defined $p) { $ftot += $p; } - } - if ($ftot == 0) { $nongen = 1; last; } - $pgen += log($ftot) - log($elen); - } - my $bad = 0; - my $good = 0; - unless ($nongen) { push @scores, "RGood=1"; $good++; } else { push @scores, "RBad=1"; $bad++; } - - $nongen = 0; - $pgen = 0; - for (my $i =0; $i < $elen; $i++) { - my $etot = 0; - next if ($es[$i] =~ /\[X/); - my $cr = $m1{$es[$i]}; -# print STDERR "$es[$i]\n"; - for (my $j=0; $j <= $flen; $j++) { - my $fj = '<eps>'; - if ($j < $flen) { $fj = $fs[$j]; } - my $p = $cr->{$fj}; -# print STDERR " $fs[$j] : $p\n"; - if (defined $p) { $etot += $p; } - } - if ($etot == 0) { $nongen = 1; last; } - $pgen += log($etot) - log($flen); - } - unless ($nongen) { - push @scores, "FGood=1"; - if ($good) { push @scores, "BothGood=1"; } else { push @scores, "SusDel=1"; } - } else { - push @scores, "FBad=1"; - if ($bad) { push @scores, "BothBad=1"; } else { push @scores, "SusHall=1"; } - } - print "$l ||| $f ||| $e ||| @scores"; - if (defined $al) { print " ||| $al\n"; } else { print "\n"; } -} - diff --git a/training/collapse_weights.cc b/training/collapse_weights.cc deleted file mode 100644 index c03eb031..00000000 --- a/training/collapse_weights.cc +++ /dev/null @@ -1,110 +0,0 @@ -char const* NOTES = - "ZF_and_E means unnormalized scaled features.\n" - "For grammars with one nonterminal: F_and_E is joint,\n" - "F_given_E and E_given_F are conditional.\n" - "TODO: group rules by root nonterminal and then normalize.\n"; - - -#include <iostream> -#include <fstream> -#include <tr1/unordered_map> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> -#include <boost/functional/hash.hpp> - -#include "prob.h" -#include "filelib.h" -#include "trule.h" -#include "weights.h" - -namespace po = boost::program_options; -using namespace std; - -typedef std::tr1::unordered_map<vector<WordID>, prob_t, boost::hash<vector<WordID> > > MarginalMap; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("grammar,g", po::value<string>(), "Grammar file") - ("weights,w", po::value<string>(), "Weights file") - ("unnormalized,u", "Always include ZF_and_E unnormalized score (default: only if sum was >1)") - ; - po::options_description clo("Command line options"); - clo.add_options() - ("config,c", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - const string cfg = (*conf)["config"].as<string>(); - cerr << "Configuration file: " << cfg << endl; - ifstream config(cfg.c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || !conf->count("grammar") || !conf->count("weights")) { - cerr << dcmdline_options << endl; - cerr << NOTES << endl; - exit(1); - } -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - const string wfile = conf["weights"].as<string>(); - const string gfile = conf["grammar"].as<string>(); - vector<weight_t> w; - Weights::InitFromFile(wfile, &w); - MarginalMap e_tots; - MarginalMap f_tots; - prob_t tot; - { - ReadFile rf(gfile); - assert(*rf.stream()); - istream& in = *rf.stream(); - cerr << "Computing marginals...\n"; - int lc = 0; - while(in) { - string line; - getline(in, line); - ++lc; - if (line.empty()) continue; - TRule tr(line, true); - if (tr.GetFeatureValues().empty()) - cerr << "Line " << lc << ": empty features - may introduce bias\n"; - prob_t prob; - prob.logeq(tr.GetFeatureValues().dot(w)); - e_tots[tr.e_] += prob; - f_tots[tr.f_] += prob; - tot += prob; - } - } - bool normalized = (fabs(log(tot)) < 0.001); - cerr << "Total: " << tot << (normalized ? " [normalized]" : " [scaled]") << endl; - ReadFile rf(gfile); - istream&in = *rf.stream(); - while(in) { - string line; - getline(in, line); - if (line.empty()) continue; - TRule tr(line, true); - const double lp = tr.GetFeatureValues().dot(w); - if (std::isinf(lp)) { continue; } - tr.scores_.clear(); - - cout << tr.AsString() << " ||| F_and_E=" << lp - log(tot); - if (!normalized || conf.count("unnormalized")) { - cout << ";ZF_and_E=" << lp; - } - cout << ";F_given_E=" << lp - log(e_tots[tr.e_]) - << ";E_given_F=" << lp - log(f_tots[tr.f_]) << endl; - } - return 0; -} - diff --git a/training/crf/Makefile.am b/training/crf/Makefile.am new file mode 100644 index 00000000..d203df25 --- /dev/null +++ b/training/crf/Makefile.am @@ -0,0 +1,27 @@ +bin_PROGRAMS = \ + mpi_batch_optimize \ + mpi_compute_cllh \ + mpi_extract_features \ + mpi_extract_reachable \ + mpi_flex_optimize \ + mpi_online_optimize + +mpi_online_optimize_SOURCES = mpi_online_optimize.cc +mpi_online_optimize_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +mpi_flex_optimize_SOURCES = mpi_flex_optimize.cc +mpi_flex_optimize_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +mpi_extract_reachable_SOURCES = mpi_extract_reachable.cc +mpi_extract_reachable_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +mpi_extract_features_SOURCES = mpi_extract_features.cc +mpi_extract_features_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +mpi_batch_optimize_SOURCES = mpi_batch_optimize.cc cllh_observer.cc +mpi_batch_optimize_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +mpi_compute_cllh_SOURCES = mpi_compute_cllh.cc cllh_observer.cc +mpi_compute_cllh_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz + +AM_CPPFLAGS = -DBOOST_TEST_DYN_LINK -W -Wall -Wno-sign-compare -I$(top_srcdir)/training -I$(top_srcdir)/training/utils -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/training/cllh_observer.cc b/training/crf/cllh_observer.cc index 4ec2fa65..4ec2fa65 100644 --- a/training/cllh_observer.cc +++ b/training/crf/cllh_observer.cc diff --git a/training/cllh_observer.h b/training/crf/cllh_observer.h index 0de47331..0de47331 100644 --- a/training/cllh_observer.h +++ b/training/crf/cllh_observer.h diff --git a/training/mpi_batch_optimize.cc b/training/crf/mpi_batch_optimize.cc index 2eff07e4..2eff07e4 100644 --- a/training/mpi_batch_optimize.cc +++ b/training/crf/mpi_batch_optimize.cc diff --git a/training/mpi_compute_cllh.cc b/training/crf/mpi_compute_cllh.cc index 066389d0..066389d0 100644 --- a/training/mpi_compute_cllh.cc +++ b/training/crf/mpi_compute_cllh.cc diff --git a/training/mpi_extract_features.cc b/training/crf/mpi_extract_features.cc index 6750aa15..6750aa15 100644 --- a/training/mpi_extract_features.cc +++ b/training/crf/mpi_extract_features.cc diff --git a/training/mpi_extract_reachable.cc b/training/crf/mpi_extract_reachable.cc index 2a7c2b9d..2a7c2b9d 100644 --- a/training/mpi_extract_reachable.cc +++ b/training/crf/mpi_extract_reachable.cc diff --git a/training/mpi_flex_optimize.cc b/training/crf/mpi_flex_optimize.cc index b52decdc..b52decdc 100644 --- a/training/mpi_flex_optimize.cc +++ b/training/crf/mpi_flex_optimize.cc diff --git a/training/mpi_online_optimize.cc b/training/crf/mpi_online_optimize.cc index d6968848..d6968848 100644 --- a/training/mpi_online_optimize.cc +++ b/training/crf/mpi_online_optimize.cc diff --git a/training/dep-reorder/conll2reordering-forest.pl b/training/dep-reorder/conll2reordering-forest.pl deleted file mode 100755 index 3cd226be..00000000 --- a/training/dep-reorder/conll2reordering-forest.pl +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my $script_dir; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; } -my $FIRST_CONV = "$script_dir/scripts/conll2simplecfg.pl"; -my $CDEC = "$script_dir/../../decoder/cdec"; - -our $tfile1 = "grammar1.$$"; -our $tfile2 = "text.$$"; - -die "Usage: $0 parses.conll\n" unless scalar @ARGV == 1; -open C, "<$ARGV[0]" or die "Can't read $ARGV[0]: $!"; - -END { unlink $tfile1; unlink "$tfile1.cfg"; unlink $tfile2; } - -my $first = 1; -open T, ">$tfile1" or die "Can't write $tfile1: $!"; -my $lc = 0; -my $flag = 0; -my @words = (); -while(<C>) { - print T; - chomp; - if (/^$/) { - if ($first) { $first = undef; } else { if ($flag) { print "\n"; $flag = 0; } } - $first = undef; - close T; - open SO, ">$tfile2" or die "Can't write $tfile2: $!"; - print SO "@words\n"; - close SO; - @words=(); - `$FIRST_CONV < $tfile1 > $tfile1.cfg`; - if ($? != 0) { - die "Error code: $?"; - } - my $cfg = `$CDEC -n -S 10000 -f scfg -g $tfile1.cfg -i $tfile2 --show_cfg_search_space 2>/dev/null`; - if ($? != 0) { - die "Error code: $?"; - } - my @rules = split /\n/, $cfg; - shift @rules; # get rid of output - for my $rule (@rules) { - my ($lhs, $f, $e, $feats) = split / \|\|\| /, $rule; - $f =~ s/,\d\]/\]/g; - $feats = 'TOP=1' unless $feats; - if ($lhs =~ /\[Goal_\d+\]/) { $lhs = '[S]'; } - print "$lhs ||| $f ||| $feats\n"; - if ($e eq '[1] [2]') { - my ($a, $b) = split /\s+/, $f; - $feats =~ s/=1$//; - my ($x, $y) = split /_/, $feats; - print "$lhs ||| $b $a ||| ${y}_$x=1\n"; - } - $flag = 1; - } - open T, ">$tfile1" or die "Can't write $tfile1: $!"; - $lc = -1; - } else { - my ($ind, $word, @dmmy) = split /\s+/; - push @words, $word; - } - $lc++; -} -close T; - diff --git a/training/dep-reorder/george.conll b/training/dep-reorder/george.conll deleted file mode 100644 index 7eebb360..00000000 --- a/training/dep-reorder/george.conll +++ /dev/null @@ -1,4 +0,0 @@ -1 George _ GEORGE _ _ 2 X _ _ -2 hates _ HATES _ _ 0 X _ _ -3 broccoli _ BROC _ _ 2 X _ _ - diff --git a/training/dep-reorder/scripts/conll2simplecfg.pl b/training/dep-reorder/scripts/conll2simplecfg.pl deleted file mode 100755 index b101347a..00000000 --- a/training/dep-reorder/scripts/conll2simplecfg.pl +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -# 1 在 _ 10 _ _ 4 X _ _ -# 2 门厅 _ 3 _ _ 1 X _ _ -# 3 下面 _ 23 _ _ 4 X _ _ -# 4 。 _ 45 _ _ 0 X _ _ - -my @ldeps; -my @rdeps; -@ldeps=(); for (my $i =0; $i <1000; $i++) { push @ldeps, []; } -@rdeps=(); for (my $i =0; $i <1000; $i++) { push @rdeps, []; } -my $rootcat = 0; -my @cats = ('S'); -my $len = 0; -my @noposcats = ('S'); -while(<>) { - chomp; - if (/^\s*$/) { - write_cfg($len); - $len = 0; - @cats=('S'); - @noposcats = ('S'); - @ldeps=(); for (my $i =0; $i <1000; $i++) { push @ldeps, []; } - @rdeps=(); for (my $i =0; $i <1000; $i++) { push @rdeps, []; } - next; - } - $len++; - my ($pos, $word, $d1, $xcat, $d2, $d3, $headpos, $deptype) = split /\s+/; - my $cat = "C$xcat"; - my $catpos = $cat . "_$pos"; - push @cats, $catpos; - push @noposcats, $cat; - print "[$catpos] ||| $word ||| $word ||| Word=1\n"; - if ($headpos == 0) { $rootcat = $pos; } - if ($pos < $headpos) { - push @{$ldeps[$headpos]}, $pos; - } else { - push @{$rdeps[$headpos]}, $pos; - } -} - -sub write_cfg { - my $len = shift; - for (my $i = 1; $i <= $len; $i++) { - my @lds = @{$ldeps[$i]}; - for my $ld (@lds) { - print "[$cats[$i]] ||| [$cats[$ld],1] [$cats[$i],2] ||| [1] [2] ||| $noposcats[$ld]_$noposcats[$i]=1\n"; - } - my @rds = @{$rdeps[$i]}; - for my $rd (@rds) { - print "[$cats[$i]] ||| [$cats[$i],1] [$cats[$rd],2] ||| [1] [2] ||| $noposcats[$i]_$noposcats[$rd]=1\n"; - } - } - print "[S] ||| [$cats[$rootcat],1] ||| [1] ||| TOP=1\n"; -} - diff --git a/dpmert/Makefile.am b/training/dpmert/Makefile.am index 00768271..ff318bef 100644 --- a/dpmert/Makefile.am +++ b/training/dpmert/Makefile.am @@ -1,20 +1,12 @@ bin_PROGRAMS = \ mr_dpmert_map \ mr_dpmert_reduce \ - mr_dpmert_generate_mapper_input \ - sentserver \ - sentclient + mr_dpmert_generate_mapper_input noinst_PROGRAMS = \ lo_test TESTS = lo_test -sentserver_SOURCES = sentserver.c -sentserver_LDFLAGS = -pthread - -sentclient_SOURCES = sentclient.c -sentclient_LDFLAGS = -pthread - mr_dpmert_generate_mapper_input_SOURCES = mr_dpmert_generate_mapper_input.cc line_optimizer.cc mr_dpmert_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz diff --git a/dpmert/ces.cc b/training/dpmert/ces.cc index 157b2d17..157b2d17 100644 --- a/dpmert/ces.cc +++ b/training/dpmert/ces.cc diff --git a/dpmert/ces.h b/training/dpmert/ces.h index e4fa2080..e4fa2080 100644 --- a/dpmert/ces.h +++ b/training/dpmert/ces.h diff --git a/dpmert/divide_refs.py b/training/dpmert/divide_refs.py index b478f918..b478f918 100755 --- a/dpmert/divide_refs.py +++ b/training/dpmert/divide_refs.py diff --git a/dpmert/dpmert.pl b/training/dpmert/dpmert.pl index c4f98870..559420f5 100755 --- a/dpmert/dpmert.pl +++ b/training/dpmert/dpmert.pl @@ -2,7 +2,7 @@ use strict; my @ORIG_ARGV=@ARGV; use Cwd qw(getcwd); -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; } # Skip local config (used for distributing jobs) if we're running in local-only mode use LocalConfig; @@ -17,21 +17,22 @@ my $srcFile; # deprecated my $refFiles; # deprecated my $default_jobs = env_default_jobs(); my $bin_dir = $SCRIPT_DIR; +my $util_dir = "$SCRIPT_DIR/../utils"; die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; -my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; my $MAPINPUT = "$bin_dir/mr_dpmert_generate_mapper_input"; my $MAPPER = "$bin_dir/mr_dpmert_map"; my $REDUCER = "$bin_dir/mr_dpmert_reduce"; -my $parallelize = "$bin_dir/parallelize.pl"; -my $libcall = "$bin_dir/libcall.pl"; -my $sentserver = "$bin_dir/sentserver"; -my $sentclient = "$bin_dir/sentclient"; -my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; +my $parallelize = "$util_dir/parallelize.pl"; +my $libcall = "$util_dir/libcall.pl"; +my $sentserver = "$util_dir/sentserver"; +my $sentclient = "$util_dir/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm"; my $SCORER = $FAST_SCORE; die "Can't find $MAPPER" unless -x $MAPPER; -my $cdec = "$bin_dir/../decoder/cdec"; +my $cdec = "$bin_dir/../../decoder/cdec"; die "Can't find decoder in $cdec" unless -x $cdec; die "Can't find $parallelize" unless -x $parallelize; die "Can't find $libcall" unless -e $libcall; diff --git a/dpmert/error_surface.cc b/training/dpmert/error_surface.cc index 515b67f8..515b67f8 100644 --- a/dpmert/error_surface.cc +++ b/training/dpmert/error_surface.cc diff --git a/dpmert/error_surface.h b/training/dpmert/error_surface.h index bb65847b..bb65847b 100644 --- a/dpmert/error_surface.h +++ b/training/dpmert/error_surface.h diff --git a/dpmert/line_mediator.pl b/training/dpmert/line_mediator.pl index bc2bb24c..bc2bb24c 100755 --- a/dpmert/line_mediator.pl +++ b/training/dpmert/line_mediator.pl diff --git a/dpmert/line_optimizer.cc b/training/dpmert/line_optimizer.cc index 9cf33502..9cf33502 100644 --- a/dpmert/line_optimizer.cc +++ b/training/dpmert/line_optimizer.cc diff --git a/dpmert/line_optimizer.h b/training/dpmert/line_optimizer.h index 83819f41..83819f41 100644 --- a/dpmert/line_optimizer.h +++ b/training/dpmert/line_optimizer.h diff --git a/dpmert/lo_test.cc b/training/dpmert/lo_test.cc index 95a08d3d..95a08d3d 100644 --- a/dpmert/lo_test.cc +++ b/training/dpmert/lo_test.cc diff --git a/dpmert/mert_geometry.cc b/training/dpmert/mert_geometry.cc index d6973658..d6973658 100644 --- a/dpmert/mert_geometry.cc +++ b/training/dpmert/mert_geometry.cc diff --git a/dpmert/mert_geometry.h b/training/dpmert/mert_geometry.h index a8b6959e..a8b6959e 100644 --- a/dpmert/mert_geometry.h +++ b/training/dpmert/mert_geometry.h diff --git a/dpmert/mr_dpmert_generate_mapper_input.cc b/training/dpmert/mr_dpmert_generate_mapper_input.cc index 199cd23a..199cd23a 100644 --- a/dpmert/mr_dpmert_generate_mapper_input.cc +++ b/training/dpmert/mr_dpmert_generate_mapper_input.cc diff --git a/dpmert/mr_dpmert_map.cc b/training/dpmert/mr_dpmert_map.cc index d1efcf96..d1efcf96 100644 --- a/dpmert/mr_dpmert_map.cc +++ b/training/dpmert/mr_dpmert_map.cc diff --git a/dpmert/mr_dpmert_reduce.cc b/training/dpmert/mr_dpmert_reduce.cc index 31512a03..31512a03 100644 --- a/dpmert/mr_dpmert_reduce.cc +++ b/training/dpmert/mr_dpmert_reduce.cc diff --git a/dpmert/test_aer/README b/training/dpmert/test_aer/README index 819b2e32..819b2e32 100644 --- a/dpmert/test_aer/README +++ b/training/dpmert/test_aer/README diff --git a/dpmert/test_aer/cdec.ini b/training/dpmert/test_aer/cdec.ini index 08187848..08187848 100644 --- a/dpmert/test_aer/cdec.ini +++ b/training/dpmert/test_aer/cdec.ini diff --git a/dpmert/test_aer/corpus.src b/training/dpmert/test_aer/corpus.src index 31b23971..31b23971 100644 --- a/dpmert/test_aer/corpus.src +++ b/training/dpmert/test_aer/corpus.src diff --git a/dpmert/test_aer/grammar b/training/dpmert/test_aer/grammar index 9d857824..9d857824 100644 --- a/dpmert/test_aer/grammar +++ b/training/dpmert/test_aer/grammar diff --git a/dpmert/test_aer/ref.0 b/training/dpmert/test_aer/ref.0 index 734a9c5b..734a9c5b 100644 --- a/dpmert/test_aer/ref.0 +++ b/training/dpmert/test_aer/ref.0 diff --git a/dpmert/test_aer/weights b/training/dpmert/test_aer/weights index afc9282e..afc9282e 100644 --- a/dpmert/test_aer/weights +++ b/training/dpmert/test_aer/weights diff --git a/dpmert/test_data/0.json.gz b/training/dpmert/test_data/0.json.gz Binary files differindex 30f8dd77..30f8dd77 100644 --- a/dpmert/test_data/0.json.gz +++ b/training/dpmert/test_data/0.json.gz diff --git a/dpmert/test_data/1.json.gz b/training/dpmert/test_data/1.json.gz Binary files differindex c82cc179..c82cc179 100644 --- a/dpmert/test_data/1.json.gz +++ b/training/dpmert/test_data/1.json.gz diff --git a/dpmert/test_data/c2e.txt.0 b/training/dpmert/test_data/c2e.txt.0 index 12c4abe9..12c4abe9 100644 --- a/dpmert/test_data/c2e.txt.0 +++ b/training/dpmert/test_data/c2e.txt.0 diff --git a/dpmert/test_data/c2e.txt.1 b/training/dpmert/test_data/c2e.txt.1 index 4ac12df1..4ac12df1 100644 --- a/dpmert/test_data/c2e.txt.1 +++ b/training/dpmert/test_data/c2e.txt.1 diff --git a/dpmert/test_data/c2e.txt.2 b/training/dpmert/test_data/c2e.txt.2 index 2f67b72f..2f67b72f 100644 --- a/dpmert/test_data/c2e.txt.2 +++ b/training/dpmert/test_data/c2e.txt.2 diff --git a/dpmert/test_data/c2e.txt.3 b/training/dpmert/test_data/c2e.txt.3 index 5483cef6..5483cef6 100644 --- a/dpmert/test_data/c2e.txt.3 +++ b/training/dpmert/test_data/c2e.txt.3 diff --git a/dpmert/test_data/re.txt.0 b/training/dpmert/test_data/re.txt.0 index 86eff087..86eff087 100644 --- a/dpmert/test_data/re.txt.0 +++ b/training/dpmert/test_data/re.txt.0 diff --git a/dpmert/test_data/re.txt.1 b/training/dpmert/test_data/re.txt.1 index 2140f198..2140f198 100644 --- a/dpmert/test_data/re.txt.1 +++ b/training/dpmert/test_data/re.txt.1 diff --git a/dpmert/test_data/re.txt.2 b/training/dpmert/test_data/re.txt.2 index 94e46286..94e46286 100644 --- a/dpmert/test_data/re.txt.2 +++ b/training/dpmert/test_data/re.txt.2 diff --git a/dpmert/test_data/re.txt.3 b/training/dpmert/test_data/re.txt.3 index f87c3308..f87c3308 100644 --- a/dpmert/test_data/re.txt.3 +++ b/training/dpmert/test_data/re.txt.3 diff --git a/dtrain/Makefile.am b/training/dtrain/Makefile.am index ca9581f5..5b48e756 100644 --- a/dtrain/Makefile.am +++ b/training/dtrain/Makefile.am @@ -1,7 +1,7 @@ bin_PROGRAMS = dtrain dtrain_SOURCES = dtrain.cc score.cc -dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz +dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/dtrain/README.md b/training/dtrain/README.md index 7edabbf1..7edabbf1 100644 --- a/dtrain/README.md +++ b/training/dtrain/README.md diff --git a/dtrain/dtrain.cc b/training/dtrain/dtrain.cc index 18286668..18286668 100644 --- a/dtrain/dtrain.cc +++ b/training/dtrain/dtrain.cc diff --git a/dtrain/dtrain.h b/training/dtrain/dtrain.h index 4b6f415c..4b6f415c 100644 --- a/dtrain/dtrain.h +++ b/training/dtrain/dtrain.h diff --git a/dtrain/hstreaming/avg.rb b/training/dtrain/hstreaming/avg.rb index 2599c732..2599c732 100755 --- a/dtrain/hstreaming/avg.rb +++ b/training/dtrain/hstreaming/avg.rb diff --git a/dtrain/hstreaming/cdec.ini b/training/dtrain/hstreaming/cdec.ini index d4f5cecd..d4f5cecd 100644 --- a/dtrain/hstreaming/cdec.ini +++ b/training/dtrain/hstreaming/cdec.ini diff --git a/dtrain/hstreaming/dtrain.ini b/training/dtrain/hstreaming/dtrain.ini index a2c219a1..a2c219a1 100644 --- a/dtrain/hstreaming/dtrain.ini +++ b/training/dtrain/hstreaming/dtrain.ini diff --git a/dtrain/hstreaming/dtrain.sh b/training/dtrain/hstreaming/dtrain.sh index 877ff94c..877ff94c 100755 --- a/dtrain/hstreaming/dtrain.sh +++ b/training/dtrain/hstreaming/dtrain.sh diff --git a/dtrain/hstreaming/hadoop-streaming-job.sh b/training/dtrain/hstreaming/hadoop-streaming-job.sh index 92419956..92419956 100755 --- a/dtrain/hstreaming/hadoop-streaming-job.sh +++ b/training/dtrain/hstreaming/hadoop-streaming-job.sh diff --git a/dtrain/hstreaming/lplp.rb b/training/dtrain/hstreaming/lplp.rb index f0cd58c5..f0cd58c5 100755 --- a/dtrain/hstreaming/lplp.rb +++ b/training/dtrain/hstreaming/lplp.rb diff --git a/dtrain/hstreaming/red-test b/training/dtrain/hstreaming/red-test index 2623d697..2623d697 100644 --- a/dtrain/hstreaming/red-test +++ b/training/dtrain/hstreaming/red-test diff --git a/dtrain/kbestget.h b/training/dtrain/kbestget.h index dd8882e1..dd8882e1 100644 --- a/dtrain/kbestget.h +++ b/training/dtrain/kbestget.h diff --git a/dtrain/ksampler.h b/training/dtrain/ksampler.h index bc2f56cd..bc2f56cd 100644 --- a/dtrain/ksampler.h +++ b/training/dtrain/ksampler.h diff --git a/dtrain/pairsampling.h b/training/dtrain/pairsampling.h index 84be1efb..84be1efb 100644 --- a/dtrain/pairsampling.h +++ b/training/dtrain/pairsampling.h diff --git a/dtrain/parallelize.rb b/training/dtrain/parallelize.rb index 1d277ff6..1d277ff6 100755 --- a/dtrain/parallelize.rb +++ b/training/dtrain/parallelize.rb diff --git a/dtrain/parallelize/test/cdec.ini b/training/dtrain/parallelize/test/cdec.ini index 72e99dc5..72e99dc5 100644 --- a/dtrain/parallelize/test/cdec.ini +++ b/training/dtrain/parallelize/test/cdec.ini diff --git a/dtrain/parallelize/test/dtrain.ini b/training/dtrain/parallelize/test/dtrain.ini index 03f9d240..03f9d240 100644 --- a/dtrain/parallelize/test/dtrain.ini +++ b/training/dtrain/parallelize/test/dtrain.ini diff --git a/dtrain/parallelize/test/in b/training/dtrain/parallelize/test/in index a312809f..a312809f 100644 --- a/dtrain/parallelize/test/in +++ b/training/dtrain/parallelize/test/in diff --git a/dtrain/parallelize/test/refs b/training/dtrain/parallelize/test/refs index 4d3128cb..4d3128cb 100644 --- a/dtrain/parallelize/test/refs +++ b/training/dtrain/parallelize/test/refs diff --git a/dtrain/score.cc b/training/dtrain/score.cc index 34fc86a9..34fc86a9 100644 --- a/dtrain/score.cc +++ b/training/dtrain/score.cc diff --git a/dtrain/score.h b/training/dtrain/score.h index f317c903..f317c903 100644 --- a/dtrain/score.h +++ b/training/dtrain/score.h diff --git a/dtrain/test/example/README b/training/dtrain/test/example/README index 6937b11b..6937b11b 100644 --- a/dtrain/test/example/README +++ b/training/dtrain/test/example/README diff --git a/dtrain/test/example/cdec.ini b/training/dtrain/test/example/cdec.ini index d5955f0e..d5955f0e 100644 --- a/dtrain/test/example/cdec.ini +++ b/training/dtrain/test/example/cdec.ini diff --git a/dtrain/test/example/dtrain.ini b/training/dtrain/test/example/dtrain.ini index 72d50ca1..72d50ca1 100644 --- a/dtrain/test/example/dtrain.ini +++ b/training/dtrain/test/example/dtrain.ini diff --git a/dtrain/test/example/expected-output b/training/dtrain/test/example/expected-output index 05326763..05326763 100644 --- a/dtrain/test/example/expected-output +++ b/training/dtrain/test/example/expected-output diff --git a/dtrain/test/parallelize/cdec.ini b/training/dtrain/test/parallelize/cdec.ini index 72e99dc5..72e99dc5 100644 --- a/dtrain/test/parallelize/cdec.ini +++ b/training/dtrain/test/parallelize/cdec.ini diff --git a/dtrain/test/parallelize/dtrain.ini b/training/dtrain/test/parallelize/dtrain.ini index 03f9d240..03f9d240 100644 --- a/dtrain/test/parallelize/dtrain.ini +++ b/training/dtrain/test/parallelize/dtrain.ini diff --git a/dtrain/test/parallelize/in b/training/dtrain/test/parallelize/in index a312809f..a312809f 100644 --- a/dtrain/test/parallelize/in +++ b/training/dtrain/test/parallelize/in diff --git a/dtrain/test/parallelize/refs b/training/dtrain/test/parallelize/refs index 4d3128cb..4d3128cb 100644 --- a/dtrain/test/parallelize/refs +++ b/training/dtrain/test/parallelize/refs diff --git a/dtrain/test/toy/cdec.ini b/training/dtrain/test/toy/cdec.ini index 98b02d44..98b02d44 100644 --- a/dtrain/test/toy/cdec.ini +++ b/training/dtrain/test/toy/cdec.ini diff --git a/dtrain/test/toy/dtrain.ini b/training/dtrain/test/toy/dtrain.ini index a091732f..a091732f 100644 --- a/dtrain/test/toy/dtrain.ini +++ b/training/dtrain/test/toy/dtrain.ini diff --git a/dtrain/test/toy/input b/training/dtrain/test/toy/input index 4d10a9ea..4d10a9ea 100644 --- a/dtrain/test/toy/input +++ b/training/dtrain/test/toy/input diff --git a/training/feature_expectations.cc b/training/feature_expectations.cc deleted file mode 100644 index f1a85495..00000000 --- a/training/feature_expectations.cc +++ /dev/null @@ -1,232 +0,0 @@ -#include <sstream> -#include <iostream> -#include <fstream> -#include <vector> -#include <cassert> -#include <cmath> -#include <tr1/memory> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "verbose.h" -#include "hg.h" -#include "prob.h" -#include "inside_outside.h" -#include "ff_register.h" -#include "decoder.h" -#include "filelib.h" -#include "online_optimizer.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" -#include "sampler.h" - -#ifdef HAVE_MPI -#include <boost/mpi/timer.hpp> -#include <boost/mpi.hpp> -namespace mpi = boost::mpi; -#endif - -using namespace std; -namespace po = boost::program_options; - -struct FComp { - const vector<double>& w_; - FComp(const vector<double>& w) : w_(w) {} - bool operator()(int a, int b) const { - return fabs(w_[a]) > fabs(w_[b]); - } -}; - -void ShowFeatures(const vector<double>& w) { - vector<int> fnums(w.size()); - for (int i = 0; i < w.size(); ++i) - fnums[i] = i; - sort(fnums.begin(), fnums.end(), FComp(w)); - for (vector<int>::iterator i = fnums.begin(); i != fnums.end(); ++i) { - if (w[*i]) cout << FD::Convert(*i) << ' ' << w[*i] << endl; - } -} - -void ReadConfig(const string& ini, vector<string>* out) { - ReadFile rf(ini); - istream& in = *rf.stream(); - while(in) { - string line; - getline(in, line); - if (!in) continue; - out->push_back(line); - } -} - -void StoreConfig(const vector<string>& cfg, istringstream* o) { - ostringstream os; - for (int i = 0; i < cfg.size(); ++i) { os << cfg[i] << endl; } - o->str(os.str()); -} - -bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("input,i",po::value<string>(),"Corpus of source language sentences") - ("weights,w",po::value<string>(),"Input feature weights file") - ("decoder_config,c",po::value<string>(), "cdec.ini file"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || !conf->count("input") || !conf->count("decoder_config")) { - cerr << dcmdline_options << endl; - return false; - } - return true; -} - -void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c, vector<int>* order) { - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - int id = 0; - while(in) { - getline(in, line); - if (!in) break; - if (id % size == rank) { - c->push_back(line); - order->push_back(id); - } - ++id; - } -} - -static const double kMINUS_EPSILON = -1e-6; - -struct TrainingObserver : public DecoderObserver { - void Reset() { - acc_exp.clear(); - total_complete = 0; - } - - virtual void NotifyDecodingStart(const SentenceMetadata& smeta) { - cur_model_exp.clear(); - state = 1; - } - - // compute model expectations, denominator of objective - virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { - assert(state == 1); - state = 2; - const prob_t z = InsideOutside<prob_t, - EdgeProb, - SparseVector<prob_t>, - EdgeFeaturesAndProbWeightFunction>(*hg, &cur_model_exp); - cur_model_exp /= z; - acc_exp += cur_model_exp; - } - - virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) { - cerr << "IGNORING ALIGNMENT FOREST!\n"; - } - - virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) { - if (state == 2) { - ++total_complete; - } - } - - void GetExpectations(SparseVector<double>* g) const { - g->clear(); - for (SparseVector<prob_t>::const_iterator it = acc_exp.begin(); it != acc_exp.end(); ++it) - g->set_value(it->first, it->second); - } - - int total_complete; - SparseVector<prob_t> cur_model_exp; - SparseVector<prob_t> acc_exp; - int state; -}; - -#ifdef HAVE_MPI -namespace boost { namespace mpi { - template<> - struct is_commutative<std::plus<SparseVector<double> >, SparseVector<double> > - : mpl::true_ { }; -} } // end namespace boost::mpi -#endif - -int main(int argc, char** argv) { -#ifdef HAVE_MPI - mpi::environment env(argc, argv); - mpi::communicator world; - const int size = world.size(); - const int rank = world.rank(); -#else - const int size = 1; - const int rank = 0; -#endif - if (size > 1) SetSilent(true); // turn off verbose decoder output - register_feature_functions(); - - po::variables_map conf; - if (!InitCommandLine(argc, argv, &conf)) - return 1; - - // load initial weights - Weights weights; - if (conf.count("weights")) - weights.InitFromFile(conf["weights"].as<string>()); - - vector<string> corpus; - vector<int> ids; - ReadTrainingCorpus(conf["input"].as<string>(), rank, size, &corpus, &ids); - assert(corpus.size() > 0); - - vector<string> cdec_ini; - ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini); - istringstream ini; - StoreConfig(cdec_ini, &ini); - Decoder decoder(&ini); - if (decoder.GetConf()["input"].as<string>() != "-") { - cerr << "cdec.ini must not set an input file\n"; - return 1; - } - - SparseVector<double> x; - weights.InitSparseVector(&x); - TrainingObserver observer; - - weights.InitFromVector(x); - vector<double> lambdas; - weights.InitVector(&lambdas); - decoder.SetWeights(lambdas); - observer.Reset(); - for (unsigned i = 0; i < corpus.size(); ++i) { - int id = ids[i]; - decoder.SetId(id); - decoder.Decode(corpus[i], &observer); - } - SparseVector<double> local_exps, exps; - observer.GetExpectations(&local_exps); -#ifdef HAVE_MPI - reduce(world, local_exps, exps, std::plus<SparseVector<double> >(), 0); -#else - exps.swap(local_exps); -#endif - - weights.InitFromVector(exps); - weights.InitVector(&lambdas); - ShowFeatures(lambdas); - - return 0; -} diff --git a/training/lbl_model.cc b/training/lbl_model.cc deleted file mode 100644 index a46ce33c..00000000 --- a/training/lbl_model.cc +++ /dev/null @@ -1,421 +0,0 @@ -#include <iostream> - -#include "config.h" -#ifndef HAVE_EIGEN - int main() { std::cerr << "Please rebuild with --with-eigen PATH\n"; return 1; } -#else - -#include <cstdlib> -#include <algorithm> -#include <cmath> -#include <set> -#include <cstring> // memset -#include <ctime> - -#ifdef HAVE_MPI -#include <boost/mpi/timer.hpp> -#include <boost/mpi.hpp> -#include <boost/archive/text_oarchive.hpp> -namespace mpi = boost::mpi; -#endif -#include <boost/math/special_functions/fpclassify.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> -#include <Eigen/Dense> - -#include "corpus_tools.h" -#include "optimize.h" -#include "array2d.h" -#include "m.h" -#include "lattice.h" -#include "stringlib.h" -#include "filelib.h" -#include "tdict.h" - -namespace po = boost::program_options; -using namespace std; - -#define kDIMENSIONS 10 -typedef Eigen::Matrix<double, kDIMENSIONS, 1> RVector; -typedef Eigen::Matrix<double, 1, kDIMENSIONS> RTVector; -typedef Eigen::Matrix<double, kDIMENSIONS, kDIMENSIONS> TMatrix; -vector<RVector> r_src, r_trg; - -#if HAVE_MPI -namespace boost { -namespace serialization { - -template<class Archive> -void serialize(Archive & ar, RVector & v, const unsigned int version) { - for (unsigned i = 0; i < kDIMENSIONS; ++i) - ar & v[i]; -} - -} // namespace serialization -} // namespace boost -#endif - -bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("input,i",po::value<string>(),"Input file") - ("iterations,I",po::value<unsigned>()->default_value(1000),"Number of iterations of training") - ("regularization_strength,C",po::value<double>()->default_value(0.1),"L2 regularization strength (0 for no regularization)") - ("eta", po::value<double>()->default_value(0.1f), "Eta for SGD") - ("source_embeddings,f", po::value<string>(), "File containing source embeddings (if unset, random vectors will be used)") - ("target_embeddings,e", po::value<string>(), "File containing target embeddings (if unset, random vectors will be used)") - ("random_seed,s", po::value<unsigned>(), "Random seed") - ("diagonal_tension,T", po::value<double>()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (0 = uniform, >0 sharpens)") - ("testset,x", po::value<string>(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (argc < 2 || conf->count("help")) { - cerr << "Usage " << argv[0] << " [OPTIONS] -i corpus.fr-en\n"; - cerr << dcmdline_options << endl; - return false; - } - return true; -} - -void Normalize(RVector* v) { - double norm = v->norm(); - assert(norm > 0.0f); - *v /= norm; -} - -void Flatten(const TMatrix& m, vector<double>* v) { - unsigned c = 0; - v->resize(kDIMENSIONS * kDIMENSIONS); - for (unsigned i = 0; i < kDIMENSIONS; ++i) - for (unsigned j = 0; j < kDIMENSIONS; ++j) { - assert(boost::math::isfinite(m(i, j))); - (*v)[c++] = m(i,j); - } -} - -void Unflatten(const vector<double>& v, TMatrix* m) { - unsigned c = 0; - for (unsigned i = 0; i < kDIMENSIONS; ++i) - for (unsigned j = 0; j < kDIMENSIONS; ++j) { - assert(boost::math::isfinite(v[c])); - (*m)(i, j) = v[c++]; - } -} - -double ApplyRegularization(const double C, - const vector<double>& weights, - vector<double>* g) { - assert(weights.size() == g->size()); - double reg = 0; - for (size_t i = 0; i < weights.size(); ++i) { - const double& w_i = weights[i]; - double& g_i = (*g)[i]; - reg += C * w_i * w_i; - g_i += 2 * C * w_i; - } - return reg; -} - -void LoadEmbeddings(const string& filename, vector<RVector>* pv) { - vector<RVector>& v = *pv; - cerr << "Reading embeddings from " << filename << " ...\n"; - ReadFile rf(filename); - istream& in = *rf.stream(); - string line; - unsigned lc = 0; - while(getline(in, line)) { - ++lc; - size_t cur = line.find(' '); - if (cur == string::npos || cur == 0) { - cerr << "Parse error reading line " << lc << ":\n" << line << endl; - abort(); - } - WordID w = TD::Convert(line.substr(0, cur)); - if (w >= v.size()) continue; - RVector& curv = v[w]; - line[cur] = 0; - size_t start = cur + 1; - cur = start + 1; - size_t c = 0; - while(cur < line.size()) { - if (line[cur] == ' ') { - line[cur] = 0; - curv[c++] = strtod(&line[start], NULL); - start = cur + 1; - cur = start; - if (c == kDIMENSIONS) break; - } - ++cur; - } - if (c < kDIMENSIONS && cur != start) { - if (cur < line.size()) line[cur] = 0; - curv[c++] = strtod(&line[start], NULL); - } - if (c != kDIMENSIONS) { - static bool first = true; - if (first) { - cerr << " read " << c << " dimensions from embedding file, but built with " << kDIMENSIONS << " (filling in with random values)\n"; - first = false; - } - for (; c < kDIMENSIONS; ++c) curv[c] = rand(); - } - if (c == kDIMENSIONS && cur != line.size()) { - static bool first = true; - if (first) { - cerr << " embedding file contains more dimensions than configured with, truncating.\n"; - first = false; - } - } - } -} - -int main(int argc, char** argv) { -#ifdef HAVE_MPI - std::cerr << "**MPI enabled.\n"; - mpi::environment env(argc, argv); - mpi::communicator world; - const int size = world.size(); - const int rank = world.rank(); -#else - std::cerr << "**MPI disabled.\n"; - const int rank = 0; - const int size = 1; -#endif - po::variables_map conf; - if (!InitCommandLine(argc, argv, &conf)) return 1; - const string fname = conf["input"].as<string>(); - const double reg_strength = conf["regularization_strength"].as<double>(); - const bool has_l2 = reg_strength; - assert(reg_strength >= 0.0f); - const int ITERATIONS = conf["iterations"].as<unsigned>(); - const double eta = conf["eta"].as<double>(); - const double diagonal_tension = conf["diagonal_tension"].as<double>(); - bool SGD = false; - if (diagonal_tension < 0.0) { - cerr << "Invalid value for diagonal_tension: must be >= 0\n"; - return 1; - } - string testset; - if (conf.count("testset")) testset = conf["testset"].as<string>(); - - unsigned lc = 0; - vector<double> unnormed_a_i; - bool flag = false; - vector<vector<WordID> > srcs, trgs; - vector<WordID> vocab_e; - { - set<WordID> svocab_e, svocab_f; - CorpusTools::ReadFromFile(fname, &srcs, NULL, &trgs, &svocab_e, rank, size); - copy(svocab_e.begin(), svocab_e.end(), back_inserter(vocab_e)); - } - cerr << "Number of target word types: " << vocab_e.size() << endl; - const double num_examples = lc; - - boost::shared_ptr<LBFGSOptimizer> lbfgs; - if (rank == 0) - lbfgs.reset(new LBFGSOptimizer(kDIMENSIONS * kDIMENSIONS, 100)); - r_trg.resize(TD::NumWords() + 1); - r_src.resize(TD::NumWords() + 1); - vector<set<unsigned> > trg_pos(TD::NumWords() + 1); - - if (conf.count("random_seed")) { - srand(conf["random_seed"].as<unsigned>()); - } else { - unsigned seed = time(NULL) + rank * 100; - cerr << "Random seed: " << seed << endl; - srand(seed); - } - - TMatrix t = TMatrix::Zero(); - if (rank == 0) { - t = TMatrix::Random() / 50.0; - for (unsigned i = 1; i < r_trg.size(); ++i) { - r_trg[i] = RVector::Random(); - r_src[i] = RVector::Random(); - } - if (conf.count("source_embeddings")) - LoadEmbeddings(conf["source_embeddings"].as<string>(), &r_src); - if (conf.count("target_embeddings")) - LoadEmbeddings(conf["target_embeddings"].as<string>(), &r_trg); - } - - // do optimization - TMatrix g = TMatrix::Zero(); - vector<TMatrix> exp_src; - vector<double> z_src; - vector<double> flat_g, flat_t, rcv_grad; - Flatten(t, &flat_t); - bool converged = false; -#if HAVE_MPI - mpi::broadcast(world, &flat_t[0], flat_t.size(), 0); - mpi::broadcast(world, r_trg, 0); - mpi::broadcast(world, r_src, 0); -#endif - cerr << "rank=" << rank << ": " << r_trg[0][4] << endl; - for (int iter = 0; !converged && iter < ITERATIONS; ++iter) { - if (rank == 0) cerr << "ITERATION " << (iter + 1) << endl; - Unflatten(flat_t, &t); - double likelihood = 0; - double denom = 0.0; - lc = 0; - flag = false; - g *= 0; - for (unsigned i = 0; i < srcs.size(); ++i) { - const vector<WordID>& src = srcs[i]; - const vector<WordID>& trg = trgs[i]; - ++lc; - if (rank == 0 && lc % 1000 == 0) { cerr << '.'; flag = true; } - if (rank == 0 && lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; } - denom += trg.size(); - - exp_src.clear(); exp_src.resize(src.size(), TMatrix::Zero()); - z_src.clear(); z_src.resize(src.size(), 0.0); - Array2D<TMatrix> exp_refs(src.size(), trg.size(), TMatrix::Zero()); - Array2D<double> z_refs(src.size(), trg.size(), 0.0); - for (unsigned j = 0; j < trg.size(); ++j) - trg_pos[trg[j]].insert(j); - - for (unsigned i = 0; i < src.size(); ++i) { - const RVector& r_s = r_src[src[i]]; - const RTVector pred = r_s.transpose() * t; - TMatrix& exp_m = exp_src[i]; - double& z = z_src[i]; - for (unsigned k = 0; k < vocab_e.size(); ++k) { - const WordID v_k = vocab_e[k]; - const RVector& r_t = r_trg[v_k]; - const double dot_prod = pred * r_t; - const double u = exp(dot_prod); - z += u; - const TMatrix v = r_s * r_t.transpose() * u; - exp_m += v; - set<unsigned>& ref_locs = trg_pos[v_k]; - if (!ref_locs.empty()) { - for (set<unsigned>::iterator it = ref_locs.begin(); it != ref_locs.end(); ++it) { - TMatrix& exp_ref_ij = exp_refs(i, *it); - double& z_ref_ij = z_refs(i, *it); - z_ref_ij += u; - exp_ref_ij += v; - } - } - } - } - for (unsigned j = 0; j < trg.size(); ++j) - trg_pos[trg[j]].clear(); - - // model expectations for a single target generation with - // uniform alignment prior - // TODO: when using a non-uniform alignment, m_exp will be - // a function of j (below) - double m_z = 0; - TMatrix m_exp = TMatrix::Zero(); - for (unsigned i = 0; i < src.size(); ++i) { - m_exp += exp_src[i]; - m_z += z_src[i]; - } - m_exp /= m_z; - - Array2D<bool> al(src.size(), trg.size(), false); - for (unsigned j = 0; j < trg.size(); ++j) { - double ref_z = 0; - TMatrix ref_exp = TMatrix::Zero(); - int max_i = 0; - double max_s = -9999999; - for (unsigned i = 0; i < src.size(); ++i) { - ref_exp += exp_refs(i, j); - ref_z += z_refs(i, j); - if (log(z_refs(i, j)) > max_s) { - max_s = log(z_refs(i, j)); - max_i = i; - } - // TODO handle alignment prob - } - if (ref_z <= 0) { - cerr << "TRG=" << TD::Convert(trg[j]) << endl; - cerr << " LINE=" << lc << " (RANK=" << rank << "/" << size << ")" << endl; - cerr << " REF_EXP=\n" << ref_exp << endl; - cerr << " M_EXP=\n" << m_exp << endl; - abort(); - } - al(max_i, j) = true; - ref_exp /= ref_z; - g += m_exp - ref_exp; - likelihood += log(ref_z) - log(m_z); - if (SGD) { - t -= g * eta / num_examples; - g *= 0; - } - } - - if (rank == 0 && (iter == (ITERATIONS - 1) || lc < 12)) { cerr << al << endl; } - } - if (flag && rank == 0) { cerr << endl; } - - double obj = 0; - if (!SGD) { - Flatten(g, &flat_g); - obj = -likelihood; -#if HAVE_MPI - rcv_grad.resize(flat_g.size(), 0.0); - mpi::reduce(world, &flat_g[0], flat_g.size(), &rcv_grad[0], plus<double>(), 0); - swap(flat_g, rcv_grad); - rcv_grad.clear(); - - double to = 0; - mpi::reduce(world, obj, to, plus<double>(), 0); - obj = to; - double tlh = 0; - mpi::reduce(world, likelihood, tlh, plus<double>(), 0); - likelihood = tlh; - double td = 0; - mpi::reduce(world, denom, td, plus<double>(), 0); - denom = td; -#endif - } - - if (rank == 0) { - double gn = 0; - for (unsigned i = 0; i < flat_g.size(); ++i) - gn += flat_g[i]*flat_g[i]; - const double base2_likelihood = likelihood / log(2); - cerr << " log_e likelihood: " << likelihood << endl; - cerr << " log_2 likelihood: " << base2_likelihood << endl; - cerr << " cross entropy: " << (-base2_likelihood / denom) << endl; - cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl; - cerr << " gradient norm: " << sqrt(gn) << endl; - if (!SGD) { - if (has_l2) { - const double r = ApplyRegularization(reg_strength, - flat_t, - &flat_g); - obj += r; - cerr << " regularization: " << r << endl; - } - lbfgs->Optimize(obj, flat_g, &flat_t); - converged = (lbfgs->HasConverged()); - } - } -#ifdef HAVE_MPI - mpi::broadcast(world, &flat_t[0], flat_t.size(), 0); - mpi::broadcast(world, converged, 0); -#endif - } - if (rank == 0) - cerr << "TRANSLATION MATRIX:" << endl << t << endl; - return 0; -} - -#endif - diff --git a/training/minrisk/Makefile.am b/training/minrisk/Makefile.am new file mode 100644 index 00000000..a15e821e --- /dev/null +++ b/training/minrisk/Makefile.am @@ -0,0 +1,6 @@ +bin_PROGRAMS = minrisk_optimize + +minrisk_optimize_SOURCES = minrisk_optimize.cc +minrisk_optimize_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/training/liblbfgs/liblbfgs.a -lz + +AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training -I$(top_srcdir)/training/utils diff --git a/minrisk/minrisk.pl b/training/minrisk/minrisk.pl index d05b9595..0f8bacd0 100755 --- a/minrisk/minrisk.pl +++ b/training/minrisk/minrisk.pl @@ -2,7 +2,7 @@ use strict; my @ORIG_ARGV=@ARGV; use Cwd qw(getcwd); -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; } # Skip local config (used for distributing jobs) if we're running in local-only mode use LocalConfig; @@ -12,27 +12,27 @@ use POSIX ":sys_wait_h"; my $QSUB_CMD = qsub_args(mert_memory()); my $default_jobs = env_default_jobs(); -my $VEST_DIR="$SCRIPT_DIR/../dpmert"; -require "$VEST_DIR/libcall.pl"; +my $UTILS_DIR="$SCRIPT_DIR/../utils"; +require "$UTILS_DIR/libcall.pl"; # Default settings my $srcFile; my $refFiles; my $bin_dir = $SCRIPT_DIR; die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; -my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; my $MAPINPUT = "$bin_dir/minrisk_generate_input.pl"; my $MAPPER = "$bin_dir/minrisk_optimize"; -my $parallelize = "$VEST_DIR/parallelize.pl"; -my $libcall = "$VEST_DIR/libcall.pl"; -my $sentserver = "$VEST_DIR/sentserver"; -my $sentclient = "$VEST_DIR/sentclient"; -my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; +my $parallelize = "$UTILS_DIR/parallelize.pl"; +my $libcall = "$UTILS_DIR/libcall.pl"; +my $sentserver = "$UTILS_DIR/sentserver"; +my $sentclient = "$UTILS_DIR/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm"; my $SCORER = $FAST_SCORE; die "Can't find $MAPPER" unless -x $MAPPER; -my $cdec = "$bin_dir/../decoder/cdec"; +my $cdec = "$bin_dir/../../decoder/cdec"; die "Can't find decoder in $cdec" unless -x $cdec; die "Can't find $parallelize" unless -x $parallelize; die "Can't find $libcall" unless -e $libcall; diff --git a/minrisk/minrisk_generate_input.pl b/training/minrisk/minrisk_generate_input.pl index b30fc4fd..b30fc4fd 100755 --- a/minrisk/minrisk_generate_input.pl +++ b/training/minrisk/minrisk_generate_input.pl diff --git a/minrisk/minrisk_optimize.cc b/training/minrisk/minrisk_optimize.cc index da8b5260..da8b5260 100644 --- a/minrisk/minrisk_optimize.cc +++ b/training/minrisk/minrisk_optimize.cc diff --git a/mira/Makefile.am b/training/mira/Makefile.am index 3f8f17cd..ae609ede 100644 --- a/mira/Makefile.am +++ b/training/mira/Makefile.am @@ -1,6 +1,6 @@ bin_PROGRAMS = kbest_mira kbest_mira_SOURCES = kbest_mira.cc -kbest_mira_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz +kbest_mira_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/search/libksearch.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/mira/kbest_mira.cc b/training/mira/kbest_mira.cc index 8b7993dd..8b7993dd 100644 --- a/mira/kbest_mira.cc +++ b/training/mira/kbest_mira.cc diff --git a/training/mpi_em_optimize.cc b/training/mpi_em_optimize.cc deleted file mode 100644 index 48683b15..00000000 --- a/training/mpi_em_optimize.cc +++ /dev/null @@ -1,389 +0,0 @@ -#include <sstream> -#include <iostream> -#include <vector> -#include <cassert> -#include <cmath> - -#ifdef HAVE_MPI -#include <mpi.h> -#endif - -#include <boost/shared_ptr.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "verbose.h" -#include "hg.h" -#include "prob.h" -#include "inside_outside.h" -#include "ff_register.h" -#include "decoder.h" -#include "filelib.h" -#include "optimize.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" - -using namespace std; -using boost::shared_ptr; -namespace po = boost::program_options; - -void SanityCheck(const vector<double>& w) { - for (int i = 0; i < w.size(); ++i) { - assert(!isnan(w[i])); - assert(!isinf(w[i])); - } -} - -struct FComp { - const vector<double>& w_; - FComp(const vector<double>& w) : w_(w) {} - bool operator()(int a, int b) const { - return fabs(w_[a]) > fabs(w_[b]); - } -}; - -void ShowLargestFeatures(const vector<double>& w) { - vector<int> fnums(w.size()); - for (int i = 0; i < w.size(); ++i) - fnums[i] = i; - vector<int>::iterator mid = fnums.begin(); - mid += (w.size() > 10 ? 10 : w.size()); - partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); - cerr << "TOP FEATURES:"; - for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) { - cerr << ' ' << FD::Convert(*i) << '=' << w[*i]; - } - cerr << endl; -} - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("input_weights,w",po::value<string>(),"Input feature weights file") - ("training_data,t",po::value<string>(),"Training data") - ("decoder_config,c",po::value<string>(),"Decoder configuration file") - ("output_weights,o",po::value<string>()->default_value("-"),"Output feature weights file"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || !(conf->count("training_data")) || !conf->count("decoder_config")) { - cerr << dcmdline_options << endl; -#ifdef HAVE_MPI - MPI::Finalize(); -#endif - exit(1); - } -} - -void ReadTrainingCorpus(const string& fname, int rank, int size, vector<string>* c) { - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - int lc = 0; - while(in) { - getline(in, line); - if (!in) break; - if (lc % size == rank) c->push_back(line); - ++lc; - } -} - -static const double kMINUS_EPSILON = -1e-6; - -struct TrainingObserver : public DecoderObserver { - void Reset() { - total_complete = 0; - cur_obj = 0; - tot_obj = 0; - tot.clear(); - } - - void SetLocalGradientAndObjective(SparseVector<double>* g, double* o) const { - *o = tot_obj; - *g = tot; - } - - virtual void NotifyDecodingStart(const SentenceMetadata& smeta) { - cur_obj = 0; - state = 1; - } - - void ExtractExpectedCounts(Hypergraph* hg) { - vector<prob_t> posts; - cur.clear(); - const prob_t z = hg->ComputeEdgePosteriors(1.0, &posts); - cur_obj = log(z); - for (int i = 0; i < posts.size(); ++i) { - const SparseVector<double>& efeats = hg->edges_[i].feature_values_; - const double post = static_cast<double>(posts[i] / z); - for (SparseVector<double>::const_iterator j = efeats.begin(); j != efeats.end(); ++j) - cur.add_value(j->first, post); - } - } - - // compute model expectations, denominator of objective - virtual void NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) { - assert(state == 1); - state = 2; - ExtractExpectedCounts(hg); - } - - // replace translation forest, since we're doing EM training (we don't know which) - virtual void NotifyAlignmentForest(const SentenceMetadata& smeta, Hypergraph* hg) { - assert(state == 2); - state = 3; - ExtractExpectedCounts(hg); - } - - virtual void NotifyDecodingComplete(const SentenceMetadata& smeta) { - ++total_complete; - tot_obj += cur_obj; - tot += cur; - } - - int total_complete; - double cur_obj; - double tot_obj; - SparseVector<double> cur, tot; - int state; -}; - -void ReadConfig(const string& ini, vector<string>* out) { - ReadFile rf(ini); - istream& in = *rf.stream(); - while(in) { - string line; - getline(in, line); - if (!in) continue; - out->push_back(line); - } -} - -void StoreConfig(const vector<string>& cfg, istringstream* o) { - ostringstream os; - for (int i = 0; i < cfg.size(); ++i) { os << cfg[i] << endl; } - o->str(os.str()); -} - -struct OptimizableMultinomialFamily { - struct CPD { - CPD() : z() {} - double z; - map<WordID, double> c2counts; - }; - map<WordID, CPD> counts; - double Value(WordID conditioning, WordID generated) const { - map<WordID, CPD>::const_iterator it = counts.find(conditioning); - assert(it != counts.end()); - map<WordID,double>::const_iterator r = it->second.c2counts.find(generated); - if (r == it->second.c2counts.end()) return 0; - return r->second; - } - void Increment(WordID conditioning, WordID generated, double count) { - CPD& cc = counts[conditioning]; - cc.z += count; - cc.c2counts[generated] += count; - } - void Optimize() { - for (map<WordID, CPD>::iterator i = counts.begin(); i != counts.end(); ++i) { - CPD& cpd = i->second; - for (map<WordID, double>::iterator j = cpd.c2counts.begin(); j != cpd.c2counts.end(); ++j) { - j->second /= cpd.z; - // cerr << "P(" << TD::Convert(j->first) << " | " << TD::Convert(i->first) << " ) = " << j->second << endl; - } - } - } - void Clear() { - counts.clear(); - } -}; - -struct CountManager { - CountManager(size_t num_types) : oms_(num_types) {} - virtual ~CountManager(); - virtual void AddCounts(const SparseVector<double>& c) = 0; - void Optimize(SparseVector<double>* weights) { - for (int i = 0; i < oms_.size(); ++i) { - oms_[i].Optimize(); - } - GetOptimalValues(weights); - for (int i = 0; i < oms_.size(); ++i) { - oms_[i].Clear(); - } - } - virtual void GetOptimalValues(SparseVector<double>* wv) const = 0; - vector<OptimizableMultinomialFamily> oms_; -}; -CountManager::~CountManager() {} - -struct TaggerCountManager : public CountManager { - // 0 = transitions, 2 = emissions - TaggerCountManager() : CountManager(2) {} - void AddCounts(const SparseVector<double>& c); - void GetOptimalValues(SparseVector<double>* wv) const { - for (set<int>::const_iterator it = fids_.begin(); it != fids_.end(); ++it) { - int ftype; - WordID cond, gen; - bool is_optimized = TaggerCountManager::GetFeature(*it, &ftype, &cond, &gen); - assert(is_optimized); - wv->set_value(*it, log(oms_[ftype].Value(cond, gen))); - } - } - // Id:0:a=1 Bi:a_b=1 Bi:b_c=1 Bi:c_d=1 Uni:a=1 Uni:b=1 Uni:c=1 Uni:d=1 Id:1:b=1 Bi:BOS_a=1 Id:2:c=1 - static bool GetFeature(const int fid, int* feature_type, WordID* cond, WordID* gen) { - const string& feat = FD::Convert(fid); - if (feat.size() > 5 && feat[0] == 'I' && feat[1] == 'd' && feat[2] == ':') { - // emission - const size_t p = feat.rfind(':'); - assert(p != string::npos); - *cond = TD::Convert(feat.substr(p+1)); - *gen = TD::Convert(feat.substr(3, p - 3)); - *feature_type = 1; - return true; - } else if (feat[0] == 'B' && feat.size() > 5 && feat[2] == ':' && feat[1] == 'i') { - // transition - const size_t p = feat.rfind('_'); - assert(p != string::npos); - *gen = TD::Convert(feat.substr(p+1)); - *cond = TD::Convert(feat.substr(3, p - 3)); - *feature_type = 0; - return true; - } else if (feat[0] == 'U' && feat.size() > 4 && feat[1] == 'n' && feat[2] == 'i' && feat[3] == ':') { - // ignore - return false; - } else { - cerr << "Don't know how to deal with feature of type: " << feat << endl; - abort(); - } - } - set<int> fids_; -}; - -void TaggerCountManager::AddCounts(const SparseVector<double>& c) { - for (SparseVector<double>::const_iterator it = c.begin(); it != c.end(); ++it) { - const double& val = it->second; - int ftype; - WordID cond, gen; - if (GetFeature(it->first, &ftype, &cond, &gen)) { - oms_[ftype].Increment(cond, gen, val); - fids_.insert(it->first); - } - } -} - -int main(int argc, char** argv) { -#ifdef HAVE_MPI - MPI::Init(argc, argv); - const int size = MPI::COMM_WORLD.Get_size(); - const int rank = MPI::COMM_WORLD.Get_rank(); -#else - const int size = 1; - const int rank = 0; -#endif - SetSilent(true); // turn off verbose decoder output - register_feature_functions(); - - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - - TaggerCountManager tcm; - - // load cdec.ini and set up decoder - vector<string> cdec_ini; - ReadConfig(conf["decoder_config"].as<string>(), &cdec_ini); - istringstream ini; - StoreConfig(cdec_ini, &ini); - if (rank == 0) cerr << "Loading grammar...\n"; - Decoder* decoder = new Decoder(&ini); - if (decoder->GetConf()["input"].as<string>() != "-") { - cerr << "cdec.ini must not set an input file\n"; -#ifdef HAVE_MPI - MPI::COMM_WORLD.Abort(1); -#endif - } - if (rank == 0) cerr << "Done loading grammar!\n"; - Weights w; - if (conf.count("input_weights")) - w.InitFromFile(conf["input_weights"].as<string>()); - - double objective = 0; - bool converged = false; - - vector<double> lambdas; - w.InitVector(&lambdas); - vector<string> corpus; - ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus); - assert(corpus.size() > 0); - - int iteration = 0; - TrainingObserver observer; - while (!converged) { - ++iteration; - observer.Reset(); - if (rank == 0) { - cerr << "Starting decoding... (~" << corpus.size() << " sentences / proc)\n"; - } - decoder->SetWeights(lambdas); - for (int i = 0; i < corpus.size(); ++i) - decoder->Decode(corpus[i], &observer); - - SparseVector<double> x; - observer.SetLocalGradientAndObjective(&x, &objective); - cerr << "COUNTS = " << x << endl; - cerr << " OBJ = " << objective << endl; - tcm.AddCounts(x); - -#if 0 -#ifdef HAVE_MPI - MPI::COMM_WORLD.Reduce(const_cast<double*>(&gradient.data()[0]), &rcv_grad[0], num_feats, MPI::DOUBLE, MPI::SUM, 0); - MPI::COMM_WORLD.Reduce(&objective, &to, 1, MPI::DOUBLE, MPI::SUM, 0); - swap(gradient, rcv_grad); - objective = to; -#endif -#endif - - if (rank == 0) { - SparseVector<double> wsv; - tcm.Optimize(&wsv); - - w.InitFromVector(wsv); - w.InitVector(&lambdas); - - ShowLargestFeatures(lambdas); - - converged = iteration > 100; - if (converged) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; } - - string fname = "weights.cur.gz"; - if (converged) { fname = "weights.final.gz"; } - ostringstream vv; - vv << "Objective = " << objective << " (ITERATION=" << iteration << ")"; - const string svv = vv.str(); - w.WriteToFile(fname, true, &svv); - } // rank == 0 - int cint = converged; -#ifdef HAVE_MPI - MPI::COMM_WORLD.Bcast(const_cast<double*>(&lambdas.data()[0]), num_feats, MPI::DOUBLE, 0); - MPI::COMM_WORLD.Bcast(&cint, 1, MPI::INT, 0); - MPI::COMM_WORLD.Barrier(); -#endif - converged = cint; - } -#ifdef HAVE_MPI - MPI::Finalize(); -#endif - return 0; -} diff --git a/training/mr_em_adapted_reduce.cc b/training/mr_em_adapted_reduce.cc deleted file mode 100644 index f65b5440..00000000 --- a/training/mr_em_adapted_reduce.cc +++ /dev/null @@ -1,173 +0,0 @@ -#include <iostream> -#include <vector> -#include <cassert> -#include <cmath> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "filelib.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" -#include "m.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("optimization_method,m", po::value<string>()->default_value("em"), "Optimization method (em, vb)") - ("input_format,f",po::value<string>()->default_value("b64"),"Encoding of the input (b64 or text)"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -double NoZero(const double& x) { - if (x) return x; - return 1e-35; -} - -void Maximize(const bool use_vb, - const double& alpha, - const int total_event_types, - SparseVector<double>* pc) { - const SparseVector<double>& counts = *pc; - - if (use_vb) - assert(total_event_types >= counts.size()); - - double tot = 0; - for (SparseVector<double>::const_iterator it = counts.begin(); - it != counts.end(); ++it) - tot += it->second; -// cerr << " = " << tot << endl; - assert(tot > 0.0); - double ltot = log(tot); - if (use_vb) - ltot = Md::digamma(tot + total_event_types * alpha); - for (SparseVector<double>::const_iterator it = counts.begin(); - it != counts.end(); ++it) { - if (use_vb) { - pc->set_value(it->first, NoZero(Md::digamma(it->second + alpha) - ltot)); - } else { - pc->set_value(it->first, NoZero(log(it->second) - ltot)); - } - } -#if 0 - if (counts.size() < 50) { - for (SparseVector<double>::const_iterator it = counts.begin(); - it != counts.end(); ++it) { - cerr << " p(" << FD::Convert(it->first) << ")=" << exp(it->second); - } - cerr << endl; - } -#endif -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - - const bool use_b64 = conf["input_format"].as<string>() == "b64"; - const bool use_vb = conf["optimization_method"].as<string>() == "vb"; - const double alpha = 1e-09; - if (use_vb) - cerr << "Using variational Bayes, make sure alphas are set\n"; - - const string s_obj = "**OBJ**"; - // E-step - string cur_key = ""; - SparseVector<double> acc; - double logprob = 0; - while(cin) { - string line; - getline(cin, line); - if (line.empty()) continue; - int feat; - double val; - size_t i = line.find("\t"); - const string key = line.substr(0, i); - assert(i != string::npos); - ++i; - if (key != cur_key) { - if (cur_key.size() > 0) { - // TODO shouldn't be num_active, should be total number - // of events - Maximize(use_vb, alpha, acc.size(), &acc); - cout << cur_key << '\t'; - if (use_b64) - B64::Encode(0.0, acc, &cout); - else - cout << acc; - cout << endl; - acc.clear(); - } - cur_key = key; - } - if (use_b64) { - SparseVector<double> g; - double obj; - if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) { - cerr << "B64 decoder returned error, skipping!\n"; - continue; - } - logprob += obj; - acc += g; - } else { // text encoding - your counts will not be accurate! - while (i < line.size()) { - size_t start = i; - while (line[i] != '=' && i < line.size()) ++i; - if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; } - string fname = line.substr(start, i - start); - if (fname == s_obj) { - feat = -1; - } else { - feat = FD::Convert(line.substr(start, i - start)); - } - ++i; - start = i; - while (line[i] != ';' && i < line.size()) ++i; - if (i - start == 0) continue; - val = atof(line.substr(start, i - start).c_str()); - ++i; - if (feat == -1) { - logprob += val; - } else { - acc.add_value(feat, val); - } - } - } - } - // TODO shouldn't be num_active, should be total number - // of events - Maximize(use_vb, alpha, acc.size(), &acc); - cout << cur_key << '\t'; - if (use_b64) - B64::Encode(0.0, acc, &cout); - else - cout << acc; - cout << endl << flush; - - cerr << "LOGPROB: " << logprob << endl; - - return 0; -} diff --git a/training/mr_em_map_adapter.cc b/training/mr_em_map_adapter.cc deleted file mode 100644 index ead4598d..00000000 --- a/training/mr_em_map_adapter.cc +++ /dev/null @@ -1,160 +0,0 @@ -#include <iostream> -#include <fstream> -#include <cassert> -#include <cmath> - -#include <boost/utility.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> -#include "boost/tuple/tuple.hpp" - -#include "fdict.h" -#include "sparse_vector.h" - -using namespace std; -namespace po = boost::program_options; - -// useful for EM models parameterized by a bunch of multinomials -// this converts event counts (returned from cdec as feature expectations) -// into different keys and values (which are lists of all the events, -// conditioned on the key) for summing and normalization by a reducer - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("buffer_size,b", po::value<int>()->default_value(1), "Buffer size (in # of counts) before emitting counts") - ("format,f",po::value<string>()->default_value("b64"), "Encoding of the input (b64 or text)"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -struct EventMapper { - int Map(int fid) { - int& cv = map_[fid]; - if (!cv) { - cv = GetConditioningVariable(fid); - } - return cv; - } - void Clear() { map_.clear(); } - protected: - virtual int GetConditioningVariable(int fid) const = 0; - private: - map<int, int> map_; -}; - -struct LexAlignEventMapper : public EventMapper { - protected: - virtual int GetConditioningVariable(int fid) const { - const string& str = FD::Convert(fid); - size_t pos = str.rfind("_"); - if (pos == string::npos || pos == 0 || pos >= str.size() - 1) { - cerr << "Bad feature for EM adapter: " << str << endl; - abort(); - } - return FD::Convert(str.substr(0, pos)); - } -}; - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - - const bool use_b64 = conf["format"].as<string>() == "b64"; - const int buffer_size = conf["buffer_size"].as<int>(); - - const string s_obj = "**OBJ**"; - // 0<TAB>**OBJ**=12.2;Feat1=2.3;Feat2=-0.2; - // 0<TAB>**OBJ**=1.1;Feat1=1.0; - - EventMapper* event_mapper = new LexAlignEventMapper; - map<int, SparseVector<double> > counts; - size_t total = 0; - while(cin) { - string line; - getline(cin, line); - if (line.empty()) continue; - int feat; - double val; - size_t i = line.find("\t"); - assert(i != string::npos); - ++i; - SparseVector<double> g; - double obj = 0; - if (use_b64) { - if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) { - cerr << "B64 decoder returned error, skipping!\n"; - continue; - } - } else { // text encoding - your counts will not be accurate! - while (i < line.size()) { - size_t start = i; - while (line[i] != '=' && i < line.size()) ++i; - if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; } - string fname = line.substr(start, i - start); - if (fname == s_obj) { - feat = -1; - } else { - feat = FD::Convert(line.substr(start, i - start)); - } - ++i; - start = i; - while (line[i] != ';' && i < line.size()) ++i; - if (i - start == 0) continue; - val = atof(line.substr(start, i - start).c_str()); - ++i; - if (feat == -1) { - obj = val; - } else { - g.set_value(feat, val); - } - } - } - //cerr << "OBJ: " << obj << endl; - const SparseVector<double>& cg = g; - for (SparseVector<double>::const_iterator it = cg.begin(); it != cg.end(); ++it) { - const int cond_var = event_mapper->Map(it->first); - SparseVector<double>& cond_counts = counts[cond_var]; - int delta = cond_counts.size(); - cond_counts.add_value(it->first, it->second); - delta = cond_counts.size() - delta; - total += delta; - } - if (total > buffer_size) { - for (map<int, SparseVector<double> >::iterator it = counts.begin(); - it != counts.end(); ++it) { - const SparseVector<double>& cc = it->second; - cout << FD::Convert(it->first) << '\t'; - if (use_b64) { - B64::Encode(0.0, cc, &cout); - } else { - abort(); - } - cout << endl; - } - cout << flush; - total = 0; - counts.clear(); - } - } - - return 0; -} - diff --git a/training/mr_optimize_reduce.cc b/training/mr_optimize_reduce.cc deleted file mode 100644 index d490192f..00000000 --- a/training/mr_optimize_reduce.cc +++ /dev/null @@ -1,231 +0,0 @@ -#include <sstream> -#include <iostream> -#include <fstream> -#include <vector> -#include <cassert> -#include <cmath> - -#include <boost/shared_ptr.hpp> -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "optimize.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" - -using namespace std; -namespace po = boost::program_options; - -void SanityCheck(const vector<double>& w) { - for (int i = 0; i < w.size(); ++i) { - assert(!std::isnan(w[i])); - assert(!std::isinf(w[i])); - } -} - -struct FComp { - const vector<double>& w_; - FComp(const vector<double>& w) : w_(w) {} - bool operator()(int a, int b) const { - return fabs(w_[a]) > fabs(w_[b]); - } -}; - -void ShowLargestFeatures(const vector<double>& w) { - vector<int> fnums(w.size()); - for (int i = 0; i < w.size(); ++i) - fnums[i] = i; - vector<int>::iterator mid = fnums.begin(); - mid += (w.size() > 10 ? 10 : w.size()); - partial_sort(fnums.begin(), mid, fnums.end(), FComp(w)); - cerr << "TOP FEATURES:"; - for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) { - cerr << ' ' << FD::Convert(*i) << '=' << w[*i]; - } - cerr << endl; -} - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("input_weights,i",po::value<string>(),"Input feature weights file") - ("output_weights,o",po::value<string>()->default_value("-"),"Output feature weights file") - ("optimization_method,m", po::value<string>()->default_value("lbfgs"), "Optimization method (sgd, lbfgs, rprop)") - ("state,s",po::value<string>(),"Read (and write if output_state is not set) optimizer state from this state file. In the first iteration, the file should not exist.") - ("input_format,f",po::value<string>()->default_value("b64"),"Encoding of the input (b64 or text)") - ("output_state,S", po::value<string>(), "Output state file (optional override)") - ("correction_buffers,M", po::value<int>()->default_value(10), "Number of gradients for LBFGS to maintain in memory") - ("eta,e", po::value<double>()->default_value(0.1), "Learning rate for SGD (eta)") - ("gaussian_prior,p","Use a Gaussian prior on the weights") - ("means,u", po::value<string>(), "File containing the means for Gaussian prior") - ("sigma_squared", po::value<double>()->default_value(1.0), "Sigma squared term for spherical Gaussian prior"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || !conf->count("input_weights") || !conf->count("state")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - - const bool use_b64 = conf["input_format"].as<string>() == "b64"; - - vector<weight_t> lambdas; - Weights::InitFromFile(conf["input_weights"].as<string>(), &lambdas); - const string s_obj = "**OBJ**"; - int num_feats = FD::NumFeats(); - cerr << "Number of features: " << num_feats << endl; - const bool gaussian_prior = conf.count("gaussian_prior"); - vector<weight_t> means(num_feats, 0); - if (conf.count("means")) { - if (!gaussian_prior) { - cerr << "Don't use --means without --gaussian_prior!\n"; - exit(1); - } - Weights::InitFromFile(conf["means"].as<string>(), &means); - } - boost::shared_ptr<BatchOptimizer> o; - const string omethod = conf["optimization_method"].as<string>(); - if (omethod == "rprop") - o.reset(new RPropOptimizer(num_feats)); // TODO add configuration - else - o.reset(new LBFGSOptimizer(num_feats, conf["correction_buffers"].as<int>())); - cerr << "Optimizer: " << o->Name() << endl; - string state_file = conf["state"].as<string>(); - { - ifstream in(state_file.c_str(), ios::binary); - if (in) - o->Load(&in); - else - cerr << "No state file found, assuming ITERATION 1\n"; - } - - double objective = 0; - vector<double> gradient(num_feats, 0); - // 0<TAB>**OBJ**=12.2;Feat1=2.3;Feat2=-0.2; - // 0<TAB>**OBJ**=1.1;Feat1=1.0; - int total_lines = 0; // TODO - this should be a count of the - // training instances!! - while(cin) { - string line; - getline(cin, line); - if (line.empty()) continue; - ++total_lines; - int feat; - double val; - size_t i = line.find("\t"); - assert(i != string::npos); - ++i; - if (use_b64) { - SparseVector<double> g; - double obj; - if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) { - cerr << "B64 decoder returned error, skipping gradient!\n"; - cerr << " START: " << line.substr(0,line.size() > 200 ? 200 : line.size()) << endl; - if (line.size() > 200) - cerr << " END: " << line.substr(line.size() - 200, 200) << endl; - cout << "-1\tRESTART\n"; - exit(99); - } - objective += obj; - const SparseVector<double>& cg = g; - for (SparseVector<double>::const_iterator it = cg.begin(); it != cg.end(); ++it) { - if (it->first >= num_feats) { - cerr << "Unexpected feature in gradient: " << FD::Convert(it->first) << endl; - abort(); - } - gradient[it->first] -= it->second; - } - } else { // text encoding - your gradients will not be accurate! - while (i < line.size()) { - size_t start = i; - while (line[i] != '=' && i < line.size()) ++i; - if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; } - string fname = line.substr(start, i - start); - if (fname == s_obj) { - feat = -1; - } else { - feat = FD::Convert(line.substr(start, i - start)); - if (feat >= num_feats) { - cerr << "Unexpected feature in gradient: " << line.substr(start, i - start) << endl; - abort(); - } - } - ++i; - start = i; - while (line[i] != ';' && i < line.size()) ++i; - if (i - start == 0) continue; - val = atof(line.substr(start, i - start).c_str()); - ++i; - if (feat == -1) { - objective += val; - } else { - gradient[feat] -= val; - } - } - } - } - - if (gaussian_prior) { - const double sigsq = conf["sigma_squared"].as<double>(); - double norm = 0; - for (int k = 1; k < lambdas.size(); ++k) { - const double& lambda_k = lambdas[k]; - if (lambda_k) { - const double param = (lambda_k - means[k]); - norm += param * param; - gradient[k] += param / sigsq; - } - } - const double reg = norm / (2.0 * sigsq); - cerr << "REGULARIZATION TERM: " << reg << endl; - objective += reg; - } - cerr << "EVALUATION #" << o->EvaluationCount() << " OBJECTIVE: " << objective << endl; - double gnorm = 0; - for (int i = 0; i < gradient.size(); ++i) - gnorm += gradient[i] * gradient[i]; - cerr << " GNORM=" << sqrt(gnorm) << endl; - vector<double> old = lambdas; - int c = 0; - while (old == lambdas) { - ++c; - if (c > 1) { cerr << "Same lambdas, repeating optimization\n"; } - o->Optimize(objective, gradient, &lambdas); - assert(c < 5); - } - old.clear(); - SanityCheck(lambdas); - ShowLargestFeatures(lambdas); - Weights::WriteToFile(conf["output_weights"].as<string>(), lambdas, false); - - const bool conv = o->HasConverged(); - if (conv) { cerr << "OPTIMIZER REPORTS CONVERGENCE!\n"; } - - if (conf.count("output_state")) - state_file = conf["output_state"].as<string>(); - ofstream out(state_file.c_str(), ios::binary); - cerr << "Writing state to: " << state_file << endl; - o->Save(&out); - out.close(); - - cout << o->EvaluationCount() << "\t" << conv << endl; - return 0; -} diff --git a/training/mr_reduce_to_weights.cc b/training/mr_reduce_to_weights.cc deleted file mode 100644 index 16b47720..00000000 --- a/training/mr_reduce_to_weights.cc +++ /dev/null @@ -1,109 +0,0 @@ -#include <iostream> -#include <fstream> -#include <vector> -#include <cassert> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "filelib.h" -#include "fdict.h" -#include "weights.h" -#include "sparse_vector.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("input_format,f",po::value<string>()->default_value("b64"),"Encoding of the input (b64 or text)") - ("input,i",po::value<string>()->default_value("-"),"Read file from") - ("output,o",po::value<string>()->default_value("-"),"Write weights to"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value<string>(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as<string>().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -void WriteWeights(const SparseVector<double>& weights, ostream* out) { - for (SparseVector<double>::const_iterator it = weights.begin(); - it != weights.end(); ++it) { - (*out) << FD::Convert(it->first) << " " << it->second << endl; - } -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - - const bool use_b64 = conf["input_format"].as<string>() == "b64"; - - const string s_obj = "**OBJ**"; - // E-step - ReadFile rf(conf["input"].as<string>()); - istream* in = rf.stream(); - assert(*in); - WriteFile wf(conf["output"].as<string>()); - ostream* out = wf.stream(); - out->precision(17); - while(*in) { - string line; - getline(*in, line); - if (line.empty()) continue; - int feat; - double val; - size_t i = line.find("\t"); - assert(i != string::npos); - ++i; - if (use_b64) { - SparseVector<double> g; - double obj; - if (!B64::Decode(&obj, &g, &line[i], line.size() - i)) { - cerr << "B64 decoder returned error, skipping!\n"; - continue; - } - WriteWeights(g, out); - } else { // text encoding - your counts will not be accurate! - SparseVector<double> weights; - while (i < line.size()) { - size_t start = i; - while (line[i] != '=' && i < line.size()) ++i; - if (i == line.size()) { cerr << "FORMAT ERROR\n"; break; } - string fname = line.substr(start, i - start); - if (fname == s_obj) { - feat = -1; - } else { - feat = FD::Convert(line.substr(start, i - start)); - } - ++i; - start = i; - while (line[i] != ';' && i < line.size()) ++i; - if (i - start == 0) continue; - val = atof(line.substr(start, i - start).c_str()); - ++i; - if (feat != -1) { - weights.set_value(feat, val); - } - } - WriteWeights(weights, out); - } - } - - return 0; -} diff --git a/pro/Makefile.am b/training/pro/Makefile.am index 1e9d46b0..1916b6b2 100644 --- a/pro/Makefile.am +++ b/training/pro/Makefile.am @@ -3,9 +3,9 @@ bin_PROGRAMS = \ mr_pro_reduce mr_pro_map_SOURCES = mr_pro_map.cc -mr_pro_map_LDADD = $(top_srcdir)/training/libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz +mr_pro_map_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz mr_pro_reduce_SOURCES = mr_pro_reduce.cc mr_pro_reduce_LDADD = $(top_srcdir)/training/liblbfgs/liblbfgs.a $(top_srcdir)/utils/libutils.a -lz -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training/utils -I$(top_srcdir)/training diff --git a/pro/mr_pro_generate_mapper_input.pl b/training/pro/mr_pro_generate_mapper_input.pl index b30fc4fd..b30fc4fd 100755 --- a/pro/mr_pro_generate_mapper_input.pl +++ b/training/pro/mr_pro_generate_mapper_input.pl diff --git a/pro/mr_pro_map.cc b/training/pro/mr_pro_map.cc index eef40b8a..eef40b8a 100644 --- a/pro/mr_pro_map.cc +++ b/training/pro/mr_pro_map.cc diff --git a/pro/mr_pro_reduce.cc b/training/pro/mr_pro_reduce.cc index 5ef9b470..5ef9b470 100644 --- a/pro/mr_pro_reduce.cc +++ b/training/pro/mr_pro_reduce.cc diff --git a/pro/pro.pl b/training/pro/pro.pl index 891b7e4c..3b30c379 100755 --- a/pro/pro.pl +++ b/training/pro/pro.pl @@ -3,7 +3,7 @@ use strict; use File::Basename qw(basename); my @ORIG_ARGV=@ARGV; use Cwd qw(getcwd); -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; } # Skip local config (used for distributing jobs) if we're running in local-only mode use LocalConfig; @@ -13,28 +13,28 @@ use POSIX ":sys_wait_h"; my $QSUB_CMD = qsub_args(mert_memory()); my $default_jobs = env_default_jobs(); -my $VEST_DIR="$SCRIPT_DIR/../dpmert"; -require "$VEST_DIR/libcall.pl"; +my $UTILS_DIR="$SCRIPT_DIR/../utils"; +require "$UTILS_DIR/libcall.pl"; # Default settings my $srcFile; my $refFiles; my $bin_dir = $SCRIPT_DIR; die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; -my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; my $MAPINPUT = "$bin_dir/mr_pro_generate_mapper_input.pl"; my $MAPPER = "$bin_dir/mr_pro_map"; my $REDUCER = "$bin_dir/mr_pro_reduce"; -my $parallelize = "$VEST_DIR/parallelize.pl"; -my $libcall = "$VEST_DIR/libcall.pl"; -my $sentserver = "$VEST_DIR/sentserver"; -my $sentclient = "$VEST_DIR/sentclient"; -my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; +my $parallelize = "$UTILS_DIR/parallelize.pl"; +my $libcall = "$UTILS_DIR/libcall.pl"; +my $sentserver = "$UTILS_DIR/sentserver"; +my $sentclient = "$UTILS_DIR/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm"; my $SCORER = $FAST_SCORE; die "Can't find $MAPPER" unless -x $MAPPER; -my $cdec = "$bin_dir/../decoder/cdec"; +my $cdec = "$bin_dir/../../decoder/cdec"; die "Can't find decoder in $cdec" unless -x $cdec; die "Can't find $parallelize" unless -x $parallelize; die "Can't find $libcall" unless -e $libcall; diff --git a/rampion/Makefile.am b/training/rampion/Makefile.am index f4dbb7cc..1633d0f7 100644 --- a/rampion/Makefile.am +++ b/training/rampion/Makefile.am @@ -1,6 +1,6 @@ bin_PROGRAMS = rampion_cccp rampion_cccp_SOURCES = rampion_cccp.cc -rampion_cccp_LDADD = $(top_srcdir)/training/libtraining.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz +rampion_cccp_LDADD = $(top_srcdir)/training/utils/libtraining_utils.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz -AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training +AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -I$(top_srcdir)/training/utils diff --git a/rampion/rampion.pl b/training/rampion/rampion.pl index 55f7b3f1..ae084db6 100755 --- a/rampion/rampion.pl +++ b/training/rampion/rampion.pl @@ -2,7 +2,7 @@ use strict; my @ORIG_ARGV=@ARGV; use Cwd qw(getcwd); -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment", "$SCRIPT_DIR/../utils"; } # Skip local config (used for distributing jobs) if we're running in local-only mode use LocalConfig; @@ -12,27 +12,27 @@ use POSIX ":sys_wait_h"; my $QSUB_CMD = qsub_args(mert_memory()); my $default_jobs = env_default_jobs(); -my $VEST_DIR="$SCRIPT_DIR/../dpmert"; -require "$VEST_DIR/libcall.pl"; +my $UTILS_DIR="$SCRIPT_DIR/../utils"; +require "$UTILS_DIR/libcall.pl"; # Default settings my $srcFile; my $refFiles; my $bin_dir = $SCRIPT_DIR; die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; -my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; my $MAPINPUT = "$bin_dir/rampion_generate_input.pl"; my $MAPPER = "$bin_dir/rampion_cccp"; -my $parallelize = "$VEST_DIR/parallelize.pl"; -my $libcall = "$VEST_DIR/libcall.pl"; -my $sentserver = "$VEST_DIR/sentserver"; -my $sentclient = "$VEST_DIR/sentclient"; -my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; +my $parallelize = "$UTILS_DIR/parallelize.pl"; +my $libcall = "$UTILS_DIR/libcall.pl"; +my $sentserver = "$UTILS_DIR/sentserver"; +my $sentclient = "$UTILS_DIR/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm"; my $SCORER = $FAST_SCORE; die "Can't find $MAPPER" unless -x $MAPPER; -my $cdec = "$bin_dir/../decoder/cdec"; +my $cdec = "$bin_dir/../../decoder/cdec"; die "Can't find decoder in $cdec" unless -x $cdec; die "Can't find $parallelize" unless -x $parallelize; die "Can't find $libcall" unless -e $libcall; diff --git a/rampion/rampion_cccp.cc b/training/rampion/rampion_cccp.cc index 1e36dc51..1e36dc51 100644 --- a/rampion/rampion_cccp.cc +++ b/training/rampion/rampion_cccp.cc diff --git a/rampion/rampion_generate_input.pl b/training/rampion/rampion_generate_input.pl index b30fc4fd..b30fc4fd 100755 --- a/rampion/rampion_generate_input.pl +++ b/training/rampion/rampion_generate_input.pl diff --git a/training/candidate_set.cc b/training/utils/candidate_set.cc index 087efec3..087efec3 100644 --- a/training/candidate_set.cc +++ b/training/utils/candidate_set.cc diff --git a/training/candidate_set.h b/training/utils/candidate_set.h index 9d326ed0..9d326ed0 100644 --- a/training/candidate_set.h +++ b/training/utils/candidate_set.h diff --git a/dpmert/decode-and-evaluate.pl b/training/utils/decode-and-evaluate.pl index fe765d00..1a332c08 100755 --- a/dpmert/decode-and-evaluate.pl +++ b/training/utils/decode-and-evaluate.pl @@ -2,7 +2,7 @@ use strict; my @ORIG_ARGV=@ARGV; use Cwd qw(getcwd); -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; } # Skip local config (used for distributing jobs) if we're running in local-only mode use LocalConfig; @@ -16,16 +16,16 @@ require "libcall.pl"; my $default_jobs = env_default_jobs(); my $bin_dir = $SCRIPT_DIR; die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; -my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +my $FAST_SCORE="$bin_dir/../../mteval/fast_score"; die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; my $parallelize = "$bin_dir/parallelize.pl"; my $libcall = "$bin_dir/libcall.pl"; my $sentserver = "$bin_dir/sentserver"; my $sentclient = "$bin_dir/sentclient"; -my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; +my $LocalConfig = "$SCRIPT_DIR/../../environment/LocalConfig.pm"; my $SCORER = $FAST_SCORE; -my $cdec = "$bin_dir/../decoder/cdec"; +my $cdec = "$bin_dir/../../decoder/cdec"; die "Can't find decoder in $cdec" unless -x $cdec; die "Can't find $parallelize" unless -x $parallelize; die "Can't find $libcall" unless -e $libcall; diff --git a/training/entropy.cc b/training/utils/entropy.cc index 4fdbe2be..4fdbe2be 100644 --- a/training/entropy.cc +++ b/training/utils/entropy.cc diff --git a/training/entropy.h b/training/utils/entropy.h index 796589ca..796589ca 100644 --- a/training/entropy.h +++ b/training/utils/entropy.h diff --git a/training/grammar_convert.cc b/training/utils/grammar_convert.cc index 607a7cb9..607a7cb9 100644 --- a/training/grammar_convert.cc +++ b/training/utils/grammar_convert.cc diff --git a/training/lbfgs.h b/training/utils/lbfgs.h index e8baecab..e8baecab 100644 --- a/training/lbfgs.h +++ b/training/utils/lbfgs.h diff --git a/training/lbfgs_test.cc b/training/utils/lbfgs_test.cc index 9678e788..9678e788 100644 --- a/training/lbfgs_test.cc +++ b/training/utils/lbfgs_test.cc diff --git a/dpmert/libcall.pl b/training/utils/libcall.pl index c7d0f128..c7d0f128 100644 --- a/dpmert/libcall.pl +++ b/training/utils/libcall.pl diff --git a/training/online_optimizer.cc b/training/utils/online_optimizer.cc index 3ed95452..3ed95452 100644 --- a/training/online_optimizer.cc +++ b/training/utils/online_optimizer.cc diff --git a/training/online_optimizer.h b/training/utils/online_optimizer.h index 28d89344..28d89344 100644 --- a/training/online_optimizer.h +++ b/training/utils/online_optimizer.h diff --git a/training/optimize.cc b/training/utils/optimize.cc index 41ac90d8..41ac90d8 100644 --- a/training/optimize.cc +++ b/training/utils/optimize.cc diff --git a/training/optimize.h b/training/utils/optimize.h index 07943b44..07943b44 100644 --- a/training/optimize.h +++ b/training/utils/optimize.h diff --git a/training/optimize_test.cc b/training/utils/optimize_test.cc index bff2ca03..bff2ca03 100644 --- a/training/optimize_test.cc +++ b/training/utils/optimize_test.cc diff --git a/dpmert/parallelize.pl b/training/utils/parallelize.pl index d2ebaeea..4197e0e5 100755 --- a/dpmert/parallelize.pl +++ b/training/utils/parallelize.pl @@ -18,7 +18,7 @@ #ANNOYANCE: if input is shorter than -j n lines, or at the very last few lines, repeatedly sleeps. time cut down to 15s from 60s -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../../environment"; } use LocalConfig; use Cwd qw/ abs_path cwd getcwd /; diff --git a/training/risk.cc b/training/utils/risk.cc index d5a12cfd..d5a12cfd 100644 --- a/training/risk.cc +++ b/training/utils/risk.cc diff --git a/training/risk.h b/training/utils/risk.h index 2e8db0fb..2e8db0fb 100644 --- a/training/risk.h +++ b/training/utils/risk.h diff --git a/dpmert/sentclient.c b/training/utils/sentclient.c index 91d994ab..91d994ab 100644 --- a/dpmert/sentclient.c +++ b/training/utils/sentclient.c diff --git a/dpmert/sentserver.c b/training/utils/sentserver.c index c20b4fa6..c20b4fa6 100644 --- a/dpmert/sentserver.c +++ b/training/utils/sentserver.c diff --git a/dpmert/sentserver.h b/training/utils/sentserver.h index cd17a546..cd17a546 100644 --- a/dpmert/sentserver.h +++ b/training/utils/sentserver.h diff --git a/word-aligner/Makefile.am b/word-aligner/Makefile.am new file mode 100644 index 00000000..280d3ae7 --- /dev/null +++ b/word-aligner/Makefile.am @@ -0,0 +1,6 @@ +bin_PROGRAMS = fast_align + +fast_align_SOURCES = fast_align.cc ttables.cc +fast_align_LDADD = $(top_srcdir)/utils/libutils.a -lz + +AM_CPPFLAGS = -W -Wall $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/training diff --git a/training/fast_align.cc b/word-aligner/fast_align.cc index 7492d26f..7492d26f 100644 --- a/training/fast_align.cc +++ b/word-aligner/fast_align.cc diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars index 08ff33e1..ce3e1638 100644 --- a/word-aligner/makefiles/makefile.grammars +++ b/word-aligner/makefiles/makefile.grammars @@ -16,7 +16,7 @@ STEM_E = $(SCRIPT_DIR)/stemmers/$(E_LANG).pl CLASSIFY = $(SUPPORT_DIR)/classify.pl MAKE_LEX_GRAMMAR = $(SUPPORT_DIR)/make_lex_grammar.pl -MODEL1 = $(TRAINING_DIR)/fast_align +MODEL1 = $(SCRIPT_DIR)/fast_align MERGE_CORPUS = $(SUPPORT_DIR)/merge_corpus.pl e.voc: corpus.e diff --git a/word-aligner/paste-parallel-files.pl b/word-aligner/paste-parallel-files.pl deleted file mode 100755 index ce53b325..00000000 --- a/word-aligner/paste-parallel-files.pl +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my @fs = (); -for my $file (@ARGV) { - my $fh; - open $fh, "<$file" or die "Can't open $file for reading: $!"; - push @fs, $fh; -} -my $num = scalar @fs; -die "Usage: $0 file1.txt file2.txt [...]\n" unless $num > 1; - -my $first = $fs[0]; -while(<$first>) { - chomp; - my @out = (); - push @out, $_; - for (my $i=1; $i < $num; $i++) { - my $f = $fs[$i]; - my $line = <$f>; - die "Mismatched number of lines!" unless defined $line; - chomp $line; - push @out, $line; - } - print join(' ||| ', @out) . "\n"; -} - -for my $fh (@fs) { - my $x=<$fh>; - die "Mismatched number of lines!" if defined $x; - close $fh; -} - -exit 0; - diff --git a/training/ttables.cc b/word-aligner/ttables.cc index 45bf14c5..45bf14c5 100644 --- a/training/ttables.cc +++ b/word-aligner/ttables.cc diff --git a/training/ttables.h b/word-aligner/ttables.h index 9baa13ca..9baa13ca 100644 --- a/training/ttables.h +++ b/word-aligner/ttables.h |